Esempio n. 1
0
def pslSelectQuery(options):
    """Write one alignment per query group, chosen by (mis)match count.

    ``options.select`` has the form ``<value>-<field>``: *value* is
    ``most`` or ``least`` and *field* is ``nmatches`` or ``nmismatches``.
    Groups of alignments are taken from
    ``Blat.iterator_per_query(Blat.iterator(options.stdin))``; for each
    group the selected entry is written to ``options.stdout``.

    If ``options.test`` is set, iteration stops as soon as the number of
    groups read reaches that value.

    Raises
    ------
    ValueError
        If ``options.select`` is not a known ``<value>-<field>`` pair.
    """

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    value, field = options.select.split("-")

    # Sort keys for the supported fields.
    sort_keys = {
        "nmatches": lambda x: x.mNMatches,
        "nmismatches": lambda x: x.mNMisMatches,
    }
    # Position of the wanted entry in a group sorted in ascending order.
    positions = {"most": -1, "least": 0}

    # Fail before reading any input: an unknown field used to surface as
    # a NameError at the first sort, and an unknown value silently
    # produced no output while still being counted as output.
    if field not in sort_keys or value not in positions:
        raise ValueError(
            "unknown selection criterion '%s'" % options.select)

    f = sort_keys[field]
    index = positions[value]

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        # The sort is stable, so ties resolve deterministically: "most"
        # picks the last entry with the largest key, "least" the first
        # entry with the smallest key.
        data.sort(key=f)
        options.stdout.write("%s\n" % str(data[index]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Esempio n. 2
0
def pslSelectQuery(options):
    """Select a single alignment for every query group.

    The criterion is read from ``options.select``, which must look like
    ``most-nmatches``, ``least-nmatches``, ``most-nmismatches`` or
    ``least-nmismatches``. Input groups come from
    ``Blat.iterator_per_query(Blat.iterator(options.stdin))`` and the
    chosen entry of each group is written to ``options.stdout``.

    Raises
    ------
    ValueError
        If the field or the value part of ``options.select`` is unknown.
    """

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    value, field = options.select.split("-")

    if field == "nmatches":
        def f(x):
            return x.mNMatches
    elif field == "nmismatches":
        def f(x):
            return x.mNMisMatches
    else:
        # Without this branch ``f`` stays unbound and the failure only
        # shows up as a NameError once the first group is sorted.
        raise ValueError("unknown field '%s' in selection '%s'" %
                         (field, options.select))

    if value not in ("most", "least"):
        # Previously an unknown value wrote nothing yet still bumped
        # ``noutput``.
        raise ValueError("unknown value '%s' in selection '%s'" %
                         (value, options.select))
    take_last = value == "most"

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):

        ninput += 1
        # debugging aid: stop once the requested number of groups is read
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        # ascending, stable sort: the largest key ends up last, the
        # smallest first
        data.sort(key=f)

        if take_last:
            selected = data[-1]
        else:
            selected = data[0]
        options.stdout.write("%s\n" % str(selected))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Esempio n. 3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every alignment yielded by ``Blat.iterator(options.stdin)`` a
    ``chain`` header line is written to ``options.stdout``, followed by
    one entry per aligned block: the block size and, for all but the
    last block, the gap to the next block in the target and in the query.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $", usage=globals()["__doc__"]
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    for psl in Blat.iterator(options.stdin):
        ninput += 1
        # on the negative strand the query interval is mirrored within
        # the query length
        if psl.strand == "-":
            qstart, qend = psl.mQueryLength - psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom
        else:
            qstart, qend = psl.mQueryFrom, psl.mQueryTo

        # the running input count doubles as the last header field
        options.stdout.write(
            "chain %i %s %i %s %i %i %s %i %s %i %i %i\n"
            % (
                psl.mNMatches,
                psl.mSbjctId,
                psl.mSbjctLength,
                "+",
                psl.mSbjctFrom,
                psl.mSbjctTo,
                psl.mQueryId,
                psl.mQueryLength,
                psl.strand,
                qstart,
                qend,
                ninput,
            )
        )

        # End coordinates of the previous block; None until the first
        # block has been seen. Separate names keep the header's
        # qstart/qend from being clobbered by the block loop.
        prev_tend, prev_qend = None, None
        for block_qstart, block_tstart, size in psl.getBlocks():
            if prev_tend is not None:
                # finish the previous block's line with the target gap
                # and the query gap to this block
                options.stdout.write(
                    "\t%i\t%i\n" % (block_tstart - prev_tend, block_qstart - prev_qend))
            prev_qend, prev_tend = block_qstart + size, block_tstart + size
            options.stdout.write("%i" % (size,))
        options.stdout.write("\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 4
0
def main(argv=None):
    """Write a chain record for every alignment read from stdin.

    Command line options are parsed from *argv*; if *argv* is empty or
    not given, ``sys.argv`` is used instead. Alignments are read with
    ``Blat.iterator`` from ``options.stdin`` and the records are written
    to ``options.stdout``.
    """

    argv = argv or sys.argv

    # command line parser with the common options (-h/--help, ...)
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $",
        usage=globals()["__doc__"])
    options, args = E.Start(parser, argv=argv)

    out = options.stdout
    n_read, nskipped, n_written = 0, 0, 0

    # the 1-based running count is also used as the last header field
    for n_read, psl in enumerate(Blat.iterator(options.stdin), 1):

        # query interval, mirrored within the query for "-" strand matches
        if psl.strand == "-":
            q_from = psl.mQueryLength - psl.mQueryTo
            q_to = psl.mQueryLength - psl.mQueryFrom
        else:
            q_from = psl.mQueryFrom
            q_to = psl.mQueryTo

        header_fields = (
            psl.mNMatches,
            psl.mSbjctId,
            psl.mSbjctLength,
            "+",
            psl.mSbjctFrom,
            psl.mSbjctTo,
            psl.mQueryId,
            psl.mQueryLength,
            psl.strand,
            q_from,
            q_to,
            n_read,
        )
        out.write(
            "chain %i %s %i %s %i %i %s %i %s %i %i %i\n" % header_fields)

        # ends of the preceding block; None while no block has been seen
        last_t_end = None
        last_q_end = None
        for block_q, block_t, length in psl.getBlocks():
            if last_t_end is not None:
                # close the previous block's line with target and query gap
                gaps = (block_t - last_t_end, block_q - last_q_end)
                out.write("\t%i\t%i\n" % gaps)
            last_q_end = block_q + length
            last_t_end = block_t + length
            out.write("%i" % (length,))
        out.write("\n")

        n_written += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (n_read, n_written, nskipped))

    # footer and benchmark information
    E.Stop()
Esempio n. 5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads matches from ``options.stdin`` and writes one tab-separated
    row per match to ``options.stdout``: either the match itself or just
    its query id (``--without-match``), followed by the values of every
    counter selected with ``-m/--method``.

    Matches whose query and sbjct sequences differ in length are counted
    as skipped and produce no output row.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase",
        dest="mask_lowercase",
        action="store_true",
        help="mask lowercase characters before computing properties [default=%default]"
    )

    parser.add_option("--with-match",
                      dest="with_match",
                      action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match",
        dest="with_match",
        action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # Forward argv: it was accepted by main() but never handed to the
    # option parser, so a caller-supplied argument list was ignored.
    (options, args) = E.Start(parser, argv=argv)

    # counters_plain are called with the match object, counters with the
    # pair of aligned sequences
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # sequence based counters need the sequence-carrying iterator
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([header] +
                  ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    # compiled once, applied to every sequence block of every match
    non_letter = re.compile("[^a-zA-Z]")
    lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Collect the row and write it only once it is complete. Writing
        # the leading column right away left a partial line without a
        # newline behind whenever the match was skipped below.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qblocks = [non_letter.sub("N", x) for x in match.mQuerySequence]
            sblocks = [non_letter.sub("N", x) for x in match.mSbjctSequence]

            if options.mask_lowercase:
                qblocks = [lowercase.sub("N", x) for x in qblocks]
                sblocks = [lowercase.sub("N", x) for x in sblocks]

            # the masked blocks are stored back on the match
            match.mQuerySequence = qblocks
            match.mSbjctSequence = sblocks

            qseq = "".join(qblocks).upper()
            sseq = "".join(sblocks).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n"
                        % str(match))
                nskipped += 1
                continue

            # update all counters first, then render them
            for counter in counters:
                counter(qseq, sseq)

            fields.extend([str(counter) for counter in counters])

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            fields.extend([str(counter) for counter in counters_plain])

        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Esempio n. 6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Methods given with ``-m/--method`` are processed in the order
    supplied. ``map``, ``filter-keep``, ``filter-remove``, ``merge``,
    ``add-sequence``, ``complement`` and ``select-query`` call their
    handler and end method processing; the remaining methods each wrap
    the iterator over the input. Whatever the final iterator yields is
    written to ``options.stdout``, one entry per line.

    Raises
    ------
    ValueError
        If ``add-sequence`` is requested without both the query and the
        target sequence file.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    # Forward argv: main() accepted it but the parser never received it,
    # so a caller-supplied argument list was silently ignored.
    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
       (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # The former guard ``header is not None or header != "none"`` was
    # always true; only these two values ever produce a header.
    if options.header == "table":
        options.stdout.write("\t".join(Blat.FIELDS) + "\n")
    elif options.header == "full":
        options.stdout.write(Blat.HEADER + "\n")

    # Methods that do their own processing; the first one encountered
    # ends the method loop. Lambdas keep the handler lookup lazy.
    terminal_methods = {
        "map": lambda: pslMap(options),
        "filter-keep": lambda: pslFilter(options, keep=True),
        "filter-remove": lambda: pslFilter(options, keep=False),
        "merge": lambda: pslMerge(options),
        "add-sequence": lambda: pslAddSequence(
            query_fasta, sbjct_fasta, options),
        "complement": lambda: pslComplement(
            query_fasta, sbjct_fasta, options),
        "select-query": lambda: pslSelectQuery(options),
    }

    # Methods that wrap the current iterator in another one.
    chained_methods = {
        "test": lambda it: Blat.iterator_test(it, options.report_step),
        "rename-query": lambda it: iterator_rename_query(it, options),
        "sanitize": lambda it: iterator_sanitize(
            it, query_fasta, sbjct_fasta, options),
        "filter-fasta": lambda it: iterator_filter_fasta(
            it, query_fasta, sbjct_fasta, options),
        "remove-overlapping-query": lambda it:
            iterator_filter_overlapping_query(it, options),
        "remove-overlapping-target": lambda it:
            iterator_filter_overlapping_target(it, options),
    }

    for method in options.methods:
        if method in terminal_methods:
            terminal_methods[method]()
            break
        elif method in chained_methods:
            iterator = chained_methods[method](iterator)

    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.stop()
Esempio n. 7
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The methods selected with ``-m/--method`` are handled in order.
    Stand-alone methods (``map``, ``filter-keep``, ``filter-remove``,
    ``merge``, ``add-sequence``, ``complement``, ``select-query``) run
    their handler and stop further method processing. All other methods
    wrap the input iterator, whose entries are finally written to
    ``options.stdout``.

    Raises
    ------
    ValueError
        If ``add-sequence`` is requested but the query or the target
        sequence file is missing.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    # argv was accepted by main() but never reached the parser; pass it
    # on so that an explicit argument list is honoured.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    # sequence files are optional and only opened when a name was given
    query_fasta = None
    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)

    sbjct_fasta = None
    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)

    if "add-sequence" in options.methods and \
       (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # A header is written for "table" and "full" only. The previous
    # outer test (``is not None or != "none"``) could never be false and
    # has been dropped.
    if options.header == "table":
        options.stdout.write("\t".join(Blat.FIELDS) + "\n")
    elif options.header == "full":
        options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        # stand-alone methods: run the handler, then stop looking at
        # further methods
        if method == "map":
            pslMap(options)
            break
        elif method == "filter-keep":
            pslFilter(options, keep=True)
            break
        elif method == "filter-remove":
            pslFilter(options, keep=False)
            break
        elif method == "merge":
            pslMerge(options)
            break
        elif method == "add-sequence":
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif method == "complement":
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif method == "select-query":
            pslSelectQuery(options)
            break
        # chained methods: each one wraps the current iterator
        elif method == "test":
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif method == "rename-query":
            iterator = iterator_rename_query(iterator, options)
        elif method == "sanitize":
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif method == "filter-fasta":
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif method == "remove-overlapping-query":
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif method == "remove-overlapping-target":
            iterator = iterator_filter_overlapping_target(iterator, options)

    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.Stop()
Esempio n. 8
0
        if method == "counts":
            counters.append( SequencePairProperties.SequencePairPropertiesCountsNa() )
        elif method == "query-counts":
            counters.append( QueriesCounter() )
        elif method == "sbjct-counts":
            counters.append( SbjctsCounter() )
        elif method == "baseml":
            counters.append( SequencePairProperties.SequencePairPropertiesBaseML( options ) )
        elif method == "match":
            counters_plain.append( CounterMatch( options ) )
            
    if counters:
        iterator = Blat.iterator_pslx( options.stdin )
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator( options.stdin )
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write( "\t".join( 
            [header,] + 
            [ "\t".join(x.getHeaders()) for x in counters] +
            [ "\t".join(x.getHeaders()) for x in counters_plain] ) + "\n" )

    ninput, noutput, nskipped = 0, 0, 0

#     ## setup totals
#     totals = {}
#     for section in options.sections:
Esempio n. 9
0
def main():
    """script main.

    Reads tab-separated ``cluster_id``, ``gene_id``, ``alignment`` rows
    from ``options.stdin``. For every gene found in the map file, aligned
    positions whose quality score is below ``options.quality_threshold``
    are collected, optionally widened to whole frames of
    ``options.frame`` positions, and written to ``options.stdout`` as
    ``cluster_id``, ``start``, ``end`` rows.

    Raises
    ------
    ValueError
        If no map file is configured or a query id occurs twice in it.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--random-proportion", dest="random_proportion", type="float",
                      help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    # fail with a clear message instead of the TypeError from open(None)
    if options.filename_map is None:
        raise ValueError("no map file given (filename_map is not set)")

    map_genes2genome = {}
    # the context manager closes the file even if parsing fails
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            # explicit check: an assert would vanish under ``python -O``
            if match.mQueryId in map_genes2genome:
                raise ValueError("duplicate entry %s" % match.mQueryId)
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        # skip header line(s)
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        # Strip only the line terminator: ``line[:-1]`` removed a real
        # character from a final line lacking a trailing newline.
        cluster_id, gene_id, alignment = line.rstrip("\n").split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)

        # combine gene->mali and gene->genome into mali->genome
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(
            map_mali2genome, map_gene2mali, map_gene2genome,
            alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp: position counted from the other end of the (reverted)
        # alignment; reported instead of fp for negative strand matches
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            genome_pos = map_mali2genome.mapRowToCol(fp)
            # offset into the quality scores fetched for the match
            y = genome_pos - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                p = rp if is_negative else fp
                E.debug("low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                        (cluster_id, p, c, match.mSbjctId, match.strand,
                         genome_pos, quality_scores[y]))
                if options.frame > 1:
                    # mask the whole frame containing the position
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
Esempio n. 10
0
        forward_query = False,
        )
    
    (options, args) = E.Start( parser )

    if options.filename_query:
        query = IndexedFasta.IndexedFasta( options.filename_query )

    if options.filename_target:
        target = IndexedFasta.IndexedFasta( options.filename_target )

    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator( options.stdin ):        
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment( -min(match.mQueryBlockStarts), -min(match.mSbjctBlockStarts) )
        q = query.getSequence( match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo )
        t = target.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo )
        query_ali, sbjct_ali = getAlignment( m, q, t, options )

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement( query_ali )
            sbjct_ali = Genomics.complement( sbjct_ali )

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \
                                 (options.query_prefix, 
Esempio n. 11
0
def main(argv=None):
    """script main.

    Reads matches from ``options.stdin`` and writes a tab-separated table
    to ``options.stdout``: a header line, then one row per match made of
    the match itself (or only its query id with ``--without-match``)
    followed by the output of every counter selected with ``--method``.

    When at least one sequence based counter ("counts", "query-counts",
    "sbjct-counts", "baseml") is requested, the input is parsed with
    ``Blat.iterator_pslx``; otherwise ``Blat.iterator`` is used.

    Matches whose joined query and target sequences differ in length are
    counted as skipped and produce no output row at all.

    NOTE(review): *argv* is resolved below but is not handed on to
    ``E.Start`` - confirm whether ``E.Start`` reads ``sys.argv`` itself.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--mask-lowercase", dest="mask_lowercase", action="store_true",
                      help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option("--with-match", dest="with_match", action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option("--without-match", dest="with_match", action="store_false",
                      help="do not echo the match in output [default=%default]")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=(
                          "counts", "baseml", "match", "query-counts", "sbjct-counts"),
                      help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    (options, args) = E.Start(parser)

    # counters are called with the two sequences of a match,
    # counters_plain are called with the match object itself.
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write("\t".join(
        [header, ] +
        ["\t".join(x.getHeaders()) for x in counters] +
        ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    # compiled once, applied to every sequence block of every match
    rx_non_letter = re.compile("[^a-zA-Z]")
    rx_lowercase = re.compile("[a-z]")

    for match in iterator:
        ninput += 1

        # Collect the row and write it only once it is complete, so that
        # a match skipped further down leaves no partial line behind.
        # The first column is rendered before the sequences are masked.
        if options.with_match:
            row = [str(match)]
        else:
            row = [match.mQueryId]

        if counters:

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [rx_non_letter.sub("N", x) for x in match.mQuerySequence]
            sseq = [rx_non_letter.sub("N", x) for x in match.mSbjctSequence]

            if options.mask_lowercase:
                qseq = [rx_lowercase.sub("N", x) for x in qseq]
                sseq = [rx_lowercase.sub("N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            row.extend(str(counter) for counter in counters)

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            row.extend(str(counter) for counter in counters_plain)

        options.stdout.write("\t".join(row) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
Esempio n. 12
0
def main(argv=None):
    """script main.

    Reads PSL matches from ``options.stdin``, marks the aligned blocks of
    every match in one bitset per query and one bitset per target, and
    writes two coverage tables ("# query" and "# target") to
    ``options.stdout``.  Each table lists, per contig, the number of
    covered positions, the contig size and the covered percentage,
    followed by a "total" line.

    NOTE(review): *argv* is resolved below but is not handed on to
    ``E.Start`` - confirm whether ``E.Start`` reads ``sys.argv`` itself.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])
    parser.set_defaults()

    (options, args) = E.Start(parser)

    def mark_blocks(registry, name, length, blocks):
        # fetch the bitset of this contig, creating it on first sight
        try:
            covered = registry[name]
        except KeyError:
            covered = registry[name] = bx.bitset.BinnedBitSet(length)

        # set_range takes (start, length), the blocks come as (start, end)
        for first, last in blocks:
            covered.set_range(first, last - first)

    def write_table(out, registry):
        out.write("contig\tcovered\tsize\tpcovered\n")

        covered_sum = 0
        size_sum = 0
        for name in sorted(registry):
            bits = registry[name]
            size = bits.size
            ncovered = bits.count_range(0, size)

            covered_sum += ncovered
            size_sum += size

            # contigs of size zero still enter the totals but get no row
            if size <= 0:
                continue
            out.write("%s\t%i\t%i\t%6.4f\n" %
                      (name, ncovered, size, 100.0 * ncovered / size))

        if size_sum <= 0:
            return
        out.write("total\t%i\t%i\t%6.4f\n" %
                  (covered_sum, size_sum, 100.0 * covered_sum / size_sum))

    by_query = {}
    by_target = {}

    for record in Blat.iterator(options.stdin):
        mark_blocks(by_query,
                    record.mQueryId,
                    record.mQueryLength,
                    record.iterator_query_exons())
        mark_blocks(by_target,
                    record.mSbjctId,
                    record.mSbjctLength,
                    record.iterator_sbjct_exons())

    out = options.stdout
    out.write("# query\n")
    write_table(out, by_query)
    out.write("# target\n")
    write_table(out, by_target)

    E.Stop()
Esempio n. 13
0
def main(argv=None):
    """script main.

    Reads a PSL map (``options.filename_map``) that places genes on the
    genome, then reads lines of ``cluster_id<TAB>gene_id<TAB>alignment``
    from ``options.stdin``.  For every aligned, non-gap character whose
    quality score at the mapped genomic position is below
    ``options.quality_threshold`` the alignment column is marked; if
    ``options.frame`` is larger than 1 the whole frame-sized window that
    contains the column is marked instead.  Marked columns are grouped
    with ``Iterators.group_by_distance`` and written to
    ``options.stdout`` as ``cluster_id<TAB>start<TAB>end``.

    With ``--random`` the quality scores at the aligned positions are
    shuffled among themselves before masking.

    Raises
    ------
    ValueError
        if no map file is configured or if a query id occurs more than
        once in the map file.

    NOTE(review): *argv* is accepted but not used, and the defaults
    ``quality_threshold``, ``quality_file``, ``filename_map`` and
    ``frame`` have no command line option in this function - confirm
    how they are meant to be set.  ``--random-proportion`` is parsed but
    not read here.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--random-proportion",
        dest="random_proportion",
        type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random",
        dest="random",
        action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    if options.filename_map is None:
        # open(None) would only produce an unhelpful TypeError
        raise ValueError("no map file given: options.filename_map is not set")

    map_genes2genome = {}
    # the context manager closes the file even if parsing fails
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            # explicit check instead of assert: must survive python -O
            if match.mQueryId in map_genes2genome:
                raise ValueError("duplicate entry %s" % match.mQueryId)
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        # strip only the line terminator: a final line without a newline
        # must not lose the last character of its alignment
        cluster_id, gene_id, alignment = line.rstrip("\n").split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId, "+",
                                             match.mSbjctFrom, match.mSbjctTo)

        # chain alignment column -> gene -> genome into one map
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome, map_gene2mali,
                                          map_gene2genome, alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        # alignment columns to mask for this line
        to_mask = []
        # rp is the column index counted from the end of the (possibly
        # reversed) alignment; it is reported for negative strand matches
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            # offset into quality_scores, which start at match.mSbjctFrom
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i"
                    % (cluster_id, p, c, match.mSbjctId, match.strand,
                       map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    # mask the complete frame-sized window around p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
Esempio n. 14
0
def main(argv=None):
    """script main.

    Reads PSL matches from ``options.stdin`` and, for each match, fetches
    the query and target sequences from the two indexed fasta files,
    builds the pairwise alignment and writes it to ``options.stdout`` as
    two fasta-like records (query first, then target).  Records are
    numbered consecutively using ``options.output_format_id``.

    Raises
    ------
    ValueError
        if the query or the target fasta file is not given.
    NotImplementedError
        if a ``--method`` other than "full" is selected; only "full" is
        bound to an implementation in this function.

    NOTE(review): *argv* is resolved below but is not handed on to
    ``E.Start``, and the ``--id`` option is parsed but never read (ids
    are always numeric) - confirm both against the intended behaviour.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file",
                      dest="filename_query",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file",
                      dest="filename_target",
                      type="string",
                      help="fasta filename with target.")

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("full", "pileup-query", "pileup-target", "gapless"),
        help="method to use for constructing the alignment [%default].")

    parser.add_option(
        "--forward-query",
        dest="forward_query",
        action="store_true",
        help=
        "reverse-complement sequences such that query is always on forward strand [%default]"
    )

    parser.add_option("--target-prefix",
                      dest="target_prefix",
                      type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix",
                      dest="query_prefix",
                      type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id",
                      dest="id",
                      type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    # Both sequence sources are used for every match; fail with a clear
    # message instead of a NameError inside the loop.
    if not options.filename_query or not options.filename_target:
        raise ValueError(
            "both --query-psl-file and --target-psl-file must be given")

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)

    if options.method == "full":
        getAlignment = getAlignmentFull
    else:
        # the parser accepts further choices that have no implementation
        raise NotImplementedError(
            "alignment method '%s' is not implemented" % options.method)

    # running number of the match, formatted into the record names
    match_index = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        # shift the map by the smallest block start on either side
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))
        q = query.getSequence(match.mQueryId, match.strand, match.mQueryFrom,
                              match.mQueryTo)
        t = target.getSequence(match.mSbjctId, "+", match.mSbjctFrom,
                               match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(
            ">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
            (options.query_prefix, options.output_format_id % match_index,
             match.mQueryId, match.mQueryFrom, match.mQueryTo, query_ali,
             options.target_prefix, options.output_format_id % match_index,
             match.mSbjctId, match.strand, match.mSbjctFrom, match.mSbjctTo,
             sbjct_ali))
        match_index += 1

    E.Stop()
Esempio n. 15
0
def main(argv=None):
    """script main.

    Reads PSL matches from ``options.stdin``, records the aligned blocks
    of every match in one bitset per query and one bitset per target,
    and writes a coverage table for each side to ``options.stdout``.

    NOTE(review): *argv* is resolved below but is not handed on to
    ``E.Start`` - confirm whether ``E.Start`` reads ``sys.argv`` itself.
    """

    # identity test: None is a singleton, "== None" can be overridden
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
                            usage=globals()["__doc__"])

    parser.set_defaults(
    )

    (options, args) = E.Start(parser)

    query_bitsets, target_bitsets = {}, {}

    def addRange(bitset, id, size, iterator):
        """Mark all (start, end) pairs of *iterator* in the bitset of *id*.

        The bitset is created with *size* the first time *id* is seen.
        """
        if id not in bitset:
            bitset[id] = bx.bitset.BinnedBitSet(size)
        b = bitset[id]

        # set_range is called with (start, length)
        for start, end in iterator:
            b.set_range(start, end - start)

    for psl in Blat.iterator(options.stdin):

        addRange(query_bitsets,
                 psl.mQueryId,
                 psl.mQueryLength,
                 psl.iterator_query_exons())

        addRange(target_bitsets,
                 psl.mSbjctId,
                 psl.mSbjctLength,
                 psl.iterator_sbjct_exons())

    def printBitset(outfile, bitsets):
        """Write one coverage row per contig plus a final total row."""
        outfile.write("contig\tcovered\tsize\tpcovered\n")
        total, total_len = 0, 0
        for chrom in sorted(bitsets):

            size = bitsets[chrom].size
            covered = bitsets[chrom].count_range(0, size)
            # guard against division by zero for empty contigs
            if size > 0:
                outfile.write("%s\t%i\t%i\t%6.4f\n" %
                              (chrom, covered, size, 100.0 * covered / size))
            total += covered
            total_len += size

        if total_len > 0:
            outfile.write("total\t%i\t%i\t%6.4f\n" %
                          (total, total_len, 100.0 * total / total_len))

    options.stdout.write("# query\n")
    printBitset(options.stdout, query_bitsets)
    options.stdout.write("# target\n")
    printBitset(options.stdout, target_bitsets)

    E.Stop()
Esempio n. 16
0
def main(argv=None):
    """script main.

    For each PSL match read from ``options.stdin`` the query and target
    sequences are fetched from the indexed fasta files, aligned through
    the match's query-to-target map and written to ``options.stdout`` as
    a pair of fasta-like records (query, then target).  The records of a
    match share a consecutive number formatted with
    ``options.output_format_id``.

    Raises
    ------
    ValueError
        if the query or the target fasta file is missing.
    NotImplementedError
        if ``--method`` is anything but "full", the only method that is
        bound to an implementation here.

    NOTE(review): *argv* is resolved below but is not handed on to
    ``E.Start``; ``--id`` is accepted but not read anywhere in this
    function - confirm both.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query", type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target", type="string",
                      help="fasta filename with target.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "full", "pileup-query", "pileup-target", "gapless"),
                      help="method to use for constructing the alignment [%default].")

    parser.add_option("--forward-query", dest="forward_query", action="store_true",
                      help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix", type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix", type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    # Validate up front: without these checks a missing file name or an
    # unimplemented method only surfaced as a NameError on the first match.
    if not options.filename_query:
        raise ValueError("--query-psl-file must be given")
    if not options.filename_target:
        raise ValueError("--target-psl-file must be given")
    if options.method != "full":
        raise NotImplementedError(
            "alignment method '%s' is not implemented" % options.method)

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)
    getAlignment = getAlignmentFull

    # consecutive record number; named so as not to shadow builtin id()
    record_number = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        # shift the map by the smallest block start on either side
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))
        q = query.getSequence(
            match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
                             (options.query_prefix,
                              options.output_format_id % record_number,
                              match.mQueryId, match.mQueryFrom, match.mQueryTo,
                              query_ali,
                              options.target_prefix,
                              options.output_format_id % record_number,
                              match.mSbjctId, match.strand,
                              match.mSbjctFrom, match.mSbjctTo,
                              sbjct_ali))
        record_number += 1

    E.Stop()