Example 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.query is None or options.target is None:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    reader = maf.Reader(options.stdin)

    psl = Blat.Match()
    for cc in threaditer(reader, (options.query, options.target)):

        ninput += 1
        query, target = cc

        # treat identifiers like Hsap.GL000223.1
        try:
            data = query.src.split(".")
            qs, qcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse query %s: msg=%s" % (query.src, msg))

        try:
            data = target.src.split(".")
            ts, tcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse target %s: msg=%s" % (target.src, msg))
Example 2
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase",
        dest="mask_lowercase",
        action="store_true",
        help=
        "mask lowercase characters before computing properties [default=%default]"
    )

    parser.add_option("--with-match",
                      dest="with_match",
                      action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match",
        dest="with_match",
        action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    (options, args) = E.Start(parser)

    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([
            header,
        ] + ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        if options.with_match:
            options.stdout.write(str(match))
        else:
            options.stdout.write(match.mQueryId)

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non-printable characters - these sometimes
            # appear after using pslToPslX
            qseq = [re.sub("[^a-zA-Z]", "N", x) for x in qseq]
            sseq = [re.sub("[^a-zA-Z]", "N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [re.sub("[a-z]", "N", x) for x in qseq]
                sseq = [re.sub("[a-z]", "N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n"
                        % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters]))

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters_plain]))

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
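
The masking step above can be exercised in isolation. A small sketch under the same rules (mask is a hypothetical helper, not part of psl2table):

import re

def mask(seq, mask_lowercase=False):
    # non-alphabetic characters (e.g. artefacts of pslToPslX) become "N";
    # optionally, soft-masked lowercase bases are masked as well
    seq = re.sub("[^a-zA-Z]", "N", seq)
    if mask_lowercase:
        seq = re.sub("[a-z]", "N", seq)
    return seq.upper()

assert mask("ac-GT", mask_lowercase=True) == "NNNGT"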
Example 3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                                                 feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        # bind the first feature explicitly; the loop variable of the list
        # comprehension above does not leak into this scope on Python 3
        gff = gffs[0]
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
            else:
                entry.mQueryLength = result.getRowTo()
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
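
The diagonal bookkeeping above maps transcript (query) coordinates onto genomic (target) coordinates block by block. A standalone sketch of the same arithmetic without alignlib_lite (intervals_to_blocks is a hypothetical name):

def intervals_to_blocks(intervals):
    # each (start, end) exon becomes an ungapped block; the query coordinate
    # advances by the block length while the target keeps genomic positions
    blocks, xstart = [], 0
    for start, end in intervals:
        size = end - start
        blocks.append((xstart, start, size))  # (query_start, target_start, size)
        xstart += size
    return blocks

assert intervals_to_blocks([(10, 20), (30, 35)]) == [(0, 10, 10), (10, 30, 5)]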
Example 4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t",
        "--tablename",
        dest="tablename",
        type="string",
        help=
        "tablename to get variants from (in samtools pileup format) [default=%default]."
    )
    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f",
        "--exons-file",
        dest="filename_exons",
        type="string",
        help=
        "filename with transcript model information (gtf formatted file)  [default=%default]."
    )
    parser.add_option(
        "-r",
        "--filename-reference",
        dest="filename_reference",
        type="string",
        help=
        "filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default]."
    )
    parser.add_option(
        "--vcf-file",
        dest="filename_vcf",
        type="string",
        help=
        "filename with variants in VCF format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--pileup-file",
        dest="filename_pileup",
        type="string",
        help=
        "filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--vcf-sample",
        dest="vcf_sample",
        type="string",
        help=
        "sample id for species of interest in vcf formatted file [default=%default]."
    )
    parser.add_option(
        "-s",
        "--seleno-tsv-file",
        dest="filename_seleno",
        type="string",
        help=
        "filename of a list of transcript ids that are selenoproteins [default=%default]."
    )
    parser.add_option("-m",
                      "--module",
                      dest="modules",
                      type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k",
        "--with-knockouts",
        dest="with_knockouts",
        action="store_true",
        help=
        "add alleles that are knocked out to fasta and gtf files [default=%default]."
    )

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(("gene_id", "transcript_id",
                                         "allele_id", "contig", "strand",
                                         "is_wildtype",
                                         ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:], allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
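
The strand conversion applied to reference_first_stop_start/end above follows the usual half-open interval flip. A one-function sketch (to_positive_strand is a hypothetical name):

def to_positive_strand(start, end, lcontig):
    # a half-open interval [start, end) on the negative strand maps to
    # [lcontig - end, lcontig - start) in positive-strand coordinates
    return lcontig - end, lcontig - start

assert to_positive_strand(10, 20, 100) == (80, 90)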
Example 5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q",
                      "--query",
                      dest="query",
                      type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t",
                      "--target",
                      dest="target",
                      type="string",
                      help="sequence to use for target [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(
        genome_file=None,
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    psl = Blat.Match()

    for bed in Bed.iterator(options.stdin):

        ninput += 1

        start, end = bed.start, bed.end

        if "blockSizes" in bed:
            psl.mQueryId = bed["name"]
            blocksizes = [int(x) for x in bed["blockSizes"].split(",")[:-1]]
            sbjctblockstarts = [
                int(x) + start for x in bed["blockStarts"].split(",")[:-1]
            ]
            strand = bed["strand"]
        else:
            psl.mQueryId = "%i" % ninput
            blocksizes = [end - start]
            sbjctblockstarts = [
                start,
            ]

            strand = "+"

        psl.mSbjctId = bed.contig
        psl.mSbjctFrom, psl.mSbjctTo = start, end
        psl.mQueryFrom, psl.mQueryTo = 0, end - start

        psl.mBlockSizes = blocksizes
        psl.mNBlocks = len(blocksizes)
        psl.strand = strand
        q, qp = [], 0
        for x in blocksizes:
            q.append(qp)
            qp += x

        psl.mQueryBlockStarts = q
        psl.mSbjctBlockStarts = sbjctblockstarts
        psl.mQueryLength = sum(psl.mBlockSizes)
        if fasta:
            psl.mSbjctLength = fasta.getLength(bed.contig)

        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
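
The query block starts computed in the loop above are simply the running sum of the block sizes, since query blocks are laid out back to back. As a sketch (query_block_starts is a hypothetical name):

def query_block_starts(blocksizes):
    # query blocks are contiguous, so each start is the cumulative
    # length of the preceding blocks
    starts, total = [], 0
    for size in blocksizes:
        starts.append(total)
        total += size
    return starts

assert query_block_starts([10, 20, 5]) == [0, 10, 30]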
Example 6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.query is None or options.target is None:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    reader = maf.Reader(options.stdin)

    psl = Blat.Match()
    for cc in threaditer(reader, (options.query, options.target)):

        ninput += 1
        query, target = cc

        # treat identifiers like Hsap.GL000223.1
        try:
            data = query.src.split(".")
            qs, qcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse query %s: msg=%s" % (query.src, msg))

        try:
            data = target.src.split(".")
            ts, tcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse target %s: msg=%s" % (target.src, msg))

        assert qs == options.query
        assert ts == options.target
        psl.mQueryId = qcontig
        psl.mSbjctId = tcontig

        psl.fromPair(query.start, query.src_size, query.strand, query.text.upper(),
                     target.start, target.src_size, target.strand, target.text.upper())

        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (qs, qcontig, query.start, query.src_size, query.strand, query.text))
        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (ts, tcontig, target.start, target.src_size, target.strand, target.text))
        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
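
threaditer is not defined in this excerpt. From its use, it appears to yield, per MAF block, the alignment components belonging to the requested species. A hypothetical reconstruction of that contract, assuming bx-python style blocks with a components list whose entries carry a src attribute:

def threaditer(reader, species):
    # hypothetical sketch: for each MAF block, pick exactly one component
    # per requested species prefix and yield them as a tuple
    for block in reader:
        picked = []
        for name in species:
            matches = [c for c in block.components
                       if c.src.startswith(name + ".")]
            if len(matches) != 1:
                raise ValueError("expected one component for %s" % name)
            picked.append(matches[0])
        yield tuple(picked)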
Example 7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--queries-tsv-file", dest="input_filename_queries", type="string",
                      help="fasta filename with queries - required for polyA analysis [%default].")

    parser.add_option("--polyA", dest="polyA", action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--output-filename-empty", dest="output_filename_empty", type="string",
                      help="OUTPUT filename with queries for which all matches have been discarded [%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid", dest="threshold_min_pid", type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option("--threshold-min-matches", dest="threshold_min_matches", type="int",
                      help="minimum threshold for number of matching residues [%default].")

    parser.add_option("--threshold-max-error-rate", dest="threshold_max_error_rate", type="float",
                      help="maximum threshold for error of aligned part [%default].")

    parser.add_option("--threshold-good-query-coverage", dest="threshold_good_query_coverage", type="float",
                      help="minimum query coverage for segments to be counted as good [%default].")

    parser.add_option("--threshold-min-query-coverage", dest="threshold_min_query_coverage", type="float",
                      help="minimum query coverage for segments to be accepted [%default].")

    parser.add_option("--threshold-max-query-gapchars", dest="threshold_max_query_gapchars", type="int",
                      help="maximum number of gap characters  in query[%default].")

    parser.add_option("--threshold-max-query-gaps", dest="threshold_max_query_gaps", type="int",
                      help="maximum number of gaps  in query[%default].")

    parser.add_option("--threshold-max-sbjct-gapchars", dest="threshold_max_sbjct_gapchars", type="int",
                      help="maximum number of gap characters  in sbjct[%default].")

    parser.add_option("--keep-unique-matches", dest="keep_unique_matches", action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option("--keep-all-best", dest="keep_all_best", action="store_true",
                      help="when sorting matches, keep all matches within the collection threshold [%default].")

    parser.add_option("--output-best-per-subject", dest="best_per_sbjct", action="store_true",
                      help="keep only the best entry per sbjct (for transcript mapping) [%default].")

    parser.add_option("--threshold-max-sbjct-gaps", dest="threshold_max_sbjct_gaps", type="int",
                      help="maximum number of gaps  in sbjct[%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing[%default].")

    parser.add_option("-m", "--matching-mode", dest="matching_mode", type="choice",
                      choices=("best-coverage", "best-query-coverage", "best-sbjct-coverage",
                               "best-pid", "best-covpid", "best-query-covpid", "best-sbjct-covpid",
                               "best-min-covpid", "best-query-min-covpid", "best-sbjct-min-covpid",
                               "unique", "all"),
                      help="determines how to selecte the best match [%default].")

    parser.add_option("--subjctfilter-tsv-file", dest="filename_filter_sbjct", type="string",
                      help="gff file for filtering sbjct matches. Matches overlapping these regions are discarded, but see --keep-forbidden [%default].")

    parser.add_option("--keep-forbidden", dest="keep_forbidden", action="store_true",
                      help="if set, keep only matches that overlap the regions supplied with --subjctfilter-tsv-file [%default].")

    parser.add_option("--query-forward-coordinates", dest="query_forward_coordinates", action="store_true",
                      help="use forward coordinates for query, strand will refer to sbjct [%default].")

    parser.add_option("--ignore-all-random", dest="ignore_all_random", action="store_true",
                      help="if there are multiple best matches, ignore all those to chrUn and _random [%default].")

    parser.add_option("--collection-threshold", dest="collection_threshold", type="float",
                      help="threshold for collecting matches, percent of best score [%default].")

    parser.add_option("--collection-distance", dest="collection_distance", type="float",
                      help="threshold for collecting matches, difference to best score [%default].")

    parser.set_defaults(input_filename_domains=None,
                        input_filename_queries=None,
                        threshold_good_query_coverage=90.0,
                        threshold_min_pid=30.0,
                        threshold_min_matches=0,
                        threshold_max_error_rate=None,
                        output_filename_pattern="%s",
                        keep_unique_matches=False,
                        output_format="map",
                        print_matched=["full", "partial", "good"],
                        from_zipped=False,
                        combine_overlaps=True,
                        min_length_domain=30,
                        threshold_min_query_coverage=50,
                        min_length_singletons=30,
                        new_family_id=10000000,
                        add_singletons=False,
                        matching_mode="best-coverage",
                        best_per_sbjct=False,
                        threshold_max_query_gapchars=None,
                        threshold_max_query_gaps=None,
                        threshold_max_sbjct_gapchars=None,
                        threshold_max_sbjct_gaps=None,
                        filename_filter_sbjct=None,
                        keep_forbidden=False,
                        keep_all_best=False,
                        test=None,
                        query_forward_coordinates=False,
                        output_filename_empty=None,
                        collection_threshold=1.0,
                        collection_distance=0,
                        polyA=False,
                        # max residues missing from non polyA end
                        polyA_max_unaligned=3,
                        # min residues in tail
                        polyA_min_unaligned=10,
                        # min percent residues that are A/T in tail
                        polyA_min_percent=70.0,
                        # ignore duplicate matches if they are on Un or
                        # _random
                        ignore_all_random=False,
                        )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) == 1:
        if options.from_zipped or args[0][-3:] == ".gz":
            import gzip
            # open in text mode so Blat.BlatIterator receives strings
            infile = gzip.open(args[0], "rt")
        else:
            infile = IOTools.openFile(args[0], "r")
    else:
        infile = sys.stdin

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    if options.filename_filter_sbjct:

        try:
            import bx.intervals.intersection
        except ImportError:
            raise ValueError("filtering for intervals requires the bx tools")

        intervals = GTF.readGFFFromFileAsIntervals(
            IOTools.openFile(options.filename_filter_sbjct, "r"))

        intersectors = {}

        for contig, values in list(intervals.items()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in values:
                intersector.add_interval(bx.intervals.Interval(start, end))
            intersectors[contig] = intersector

        if options.loglevel >= 1:
            options.stdlog.write("# read %i intervals for %i contigs.\n" %
                                 (sum([len(x) for x in list(intervals.values())]),
                                  len(intersectors)))
    else:
        intersectors = None

    ################################################
    ################################################
    ################################################
    # processing of a chunk (matches of same query)
    ################################################
    ninput, noutput, nskipped = 0, 0, 0

    # number of sequences with full/partial/good matches
    nfull_matches, npartial_matches, ngood_matches = 0, 0, 0
    # number of sequences which are fully/good/partially matched
    # i.e., after combining all aligned regions
    nfully_matched, npartially_matched, nwell_matched = 0, 0, 0

    nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0
    nremoved_gaps, nremoved_nmatches = 0, 0
    nremoved_regions = 0
    nqueries_removed_region = 0

    aggregate_coverages = []
    mapped_coverages = []
    fully_matched = []
    well_matched = []
    partially_matched = []
    new_family_id = options.new_family_id

    if options.output_filename_empty:
        outfile_empty = IOTools.openFile(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = IOTools.openFile(
            options.output_filename_pattern % "polyA", "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id"""

        # processChunk is nested inside main, so the counters it updates
        # are enclosing-scope locals, not module globals
        nonlocal ninput, noutput, nskipped
        nonlocal nfull_matches, npartial_matches, ngood_matches
        nonlocal nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches
        nonlocal nremoved_regions, nqueries_removed_region
        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentory matches

        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * \
                    math.pow(
                        options.threshold_max_error_rate, match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                # remove matches exceeding the gap thresholds
                if options.threshold_max_query_gaps and match.mQueryNGapsCounts > options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and match.mQueryNGapsBases > options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)
            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        # Remove queries matching to a forbidden region. This section
        # will remove the full query if any of its matches matches in a
        # forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if (found and not options.keep_forbidden) or \
                        (not found and options.keep_forbidden):
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" %
                                         "\t".join(map(str, (
                                             match.mQueryId, match.mSbjctId,
                                             match.strand,
                                             "%5.2f" % match.mQueryCoverage,
                                             "%5.2f" % match.mSbjctCoverage,
                                             "%5.2f" % match.mPid,
                                             match.mQueryLength,
                                             match.mSbjctLength,
                                             match.mQueryFrom, match.mQueryTo,
                                             match.mSbjctFrom, match.mSbjctTo,
                                             ",".join(
                                                 map(str, match.mBlockSizes)),
                                             ",".join(
                                                 map(str, match.mQueryBlockStarts)),
                                             ",".join(
                                                 map(str, match.mSbjctBlockStarts)),
                                         ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            nempty += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(("query_id", "sbjct_id", "sstrand", "qcoverage", "scoverage",
                                        "pid", "qlen", "slen", "qfrom", "qto", "sfrom", "sto", "blocks", "qstarts", "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    ################################################
    ################################################
    # main loop
    ################################################
    nfully_covered = None
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0

    skip = 0

    iterator = Blat.BlatIterator(infile)

    while 1:

        try:
            match = next(iterator)
        except Blat.ParsingError:
            iterator = Blat.BlatIterator(infile)
            continue

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)

    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# alignments: ninput=%i, is_complete=%s\n" % (ninput_lines, str(is_complete)))
        options.stdlog.write(
            "# queries: ninput=%i, noutput=%i\n" % (ninput, noutput))
        options.stdlog.write("# individual coverage: full=%i, good=%i, partial=%i\n" % (
            nfull_matches, ngood_matches, npartial_matches))
        options.stdlog.write("# aggregate  coverage: full=%i, good=%i, partial=%i\n" % (
            len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write("# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %
                             (nskipped + nqueries_removed_region + nempty,
                              nskipped, nqueries_removed_region, nempty))
        options.stdlog.write("# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n" % (
            nremoved_pid, nremoved_query_coverage, nremoved_gaps, nremoved_regions, nremoved_nmatches))

    E.Stop()
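
The aggregate query coverage above is computed via Intervals.complement. An equivalent, self-contained sketch that merges the matched intervals directly (query_coverage is a hypothetical name):

def query_coverage(intervals, qlength):
    # merge overlapping query intervals, then report covered bases
    # as a percentage of the query length
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    covered = sum(end - start for start, end in merged)
    return 100.0 * covered / qlength

assert query_coverage([(0, 50), (40, 80)], 100) == 80.0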
Example 8
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: align_pairs.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--skip-statistics", dest="skip_stats", action="store_true",
                      help="do not compute alignment statistics [%default]."  )

    parser.add_option("--method", dest="methods", type="choice", action="append",
                      choices=("dialign", "clustal", "blastz", "nw", "sw", "dba", "dialignlgs" ),
                      help="alignment method [%default]."  )

    parser.add_option("--anchor-alignment", dest="anchor_alignment", type="int",
                      help="anchor alignmet with xxx residues [%default]."  )

    parser.add_option("--output-format", dest="output_formats", type="choice", action="append",
                      choices=("fasta", "stats", "psl" ),
                      help="anchor alignment with xxx residues [%default]."  )

    parser.add_option("--input-format", dest="input_format", type="choice", 
                      choices=("fasta", "list" ),
                      help="input format of stdin [%default]."  )

    parser.add_option("--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="output pattern for multiple files [%default]."  )

    parser.add_option("--filename-sequences1", dest="filename_sequences1", type="string",
                      help="first indexed input filename with sequences [%default]."  )

    parser.add_option("--filename-sequences2", dest="filename_sequences2", type="string",
                      help="second indexed input filename with sequences [%default]."  )

    parser.add_option("--options-blastz", dest="options_blastz", type="string",
                      help="command line options for blastz [%default]."  )

    parser.set_defaults( 
        skip_stats = False,
        methods = [],
        output_formats = [],
        input_format = "fasta",
        output_filename_pattern = None,
        filename_sequences1 = None,
        filename_sequences2 = None,
        anchor_alignment = 0,
        options_blastz = "C=2 B=1 T=0 W=6 K=2200" )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(options.methods) == 0:
        print(USAGE)
        print("please specify an alignment method.")
        sys.exit(1)

    if len(options.output_formats) == 0:
        print(USAGE)
        print("please specify at least one output format.")
        sys.exit(1)

    if len(args) == 2:
        iterator = iterate_double_fasta( args[0], args[1] )        
    elif options.filename_sequences1 and options.filename_sequences2:
        if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
            infile = options.stdin
        elif len(args) == 1:
            infile = open( args[0], "r") 
                
        iterator = iterate_list( infile, options.filename_sequences1, options.filename_sequences2 )
    else:
        iterator = iterate_single_fasta( options.stdin )
        

    npairs, ntoken_pairs = 0, 0
    ninput, nskipped, nerrors = 0, 0, 0

    outfile_table = None
    outfile_fasta = None
    outfile_psl = None
    if "table" in options.output_formats:
        outfile_table = getFile( "table ", options )
        outfile_table.write( """# CATEGORY:       category [intron|exon]
# METHOD:         alignment method
# TOKEN:          name
# ID:             segment id
# TOTAL:          number of segments
# LEN:            length of segment
# NALIGNED:       number of aligned positions
# PALIGNED:       percentage of aligned positions
# IDENT:          number of identical positions
# TRANSIT:        number of transitions
# TRANSVERS:      number of transversion
# MATCHES:        number of matching positions
# PIDENT:         percentage of identical positions
# PTRANSIT:       precentage of transitions
# PTRANSVERS:     precentage of transversion
# BLOCKSIZES:     alignment, length of blocks
# GAPS:           gap sizes in sequence 1/2
CATEGORY\tMETHOD\tTOKEN1\tID1\tTOTAL1\tLEN1\tTOKEN2\tID2\tTOTAL2\tLEN2\tNALIGNED\tPALIGNED\tIDENT\tTRANSIT\tTRANSVER\tMATCHES\tPIDENT\tPTRANSVIT\tPTRANVER\tBLOCKSIZES\tGAPSIZES\tGAPSIZES\tTYPE1\tTYPE2\n""")

    if "fasta" in options.output_formats:
        outfile_fasta = getFile( "fasta", options )

    if "psl" in options.output_formats:
        outfile_psl = getFile( "psl", options )

    # setup alignment objects
    for unaligned_pair in iterator:

        ninput += 1
        
        for method in options.methods:

            pair = AlignedPairs.AlignedPair( unaligned_pair )
            pair.mOptionsBlastZ = options.options_blastz

            try:
                pair.Align( method, anchor = options.anchor_alignment )
            except AlignedPairs.AlignmentError as msg:
                
                if options.loglevel >= 1:
                    options.stdlog.write( "# %s - %s: %s\n" % (msg, unaligned_pair.mToken1, unaligned_pair.mToken2))
                    if options.loglevel >= 2:
                        options.stdlog.write( "# input=%s\n" % (str(unaligned_pair)))

                nskipped += 1
                continue

            if outfile_table:
                outfile_table.write( str(pair) + "\n" )
            
            if outfile_fasta:
                outfile_fasta.write( ">%s\n%s\n>%s\n%s\n" % (pair.mToken1, pair.mAlignedSequence1, pair.mToken2, pair.mAlignedSequence2 ) )

            if outfile_psl:
                entry = Blat.Match()
                entry.mQueryId, entry.mSbjctId = pair.mToken1, pair.mToken2
                entry.strand = pair.strand
                entry.fromMap( pair.mAlignment )
                outfile_psl.write( str(entry) + "\n" )

            npairs += 1
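
iterate_double_fasta is not shown in this excerpt. A hypothetical sketch of its contract: read two FASTA files in lock-step and yield the i-th records as a pair (read_fasta is likewise hypothetical; the real script presumably builds AlignedPairs objects rather than plain tuples):

def read_fasta(path):
    # minimal FASTA reader: yields (title, sequence) tuples
    title, seq = None, []
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith(">"):
                if title is not None:
                    yield title, "".join(seq)
                title, seq = line[1:], []
            elif line:
                seq.append(line)
    if title is not None:
        yield title, "".join(seq)

def iterate_double_fasta(path1, path2):
    # pair up records positionally; stops at the shorter file
    yield from zip(read_fasta(path1), read_fasta(path2))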
Example 9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    psl = None

    def chain_iterator(infile):
        lines = []
        for line in infile:

            if line.startswith("#"): continue
            if line.strip() == "": continue
            if line.startswith("chain"):
                if lines: yield lines
                lines = []
            lines.append(line)

        yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1
        psl = Blat.Match()

        (_, _, psl.mSbjctId, target_length, target_strand, target_start,
         target_end, psl.mQueryId, query_length, query_strand, query_start,
         query_end, alignment_id) = lines[0][:-1].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start, query_end, query_length,
              target_start, target_end, target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line[:-1].split()]
            map_query2target.addDiagonal(qstart, qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        size = int(lines[-1][:-1])

        map_query2target.addDiagonal(qstart, qstart + size, tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert (target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = psl.mQueryLength - psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
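
The per-line arithmetic above follows the UCSC chain format: each data line carries an ungapped block size plus the gaps dt (target) and dq (query) before the next block, and the final line is a bare size. A standalone sketch (chain_blocks is a hypothetical name):

def chain_blocks(qstart, tstart, lines):
    # accumulate (query_start, target_start, size) triples, advancing
    # each coordinate by the block size plus its gap
    blocks = []
    for line in lines[:-1]:
        size, dt, dq = (int(x) for x in line.split())
        blocks.append((qstart, tstart, size))
        qstart += size + dq
        tstart += size + dt
    size = int(lines[-1])
    blocks.append((qstart, tstart, size))
    return blocks

assert chain_blocks(0, 100, ["10 5 0", "20"]) == [(0, 100, 10), (10, 115, 20)]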
Example 10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: maq2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-c", "--filename-coordinates", dest="filename_coordinates", type="string",
                      help="filename with coordinates."  )

    parser.add_option( "-p", "--output-filename-pattern", dest="output_filename_pattern", type="string" ,
                       help="OUTPUT filename pattern for additional data [%default].")

    parser.set_defaults(
        genome_file = "genome",
        filename_coordinates = None,
        segment_length = 32,
        )

    (options, args) = E.Start( parser )

    if options.genome_file:
        genome = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        genome = None

    ninput, noutput = 0, 0

    if options.filename_coordinates:
        segment_length = options.segment_length
        a = matchby_sequence(  iterator_segments( open( options.filename_coordinates, "r"), options.segment_length ),
                               Maq.iterator( options.stdin ),
                               lambda x: (x.mSegment),
                               lambda x: (x.contig) )
        
        for segments, maqs in a:
            pairs = match_smaller( segments, maqs,
                                   lambda x: x.start, 
                                   lambda x: x.start ) 

            for segment, maq in pairs:
                ninput += 1

                assert maq.start >= segment.start, "maq start < segment start: %i < %i" % (maq.start, segment.start)
                assert maq.start + maq.mLength <= segment.start + 2 * segment_length, "maq end > segment end: %i > %i" % (maq.start + maq.mLength, segment.start + 2 * segment_length)
        
                psl = Blat.Match()
                psl.fromMaq( maq )

                match_start = maq.start
                segment_start = segment.start
                contig, left_start, right_start = segment.contig, segment.mLeftStart, segment.mRightStart

                if options.loglevel >= 2:
                    options.stdlog.write("# mapping: name=%s, match_start=%i, segment=%s\n" % (maq.contig, match_start, str(segment)))

                # build positions of the two blocks
                left_size = segment_length - (match_start - segment_start)
                right_size = segment_length - left_size
                mapped1_start = left_start + match_start - segment_start
                mapped1_end   = left_start + segment_length
                mapped2_start = right_start
                mapped2_end   = right_start + right_size

                if options.loglevel >= 3:
                    options.stdlog.write("# mapped: match_start=%i, segment_start=%i, left_size=%i, right_size=%i, mapped1=(%i-%i), mapped2=(%i-%i)\n" %\
                                             (match_start, segment_start, left_size, right_size, mapped1_start, mapped1_end, mapped2_start, mapped2_end) )


                psl.mSbjctId = contig
                if genome: psl.mSbjctLength = genome.getLength( contig )
                psl.mSbjctFrom = mapped1_start
                psl.mSbjctTo = mapped2_end
                psl.mNBlocks = 2
                psl.mBlockSizes = [left_size, right_size]
                psl.mQueryBlockStarts = [0, left_size]
                psl.mSbjctBlockStarts = [mapped1_start, mapped2_start]
                psl.mSbjctNGapsCounts = 1
                psl.mSbjctNGapsBases = mapped2_start - mapped1_end

                options.stdout.write( str(psl) + "\n" )
                noutput += 1

    else:

        for maq in Maq.iterator( options.stdin ):
            ninput += 1

            psl = Blat.Match()
            psl.fromMaq( maq )

            options.stdout.write( str(psl) + "\n" )
            noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write( "# ninput=%i, noutput=%i\n" % (ninput, noutput) )

    E.Stop()
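
The two-block split above distributes a read of segment_length bases across the left and right halves of a junction segment. A standalone sketch of the same arithmetic (split_blocks is a hypothetical name):

def split_blocks(match_start, segment_start, left_start, right_start,
                 segment_length):
    # bases before the junction stay in the left half; the remainder
    # (segment_length - left_size) spills into the right half
    left_size = segment_length - (match_start - segment_start)
    right_size = segment_length - left_size
    return ((left_start + match_start - segment_start,
             left_start + segment_length),
            (right_start, right_start + right_size))

assert split_blocks(5, 0, 100, 200, 32) == ((105, 132), (200, 205))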