def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--mask-lowercase", dest="mask_lowercase",
                      action="store_true",
                      help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option("--with-match", dest="with_match",
                      action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option("--without-match", dest="with_match",
                      action="store_false",
                      help="do not echo the match in output [default=%default]")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("counts", "baseml", "match",
                               "query-counts", "sbjct-counts"),
                      help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    (options, args) = E.Start(parser)

    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write("\t".join(
        [header, ] +
        ["\t".join(x.getHeaders()) for x in counters] +
        ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        if options.with_match:
            options.stdout.write(str(match))
        else:
            options.stdout.write(match.mQueryId)

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non-printable characters - these sometimes
            # appear after using pslToPslX
            qseq = [re.sub("[^a-zA-Z]", "N", x) for x in qseq]
            sseq = [re.sub("[^a-zA-Z]", "N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [re.sub("[a-z]", "N", x) for x in qseq]
                sseq = [re.sub("[a-z]", "N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters]))

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            options.stdout.write(
                "\t" + "\t".join([str(counter) for counter in counters_plain]))

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
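# Hedged usage sketch (file names are hypothetical, not from the source): compute
# per-pair nucleotide counts for alignments in a pslx file read from stdin and write
# a tab-separated table to stdout:
#
#   python psl2table.py --method=counts < in.pslx > counts.tsv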
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries",
                      dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries. Adjacent exons of a "
                      "transcript will still be merged [default=%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(sys.stdin), feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        entry = Blat.Match()
        # all exons of a transcript share the same identifiers, so the
        # first entry is representative
        gff = gffs[0]
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
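# Hedged usage sketch (file names are hypothetical): convert transcript models in GTF
# format to PSL, filling in sequence lengths from indexed fasta files when supplied:
#
#   python gff2psl.py --is-gtf --genome-file=genome \
#       --input-filename-queries=transcripts.fa < models.gtf > models.psl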
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-t", "--tablename", dest="tablename",
                      type="string",
                      help="tablename to get variants from (in samtools "
                      "pileup format) [default=%default].")

    parser.add_option("-d", "--database", dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option("-f", "--exons-file", dest="filename_exons",
                      type="string",
                      help="filename with transcript model information "
                      "(gtf formatted file) [default=%default].")

    parser.add_option("-r", "--filename-reference", dest="filename_reference",
                      type="string",
                      help="filename with transcript models of a reference "
                      "gene set. Stop codons that do not overlap any of the "
                      "exons in this file are ignored (gtf-formatted file) "
                      "[default=%default].")

    parser.add_option("--vcf-file", dest="filename_vcf",
                      type="string",
                      help="filename with variants in VCF format. Should be "
                      "indexed by tabix [default=%default].")

    parser.add_option("--pileup-file", dest="filename_pileup",
                      type="string",
                      help="filename with variants in samtools pileup format. "
                      "Should be indexed by tabix [default=%default].")

    parser.add_option("--vcf-sample", dest="vcf_sample",
                      type="string",
                      help="sample id for species of interest in vcf "
                      "formatted file [default=%default].")

    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno",
                      type="string",
                      help="filename of a list of transcript ids that are "
                      "selenoproteins [default=%default].")

    parser.add_option("-m", "--module", dest="modules",
                      type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")

    parser.add_option("-o", "--output-section", dest="output",
                      type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")

    parser.add_option("-k", "--with-knockouts", dest="with_knockouts",
                      action="store_true",
                      help="add alleles that are knocked out to fasta and "
                      "gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = {}

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been
                # knocked out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):

                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = \
                                    lcontig - gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = \
                                lcontig - gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand
                # coordinates
                if allele.reference_first_stop_start >= 0 \
                        and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id, transcript_id, allele_id,
                                   contig, strand, "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
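# Hedged usage sketch (file and sample names are hypothetical): derive alleles for the
# transcripts in a GTF stream, taking variants from a tabix-indexed VCF; the section
# files (cds.fasta, peptides.fasta, table, gtf, map.psl) are written via E.openOutputFile:
#
#   python gtf2alleles.py --genome-file=genome --vcf-file=calls.vcf.gz \
#       --vcf-sample=NA12878 --output-section=all < transcripts.gtf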
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(
        genome_file=None,
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    psl = Blat.Match()

    for bed in Bed.iterator(options.stdin):

        ninput += 1

        start, end = bed.start, bed.end

        if "blockSizes" in bed:
            psl.mQueryId = bed["name"]
            blocksizes = [int(x) for x in bed["blockSizes"].split(",")[:-1]]
            sbjctblockstarts = [int(x) + start
                                for x in bed["blockStarts"].split(",")[:-1]]
            strand = bed["strand"]
        else:
            psl.mQueryId = "%i" % ninput
            blocksizes = [end - start]
            sbjctblockstarts = [start, ]
            strand = "+"

        psl.mSbjctId = bed.contig
        psl.mSbjctFrom, psl.mSbjctTo = start, end
        psl.mQueryFrom, psl.mQueryTo = 0, end - start

        psl.mBlockSizes = blocksizes
        psl.mNBlocks = len(blocksizes)
        psl.strand = strand

        q, qp = [], 0
        for x in blocksizes:
            q.append(qp)
            qp += x
        psl.mQueryBlockStarts = q
        psl.mSbjctBlockStarts = sbjctblockstarts
        psl.mQueryLength = sum(psl.mBlockSizes)
        if fasta:
            psl.mSbjctLength = fasta.getLength(bed.contig)

        options.stdout.write("%s\n" % str(psl))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
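# Hedged usage sketch (file names are hypothetical): convert BED intervals read from
# stdin to PSL, taking target lengths from an indexed genome when available:
#
#   python bed2psl.py --genome-file=genome < in.bed > out.psl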
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.query is None or options.target is None:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    reader = maf.Reader(options.stdin)

    psl = Blat.Match()
    for cc in threaditer(reader, (options.query, options.target)):
        ninput += 1
        query, target = cc

        # treat identifiers like Hsap.GL000223.1
        try:
            data = query.src.split(".")
            qs, qcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse query %s: msg=%s" % (query.src, msg))
        try:
            data = target.src.split(".")
            ts, tcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse target %s: msg=%s" % (target.src, msg))

        assert qs == options.query
        assert ts == options.target

        psl.mQueryId = qcontig
        psl.mSbjctId = tcontig

        psl.fromPair(query.start, query.src_size, query.strand,
                     query.text.upper(),
                     target.start, target.src_size, target.strand,
                     target.text.upper())

        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (qs, qcontig, query.start, query.src_size,
                 query.strand, query.text))
        E.debug("%s\t%s\t%i\t%i\t%s\t%s" %
                (ts, tcontig, target.start, target.src_size,
                 target.strand, target.text))

        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
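# Hedged usage sketch (the species labels are examples only; "Hsap" follows the
# identifier convention noted in the code): extract the pairwise alignment between two
# tracks of a MAF file and write it as PSL:
#
#   python maf2psl.py --query=Hsap --target=Ptro < in.maf > out.psl
#
# The two identifiers may alternatively be given as positional arguments.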
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--queries-tsv-file", dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries - required for "
                      "polyA analysis [%default].")

    parser.add_option("--polyA", dest="polyA", action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on "
                      "aggregate coverages [%default].")

    parser.add_option("--output-filename-empty", dest="output_filename_empty",
                      type="string",
                      help="OUTPUT filename with queries for which all "
                      "matches have been discarded [%default].")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice", choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped",
                      action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid", dest="threshold_min_pid",
                      type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option("--threshold-min-matches", dest="threshold_min_matches",
                      type="int",
                      help="minimum threshold for number of matching "
                      "residues [%default].")

    parser.add_option("--threshold-max-error-rate",
                      dest="threshold_max_error_rate", type="float",
                      help="maximum threshold for error of aligned part "
                      "[%default].")

    parser.add_option("--threshold-good-query-coverage",
                      dest="threshold_good_query_coverage", type="float",
                      help="minimum query coverage for segments to be "
                      "counted as good [%default].")

    parser.add_option("--threshold-min-query-coverage",
                      dest="threshold_min_query_coverage", type="float",
                      help="minimum query coverage for segments to be "
                      "accepted [%default].")

    parser.add_option("--threshold-max-query-gapchars",
                      dest="threshold_max_query_gapchars", type="int",
                      help="maximum number of gap characters in query "
                      "[%default].")

    parser.add_option("--threshold-max-query-gaps",
                      dest="threshold_max_query_gaps", type="int",
                      help="maximum number of gaps in query [%default].")

    parser.add_option("--threshold-max-sbjct-gapchars",
                      dest="threshold_max_sbjct_gapchars", type="int",
                      help="maximum number of gap characters in sbjct "
                      "[%default].")

    parser.add_option("--keep-unique-matches", dest="keep_unique_matches",
                      action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option("--keep-all-best", dest="keep_all_best",
                      action="store_true",
                      help="when sorting matches, keep all matches within "
                      "the collection threshold [%default].")

    parser.add_option("--output-best-per-subject", dest="best_per_sbjct",
                      action="store_true",
                      help="keep only the best entry per sbjct (for "
                      "transcript mapping) [%default].")

    parser.add_option("--threshold-max-sbjct-gaps",
                      dest="threshold_max_sbjct_gaps", type="int",
                      help="maximum number of gaps in sbjct [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("-m", "--matching-mode", dest="matching_mode",
                      type="choice",
                      choices=("best-coverage", "best-query-coverage",
                               "best-sbjct-coverage", "best-pid",
                               "best-covpid", "best-query-covpid",
                               "best-sbjct-covpid", "best-min-covpid",
                               "best-query-min-covpid",
                               "best-sbjct-min-covpid", "unique", "all"),
                      help="determines how to select the best match "
                      "[%default].")
    parser.add_option("--subjctfilter-tsv-file", dest="filename_filter_sbjct",
                      type="string",
                      help="gff file for filtering sbjct matches. Matches "
                      "overlapping these regions are discarded, but see "
                      "--keep-forbidden [%default].")

    parser.add_option("--keep-forbidden", dest="keep_forbidden",
                      action="store_true",
                      help="if set, keep only matches that overlap the "
                      "regions supplied with --subjctfilter-tsv-file "
                      "[%default].")

    parser.add_option("--query-forward-coordinates",
                      dest="query_forward_coordinates", action="store_true",
                      help="use forward coordinates for query, strand will "
                      "refer to sbjct [%default].")

    parser.add_option("--ignore-all-random", dest="ignore_all_random",
                      action="store_true",
                      help="if there are multiple best matches, ignore all "
                      "those to chrUn and _random [%default].")

    parser.add_option("--collection-threshold", dest="collection_threshold",
                      type="float",
                      help="threshold for collecting matches, percent of "
                      "best score [%default].")

    parser.add_option("--collection-distance", dest="collection_distance",
                      type="float",
                      help="threshold for collecting matches, difference to "
                      "best score [%default].")

    parser.set_defaults(input_filename_domains=None,
                        input_filename_queries=None,
                        threshold_good_query_coverage=90.0,
                        threshold_min_pid=30.0,
                        threshold_min_matches=0,
                        threshold_max_error_rate=None,
                        output_filename_pattern="%s",
                        keep_unique_matches=False,
                        output_format="map",
                        print_matched=["full", "partial", "good"],
                        from_zipped=False,
                        combine_overlaps=True,
                        min_length_domain=30,
                        threshold_min_query_coverage=50,
                        min_length_singletons=30,
                        new_family_id=10000000,
                        add_singletons=False,
                        matching_mode="best-coverage",
                        best_per_sbjct=False,
                        threshold_max_query_gapchars=None,
                        threshold_max_query_gaps=None,
                        threshold_max_sbjct_gapchars=None,
                        threshold_max_sbjct_gaps=None,
                        filename_filter_sbjct=None,
                        keep_forbidden=False,
                        keep_all_best=False,
                        test=None,
                        query_forward_coordinates=False,
                        output_filename_empty=None,
                        collection_threshold=1.0,
                        collection_distance=0,
                        polyA=False,
                        # max residues missing from non polyA end
                        polyA_max_unaligned=3,
                        # min residues in tail
                        polyA_min_unaligned=10,
                        # min percent residues that are A/T in tail
                        polyA_min_percent=70.0,
                        # ignore duplicate matches if they are on Un or
                        # _random
                        ignore_all_random=False,
                        )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) == 1:
        if options.from_zipped or args[0][-3:] == ".gz":
            import gzip
            infile = gzip.open(args[0], "r")
        else:
            infile = IOTools.openFile(args[0], "r")
    else:
        infile = sys.stdin

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    if options.filename_filter_sbjct:

        try:
            import bx.intervals.intersection
        except ImportError:
            raise ValueError("filtering for intervals requires the bx tools")

        intervals = GTF.readGFFFromFileAsIntervals(
            IOTools.openFile(options.filename_filter_sbjct, "r"))

        intersectors = {}

        for contig, values in list(intervals.items()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in values:
                intersector.add_interval(bx.intervals.Interval(start, end))
            intersectors[contig] = intersector

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read %i intervals for %i contigs.\n" %
                (sum([len(x) for x in list(intervals.values())]),
                 len(intersectors)))
    else:
        intersectors = None

    ################################################
    # processing of a chunk (matches of same query)
    ################################################
    ninput, noutput, nskipped = 0, 0, 0

    # number of sequences with full/partial/good matches
    nfull_matches, npartial_matches, ngood_matches = 0, 0, 0
    # number of sequences which are fully/good/partially matched
    # i.e., after combining all aligned regions
    nfully_matched, npartially_matched, nwell_matched = 0, 0, 0

    nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0
    nremoved_gaps, nremoved_nmatches = 0, 0
    nremoved_regions = 0
    nqueries_removed_region = 0

    aggregate_coverages = []
    mapped_coverages = []
    fully_matched = []
    well_matched = []
    partially_matched = []
    new_family_id = options.new_family_id

    if options.output_filename_empty:
        outfile_empty = IOTools.openFile(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = IOTools.openFile(
            options.output_filename_pattern % "polyA", "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id"""

        nonlocal ninput, noutput, nskipped
        nonlocal nfull_matches, npartial_matches, ngood_matches
        nonlocal nremoved_pid, nremoved_query_coverage, nempty, \
            nremoved_gaps, nremoved_nmatches
        nonlocal nremoved_regions, nqueries_removed_region

        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentary matches
        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * math.pow(
                    options.threshold_max_error_rate,
                    match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tall matches removed after applying thresholds: "
                    "before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                     x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                if options.threshold_max_query_gaps and \
                        match.mQueryNGapsCounts > options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and \
                        match.mQueryNGapsBases > options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and \
                        match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and \
                        match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)

            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tall matches removed after applying thresholds: "
                    "before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                     x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        # Remove queries matching to a forbidden region. This section
        # will remove the full query if any of its matches falls in a
        # forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if (found and not options.keep_forbidden) or \
                        (not found and options.keep_forbidden):
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" % "\t".join(map(str, (
                        match.mQueryId, match.mSbjctId,
                        match.strand,
                        "%5.2f" % match.mQueryCoverage,
                        "%5.2f" % match.mSbjctCoverage,
                        "%5.2f" % match.mPid,
                        match.mQueryLength,
                        match.mSbjctLength,
                        match.mQueryFrom, match.mQueryTo,
                        match.mSbjctFrom, match.mSbjctTo,
                        ",".join(map(str, match.mBlockSizes)),
                        ",".join(map(str, match.mQueryBlockStarts)),
                        ",".join(map(str, match.mSbjctBlockStarts)),
                    ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            nempty += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(
            ("query_id", "sbjct_id", "sstrand",
             "qcoverage", "scoverage", "pid",
             "qlen", "slen",
             "qfrom", "qto", "sfrom", "sto",
             "blocks", "qstarts", "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    # main loop
    ################################################
    nfully_covered = None
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0
    skip = 0

    iterator = Blat.BlatIterator(infile)

    while 1:

        try:
            match = next(iterator)
        except Blat.ParsingError:
            iterator = Blat.BlatIterator(infile)
            continue

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)
    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# alignments: ninput=%i, is_complete=%s\n" %
            (ninput_lines, str(is_complete)))
        options.stdlog.write(
            "# queries: ninput=%i, noutput=%i\n" % (ninput, noutput))
        options.stdlog.write(
            "# individual coverage: full=%i, good=%i, partial=%i\n" %
            (nfull_matches, ngood_matches, npartial_matches))
        options.stdlog.write(
            "# aggregate coverage: full=%i, good=%i, partial=%i\n" %
            (len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write(
            "# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %
            (nskipped + nqueries_removed_region + nempty,
             nskipped, nqueries_removed_region, nempty))
        options.stdlog.write(
            "# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n" %
            (nremoved_pid, nremoved_query_coverage, nremoved_gaps,
             nremoved_regions, nremoved_nmatches))

    E.Stop()
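# Hedged usage sketch (file names are hypothetical): keep the best match per query from
# BLAT output and emit a tab-separated map, logging discarded queries separately:
#
#   python psl2map.py --matching-mode=best-coverage --output-format=map \
#       --output-filename-empty=empty.tsv < in.psl > best.map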
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: align_pairs.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--skip-statistics", dest="skip_stats",
                      action="store_true",
                      help="do not compute alignment statistics [%default].")

    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("dialign", "clustal", "blastz", "nw", "sw",
                               "dba", "dialignlgs"),
                      help="alignment method [%default].")

    parser.add_option("--anchor-alignment", dest="anchor_alignment",
                      type="int",
                      help="anchor alignment with xxx residues [%default].")

    parser.add_option("--output-format", dest="output_formats", type="choice",
                      action="append",
                      choices=("fasta", "stats", "psl"),
                      help="output formats [%default].")

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("fasta", "list"),
                      help="input format of stdin [%default].")

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="output pattern for multiple files [%default].")

    parser.add_option("--filename-sequences1", dest="filename_sequences1",
                      type="string",
                      help="first indexed input filename with sequences [%default].")

    parser.add_option("--filename-sequences2", dest="filename_sequences2",
                      type="string",
                      help="second indexed input filename with sequences [%default].")

    parser.add_option("--options-blastz", dest="options_blastz", type="string",
                      help="command line options for blastz [%default].")

    parser.set_defaults(
        skip_stats=False,
        methods=[],
        output_formats=[],
        input_format="fasta",
        output_filename_pattern=None,
        filename_sequences1=None,
        filename_sequences2=None,
        anchor_alignment=0,
        options_blastz="C=2 B=1 T=0 W=6 K=2200")

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(options.methods) == 0:
        print(USAGE)
        print("please specify an alignment method.")
        sys.exit(1)

    if len(options.output_formats) == 0:
        print(USAGE)
        print("please specify at least one output format.")
        sys.exit(1)

    if len(args) == 2:
        iterator = iterate_double_fasta(args[0], args[1])
    elif options.filename_sequences1 and options.filename_sequences2:
        if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
            infile = options.stdin
        elif len(args) == 1:
            infile = open(args[0], "r")

        iterator = iterate_list(infile,
                                options.filename_sequences1,
                                options.filename_sequences2)
    else:
        iterator = iterate_single_fasta(options.stdin)

    npairs, ntoken_pairs = 0, 0
    ninput, nskipped, nerrors = 0, 0, 0

    outfile_table = None
    outfile_fasta = None
    outfile_psl = None

    if "table" in options.output_formats:
        outfile_table = getFile("table", options)
        outfile_table.write(
            """# CATEGORY: category [intron|exon]
# METHOD: alignment method
# TOKEN: name
# ID: segment id
# TOTAL: number of segments
# LEN: length of segment
# NALIGNED: number of aligned positions
# PALIGNED: percentage of aligned positions
# IDENT: number of identical positions
# TRANSIT: number of transitions
# TRANSVERS: number of transversions
# MATCHES: number of matching positions
# PIDENT: percentage of identical positions
# PTRANSIT: percentage of transitions
# PTRANSVERS: percentage of transversions
# BLOCKSIZES: alignment, length of blocks
# GAPS: gap sizes in sequence 1/2
CATEGORY\tMETHOD\tTOKEN1\tID1\tTOTAL1\tLEN1\tTOKEN2\tID2\tTOTAL2\tLEN2\tNALIGNED\tPALIGNED\tIDENT\tTRANSIT\tTRANSVER\tMATCHES\tPIDENT\tPTRANSIT\tPTRANSVER\tBLOCKSIZES\tGAPSIZES\tGAPSIZES\tTYPE1\tTYPE2\n""")

    if "fasta" in options.output_formats:
        outfile_fasta = getFile("fasta", options)

    if "psl" in options.output_formats:
        outfile_psl = getFile("psl", options)

    # setup alignment objects
    for unaligned_pair in iterator:

        ninput += 1

        for method in options.methods:

            pair = AlignedPairs.AlignedPair(unaligned_pair)
            pair.mOptionsBlastZ = options.options_blastz

            try:
                pair.Align(method, anchor=options.anchor_alignment)
            except AlignedPairs.AlignmentError as msg:

                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# %s - %s: %s\n" %
                        (msg, unaligned_pair.mToken1, unaligned_pair.mToken2))
                if options.loglevel >= 2:
                    options.stdlog.write("# input=%s\n" % (str(unaligned_pair)))

                nskipped += 1
                continue

            if outfile_table:
                outfile_table.write(str(pair) + "\n")

            if outfile_fasta:
                outfile_fasta.write(">%s\n%s\n>%s\n%s\n" %
                                    (pair.mToken1, pair.mAlignedSequence1,
                                     pair.mToken2, pair.mAlignedSequence2))

            if outfile_psl:
                entry = Blat.Match()
                entry.mQueryId, entry.mSbjctId = pair.mToken1, pair.mToken2
                entry.strand = pair.strand
                entry.fromMap(pair.mAlignment)
                outfile_psl.write(str(entry) + "\n")

            npairs += 1
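# Hedged usage sketch (file names are hypothetical): align corresponding entries of two
# fasta files with the "nw" method and write the alignments as fasta:
#
#   python align_pairs.py --method=nw --output-format=fasta pairs_a.fasta pairs_b.fasta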
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    psl = None

    def chain_iterator(infile):
        lines = []
        for line in infile:

            if line.startswith("#"):
                continue
            if line.strip() == "":
                continue
            if line.startswith("chain"):
                if lines:
                    yield lines
                lines = []
            lines.append(line)

        yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1
        psl = Blat.Match()

        (_, _,
         psl.mSbjctId,
         target_length,
         target_strand,
         target_start,
         target_end,
         psl.mQueryId,
         query_length,
         query_strand,
         query_start,
         query_end,
         alignment_id) = lines[0][:-1].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start, query_end, query_length,
              target_start, target_end, target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line[:-1].split()]
            map_query2target.addDiagonal(qstart,
                                         qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        size = int(lines[-1][:-1])

        map_query2target.addDiagonal(qstart,
                                     qstart + size,
                                     tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert (target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = \
                psl.mQueryLength - psl.mQueryTo, \
                psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
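# Hedged usage sketch (file names are hypothetical): convert a UCSC liftOver chain file
# read from stdin to PSL; each chain block (header line followed by size/dt/dq triplets)
# becomes one PSL match:
#
#   python chain2psl.py < over.chain > over.psl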
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: maq2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-c", "--filename-coordinates",
                      dest="filename_coordinates", type="string",
                      help="filename with coordinates.")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename pattern for additional data [%default].")

    parser.set_defaults(
        genome_file="genome",
        filename_coordinates=None,
        segment_length=32,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        genome = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome = None

    ninput, noutput = 0, 0

    if options.filename_coordinates:

        segment_length = options.segment_length

        a = matchby_sequence(
            iterator_segments(open(options.filename_coordinates, "r"),
                              options.segment_length),
            Maq.iterator(options.stdin),
            lambda x: (x.mSegment),
            lambda x: (x.contig))

        for segments, maqs in a:

            pairs = match_smaller(segments, maqs,
                                  lambda x: x.start,
                                  lambda x: x.start)

            for segment, maq in pairs:

                ninput += 1

                assert maq.start >= segment.start, \
                    "maq start < segment start: %i < %i" % \
                    (maq.start, segment.start)
                assert maq.start + maq.mLength <= \
                    segment.start + 2 * segment_length, \
                    "maq end > segment end: %i < %i" % \
                    (maq.start + maq.mLength,
                     segment.start + 2 * segment_length)

                psl = Blat.Match()
                psl.fromMaq(maq)

                match_start = maq.start
                segment_start = segment.start
                contig, left_start, right_start = \
                    segment.contig, segment.mLeftStart, segment.mRightStart

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# mapping: name=%s, match_start=%i, segment=%s\n" %
                        (maq.contig, match_start, str(segment)))

                # build positions of the two blocks
                left_size = segment_length - (match_start - segment_start)
                right_size = segment_length - left_size
                mapped1_start = left_start + match_start - segment_start
                mapped1_end = left_start + segment_length
                mapped2_start = right_start
                mapped2_end = right_start + right_size

                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# mapped: match_start=%i, segment_start=%i, "
                        "left_size=%i, right_size=%i, mapped1=(%i-%i), "
                        "mapped2=(%i-%i)\n" %
                        (match_start, segment_start, left_size, right_size,
                         mapped1_start, mapped1_end,
                         mapped2_start, mapped2_end))

                psl.mSbjctId = contig
                if genome:
                    psl.mSbjctLength = genome.getLength(contig)
                psl.mSbjctFrom = mapped1_start
                psl.mSbjctTo = mapped2_end
                psl.mNBlocks = 2
                psl.mBlockSizes = [left_size, right_size]
                psl.mQueryBlockStarts = [0, left_size]
                psl.mSbjctBlockStarts = [mapped1_start, mapped2_start]
                psl.mSbjctNGapsCounts = 1
                psl.mSbjctNGapsBases = mapped2_start - mapped1_end

                options.stdout.write(str(psl) + "\n")
                noutput += 1
    else:
        for maq in Maq.iterator(options.stdin):
            ninput += 1
            psl = Blat.Match()
            psl.fromMaq(maq)
            options.stdout.write(str(psl) + "\n")
            noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i\n" % (ninput, noutput))

    E.Stop()
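# Hedged usage sketch (file names are hypothetical): convert maq matches read from stdin
# to PSL; with --filename-coordinates, reads spanning a segment junction are split into
# two blocks:
#
#   python maq2psl.py --genome-file=genome --filename-coordinates=segments.txt \
#       < in.maq > out.psl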