Example #1
    def ProcessChunk(chunk, eliminated_predictions, exons):
        """process a cluster of overlapping predictions.

        Chunks are sorted by first position, so only an earlier
        prediction can span a later one.
        """

        eliminated = []
        for x in range(0, len(chunk) - 1):
            xfrom, xto, xid, xquality = chunk[x]
            if xquality in options.quality_keep_gene_spanners:
                continue
            for y in range(x + 1, len(chunk)):
                yfrom, yto, yid, yquality = chunk[y]
                # print xid, yid, xfrom < yfrom, xto > yto,
                # Exons.CheckOverlap(exons[xid], exons[yid] ), xquality,
                # yquality
                if xfrom < yfrom and \
                        xto > yto and \
                        not Exons.CheckOverlap(exons[str(xid)], exons[str(yid)]) and \
                        yquality in options.quality_remove_gene_spanners:
                    eliminated_predictions[xid] = 0
                    eliminated.append((xid, "g"))
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# elimination: %s(%s) spans %s(%s)\n" %
                            (str(xid), xquality, str(yid), yquality))
                    break
        return eliminated
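
The elimination rule above can be exercised on toy data. A minimal sketch, assuming exons are plain (start, end) tuples and that "overlap" means sharing at least one base (a simplified stand-in for Exons.CheckOverlap; the helper names are hypothetical):

def exons_overlap(exons1, exons2):
    """True if any exon of exons1 shares a base with any exon of exons2."""
    for f1, t1 in exons1:
        for f2, t2 in exons2:
            if min(t1, t2) - max(f1, f2) > 0:
                return True
    return False

def spans_without_exon_overlap(outer, inner, exons):
    """True if `outer` fully contains `inner` on the genome but none of
    their exons overlap, i.e. `inner` lies within an intron of `outer`."""
    ofrom, oto, oid = outer
    ifrom, ito, iid = inner
    return ofrom < ifrom and oto > ito and \
        not exons_overlap(exons[oid], exons[iid])

# toy data: prediction 1 spans prediction 2, which sits in its intron
exons = {1: [(0, 100), (900, 1000)], 2: [(400, 500)]}
print(spans_without_exon_overlap((0, 1000, 1), (400, 500, 2), exons))  # True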
Example #2
def ResolveExonOverlaps(gene_id, predictions):
    """resolve overlaps between predictions based
    on exonic overlap."""

    all_exons = []
    n = 1
    for p in predictions:
        exons = Exons.Alignment2ExonBoundaries(Genomics.String2Alignment(
            p.mAlignmentString),
                                               query_from=0,
                                               sbjct_from=p.mSbjctGenomeFrom)

        for exon in exons:
            all_exons.append((exon.mGenomeFrom, exon.mGenomeTo, n))
        n += 1

    map_prediction2gene = range(0, len(predictions) + 1)
    map_gene2predictions = [None]
    for x in range(1, len(predictions) + 1):
        map_gene2predictions.append([x])

    all_exons.sort()
    # print all_exons

    # cluster exons by overlap
    last_exon_from, last_exon_to, last_p = all_exons[0]

    for exon_from, exon_to, p in all_exons[1:]:
        # if overlap
        if min(exon_to, last_exon_to) - max(exon_from, last_exon_from) > 0:
            # print "# overlap between %i and %i" % (p, last_p)
            # rewire pointers to point to gene of previous prediction
            # if they belong to different genes
            new_g = map_prediction2gene[last_p]
            old_g = map_prediction2gene[p]

            if new_g != old_g:
                for x in map_gene2predictions[old_g]:
                    map_gene2predictions[new_g].append(x)
                    map_prediction2gene[x] = new_g
                map_gene2predictions[old_g] = []

        # if no overlap: create new gene, if predictions has no gene
        # associated with it yet.
        else:
            # print "# no overlap between %i and %i" % (p, last_p)
            if not map_prediction2gene[p]:
                map_prediction2gene[p] = len(map_gene2predictions)
                map_gene2predictions.append([p])

        last_exon_to = max(last_exon_to, exon_to)
        last_p = p

    for x in range(1, len(map_gene2predictions)):
        if map_gene2predictions[x]:
            for p in map_gene2predictions[x]:
                print "%i\t%i" % (gene_id, predictions[p - 1].mPredictionId)
            gene_id += 1

    return gene_id
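
The pointer rewiring above implements single-linkage clustering of predictions by exon overlap. A compact, self-contained sketch of the same idea, assuming exons arrive as (genome_from, genome_to, prediction_id) tuples; dictionaries replace the 1-based parallel lists:

def cluster_by_exon_overlap(all_exons):
    """Group prediction ids whose exons overlap transitively."""
    all_exons = sorted(all_exons)
    gene_of = {}                      # prediction id -> gene id
    members = {}                      # gene id -> prediction ids
    next_gene = 1
    last_from, last_to, last_p = all_exons[0]
    gene_of[last_p] = 0
    members[0] = [last_p]
    for exon_from, exon_to, p in all_exons[1:]:
        if min(exon_to, last_to) - max(exon_from, last_from) > 0:
            g, old = gene_of[last_p], gene_of.get(p)
            if old is None:                 # first exon seen for p
                gene_of[p] = g
                members[g].append(p)
            elif old != g:                  # merge p's old gene into g
                for x in members.pop(old):
                    gene_of[x] = g
                    members[g].append(x)
        elif p not in gene_of:              # no overlap: open a new gene
            gene_of[p] = next_gene
            members[next_gene] = [p]
            next_gene += 1
        last_to = max(last_to, exon_to)
        last_p = p
    return [sorted(members[g]) for g in sorted(members) if members[g]]

print(cluster_by_exon_overlap(
    [(0, 100, 1), (50, 150, 2), (300, 400, 3), (390, 450, 1)]))
# [[1, 2, 3]]: the (390, 450) exon of prediction 1 bridges both clusters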
Example #3
def ClusterByExonCorrespondence(lengths={}, peptide_sequences=None):

    exons = Exons.ReadExonBoundaries(sys.stdin)
    if param_loglevel >= 1:
        print "# read exons for %i transcripts" % len(exons)

    if not lengths:
        for k in exons:
            lengths[k] = (exons[k][0].mPeptideTo / 3) + 1
            for e in exons[k][1:]:
                lengths[k] = max(lengths[k], (e.mPeptideTo / 3) + 1)

        if param_loglevel >= 1:
            print "# lengths for %i transcripts" % len(lengths)

    map_region2transcript = {}
    map_transcript2region = {}
    map_transcript2transcript = {}
    ## build map of regions to transcripts
    for t in exons:
        map_transcript2region[t] = []
        for e in exons[t]:
            r = "%s-%s-%i-%i" % (e.mSbjctToken, e.mSbjctStrand, e.mGenomeFrom,
                                 e.mGenomeTo)
            if r not in map_region2transcript: map_region2transcript[r] = []
            map_region2transcript[r].append(t)
            map_transcript2region[t].append(r)

    ## build map of transcript to transcript
    map_transcript2transcript = {}

    for t in map_transcript2region:
        map_transcript2transcript[t] = []
        for r in map_transcript2region[t]:
            for tt in map_region2transcript[r]:
                map_transcript2transcript[t].append(tt)

    for t in map_transcript2transcript:
        map_transcript2transcript[t].sort()
        l = None
        n = []
        for tt in map_transcript2transcript[t]:
            if t == tt: continue
            if l != tt: n.append(tt)
            l = tt
        map_transcript2transcript[t] = n

    ## cluster greedily, take longest transcript
    cluster_id = 1
    for t in map_transcript2region:
        if t not in map_transcript2transcript: continue
        cluster = CollectCluster(map_transcript2transcript, t)
        PrintCluster(cluster, cluster_id, lengths, peptide_sequences,
                     param_regex_preferred)
        cluster_id += 1

    if param_loglevel >= 1:
        print "# RESULT: %i transcripts in %i genes" % (
            len(map_transcript2region), cluster_id - 1)
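
The sort-and-skip loop above removes self-matches and duplicates from each neighbour list; with sets, the region-to-transcript inversion and the deduplication collapse into a few lines. A sketch, assuming transcripts are keyed by exact exon-coordinate strings as in the function:

def build_transcript_links(exons):
    """exons: transcript -> list of region keys. Returns transcript ->
    sorted transcripts sharing at least one identical exon region."""
    region2transcripts = {}
    for t, regions in exons.items():
        for r in regions:
            region2transcripts.setdefault(r, []).append(t)
    links = {}
    for t, regions in exons.items():
        partners = set()
        for r in regions:
            partners.update(region2transcripts[r])
        partners.discard(t)           # no self-links
        links[t] = sorted(partners)   # sorted and duplicate-free
    return links

exons = {"t1": ["chr1-+-10-200"],
         "t2": ["chr1-+-10-200", "chr1-+-300-400"],
         "t3": ["chr1-+-500-600"]}
print(build_transcript_links(exons))
# {'t1': ['t2'], 't2': ['t1'], 't3': []} (key order may vary)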
Example #4
def CheckSuboptimal(rep_id, exons, eliminated_predictions, other_ids,
                    map_prediction2data, options):

    overlaps = []

    # get predictions which overlap by exons (but not completely):
    for id in other_ids:
        if id == rep_id:
            continue
        if id in eliminated_predictions:
            continue
        if Exons.CheckOverlap(exons[rep_id], exons[id]) and \
                not Exons.CheckCoverage(exons[rep_id],
                                        exons[id],
                                        max_slippage=options.max_slippage):
            overlaps.append(id)

    rep = map_prediction2data[rep_id]
    identity = rep.mPid + options.suboptimal_min_identity_difference

    for x in range(0, len(overlaps) - 1):
        id1 = overlaps[x]
        d1 = map_prediction2data[id1]
        for y in range(x + 1, len(overlaps)):
            id2 = overlaps[y]
            d2 = map_prediction2data[id2]
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# suboptimal: %s ? %s + %s: %s %s %s %s %i %i %i\n" % (
                        rep_id,
                        id1,
                        id2,
                        d1.mQuality in options.quality_remove_suboptimal,
                        d2.mQuality in options.quality_remove_suboptimal,
                        not Exons.CheckOverlap(exons[id1], exons[id2]),
                        Exons.CheckCoverageAinB(
                            exons[rep_id],
                            exons[id1] + exons[id2],
                            min_terminal_exon_coverage=0.0),
                        rep.mPid,
                        d1.mPid,
                        d2.mPid,
                    ))

            if (d1.mQuality in options.quality_remove_suboptimal and
                    d2.mQuality in options.quality_remove_suboptimal) and \
                    not Exons.CheckOverlap(exons[id1], exons[id2]) and \
                    Exons.CheckContainedAinB(exons[rep_id], exons[id1] + exons[id2],
                                             min_terminal_exon_coverage=0.0) and \
                    (identity < d1.mPid) and \
                    (identity < d2.mPid):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# elimination: %s(%s) joins %s(%s) and %s(%s)\n" %
                        (rep_id, rep.mPid, id1, d1.mPid, id2, d2.mPid))
                return True

    return False
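
In words, the test above fires when two mutually non-overlapping predictions jointly reproduce the representative's exon structure, each at clearly higher identity. A toy sketch of the core containment-plus-identity check, with plain (start, end) exons and a simple containment test standing in for Exons.CheckContainedAinB (the real code also requires that the two candidates do not overlap each other and applies slippage tolerances):

def contained(a, b):
    """True if every exon in a lies within some exon in b."""
    return all(any(f2 <= f1 and t1 <= t2 for f2, t2 in b) for f1, t1 in a)

def is_joined_by(rep_exons, exons1, exons2, rep_pid, pid1, pid2,
                 min_identity_difference=2.0):
    threshold = rep_pid + min_identity_difference
    return contained(rep_exons, exons1 + exons2) and \
        threshold < pid1 and threshold < pid2

rep = [(0, 100), (200, 300)]
print(is_joined_by(rep, [(0, 100)], [(200, 300)], 90.0, 95.0, 96.0))  # True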
Example #5
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2,
               transcript2, peptide_map_a2b):

    if param_loglevel >= 3:
        for cd in cds1:
            print "#", str(cd)
        for cd in cds2:
            print "#", str(cd)
        print "# peptide_map_a2b", str(
            alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b))
        sys.stdout.flush()

    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2)

    if len(cds1) != len(cds2):
        if param_loglevel >= 4:
            print "# WARNING: different number of exons!"

    seq1 = alignlib_lite.makeSequence(transcript1)
    seq2 = alignlib_lite.makeSequence(transcript2)
    tmp_map_a2b = alignlib_lite.makeAlignmentVector()

    dialign = WrapperDialign.Dialign("-n")
    dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8")
    dba = WrapperDBA.DBA()
    #clustal = WrapperClustal.Clustal()

    matrix, gop, gep = global_substitution_matrix
    alignator_nw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix)
    alignator_sw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix)

    # concatenated alignments for exons:
    # 1: only the common parts
    ali_common1 = ""
    ali_common2 = ""

    e1, e2 = 0, 0
    while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom():
        e1 += 1
    while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom():
        e2 += 1

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = 0
        for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1):
            if dna_map_a2b.mapRowToCol(x) >= 0:
                nmapped += 1
        print "# nmapped=", nmapped
        print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b))

    # declare alignments used
    map_intron_a2b = alignlib_lite.makeAlignmentVector()

    result = Exons.CompareGeneStructures(cds1,
                                         cds2,
                                         map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print result.Pretty("#")

    nskipped_exons, nskipped_introns = 0, 0

    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]
        exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
        exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

        #######################################################################
        # write unaligned exons
        if param_write_exons:
            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo

            print str(pair)
            sys.stdout.flush()

        #######################################################################
        # build alignment for overlap of both exons
# tmp_map_a2b.clear()
# alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b,
# c1.mGenomeFrom + 1, c1.mGenomeTo )

# if param_loglevel >= 5:
# print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo)
# for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"):
# print "#", x
# if tmp_map_a2b.getLength() == 0:
# if param_loglevel >= 1:
# print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \
##                       (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2)
# print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\
##                       peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(peptide_map_a2b)
# print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\
##                       dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(dna_map_a2b)
# for cd in cds1: print "##", str(cd)
# for cd in cds2: print "##", str(cd)
##             nerrors += 1
# continue
##         data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b  ).split("\n"))
# if "caligned" in param_write_exons :
# print "exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1,
##                                                                                token2, e2,
##                                                                                data[0][0], data[0][2],
##                                                                                data[1][0], data[1][2],
# data[0][1], data[1][1] )
##         ali_common1 += data[0][1]
##         ali_common2 += data[1][1]
#######################################################################
# write alignment of introns for orthologous introns
# orthologous introns are between orthologous exons
        if param_write_introns:

            if last_e1 is not None:
                if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                    nskipped_introns += 1
                else:
                    pair = AlignedPairs.UnalignedPair()

                    intron_from1 = cds1[e1 - 1].mGenomeTo
                    intron_to1 = cds1[e1].mGenomeFrom
                    intron_from2 = cds2[e2 - 1].mGenomeTo
                    intron_to2 = cds2[e2].mGenomeFrom

                    intron_fragment1 = transcript1[intron_from1:intron_to1]
                    intron_fragment2 = transcript2[intron_from2:intron_to2]

                    if len(intron_fragment1) == 0 or len(
                            intron_fragment2) == 0:
                        print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\
                              (intron_from1, intron_to1, len(transcript1),
                               intron_from2, intron_to2, len(transcript2))
                        continue

                    pair.mCategory = "intron"
                    pair.mToken1 = token1
                    pair.mId1 = e1 + 1
                    pair.mNum1 = len(cds1) - 1
                    pair.mLen1 = len(intron_fragment1)
                    pair.mFrom1 = intron_from1
                    pair.mTo1 = intron_to1
                    pair.mSequence1 = intron_fragment1
                    pair.mToken2 = token2
                    pair.mId2 = e2 + 1
                    pair.mNum2 = len(cds2) - 1
                    pair.mLen2 = len(intron_fragment2)
                    pair.mFrom2 = intron_from2
                    pair.mTo2 = intron_to2
                    pair.mSequence2 = intron_fragment2

                    if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \
                            (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \
                            (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \
                            (param_max_intron_length and len(intron_fragment2) > param_max_intron_length):
                        if param_loglevel >= 1:
                            print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\
                                  (token1, e1, token2, e2,
                                   len(intron_fragment1),
                                   len(intron_fragment2))
                            sys.stdout.flush()
                        nskipped += 1
                    else:
                        print str(pair)

# else:
##                         anchored_from1 = intron_from1 - param_extend_introns
##                         anchored_to1 = intron_to1 + param_extend_introns
##                         anchored_from2 = intron_from2 - param_extend_introns
##                         anchored_to2 = intron_to2 + param_extend_introns

##                         anchored_fragment1 = transcript1[anchored_from1:anchored_to1]
##                         anchored_fragment2 = transcript2[anchored_from2:anchored_to2]

# for method in param_write_introns:

# if param_loglevel >= 2:
# print "## aligning with method %s" % method
# sys.stdout.flush

# map_intron_a2b.clear()

# if method == "unaligned":

##                                 from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2

# elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"):

##                                 tmp_intron_a2b = alignlib_lite.makeAlignmentVector()

# if param_loglevel >= 1:
# print "# aligning with method %s two fragments of length %i and %i" % (method,
# len(anchored_fragment1),
# len(anchored_fragment2))
# sys.stdout.flush()

# if method == "dialigned":
##                                     result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dialignedlgs":
##                                     result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dbaligned":
##                                     result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "clusaligned":
##                                     result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# if not result or result.getLength() == 0:
# if param_loglevel >= 1:
# print "# Error: empty intron alignment"
# sys.stdout.flush()
##                                     nerrors += 1
# continue
##                                 tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 )
# alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b,
##                                                        intron_from1 + 1, intron_to1,
# intron_from2 + 1, intron_to2 )
# elif method == "nwaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignator_nw.Align( seq1, seq2, map_intron_a2b )
# seq1.useFullLength()
# seq2.useFullLength()
# elif method == "swaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw )
# seq1.useFullLength()
# seq2.useFullLength()
# else:
##                                 raise "unknown method %s" % method
# if map_intron_a2b.getLength() > 0:
# if param_compress:
##                                     from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo()
##                                     from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo()
##                                     ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b )
# else:
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b  ).split("\n"))
# if len(data) < 2:
##                                         data=[ ( 0, "", 0), (0, "", 0)]
##                                     from1, ali1, to1 = data[0]
##                                     from2, ali2, to2 = data[1]
# print string.join(map(str, ("intron",
# method,
##                                                         token1, e1, len(cds1) - 1, len(intron_fragment1),
##                                                         token2, e2, len(cds2) - 1, len(intron_fragment2),
# map_intron_a2b.getNumGaps(),
# map_intron_a2b.getLength(),
##                                                         map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(),
##                                                         from1, to1, ali1,
##                                                         from2, to2, ali2,
##                                                         intron_from1, intron_to1,
# intron_from2, intron_to2)), "\t")
# sys.stdout.flush()
        last_e1, last_e2 = e1, e2

    ##########################################################################
    # write concatenated exons
# for method in param_write_exons:
# if method == "common":
# print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            0, 0,
##                                                                            0, 0,
# ali_common1, ali_common2 )
# elif method == "exons":
# Write full alignment without gaps.
# This will not care about exon boundaries and gaps.
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))

# try:
##                 from1, s1, to1, from2, s2, to2 = data[0] + data[1]
# except ValueError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1
# except IndexError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1

# if from1:
# if len(s1) != len(s2):
# print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2))
##                     nerrors += 1
##                     from1, to1, from2, to2 = 0, 0, 0, 0
##                     s1, s2 = "", ""
# else:
##                     a1, a2 = [], []
# for x in range( min(len(s1), len(s2)) ):
# if s1[x] != "-" and s2[x] != "-":
##                             a1.append( s1[x] )
##                             a2.append( s2[x] )
##                     s1 = string.join(a1, "")
##                     s2 = string.join(a2, "")

# print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0,
##                                                                              token2, 0,
##                                                                              from1, to1,
##                                                                              from2, to2,
# s1, s2 ) )
# elif method == "full":
# write full alignment (do not care about exon boundaries)
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))
##             if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)]
# print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            data[0][0], data[0][2],
##                                                                            data[1][0], data[1][2],
# data[0][1], data[1][1] )

    if param_loglevel >= 3:
        print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons,
                                                          nskipped_introns)

    return nerrors, nskipped
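
Orthologous introns above are taken only between consecutive equivalent exon pairs: the intron before exon e1 in one transcript is paired with the intron before exon e2 in the other. A standalone sketch of the fragment extraction, using plain (genome_from, genome_to) tuples in place of the cds objects:

def orthologous_intron_fragments(transcript1, cds1, transcript2, cds2, e1, e2):
    """Return the intron preceding exon e1 in transcript1 and the intron
    preceding exon e2 in transcript2, or None if either is empty."""
    from1, to1 = cds1[e1 - 1][1], cds1[e1][0]   # end of previous exon .. start of this one
    from2, to2 = cds2[e2 - 1][1], cds2[e2][0]
    fragment1 = transcript1[from1:to1]
    fragment2 = transcript2[from2:to2]
    if not fragment1 or not fragment2:
        return None
    return fragment1, fragment2

t1 = "AAAGTCCCCAGTTT"          # exon AAA, intron GTCCCCAG, exon TTT
t2 = "CCCGTAAGCCC"             # exon CCC, intron GTAAG, exon CCC
print(orthologous_intron_fragments(t1, [(0, 3), (11, 14)],
                                   t2, [(0, 3), (8, 11)], 1, 1))
# ('GTCCCCAG', 'GTAAG')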
Example #6
def ReadTranscriptsAndCds(transcript_ids1, transcript_ids2):

    if param_loglevel >= 1:
        print "# reading %i left and %i right transcripts" % (
            len(transcript_ids1), len(transcript_ids2))
        sys.stdout.flush()
    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds1 = Exons.ReadExonBoundaries(open(param_filename_cds1, "r"),
                                    filter=transcript_ids1,
                                    reset=True)
    cds2 = Exons.ReadExonBoundaries(open(param_filename_cds2, "r"),
                                    filter=transcript_ids2,
                                    reset=True)

    if param_loglevel >= 1:
        print "# read %i left and %i right cds" % (len(cds1), len(cds2))
        sys.stdout.flush()

    if param_loglevel >= 2:
        if len(cds1) != len(transcript_ids1):
            print "# missed in left:  %s" % ":".join(
                set(transcript_ids1.keys()).difference(cds1.keys()))
        if len(cds2) != len(transcript_ids2):
            print "# missed in right: %s" % ":".join(
                set(transcript_ids2.keys()).difference(cds2.keys()))

    if param_loglevel >= 1:
        print "# reading genomic sequences."
        sys.stdout.flush()

    transcripts1 = {}
    if param_filename_transcripts1:
        if param_mode_genome1 == "indexed":
            transcripts1 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts1, filter=transcript_ids1)
        else:
            transcripts1 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts1, "r"),
                do_reverse=0,
                filter=transcript_ids1,
                mask=param_mask)
    transcripts2 = {}
    if param_filename_transcripts2:
        if param_mode_genome2 == "indexed":
            transcripts2 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts2, filter=transcript_ids2)
        else:
            transcripts2 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts2, "r"),
                do_reverse=0,
                filter=transcript_ids2,
                mask=param_mask)
    if param_loglevel >= 1:
        print "# read %i left and %i right transcript sequences" % (
            len(transcripts1), len(transcripts2))
        sys.stdout.flush()

    return transcripts1, transcripts2, cds1, cds2
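
The "missed in left/right" diagnostic above is a plain set difference between the ids requested and the ids actually read; it works because the filter arguments are dicts keyed by transcript id. A minimal sketch of the pattern:

requested = {"t1": 1, "t2": 1, "t3": 1}    # ids passed as filter
loaded = {"t1": [], "t3": []}              # ids actually read
if len(loaded) != len(requested):
    missed = set(requested.keys()).difference(loaded.keys())
    print("# missed: %s" % ":".join(sorted(missed)))   # missed: t2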
Example #7
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed)." )

    parser.add_option("--coordinate-format", dest="coordinate_format", type="string",
                      help="input type of coordinates." )

    parser.add_option("--forward-coordinates", dest="forward_coordinates", action="store_true",
                      help="output forward coordinates." )

    parser.add_option("-e", "--extract-id", dest="extract_id", type="string",
                      help="""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""" )

    parser.set_defaults(
        coordinate_format = "zero-forward",
        forward_coordinates = False,
        genome_file = None,
        extract_id = None )

    (options, args) = E.Start( parser )
    
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile( options.extract_id )
    else:
        extract_id = None

    converter = IndexedFasta.getConverter( options.coordinate_format )

    exons = Exons.ReadExonBoundaries( sys.stdin,
                                      contig_sizes = contig_sizes,
                                      converter = converter,
                                      do_invert = True,
                                      format = "gtf",
                                      gtf_extract_id = extract_id )

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":
                l = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = l - e.mGenomeTo, l - e.mGenomeFrom

            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write( "# Error: %s\n" % str(e) )
                break

            options.stdout.write( str(e) + "\n" )
            nexons += 1
                
        if has_error:
            nerrors += 1
            continue
    
    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" % (ntranscripts, nexons, nerrors))
    
    E.Stop()
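
The minus-strand conversion above maps an interval to forward coordinates with new_from, new_to = l - old_to, l - old_from; applied twice it returns the original interval. A worked sketch, assuming zero-based, half-open [from, to) coordinates as used by the exons:

def to_forward(genome_from, genome_to, contig_length):
    """Map a reverse-strand interval to forward-strand coordinates."""
    return contig_length - genome_to, contig_length - genome_from

l = 1000
f, t = to_forward(100, 250, l)
print((f, t))                 # (750, 900)
print(to_forward(f, t, l))    # (100, 250): the flip is its own inverse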
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--trans",
                      dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds',
                               'fasta'))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help=
        "filename with predictions. Use gene structures from this file if available."
    )

    parser.add_option("-i",
                      "--gff-field-id",
                      dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help=
        "Filename with peptide sequences. If given, it is used to check the predicted translated sequences."
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help=
        "specify input format for input coordinates [forward|both-zero|one-closed|open]."
    )

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" %
                             (options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand, ninput,
                         len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

                entry.score = (entry.mMapPeptide2Translation.getColTo() -
                               entry.mMapPeptide2Translation.getColFrom() + 1)

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps, entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions ) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(
                            entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId,
                                           entry.mSbjctGenomeFrom,
                                           entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId, reference,
                                    entry.mSbjctToken, genomic_sequence,
                                    "--subopt FALSE --score '%s'" %
                                    str(80))
                                # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(
                                        reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n"
                                            % (entry.mPredictionId))
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    naligned += 1
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                            % (entry.mPredictionId,
                                               entry.mSbjctToken,
                                               entry.mSbjctStrand,
                                               entry.mSbjctGenomeFrom,
                                               entry.mSbjctGenomeTo,
                                               reference, entry.mTranslation,
                                               translation))
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference,
                                               translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")
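
checkIdentity is not shown in this example. A plausible, purely hypothetical stand-in that matches how it is called above (returning an is_identical flag plus a mismatch count, and tolerating a trailing stop character) might look like:

def checkIdentity(reference, translation, options=None):
    """Hypothetical sketch: compare a reference peptide against the
    predicted translation, ignoring case and a trailing stop ('X'/'*')."""
    a = reference.upper().rstrip("X*")
    b = translation.upper().rstrip("X*")
    nmismatches = sum(1 for x, y in zip(a, b) if x != y)
    nmismatches += abs(len(a) - len(b))    # length differences count too
    return nmismatches == 0, nmismatches

print(checkIdentity("MKLV", "MKLV"))   # (True, 0)
print(checkIdentity("MKLV", "MKIV"))   # (False, 1)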
Example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e",
        "--expand",
        dest="expand",
        action="store_true",
        help=
        "expand positions from peptide to nucleotide alignment [Default=%default]"
    )

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c",
                      "--codons",
                      dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates",
        dest="one_based_coordinates",
        action="store_true",
        help=
        "expect one-based coordinates. The default are zero based coordinates [Default=%default]."
    )

    parser.add_option("--no-identical",
                      dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g",
        "--no-gaps",
        dest="no_gaps",
        action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o",
                      "--outfile",
                      dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter",
        dest="filename_filter",
        type="string",
        help=
        "given a set of previous alignments, only write new pairs [Default=%default]."
    )

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            infile = open(options.filename_filter, "r")

            iterator = FastaIterator.FastaIterator(infile)

            while 1:
                cur_record = iterator.next()
                if cur_record is None:
                    break

                record1 = cur_record

                cur_record = iterator.next()
                if cur_record is None:
                    break

                record2 = cur_record

                identifier1 = re.match("(\S+)", record1.title).groups()[0]
                identifier2 = re.match("(\S+)", record2.title).groups()[0]

                id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                if id not in map_pair2hids:
                    map_pair2hids[id] = []

                map_pair2hids[id].append(s)

            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli, link.mSbjctFrom,
            link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col, alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New, alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                %
                (link.mQueryToken, link.mSbjctToken, row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(
                    alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq)) + "\n")
        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mQueryToken]))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mSbjctToken]))
                options.stdlog.write("#\n%s\n" %
                                     alignlib_lite.py_AlignmentFormatExplicit(
                                         map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                mode = Write(tmp1_map_row2col,
                             row_seq,
                             col_seq,
                             link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                if mode not in counts:
                    counts[mode] = 0
                counts[mode] += 1

        else:
            mode = Write(map_row2col,
                         row_seq,
                         col_seq,
                         link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            if mode not in counts:
                counts[mode] = 0
            counts[mode] += 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            map(lambda x, y: "%s=%i" %
                (x, y), counts.keys(), counts.values())))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
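
The closing summary builds its "key=value" report from parallel keys()/values() lists; collections.Counter expresses the same tallying more directly. A sketch of the counting pattern only (the Write() calls themselves are untouched):

from collections import Counter

counts = Counter()
for mode in ["full", "full", "filtered", "full"]:   # stand-ins for Write() results
    counts[mode] += 1
print("# %s" % ", ".join("%s=%i" % (k, v) for k, v in sorted(counts.items())))
# prints: # filtered=1, full=3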
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/regions2graph.py 2754 2009-09-04 16:50:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--benchmark",
                      dest="filename_benchmark",
                      type="string",
                      help="")

    parser.add_option("-y",
                      "--benchmark-synonyms",
                      dest="benchmark_synonyms",
                      type="string",
                      help="")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="")

    parser.add_option("-c",
                      "--min-coverage-query",
                      dest="min_coverage_query",
                      type="float",
                      help="")

    parser.add_option("-s",
                      "--min-score",
                      dest="min_total_score",
                      type="float",
                      help="")

    parser.add_option("-i",
                      "--min-percent-identity",
                      dest="min_percent_identity",
                      type="float",
                      help="")

    parser.add_option("-o",
                      "--max-percent-overlap",
                      dest="max_percent_overlap",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-score",
                      dest="overlap_min_score",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-coverage",
                      dest="overlap_min_coverage",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-identity",
                      dest="overlap_min_identity",
                      type="float",
                      help="")

    parser.add_option("--overlap-max-coverage",
                      dest="overlap_max_coverage",
                      type="float",
                      help="")

    parser.add_option("-m",
                      "--max-matches",
                      dest="max_matches",
                      type="int",
                      help="")

    parser.add_option("-j",
                      "--join-regions",
                      dest="join_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-regions",
                      dest="join_regions_max_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-coverage",
                      dest="join_regions_max_coverage",
                      type="float",
                      help="")

    parser.add_option("--min-length", dest="min_length", type="int", help="")

    parser.add_option("--test", dest="test", type="int", help="")

    parser.add_option("--filter-queries",
                      dest="filename_filter_queries",
                      type="string",
                      help="")

    parser.add_option("--filter-regions",
                      dest="filter_regions",
                      type="string",
                      help="")

    parser.add_option("--conserve-memory",
                      dest="conserve_memory",
                      action="store_true",
                      help="")

    parser.add_option("--filter-suboptimal",
                      dest="filter_suboptimal",
                      action="store_true",
                      help="")

    parser.set_defaults(
        # overlap allowed for matches on genomic region
        max_percent_overlap=20,
        gop=-10.0,
        gep=-2.0,
        # thresholds for joining regions
        overlap_min_score=80,
        overlap_min_coverage=80,
        overlap_max_coverage=90,
        overlap_min_identity=50,
        # threshold for filtering bad predictions:
        # minimum score
        min_total_score=80,
        # joining regions
        join_regions=0,
        # maximum coverage of query for predictions to be joined
        # (this ensures that duplications are not joined; a range check
        # would be better, but runs into trouble with repeats).
        join_regions_max_coverage=90,
        # minimum coverage of query
        min_coverage_query=10,
        # conserve memory
        conserve_memory=0,
        # minimum percent identity
        min_percent_identity=0,
        # minimum length
        min_length=0,
        max_matches=0,
        filename_peptides=None,
        filename_filter_queries=None,
        # turn on/off various filters
        filter_suboptimal=False,
        filter_regions=False,
        # parameters for filter of suboptimal predictions
        min_relative_coverage=0.5,
        min_relative_score=0.5,
        min_relative_percent_identity=0.5,
        # minimum difference between non-correlated conflicts to keep them
        # both.
        conflicts_min_difference=0.1,
        # benchmarking data
        benchmarks=None,
        benchmark_synonyms=None,
        filename_benchmark=None,
        filename_benchmark_synonyms=None,
        test=None,
        max_intron=50000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    ##########################################################################
    # read filtering
    filter_queries = {}
    if options.filename_filter_queries:
        for line in open(options.filename_filter_queries, "r"):
            if line[0] == "#":
                continue
            query_token = line[:-1].split("\t")[0]
            filter_queries[query_token] = True

    if options.loglevel >= 1:
        options.stdlog.write("# filtering for %i queries.\n" %
                             len(filter_queries))

    ##########################################################################
    # read benchmarking regions
    if options.filename_benchmark:
        options.benchmarks = ReadBenchmarkingRegions(
            open(options.filename_benchmark, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read benchmarking regions for %i tokens\n" %
                len(options.benchmarks))
            sys.stdout.flush()
        if options.filename_benchmark_synonyms:
            infile = open(options.filename_benchmark_synonyms, "r")
            options.benchmark_synonyms = {}
            for line in infile:
                if line[0] == "#":
                    continue
                value, key = line[:-1].split("\t")
                options.benchmark_synonyms[key] = value
        else:
            options.benchmark_synonyms = {}
    else:
        options.benchmarks = {}
        options.benchmark_synonyms = {}

    ##########################################################################
    # read peptide sequences
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if options.conserve_memory:
        old_predictions, filename_old_predictions = tempfile.mkstemp()
        os.close(old_predictions)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        # array with final predictions
        old_predictions = []

    if options.loglevel >= 1:
        options.stdlog.write("# reading predictions.\n")
        sys.stdout.flush()

    nread = 0
    ninput = 0
    for line in sys.stdin:

        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        # filter bad predictions right here in order to save memory:
        if entry.score < options.min_total_score:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: score below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mQueryCoverage < options.min_coverage_query:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: coverage below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mPercentIdentity < options.min_percent_identity:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: percent identity below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mSbjctTo - entry.mSbjctFrom < options.min_length:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: length of transcript below minimum: removing: %s\n"
                    % str(entry))
            continue

        ninput += 1

        if options.test and ninput > options.test:
            break

        old_predictions.append(entry)

    if options.loglevel >= 1:
        options.stdlog.write("# predictions after input: %i\n" % ninput)
        sys.stdout.flush()

    if options.loglevel >= 10:

        options.stdlog.write(
            "############## start: predictions after input ###################################\n"
        )
        for x in old_predictions:
            options.stdlog.write("# %s\n" % str(x))
        options.stdlog.write(
            "############## end: predictions after input #####################################\n"
        )
        sys.stdout.flush()

    if ninput == 0:
        options.stdlog.write("# ERROR: no predictions\n")
        sys.exit(1)

    ##########################################################################
    # set up stacks of regions
    if options.conserve_memory:
        old_predictions.close()
        old_predictions.open(mode="r")
        removed_predictions, filename_removed_predictions = tempfile.mkstemp()
        os.close(removed_predictions)
        removed_predictions = PredictionFile.PredictionFile()
        removed_predictions.open(filename_removed_predictions, "w")

        new_predictions, filename_new_predictions = tempfile.mkstemp()
        os.close(new_predictions)
        new_predictions = PredictionFile.PredictionFile()
        new_predictions.open(filename_new_predictions, "w")
    else:
        removed_predictions = []
        new_predictions = []

    if options.benchmarks:
        EvaluateBenchmark(old_predictions)

    ##########################################################################
    # join regions
    if options.join_regions and options.join_regions_max_coverage:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# joining regions: maximum distance between segments = %i and maximum query coverage = %i\n"
                % (options.join_regions, options.join_regions_max_coverage))
            sys.stdout.flush()
        njoined = JoinRegions(old_predictions, new_predictions)
        if options.conserve_memory:
            ExchangeStreams(old_predictions, new_predictions)
        else:
            old_predictions = new_predictions
            new_predictions = []

        if options.loglevel >= 1:
            options.stdlog.write("# predictions after joining: %i\n" % njoined)
            sys.stdout.flush()

        if options.loglevel >= 10:
            options.stdlog.write(
                "############## start: predictions after joining ###################################\n"
            )
            for x in old_predictions:
                options.stdlog.write("# %s" % str(x))
            options.stdlog.write(
                "############## end: predictions after joining #####################################\n"
            )
            sys.stdout.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# joining regions: skipped\n")
            sys.stdout.flush()

        njoined = ninput

    ##########################################################################
    # build map of best predictions
    if options.filter_suboptimal:
        if options.loglevel >= 1:
            options.stdlog.write("# calculating best predictions\n")
            sys.stdout.flush()
        best_predictions = GetBestPredictions(old_predictions)
    else:
        best_predictions = {}

    if options.loglevel >= 1:
        options.stdlog.write("# calculated best predictions: %i\n" %
                             len(best_predictions))
        sys.stdout.flush()

    ##########################################################################
    # get regions to eliminate
    filter_regions = {}
    if options.filter_regions:

        entry = PredictionParser.PredictionParserEntry(expand=0)

        filenames = options.filter_regions.split(",")

        for filename in filenames:
            if options.loglevel >= 1:
                options.stdlog.write("# reading regions to filter from %s.\n" %
                                     (filename))
                sys.stdout.flush()

            if filename.endswith(".gz"):
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            for line in infile:

                if line[0] == "#":
                    continue

                entry.Read(line)

                exons = Exons.Alignment2Exons(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, entry.mSbjctGenomeFrom)

                key = "%s-%s" % (entry.mSbjctToken, entry.mSbjctStrand)

                if key not in filter_regions:
                    filter_regions[key] = []

                for exon in exons:
                    filter_regions[key].append(
                        (exon.mGenomeFrom, exon.mGenomeTo))

            infile.close()

        for k in filter_regions.keys():
            filter_regions[k].sort()

    ##########################################################################
    # bipartite graph construction

    ##########################################################################
    # sort predictions by genomic region
    if options.conserve_memory:
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp(
            (x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.
             mSbjctGenomeTo), (y.mSbjctToken, y.mSbjctStrand, y.
                               mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    min_from, max_from = None, None
    min_to, max_to = None, None
    region_id = 0
    noverlaps = 0
    last_prediction = None
    predictions = []
    region = Region()
    nclusters = 0
    neliminated_suboptimal = 0
    neliminated_overlap = 0

    noutput, nfiltered = 0, 0

    for this_prediction in old_predictions:

        # Filter 1: skip suboptimal predictions
        if this_prediction.mQueryToken in best_predictions:

            best_prediction = best_predictions[this_prediction.mQueryToken]

            neliminated_suboptimal += 1
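            # the increment above is provisional: it is undone below once
            # the prediction has passed all three relative thresholds.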
            if (float(this_prediction.mQueryCoverage) /
                    best_prediction.mQueryCoverage <
                    options.min_relative_coverage):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: coverage below best: removing %s\n"
                        % str(this_prediction))
                continue

            if (float(this_prediction.score) /
                    best_prediction.score < options.min_relative_score):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: score below best: removing %s\n" %
                        str(this_prediction))
                continue

            if (float(this_prediction.mPercentIdentity) /
                    best_prediction.mPercentIdentity <
                    options.min_relative_percent_identity):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: percent identity below best: removing %s\n"
                        % str(this_prediction))
                continue

            neliminated_suboptimal -= 1

        # Filter 2: remove predictions overlapping with certain segments
        key = "%s-%s" % (this_prediction.mSbjctToken,
                         this_prediction.mSbjctStrand)

        if key in filter_regions:

            exons = Exons.Alignment2Exons(
                Genomics.String2Alignment(this_prediction.mAlignmentString),
                this_prediction.mQueryFrom, this_prediction.mSbjctGenomeFrom)

            if CheckOverlap(map(lambda x: (x.mGenomeFrom, x.mGenomeTo), exons),
                            filter_regions[key]):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: overlapping with taboo region: removing %s\n"
                        % str(this_prediction))
                neliminated_overlap += 1
                continue

        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                re.split(r"\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        # process first entry
        if min_from is None:
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            max_to = this_prediction.mSbjctGenomeTo
            min_to = this_prediction.mSbjctGenomeTo
            predictions.append(this_prediction)
            last_prediction = this_prediction
            continue

        overlap = min_to > this_prediction.mSbjctGenomeFrom and \
            last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
            last_prediction.mSbjctStrand == this_prediction.mSbjctStrand
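        # `overlap` holds iff this prediction starts before the smallest end
        # seen in the open cluster on the same contig and strand; since
        # predictions arrive sorted by position, a single sweep suffices.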

        if options.loglevel >= 4:
            options.stdlog.write("# from=%i, to=%i, working on: %s\n" %
                                 (min_from, max_to, str(this_prediction)))
            options.stdlog.flush()

        # resolve overlap between different genes
        if overlap:
            noverlaps += 1
        else:
            region.mSbjctToken = last_prediction.mSbjctToken
            region.mSbjctStrand = last_prediction.mSbjctStrand
            region.mSbjctGenomeFrom = min_from
            region.mSbjctGenomeTo = max_to

            region_id, nxoutput, nxfiltered = ProcessRegion(
                predictions, region_id, region, peptide_sequences,
                filter_queries)

            noutput += nxoutput
            nfiltered += nxfiltered
            nclusters += 1
            predictions = []
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            min_to = this_prediction.mSbjctGenomeTo
            max_to = this_prediction.mSbjctGenomeTo

        predictions.append(this_prediction)

        min_from = min(min_from, this_prediction.mSbjctGenomeFrom)
        max_from = max(max_from, this_prediction.mSbjctGenomeFrom)
        min_to = min(min_to, this_prediction.mSbjctGenomeTo)
        max_to = max(max_to, this_prediction.mSbjctGenomeTo)

        last_prediction = this_prediction

    if last_prediction:
        region.mSbjctToken = last_prediction.mSbjctToken
        region.mSbjctStrand = last_prediction.mSbjctStrand
        region.mSbjctGenomeFrom = min_from
        region.mSbjctGenomeTo = max_to

        region_id, nxoutput, nxfiltered = ProcessRegion(
            predictions, region_id, region, peptide_sequences, filter_queries)
        noutput += nxoutput
        nfiltered += nxfiltered

        nclusters += 1

    if options.conserve_memory:
        os.remove(filename_old_predictions)
        os.remove(filename_new_predictions)
        os.remove(filename_removed_predictions)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairs: nread=%i, input=%i, joined=%i, clusters=%i, regions=%i, eliminated_subopt=%i, eliminated_overlap=%i, noutput=%i, nfiltered=%i\n"
            %
            (nread, ninput, njoined, nclusters, region_id,
             neliminated_suboptimal, neliminated_overlap, noutput, nfiltered))

    E.Stop()
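The region loop above is a single sweep over position-sorted predictions: a prediction joins the open cluster if it starts before the cluster's minimum end on the same contig and strand; otherwise the cluster is flushed via ProcessRegion. The same sweep on plain tuples, as a self-contained sketch (illustrative only, not the script's data structures):

def cluster_by_overlap(intervals):
    """Group (contig, strand, start, end) intervals into clusters of
    overlapping intervals in a single sweep over the sorted input."""
    clusters, current = [], []
    min_to, last = None, None
    for interval in sorted(intervals):
        contig, strand, start, end = interval
        if current and last[0] == contig and last[1] == strand \
                and start < min_to:
            current.append(interval)
            min_to = min(min_to, end)
        else:
            if current:
                clusters.append(current)
            current, min_to = [interval], end
        last = interval
    if current:
        clusters.append(current)
    return clusters

# three clusters: chr1:0-100 overlaps chr1:50-80; chr1:200-300 and
# chr2:0-10 stand alone
print(cluster_by_overlap([("chr1", "+", 0, 100), ("chr1", "+", 50, 80),
                          ("chr1", "+", 200, 300), ("chr2", "+", 0, 10)]))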
Example n. 11
        elif o == "--report-step":
            param_report_step = int(a)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    print E.GetHeader()
    print E.GetParams()
    sys.stdout.flush()

    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds = Exons.ReadExonBoundaries(open(param_filename_cds, "r"))

    if param_loglevel >= 1:
        print "# read %i cds" % (len(cds))
        sys.stdout.flush()

    ninput, npairs, nskipped = 0, 0, 0

    for line in sys.stdin:
        if line[0] == "#":
            continue
        if line[0] == ">":
            print line[:-1]
            continue

        ninput += 1
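The fragment above shows the stream-processing pattern these scripts share: '#' comment lines are skipped, FASTA-style headers ('>') are passed through unchanged, and the remaining lines are counted and processed. A self-contained sketch of the same pattern (the function name is hypothetical):

import sys

def filter_stream(infile, outfile):
    """Skip '#' comment lines, echo FASTA headers ('>') unchanged,
    and yield the remaining data lines for processing."""
    for line in infile:
        if line.startswith("#"):
            continue
        if line.startswith(">"):
            outfile.write(line)
            continue
        yield line

# usage: count the data lines on stdin while echoing headers
ninput = sum(1 for _ in filter_stream(sys.stdin, sys.stdout))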
Example n. 12
            if line[0] == "#":
                continue
            if line[0] == ">":
                continue
            a, b = line[:-1].split("\t")[:2]
            if b not in components:
                components[b] = []
            components[b].append(a)

        if param_loglevel >= 1:
            print "# read %i components." % len(components)
    else:
        components = {'all': all_identifiers}

    if param_filename_exons:
        exons = Exons.ReadExonBoundaries(
            open(param_filename_exons, "r"), filter=all_mali)
        if param_loglevel >= 2:
            print "# read %i exons." % len(exons)
    else:
        exons = {}

    print "# PREFIX\tsummary\tNSEQUENCES\tNASSIGNED\tNCLUSTERS\tNASSIGNED\tUNASSIGNED"
    print "# PREFIX\tcluster\tNMEMBERS\tMEMBERS"
    print "# PREFIX\tfragments\tNFRAGMENTS\tFRAGMENTS"

    print "# PREFIX\tpide\tNPAIRS\tNAMIN\tNAMAX\tNAMEAN\tNAMEDIAN\tNASTDDEV\tAAMIN\tAAMAX\tAAMEAN\tAAMEDIAN\tAASTDDEV"

    print string.join(("# PREFIX", "codons",
                       "NCLEAN", "NNOSTOPS",
                       "ALIGNED_MIN", "ALIGNED_MAX", "ALIGNED_MEAN", "ALIGNED_MEDIAN", "ALIGNED_STDDEV",
                       "CODONS_MIN", "CODONS_MAX", "CODONS_MEAN", "CODONS_MEDIAN", "CODONS_STDDEV",
Example n. 13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r",
                      "--reset-to-start",
                      dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query",
                      dest="reset_query",
                      action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    cds_id = 1

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError, msg:
            options.stdlog.write("# parsing failed with msg %s in line %s" %
                                 (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and \
                entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = lsequence - \
                entry.mSbjctGenomeTo, lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset peptide coordinates for the query
        if options.reset_query:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        if options.format == "cds":
            rank = 0
            for cd in cds:
                rank += 1
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print str(cd)

        if options.format == "exons":
            rank = 0
            for cd in cds:
                rank += 1
                options.stdout.write("\t".join(
                    map(str, (entry.mPredictionId, cd.mSbjctToken,
                              cd.mSbjctStrand, rank, cd.frame, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.mGenomeFrom, cd.mGenomeTo))) +
                                     "\n")

        elif options.format == "cdnas":
            print string.join(
                map(str,
                    (entry.mPredictionId, entry.mQueryToken, entry.mSbjctToken,
                     entry.mSbjctStrand, entry.mSbjctGenomeFrom - offset,
                     entry.mSbjctGenomeTo - offset, genomic_sequence)), "\t")

        elif options.format == "map":

            map_prediction2genome = alignlib_lite.makeAlignmentSet()

            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome, cd.mPeptideFrom + 1, cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            print string.join(
                map(str, (entry.mPredictionId, entry.mSbjctToken,
                          entry.mSbjctStrand,
                          alignlib_lite.AlignmentFormatEmissions(
                              map_prediction2genome))), "\t")

        elif options.format == "intron-fasta":
            rank = 0
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
                rank += 1
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank, entry.mSbjctToken,
                    entry.mSbjctStrand, last, entry.mSbjctGenomeFrom)
                sequence = genomic_sequence[last - entry.mSbjctGenomeFrom:cd.
                                            mGenomeFrom -
                                            entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" % \
                  (entry.mSbjctToken,
                   "gpipe", "similarity",
                   entry.mSbjctGenomeFrom,
                   entry.mSbjctGenomeTo,
                   entry.mPercentIdentity,
                   entry.mSbjctStrand,
                   ".",
                   entry.mQueryToken,
                   entry.mQueryFrom,
                   entry.mQueryTo,
                   entry.score,
                   entry.mNIntrons,
                   entry.mNFrameShifts,
                   entry.mNStopCodons)

        elif options.format == "gff-exon":
            rank = 0
            for cd in cds:
                rank += 1
                print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" % \
                      (entry.mSbjctToken,
                       "gpipe", "similarity",
                       cd.mGenomeFrom,
                       cd.mGenomeTo,
                       entry.mPercentIdentity,
                       entry.mSbjctStrand,
                       ".",
                       entry.mQueryToken,
                       cd.mPeptideFrom / 3 + 1,
                       cd.mPeptideTo / 3 + 1,
                       entry.score,
                       rank,
                       len(cds),
                       entry.mPredictionId)
        else:
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                print string.join(
                    map(str, (cds_id, entry.mPredictionId, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.frame, cd.mGenomeFrom,
                              cd.mGenomeTo, cd.mSequence)), "\t")
                cds_id += 1

        noutput += 1
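The forward-coordinate branch above converts an interval to negative-strand coordinates by reflecting it through the contig length and swapping the bounds. In isolation the transformation looks like this (a sketch on plain integers, not the entry object):

def invert_interval(start, end, lsequence):
    """Reflect a half-open [start, end) interval through a contig of
    length lsequence; swapping the bounds keeps start < end."""
    return lsequence - end, lsequence - start

# an interval 10..20 on a contig of length 100 maps to 80..90
print(invert_interval(10, 20, 100))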
Example n. 14
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2,
                           peptides2, cds2):
    """sort out ortholog relationships between
    transcripts of orthologous genes.

    Orthologs have:
        the same number of exons        
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal bet hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first, would result in a suboptimal match, because one transcript
    is longer than the other, while matching up 0-0 and 2-1 would be better.

    Objective function: it is the maximal matching/assignment problem. Use greedy
    implementation instead. Assign as much as possible according to descending weights.
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = map(lambda x: alignlib_lite.makeSequence(peptides1[x[0]]),
                transcripts1)
    seqs2 = map(lambda x: alignlib_lite.makeSequence(peptides2[x[0]]),
                transcripts2)

    if param_loglevel >= 4:
        print "# building sequence 1"
    for i in range(len(seqs1)):
        if not cds1.has_key(transcripts1[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# building sequence 2"

    for i in range(len(seqs2)):
        if not cds2.has_key(transcripts2[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# all-vs-all alignment"

    # do all versus all alignment
    alis1 = []
    alis2 = []
    for i in range(len(seqs1)):
        alis1.append([])
    for i in range(len(seqs2)):
        alis2.append([])

    if param_loglevel >= 3:

        print "#################################"

        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print "#", str(cd)
        print "# versus"
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print "#", str(cd)
        sys.stdout.flush()

    weights = {}
    for i in range(len(seqs1)):
        prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = \
            transcripts1[i]

        for j in range(len(seqs2)):
            prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = \
                transcripts2[j]
            map_a2b = alignlib_lite.makeAlignmentVector()

            m = seqs1[i].getLength() * seqs2[j].getLength()

            if param_loglevel >= 3:
                print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\
                      (i, j, prediction_id1, seqs1[
                       i].getLength(), prediction_id2, seqs2[j].getLength())
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (
                        seqs1[i].getLength(), seqs2[j].getLength())
                    sys.stdout.flush()

                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \
                seqs1[i].getLength()
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \
                seqs2[j].getLength()

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3,
                (map_a2b.getRowTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3,
                (map_a2b.getColTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon
            def MyMap(a, x):
                while x <= a.getRowTo():
                    c = a.mapRowToCol(x)
                    if c:
                        return c
                    x += 1
                else:
                    return 0
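            # MyMap walks forward from peptide position x to the next
            # position that is aligned in map_a2b and returns its column,
            # falling back to 0 when the row range is exhausted (the
            # while/else above). Exon start boundaries of transcript 1,
            # in codon units and skipping the first exon, are then mapped
            # through the alignment and compared against transcript 2.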

            mapped_boundaries = map(
                lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:])
            mapped_boundaries.sort()
            reference_boundaries = map(lambda x: x.mPeptideFrom / 3 + 1,
                                       c2[1:])
            reference_boundaries.sort()

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if (len(c1) == 1 and len(c2) == 1):
                is_ok = 1
            else:
                # allow for missed boundaries, if param_boundaries_allow_missed
                # > 0
                if min_nmissed == 0:
                    is_ok = 1
                else:
                    if param_boundaries_allow_missed and \
                            len(mapped_boundaries) >= param_boundaries_allow_missed and \
                            min_nmissed <= param_boundaries_max_missed:
                        is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            if abs(len(c1) - len(c2)) != 0:
                if param_missing_max_missing:
                    if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or
                        (min(len(c1), len(c2)) < param_missing_min_present)):
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \
                      "boundaries_ok=", is_ok, \
                      "nexons_ok=", is_ok_exons, \
                      "missed_c2r=", nmissed_cmp2ref, \
                      "missed_r2c=", nmissed_ref2cmp, \
                      "min_cov=", cc, \
                      "mapped=", mapped_boundaries, \
                      "reference=", reference_boundaries

                print "#", string.join(
                    map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                              map_a2b.getNumGaps(), coverage_a, coverage_b)),
                    "\t")
                sys.stdout.flush()

            # dump out pairs
            for method in param_write_pairs:
                if method == "all":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             sbjct_token1, sbjct_strand1, sbjct_from1,
                             sbjct_to1, seqs1[i].getLength(), sbjct_token2,
                             sbjct_strand2, sbjct_from2, sbjct_to2,
                             seqs2[j].getLength(), map_a2b.getRowFrom(),
                             map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(),
                             map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(),
                             coverage_a, coverage_b, nmissed_cmp2ref,
                             mapped_boundaries, nmissed_ref2cmp,
                             reference_boundaries, i, j, len(c1), len(c2), cc,
                             is_ok, is_ok_exons, is_ok_coverage)), "\t")
                elif method == "alignment":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                             map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                             map_a2b.getNumGaps(), coverage_a, coverage_b)),
                        "\t")
                elif method == "location":
                    print string.join(
                        map(str, ("pair", method, prediction_id1,
                                  prediction_id2, sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1, seqs1[i].getLength(),
                                  sbjct_token2, sbjct_strand2, sbjct_from2,
                                  sbjct_to2, seqs2[j].getLength())), "\t")
            if not is_ok_exons:
                if param_loglevel >= 4:
                    print "# rejected %i and %i: too many exons difference." % (
                        i, j)
                continue

            if param_check_exon_boundaries:
                if not is_ok:
                    continue

            if cc < param_min_coverage:
                continue

            if not weights.has_key(cc):
                weights[cc] = []

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))

            weights[cc].append((i, j, map_a2b))

    # sort out alignments
    ww = weights.keys()
    ww.sort()
    ww.reverse()

    pairs = []
    assigned1 = {}
    assigned2 = {}

    if param_loglevel >= 3:
        print "# alis1=", alis1
        print "# alis2=", alis2
        print "# --------------------------------------"

    for w in ww:
        for i, j, map_a2b in weights[w]:
            if not assigned1.has_key(i) and not assigned2.has_key(j):
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1[i] = 1
                assigned2[j] = 1
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
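The pair selection at the end of GetOrthologTranscripts is a greedy matching: candidate pairs are visited in order of descending weight and accepted only if neither transcript has been taken yet. A compact standalone version of that idea (a heuristic sketch, not an optimal Hungarian-style solution):

def greedy_assign(weighted_pairs):
    """Greedy one-to-one assignment: visit candidate (weight, i, j)
    triples in order of descending weight and accept a pair only if
    neither side has been assigned yet."""
    assigned1, assigned2, pairs = set(), set(), []
    for weight, i, j in sorted(weighted_pairs, reverse=True):
        if i not in assigned1 and j not in assigned2:
            pairs.append((i, j, weight))
            assigned1.add(i)
            assigned2.add(j)
    return pairs

# the docstring's example: 0-0 and 2-1 are chosen rather than 1-0 first
print(greedy_assign([(100, 0, 0), (94, 1, 0), (100, 2, 1)]))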
Example n. 15
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                                    usage = globals()["__doc__"] )
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information."  )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header."  )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #."  )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron."  )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                      help="output format."  )
    
    parser.set_defaults(
        genome_file = "genome",
        start_codons = ("ATG"),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary  = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open (options.output_filename_summary, "w" )
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:
        
        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)
        
        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        ########################################################################
        ########################################################################
        ########################################################################            
        ## Do extensions
        
        if options.start_codon_boundary or options.stop_codon_boundary:
            
            extension_start = p.mSbjctGenomeFrom - genome_from 
            extension_stop  = genome_to - p.mSbjctGenomeTo
            
            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ########################################################################
            ########################################################################
            ########################################################################            
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )
                
            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )
                    
                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )
                        
                        ## backtrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence, start, options.start_codons )

                        E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start
            
            ########################################################################
            ########################################################################
            ########################################################################            
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                      genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from 
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo 

            ########################################################################
            ########################################################################
            ########################################################################            
            ## build new prediction
            map_peptide2genome = []
            if dstart: map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop: map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3            
            
            if dstart or dstop:
                if dstart: left_extensions.append( dstart )
                if dstop: right_extensions.append( dstop )
                
                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:
            
            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo
                
                if lintron > options.fill_introns or lintron % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )
                    
                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0
                    
                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]
                
                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False
                    
                for signal in options.right_splice_signals:
                    # use an explicit end index so the slice is also
                    # correct when offset_right == 0
                    end = len(sequence) - offset_right
                    if sequence[end - len(signal):end] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons: nstops += 1
                    if "N" in codon.upper(): ngaps += 1

                # log once per intron, after the counts are complete
                E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                             (p.mPredictionId,
                              nintron, lintron,
                              offset_left, offset_right,
                              p.mSbjctToken, p.mSbjctStrand,
                              p.mSbjctGenomeFrom + last_e.mGenomeTo,
                              p.mSbjctGenomeFrom + e.mGenomeFrom,
                              nstops,
                              ngaps,
                              left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)                                        
                    last_e = e
                    continue
                
                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )
                                                                
                
                filled_introns.append(lintron)
                p.mNIntrons -= 1
                
            new_exons.append(last_e)

            if has_filled: nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )
            
            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
               p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            if found_start: found_start = 1
            if found_stop: found_stop = 1
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop, 
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )        
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))
        
    E.Stop()
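The intron-filling block above scans each candidate intron codon by codon for stop codons and ambiguity characters, and tests for splice signals at both ends. Below is a minimal, self-contained sketch of that check; it ignores the split-codon offsets handled above, and the stop codons and the canonical GT/AG signals are assumptions for illustration, not the script's option values.

STOP_CODONS = ("TAG", "TAA", "TGA")

def check_intron(sequence, left_signals=("GT",), right_signals=("AG",)):
    """return (nstops, ngaps, left_signal, right_signal) for a putative intron."""
    left_signal = any([sequence.startswith(s) for s in left_signals])
    right_signal = any([sequence.endswith(s) for s in right_signals])
    nstops, ngaps = 0, 0
    for x in range(0, len(sequence), 3):
        codon = sequence[x:x + 3]
        if codon in STOP_CODONS: nstops += 1
        if "N" in codon.upper(): ngaps += 1
    return nstops, ngaps, left_signal, right_signal

# a 9-base "intron" with canonical GT..AG signals and one in-frame stop
print check_intron("GTTAACTAG")   # -> (1, 0, True, True)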
Example n. 16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="master sequence.")

    parser.add_option("-p",
                      "--master-pattern",
                      dest="master_pattern",
                      type="string",
                      help="master pattern.")

    parser.add_option("--master-species",
                      dest="master_species",
                      type="string",
                      help="species to use as master sequences.")

    parser.add_option("-t",
                      "--translate",
                      dest="filename_translation",
                      type="string",
                      help="filename on where to store translated sequences.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename on where to exon information.")

    parser.add_option("-c",
                      "--mark-codons",
                      dest="mark_codons",
                      action="store_true",
                      help="mark codons.")

    parser.add_option(
        "-i",
        "--ignore-case",
        dest="ignore_case",
        action="store_true",
        help="ignore case (otherwise: lowercase are unaligned chars).")

    parser.add_option("--remove-stops",
                      dest="remove_stops",
                      action="store_true",
                      help="remove stop codons.")

    parser.add_option("--mask-stops",
                      dest="mask_stops",
                      action="store_true",
                      help="mask stop codons.")

    parser.add_option("--mask-char",
                      dest="mask_char",
                      type="string",
                      help="masking character to use.")

    parser.add_option("-f",
                      "--remove-frameshifts",
                      dest="remove_frameshifts",
                      action="store_true",
                      help="remove columns corresponding to frameshifts.")

    parser.add_option(
        "--mask-master",
        dest="mask_master",
        action="store_true",
        help=
        "columns in master to be removed are masked to keep residue numbering."
    )

    parser.add_option(
        "-s",
        "--split-exons",
        dest="split_exons",
        action="store_true",
        help="split columns aligned to different exons in the same gene.")

    parser.add_option("-a",
                      "--target",
                      dest="target",
                      type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True

        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n"
                % (options.mask_char))

    ## 1. read multiple alignment in fasta format
    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        raise "empty multiple alignment"

    identifiers = mali.getIdentifiers()

    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for id in identifiers:
            if re.search(options.master_pattern, id):
                masters.append(id)
    elif options.master_species:
        for id in identifiers:
            if options.master_species == id.split(options.separator)[0]:
                masters.append(id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         filter=set(identifiers),
                                         from_zero=True)

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    #################################################################################
    #################################################################################
    #################################################################################
    ## translate characters to upper/lower case according to exon info.
    #################################################################################
    if exons:
        for id in identifiers:
            if id in exons:
                mali.getSequence(id).mString = AddExonInformation(
                    mali[id], exons[id], mask_char=options.mask_char)

    elif options.ignore_case:
        ## convert all to uppercase
        mali.upper()

    #################################################################################
    #################################################################################
    #################################################################################
    ## untangle misaligned exons
    #################################################################################
    if exons and options.split_exons:

        ## first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                mali.writeToFile(open("log_mali1", "w"), format="fasta")

        SplitExons(mali, exons, options)

    #################################################################################
    #################################################################################
    #################################################################################
    ## remove frameshifts
    #################################################################################
    if options.remove_frameshifts:
        out_of_frame_columns = []
        if len(masters) == 1:

            frame_columns = GetFrameColumns(mali,
                                            masters[0],
                                            gap_chars=options.gap_chars)

        else:

            columns = []

            for id in masters:
                columns += GetFrameColumns(mali,
                                           id,
                                           gap_chars=options.gap_chars)

            if len(columns) == 0:
                columns += GetFrameColumns(mali,
                                           identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all columns by tuple. The "shortest" codon will be first: (1,2,3) before (1,2,100),
            # and (1,2,100) before (1,3,4).
            columns.sort(lambda x, y: cmp((x[0], x[2]), (y[0], y[2])))

            # select codons
            frame_columns = []
            last_codon = columns[0]

            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon: continue

                # take first (shortest) codon in case of identical first residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    out_of_frame_columns += last_codon

                # if overlapping, but out of register: skip
                last_codon = codon

            frame_columns.append(last_codon)

        # build set of skipped columns
        frame_set = set()
        for column in frame_columns:
            for c in column:
                frame_set.add(c)

        # columns that contain a master sequence that is out of
        # frame
        out_of_frame_set = set(out_of_frame_columns)
        out_of_frame_set = out_of_frame_set.difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

            if options.loglevel >= 5:
                options.stdlog.write("# frame columns: %i\n" %
                                     (len(frame_columns)))
                x = 0
                for column in frame_columns:
                    options.stdlog.write("# %i\t%s\n" %
                                         (x, ",".join(map(str, column))))
                    x += 1

            if options.loglevel >= 5:
                options.stdlog.write(
                    "# out-of-frame columns with residues of masters: %i\n" %
                    (len(out_of_frame_set)))
                options.stdlog.write("# %s\n" %
                                     ",".join(map(str, out_of_frame_columns)))

        mask_chars = (string.upper(options.mask_char),
                      string.lower(options.mask_char))

        to_delete = []

        ignore_case = exons or options.ignore_case

        for id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (id, len(sequence)))

            ## treat masters differently if they are only to be masked, not
            ## pruned:
            ## simply mask all characters that are to be skipped
            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            codon = []
            chars = []

            is_master = id in masters

            for x in range(len(sequence)):
                c = sequence[x]

                ## delete columns that do not align to
                ## a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)
                if len(codon) % 3 == 0:
                    codon = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon, options)

                    if codon_is_aligned: naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            ## mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))


##             else:

##                 for a,b,c in frame_columns:

##                     codon = sequence[a] + sequence[b] + sequence[c]

##                     codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon( codon, options )

##                     if codon_is_aligned: naligned += 1

##                     if codon_is_all_gaps:
##                         fragments.append( options.gap_char * 3 )
##                         ngaps += 1
##                     elif codon_is_ok:
##                         ncodons += 1
##                         if string.upper(codon) in ("TAG", "TAA", "TGA"):
##                             if options.remove_stops:
##                                 fragments.append( options.gap_char * 3 )
##                             elif options.mask_stops:
##                                 fragments.append( options.mask_char * 3 )
##                             else:
##                                 fragments.append( codon )
##                             nstops += 1
##                         else:
##                             fragments.append( codon )
##                     else:
##                         fragments.append( options.gap_char * 3 )
##                         nmasked += 1

##                     if options.loglevel >= 7:
##                         options.stdlog.write("# %s: %i,%i,%i: codon=%s ok=%s is_aligned=%s\n" % (id,
##                                                                                                  a,b,c,
##                                                                                                  codon,
##                                                                                                  str(codon_is_ok),
##                                                                                                  str(codon_is_aligned) ))

            s = string.join(fragments, "")
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n"
                    % (id, len(fragments), naligned, ncodons, nstops, ngaps,
                       nmasked))
                options.stdlog.flush()

            ## postpone deletion in order not to
            ## confuse the iteration over ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n"
                    % id)
                to_delete.append(id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n"
                    % id)
                to_delete.append(id)
            else:
                mali.setSequence(id, s)

        for id in to_delete:
            del mali[id]

    for id in identifiers:
        if options.mark_codons:
            a = mali[id]
            f = lambda x: a[x:x + 3]
            s = string.join([f(x) for x in range(0, len(a), 3)], " ")
        else:
            s = mali[id]
        options.stdout.write(">%s\n%s\n" % (id, s))

    if options.filename_translation:
        outfile = open(options.filename_translation, "w")
        for id in mali.keys():
            outfile.write(">%s\n%s\n" %
                          (id, Genomics.TranslateDNA2Protein(mali[id])))
        outfile.close()

    E.Stop()
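The pruning loop above collects the aligned columns of one codon in `chars` and, when the codon fails `checkCodon` or is a stop, masks every non-gap character while leaving gap characters untouched, so column counts are preserved. A reduced sketch of that rule; the gap and mask characters mirror the script's defaults but are hard-coded assumptions here:

def mask_codon_chars(chars, gap_chars="-.", mask_char="n"):
    """mask the non-gap characters collected for one codon's columns."""
    return [mask_char if c not in gap_chars else c for c in chars]

print "".join(mask_codon_chars(list("t-ag")))   # -> "n-nn"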
Example n. 17
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement %
                   (param_tablename_predictions_target, reference.mSbjctToken,
                    reference.mSbjctStrand, reference.mSbjctGenomeFrom,
                    reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0, reference.mSbjctFrom)

        for line2 in ct.fetchall():
            target = PredictionParser.PredictionParserEntry()
            target.FillFromTable(line2)

            target_exons = Exons.Alignment2Exons(target.mMapPeptide2Genome, 0,
                                                 target.mSbjctFrom)

            ## check for exon overlap
            rr, tt = 0, 0
            overlap = 0
            while rr < len(reference_exons) and tt < len(target_exons):

                r = reference_exons[rr]
                t = target_exons[tt]
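The fragment above breaks off inside its two-pointer walk over reference and target exons. A minimal sketch of that pattern, assuming exons are (start, end) tuples sorted by start coordinate; this illustrates the idiom only and does not reconstruct the missing code:

def count_overlapping_exons(reference_exons, target_exons):
    """count overlapping exon pairs with a two-pointer walk over sorted exons."""
    rr, tt, overlap = 0, 0, 0
    while rr < len(reference_exons) and tt < len(target_exons):
        r_from, r_to = reference_exons[rr]
        t_from, t_to = target_exons[tt]
        if min(r_to, t_to) - max(r_from, t_from) > 0:
            overlap += 1
        # advance whichever exon ends first
        if r_to < t_to:
            rr += 1
        else:
            tt += 1
    return overlap

print count_overlapping_exons([(0, 100), (200, 300)], [(50, 120), (400, 500)])  # -> 1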
Example n. 18
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-o", "--overlap", dest="overlap_residues", type="int",
                       help="overlap residues.")
    parser.add_option( "-t", "--filter-tokens", dest="filename_filter_tokens", type="string",
                       help="filename to filter tokens." )
    parser.add_option( "-i", "--exon-identity", dest="exon_identity", action="store_true",
                       help="exon identity." )
    parser.add_option( "--exons", dest="filename_exons", type="string",
                       help="filename with exon information." )
    parser.add_option( "-m", "--output-members", dest="filename_members", type="string",
                       help="output filename with members." )
    parser.add_option( "--overlap-id", dest="overlap_id", action="store_true",
                       help="overlap id." )
    parser.add_option( "-s", "--remove-spanning", dest="remove_spanning_predictions", action="store_true",
                       help="remove spanning predictions." )
    parser.add_option( "-c", "--remove-complement", dest="remove_complementary_predictions", action="store_true",
                       help="remove complementary predictions." )
    parser.add_option( "--remove-exon-swoppers", dest="remove_exon_swoppers", action="store_true",
                       help="remove exon swoppers." )
    parser.add_option( "--remove-gene-spanners", dest="remove_gene_spanners", action="store_true",
                       help="remove gene spanners." )
    parser.add_option( "--remove-suboptimal", dest="remove_suboptimal", action="store_true",
                       help="remove suboptimal predictions." )
    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide information." )
    parser.add_option( "--extended-peptides", dest="filename_extended_peptides", type="string",
                       help="filename with peptide information - after extension." )
    
    parser.add_option( "--test", dest="test_nids", type="string",
                       help="test nids." )
    ## filter options
    parser.add_option( "--filter-transcripts", dest="filter_filename_transcripts", type="string",
                       help="filename with transcripts that are used to filter." )
    parser.add_option( "--filter-remove-spanning", dest="filter_remove_spanning", action="store_true",
                       help="remove all transcripts that span the filter set." )
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )
    parser.add_option( "--discard-large-clusters", dest="discard_large_clusters", type="int",
                       help="if set discard clusters bigger than this size (patch) [default=%default]." )
    

    parser.set_defaults(
        filename_members = None,
        filename_peptides = None,
        filename_extended_peptides = None,
        filename_exons = None,
        quality_hierarchy = ("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK" ),
        ## Classes where redundancy is removed by similarity. When exon structure
        ## is not conserved, alternative splice variants can not be predicted, so
        ## the redundancy is removed.
        quality_exclude_same = ( "UG", "UP", "UF", "BF", "UK" ),
        quality_genes = ("CG", "SG", "PG", "RG", "UG"),
        ## classes that can be removed in spanning/complementary predictions
        quality_remove_dubious = ( "UG", "UP", "UF", "BF", "UK" ),
        ## classes that are required for defining an exon swopper event
        quality_remove_exon_swopper = ("CG", "PG"),
        ## classes that will be kept, in spite of being exon swoppers
        quality_keep_exon_swopper = (),
        ## class that is required for removing gene spanners
        ## (note the trailing comma: a one-element tuple, not a string)
        quality_remove_gene_spanners = ("CG",),
        ## classes that will be kept, in spite of being gene spanners
        quality_keep_gene_spanners = (),
        ## class that is required for defining suboptimal matches
        quality_remove_suboptimal = ("CG", "PG" ),
        ## class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal = (),
        ## gap penalties
        gop = -10.0,
        gep = -1.0,
        ## maximum number of gaps to allow in alignment
        max_gaps = 20,
        ## threshold of percent identity that allows removing a prediction
        ## of a lower class.
        ## This allows for insertions/deletions.
        min_identity = 98,
        ## threshold of percent identity that allows removing a prediction
        ## of a non-gene by a gene.
        min_identity_non_genes = 80,
        ## safety threshold: do not remove if the member's coverage is
        ## better than the representative's by more than x
        safety_pide = 10,
        safety_coverage = 10,
        overlap_id = False,
        remove_spanning_predictions = False,
        remove_exon_swoppers = False,
        remove_gene_spanners = False,
        remove_suboptimal = False,
        ## nids to use for testing
        test_nids = None,
        ## remove members with less than maximum coverage
        max_member_coverage = 90,
        ## maximum allowable exon slippage
        max_slippage = 9,
        ## minimum difference in identity for suboptimal predictions to be removed.
        suboptimal_min_identity_difference = 10,
        ## filter options
        filter_filename_transcripts = None,
        filter_remove_spanning = True,
        filter_remove_spanning_both_strands = True,
        genome_file = None,
        discard_large_clusters = None )
    
    (options, args) = E.Start( parser, add_psql_options = True )    

    if options.test_nids: options.test_nids = options.test_nids.split(",")

    # list of eliminated predictions
    eliminated_predictions = {}
    
    if options.filename_members:
        outfile_members = open( options.filename_members, "w" )
    else:
        outfile_members = sys.stdout

    ######################################################
    ######################################################
    ######################################################        
    # data
    ######################################################    
    data = []

    class Entry:
        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int( gff['xstart'] )
            self.mExtendedEnd = int( gff['xend'] )
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']
            
    for gff in GTF.iterator( sys.stdin ):
        data.append( Entry(gff) )

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i transcripts.\n" % len(data) )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################        
    # read peptide sequences
    ######################################################    
    if options.loglevel >= 1:
        options.stdlog.write( "# loading peptide databases ... " )
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta( options.filename_peptides )
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    ######################################################
    ######################################################        
    # read extended peptide sequences
    ######################################################    
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta( options.filename_extended_peptides )
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write( "finished\n" )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################        
    ## open genome file
    ######################################################        
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ######################################################
    ######################################################        
    ## reading exons, clustering and formatting them.
    ######################################################        
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries ... " )
            options.stdlog.flush()
            
        ids = [ x.transcript_id for x in data ] 

        exons = Exons.ReadExonBoundaries( open( options.filename_exons, "r"),
                                          contig_sizes = contig_sizes,
                                          filter = set(ids) )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(exons) ))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")
            
        # flag terminal exons
        Exons.SetRankToPositionFlag( exons )

        identity_map_cluster2transcripts, identity_map_transcript2cluster =\
                                          Exons.ClusterByExonIdentity( exons,
                                                                       max_terminal_num_exons = 3,
                                                                       max_slippage= options.max_slippage,
                                                                       loglevel = options.loglevel )

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster =\
                                         Exons.ClusterByExonOverlap( exons,
                                                                     min_overlap = 10,
                                                                     loglevel = options.loglevel )
    else:
        exons = {}

    ######################################################        
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ######################################################
    ######################################################
    ## read filter transcripts and apply filters
    ######################################################        
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries for filter set ... " )
            options.stdlog.flush()
            
        filter_exons = Exons.ReadExonBoundaries( open( options.filter_filename_transcripts, "r" ),
                                                 delete_missing = True,
                                                 contig_sizes = contig_sizes )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(filter_exons)) )
        
        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts( exons,
                                                            filter_exons,
                                                            eliminated_predictions,
                                                            contig_sizes,
                                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n" % (n, time.time()-t ))
            options.stdlog.flush()

    if options.remove_exon_swoppers and not exons:
        raise ValueError( "please specify exon table if using --remove-swoppers." )
    if options.remove_gene_spanners and not exons:
        raise ValueError( "please specify exon table if using --remove-gene-spanners." )

    ########################################################################################
    ## remove predictions that span other predictions but do not overlap with them on the exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# removing gene spanners\n" )
            options.stdlog.flush()
            
        t = time.time()
        eliminated = EliminateGeneSpanners( data,
                                            eliminated_predictions,
                                            exons,
                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i gene spanners in %i seconds\n" % (n, time.time()-t ))
            options.stdlog.flush()
            
    ########################################################################################
    ## sort data by quality, length of prediction and coverage * pid

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data\n" )
        options.stdlog.flush()

    map2pos = {}
    for x in range(len(options.quality_hierarchy)):
        map2pos[options.quality_hierarchy[x]] = x

    data.sort( key = lambda x: (map2pos[x.mQuality], len(extended_peptides[x.transcript_id]), x.mQueryCoverage * x.mPid ) )

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data finished\n" )
        options.stdlog.flush()

    ########################################################################################
    ## remove predictions joining two other complete non-overlapping predictions
    if options.remove_exon_swoppers and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing exon swoppers\n" )
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers( data,
                                            eliminated_predictions,
                                            identity_map_transcript2cluster,
                                            identity_map_cluster2transcripts,
                                            map_prediction2data,
                                            exons,
                                            options )
        
        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i exon swoppers\n" % n )
            options.stdlog.flush()

    ########################################################################################
    ## remove suboptimal predictions
    if options.remove_suboptimal and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing suboptimal predictions\n" )
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions( data,
                                                     eliminated_predictions,
                                                     overlap_map_transcript2cluster,
                                                     overlap_map_cluster2transcripts,
                                                     map_prediction2data,
                                                     exons,
                                                     options )
        
        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i suboptimal predictions in %i seconds\n" % (n, time.time()-t) )
            options.stdlog.flush()        

    ########################################################################################
    ## remove redundant predictions
    l = len(data)
        
    options.report_step = max(1, int(l / 100))

    t2= time.time()

    last_quality = None
    qualities = []

    options.stdout.write( "%s\t%s\n" % ("rep", "comment") )
    
    for x in range(len(data)):

        if options.loglevel >= 1:
            if x % options.report_step  == 0:
                options.stdlog.write( "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" % \
                                      (x+1, l,
                                       int(100 * (x+1) / l),
                                       len(eliminated_predictions), l,
                                       100 * len(eliminated_predictions) / l,
                                       time.time() - t2 ) )
                                                                    
                options.stdlog.flush()
                
        rep = data[x]

        rep_id, rep_quality = rep.transcript_id, rep.mQuality
        
        if rep_id in eliminated_predictions: continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append( last_quality )
            last_quality = rep_quality
        
        if options.loglevel >= 2:
            options.stdlog.write( "# processing prediction %s|%s\n" % (rep_id, rep_quality) )
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            eliminated += EliminateRedundantEntriesByOverlap( rep,
                                                              data[x+1:],
                                                              eliminated_predictions,
                                                              options,
                                                              peptides, 
                                                              extended_peptides,
                                                              filter_quality = qualities,
                                                              this_quality = rep_quality )
                                                              
        else:
            eliminated += EliminateRedundantEntriesByRange( rep,
                                                            data,
                                                            eliminated_predictions,
                                                            options,
                                                            peptides, 
                                                            extended_peptides,
                                                            filter_quality = qualities,
                                                            this_quality = rep_quality )

        options.stdout.write( "%s\t%i\n" % (rep_id, len(eliminated)) )

        if outfile_members:
            outfile_members.write( "%s\t%s\tm\n" % (str(rep_id), str(rep_id)))
            nrepresentatives += 1
            nmembers += PrintMembers( rep_id, outfile_members, eliminated, eliminated_by_method )            

    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write( "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %\
                          (nrepresentatives, nmembers, neliminated,
                           nrepresentatives+nmembers+neliminated ) )
    
    options.stdlog.write( "# elimination by method:\n" )
    
    for v,c in eliminated_by_method.items():
        options.stdlog.write( "# method=%s, count=%i\n" % (v, c) )

    E.Stop()
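One detail worth noting in the defaults above: quality_remove_gene_spanners needs its trailing comma. Without it, ("CG") is just the string "CG", and the `in` tests used when eliminating gene spanners silently become substring matches. A two-line demonstration:

print "C" in ("CG")     # True  -- ("CG") is a plain string, so this is a substring test
print "C" in ("CG",)    # False -- one-element tuple, exact membership only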
Example n. 19
def CheckExonSwop( rep_id,
                   exons,
                   eliminated_predictions,
                   other_ids,
                   map_prediction2data,
                   options ):
    """check for exon swop

    return true, if exon swop occurs.

    Exon swop occurs, if this prediction joins
    two predictions, one of which should be CG.

    None of the predictions should be fully contained
    in the master prediction.

    given:
        the rep_id to analyzse
        a map of rep_id to exons
        a list of rep_ids to check against

    -> is it an exon swopper?
      -> joining two CG predictions that do not overlap and
         contain no extra exons apart from the overlapping.
    -> is it large spanning prediction?
      -> spanning many predictions, including at least one CG?
    
    """
    overlaps = []
    ## get predictions which overlap by exons (but not completely):
    
    for id in other_ids:
        if id == rep_id: continue
        if id in eliminated_predictions: continue
        if Exons.CheckOverlap( exons[rep_id], exons[id]) and \
               not Exons.CheckCoverage( exons[rep_id],
                                        exons[id],
                                        max_slippage=options.max_slippage ):
            overlaps.append( id )

    if options.loglevel >= 3:
        options.stdlog.write( "# exon swop: %s overlaps with %i out of %i predictions\n" % (rep_id, len(overlaps), len(other_ids) ) )
        options.stdlog.flush()
            
    for x in range(0, len(overlaps)-1):
        id1 = overlaps[x]
        for y in range(x+1, len(overlaps)):
            id2 = overlaps[y]
            if options.loglevel >= 4:
                options.stdlog.write( "# exon swop: %s ? %s + %s: %s %s %s %s\n" % \
                      (rep_id, id1, id2, 
                       map_prediction2data[id1].mQuality in options.quality_remove_exon_swopper,
                       map_prediction2data[id2].mQuality in options.quality_remove_exon_swopper,
                       not Exons.CheckOverlap( exons[id1], exons[id2] ),
                       Exons.CheckCoverageAinB( exons[rep_id], exons[id1] + exons[id2],
                                                min_terminal_num_exons = 0,
                                                min_terminal_exon_coverage = 0.7,
                                                max_slippage = options.max_slippage ) ) )
                
            if (map_prediction2data[id1].mQuality in options.quality_remove_exon_swopper and \
                map_prediction2data[id2].mQuality in options.quality_remove_exon_swopper ) and \
                not Exons.CheckOverlap( exons[id1], exons[id2] ) and \
                Exons.CheckCoverageAinB( exons[rep_id], exons[id1] + exons[id2],
                                         min_terminal_num_exons = 0,
                                         min_terminal_exon_coverage = 0.7,
                                         max_slippage = options.max_slippage ):
                if options.loglevel >= 1:
                    options.stdlog.write( "# elimination: %s(%s) joins %s(%s) and %s(%s)\n" % \
                          (rep_id, map_prediction2data[rep_id].mQuality,
                           id1, map_prediction2data[id1].mQuality,
                           id2, map_prediction2data[id2].mQuality) )
                return True
            
    return False
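A toy restatement of the join criterion described in the docstring above, with plain (start, end) intervals standing in for exon lists; overlap() here is an assumption replacing Exons.CheckOverlap, and the coverage and quality checks are omitted:

def overlap(a, b):
    return min(a[1], b[1]) - max(a[0], b[0]) > 0

def joins(master, first, second):
    """master joins first and second if it overlaps both while they
    do not overlap each other."""
    return overlap(master, first) and overlap(master, second) \
        and not overlap(first, second)

print joins((0, 100), (0, 40), (60, 100))   # True: spans two disjoint predictions
print joins((0, 100), (0, 40), (30, 100))   # False: the two predictions overlap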
Example n. 20
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b",
                      "--boundaries",
                      dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exons (output).")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")

    parser.add_option(
        "-w",
        "--write-notfound",
        dest="write_notfound",
        action="store_true",
        help="print exons for predictions not found in reference.")

    parser.add_option("-q",
                      "--quality-pide",
                      dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary=9,
        ## stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print globals()["__doc__"], "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(open(
            options.filename_boundaries, "r"),
                                                             do_invert=1,
                                                             remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join(
            ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
             "reference_id", "reference_from", "reference_to",
             "reference_phase", "pidentity", "psimilarity", "nframeshifts",
             "ngaps", "nstopcodons", "is_ok", "genome_exon_from",
             "genome_exon_to")))

    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    options.stdout.write("%s\n" % "\t".join(
        ("prediction_id", "number", "dubious_exons", "boundaries_sum",
         "boundaries_max", "identical_exons", "inserted_exons",
         "deleted_exons", "inserted_introns", "deleted_introns",
         "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons",
         "deleted_Cexons", "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#": continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if peptide_sequences.has_key(entry.mQueryToken):

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken, query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken, sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            E.debug(
                "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f"
                %
                (str(entry.mPredictionId), entry.mPercentSimilarity,
                 entry.mPercentIdentity, percent_similarity, percent_identity))

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key(entry.mQueryToken):
            print "# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken)
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            in_sync = 0
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write("# current exons: %i and %i\n" %
                                         (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[
                    e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment(tmp_ali,
                                                entry.mMapPeptide2Translation,
                                                xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write(
                            "# WARNING: empty alignment %s\n" % str(
                                (ref_from, exon_from, ref_to, exon_to,
                                 xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity(
                            tmp_ali, query_sequence, sbjct_sequence) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity(
                            tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                if e < len(exons) - 1:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = exons[e + 1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (
                        ref_exons[r + 1].mPeptideFrom,
                        ref_exons[r + 1].mPeptideTo, ref_exons[r + 1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken, exon_from, exon_to,
                                  exon_phase, exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons:
                # if the exon is shorter than options.slipping_exon_boundary, the boundary is 0.
                # check if the end is more than options.slipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary
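                # e.g. with slipping_exon_boundary=9: a 7-residue exon gets
                # boundary=0 (no slippage allowance in the overlap tests
                # below), while a 30-residue exon tolerates up to 9 residues.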

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                         exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max(ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation,
                            xquery_from, xquery_to)

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to,
                                     xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str(
                                    alignlib_lite.AlignmentFormatExplicit(
                                        tmp_ali, query_sequence,
                                        sbjct_sequence))

                            percent_identity = alignlib_lite.calculatePercentIdentity(
                                tmp_ali, query_sequence, sbjct_sequence) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom -
                                                       1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto <= (
                                entry.mQueryLength -
                                entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and dto <= (
                            entry.mQueryLength -
                            entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon: nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and \
                            next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon: ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and \
                            next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon: ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom, exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment,
                            border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                this_r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                is_good_exon,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[
                    e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1
                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                0,
                                0,
                                0,
                                0,
                                r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                            )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[
                        0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali, genomic_fragment)

                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

        options.stdout.write("\t".join(
            map(str, (entry.mPredictionId, exons_num_exons, dubious_exons,
                      exons_boundaries_sum, exons_boundaries_max,
                      nidentical_exons, ninserted_exons, ndeleted_exons,
                      ninserted_introns, ndeleted_introns,
                      truncated_Nterminal_exon, truncated_Cterminal_exon,
                      ndeleted_Nexons, ndeleted_Cexons, ninserted_Nexons,
                      ninserted_Cexons))) + "\n")
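
A minimal sketch of the two-pointer exon walk used above, with a
hypothetical helper name and plain (from, to) tuples instead of the
full prediction records; the real code additionally tracks phases,
percent identity and terminal-exon adjustments:

def compare_exon_lists(exons, ref_exons, boundary=9):
    """classify exon pairs as identical / inserted / deleted.

    `exons` and `ref_exons` are sorted lists of (from, to) tuples in
    the same coordinate system; `boundary` is the slippage tolerance.
    """
    e, r = 0, 0
    identical, inserted, deleted = 0, 0, 0
    while e < len(exons) and r < len(ref_exons):
        efrom, eto = exons[e]
        rfrom, rto = ref_exons[r]
        if rto <= efrom:
            # reference exon ends before ours starts: deleted exon
            deleted += 1
            r += 1
        elif eto <= rfrom:
            # our exon ends before the reference starts: inserted exon
            inserted += 1
            e += 1
        else:
            # overlap: identical if both boundaries are within tolerance
            if abs(efrom - rfrom) <= boundary and abs(eto - rto) <= boundary:
                identical += 1
            e += 1
            r += 1
    # remaining exons on either side are terminal insertions/deletions
    inserted += len(exons) - e
    deleted += len(ref_exons) - r
    return identical, inserted, deleted

print compare_exon_lists([(0, 50), (50, 90)], [(0, 52), (60, 90)])
# -> (1, 0, 0): the first pair matches, the second slips too far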
Esempio n. 21
0
    elif param_filename_contigs:
        # read contigs
        contig_sizes = Genomics.ReadContigSizes(
            open(param_filename_contigs, "r"))
        delete_missing = True
    else:
        contig_sizes = {"dummy": 1000000000}
        delete_missing = False

    if param_loglevel >= 1:
        print "# read %i peptide sequences" % len(peptide_sequences)
        sys.stdout.flush()

    exons = Exons.ReadExonBoundaries(
        sys.stdin,
        contig_sizes=contig_sizes,
        delete_missing=delete_missing,
    )

    if param_loglevel >= 1:
        print "# read exon information for %i transcripts" % len(exons)
        sys.stdout.flush()

    if len(exons) == 0:
        raise IOError("no exons in exon list.")

    Exons.SetRankToPositionFlag(exons)

    if param_use_genome_length:
        lengths = Exons.GetGenomeLengths(exons)
    else:
Esempio n. 22
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--output-filename-summary",
                      dest="output_filename_summary",
                      type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header",
                      dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns",
        dest="fill_introns",
        type="int",
        help="fill introns that are divisible by three and free of stop "
        "codons, up to a maximum length of #.")

    parser.add_option(
        "--introns-max-stops",
        dest="introns_max_stops",
        type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        start_codons=("ATG"),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength(p.mSbjctToken)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        new_exons = []

        last_e = exons[0]

        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                ## note that e.mAlignment can sometimes be empty. This might
                ## be an exonerate bug. In the alignment string there are two
                ## consecutive exons.
                if e.mAlignment and last_e.mAlignment and \
                        e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo -
                                            offset_left:e.mGenomeFrom +
                                            offset_right]

                intron_nstops = 0
                for codon in [
                        sequence[x:x + 3] for x in range(0, len(sequence), 3)
                ]:
                    if codon in options.stop_codons:
                        intron_nstops += 1
            else:
                intron_nstops = 0

            ## check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                context5 = genomic_sequence[
                    last_e.mGenomeTo - 6:last_e.mGenomeTo].lower()
                context3 = genomic_sequence[
                    e.mGenomeFrom:e.mGenomeFrom + 6].lower()
                options.stdlog.write("\t".join(map(str, (
                    p.mPredictionId, nintron, lintron, intron_nstops,
                    intron_type,
                    context5 + "|" + sequence[:5] + "..." +
                    sequence[-5:] + "|" + context3))) + "\n")

            options.stdout.write("\t".join(
                map(str, (p.mPredictionId, nintron, p.mSbjctToken,
                          p.mSbjctStrand,
                          last_e.mGenomeTo + p.mSbjctGenomeFrom,
                          e.mGenomeFrom + p.mSbjctGenomeFrom, lintron,
                          intron_nstops, intron_type, prime5, prime3))) + "\n")

            last_e = e

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" % (\
            ninput, noutput))

    E.Stop()
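
The intron checks above combine in-frame stop counting with splice
signal typing via Genomics.GetIntronType. A simplified, self-contained
sketch of both ideas (naive_intron_type is a hypothetical stand-in;
the real splice-site classification is more involved):

def count_inframe_stops(sequence, stop_codons=("TAG", "TAA", "TGA")):
    """count stop codons reading `sequence` in frame 0."""
    return sum(1 for x in range(0, len(sequence) - 2, 3)
               if sequence[x:x + 3].upper() in stop_codons)

def naive_intron_type(sequence):
    """classify an intron by its terminal dinucleotides: canonical
    GT..AG, minor GC..AG / AT..AC, anything else unknown."""
    prime5, prime3 = sequence[:2].upper(), sequence[-2:].upper()
    if (prime5, prime3) == ("GT", "AG"):
        return "gt-ag", prime5, prime3
    elif (prime5, prime3) in (("GC", "AG"), ("AT", "AC")):
        return "minor", prime5, prime3
    return "unknown", prime5, prime3

print count_inframe_stops("ATGTAACCCTGA")     # -> 2
print naive_intron_type("GTAAGTTTCAG")        # -> ('gt-ag', 'GT', 'AG')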
Esempio n. 23
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # assumed: the parser construction is absent in the source; a minimal
    # E.OptionParser following the pattern of the other examples:
    parser = E.OptionParser(usage=globals()["__doc__"])

    parser.add_option("-q",
                      "--quality",
                      dest="quality",
                      type="string",
                      help="quality categories to take into account.")
    parser.add_option("-f",
                      "--format=",
                      dest="format",
                      type="string",
                      help="input format [exons|gff|table]")

    parser.add_option("-e",
                      "--exons=",
                      dest="tablename_exons",
                      type="string",
                      help="table name with exons.")
    parser.add_option("-p",
                      "--predictions=",
                      dest="tablename_predictions",
                      type="string",
                      help="table name with predictions.")
    parser.add_option("-n",
                      "--non-redundant",
                      dest="non_redundant",
                      action="store_true",
                      help="only non-redundant predictions.")
    parser.add_option("-s",
                      "--schema",
                      dest="schema",
                      type="string",
                      help="schema to use.")

    parser.set_defaults(
        fields=[
            "Id", "NumExons", "GeneLength", "MinExonLength", "MaxExonLength",
            "MinIntronLength", "MaxIntronLength"
        ],
        tablename_exons="exons",
        tablename_predictions="predictions",
        quality=None,
        non_redundant=False,
        schema=None,
        tablename_redundant="redundant",
        tablename_quality="quality",
        format="exons",
    )

    (options, args) = E.Start(parser,
                              add_csv_options=True,
                              add_psql_options=True)

    if options.quality:
        options.quality = options.quality.split(",")

    if options.format == "table":
        dbhandle = pgdb.connect(options.psql_connection)
        exons = Exons.GetExonBoundariesFromTable(
            dbhandle,
            options.tablename_predictions,
            options.tablename_exons,
            non_redundant_filter=options.non_redundant,
            quality_filter=options.quality,
            table_name_quality=options.tablename_quality,
            table_name_redundant=options.tablename_redundant,
            schema=options.schema)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    stats = Exons.CalculateStats(exons)

    print "\t".join(options.fields)

    writer = csv.DictWriter(sys.stdout,
                            options.fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    for k, v in stats.items():
        v["Id"] = k
        writer.writerow(v)

    E.Stop()
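
Exons.CalculateStats is used above as a black box that returns one
record per transcript keyed by the field names from the defaults. A
rough sketch of such a computation (one plausible definition per
field; the exon attribute names follow the other examples, and the
real routine may define lengths differently):

def calculate_exon_stats(exons):
    """per-transcript summary over exon boundaries.

    `exons` maps a transcript id to a sorted list of exon objects with
    mPeptideFrom/mPeptideTo and mGenomeFrom/mGenomeTo attributes.
    """
    stats = {}
    for id, ee in exons.items():
        lengths = [e.mPeptideTo - e.mPeptideFrom for e in ee]
        introns = [ee[i + 1].mGenomeFrom - ee[i].mGenomeTo
                   for i in range(len(ee) - 1)]
        stats[id] = {
            "NumExons": len(ee),
            "GeneLength": ee[-1].mGenomeTo - ee[0].mGenomeFrom,
            "MinExonLength": min(lengths),
            "MaxExonLength": max(lengths),
            "MinIntronLength": min(introns) if introns else 0,
            "MaxIntronLength": max(introns) if introns else 0,
        }
    return stats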
Esempio n. 24
0
def WriteGeneStructureCorrespondence(mali, identifiers, exons, param_master_pattern, gap_char="-", prefix=""):
    """split multiple alignment into clusters of orthologous transcripts.

    Orthologous transcripts are defined by similarity of gene structure to
    query sequences.

    Also returns a matrix of gene structure compatibility, computed as
    the ratio of missed exon boundaries to total exon boundaries:

    0   : perfect compatibility (exact match)
    100 : no compatibility
    """

    if len(identifiers) == 0:
        return

    wmali = len(identifiers)
    lmali = len(mali[identifiers[0]])

    matrix_compatibility = numpy.zeros((wmali, wmali))

    nok = 0
    nperfect = 0

    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    ref_nok = 0
    ref_nperfect = 0

    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    rx = re.compile(param_master_pattern)

    # list of number of exons
    anexons = []

    ## exons in reference
    ref_nexons = 0
    x = 0
    for key1 in identifiers:

        seq = mali[key1]

        matches = []
        unassigned = []

        is_perfect = False

        anexons.append(len(exons[key1]))
        if rx.search(key1):
            ref_nexons = len(exons[key1])

        y = 0
        for key2 in identifiers:

            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print "#############################################"
                print "# comparing %s to %s" % (key1, key2)

            mref = 0
            mcmp = 0

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            # map exon boundaries to reference sequence
            cmp_exons = []

            if param_loglevel >= 5:
                print alignlib_lite.py_writeAlignataTable(map_cmp2ref)

            for e in exons[key1]:
                ne = e.GetCopy()
                ne.mPeptideFrom = MyMap(map_cmp2ref, e.mPeptideFrom + 1, 3, -1)
                ne.mPeptideTo = MyMap(map_cmp2ref, e.mPeptideTo, 3, 0)
                cmp_exons.append(ne)

            # massage boundaries for terminal exons:
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for e in exons[key1]:
                    print "# exon", str(e)

            if param_loglevel >= 3:
                for e in cmp_exons:
                    print "# exon", str(e)
                for e in ref_exons:
                    print "# exon", str(e)

            # do exon comparison
            comparison = Exons.CompareGeneStructures(cmp_exons,
                                                     ref_exons,
                                                     threshold_min_pide=0,
                                                     threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary)

            if param_loglevel >= 3:
                print comparison.Pretty(prefix="# EVAL: ")

            # analyse results
            min_nexons = min(len(cmp_exons), len(ref_exons))
            max_nexons = max(len(cmp_exons), len(ref_exons))

            similarity = (max_nexons - comparison.mNumIdenticalExons) * \
                (abs(comparison.mNumDifferenceExons))

            is_perfect = False
            is_ok = False
            status = []

            # non-equivalent exon pairs
            ne = len(cmp_exons) - comparison.mNumIdenticalExons - \
                comparison.mNumSkippedExons
            if comparison.mNumIdenticalExons == 0:
                # F: complete and utter failure, no excuses
                status.append("F")
            else:
                if ne == 0:
                    # P: perfect conservation
                    status.append("=")
                    is_ok = True
                    is_perfect = True
                elif ne == min_nexons - comparison.mNumSkippedExons:
                    # D: completely different predictions
                    status.append("D")
                elif ne in (1, 2):
                    # A: almost conserved
                    status.append("A")
                    is_ok = True
                elif ne > 2:
                    # M: mostly conserved (for long proteins that is
                    # good enough).
                    if (100 * comparison.mNumIdenticalExons) / max_nexons > \
                            param_evaluate_min_percent_exon_identity:
                        status.append("M")
                    else:
                        # S: spuriously conserved
                        status.append("S")
                else:
                    # U: unconserved
                    status.append("U")

            if len(cmp_exons) > len(ref_exons):
                status.append(">")
            elif len(cmp_exons) < len(ref_exons):
                status.append("<")
            else:
                status.append("=")

            if min_nexons == max_nexons and min_nexons == 1:
                status.append("S")
            elif min_nexons == 1 and max_nexons == 2:
                status.append("s")
            elif min_nexons == 2 and max_nexons == 2:
                status.append("D")
            elif min_nexons == 2 and max_nexons > 2:
                status.append("d")
            elif min_nexons == max_nexons:
                status.append("M")
            elif min_nexons > 2 and max_nexons > 2:
                status.append("m")
            else:
                status.append("U")

            status = string.join(status, "")

            structure_compatibility = 100

            if is_ok:
                nok += 1
                structure_compatibility = 100 - 100 * \
                    (comparison.mNumIdenticalExons +
                     comparison.mNumSkippedExons) / len(cmp_exons)
            if is_perfect:
                nperfect += 1
                structure_compatibility = 0

            if abs(comparison.mNumDifferenceExons) > param_max_exons_difference:
                compatibility_value = 100
            else:
                compatibility_value = structure_compatibility

            # note: the boundary-based measure below unconditionally
            # overwrites the structure-based value computed above.
            t = comparison.mNumRefBoundaries + comparison.mNumCmpBoundaries

            if t == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) / t
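            # Worked example (hypothetical counts): one missed reference
            # boundary and one missed comparison boundary out of
            # t = 8 total boundaries gives 100 * 2 / 8 = 25; t == 0
            # (two single-exon genes) is defined as fully compatible (0).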

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print "%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" % (prefix, key1, key2, status, compatibility_value,
                                                                    len(cmp_exons), len(ref_exons), str(comparison))

            # comparison to reference: count separately:
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

            y += 1

        x += 1

    ntotal = wmali * (wmali - 1)

    print "%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (prefix,
                                                                                    ntotal, nperfect, nok,
                                                                                    float(
                                                                                        nperfect) / ntotal, float(nok) / ntotal,
                                                                                    ntotal_exons, nidentical_exons, nskipped_exons,
                                                                                    float(
                                                                                        nidentical_exons) / ntotal_exons,
                                                                                    float(nidentical_exons + nskipped_exons) / ntotal_exons)

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            raise ValueError(
                "no exons in reference: ref_ntotal_exons = 0, ref_ntotal = %i" %
                ref_ntotal)

        print "%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (prefix,
                                                                                        ref_ntotal, ref_nperfect, ref_nok,
                                                                                        float(
                                                                                            ref_nperfect) / ref_ntotal, float(ref_nok) / ref_ntotal,
                                                                                        ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
                                                                                        float(
                                                                                            ref_nidentical_exons) / ref_ntotal_exons,
                                                                                        float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons)

    print "%s\tnexons\t%i\t%i\t" % (prefix,
                                    len(anexons), ref_nexons) +\
        string.join(map(lambda x: "%.2f" % x, (min(anexons),
                                               max(anexons),
                                               scipy.mean(
                                                   anexons),
                                               scipy.median(
                                                   anexons),
                                               scipy.std(anexons))), "\t")

    return matrix_compatibility
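
The boundary mapping above (MaliIO.getMapFromMali plus MyMap) boils
down to walking alignment columns: find the column holding a residue
of one sequence, then count the other sequence's residues up to that
column. A stripped-down sketch of that core idea (hypothetical helper;
the real code adds codon scaling and slippage offsets):

def map_position(seq_row, ref_row, pos, gap_char="-"):
    """map 0-based residue `pos` of the ungapped `seq_row` sequence to
    the corresponding residue index in `ref_row`; -1 if it lands in a
    gap of the reference."""
    n = -1
    for col in range(len(seq_row)):
        if seq_row[col] != gap_char:
            n += 1
            if n == pos:
                if ref_row[col] == gap_char:
                    return -1
                return len(ref_row[:col].replace(gap_char, ""))
    return -1

print map_position("AC-DEF", "ACQD-F", 2)  # residue 'D' -> 3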
Esempio n. 25
0
                   "NEXONS_MEAN", "NEXONS_MEDIAN", "NEXONS_STDDEV")),
        "\t".join(("species", "NSPECIES", "SPECIES_MAX", "MAX_PER_SPECIES",
                   "UNKNOWN")),
        "\t".join(("failed", "NFAILED_SEQS", "NTOTAL_SEQS", "PFAILED_SEQS",
                   "NFAILED_PAIRS", "NTOTAL_PAIRS", "PFAILED_PAIRS")),
        "\t".join(("npairs", "NPAIRS_MIN", "NPAIRS_MAX", "NPAIRS_MEAN",
                   "NPAIRS_MEDIAN", "NPAIRS_STDDEV")),
        "\t".join(("ppairs", "PPAIRS_MIN", "PPAIRS_MAX", "PPAIRS_MEAN",
                   "PPAIRS_MEDIAN", "PPAIRS_STDDEV")),
        "\t".join(("cov", "COV_MIN", "COV_MAX", "COV_MEAN", "COV_MEDIAN",
                   "COV_STDDEV")),
        "\t".join(("pcov", "PCOV_MIN", "PCOV_MAX", "PCOV_MEAN", "PCOV_MEDIAN",
                   "PCOV_STDDEV")),
        "\t".join(
            ("genepair", "STATUS", "COMPATIBILITY", "CMP_NEXONS", "REF_NEXONS",
             Exons.ComparisonResult().GetHeader())),
        "\t".join(("bootstrap", "NORGS", "NOTUS", "PTEST", "PTOTAL", "FTOTAL",
                   evaluate_bootstrap.Results().printHeader())),
    ]

    if param_only_headers:
        print "PREFIX\t" + "\nPREFIX\t".join(headers)
        print E.GetFooter()
        sys.exit(0)
    else:
        print "# PREFIX\t" + "\n# PREFIX\t".join(headers)

    # 1. read multiple alignment in fasta format
    all_mali, all_identifiers = MaliIO.readFasta(sys.stdin)

    if len(all_identifiers) == 0:
Esempio n. 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/exons2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      help="method to apply.",
                      type="choice",
                      choices=("remove-stop", ))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="work in forward coordinates.")

    parser.set_defaults(method=None,
                        forward_coordinates=False,
                        genome_file=None)

    (options, args) = E.Start(parser)

    if options.method == "remove-stop" and not options.genome_file:
        raise "please supply genome file for method %s" % options.method

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
        exons = Exons.ReadExonBoundaries(sys.stdin, contig_sizes=contig_sizes)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    ninput, noutput, nremoved_stops, nremoved_exons = 0, 0, 0, 0
    for id, ee in exons.items():
        ninput += 1

        if options.loglevel >= 3:
            for e in ee:
                options.stdlog.write("# %s\n" % str(e))

        if options.method == "remove-stop":
            e = ee[-1]
            d = min(3, e.mPeptideTo - e.mPeptideFrom)
            if d < 3:
                codon2 = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                           e.mGenomeTo - d, e.mGenomeTo)
                prev_e = ee[-2]
                codon1 = fasta.getSequence(prev_e.mSbjctToken,
                                           prev_e.mSbjctStrand,
                                           prev_e.mGenomeTo - (3 - d),
                                           prev_e.mGenomeTo)
                codon = codon1 + codon2
            else:
                codon = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                          e.mGenomeTo - d, e.mGenomeTo)

            if codon.upper() in Genomics.StopCodons:

                if d < 3:
                    nremoved_exons += 1
                    d = 3 - d
                    del ee[-1]
                    e = ee[-1]

                e.mGenomeTo -= d
                e.mPeptideTo -= d
                nremoved_stops += 1

                if e.mGenomeTo == e.mGenomeFrom:
                    nremoved_exons += 1
                    del ee[-1]
                    e = ee[-1]

            assert (e.mGenomeTo > e.mGenomeFrom)
            assert (e.mPeptideTo > e.mPeptideFrom)

        if options.forward_coordinates:

            l = contig_sizes[ee[0].mSbjctToken]
            for e in ee:
                e.InvertGenomicCoordinates(l)

        for e in ee:
            options.stdout.write(str(e) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nremoved_stops=%i, nremoved_exons=%i\n" %
            (ninput, noutput, nremoved_stops, nremoved_exons))

    E.Stop()
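
The split-codon arithmetic above is compact: when the last exon holds
fewer than three bases of the final codon (d < 3), the missing 3 - d
bases come from the end of the previous exon. A small sketch with
plain (genome_from, genome_to) tuples makes the slicing explicit
(hypothetical helper, genome coordinates only):

def last_codon_slices(exons):
    """return the one or two genomic slices whose concatenation is the
    final codon of a transcript given as (from, to) exon tuples."""
    gfrom, gto = exons[-1]
    d = min(3, gto - gfrom)
    if d < 3:
        # split codon: 3 - d bases from the previous exon, d from this one
        prev_from, prev_to = exons[-2]
        return [(prev_to - (3 - d), prev_to), (gto - d, gto)]
    return [(gto - 3, gto)]

print last_codon_slices([(0, 100), (150, 152)])
# -> [(99, 100), (150, 152)]: one base from exon 1, two from exon 2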
Esempio n. 27
0
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")

    elif options.output_format == "exontable":
        if options.format == "exons":
            exons = Exons.ReadExonBoundaries(sys.stdin,
                                             contig_sizes=contig_sizes,
                                             delete_missing=True)
        else:
            raise "unknown format."

        for k in exons.keys():
            ee = exons[k]

            id = 0
            for e in ee:
                id += 1
                print "\t".join(
                    map(str, (e.mQueryToken, id, e.mPeptideFrom, e.mPeptideTo,
                              e.frame, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              e.mGenomeFrom, e.mGenomeTo)))
Esempio n. 28
0
    for o, a in optlist:
        if o in ("-v", "--verbose"):
            param_loglevel = int(a)
        elif o in ("--version", ):
            print "version="
            sys.exit(0)
        elif o in ("-h", "--help"):
            print USAGE
            sys.exit(0)
        elif o in ("-c", "--contigs"):
            param_filename_contigs = a

    print E.GetHeader()
    print E.GetParams()

    last_exon = Exons.Exon()

    contig_sizes = {}
    if param_filename_contigs:

        infile = open(param_filename_contigs, "r")
        for line in infile:
            if line[0] == "#": continue

            sbjct_token, size = line[:-1].split("\t")[:2]
            contig_sizes[sbjct_token] = int(size)

    map_prediction2genome = alignlib_lite.makeAlignmentSet()
    nexons, npairs = 0, 0

    for line in sys.stdin:
Esempio n. 29
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2cleaned_mali.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-m",
                      "--genome-master",
                      dest="genome_master",
                      type="string",
                      help="genome to use as master.")

    parser.add_option("-s",
                      "--filename-removed",
                      dest="filename_removed",
                      type="string",
                      help="output filename for deleted entries.")

    parser.add_option("-e",
                      "--filename-exons",
                      dest="filename_exons",
                      type="string",
                      help="filename on where to exon information.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="output filename of component summary.")

    parser.add_option("-c",
                      "--filename-components",
                      dest="filename_components",
                      type="string",
                      help="output filename for components.")

    parser.add_option(
        "--min-percent-overlap",
        dest="min_percent_overlap",
        type="float",
        help="minimum percent overlap for splitting multiple alignment "
        "into components.")

    parser.add_option("--max-percent-overlap",
                      dest="max_percent_overlap",
                      type="float",
                      help="maximum percent overlap for split genes.")

    parser.add_option(
        "--min-genomic-distance",
        dest="min_genomic_distance",
        type="int",
        help="minimum genomic distance for adjacent genes to be "
        "considered dodgy.")

    parser.add_option("-o",
                      "--mode",
                      dest="mode",
                      type="choice",
                      choices=("joining", "split"),
                      help="""how to filter the alignment.
                      joining: remove joining transcripts (spindly genes)
                      split:  remove split transcripts""")

    parser.add_option(
        "-g",
        "--gene-mode",
        dest="gene_mode",
        action="store_true",
        help="""the aligned sequences are genes. This forces the exon
        boundaries to be collated by gene.""")

    parser.set_defaults( \
        genome_master = None,
        filename_removed = None,
        filename_components = None,
        filename_summary = None,
        filename_exons = None,
        mode="joining",
        input_format = "fasta",
        output_format = "fasta",
        max_percent_overlap = 0,
        min_percent_overlap = 0,
        gene_mode = False,
        separator = "|")

    (options, args) = E.Start(parser)

    ###############################################################
    ###############################################################
    ###############################################################
    ## input
    ###############################################################

    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)
    all_identifiers = mali.getIdentifiers()

    if options.filename_exons:
        ## read exon boundaries and keep forward coordinates

        if options.gene_mode:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             from_zero=True)

            gene_exons = {}
            for id, ee in exons.items():
                data = id.split(options.separator)
                new_id = options.separator.join((data[0], data[2]))
                if new_id not in gene_exons: gene_exons[new_id] = []
                for e in ee:
                    e.mQueryToken = new_id
                gene_exons[new_id] += ee
            for id, ee in gene_exons.items():
                ee.sort(lambda x, y: cmp(x.mGenomeFrom, y.mGenomeFrom))
            exons = gene_exons

        else:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             filter=set(all_identifiers),
                                             from_zero=True)

    else:
        exons = {}

    ###############################################################
    ###############################################################
    ###############################################################
    ## collect all transcripts for a species together with their
    ## aligned length
    ###############################################################
    map_species2transcripts = {}

    for id in mali.getIdentifiers():
        data = id.split(options.separator)

        species = data[0]

        if exons:
            l = exons[id][-1].mGenomeTo - exons[id][0].mGenomeFrom
        else:
            l = len(mali.getEntry(id).getSequence())

        try:
            map_species2transcripts[species].append((l, id))
        except KeyError:
            map_species2transcripts[species] = [(l, id)]

    if options.mode == "joining":
        mapped_transcripts = removeJoiningTranscripts(mali, exons,
                                                      map_species2transcripts,
                                                      options)

    elif options.mode == "split":
        mapped_transcripts = removeSplitTranscripts(mali, exons,
                                                    map_species2transcripts,
                                                    options)

    ###############################################################
    ###############################################################
    ###############################################################
    ## now build overlap graph of remaining sequences split multiple
    ## alignment in components.
    ## Compute reciprocal best match graph
    ###############################################################
    graph = networkx.Graph()

    removed_transcripts = set(map(lambda x: x[0], mapped_transcripts))

    for t in all_identifiers:
        if t not in removed_transcripts:
            graph.add_node(t)

    for t1 in range(len(all_identifiers) - 1):
        transcript1 = all_identifiers[t1]
        if transcript1 in removed_transcripts: continue

        for t2 in range(t1 + 1, len(all_identifiers)):
            transcript2 = all_identifiers[t2]
            if transcript2 in removed_transcripts: continue

            overlap = getPercentOverlap(mali[transcript1], mali[transcript2])
            if overlap > 5:
                graph.add_edge(transcript1, transcript2)

    ## compute components
    components = networkx.connected_components(graph)

    ###############################################################
    ###############################################################
    ###############################################################
    ## output
    ###############################################################
    if options.filename_components:
        n = 1
        outfile = open(options.filename_components, "w")

        outfile.write("id\tcomponent\n")
        for component in components:
            for c in component:
                outfile.write("%s\t%i\n" % (c, n))
            n += 1
        outfile.close()

    if options.filename_removed and len(removed_transcripts) > 0:
        outfile = open(options.filename_removed, "w")
        outfile.write("removed\trepresentative\treason\n")
        for removed_transcript, rep_transcript, reason in mapped_transcripts:
            outfile.write("%s\t%s\t%s\n" %
                          (removed_transcript, rep_transcript, reason))
        outfile.close()

    if options.filename_summary:
        n = 1
        outfile = open(options.filename_summary, "w")
        outfile.write("component\tsize\tnspecies\tnmaster\n")
        for component in components:
            species = map(lambda x: x.split(options.separator)[0], component)
            # four columns to match the header: component id, size,
            # number of distinct species, number of master-genome members.
            outfile.write(
                "%i\t%i\t%i\t%i\n" %
                (n, len(component), len(set(species)),
                 len(filter(lambda x: x == options.genome_master, species))))
            n += 1
        outfile.close()

    for transcript in removed_transcripts:
        mali.deleteEntry(transcript)

    new_identifiers = mali.getIdentifiers()

    mali.removeGaps(minimum_gaps=len(new_identifiers))

    mali.writeToFile(options.stdout, format=options.output_format)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# input=%i, output=%i, removed=%i, ncomponents=%i\n" %
            (len(all_identifiers), len(new_identifiers),
             len(removed_transcripts), len(components)))
        options.stdlog.write("# final component sizes: %s\n" %
                             ",".join(map(lambda x: str(len(x)), components)))

    E.Stop()
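
The component-splitting step above is plain graph clustering: one node
per kept transcript, one edge per sufficiently overlapping pair, and
connected components as clusters. A minimal self-contained sketch
(hypothetical names; note that networkx.connected_components returned
a list of lists in old releases and returns a generator of sets in
current ones):

import networkx

def cluster_by_overlap(identifiers, percent_overlap, min_overlap=5):
    """cluster identifiers by pairwise overlap above `min_overlap`."""
    graph = networkx.Graph()
    graph.add_nodes_from(identifiers)
    for i in range(len(identifiers) - 1):
        for j in range(i + 1, len(identifiers)):
            if percent_overlap(identifiers[i], identifiers[j]) > min_overlap:
                graph.add_edge(identifiers[i], identifiers[j])
    return networkx.connected_components(graph)

# usage sketch against the example above:
# components = cluster_by_overlap(
#     all_identifiers,
#     lambda a, b: getPercentOverlap(mali[a], mali[b]))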