Exemple #1
0
        if line[0] == "#": continue

        this_exon = Exons.Exon()
        this_exon.Read(line)

        if this_exon.mSbjctStrand == "-":
            this_exon.InvertGenomicCoordinates(
                contig_sizes[this_exon.mSbjctToken])

        nexons += 1

        if last_exon.mQueryToken != this_exon.mQueryToken:

            if last_exon.mQueryToken:
                f = alignlib_lite.AlignmentFormatEmissions(
                    map_prediction2genome)
                print string.join(
                    map(str, (last_exon.mQueryToken, last_exon.mSbjctToken,
                              last_exon.mSbjctStrand, f)), "\t")

                npairs += 1
            map_prediction2genome.clear()

        alignlib_lite.addDiagonal2Alignment(
            map_prediction2genome, this_exon.mPeptideFrom + 1,
            this_exon.mPeptideTo + 1,
            this_exon.mGenomeFrom - this_exon.mPeptideFrom)

        last_exon = this_exon

    f = alignlib_lite.AlignmentFormatEmissions(map_prediction2genome)
def IsParalogLink(link, cds1, cds2):
    """sort out ortholog relationships between
    transcripts of orthologous genes.

    """

    map_a2b = alignlib_lite.makeAlignmentVector()
    alignlib_lite.AlignmentFormatEmissions(link.mQueryFrom, link.mQueryAli,
                                           link.mSbjctFrom,
                                           link.mSbjctAli).copy(map_a2b)

    if link.mQueryLength < (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) or \
       link.mSbjctLength < (map_a2b.getColTo() - map_a2b.getColFrom() + 1):
        print "ERRONEOUS LINK: %s" % str(link)
        raise "length discrepancy"

    coverage_a = 100.0 * \
        (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / link.mQueryLength
    coverage_b = 100.0 * \
        (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / link.mSbjctLength

    # check exon boundaries, look at starts, skip first exon
    def MyMap(a, x):
        if x < a.getRowFrom():
            return 0
        while x <= a.getRowTo():
            c = a.mapRowToCol(x)
            if c:
                return c
            x += 1
        else:
            return 0

    mapped_boundaries = UniquifyList(
        map(lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), cds1[1:]))
    reference_boundaries = UniquifyList(
        map(lambda x: x.mPeptideFrom / 3 + 1, cds2[1:]))

    nmissed = 0
    nfound = 0
    nmin = min(len(mapped_boundaries), len(reference_boundaries))
    nmax = max(len(mapped_boundaries), len(reference_boundaries))
    both_single_exon = len(cds1) == 1 and len(cds2) == 1
    one_single_exon = len(cds1) == 1 or len(cds2) == 1
    if len(mapped_boundaries) < len(reference_boundaries):
        mless = mapped_boundaries
        mmore = reference_boundaries
    else:
        mmore = mapped_boundaries
        mless = reference_boundaries

    # check if exon boundaries are ok
    for x in mless:
        is_ok = 0
        for c in mmore:
            if abs(x - c) < param_boundaries_max_slippage:
                is_ok = 1
                break
        if is_ok:
            nfound += 1
        else:
            nmissed += 1

    # set is_ok for dependent on exon boundaries
    # in single exon cases, require a check of coverage
    is_ok = False
    check_coverage = False
    if both_single_exon or one_single_exon:
        is_ok = True
        check_coverage = True
    else:
        if nmin == 1:
            is_ok = nmissed == 0
        elif nmin == 2:
            is_ok = nmissed <= 1
        elif nmin > 2:
            is_ok = nfound >= 2

    cc = min(coverage_a, coverage_b)

    if param_loglevel >= 3:
        print "# nquery=", len(cds1), "nsbjct=", len(cds2), "nmin=", nmin, "nmissed=", nmissed, "nfound=", nfound, \
              "is_ok=", is_ok, "check_cov=", check_coverage, \
              "min_cov=", cc, coverage_a, coverage_b, \
              "mapped=", mapped_boundaries, "reference=", reference_boundaries

    if not is_ok:
        return True, "different exon boundaries"

    if check_coverage and cc < param_min_coverage:
        return True, "low coverage"

    return False, None
Exemple #3
0
 def Expand(self):
     """Fill ``self.mMapOld2New`` from the stored old/new alignment strings.

     A fresh alignment vector is created and populated from
     ``mOldFrom``/``mOldAli`` and ``mNewFrom``/``mNewAli``.
     """
     self.mMapOld2New = alignlib_lite.makeAlignmentVector()
     emissions = alignlib_lite.AlignmentFormatEmissions(
         self.mOldFrom, self.mOldAli, self.mNewFrom, self.mNewAli)
     emissions.copy(self.mMapOld2New)
Exemple #4
0
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2,
                           peptides2, cds2):
    """sort out ortholog relationships between
    transcripts of orthologous genes.

    Orthologs have:
        the same number of exons        
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal bet hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first, would result in a suboptimal match, because one transcript
    is longer than the other, while matching up 0-0 and 2-1 would be better.

    Objective function: it is the maximal matching/assignment problem. Use greedy
    implementation instead. Assign as much as possible according to descending weights.
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = map(lambda x: alignlib_lite.makeSequence(peptides1[x[0]]),
                transcripts1)
    seqs2 = map(lambda x: alignlib_lite.makeSequence(peptides2[x[0]]),
                transcripts2)

    if param_loglevel >= 4:
        print "# building sequence 1"
    for i in range(len(seqs1)):
        if not cds1.has_key(transcripts1[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# building sequence 2"

    for i in range(len(seqs2)):
        if not cds2.has_key(transcripts2[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# all-vs-all alignment"

    # do all versus all alignment
    alis1 = []
    alis2 = []
    for i in range(len(seqs1)):
        alis1.append([])
    for i in range(len(seqs2)):
        alis2.append([])

    if param_loglevel >= 3:

        print "#################################"

        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print "#", str(cd)
        print "# versus"
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print "#", str(cd)
        sys.stdout.flush()

    weights = {}
    for i in range(len(seqs1)):
        prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[
            i]

        for j in range(len(seqs2)):
            prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[
                j]
            map_a2b = alignlib_lite.makeAlignmentVector()

            m = seqs1[i].getLength() * seqs2[j].getLength()

            if param_loglevel >= 3:
                print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\
                      (i, j, prediction_id1, seqs1[
                       i].getLength(), prediction_id2, seqs2[j].getLength())
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (
                        seqs1[i].getLength(), seqs2[j].getLength())
                    sys.stdout.flush()

                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \
                seqs1[i].getLength()
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \
                seqs2[j].getLength()

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3,
                (map_a2b.getRowTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3,
                (map_a2b.getColTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon
            def MyMap(a, x):
                while x <= a.getRowTo():
                    c = a.mapRowToCol(x)
                    if c:
                        return c
                    x += 1
                else:
                    return 0

            mapped_boundaries = map(
                lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:])
            mapped_boundaries.sort()
            reference_boundaries = map(lambda x: x.mPeptideFrom / 3 + 1,
                                       c2[1:])
            reference_boundaries.sort()

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if (len(c1) == 1 and len(c2) == 1):
                is_ok = 1
            else:
                # allow for missed boundaries, if param_boundaries_allow_missed
                # > 0
                if min_nmissed == 0:
                    is_ok = 1
                else:
                    if param_boundaries_allow_missed and \
                            len(mapped_boundaries) >= param_boundaries_allow_missed and \
                            min_nmissed <= param_boundaries_max_missed:
                        is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            if abs(len(c1) - len(c2)) != 0:
                if param_missing_max_missing:
                    if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or
                        (min(len(c1), len(c2)) < param_missing_min_present)):
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \
                      "boundaries_ok=", is_ok, \
                      "nexons_ok=", is_ok_exons, \
                      "missed_c2r=", nmissed_cmp2ref, \
                      "missed_r2c=", nmissed_ref2cmp, \
                      "min_cov=", cc, \
                      "mapped=", mapped_boundaries, \
                      "reference=", reference_boundaries

                print "#", string.join(
                    map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                              map_a2b.getNumGaps(), coverage_a, coverage_b)),
                    "\t")
                sys.stdout.flush()

            # dump out pairs
            for method in param_write_pairs:
                if method == "all":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             sbjct_token1, sbjct_strand1, sbjct_from1,
                             sbjct_to1, seqs1[i].getLength(), sbjct_token2,
                             sbjct_strand2, sbjct_from2, sbjct_to2,
                             seqs2[j].getLength(), map_a2b.getRowFrom(),
                             map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(),
                             map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(),
                             coverage_a, coverage_b, nmissed_cmp2ref,
                             mapped_boundaries, nmissed_ref2cmp,
                             reference_boundaries, i, j, len(c1), len(c2), cc,
                             is_ok, is_ok_exons, is_ok_coverage)), "\t")
                elif method == "alignment":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                             map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                             map_a2b.getNumGaps(), coverage_a, coverage_b)),
                        "\t")
                elif method == "location":
                    print string.join(
                        map(str, ("pair", method, prediction_id1,
                                  prediction_id2, sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1, seqs1[i].getLength(),
                                  sbjct_token2, sbjct_strand2, sbjct_from2,
                                  sbjct_to2, seqs2[j].getLength())), "\t")
            if not is_ok_exons:
                if param_loglevel >= 4:
                    print "# rejected %i and %i: too many exons difference." % (
                        i, j)
                continue

            if param_check_exon_boundaries:
                if not is_ok:
                    continue

            if cc < param_min_coverage:
                continue

            if not weights.has_key(cc):
                weights[cc] = []

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))

            weights[cc].append((i, j, map_a2b))

    # sort out alignments
    ww = weights.keys()
    ww.sort()
    ww.reverse()

    pairs = []
    assigned1 = {}
    assigned2 = {}

    if param_loglevel >= 3:
        print "# alis1=", alis1
        print "# alis2=", alis2
        print "# --------------------------------------"

    for w in ww:
        for i, j, map_a2b in weights[w]:
            if not assigned1.has_key(i) and not assigned2.has_key(j):
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1[i] = 1
                assigned2[j] = 1
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
Exemple #5
0
                    else:
                        alignlib_lite.copyAlignment(tmp_map_row2col,
                                                    map_row2col,
                                                    e1.mPeptideFrom / 3 + 1,
                                                    e1.mPeptideTo / 3 + 1,
                                                    e2.mPeptideFrom / 3 + 1,
                                                    e2.mPeptideTo / 3 + 1)

                    # in case of split codons, there is an alignment of length
                    # 1. Skip that.
                    if tmp_map_row2col.getLength() > 1:

                        print string.join(
                            map(str, (link.mQueryToken, e1.mRank,
                                      link.mSbjctToken, e2.mRank, link.mEvalue,
                                      alignlib_lite.AlignmentFormatEmissions(
                                          tmp_map_row2col))), "\t")

                        npairs += 1
        else:
            if param_loglevel >= 2:
                print "# SKIPPED: %s" % str(link)
            nskipped += 1

        if (ninput % param_report_step) == 0:
            if param_loglevel >= 1:
                print "# ninput=%i, noutput=%i, nskipped=%i" % (ninput, npairs,
                                                                nskipped)
            sys.stdout.flush()

    if param_loglevel >= 1:
        print "# ninput=%i, noutput=%i, nskipped=%i" % (ninput, npairs,
Exemple #6
0
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2,
               transcript2, peptide_map_a2b):

    if param_loglevel >= 3:
        for cd in cds1:
            print "#", str(cd)
        for cd in cds2:
            print "#", str(cd)
        print "# peptide_map_a2b", str(
            alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b))
        sys.stdout.flush()

    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2)

    if len(cds1) != len(cds2):
        if param_loglevel >= 4:
            print ""  # WARNING: different number of exons!"

    seq1 = alignlib_lite.makeSequence(transcript1)
    seq2 = alignlib_lite.makeSequence(transcript2)
    tmp_map_a2b = alignlib_lite.makeAlignmentVector()

    dialign = WrapperDialign.Dialign("-n")
    dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8")
    dba = WrapperDBA.DBA()
    #clustal = WrapperClustal.Clustal()

    matrix, gop, gep = global_substitution_matrix
    alignator_nw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix)
    alignator_sw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix)

    # concatenated alignments for exons:
    # 1: only the common parts
    ali_common1 = ""
    ali_common2 = ""

    e1, e2 = 0, 0
    while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom():
        e1 += 1
    while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom():
        e2 += 1

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = 0
        for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1):
            if dna_map_a2b.mapRowToCol(x) >= 0:
                nmapped += 1
        print "# nmapped=", nmapped
        print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b))

    # declare alignments used
    map_intron_a2b = alignlib_lite.makeAlignmentVector()

    result = Exons.CompareGeneStructures(cds1,
                                         cds2,
                                         map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print result.Pretty("#")

    nskipped_exons, nskipped_introns = 0, 0

    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]
        exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
        exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

        #######################################################################
        # write unaligned exons
        if param_write_exons:
            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo,
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo,

            print str(pair)
            sys.stdout.flush()

        #######################################################################
        # build alignment for overlap of both exons
# tmp_map_a2b.clear()
# alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b,
# c1.mGenomeFrom + 1, c1.mGenomeTo )

# if param_loglevel >= 5:
# print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo)
# for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"):
# print "#", x
# if tmp_map_a2b.getLength() == 0:
# if param_loglevel >= 1:
# print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \
##                       (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2)
# print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\
##                       peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(peptide_map_a2b)
# print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\
##                       dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \
# Alignlib.writeAlignmentCompressed(dna_map_a2b)
# for cd in cds1: print "##", str(cd)
# for cd in cds2: print "##", str(cd)
##             nerrors += 1
# continue
##         data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b  ).split("\n"))
# if "caligned" in param_write_exons :
# print "exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1,
##                                                                                token2, e2,
##                                                                                data[0][0], data[0][2],
##                                                                                data[1][0], data[1][2],
# data[0][1], data[1][1] )
##         ali_common1 += data[0][1]
##         ali_common2 += data[1][1]
#######################################################################
# write alignment of introns for orthologous introns
# orthologous introns are between orthologous exons
        if param_write_introns:

            if last_e1 is not None:
                if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                    nskipped_introns += 1
                else:
                    pair = AlignedPairs.UnalignedPair()

                    intron_from1 = cds1[e1 - 1].mGenomeTo
                    intron_to1 = cds1[e1].mGenomeFrom
                    intron_from2 = cds2[e2 - 1].mGenomeTo
                    intron_to2 = cds2[e2].mGenomeFrom

                    intron_fragment1 = transcript1[intron_from1:intron_to1]
                    intron_fragment2 = transcript2[intron_from2:intron_to2]

                    if len(intron_fragment1) == 0 or len(
                            intron_fragment2) == 0:
                        print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\
                              (intron_from1, intron_to1, len(transcript1),
                               intron_from2, intron_to2, len(transcript2))
                        continue

                    pair.mCategory = "intron"
                    pair.mToken1 = token1
                    pair.mId1 = e1 + 1
                    pair.mNum1 = len(cds1) - 1
                    pair.mLen1 = len(intron_fragment1)
                    pair.mFrom1 = intron_from1
                    pair.mTo1 = intron_to1
                    pair.mSequence1 = intron_fragment1
                    pair.mToken2 = token2
                    pair.mId2 = e2 + 1
                    pair.mNum1 = len(cds2) - 1
                    pair.mLen2 = len(intron_fragment2)
                    pair.mFrom2 = intron_from2
                    pair.mTo2 = intron_to2
                    pair.mSequence2 = intron_fragment2

                    if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \
                            (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \
                            (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \
                            (param_max_intron_length and len(intron_fragment2) > param_max_intron_length):
                        if param_loglevel >= 1:
                            print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\
                                  (token1, e1, token2, e2,
                                   len(intron_fragment1),
                                   len(intron_fragment2))
                            sys.stdout.flush()
                            nskipped += 1

                    print str(pair)

# else:
##                         anchored_from1 = intron_from1 - param_extend_introns
##                         anchored_to1 = intron_to1 + param_extend_introns
##                         anchored_from2 = intron_from2 - param_extend_introns
##                         anchored_to2 = intron_to2 + param_extend_introns

##                         anchored_fragment1 = transcript1[anchored_from1:anchored_to1]
##                         anchored_fragment2 = transcript2[anchored_from2:anchored_to2]

# for method in param_write_introns:

# if param_loglevel >= 2:
# print "## aligning with method %s" % method
# sys.stdout.flush

# map_intron_a2b.clear()

# if method == "unaligned":

##                                 from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2

# elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"):

##                                 tmp_intron_a2b = alignlib_lite.makeAlignmentVector()

# if param_loglevel >= 1:
# print "# aligning with method %s two fragments of length %i and %i" % (method,
# len(anchored_fragment1),
# len(anchored_fragment2))
# sys.stdout.flush()

# if method == "dialigned":
##                                     result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dialignedlgs":
##                                     result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "dbaligned":
##                                     result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# elif method == "clusaligned":
##                                     result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b )
# if not result or result.getLength() == 0:
# if param_loglevel >= 1:
# print "# Error: empty intron alignment"
# sys.stdout.flush()
##                                     nerrors += 1
# continue
##                                 tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 )
# alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b,
##                                                        intron_from1 + 1, intron_to1,
# intron_from2 + 1, intron_to2 )
# elif method == "nwaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignator_nw.Align( seq1, seq2, map_intron_a2b )
# seq1.useFullLength()
# seq2.useFullLength()
# elif method == "swaligned":
##                                 seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom )
##                                 seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom )
##                                 alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw )
# seq1.useFullLength()
# seq2.useFullLength()
# else:
##                                 raise "unknown method %s" % method
# if map_intron_a2b.getLength() > 0:
# if param_compress:
##                                     from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo()
##                                     from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo()
##                                     ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b )
# else:
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b  ).split("\n"))
# if len(data) < 2:
##                                         data=[ ( 0, "", 0), (0, "", 0)]
##                                     from1, ali1, to1 = data[0]
##                                     from2, ali2, to2 = data[1]
# print string.join(map(str, ("intron",
# method,
##                                                         token1, e1, len(cds1) - 1, len(intron_fragment1),
##                                                         token2, e2, len(cds2) - 1, len(intron_fragment2),
# map_intron_a2b.getNumGaps(),
# map_intron_a2b.getLength(),
##                                                         map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(),
##                                                         from1, to1, ali1,
##                                                         from2, to2, ali2,
##                                                         intron_from1, intron_to1,
# intron_from2, intron_to2)), "\t")
# sys.stdout.flush()
        last_e1, last_e2 = e1, e2

    ##########################################################################
    # write concatenated exons
# for method in param_write_exons:
# if method == "common":
# print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            0, 0,
##                                                                            0, 0,
# ali_common1, ali_common2 )
# elif method == "exons":
# Write full alignment without gaps.
# This will not care about exon boundaries and gaps.
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))

# try:
##                 from1, s1, to1, from2, s2, to2 = data[0] + data[1]
# except ValueError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1
# except IndexError:
##                 from1, to1, from2, to2 = 0, 0, 0, 0
##                 s1, s2 = "", ""
##                 nerrors += 1

# if from1:
# if len(s1) != len(s2):
# print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2))
##                     nerrors += 1
##                     from1, to1, from2, to2 = 0, 0, 0, 0
##                     s1, s2 = "", ""
# else:
##                     a1, a2 = [], []
# for x in range( min(len(s1), len(s2)) ):
# if s1[x] != "-" and s2[x] != "-":
##                             a1.append( s1[x] )
##                             a2.append( s2[x] )
##                     s1 = string.join(a1, "")
##                     s2 = string.join(a2, "")

# print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0,
##                                                                              token2, 0,
##                                                                              from1, to1,
##                                                                              from2, to2,
# s1, s2 ) )
# elif method == "full":
# write full alignment (do not care about exon boundaries)
# data = map(lambda x: x.split("\t"),
# alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b  ).split("\n"))
##             if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)]
# print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0,
##                                                                            token2, 0,
##                                                                            data[0][0], data[0][2],
##                                                                            data[1][0], data[1][2],
# data[0][1], data[1][1] )

    if param_loglevel >= 3:
        print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons,
                                                          nskipped_introns)

    return nerrors, nskipped
Exemple #7
0
def WriteGeneStructureCorrespondence(mali, identifiers, exons,
                                     param_master_pattern, gap_char="-",
                                     prefix=""):
    """Compare the gene structures of all ordered pairs of aligned sequences.

    For every ordered pair (key1, key2) of distinct identifiers the exon
    boundaries of key1 are mapped onto key2 through the multiple alignment
    and both exon lists are handed to Exons.CompareGeneStructures.
    Tab-separated summary lines are printed to stdout, each starting with
    *prefix*: "allstructure", "refstructure" (only if at least one pair had
    a reference as second member), "nexons" and, at log level >= 2, one
    "genepair" line per pair.

    Arguments:
        mali: mapping from identifier to its aligned sequence.
        identifiers: identifiers to compare; their order fixes the
            row/column order of the returned matrix.
        exons: mapping from identifier to its list of exon objects.
        param_master_pattern: regular expression; identifiers it matches
            are counted as reference sequences.
        gap_char: gap character handed to MaliIO.getMapFromMali.
        prefix: first column of every output line.

    Returns:
        None if *identifiers* is empty. Otherwise a square numpy matrix of
        gene structure compatibility, where entry [x][y] is
        100 * (missed exon boundaries) / (total exon boundaries) for the
        pair (identifiers[x], identifiers[y]):

        0   : perfect compatibility (no boundary missed, or no boundaries)
        100 : no compatibility (every boundary missed)

        The diagonal is left at 0.

    Raises:
        ValueError: if pairs against a reference were counted but no exons
            were accumulated for them.

    The status column of "genepair" lines has three characters:
        1. conservation: F (no identical exon), = (perfect), D (completely
           different), A (almost, 1-2 non-equivalent exons), M (mostly),
           S (spurious), U (unconserved).
        2. exon count of key1 relative to key2: >, < or =.
        3. exon-count class: S, s, D, d, M, m or U.

    Reads the module-level settings param_loglevel,
    param_threshold_splipping_exon_boundary, param_threshold_terminal_exon
    and param_evaluate_min_percent_exon_identity.
    """

    # Guard first: the original indexed identifiers[0] before this check,
    # so an empty list raised IndexError instead of returning.
    if len(identifiers) == 0:
        return

    def ratio(numerator, denominator):
        # A single sequence yields no pairs at all; report 0 instead of
        # dying with ZeroDivisionError in the summary line.
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    wmali = len(identifiers)
    matrix_compatibility = numpy.zeros((wmali, wmali))

    # counters over all pairs
    nok = 0
    nperfect = 0
    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    # counters over pairs whose second member matches the master pattern
    ref_nok = 0
    ref_nperfect = 0
    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    rx = re.compile(param_master_pattern)

    # list of number of exons
    anexons = []

    # exons in reference (of the last identifier matching the pattern)
    ref_nexons = 0

    for x, key1 in enumerate(identifiers):

        seq = mali[key1]

        anexons.append(len(exons[key1]))
        if rx.search(key1):
            ref_nexons = len(exons[key1])

        for y, key2 in enumerate(identifiers):

            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print("#############################################")
                print("# comparing %s to %s" % (key1, key2))

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            if param_loglevel >= 5:
                print(str(alignlib_lite.AlignmentFormatEmissions(map_cmp2ref)))

            # map exon boundaries to reference sequence
            cmp_exons = []
            for e in exons[key1]:
                mapped = e.GetCopy()
                mapped.mPeptideFrom = MyMap(map_cmp2ref, e.mPeptideFrom + 1, 3, -1)
                mapped.mPeptideTo = MyMap(map_cmp2ref, e.mPeptideTo, 3, 0)
                cmp_exons.append(mapped)

            # massage boundaries for terminal exons: an unmapped outer
            # boundary (<= 0) is replaced by the reference's outer boundary
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for e in exons[key1]:
                    print("# exon %s" % str(e))

            if param_loglevel >= 3:
                for e in cmp_exons:
                    print("# exon %s" % str(e))
                for e in ref_exons:
                    print("# exon %s" % str(e))

            # do exon comparison
            comparison = Exons.CompareGeneStructures(
                cmp_exons,
                ref_exons,
                threshold_min_pide=0,
                threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary,
                threshold_terminal_exon=param_threshold_terminal_exon)

            if param_loglevel >= 3:
                print(comparison.Pretty(prefix="# EVAL: "))

            # analyse results
            min_nexons = min(len(cmp_exons), len(ref_exons))
            max_nexons = max(len(cmp_exons), len(ref_exons))

            # non-equivalent exon pairs
            ne = len(cmp_exons) - comparison.mNumIdenticalExons - \
                comparison.mNumSkippedExons

            is_perfect = False
            is_ok = False
            status = []

            # first status character: degree of conservation
            if comparison.mNumIdenticalExons == 0:
                # F: complete and utter failure, no excuses
                status.append("F")
            elif ne == 0:
                # =: perfect conservation
                status.append("=")
                is_ok = True
                is_perfect = True
            elif ne == min_nexons - comparison.mNumSkippedExons:
                # D: completely different predictions
                status.append("D")
            elif ne in (1, 2):
                # A: almost conserved
                status.append("A")
                is_ok = True
            elif ne > 2:
                if (100 * comparison.mNumIdenticalExons) / max_nexons > \
                        param_evaluate_min_percent_exon_identity:
                    # M: mostly conserved (in case of long proteins that
                    # is good enough).
                    status.append("M")
                else:
                    # S: spuriously conserved
                    status.append("S")
            else:
                # U: unconserved
                status.append("U")

            # second status character: exon count of key1 versus key2.
            # The original repeated the ">" test with swapped operands in
            # the elif, so "<" could never be emitted.
            if len(cmp_exons) > len(ref_exons):
                status.append(">")
            elif len(cmp_exons) < len(ref_exons):
                status.append("<")
            else:
                status.append("=")

            # third status character: class by number of exons
            if min_nexons == max_nexons and min_nexons == 1:
                status.append("S")
            elif min_nexons == 1 and max_nexons == 2:
                status.append("s")
            elif min_nexons == 2 and max_nexons == 2:
                status.append("D")
            elif min_nexons == 2 and max_nexons > 2:
                status.append("d")
            elif min_nexons == max_nexons:
                status.append("M")
            elif min_nexons > 2 and max_nexons > 2:
                status.append("m")
            else:
                status.append("U")

            status = "".join(status)

            if is_ok:
                nok += 1
            if is_perfect:
                nperfect += 1

            # Compatibility is the percentage of missed exon boundaries.
            # (An older exon-count based value used to be computed here as
            # well, but was always overwritten by this one.)
            t = comparison.mNumRefBoundaries + comparison.mNumCmpBoundaries
            if t == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) / t

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print("%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" % (
                    prefix, key1, key2, status, compatibility_value,
                    len(cmp_exons), len(ref_exons), str(comparison)))

            # comparison to reference: count separately:
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

    # number of ordered pairs
    ntotal = wmali * (wmali - 1)

    print("%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
        prefix,
        ntotal, nperfect, nok,
        ratio(nperfect, ntotal), ratio(nok, ntotal),
        ntotal_exons, nidentical_exons, nskipped_exons,
        ratio(nidentical_exons, ntotal_exons),
        ratio(nidentical_exons + nskipped_exons, ntotal_exons)))

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            # string exceptions are not raisable any more; use a real one
            raise ValueError(
                "no exons in reference : ref_ntotal_exons = 0, ref_ntotal = %i" % (ref_ntotal))

        print("%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
            prefix,
            ref_ntotal, ref_nperfect, ref_nok,
            float(ref_nperfect) / ref_ntotal, float(ref_nok) / ref_ntotal,
            ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
            float(ref_nidentical_exons) / ref_ntotal_exons,
            float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons))

    # distribution of exon counts: min, max, mean, median, std
    exon_statistics = (min(anexons),
                       max(anexons),
                       numpy.mean(anexons),
                       numpy.median(anexons),
                       numpy.std(anexons))
    print("%s\tnexons\t%i\t%i\t" % (prefix, len(anexons), ref_nexons) +
          "\t".join(map(lambda v: "%.2f" % v, exon_statistics)))

    return matrix_compatibility
Exemple #8
0
def PrintCluster(cluster,
                 cluster_id,
                 lengths,
                 peptide_sequences=None,
                 regex_preferred=None):
    """Write one tab-separated line per cluster member to stdout.

    The representative of the cluster is the member with the largest
    entry in *lengths* (members without an entry count as length 0). If
    *regex_preferred* is given and at least one matching member has a
    length above 0, the longest matching member is used instead.

    Each line holds: representative, member, member length and - only
    if *peptide_sequences* is given - the alignment of representative
    to member as formatted by alignlib_lite.AlignmentFormatEmissions.
    The representative is aligned to itself with a plain diagonal; any
    other member is aligned with a local dynamic programming alignment,
    provided both sequences are available, otherwise the alignment
    stays empty.

    Returns *cluster_id* unchanged.
    """

    preferred = re.compile(regex_preferred) if regex_preferred else None

    # pick the longest member overall and the longest preferred member
    longest, longest_length = None, 0
    longest_preferred, longest_preferred_length = None, 0
    for member in cluster:
        member_length = lengths[member] if member in lengths else 0

        if member_length > longest_length:
            longest, longest_length = member, member_length

        if preferred and preferred.search(member) and \
                member_length > longest_preferred_length:
            longest_preferred = member
            longest_preferred_length = member_length

    # a preferred member wins as soon as one with non-zero length exists
    if longest_preferred_length > 0:
        rep = longest_preferred
    else:
        rep = longest

    for member in cluster:
        columns = [rep, member, lengths[member] if member in lengths else 0]

        if peptide_sequences:
            map_rep2mem = alignlib_lite.makeAlignmentVector()

            if rep == member and rep in lengths:
                # identity alignment of the representative with itself
                alignlib_lite.addDiagonal2Alignment(
                    map_rep2mem, 1, lengths[rep], 0)
            elif member in peptide_sequences and rep in peptide_sequences:
                aligner = alignlib_lite.makeAlignatorDPFull(
                    alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
                row = alignlib_lite.makeSequence(peptide_sequences[rep])
                col = alignlib_lite.makeSequence(peptide_sequences[member])
                aligner.align(map_rep2mem, row, col)

            columns.append(
                alignlib_lite.AlignmentFormatEmissions(map_rep2mem))

        print("\t".join(map(str, columns)))

    sys.stdout.flush()

    return cluster_id
Exemple #9
0
def main(argv=None):
    """script main.

    Reads predictions line by line from stdin, converts each one into its
    list of exons (Exons.Alignment2Exons) and writes them in the format
    selected with --format:

    default       one line per exon with running cds id, prediction id,
                  cumulative exon coordinates, frame, genomic coordinates
                  and exon sequence
    cds           str() of every exon, with query token and rank set
    exons         one tab-separated line per exon (written to
                  options.stdout)
    cdnas         one line per prediction including the genomic sequence
    map           one line per prediction with the peptide-to-genome
                  alignment
    intron-fasta  one fasta record per intron (written to options.stdout);
                  single-exon predictions are skipped
    gff-match     one gff-like line per prediction
    gff-exon      one gff-like line per exon

    Lines starting with "#" or "id" are ignored; lines that fail to parse
    with ValueError are reported on options.stdlog and skipped.

    NOTE(review): *argv* defaults to sys.argv but is not handed on to
    E.Start in this function - confirm whether E.Start should receive it.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    # "gff-match" and "gff-exon" are implemented below but were missing
    # from the choices, which made them impossible to select. "gff" is
    # kept so existing command lines keep parsing (it produces the default
    # output, as before).
    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "gff-match", "gff-exon",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r",
                      "--reset-to-start",
                      dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query",
                      dest="reset_query",
                      action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print("%s no arguments required." % (USAGE,))
        sys.exit(2)

    # running id for the lines of the default output format
    cds_id = 1

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError as msg:
            options.stdlog.write("# parsing failed with msg %s in line %s" %
                                 (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        # sanity check: the last exon has to end where the prediction ends
        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and \
                entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = lsequence - \
                entry.mSbjctGenomeTo, lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query
        # NOTE(review): this is gated on --reset-to-start while the parsed
        # --reset-query option is never consulted - presumably this block
        # was meant to test options.reset_query; confirm before changing.
        if options.reset_to_start:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        # Exactly one output branch per prediction. The original started a
        # second if-chain at "exons", so "cds" additionally fell through
        # to the default output at the end of that chain.
        if options.format == "cds":
            for rank, cd in enumerate(cds, 1):
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print(str(cd))

        elif options.format == "exons":
            for rank, cd in enumerate(cds, 1):
                options.stdout.write("\t".join(
                    map(str, (entry.mPredictionId, cd.mSbjctToken,
                              cd.mSbjctStrand, rank, cd.frame, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.mGenomeFrom, cd.mGenomeTo))) +
                                     "\n")

        elif options.format == "cdnas":
            print("\t".join(
                map(str,
                    (entry.mPredictionId, entry.mQueryToken, entry.mSbjctToken,
                     entry.mSbjctStrand, entry.mSbjctGenomeFrom - offset,
                     entry.mSbjctGenomeTo - offset, genomic_sequence))))

        elif options.format == "map":

            map_prediction2genome = alignlib_lite.makeAlignmentSet()

            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome, cd.mPeptideFrom + 1, cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            print("\t".join(
                map(str, (entry.mPredictionId, entry.mSbjctToken,
                          entry.mSbjctStrand,
                          alignlib_lite.AlignmentFormatEmissions(
                              map_prediction2genome)))))

        elif options.format == "intron-fasta":
            # a single exon has no introns; such predictions are not
            # counted as output
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for rank, cd in enumerate(cds[1:], 1):
                # NOTE(review): the last field of the key is the start of
                # the prediction, not the end of the intron
                # (cd.mGenomeFrom) that the slice below uses - confirm
                # whether that is intended.
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank, entry.mSbjctToken,
                    entry.mSbjctStrand, last, entry.mSbjctGenomeFrom)
                intron_start = last - entry.mSbjctGenomeFrom
                intron_end = cd.mGenomeFrom - entry.mSbjctGenomeFrom
                sequence = genomic_sequence[intron_start:intron_end]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" %
                  (entry.mSbjctToken,
                   "gpipe", "similarity",
                   entry.mSbjctGenomeFrom,
                   entry.mSbjctGenomeTo,
                   entry.mPercentIdentity,
                   entry.mSbjctStrand,
                   ".",
                   entry.mQueryToken,
                   entry.mQueryFrom,
                   entry.mQueryTo,
                   entry.score,
                   entry.mNIntrons,
                   entry.mNFrameShifts,
                   entry.mNStopCodons))

        elif options.format == "gff-exon":
            for rank, cd in enumerate(cds, 1):
                print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" %
                      (entry.mSbjctToken,
                       "gpipe", "similarity",
                       cd.mGenomeFrom,
                       cd.mGenomeTo,
                       entry.mPercentIdentity,
                       entry.mSbjctStrand,
                       ".",
                       entry.mQueryToken,
                       cd.mPeptideFrom / 3 + 1,
                       cd.mPeptideTo / 3 + 1,
                       entry.score,
                       rank,
                       len(cds),
                       entry.mPredictionId))
        else:
            # default output: peptide coordinates are replaced by the
            # cumulative exon lengths, starting from 0
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                print("\t".join(
                    map(str, (cds_id, entry.mPredictionId, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.frame, cd.mGenomeFrom,
                              cd.mGenomeTo, cd.mSequence))))
                cds_id += 1

        noutput += 1
Exemple #10
0
                    tt += 1
                    continue
                overlap += (min(r.mGenomeTo, t.mGenomeTo) -
                            max(r.mGenomeFrom, t.mGenomeFrom))
                rr += 1
                tt += 1

            if overlap == 0:
                continue

            map_reference2target.clear()
            row = alignlib_lite.makeSequence(reference.mTranslation)
            col = alignlib_lite.makeSequence(target.mTranslation)
            alignator.align(map_reference2target, row, col)

            f = alignlib_lite.AlignmentFormatEmissions(map_reference2target)
            row_ali, col_ali = f.mRowAlignment, f.mColAlignment
            pidentity = 100.0 * alignlib_lite.calculatePercentIdentity(
                map_reference2target, row, col)
            psimilarity = 100.0 * alignlib_lite.calculatePercentSimilarity(
                map_reference2target)

            union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \
                    min( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom )
            inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \
                    max( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom )

            assignment_id += 1

            print string.join(
                map(str,