コード例 #1
0
ファイル: mali_evaluate.py プロジェクト: lesheng/cgat
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if denominator is 0."""
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator


def _gene_structure_status(comparison, ncmp_exons, nref_exons):
    """Classify one pairwise gene structure comparison.

    comparison is the result object of Exons.CompareGeneStructures; only its
    mNumIdenticalExons and mNumSkippedExons counters are read here.
    ncmp_exons / nref_exons are the exon counts of the compared and the
    reference transcript.

    Returns a tuple (status, is_ok, is_perfect). status is a three character
    string:

    1. conservation: F (no identical exon), = (perfect), D (completely
       different), A (almost conserved), M (mostly conserved),
       S (spuriously conserved), U (unconserved)
    2. exon count of compared versus reference transcript: >, < or =
    3. size class derived from the smaller/larger exon count:
       S, s, D, d, M, m or U
    """
    min_nexons = min(ncmp_exons, nref_exons)
    max_nexons = max(ncmp_exons, nref_exons)

    # exons of the compared transcript that are neither identical nor skipped
    nunmatched = ncmp_exons - comparison.mNumIdenticalExons - \
        comparison.mNumSkippedExons

    is_ok = False
    is_perfect = False
    status = []

    if comparison.mNumIdenticalExons == 0:
        # F: complete failure, not a single identical exon
        status.append("F")
    elif nunmatched == 0:
        # =: perfect conservation
        status.append("=")
        is_ok = True
        is_perfect = True
    elif nunmatched == min_nexons - comparison.mNumSkippedExons:
        # D: completely different predictions
        status.append("D")
    elif nunmatched in (1, 2):
        # A: almost conserved
        status.append("A")
        is_ok = True
    elif nunmatched > 2:
        # M: mostly conserved (good enough for long proteins),
        # S: spuriously conserved. Integer percentage, as before.
        if (100 * comparison.mNumIdenticalExons) // max_nexons > param_evaluate_min_percent_exon_identity:
            status.append("M")
        else:
            status.append("S")
    else:
        # U: unconserved
        status.append("U")

    if ncmp_exons > nref_exons:
        status.append(">")
    elif ncmp_exons < nref_exons:
        status.append("<")
    else:
        status.append("=")

    if min_nexons == max_nexons and min_nexons == 1:
        status.append("S")
    elif min_nexons == 1 and max_nexons == 2:
        status.append("s")
    elif min_nexons == 2 and max_nexons == 2:
        status.append("D")
    elif min_nexons == 2 and max_nexons > 2:
        status.append("d")
    elif min_nexons == max_nexons:
        status.append("M")
    elif min_nexons > 2 and max_nexons > 2:
        status.append("m")
    else:
        status.append("U")

    return "".join(status), is_ok, is_perfect


def WriteGeneStructureCorrespondence(mali, identifiers, exons, param_master_pattern, gap_char="-", prefix=""):
    """Compare the gene structures of all ordered pairs of aligned sequences.

    For every ordered pair (key1, key2) of distinct identifiers the exon
    boundaries of key1 are mapped onto key2 through the multiple alignment
    and compared with Exons.CompareGeneStructures. Summary lines
    ("allstructure", "refstructure", "nexons" and, at log level >= 2,
    "genepair") are printed to stdout, each starting with prefix.

    Arguments:
    mali -- mapping of identifier to aligned sequence
    identifiers -- identifiers to compare; they index both mali and exons
    exons -- mapping of identifier to the list of exons of that sequence
    param_master_pattern -- regular expression; identifiers matching it are
        treated as reference sequences and are summarised separately
    gap_char -- gap character of the alignment
    prefix -- first column of every line written

    Returns a wmali x wmali numpy matrix (wmali = len(identifiers)) where
    entry [x][y] is the compatibility of identifiers[x] with identifiers[y]:
    the integer percentage of missed exon boundaries among all exon
    boundaries of the pair.

    0   : perfect compatibility (exact match); also used for the diagonal
    100 : no compatibility

    Returns None if identifiers is empty.

    Raises ValueError if reference comparisons were made but they covered
    no exons.

    Reads the module level settings param_loglevel,
    param_threshold_splipping_exon_boundary and
    param_evaluate_min_percent_exon_identity.
    """

    # guard first: everything below indexes identifiers[...]
    if not identifiers:
        return None

    wmali = len(identifiers)
    matrix_compatibility = numpy.zeros((wmali, wmali))

    rx = re.compile(param_master_pattern)

    # counters over all pairs
    nok = 0
    nperfect = 0
    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    # counters over pairs whose second member is a reference sequence
    ref_nok = 0
    ref_nperfect = 0
    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    # list of number of exons
    anexons = []

    # exons in reference
    ref_nexons = 0

    for x, key1 in enumerate(identifiers):

        seq = mali[key1]
        key1_exons = exons[key1]

        anexons.append(len(key1_exons))
        if rx.search(key1):
            ref_nexons = len(key1_exons)

        # enumerate keeps y in step with the position of key2 even when the
        # self comparison is skipped, so matrix columns line up with
        # identifiers.
        for y, key2 in enumerate(identifiers):

            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print("#############################################")
                print("# comparing %s to %s" % (key1, key2))

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            if param_loglevel >= 5:
                print(alignlib_lite.py_writeAlignataTable(map_cmp2ref))

            # map exon boundaries to reference sequence
            cmp_exons = []
            for e in key1_exons:
                mapped = e.GetCopy()
                mapped.mPeptideFrom = MyMap(
                    map_cmp2ref, e.mPeptideFrom + 1, 3, -1)
                mapped.mPeptideTo = MyMap(map_cmp2ref, e.mPeptideTo, 3, 0)
                cmp_exons.append(mapped)

            # massage boundaries for terminal exons:
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for e in key1_exons:
                    print("# exon %s" % str(e))

            if param_loglevel >= 3:
                for e in cmp_exons:
                    print("# exon %s" % str(e))
                for e in ref_exons:
                    print("# exon %s" % str(e))

            # do exon comparison
            comparison = Exons.CompareGeneStructures(cmp_exons,
                                                     ref_exons,
                                                     threshold_min_pide=0,
                                                     threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary)

            if param_loglevel >= 3:
                print(comparison.Pretty(prefix="# EVAL: "))

            # analyse results
            status, is_ok, is_perfect = _gene_structure_status(
                comparison, len(cmp_exons), len(ref_exons))

            if is_ok:
                nok += 1
            if is_perfect:
                nperfect += 1

            # compatibility: integer percentage of missed exon boundaries
            nboundaries = comparison.mNumRefBoundaries + \
                comparison.mNumCmpBoundaries

            if nboundaries == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) // nboundaries

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print("%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" % (prefix, key1, key2, status, compatibility_value,
                                                                    len(cmp_exons), len(ref_exons), str(comparison)))

            # comparison to reference: count separately:
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

    # number of ordered pairs; 0 for a single sequence, hence _safe_ratio
    ntotal = wmali * (wmali - 1)

    print("%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
        prefix,
        ntotal, nperfect, nok,
        _safe_ratio(nperfect, ntotal),
        _safe_ratio(nok, ntotal),
        ntotal_exons, nidentical_exons, nskipped_exons,
        _safe_ratio(nidentical_exons, ntotal_exons),
        _safe_ratio(nidentical_exons + nskipped_exons, ntotal_exons)))

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            raise ValueError(
                "no exons in reference : ref_ntotal_exons = 0, ref_ntotal = %i" % (
                    ref_ntotal))

        print("%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
            prefix,
            ref_ntotal, ref_nperfect, ref_nok,
            float(ref_nperfect) / ref_ntotal,
            float(ref_nok) / ref_ntotal,
            ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
            float(ref_nidentical_exons) / ref_ntotal_exons,
            float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons))

    # distribution of exon counts over all sequences
    nexons_stats = (min(anexons),
                    max(anexons),
                    numpy.mean(anexons),
                    numpy.median(anexons),
                    numpy.std(anexons))

    print("%s\tnexons\t%i\t%i\t" % (prefix, len(anexons), ref_nexons) +
          "\t".join(["%.2f" % value for value in nexons_stats]))

    return matrix_compatibility
コード例 #2
0
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2,
               transcript2, peptide_map_a2b):
    """Write unaligned pairs of equivalent exons and of the introns between them.

    The gene structures cds1 and cds2 are compared with
    Exons.CompareGeneStructures using peptide_map_a2b. For every pair of
    equivalent exons whose coverage exceeds param_min_exon_coverage:

    * if param_write_exons is set, an "exon" record
      (AlignedPairs.UnalignedPair) with both exon sequences is printed;
    * if param_write_introns is set and the pair directly follows the
      previously accepted pair in both transcripts, an "intron" record with
      the two intron sequences between the consecutive exons is printed.

    Introns are not written, and are counted in the returned nskipped, when
    a fragment is shorter than param_min_intron_length or longer than
    param_max_intron_length (a limit of 0/None disables that check). Empty
    intron fragments are reported with an "## ERROR" line and not written.

    Arguments:
    token1, token2 -- identifiers written into the records
    peptide1, peptide2 -- not used by this function
    cds1, cds2 -- exon lists; mGenomeFrom/mGenomeTo of each exon are used as
        slice coordinates into transcript1/transcript2
    transcript1, transcript2 -- sequences the fragments are sliced from
    peptide_map_a2b -- alignment between the two peptides

    Returns a tuple (nerrors, nskipped). nerrors is always 0 in the current
    implementation; nskipped is the number of introns rejected because of
    their length.

    Reads the module level settings param_loglevel, param_min_exon_coverage,
    param_write_exons, param_write_introns, param_min_intron_length and
    param_max_intron_length.
    """

    if param_loglevel >= 3:
        for cd in cds1:
            print("# %s" % str(cd))
        for cd in cds2:
            print("# %s" % str(cd))
        print("# peptide_map_a2b %s" % str(
            alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)))
        sys.stdout.flush()

    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2)

    if len(cds1) != len(cds2) and param_loglevel >= 4:
        print("# WARNING: different number of exons!")

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = sum(1 for x in range(dna_map_a2b.getRowFrom(),
                                       dna_map_a2b.getRowTo() + 1)
                      if dna_map_a2b.mapRowToCol(x) >= 0)
        print("# nmapped= %s" % nmapped)
        print(str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)))

    result = Exons.CompareGeneStructures(cds1,
                                         cds2,
                                         map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print(result.Pretty("#"))

    nskipped_exons, nskipped_introns = 0, 0

    # indices of the previously accepted exon pair
    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]

        #######################################################################
        # write unaligned exons
        if param_write_exons:
            exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
            exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo

            print(str(pair))
            sys.stdout.flush()

        #######################################################################
        # write orthologous introns: those between two consecutive pairs of
        # orthologous exons
        if param_write_introns and last_e1 is not None:

            if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                nskipped_introns += 1
            else:
                intron_from1 = cds1[e1 - 1].mGenomeTo
                intron_to1 = c1.mGenomeFrom
                intron_from2 = cds2[e2 - 1].mGenomeTo
                intron_to2 = c2.mGenomeFrom

                intron_fragment1 = transcript1[intron_from1:intron_to1]
                intron_fragment2 = transcript2[intron_from2:intron_to2]

                shortest = min(len(intron_fragment1), len(intron_fragment2))
                longest = max(len(intron_fragment1), len(intron_fragment2))

                too_short = param_min_intron_length and \
                    shortest < param_min_intron_length
                too_long = param_max_intron_length and \
                    longest > param_max_intron_length

                if shortest == 0:
                    print("## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %
                          (intron_from1, intron_to1, len(transcript1),
                           intron_from2, intron_to2, len(transcript2)))
                elif too_short or too_long:
                    # counted regardless of the log level so that the return
                    # value does not depend on verbosity
                    nskipped += 1
                    if param_loglevel >= 1:
                        print("# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %
                              (token1, e1, token2, e2,
                               len(intron_fragment1),
                               len(intron_fragment2)))
                        sys.stdout.flush()
                else:
                    pair = AlignedPairs.UnalignedPair()

                    pair.mCategory = "intron"
                    pair.mToken1 = token1
                    pair.mId1 = e1 + 1
                    pair.mNum1 = len(cds1) - 1
                    pair.mLen1 = len(intron_fragment1)
                    pair.mFrom1 = intron_from1
                    pair.mTo1 = intron_to1
                    pair.mSequence1 = intron_fragment1
                    pair.mToken2 = token2
                    pair.mId2 = e2 + 1
                    pair.mNum2 = len(cds2) - 1
                    pair.mLen2 = len(intron_fragment2)
                    pair.mFrom2 = intron_from2
                    pair.mTo2 = intron_to2
                    pair.mSequence2 = intron_fragment2

                    print(str(pair))

        # always remember the accepted exon pair, even if its intron was not
        # written, so that the next intron is judged against the right exons
        last_e1, last_e2 = e1, e2

    if param_loglevel >= 3:
        print("# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons,
                                                          nskipped_introns))

    return nerrors, nskipped