def WriteGeneStructureCorrespondence(mali, identifiers, exons,
                                     param_master_pattern,
                                     gap_char="-",
                                     prefix=""):
    """Compare the gene structures of all ordered pairs of sequences.

    Every sequence in ``identifiers`` is compared against every other
    sequence: its exons are mapped onto the other sequence through the
    multiple alignment and handed to ``Exons.CompareGeneStructures``.
    Per-pair lines (``genepair``, at log level >= 2) and summary lines
    (``allstructure``, ``refstructure``, ``nexons``) are printed to stdout.

    Arguments:
        mali: mapping of identifier to its aligned sequence.
        identifiers: sequence of identifiers to compare; also fixes the
            row/column order of the returned matrix.
        exons: mapping of identifier to its list of exon objects.
        param_master_pattern: regular expression; identifiers matching it
            are counted separately as reference sequences.
        gap_char: gap character passed on to ``MaliIO.getMapFromMali``.
        prefix: first column of every tab-separated output line.

    Returns:
        A ``len(identifiers)`` x ``len(identifiers)`` numpy matrix whose
        entry ``[x][y]`` is the compatibility of ``identifiers[x]`` with
        ``identifiers[y]``: 100 * (missed exon boundaries) / (total exon
        boundaries), i.e. 0 for a perfect match. The diagonal is left at 0.
        Returns None if ``identifiers`` is empty.

    Raises:
        ValueError: if reference comparisons were made but they contained
            no exons at all.
        ZeroDivisionError: if only a single identifier is given (there are
            no pairs to average over) or no exons were seen.
    """
    # Check for empty input before anything indexes into identifiers.
    if len(identifiers) == 0:
        return

    wmali = len(identifiers)
    matrix_compatibility = numpy.zeros((wmali, wmali))

    # counters over all pairs
    nok = 0
    nperfect = 0
    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    # counters over pairs whose second member matches the master pattern
    ref_nok = 0
    ref_nperfect = 0
    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    rx = re.compile(param_master_pattern)

    # number of exons per sequence
    anexons = []
    # number of exons in the (last seen) reference sequence
    ref_nexons = 0

    for x, key1 in enumerate(identifiers):

        seq = mali[key1]
        anexons.append(len(exons[key1]))
        if rx.search(key1):
            ref_nexons = len(exons[key1])

        # enumerate() keeps the column index aligned with identifiers even
        # though the diagonal is skipped.
        for y, key2 in enumerate(identifiers):

            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print("#############################################")
                print("# comparing %s to %s" % (key1, key2))

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            # map exon boundaries to reference sequence
            cmp_exons = []

            if param_loglevel >= 5:
                print(alignlib_lite.py_writeAlignataTable(map_cmp2ref))

            for e in exons[key1]:
                mapped = e.GetCopy()
                mapped.mPeptideFrom = MyMap(
                    map_cmp2ref, e.mPeptideFrom + 1, 3, -1)
                mapped.mPeptideTo = MyMap(map_cmp2ref, e.mPeptideTo, 3, 0)
                cmp_exons.append(mapped)

            # massage boundaries for terminal exons: an unmapped outer
            # boundary is replaced by the reference's outer boundary.
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for e in exons[key1]:
                    print("# exon %s" % str(e))

            if param_loglevel >= 3:
                for e in cmp_exons:
                    print("# exon %s" % str(e))
                for e in ref_exons:
                    print("# exon %s" % str(e))

            # do exon comparison
            comparison = Exons.CompareGeneStructures(
                cmp_exons,
                ref_exons,
                threshold_min_pide=0,
                threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary)

            if param_loglevel >= 3:
                print(comparison.Pretty(prefix="# EVAL: "))

            # analyse results
            min_nexons = min(len(cmp_exons), len(ref_exons))
            max_nexons = max(len(cmp_exons), len(ref_exons))

            status = []
            is_perfect = False
            is_ok = False

            # non-equivalent exon pairs
            n_nonequivalent = len(cmp_exons) - comparison.mNumIdenticalExons - \
                comparison.mNumSkippedExons

            # first status character: degree of conservation
            if comparison.mNumIdenticalExons == 0:
                # F: complete and utter failure, no excuses
                status.append("F")
            elif n_nonequivalent == 0:
                # =: perfect conservation
                status.append("=")
                is_ok = True
                is_perfect = True
            elif n_nonequivalent == min_nexons - comparison.mNumSkippedExons:
                # D: completely different predictions
                status.append("D")
            elif n_nonequivalent in (1, 2):
                # A: almost conserved
                status.append("A")
                is_ok = True
            elif n_nonequivalent > 2:
                # M: mostly conserved (in case of long proteins that is
                # good enough).
                if (100 * comparison.mNumIdenticalExons) / max_nexons > \
                        param_evaluate_min_percent_exon_identity:
                    status.append("M")
                else:
                    # S: spuriously conserved
                    status.append("S")
            else:
                # U: unconserved
                status.append("U")

            # second status character: relative number of exons.
            # The original tested len(ref_exons) < len(cmp_exons) here,
            # which duplicates the first condition, so "<" was unreachable.
            if len(cmp_exons) > len(ref_exons):
                status.append(">")
            elif len(cmp_exons) < len(ref_exons):
                status.append("<")
            else:
                status.append("=")

            # third status character: size class of the two structures
            if min_nexons == max_nexons and min_nexons == 1:
                status.append("S")
            elif min_nexons == 1 and max_nexons == 2:
                status.append("s")
            elif min_nexons == 2 and max_nexons == 2:
                status.append("D")
            elif min_nexons == 2 and max_nexons > 2:
                status.append("d")
            elif min_nexons == max_nexons:
                status.append("M")
            elif min_nexons > 2 and max_nexons > 2:
                status.append("m")
            else:
                status.append("U")

            status = "".join(status)

            structure_compatibility = 100

            if is_ok:
                nok += 1
                structure_compatibility = 100 - 100 * \
                    (comparison.mNumIdenticalExons +
                     comparison.mNumSkippedExons) / len(cmp_exons)
            if is_perfect:
                nperfect += 1
                structure_compatibility = 0

            if abs(comparison.mNumDifferenceExons) > param_max_exons_difference:
                compatibility_value = 100
            else:
                compatibility_value = structure_compatibility

            # NOTE(review): the value computed above is overwritten by the
            # boundary-based ratio below (the metric the matrix reports);
            # the statement order of the original is kept -- confirm
            # whether the exon-difference cut-off was meant to apply.
            t = comparison.mNumRefBoundaries + comparison.mNumCmpBoundaries

            if t == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) / t

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print("%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" % (
                    prefix, key1, key2, status, compatibility_value,
                    len(cmp_exons), len(ref_exons), str(comparison)))

            # comparison to reference: count separately:
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

    # number of ordered pairs; 0 for a single sequence, in which case the
    # ratios below raise ZeroDivisionError (unchanged from the original).
    ntotal = wmali * (wmali - 1)

    print("%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
        prefix,
        ntotal, nperfect, nok,
        float(nperfect) / ntotal,
        float(nok) / ntotal,
        ntotal_exons, nidentical_exons, nskipped_exons,
        float(nidentical_exons) / ntotal_exons,
        float(nidentical_exons + nskipped_exons) / ntotal_exons))

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            # String exceptions are not raisable on Python >= 2.6; use a
            # real exception carrying the same message.
            raise ValueError(
                "no exons in reference : ref_ntotal_exons = 0, ref_ntotal = %i" % (
                    ref_ntotal))

        print("%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
            prefix,
            ref_ntotal, ref_nperfect, ref_nok,
            float(ref_nperfect) / ref_ntotal,
            float(ref_nok) / ref_ntotal,
            ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
            float(ref_nidentical_exons) / ref_ntotal_exons,
            float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons))

    # numpy.mean/median/std are the functions scipy.mean/median/std used
    # to alias.
    exon_stats = (min(anexons),
                  max(anexons),
                  numpy.mean(anexons),
                  numpy.median(anexons),
                  numpy.std(anexons))
    print("%s\tnexons\t%i\t%i\t" % (prefix, len(anexons), ref_nexons) +
          "\t".join("%.2f" % value for value in exon_stats))

    return matrix_compatibility
def WriteExons(token1, peptide1, cds1, transcript1,
               token2, peptide2, cds2, transcript2,
               peptide_map_a2b):
    """Print unaligned exon and intron pairs for two transcripts.

    The gene structures ``cds1`` and ``cds2`` are compared with
    ``Exons.CompareGeneStructures`` using the peptide alignment. For each
    resulting exon equivalence with sufficient coverage:

    * if ``param_write_exons`` is set, the two exon fragments are printed
      as an ``AlignedPairs.UnalignedPair`` of category ``exon``;
    * if ``param_write_introns`` is set and both exons directly follow the
      previously processed equivalence, the introns in between are printed
      as an ``UnalignedPair`` of category ``intron``.

    Arguments:
        token1, token2: identifiers written into the output records.
        peptide1, peptide2: not used by this function.
        cds1, cds2: lists of exon objects; their ``mGenomeFrom`` /
            ``mGenomeTo`` attributes are used to slice the transcripts.
        transcript1, transcript2: sequences the fragments are sliced from.
        peptide_map_a2b: peptide alignment between the two sequences.

    Returns:
        Tuple ``(nerrors, nskipped)``. ``nerrors`` is always 0 in the
        current code (only disabled code paths incremented it);
        ``nskipped`` counts intron pairs whose lengths are out of bounds.
    """
    if param_loglevel >= 3:
        for cd in cds1:
            print("# %s" % str(cd))
        for cd in cds2:
            print("# %s" % str(cd))
        print("# peptide_map_a2b %s" % str(
            alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)))
        sys.stdout.flush()

    dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b,
                                                 cds1, cds2)

    if len(cds1) != len(cds2):
        if param_loglevel >= 4:
            print("")  # WARNING: different number of exons!"

    # The objects below are only referenced by alignment methods that are
    # currently disabled; their construction is kept unchanged.
    seq1 = alignlib_lite.makeSequence(transcript1)
    seq2 = alignlib_lite.makeSequence(transcript2)
    tmp_map_a2b = alignlib_lite.makeAlignmentVector()

    dialign = WrapperDialign.Dialign("-n")
    dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8")
    dba = WrapperDBA.DBA()
    #clustal = WrapperClustal.Clustal()

    matrix, gop, gep = global_substitution_matrix
    alignator_nw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix)
    alignator_sw = alignlib_lite.makeAlignatorDPFullDP(
        alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix)

    # concatenated alignments for exons:
    # 1: only the common parts
    ali_common1 = ""
    ali_common2 = ""

    # advance to the first exons that reach into the aligned region
    e1, e2 = 0, 0
    while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom():
        e1 += 1
    while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom():
        e2 += 1

    nskipped, nerrors = 0, 0

    if param_loglevel >= 5:
        nmapped = 0
        for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1):
            if dna_map_a2b.mapRowToCol(x) >= 0:
                nmapped += 1
        print("# nmapped= %s" % nmapped)
        print(str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)))

    # declare alignments used
    map_intron_a2b = alignlib_lite.makeAlignmentVector()

    result = Exons.CompareGeneStructures(
        cds1, cds2, map_cmp2ref=peptide_map_a2b)

    if param_loglevel >= 2:
        print(result.Pretty("#"))

    nskipped_exons, nskipped_introns = 0, 0

    last_e1, last_e2 = None, None

    for link in result.mEquivalences:

        if link.mCoverage <= param_min_exon_coverage:
            nskipped_exons += 1
            continue

        e1, e2 = link.mId1, link.mId2

        c1 = cds1[e1]
        c2 = cds2[e2]
        exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo]
        exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo]

        #######################################################################
        # write unaligned exons
        if param_write_exons:
            pair = AlignedPairs.UnalignedPair()

            pair.mCategory = "exon"
            pair.mToken1 = token1
            pair.mId1 = e1 + 1
            pair.mNum1 = len(cds1)
            pair.mLen1 = len(exon_fragment1)
            pair.mSequence1 = exon_fragment1
            pair.mToken2 = token2
            pair.mId2 = e2 + 1
            pair.mNum2 = len(cds2)
            pair.mLen2 = len(exon_fragment2)
            pair.mSequence2 = exon_fragment2
            pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo
            pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo

            print(str(pair))
            sys.stdout.flush()

        # Disabled in the original: building the alignment for the overlap
        # of both exons and writing it as "caligned" output.

        #######################################################################
        # write alignment of introns for orthologous introns
        # orthologous introns are between orthologous exons
        if param_write_introns:

            if last_e1 is not None:
                if e1 - last_e1 != 1 or e2 - last_e2 != 1:
                    # exons are not adjacent in both structures
                    nskipped_introns += 1
                else:
                    pair = AlignedPairs.UnalignedPair()

                    intron_from1 = cds1[e1 - 1].mGenomeTo
                    intron_to1 = cds1[e1].mGenomeFrom
                    intron_from2 = cds2[e2 - 1].mGenomeTo
                    intron_to2 = cds2[e2].mGenomeFrom

                    intron_fragment1 = transcript1[intron_from1:intron_to1]
                    intron_fragment2 = transcript2[intron_from2:intron_to2]

                    if len(intron_fragment1) == 0 or len(intron_fragment2) == 0:
                        print("## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %
                              (intron_from1, intron_to1, len(transcript1),
                               intron_from2, intron_to2, len(transcript2)))
                        # note: this also leaves last_e1/last_e2 untouched
                        continue

                    pair.mCategory = "intron"
                    pair.mToken1 = token1
                    pair.mId1 = e1 + 1
                    pair.mNum1 = len(cds1) - 1
                    pair.mLen1 = len(intron_fragment1)
                    pair.mFrom1 = intron_from1
                    pair.mTo1 = intron_to1
                    pair.mSequence1 = intron_fragment1
                    pair.mToken2 = token2
                    pair.mId2 = e2 + 1
                    # was "pair.mNum1 = len(cds2) - 1": a copy-paste slip
                    # that clobbered mNum1 and never set mNum2.
                    pair.mNum2 = len(cds2) - 1
                    pair.mLen2 = len(intron_fragment2)
                    pair.mFrom2 = intron_from2
                    pair.mTo2 = intron_to2
                    pair.mSequence2 = intron_fragment2

                    if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \
                            (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \
                            (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \
                            (param_max_intron_length and len(intron_fragment2) > param_max_intron_length):
                        if param_loglevel >= 1:
                            print("# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %
                                  (token1, e1,
                                   token2, e2,
                                   len(intron_fragment1),
                                   len(intron_fragment2)))
                            sys.stdout.flush()
                        nskipped += 1

                    # NOTE(review): the original statement order is kept,
                    # so the pair is printed even when it was counted as
                    # skipped above -- confirm this is intended.
                    print(str(pair))

                    # Disabled in the original: aligning the anchored
                    # intron fragments with a choice of methods
                    # (unaligned, dialigned, dialignedlgs, dbaligned,
                    # clusaligned, nwaligned, swaligned) and writing the
                    # result as "intron" lines.

        last_e1, last_e2 = e1, e2

    ##########################################################################
    # write concatenated exons
    # Disabled in the original: writing the concatenated exon alignments
    # for the methods "common", "exons" and "full".

    if param_loglevel >= 3:
        print("# skipped_exons=%i, skipped_introns=%i" % (
            nskipped_exons, nskipped_introns))

    return nerrors, nskipped