def WriteRadius(mali, identifiers, prefix="", gap_char="-"):
    """Write percent-identity statistics over all pairwise comparisons.

    Every unordered pair of sequences named in ``identifiers`` is compared
    twice with ``MaliIO.getPercentIdentity``: once on the nucleotide
    sequences taken from ``mali`` and once on their translations obtained
    from ``Genomics.TranslateDNA2Protein``.

    One tab-separated row is printed to stdout::

        <prefix> pide <npairs> <min max mean median std of nucleotide pides> <min max mean median std of amino acid pides>

    Arguments:
        mali: mapping from identifier to aligned nucleotide sequence.
        identifiers: identifiers of the sequences to compare.
        prefix: text written into the first column.
        gap_char: gap character handed to ``MaliIO.getPercentIdentity``.

    With fewer than two identifiers there are no pairs to compare; the ten
    statistics columns are then written as ``na`` instead of failing on an
    empty list.
    """

    def summarize(values):
        # min, max, mean, median and standard deviation with two decimals.
        if not values:
            return "\t".join(["na"] * 5)
        return "\t".join(
            "%.2f" % v
            for v in (min(values), max(values), numpy.mean(values),
                      numpy.median(values), numpy.std(values)))

    seq_aa = [Genomics.TranslateDNA2Protein(mali[key]) for key in identifiers]

    pides_na = []
    pides_aa = []
    nsequences = len(identifiers)
    for x in range(nsequences):
        # y starts behind x, so each unordered pair is visited exactly once
        # and a sequence is never compared with itself.
        for y in range(x + 1, nsequences):
            pides_na.append(MaliIO.getPercentIdentity(
                mali[identifiers[x]], mali[identifiers[y]], gap_char))
            pides_aa.append(MaliIO.getPercentIdentity(
                seq_aa[x], seq_aa[y], gap_char))

    print("%s\tpide\t%i\t" % (prefix, len(pides_na)) +
          summarize(pides_na) + "\t" + summarize(pides_aa))
def WriteRadius(mali, identifiers, prefix="", gap_char="-"):
    """Write percent-identity statistics over all pairwise comparisons.

    Every unordered pair of sequences named in ``identifiers`` is compared
    twice with ``MaliIO.getPercentIdentity``: once on the nucleotide
    sequences taken from ``mali`` and once on their translations obtained
    from ``Genomics.TranslateDNA2Protein``.

    One tab-separated row is printed to stdout::

        <prefix> pide <npairs> <min max mean median std of nucleotide pides> <min max mean median std of amino acid pides>

    Arguments:
        mali: mapping from identifier to aligned nucleotide sequence.
        identifiers: identifiers of the sequences to compare.
        prefix: text written into the first column.
        gap_char: gap character handed to ``MaliIO.getPercentIdentity``.

    With fewer than two identifiers there are no pairs to compare; the ten
    statistics columns are then written as ``na`` instead of failing on an
    empty list.
    """
    # Imported locally: this function used to reach these routines only
    # through the deprecated NumPy aliases in the scipy namespace.
    import numpy

    def summarize(values):
        # min, max, mean, median and standard deviation with two decimals.
        if not values:
            return "\t".join(["na"] * 5)
        return "\t".join(
            "%.2f" % v
            for v in (min(values), max(values), numpy.mean(values),
                      numpy.median(values), numpy.std(values)))

    seq_aa = [Genomics.TranslateDNA2Protein(mali[key]) for key in identifiers]

    pides_na = []
    pides_aa = []
    nsequences = len(identifiers)
    for x in range(nsequences):
        # y starts behind x, so each unordered pair is visited exactly once
        # and a sequence is never compared with itself.
        for y in range(x + 1, nsequences):
            pides_na.append(MaliIO.getPercentIdentity(
                mali[identifiers[x]], mali[identifiers[y]], gap_char))
            pides_aa.append(MaliIO.getPercentIdentity(
                seq_aa[x], seq_aa[y], gap_char))

    print("%s\tpide\t%i\t" % (prefix, len(pides_na)) +
          summarize(pides_na) + "\t" + summarize(pides_aa))
def WriteCodonSummary(mali, identifiers, frame_columns, prefix="", gap_char="-"):
    """Write codon statistics and return the codon-restricted alignment.

    Each sequence in ``mali`` is passed through ``MaliIO.getCodonSequence``
    (with ``remove_stops=True``), which yields the processed sequence plus
    the number of aligned positions, codons and stop codons.

    One tab-separated row is printed to stdout::

        <prefix> codons <nclean> <nnostops> <aligned stats> <codon stats> <stop stats>

    where ``nnostops`` counts sequences without stop codons, ``nclean``
    counts sequences without stop codons whose aligned count equals their
    codon count, and each ``stats`` group is min, max, mean, median and
    standard deviation.

    Arguments:
        mali: mapping from identifier to sequence; every entry is processed.
        identifiers: not used; kept so existing callers keep working.
        frame_columns: columns handed to ``MaliIO.getCodonSequence``.
        prefix: text written into the first column.
        gap_char: gap character handed to ``MaliIO.getCodonSequence``.

    Returns:
        dict mapping each identifier to its processed sequence.

    For an empty ``mali`` the statistics columns are written as ``na``.
    """
    # Imported locally: this function used to reach these routines only
    # through the deprecated NumPy aliases in the scipy namespace.
    import numpy

    def summarize(values):
        # min, max, mean, median and standard deviation with two decimals.
        if not values:
            return "\t".join(["na"] * 5)
        return "\t".join(
            "%.2f" % v
            for v in (min(values), max(values), numpy.mean(values),
                      numpy.median(values), numpy.std(values)))

    new_mali = {}
    aligned = []
    codons = []
    stops = []
    nclean = 0
    total_no_stops = 0

    # Work on the arguments that were passed in, not on the script-level
    # globals core_mali / param_gap_char.
    for key, seq in mali.items():
        new_mali[key], naligned, ncodons, nstops = MaliIO.getCodonSequence(
            seq, frame_columns, gap_char, remove_stops=True)
        aligned.append(naligned)
        codons.append(ncodons)
        stops.append(nstops)
        if nstops == 0:
            total_no_stops += 1
            if naligned == ncodons:
                nclean += 1

    print("%s\tcodons\t%i\t%i\t" % (prefix, nclean, total_no_stops) +
          summarize(aligned) + "\t" +
          summarize(codons) + "\t" +
          summarize(stops))

    return new_mali
def WriteCodonSummary(mali, identifiers, frame_columns, prefix="", gap_char="-"):
    """Write codon statistics and return the codon-restricted alignment.

    Each sequence in ``mali`` is passed through ``MaliIO.getCodonSequence``
    (with ``remove_stops=True``), which yields the processed sequence plus
    the number of aligned positions, codons and stop codons.

    One tab-separated row is printed to stdout::

        <prefix> codons <nclean> <nnostops> <aligned stats> <codon stats> <stop stats>

    where ``nnostops`` counts sequences without stop codons, ``nclean``
    counts sequences without stop codons whose aligned count equals their
    codon count, and each ``stats`` group is min, max, mean, median and
    standard deviation.

    Arguments:
        mali: mapping from identifier to sequence; every entry is processed.
        identifiers: not used; kept so existing callers keep working.
        frame_columns: columns handed to ``MaliIO.getCodonSequence``.
        prefix: text written into the first column.
        gap_char: gap character handed to ``MaliIO.getCodonSequence``.

    Returns:
        dict mapping each identifier to its processed sequence.

    For an empty ``mali`` the statistics columns are written as ``na``.
    """
    # Imported locally: this function used to reach these routines only
    # through the deprecated NumPy aliases in the scipy namespace.
    import numpy

    def summarize(values):
        # min, max, mean, median and standard deviation with two decimals.
        if not values:
            return "\t".join(["na"] * 5)
        return "\t".join(
            "%.2f" % v
            for v in (min(values), max(values), numpy.mean(values),
                      numpy.median(values), numpy.std(values)))

    new_mali = {}
    aligned = []
    codons = []
    stops = []
    nclean = 0
    total_no_stops = 0

    # Work on the arguments that were passed in, not on the script-level
    # globals core_mali / param_gap_char.
    for key, seq in mali.items():
        new_mali[key], naligned, ncodons, nstops = MaliIO.getCodonSequence(
            seq, frame_columns, gap_char, remove_stops=True)
        aligned.append(naligned)
        codons.append(ncodons)
        stops.append(nstops)
        if nstops == 0:
            total_no_stops += 1
            if naligned == ncodons:
                nclean += 1

    print("%s\tcodons\t%i\t%i\t" % (prefix, nclean, total_no_stops) +
          summarize(aligned) + "\t" +
          summarize(codons) + "\t" +
          summarize(stops))

    return new_mali
def WriteGeneStructureCorrespondence(mali, identifiers, exons, param_master_pattern, gap_char="-", prefix=""):
    """Compare the gene structures of all sequence pairs in an alignment.

    For every ordered pair (key1, key2) of distinct identifiers, the exons
    of key1 are mapped onto key2 through the alignment
    (``MaliIO.getMapFromMali`` / ``MyMap``) and compared with the exons of
    key2 using ``Exons.CompareGeneStructures``.

    Rows printed to stdout (tab-separated, first column is ``prefix``):

    * ``genepair``     - one row per pair, only if ``param_loglevel >= 2``.
    * ``allstructure`` - counts and fractions over all pairs.
    * ``refstructure`` - the same counts restricted to pairs whose second
      member matches ``param_master_pattern``; omitted without such pairs.
    * ``nexons``       - number of sequences, exon count of the last
      sequence matching ``param_master_pattern``, and min/max/mean/median/
      std of the exon counts.

    The ``genepair`` status consists of three characters:

    1. conservation: ``F`` no identical exons, ``=`` no non-equivalent
       exons, ``D`` all different, ``A`` one or two non-equivalent exons,
       ``M``/``S`` more than two (above/below
       ``param_evaluate_min_percent_exon_identity``), ``U`` otherwise.
    2. exon counts: ``>``, ``<`` or ``=`` for key1 relative to key2.
    3. size class derived from the smaller and larger exon count
       (``S s D d M m U``).

    Besides ``param_loglevel`` the function reads the module-level settings
    ``param_threshold_splipping_exon_boundary``,
    ``param_threshold_terminal_exon`` and
    ``param_evaluate_min_percent_exon_identity``.

    Arguments:
        mali: mapping from identifier to aligned sequence.
        identifiers: identifiers of the sequences to compare.
        exons: mapping from identifier to its list of exons.
        param_master_pattern: regular expression marking reference sequences.
        gap_char: gap character handed to ``MaliIO.getMapFromMali``.
        prefix: text written into the first column of every row.

    Returns:
        A ``len(identifiers)`` x ``len(identifiers)`` matrix of gene
        structure compatibility, 100 * missed exon boundaries / total exon
        boundaries (0: perfect compatibility, 100: no compatibility), or
        None if ``identifiers`` is empty.

    Raises:
        ValueError: reference comparisons were made but covered no exons.
    """
    # Must be tested before anything is indexed, otherwise empty input
    # fails with an IndexError and this guard is never reached.
    if len(identifiers) == 0:
        return None

    def fraction(numerator, denominator):
        # A single sequence produces no pairs (and no exons to count);
        # report 0 instead of dividing by zero.
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    wmali = len(identifiers)
    matrix_compatibility = numpy.zeros((wmali, wmali))

    nok = 0
    nperfect = 0
    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    ref_nok = 0
    ref_nperfect = 0
    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    rx = re.compile(param_master_pattern)

    # number of exons per sequence
    anexons = []
    # number of exons in the reference
    ref_nexons = 0

    for x, key1 in enumerate(identifiers):
        seq = mali[key1]
        anexons.append(len(exons[key1]))
        if rx.search(key1):
            ref_nexons = len(exons[key1])

        for y, key2 in enumerate(identifiers):
            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print("#############################################")
                print("# comparing %s to %s" % (key1, key2))

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            if param_loglevel >= 5:
                print(str(alignlib_lite.AlignmentFormatEmissions(map_cmp2ref)))

            # map exon boundaries of key1 onto key2
            cmp_exons = []
            for e in exons[key1]:
                mapped = e.GetCopy()
                mapped.mPeptideFrom = MyMap(map_cmp2ref, e.mPeptideFrom + 1, 3, -1)
                mapped.mPeptideTo = MyMap(map_cmp2ref, e.mPeptideTo, 3, 0)
                cmp_exons.append(mapped)

            # massage boundaries for terminal exons: a non-positive mapped
            # coordinate is replaced by the outer boundary of key2.
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for e in exons[key1]:
                    print("# exon " + str(e))

            if param_loglevel >= 3:
                for e in cmp_exons:
                    print("# exon " + str(e))
                for e in ref_exons:
                    print("# exon " + str(e))

            comparison = Exons.CompareGeneStructures(
                cmp_exons, ref_exons,
                threshold_min_pide=0,
                threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary,
                threshold_terminal_exon=param_threshold_terminal_exon)

            if param_loglevel >= 3:
                print(comparison.Pretty(prefix="# EVAL: "))

            min_nexons = min(len(cmp_exons), len(ref_exons))
            max_nexons = max(len(cmp_exons), len(ref_exons))

            is_perfect = False
            is_ok = False
            status = []

            # non-equivalent exon pairs
            nnonequivalent = len(cmp_exons) - comparison.mNumIdenticalExons - \
                comparison.mNumSkippedExons

            if comparison.mNumIdenticalExons == 0:
                # F: complete and utter failure, no excuses
                status.append("F")
            elif nnonequivalent == 0:
                # =: perfect conservation
                status.append("=")
                is_ok = True
                is_perfect = True
            elif nnonequivalent == min_nexons - comparison.mNumSkippedExons:
                # D: completely different predictions
                status.append("D")
            elif nnonequivalent in (1, 2):
                # A: almost conserved
                status.append("A")
                is_ok = True
            elif nnonequivalent > 2:
                # M: mostly conserved (good enough for long proteins),
                # S: spuriously conserved
                if (100 * comparison.mNumIdenticalExons) / max_nexons > param_evaluate_min_percent_exon_identity:
                    status.append("M")
                else:
                    status.append("S")
            else:
                # U: unconserved
                status.append("U")

            # Both branches used to test the same condition, so "<" could
            # never be reported.
            if len(cmp_exons) > len(ref_exons):
                status.append(">")
            elif len(cmp_exons) < len(ref_exons):
                status.append("<")
            else:
                status.append("=")

            if min_nexons == max_nexons and min_nexons == 1:
                status.append("S")
            elif min_nexons == 1 and max_nexons == 2:
                status.append("s")
            elif min_nexons == 2 and max_nexons == 2:
                status.append("D")
            elif min_nexons == 2 and max_nexons > 2:
                status.append("d")
            elif min_nexons == max_nexons:
                status.append("M")
            elif min_nexons > 2 and max_nexons > 2:
                status.append("m")
            else:
                status.append("U")

            status = "".join(status)

            if is_ok:
                nok += 1
            if is_perfect:
                nperfect += 1

            # NOTE(review): an earlier value derived from is_ok/is_perfect
            # and param_max_exons_difference was always overwritten by the
            # boundary ratio below and has been dropped - confirm that no
            # cut-off on the exon count difference is wanted here.
            nboundaries = comparison.mNumRefBoundaries + comparison.mNumCmpBoundaries
            if nboundaries == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) / nboundaries

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print("%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" % (
                    prefix, key1, key2, status, compatibility_value,
                    len(cmp_exons), len(ref_exons), str(comparison)))

            # comparison to reference: count separately
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

    # number of ordered pairs
    ntotal = wmali * (wmali - 1)

    print("%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
        prefix, ntotal, nperfect, nok,
        fraction(nperfect, ntotal), fraction(nok, ntotal),
        ntotal_exons, nidentical_exons, nskipped_exons,
        fraction(nidentical_exons, ntotal_exons),
        fraction(nidentical_exons + nskipped_exons, ntotal_exons)))

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            # String exceptions are not valid; raise a real exception type.
            raise ValueError(
                "no exons in reference : ref_ntotal_exons = 0, ref_ntotal = %i" % (
                    ref_ntotal))

        print("%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
            prefix, ref_ntotal, ref_nperfect, ref_nok,
            float(ref_nperfect) / ref_ntotal, float(ref_nok) / ref_ntotal,
            ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
            float(ref_nidentical_exons) / ref_ntotal_exons,
            float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons))

    print("%s\tnexons\t%i\t%i\t" % (prefix, len(anexons), ref_nexons) +
          "\t".join("%.2f" % v
                    for v in (min(anexons), max(anexons),
                              numpy.mean(anexons), numpy.median(anexons),
                              numpy.std(anexons))))

    return matrix_compatibility
"\t".join(("genepair", "STATUS", "COMPATIBILITY", "CMP_NEXONS", "REF_NEXONS", Exons.ComparisonResult().GetHeader())), "\t".join(("bootstrap", "NORGS", "NOTUS", "PTEST", "PTOTAL", "FTOTAL", evaluate_bootstrap.Results().printHeader())), ] if param_only_headers: print "PREFIX\t" + "\nPREFIX\t".join(headers) print E.GetFooter() sys.exit(0) else: print "# PREFIX\t" + "\n# PREFIX\t".join(headers) # 1. read multiple alignment in fasta format all_mali, all_identifiers = MaliIO.readFasta(sys.stdin) if len(all_identifiers) == 0: raise "alignment is empty." if param_loglevel >= 1: print "# read mali with %i entries." % len(all_identifiers) if param_filename_components: infile = open(param_filename_components, "r") components = {} for line in infile: if line[0] == "#": continue if line[0] == ">":
sys.exit(2) for o, a in optlist: if o in ("-v", "--verbose"): param_loglevel = int(a) elif o in ("--version", ): print "version=" sys.exit(0) elif o in ("-h", "--help"): print USAGE sys.exit(0) elif o in ("-o", "--file-output"): param_filename_output = a # 1. read multiple alignment in fasta format mali, identifiers = MaliIO.readFasta(sys.stdin) if param_loglevel >= 1: print "# read mali with %i entries." % len(identifiers) print E.GetHeader() print E.GetParams() # 1. remove gaps in multiple alignment mali = MaliIO.removeGaps(mali) if param_master: frame_columns = GetFrameColumns(mali, param_master) elif param_master_pattern: columns = []
print "version=" sys.exit(0) elif o in ("-h", "--help"): print USAGE sys.exit(0) elif o in ("-s", "--subset"): param_subset = a elif o == ("-c", "--components"): param_filename_components = a if param_loglevel >= 1: print E.GetHeader() print E.GetParams() # 1. read multiple alignment in fasta format all_mali, all_identifiers = MaliIO.readFasta(sys.stdin) if len(all_identifiers) == 0: raise "alignment is empty." if param_loglevel >= 1: print "# read mali with %i entries." % len(all_identifiers) if param_filename_components: infile = open(param_filename_components, "r") components = {} for line in infile: if line[0] == "#": continue if line[0] == ">":
def WriteGeneStructureCorrespondence(mali, identifiers, exons, param_master_pattern, gap_char="-", prefix=""):
    """Compare the gene structures of all sequence pairs in an alignment.

    For every ordered pair (key1, key2) of distinct identifiers, the exons
    of key1 are mapped onto key2 through the alignment
    (``MaliIO.getMapFromMali`` / ``MyMap``) and compared with the exons of
    key2 using ``Exons.CompareGeneStructures``.

    Rows printed to stdout (tab-separated, first column is ``prefix``):

    * ``genepair``     - one row per pair, only if ``param_loglevel >= 2``.
    * ``allstructure`` - counts and fractions over all pairs.
    * ``refstructure`` - the same counts restricted to pairs whose second
      member matches ``param_master_pattern``; omitted without such pairs.
    * ``nexons``       - number of sequences, exon count of the last
      sequence matching ``param_master_pattern``, and min/max/mean/median/
      std of the exon counts.

    The ``genepair`` status consists of three characters:

    1. conservation: ``F`` no identical exons, ``=`` no non-equivalent
       exons, ``D`` all different, ``A`` one or two non-equivalent exons,
       ``M``/``S`` more than two (above/below
       ``param_evaluate_min_percent_exon_identity``), ``U`` otherwise.
    2. exon counts: ``>``, ``<`` or ``=`` for key1 relative to key2.
    3. size class derived from the smaller and larger exon count
       (``S s D d M m U``).

    Besides ``param_loglevel`` the function reads the module-level settings
    ``param_threshold_splipping_exon_boundary`` and
    ``param_evaluate_min_percent_exon_identity``.

    Arguments:
        mali: mapping from identifier to aligned sequence.
        identifiers: identifiers of the sequences to compare.
        exons: mapping from identifier to its list of exons.
        param_master_pattern: regular expression marking reference sequences.
        gap_char: gap character handed to ``MaliIO.getMapFromMali``.
        prefix: text written into the first column of every row.

    Returns:
        A ``len(identifiers)`` x ``len(identifiers)`` matrix of gene
        structure compatibility, 100 * missed exon boundaries / total exon
        boundaries (0: perfect compatibility, 100: no compatibility), or
        None if ``identifiers`` is empty.

    Raises:
        ValueError: reference comparisons were made but covered no exons.
    """
    # Must be tested before anything is indexed, otherwise empty input
    # fails with an IndexError and this guard is never reached.
    if len(identifiers) == 0:
        return None

    def fraction(numerator, denominator):
        # A single sequence produces no pairs (and no exons to count);
        # report 0 instead of dividing by zero.
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    wmali = len(identifiers)
    matrix_compatibility = numpy.zeros((wmali, wmali))

    nok = 0
    nperfect = 0
    ntotal_exons = 0
    nidentical_exons = 0
    nskipped_exons = 0

    ref_nok = 0
    ref_nperfect = 0
    ref_ntotal_exons = 0
    ref_nidentical_exons = 0
    ref_nskipped_exons = 0
    ref_ntotal = 0

    rx = re.compile(param_master_pattern)

    # number of exons per sequence
    anexons = []
    # number of exons in the reference
    ref_nexons = 0

    # enumerate() keeps the matrix indices tied to the position in
    # identifiers.  A hand-maintained column counter was not advanced when
    # the diagonal was skipped, which shifted every later column by one.
    for x, key1 in enumerate(identifiers):
        seq = mali[key1]
        anexons.append(len(exons[key1]))
        if rx.search(key1):
            ref_nexons = len(exons[key1])

        for y, key2 in enumerate(identifiers):
            if key2 == key1:
                continue

            if param_loglevel >= 3:
                print("#############################################")
                print("# comparing %s to %s" % (key1, key2))

            seq_master = mali[key2]
            ref_exons = exons[key2]

            map_cmp2ref = MaliIO.getMapFromMali(seq, seq_master, gap_char)

            if param_loglevel >= 5:
                print(alignlib_lite.py_writeAlignataTable(map_cmp2ref))

            # map exon boundaries of key1 onto key2
            cmp_exons = []
            for e in exons[key1]:
                mapped = e.GetCopy()
                mapped.mPeptideFrom = MyMap(map_cmp2ref, e.mPeptideFrom + 1, 3, -1)
                mapped.mPeptideTo = MyMap(map_cmp2ref, e.mPeptideTo, 3, 0)
                cmp_exons.append(mapped)

            # massage boundaries for terminal exons: a non-positive mapped
            # coordinate is replaced by the outer boundary of key2.
            if cmp_exons[0].mPeptideFrom <= 0:
                cmp_exons[0].mPeptideFrom = ref_exons[0].mPeptideFrom
            if cmp_exons[-1].mPeptideTo <= 0:
                cmp_exons[-1].mPeptideTo = ref_exons[-1].mPeptideTo

            if param_loglevel >= 4:
                for e in exons[key1]:
                    print("# exon " + str(e))

            if param_loglevel >= 3:
                for e in cmp_exons:
                    print("# exon " + str(e))
                for e in ref_exons:
                    print("# exon " + str(e))

            comparison = Exons.CompareGeneStructures(
                cmp_exons, ref_exons,
                threshold_min_pide=0,
                threshold_slipping_exon_boundary=param_threshold_splipping_exon_boundary)

            if param_loglevel >= 3:
                print(comparison.Pretty(prefix="# EVAL: "))

            min_nexons = min(len(cmp_exons), len(ref_exons))
            max_nexons = max(len(cmp_exons), len(ref_exons))

            is_perfect = False
            is_ok = False
            status = []

            # non-equivalent exon pairs
            nnonequivalent = len(cmp_exons) - comparison.mNumIdenticalExons - \
                comparison.mNumSkippedExons

            if comparison.mNumIdenticalExons == 0:
                # F: complete and utter failure, no excuses
                status.append("F")
            elif nnonequivalent == 0:
                # =: perfect conservation
                status.append("=")
                is_ok = True
                is_perfect = True
            elif nnonequivalent == min_nexons - comparison.mNumSkippedExons:
                # D: completely different predictions
                status.append("D")
            elif nnonequivalent in (1, 2):
                # A: almost conserved
                status.append("A")
                is_ok = True
            elif nnonequivalent > 2:
                # M: mostly conserved (good enough for long proteins),
                # S: spuriously conserved
                if (100 * comparison.mNumIdenticalExons) / max_nexons > param_evaluate_min_percent_exon_identity:
                    status.append("M")
                else:
                    status.append("S")
            else:
                # U: unconserved
                status.append("U")

            # Both branches used to test the same condition, so "<" could
            # never be reported.
            if len(cmp_exons) > len(ref_exons):
                status.append(">")
            elif len(cmp_exons) < len(ref_exons):
                status.append("<")
            else:
                status.append("=")

            if min_nexons == max_nexons and min_nexons == 1:
                status.append("S")
            elif min_nexons == 1 and max_nexons == 2:
                status.append("s")
            elif min_nexons == 2 and max_nexons == 2:
                status.append("D")
            elif min_nexons == 2 and max_nexons > 2:
                status.append("d")
            elif min_nexons == max_nexons:
                status.append("M")
            elif min_nexons > 2 and max_nexons > 2:
                status.append("m")
            else:
                status.append("U")

            status = "".join(status)

            if is_ok:
                nok += 1
            if is_perfect:
                nperfect += 1

            # NOTE(review): an earlier value derived from is_ok/is_perfect
            # and param_max_exons_difference was always overwritten by the
            # boundary ratio below and has been dropped - confirm that no
            # cut-off on the exon count difference is wanted here.
            nboundaries = comparison.mNumRefBoundaries + comparison.mNumCmpBoundaries
            if nboundaries == 0:
                compatibility_value = 0
            else:
                compatibility_value = 100 * \
                    (comparison.mNumMissedRefBoundaries +
                     comparison.mNumMissedCmpBoundaries) / nboundaries

            matrix_compatibility[x][y] = compatibility_value

            nidentical_exons += comparison.mNumIdenticalExons
            nskipped_exons += comparison.mNumSkippedExons
            ntotal_exons += len(cmp_exons)

            if param_loglevel >= 2:
                print("%s\tgenepair\t%s\t%s\t%s\t%i\t%i\t%i\t%s" % (
                    prefix, key1, key2, status, compatibility_value,
                    len(cmp_exons), len(ref_exons), str(comparison)))

            # comparison to reference: count separately
            if rx.search(key2):
                ref_nidentical_exons += comparison.mNumIdenticalExons
                ref_nskipped_exons += comparison.mNumSkippedExons
                ref_ntotal_exons += len(cmp_exons)
                if is_ok:
                    ref_nok += 1
                if is_perfect:
                    ref_nperfect += 1
                ref_ntotal += 1

    # number of ordered pairs
    ntotal = wmali * (wmali - 1)

    print("%s\tallstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
        prefix, ntotal, nperfect, nok,
        fraction(nperfect, ntotal), fraction(nok, ntotal),
        ntotal_exons, nidentical_exons, nskipped_exons,
        fraction(nidentical_exons, ntotal_exons),
        fraction(nidentical_exons + nskipped_exons, ntotal_exons)))

    if ref_ntotal > 0:
        if ref_ntotal_exons == 0:
            # String exceptions are not valid; raise a real exception type.
            raise ValueError(
                "no exons in reference : ref_ntotal_exons = 0, ref_ntotal = %i" % (
                    ref_ntotal))

        print("%s\trefstructure\t%i\t%i\t%i\t%6.4f\t%6.4f\t%i\t%i\t%i\t%6.4f\t%6.4f" % (
            prefix, ref_ntotal, ref_nperfect, ref_nok,
            float(ref_nperfect) / ref_ntotal, float(ref_nok) / ref_ntotal,
            ref_ntotal_exons, ref_nidentical_exons, ref_nskipped_exons,
            float(ref_nidentical_exons) / ref_ntotal_exons,
            float(ref_nidentical_exons + ref_nskipped_exons) / ref_ntotal_exons))

    print("%s\tnexons\t%i\t%i\t" % (prefix, len(anexons), ref_nexons) +
          "\t".join("%.2f" % v
                    for v in (min(anexons), max(anexons),
                              numpy.mean(anexons), numpy.median(anexons),
                              numpy.std(anexons))))

    return matrix_compatibility
elif o in ("-e", "--exons"): param_filename_exons = a elif o in ("-c", "--cluster"): param_do_cluster = True elif o in ("-f", "--remove-fragments"): param_remove_fragments = True elif o in ("-p", "--prefix"): param_prefix = a elif o == "--components": param_filename_components = a print E.GetHeader() print E.GetParams() # 1. read multiple alignment in fasta format all_mali, all_identifiers = MaliIO.readFasta(sys.stdin) if len(all_identifiers) == 0: raise "alignment is empty." if param_loglevel >= 1: print "# read mali with %i entries." % len(all_identifiers) if param_filename_components: infile = open(param_filename_components, "r") components = {} for line in infile: if line[0] == "#": continue if line[0] == ">":
sys.exit(2) for o, a in optlist: if o in ("-v", "--verbose"): param_loglevel = int(a) elif o in ("--version", ): print "version=" sys.exit(0) elif o in ("-h", "--help"): print USAGE sys.exit(0) elif o in ("-o", "--file-output"): param_filename_output = a ## 1. read multiple alignment in fasta format mali, identifiers = MaliIO.readFasta(sys.stdin) if param_loglevel >= 1: print "# read mali with %i entries." % len(identifiers) print E.GetHeader() print E.GetParams() ## 1. remove gaps in multiple alignment mali = MaliIO.removeGaps(mali) if param_master: frame_columns = GetFrameColumns(mali, param_master) elif param_master_pattern: columns = []