def _write_occupancy_stats(stream, counts, total, leading=""):
    """Write mean and median of ``counts`` to ``stream`` as four tab-separated fields.

    The fields are: mean, mean as a percentage of ``total``, median, and
    median as a percentage of ``total``.  The median is printed as an integer
    when it has no fractional part and with one decimal otherwise.

    :param stream: object with a ``write`` method.
    :param counts: non-empty list of numbers.
    :param total: denominator used for the two percentage fields.
    :param leading: text written immediately before the first field.
    """
    # Local import: scipy.mean / scipy.median were only re-exports of the
    # NumPy functions and are deprecated in the root scipy namespace.
    import numpy

    mean = numpy.mean(counts)
    median = numpy.median(counts)

    # A median over an even number of values may end in .5; only then is a
    # decimal place shown.
    if float(int(median)) == median:
        median_format = "%i"
    else:
        median_format = "%5.1f"

    fields = ("%5.2f\t%5.2f\t" + median_format + "\t%5.2f") % (
        mean, 100.0 * mean / total, median, 100.0 * median / total)
    stream.write(leading + fields)


def analyzeMali(mali, options, prefix_row=""):
    """Write one line of occupancy statistics for ``mali`` to ``options.stdout``.

    A ``Mali.MaliData`` summary is built for every row (from the ``mString``
    of each entry in ``mali.values()``) and for every column (from
    ``mali.getColumns()``), using ``options.gap_chars`` and
    ``options.mask_chars``.  The output line consists of ``prefix_row``
    followed by eight tab-separated fields:

    * mean and median of the per-column ``mNChars`` values, each also given
      as a percentage of ``mali.getLength()``;
    * mean and median of the per-row ``mNChars`` values, each also given as a
      percentage of ``mali.getWidth()``.

    ``prefix_row`` is written verbatim; no separator is inserted after it.

    If ``options.loglevel >= 2`` every row and column summary is additionally
    written to ``options.stdlog``.

    :returns: ``True`` if a line was written, ``False`` (and nothing is
        written) if no row or no column summaries could be built.
    :raises ValueError: if ``len(mali) == 0``.
    """
    if len(mali) == 0:
        # String exceptions are not valid; raise a real exception type.
        raise ValueError("not analyzing empty multiple alignment")

    # Lists (not lazy iterators) are required: they are measured with len()
    # and traversed more than once below.
    row_data = [
        Mali.MaliData(entry.mString, options.gap_chars, options.mask_chars)
        for entry in mali.values()
    ]
    col_data = [
        Mali.MaliData(column, options.gap_chars, options.mask_chars)
        for column in mali.getColumns()
    ]

    if not row_data or not col_data:
        return False

    if options.loglevel >= 2:
        for row in row_data:
            options.stdlog.write("# row: %s\n" % str(row))
        for col in col_data:
            options.stdlog.write("# col: %s\n" % str(col))

    options.stdout.write(prefix_row)

    # column occupancy, relative to the value of mali.getLength()
    _write_occupancy_stats(options.stdout,
                           [col.mNChars for col in col_data],
                           mali.getLength())

    # row occupancy, relative to the value of mali.getWidth()
    _write_occupancy_stats(options.stdout,
                           [row.mNChars for row in row_data],
                           mali.getWidth(),
                           leading="\t")

    options.stdout.write("\n")

    return True
def _write_slr_summary(result, options, prefix):
    """Write the single "summary-slr" line for ``result`` to ``options.stdout``."""
    thresholds = ("95%", "99%", "95% corrected", "99% corrected")

    if prefix:
        options.stdout.write("%s\t" % prefix)

    options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
        result.mTreeLength,
        result.mOmega,
        result.mKappa,
        result.mLogLikelihood,
        len(result.mSites),
        result.mNSitesSynonymous,
        result.mNSitesGaps + result.mNSitesSingleChar,
    ))
    # Positive counts are read from element 0 of each per-threshold entry,
    # negative counts are read directly.
    options.stdout.write("\t".join(
        ["%i" % result.mNPositiveSites[t][0] for t in thresholds]))
    options.stdout.write("\t")
    options.stdout.write("\t".join(
        ["%i" % result.mNNegativeSites[t] for t in thresholds]))
    options.stdout.write("\n")


def _summarize_filtered_sites(result, options, column_data, mali_length,
                              prefix):
    """Count sites per category, skipping filtered columns, and write one line.

    Returns ``Result(nfiltered, ntotal, nsynonymous, nnegative, npositive)``.
    """
    nfiltered = ntotal = npositive = nnegative = nsynonymous = 0

    if prefix:
        options.stdout.write("%s\t" % prefix)

    # Index-based access mirrors how result.mSites is addressed elsewhere.
    for index in range(len(result.mSites)):
        site = result.mSites[index]

        # Only the first of the three alignment columns of a site is checked.
        if column_data[index * 3].mNChars != mali_length:
            nfiltered += 1
            continue

        if site.isPositive(options.significance_threshold,
                           options.use_adjusted):
            npositive += 1
        elif site.isNegative(options.significance_threshold,
                             options.use_adjusted):
            nnegative += 1

        if site.isSynonymous():
            nsynonymous += 1

        ntotal += 1

    options.stdout.write(
        "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
        (result.mTreeLength, result.mOmega, result.mKappa,
         result.mLogLikelihood, len(result.mSites), nfiltered, ntotal,
         nsynonymous, nnegative, npositive))

    return Result(nfiltered, ntotal, nsynonymous, nnegative, npositive)


def _write_selected_sites(result, options, mali, column_data, mali_length,
                          mali_width, prefix, p_value):
    """Select sites for a ``*-site-table`` / ``*-site-list`` method and write them."""
    table_methods = ("positive-site-table", "negative-site-table",
                     "neutral-site-table")
    list_methods = ("positive-site-list", "negative-site-list",
                    "neutral-site-list")

    select_positive_sites = options.method in ("positive-site-table",
                                               "positive-site-list")
    select_negative_sites = options.method in ("negative-site-table",
                                               "negative-site-list")
    # NOTE(review): for the neutral-site-* methods neither flag is set, so no
    # site is ever selected and the output carries no positions - confirm
    # whether neutral sites were meant to be reported.

    identifiers = mali.getIdentifiers()
    chars_per_row = [[] for _ in range(mali_length)]

    sites = []
    for col in range(len(result.mSites)):
        site = result.mSites[col]

        # Only the first of the three alignment columns of a site is checked.
        if column_data[col * 3].mNChars != mali_length:
            continue

        keep = (
            (select_positive_sites and site.isPositive(
                options.significance_threshold, options.use_adjusted)) or
            (select_negative_sites and site.isNegative(
                options.significance_threshold, options.use_adjusted)))
        if keep:
            sites.append((col, site))

    # The reported count is taken before any truncation.
    nsites = len(sites)

    if options.truncate_sites_list:
        # Keep the entries with the smallest mPValue.  A key function is used
        # because list.sort() no longer accepts a comparison function.
        sites.sort(key=lambda entry: entry[1].mPValue)
        sites = sites[:options.truncate_sites_list]

    for col, _site in sites:
        xcol = col * 3
        # Window of context_size sites on either side, clipped to the
        # alignment.
        start = max(xcol - options.context_size * 3, 0)
        end = min(xcol + 3 + options.context_size * 3, mali_width)

        for row in range(mali_length):
            identifier = identifiers[row]
            sequence = mali[identifier]
            segment = sequence[start:end]
            codon = sequence[xcol:xcol + 3]
            # Floor division keeps the position an integer; 1 is added below
            # to report one-based coordinates.
            pos = mali.getResidueNumber(identifier, xcol) // 3
            chars_per_row[row].append(
                PositionInformation(
                    Genomics.MapCodon2AA(codon), pos + 1, xcol,
                    Genomics.TranslateDNA2Protein(segment).upper()))

    pp_value = p_value if p_value is not None else "na"

    if options.method in table_methods:
        for row in range(mali_length):
            if prefix:
                options.stdout.write("%s\t" % prefix)

            if options.context_size:
                entries = ["%s%i in %s" % (x.mAA, x.mSequencePosition,
                                           x.mContext)
                           for x in chars_per_row[row]]
            else:
                entries = ["%s%i" % (x.mAA, x.mSequencePosition)
                           for x in chars_per_row[row]]

            options.stdout.write(
                "%s\t%i\t%s\t%s\n" %
                (identifiers[row], nsites, pp_value, ";".join(entries)))

    elif options.method in list_methods:
        for row in range(mali_length):
            if prefix:
                xprefix = "%s\t%s" % (prefix, identifiers[row])
            else:
                xprefix = "%s" % (identifiers[row])

            for number, chars in enumerate(chars_per_row[row], 1):
                options.stdout.write(
                    "%s\t%i\t%s\t%i\t%i\t%s\n" %
                    (xprefix, number, chars.mAA, chars.mSequencePosition,
                     chars.mMaliPosition, chars.mContext))


def ProcessResult(result, options, mali=None, prefix=None, p_value=None):
    """Write ``result`` to ``options.stdout`` in the format named by ``options.method``.

    Supported values of ``options.method``:

    ``"summary-slr"``
        One line with tree length, omega, kappa, log-likelihood, number of
        sites and the per-threshold positive/negative site counts.
    ``"summary-filtered"``
        One line with per-category site counts.  Sites whose first alignment
        column has an ``mNChars`` different from ``mali.getLength()`` are
        counted as filtered and otherwise ignored.
    ``"positive-site-table"``, ``"negative-site-table"``, ``"neutral-site-table"``
        One line per alignment row listing the selected sites, with sequence
        context if ``options.context_size`` is non-zero.
    ``"positive-site-list"``, ``"negative-site-list"``, ``"neutral-site-list"``
        One line per alignment row and selected site.

    Any other method writes nothing.  ``options.stdout`` is flushed before
    returning.

    :param result: object providing ``mSites`` and the summary attributes
        read by the chosen method.
    :param options: object providing ``method``, ``stdout`` and the
        method-specific settings (``significance_threshold``,
        ``use_adjusted``, ``truncate_sites_list``, ``context_size``).
    :param mali: multiple alignment; required by every method except
        ``"summary-slr"``.
    :param prefix: if true-ish, written (followed by a tab) at the start of
        every output line.
    :param p_value: value printed in the table formats; ``"na"`` is printed
        when it is ``None``.
    :returns: the ``Result(...)`` counts for ``"summary-filtered"``,
        otherwise ``None``.
    :raises ValueError: if a method that needs ``mali`` is called without
        one, or if ``mali.getWidth()`` is not three times the number of
        sites.
    """
    site_methods = ("positive-site-table", "negative-site-table",
                    "neutral-site-table", "positive-site-list",
                    "negative-site-list", "neutral-site-list")

    counts = None

    if options.method == "summary-slr":
        _write_slr_summary(result, options, prefix)

    elif options.method == "summary-filtered" or options.method in site_methods:
        if mali is None:
            raise ValueError(
                "method %s requires a multiple alignment" % options.method)

        mali_length = mali.getLength()
        mali_width = mali.getWidth()

        # sanity check: do lengths of mali and # of sites correspond?
        # Done before the per-column summaries are built, to fail fast.
        if len(result.mSites) * 3 != mali_width:
            raise ValueError(
                "mali (%i) and # of sites (%i) do not correspond." %
                (mali_width, len(result.mSites)))

        # A list is required: entries are accessed by index.
        column_data = [
            Mali.MaliData(column, gap_chars="Nn", mask_chars="-.")
            for column in mali.getColumns()
        ]

        if options.method == "summary-filtered":
            counts = _summarize_filtered_sites(
                result, options, column_data, mali_length, prefix)
        else:
            _write_selected_sites(
                result, options, mali, column_data, mali_length, mali_width,
                prefix, p_value)

    options.stdout.flush()

    return counts
def selectPositiveSites(results, selection_mode, options, mali=None):
    """Return the positively selected sites shared or pooled across ``results``.

    For every result and every model in ``options.models`` the positive sites
    of the NEB and/or BEB analysis (whichever of ``"neb"`` / ``"beb"`` are in
    ``options.analysis``) are collected.  A site is only considered if its
    ``mOmega`` is at least ``options.filter_omega`` and its ``mProbability``
    is at least ``options.filter_probability`` (at least 0.5 in ``"emes"``
    mode).

    ``selection_mode`` decides how the per-model sets are combined:

    ``"all"``
        union over all results and models.
    ``"consistent"``
        intersection over all results and models.
    ``"emes"``
        intersection over all results and models, afterwards restricted to
        sites whose highest observed probability is greater than 0.9.

    With any other mode the set of the first result/model is returned
    unchanged.

    If ``mali`` is given and ``options.filter_mali == "gaps"``, sites whose
    alignment column has an ``mNChars`` different from ``mali.getLength()``
    are removed.  Sites are treated as one-based column numbers for this
    lookup.

    Progress messages are written to ``options.stdlog`` depending on
    ``options.loglevel``.

    :returns: tuple ``(total_sites, max_per_site)`` - the set of selected
        site identifiers (``mResidue`` values) and a dict mapping every site
        that passed the filter in any result/model to the highest
        probability seen for it.
    """
    emes_min_probability = 0.5
    emes_top_probability = 0.9

    # The option values are read at call time, as in the original lambdas.
    if selection_mode == "emes":
        def passes_filter(site):
            return (site.mProbability >= emes_min_probability and
                    site.mOmega >= options.filter_omega)
    else:
        def passes_filter(site):
            return (site.mProbability >= options.filter_probability and
                    site.mOmega >= options.filter_omega)

    # maximum significance per site (for emes)
    max_per_site = {}

    total_sites = set()

    first = True
    for result in results:
        for model in options.models:
            sites = result.mSites[model]

            selected = set()
            for analysis, attribute in (("neb", "mNEB"), ("beb", "mBEB")):
                if analysis not in options.analysis:
                    continue
                for site in getattr(sites, attribute).mPositiveSites:
                    if not passes_filter(site):
                        continue
                    selected.add(site.mResidue)
                    max_per_site[site.mResidue] = max(
                        site.mProbability,
                        max_per_site.get(site.mResidue, 0))

            if first:
                total_sites = selected
                first = False
            elif selection_mode == "all":
                total_sites = total_sites.union(selected)
            elif selection_mode in ("consistent", "emes"):
                total_sites = total_sites.intersection(selected)

    if selection_mode == "emes":
        if options.loglevel >= 2:
            options.stdlog.write(
                "# before EMES filtering %i positive sites: mode %s, P>%5.2f\n" %
                (len(total_sites), selection_mode, emes_min_probability))

        # filter according to emes: maximum significance larger than 0.9
        total_sites = set(
            x for x in total_sites if max_per_site[x] > emes_top_probability)

        if options.loglevel >= 2:
            options.stdlog.write(
                "# after EMES filtering %i positive sites: mode %s, P>%5.2f\n" %
                (len(total_sites), selection_mode, emes_top_probability))
    else:
        if options.loglevel >= 2:
            # The attribute name was misspelled here before, which raised
            # AttributeError whenever this message was emitted.
            options.stdlog.write(
                "# extracted %i positive sites: mode %s, P>%5.2f\n" %
                (len(total_sites), selection_mode,
                 options.filter_probability))

    if mali and options.filter_mali:
        if options.filter_mali == "gaps":
            mali_length = mali.getLength()

            # A list is required: entries are accessed by index.
            column_data = [
                Mali.MaliData(column, gap_chars="Nn", mask_chars="-.")
                for column in mali.getColumns()
            ]

            new_sites = set()
            for x in total_sites:
                # PAML uses one-based coordinates
                column = column_data[x - 1]

                if column.mNChars != mali_length:
                    if options.loglevel >= 3:
                        options.stdlog.write(
                            "# rejected position %i due to mali\n" % x)
                    continue
                new_sites.add(x)
            total_sites = new_sites

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# after MALI filtering %i positive sites\n" %
                    (len(total_sites)))

    return total_sites, max_per_site