def __init__(self, options, *args, **kwargs):
    """Initialise the base class and attach a configured BaseML wrapper.

    *options* is handed to the BaseML wrapper via ``SetOptions``; all other
    positional and keyword arguments are forwarded unchanged to
    ``SequencePairPropertiesDistance.__init__``.

    ``mDump`` and ``mTest`` are both switched on when ``options.loglevel``
    is 3 or higher and switched off otherwise.
    """
    SequencePairPropertiesDistance.__init__(self, *args, **kwargs)

    self.mBaseml = WrapperCodeML.BaseML()
    self.mBaseml.SetOptions(options)

    # dumping and test mode are tied to the same verbosity threshold
    verbose = options.loglevel >= 3
    self.mDump = True if verbose else False
    self.mTest = True if verbose else False
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads matches from ``options.stdin`` and writes one tab-separated row
    per match to ``options.stdout``.  Each row starts with the match itself
    (or only its query id with ``--without-match``) followed by the values
    of every selected method.

    Methods working on the aligned sequences ("counts", "query-counts",
    "sbjct-counts", "baseml") switch the input parser to
    ``Blat.iterator_pslx``; the "match" method works on the match object.
    Matches whose query and sbjct sequences differ in length are skipped
    and counted in ``nskipped``; no output row is written for them.
    """
    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below - confirm
    # whether E.Start() accepts it before relying on the *argv* parameter.

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase", dest="mask_lowercase", action="store_true",
        help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option(
        "--with-match", dest="with_match", action="store_true",
        help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match", dest="with_match", action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    (options, args) = E.Start(parser)

    # counters: called with the two aligned sequences
    # counters_plain: called with the match object itself
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # sequence based counters need the sequence columns of pslx input
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([header, ] +
                  ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    # compiled once, applied to every sequence block of every match
    rx_non_letter = re.compile("[^a-zA-Z]")
    rx_lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # The row is collected here and written in one go at the end of the
        # iteration.  Writing the first column immediately would leave an
        # unterminated partial row on stdout whenever the match is skipped.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:
            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qblocks = [rx_non_letter.sub("N", x)
                       for x in match.mQuerySequence]
            sblocks = [rx_non_letter.sub("N", x)
                       for x in match.mSbjctSequence]

            if options.mask_lowercase:
                qblocks = [rx_lowercase.sub("N", x) for x in qblocks]
                sblocks = [rx_lowercase.sub("N", x) for x in sblocks]

            # the masked blocks are stored back on the match, so the
            # counters in counters_plain see the masked sequences
            match.mQuerySequence = qblocks
            match.mSbjctSequence = sblocks

            qseq = "".join(qblocks).upper()
            sseq = "".join(sblocks).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" %
                        str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            fields.extend([str(counter) for counter in counters])

        if counters_plain:
            for counter in counters_plain:
                counter(match)

            fields.extend([str(counter) for counter in counters_plain])

        options.stdout.write("\t".join(fields) + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Parses codeml output with ``WrapperCodeML.CodeML.parseOutput``.  Input
    is read from stdin when no positional arguments are given; otherwise
    every positional argument is treated as a file name.  Files that raise
    ``WrapperCodeML.ParsingError`` are reported on ``options.stdlog`` and
    skipped.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codeml2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-m", "--methods", dest="methods", type="string",
                      help="""methods for analysis.
write-ks-tree: write out ks tree(s).
write-ka-tree: write out ka tree(s).
""")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames", type="string",
                      help="input pattern.")

    parser.add_option(
        "--filter-probability", dest="filter_probability", type="float",
        help="threshold for probability above which to include positive sites.")

    parser.add_option(
        "--filter-omega", dest="filter_omega", type="float",
        help="threshold for omega above which to include positive sites.")

    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("pairs", "1xn"),
                      help="analysis mode.")

    parser.set_defaults(
        methods="",
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        significance_threshold=0.05,
        mode="pairs",
    )

    (options, args) = E.Start(parser)

    # note: splitting the empty default yields [""], not an empty list
    options.methods = options.methods.split(",")
    options.models = options.models.split(",")

    codeml = WrapperCodeML.CodeML()

    results = []
    if len(args) == 0:
        # read from stdin, if no arguments are given
        results.append(codeml.parseOutput(sys.stdin.readlines()))
    else:
        # read multiple results
        for f in args:
            try:
                # the context manager closes the file even if parsing fails
                with open(f, "r") as infile:
                    results.append(codeml.parseOutput(infile.readlines()))
            except WrapperCodeML.ParsingError as msg:
                options.stdlog.write("# parsing error in file %s: %s.\n" %
                                     (f, msg))

    # NOTE(review): the parsed results are only collected here; nothing in
    # the visible body consumes them - confirm the function is complete.
def runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options):
    """Run codeml on *mali* and print the resulting pairs.

    A dictionary of codeml options is derived from *options* (sequence
    type, clean data, omega/kappa and whether they are fixed, codon
    frequencies, paml method, optimization threshold).

    If *pairs* is non-empty and either ``options.pairwise`` or
    *has_non_overlaps* is set, codeml is run separately on each pair of
    sequences listed in *pairs* (tuples of indices into
    ``mali.getIdentifiers()``).  Pairs whose alignment is narrower than
    ``options.min_overlap`` are skipped.  Otherwise codeml is run once on
    the complete alignment using *tree*.

    Results are written through ``printPairs`` using *map_new2old*;
    progress and summary counts go to ``options.stdlog``.  No value is
    returned by the code visible here.
    """
    ids = mali.getIdentifiers()

    # setup codeml
    codeml_options = {}

    # sequence types without an entry leave the codeml default untouched
    seqtype_codes = {"codon": "1", "aa": "2", "trans": "3"}
    if options.seqtype in seqtype_codes:
        codeml_options["seqtype"] = seqtype_codes[options.seqtype]

    if options.clean_data:
        codeml_options["cleandata"] = options.clean_data

    if options.omega is not None:
        codeml_options["omega"] = str(options.omega)

    if options.kappa is not None:
        codeml_options["kappa"] = str(options.kappa)

    if options.fix_kappa:
        codeml_options["fix_kappa"] = "1"

    if options.fix_omega:
        codeml_options["fix_omega"] = "1"

    if options.codon_frequencies is not None:
        codon_freq_codes = {"UNIFORM": "0", "F1X4": "1",
                            "F3X4": "2", "F61": "3"}
        # unknown names are passed through as given
        codeml_options["CodonFreq"] = codon_freq_codes.get(
            options.codon_frequencies.upper(), options.codon_frequencies)

    if options.paml_method is not None:
        # use the option that was tested in the guard; the previous code
        # read options.method here
        codeml_options["paml_method"] = str(options.paml_method)

    if options.optimization_threshold is not None:
        codeml_options["Small_Diff"] = str(options.optimization_threshold)

    ninput, noutput, nskipped = 0, 0, 0
    tstart = time.time()

    if pairs and (options.pairwise or has_non_overlaps):
        wrapper = WrapperCodeML.CodeMLPairwise()

        # do pairwise run
        result = WrapperCodeML.CodeMLResultPairs()
        # number of all possible pairs, used for the progress report only
        ntotal = (len(ids) * (len(ids) - 1)) // 2

        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            ninput += 1
            temp_mali = Mali.Mali()
            m2 = mali.getSequence(ids[y])

            temp_mali.addSequence(ids[x], m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(ids[y], m2.mFrom, m2.mTo, m2.mString)

            # remove empty columns and masked columns
            if options.clean_mali:
                temp_mali.mGapChars = temp_mali.mGapChars + ("n", "N")
                temp_mali.removeGaps(minimum_gaps=1, frame=3)

            # NOTE(review): the width check is assumed to apply whether or
            # not clean_mali is set - confirm against the original layout.
            if temp_mali.getWidth() < options.min_overlap:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# pair %s-%s: not computed because only %i residues overlap\n" %
                        (mali.getEntry(ids[x]).mId,
                         mali.getEntry(ids[y]).mId,
                         temp_mali.getWidth()))
                nskipped += 1
                continue

            sub_result = wrapper.Run(temp_mali,
                                     options=codeml_options,
                                     dump=options.dump)
            result.mPairs += sub_result.mPairs

            if options.loglevel >= 1 and ninput % options.report_step == 0:
                options.stdlog.write(
                    "# pairwise computation: %i/%i -> %i%% in %i seconds.\n" %
                    (ninput, ntotal, 100.0 * ninput / ntotal,
                     time.time() - tstart))
                options.stdlog.flush()

            noutput += printPairs(sub_result.mPairs, mali,
                                  map_new2old, options)
            options.stdout.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# pairwise computation: ninput=%i, noutput=%i, nskipped=%i\n" %
                (ninput, noutput, nskipped))
            options.stdlog.flush()

    else:
        wrapper = WrapperCodeML.CodeML()

        result = wrapper.Run(mali, tree=tree,
                             options=codeml_options,
                             dump=options.dump)

        result_pairs = WrapperCodeML.CodeMLResultPairs()
        result_pairs.fromResult(result)
        noutput += printPairs(result_pairs.mPairs, mali,
                              map_new2old, options)

        l = mali.getLength()
        if options.loglevel >= 1:
            options.stdlog.write("# input=%i, npairs=%i, noutput=%i\n" %
                                 (l, l * (l - 1) // 2,
                                  len(result_pairs.mPairs)))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every positional argument is parsed as a codeml sites output file with
    ``WrapperCodeML.CodeMLSites.parseOutput``; files raising
    ``WrapperCodeML.UsageError`` are skipped.  The parsed results are then
    reported by each method given with ``--methods``:

    summary-numbers
        one row per result with the number of positive sites per
        model/analysis and the outcome of the log-likelihood ratio test,
        followed by a "cons" row.
    jalview
        a jalview annotation track per result that has positive sites.
    count-positive-sites
        the number of selected positive sites.
    positive-site-table
        one row per selected site, optionally mapped onto the alignment
        given by ``--filename-map-mali``.

    Raises ValueError for an unknown analysis section or model, and when
    ``--filename-map-mali`` is given without ``--filename-mali``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="choice",
                      action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode", dest="selection_mode",
                      type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames", type="string",
                      help="input pattern.")

    parser.add_option(
        "--filter-probability", dest="filter_probability", type="float",
        help="threshold for probability above which to include positive sites [default=%default].")

    parser.add_option(
        "--filter-omega", dest="filter_omega", type="float",
        help="threshold for omega above which to include positive sites [default=%default].")

    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis", dest="analysis", type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali", dest="filter_mali", type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option(
        "--filename-mali", dest="filename_mali", type="string",
        help="filename with multiple alignment used for calculating sites - used for filtering")

    parser.add_option(
        "--filename-map-mali", dest="filename_map_mali", type="string",
        help="filename with multiple alignment to map sites onto.")

    parser.add_option(
        "--jalview-titles", dest="jalview_titles", type="string",
        help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol", dest="jalview_symbol",
                      type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    # without explicit titles the input file names label the jalview tracks
    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    # note: splitting the empty default yields [""], which is rejected by
    # the checks below - both options effectively have to be supplied.
    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    # string exceptions are not valid exceptions; raise a proper one so the
    # message actually reaches the user
    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise ValueError(
                "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError(
                "unknown model: '%s', possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    # filter and extract functions
    def filter_f(x):
        return (x.mProbability >= options.filter_probability and
                x.mOmega >= options.filter_omega)

    def extract_f(x):
        return x.mResidue

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    # names of the successfully parsed files, parallel to results
    filenames = []
    for f in args:
        ninput += 1
        try:
            with open(f, "r") as infile:
                results.append(codeml.parseOutput(infile.readlines()))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        filenames.append(f)

    # map of nested model (key) to more general model
    map_nested_models = {'8': '7',
                         '2': '1',
                         '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            mali.readFromFile(infile)
    else:
        mali = None

    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError(
                "please supply the input multiple alignment, if residues are to be mapped.")

        # translate the alignments
        def translate(s):
            sequence = s.mString
            s.mString = "".join(
                [Genomics.MapCodon2AA(sequence[x:x + 3])
                 for x in range(0, len(sequence), 3)])

        tmali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            tmali.readFromFile(infile)
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        with open(options.filename_map_mali, "r") as infile:
            tmap_mali.readFromFile(infile)

        # the alignment to map onto is only translated if it is nucleotide
        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then align
            # to this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write(
                """# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold)

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            # per analysis: sites found in every result and every model
            consistent_cols = [None] * len(options.analysis)
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                # sites found in every model/analysis of this result
                row_consistent = None

                if options.prefix:
                    # tab-terminated, matching the header and the cons row
                    options.stdout.write("%s\t" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    for x, analysis in enumerate(options.analysis):

                        if analysis == "neb":
                            s = set(map(extract_f,
                                        filter(filter_f,
                                               sites.mNEB.mPositiveSites)))
                        elif analysis == "beb":
                            s = set(map(extract_f,
                                        filter(filter_f,
                                               sites.mBEB.mPositiveSites)))

                        options.stdout.write("\t%i" % (len(s)))

                        # the count is reported as found, but sites of a
                        # failed test do not enter the consistent sets
                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = \
                                consistent_cols[x].intersection(s)

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" %
                    (len(row_consistent), npassed, filenames[nmethod]))

                nmethod += 1

            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                for x, analysis in enumerate(options.analysis):

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write(
                "\t%i\t%i\n" % (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                # NOTE(review): the title index does not advance for
                # results without sites, so later tracks use the skipped
                # result's title - confirm this is intended.
                if len(sites) == 0:
                    continue

                # sites are 1-based positions
                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" %
                    (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            # kept separate from the list of file names, which a later
            # "summary-numbers" run still needs
            table_headers = ["site", "P"]
            if map_old2new:
                table_headers.append("mapped")
                table_headers.append("Pm")

            options.stdout.write("\t".join(table_headers) + "\n")

            sites = list(sites)
            sites.sort()
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    # 0 is treated as "no mapped position"
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# unmapped residue: %i\n" % site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sites: ninput=%i, noutput=%i, nskipped=%i\n" %
                    (len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
def runBaseML(mali, pairs, options):
    """Run baseml on *mali* and write the resulting distances.

    paml options are derived from *options* (kappa, alpha, whether they are
    fixed, clean data, substitution model).  ``options.distance`` is matched
    case-insensitively against the model names known to the wrapper.

    Three modes are supported:

    * ``options.filename_tree`` is set: a single run on the complete
      alignment with the given tree.
    * ``options.pairwise`` is set: one run per entry in *pairs* (tuples of
      indices into ``mali.getIdentifiers()``), each written through
      ``printPair``.
    * otherwise: the alignment must contain exactly two sequences, which
      are run as a single pair.

    Afterwards the result is written as a list of pairwise distances or as
    a tree, depending on ``options.output_format``.

    Raises ValueError for an unknown distance, and when more than two
    sequences are given without a tree or pairwise mode.
    """
    baseml = WrapperCodeML.BaseML()
    paml_options = {}

    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()

    if options.kappa is not None:
        paml_options["kappa"] = str(options.kappa)

    if options.fix_kappa:
        paml_options["fix_kappa"] = "1"

    if options.alpha is not None:
        paml_options["alpha"] = str(options.alpha)

    if options.fix_alpha:
        paml_options["fix_alpha"] = "1"

    if options.clean_data:
        paml_options["cleandata"] = options.clean_data

    # invert the wrapper's model table so it can be queried by model name
    map_distance2index = dict(
        (val, key) for key, val in baseml.mOptions["model"].items())

    # look up with the same upper-cased key that is tested; using the
    # original spelling failed with a KeyError for lower-case input
    distance = options.distance.upper()
    if distance in map_distance2index:
        paml_options["model"] = map_distance2index[distance]
    else:
        raise ValueError(
            "unknown distance for baseml: %s" % options.distance)

    if options.filename_tree:
        result = baseml.Run(mali,
                            tree=options.filename_tree,
                            dump=options.dump,
                            test=options.test,
                            options=paml_options)

    elif options.pairwise:

        noutput = 0
        ninput = 0
        # number of all possible pairs, used for the progress report only
        ntotal = (len(ids) * (len(ids) - 1)) // 2
        # start of the pairwise run; the progress report needs it and it
        # was not defined anywhere in this function before
        tstart = time.time()

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL",
                 "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            ninput += 1
            temp_mali = Mali.Mali()
            m2 = mali.getSequence(ids[y])

            temp_mali.addSequence(ids[x], m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(ids[y], m2.mFrom, m2.mTo, m2.mString)

            result = baseml.Run(temp_mali,
                                tree="(%s,%s);" % (ids[x], ids[y]),
                                dump=options.dump,
                                test=options.test,
                                options=paml_options)

            if options.loglevel >= 1 and ninput % options.report_step == 0:
                options.stdlog.write(
                    "# pairwise computation: %i/%i -> %i%% in %i seconds.\n" %
                    (ninput, ntotal, 100.0 * ninput / ntotal,
                     time.time() - tstart))
                options.stdlog.flush()

            noutput += printPair(result, temp_mali, map_new2old, options)

            options.stdout.flush()

    else:
        # assume that there are only two sequences
        if mali.getLength() == 2:
            id1, id2 = mali.getIdentifiers()
            result = baseml.Run(mali,
                                tree="(%s,%s);" % (id1, id2),
                                dump=options.dump,
                                test=options.test,
                                options=paml_options)
        else:
            raise ValueError(
                "please supply tree if there are more than two sequences and pairwise mode is not selected.")

    # NOTE(review): this output section is assumed to run for all three
    # modes.  In pairwise mode the pairs have already been written by
    # printPair and `result` only holds the last pair - confirm whether
    # this section should be skipped there.
    if options.output_format == "list":
        all_identifiers = mali.getIdentifiers()

        options.stdout.write("\t".join(
            ("seq1", "seq2", "distance", "lnL", "alpha", "kappa")) + "\n")

        for x in range(len(all_identifiers) - 1):
            id_x = all_identifiers[x]
            for y in range(x + 1, len(all_identifiers)):
                id_y = all_identifiers[y]

                options.stdout.write("\t".join(
                    (id_x, id_y,
                     options.format % result.mDistanceMatrix[id_x][id_y],
                     options.format % result.mLogLikelihood,
                     options.format % result.mAlpha,
                     options.format % result.mKappa)) + "\n")

    elif options.output_format == "tree":
        options.stdout.write("%s\n" % result.mTree)