def outputRates(result, options):
    """Write the GTR distance of every terminal of a trained grammar.

    The model is taken from ``result.getModel()``. For each key of its
    terminal frequencies, ``RateEstimation.getDistanceGTR`` is called with
    that terminal's frequencies and rate matrix, and the returned distance
    is written to ``options.stdout`` as a tab-prefixed field formatted with
    ``options.value_format``. No newline is written.
    """
    model = result.getModel()
    frequencies = model.evaluateTerminalFrequencies()
    rate_matrices = model.evaluateRateMatrix()

    for key, frequency in frequencies.items():
        # getDistanceGTR returns a pair; only the distance is reported.
        estimate = RateEstimation.getDistanceGTR(frequency, rate_matrices[key])
        formatted = options.value_format % estimate[1]
        options.stdout.write("\t" + formatted)
def runXrate(mali, pairs, options):
    """Compute pairwise distances with xrate and write them to options.stdout.

    For every ``(x, y)`` in *pairs*, the two sequences are written as a
    two-sequence Stockholm alignment (with a fixed ``1.0`` branch length)
    into a temporary file. The substitution model selected by
    ``options.distance`` is then fitted and one tab-separated row is
    written: ``seq1, seq2, distance, lnL, alpha, kappa, msg`` (plus pair
    counts if ``options.with_counts`` is set).

    If ``options.test_xrate`` is set, the model is instead trained from a
    grid of ``alpha``/``beta`` starting values and one row is written per
    starting point.

    NOTE(review): an equivalent ``runXrate`` is defined a second time later
    in this file; the later definition is the one in effect after import.

    Arguments
    ---------
    mali
        Alignment object; ``getSequence(id)`` and ``mali[id]`` are used.
    pairs
        Iterable of ``(x, y)`` values used to index ``ids``.
    options
        Option object. Attributes read here: ``distance``,
        ``xrate_min_increment``, ``output_format``, ``with_counts``,
        ``test_xrate``, ``format``, ``is_codons`` and ``stdout``.

    Raises
    ------
    ValueError
        If ``options.distance`` is not ``K80``, ``JC69`` or ``REV``, or if
        no branch length can be parsed from the tree returned for ``JC69``.
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        # The original raised a plain string, which is itself a TypeError.
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    model = DNA.buildModel(
        substitution_model=substitution_models[options.distance])

    # Same pattern as before, now a raw string and compiled once.
    branch_length_rx = re.compile(r"\(\S+:([0-9.]+)\)")
    start_values = (0.1, 0.5, 1.0, 1.5)

    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    # try/finally so the temporary directory is removed even on errors.
    try:
        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))
            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:
            # NOTE(review): `ids` is not defined in this function;
            # presumably a module-level list of sequence identifiers -
            # confirm against the caller.
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # The original carried a commented-out minimum-overlap filter
            # (options.min_overlap); it remains disabled.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()),))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Train from each starting point and report the result.
                for alpha in start_values:
                    for beta in start_values:
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        grammar = result.getModel().mGrammar
                        xalpha = grammar.getParameter('alpha')
                        xbeta = grammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)
                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha,
                             "%f" % beta,
                             o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha,
                             o_kappa,
                             msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                grammar = result.getModel().mGrammar
                alpha = grammar.getParameter('alpha')
                beta = grammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                grammar = trained_model.mGrammar
                rates = tuple(
                    grammar.getParameter(name)
                    for name in ('alpha', 'beta', 'gamma',
                                 'delta', 'epsilon', 'theta'))
                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0',))[('A0',)]
                matrix = trained_model.evaluateRateMatrix(('A0',))[('A0',)]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                # The kappa column is written as an empty field for REV,
                # as in the original output.
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % rates

            else:
                # "JC69" - the only remaining value after validation above.
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                found = branch_length_rx.search(tree)
                if found is None:
                    raise ValueError(
                        "could not parse branch length from tree '%s'" %
                        (tree,))
                # Scale the branch length by 3: rates are set to 1 and
                # thus the matrix is scaled by a factor of 3.
                o_distance = options.format % (3.0 * float(found.group(1)))

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance,
                 options.format % result.getLogLikelihood(),
                 o_alpha,
                 o_kappa,
                 msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)
def runXrate(mali, pairs, options):
    """Compute pairwise distances with xrate and write them to options.stdout.

    For every ``(x, y)`` in *pairs*, the two sequences are written as a
    two-sequence Stockholm alignment (with a fixed ``1.0`` branch length)
    into a temporary file. The substitution model selected by
    ``options.distance`` is then fitted and one tab-separated row is
    written: ``seq1, seq2, distance, lnL, alpha, kappa, msg`` (plus pair
    counts if ``options.with_counts`` is set).

    If ``options.test_xrate`` is set, the model is instead trained from a
    grid of ``alpha``/``beta`` starting values and one row is written per
    starting point.

    NOTE(review): an equivalent ``runXrate`` is also defined earlier in this
    file; this later definition replaces it at import time.

    Arguments
    ---------
    mali
        Alignment object; ``getSequence(id)`` and ``mali[id]`` are used.
    pairs
        Iterable of ``(x, y)`` values used to index ``ids``.
    options
        Option object. Attributes read here: ``distance``,
        ``xrate_min_increment``, ``output_format``, ``with_counts``,
        ``test_xrate``, ``format``, ``is_codons`` and ``stdout``.

    Raises
    ------
    ValueError
        If ``options.distance`` is not ``K80``, ``JC69`` or ``REV``, or if
        no branch length can be parsed from the tree returned for ``JC69``.
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        # The original raised a plain string, which is itself a TypeError.
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    model = DNA.buildModel(
        substitution_model=substitution_models[options.distance])

    # Same pattern as before, now a raw string and compiled once.
    branch_length_rx = re.compile(r"\(\S+:([0-9.]+)\)")
    start_values = (0.1, 0.5, 1.0, 1.5)

    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    # try/finally so the temporary directory is removed even on errors.
    try:
        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))
            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:
            # NOTE(review): `ids` is not defined in this function;
            # presumably a module-level list of sequence identifiers -
            # confirm against the caller.
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # The original carried a commented-out minimum-overlap filter
            # (options.min_overlap); it remains disabled.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()),))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Train from each starting point and report the result.
                for alpha in start_values:
                    for beta in start_values:
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        grammar = result.getModel().mGrammar
                        xalpha = grammar.getParameter('alpha')
                        xbeta = grammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)
                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha,
                             "%f" % beta,
                             o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha,
                             o_kappa,
                             msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                grammar = result.getModel().mGrammar
                alpha = grammar.getParameter('alpha')
                beta = grammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                grammar = trained_model.mGrammar
                rates = tuple(
                    grammar.getParameter(name)
                    for name in ('alpha', 'beta', 'gamma',
                                 'delta', 'epsilon', 'theta'))
                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0',))[('A0',)]
                matrix = trained_model.evaluateRateMatrix(('A0',))[('A0',)]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                # The kappa column is written as an empty field for REV,
                # as in the original output.
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % rates

            else:
                # "JC69" - the only remaining value after validation above.
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                found = branch_length_rx.search(tree)
                if found is None:
                    raise ValueError(
                        "could not parse branch length from tree '%s'" %
                        (tree,))
                # Scale the branch length by 3: rates are set to 1 and
                # thus the matrix is scaled by a factor of 3.
                o_distance = options.format % (3.0 * float(found.group(1)))

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance,
                 options.format % result.getLogLikelihood(),
                 o_alpha,
                 o_kappa,
                 msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)