def outputRates(result, options):
    """Write the GTR distance of every terminal of a trained grammar.

    The trained model is taken from ``result``.  For each terminal key of
    its terminal frequencies, the distance returned by
    ``RateEstimation.getDistanceGTR`` is formatted with
    ``options.value_format`` and written to ``options.stdout`` preceded by
    a tab.  No line terminator is written by this function.
    """
    model = result.getModel()
    frequencies = model.evaluateTerminalFrequencies()
    rate_matrices = model.evaluateRateMatrix()

    for name in frequencies.keys():
        # only the distance is reported; the returned matrix is discarded.
        _, dist = RateEstimation.getDistanceGTR(
            frequencies[name], rate_matrices[name])
        formatted = options.value_format % dist
        options.stdout.write("\t%s" % formatted)
def getQMatrix(pi, k, s, n):
    """Build a codon rate matrix from codon frequencies and three rates.

    Only codon pairs that ``RateEstimation.evaluateCodonPair`` reports as
    single changes receive a rate.  The base rate is ``s`` for synonymous
    and ``n`` for non-synonymous pairs; it is multiplied by ``k`` when the
    pair is *not* a transition and then by ``pi`` of the target codon.
    Each diagonal element is set to the negative sum of its row.

    All entries are finally divided by ``trace``, the sum over codons of
    ``pi[codon] * row_sum``.

    Returns the tuple ``(Q, trace)``.
    """
    # NOTE(review): ``k`` scales the non-transition rates here - confirm
    # that this is the intended parameterization.
    codons = Bio.Data.CodonTable.standard_dna_table.forward_table.keys()
    Q = initializeQMatrix(codons)

    trace = 0.0
    for source in codons:
        total = 0.0
        for target in codons:
            if source == target:
                continue

            is_single, is_synonymous, is_transition = \
                RateEstimation.evaluateCodonPair(source, target)
            if not is_single:
                continue

            rate = s if is_synonymous else n
            if not is_transition:
                rate = rate * k
            rate *= pi[target]

            Q[source][target] = rate
            total += rate

        Q[source][source] = -total
        trace += pi[source] * total

    # normalize every entry by the frequency-weighted total rate
    for source in codons:
        for target in codons:
            Q[source][target] /= trace

    return Q, trace
def processMali(mali, options):
    """Train a block-wise codon grammar on an alignment and report rates.

    The alignment is optionally annotated with a "STATE" row consisting of
    "N" columns followed by "C" columns, identifiers are shortened at
    ``options.separator``, and the grammar is trained through
    ``prepareGrammar``.  For every block one tab-separated line (code,
    block prefix, dN, dS, omega, ..., kappa, log-likelihood, ..., number
    of columns, optional rho columns, message) is written to
    ``options.stdout``.

    Arguments
    ---------
    mali :
        The multiple alignment.  It is modified here (annotation added,
        identifiers renamed, gaps removed).
    options :
        Option container.  Attributes read in this function:
        ``block_size``, ``separator``, ``xrate_min_increment``,
        ``clean_mali``, ``input_filename_tree``, ``loglevel``, ``stdlog``,
        ``shared_rates``, ``value_format``, ``with_rho`` and ``stdout``.

    Raises
    ------
    ValueError
        If the alignment has no columns.
    """
    ncols = mali.getNumColumns()
    if ncols == 0:
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # values below 1 are used as a fraction of the alignment,
            # truncated to a multiple of three columns.
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            size = int(options.block_size) * 3
        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids: keep only the part before the separator
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                make_unique=True)

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # close the tree file once the first tree has been extracted
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    codes = set(annotation)
    for c in codes:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(codes) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set cannot be indexed - fetch its only member explicitly
        blocks = (("B0_", next(iter(codes))), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, _ = prepareGrammar(xgram, mali, tree, map_old2new,
                                     blocks, options)

    trained_model = result.getModel()
    pis, _ = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # Parameters that carry no block prefix for each sharing mode.  This
    # mirrors the renaming done in prepareGrammar; any other mode keeps
    # all four parameters block specific.
    shared_parameters = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs", ),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }
    shared = shared_parameters.get(options.shared_rates, ())

    for block, code in blocks:
        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)
        pi = pis[terminals]

        rs, rn, ri, rv = [
            trained_model.mGrammar.getParameter(
                ("" if name in shared else block) + name)
            for name in ("Rs", "Rn", "Ri", "Rv")]

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # rate matrix with synonymous and non-synonymous rates
            # replaced by their average
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # rate matrix with Ri and Rv replaced by their average
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            # NOTE(review): kappa is normalized with the Q0 counts here,
            # while outputXRateResult uses the Q1 counts (which are
            # computed above but otherwise unused) - confirm which one
            # is intended.
            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            # any zero rate/count makes the estimates undefined
            o_kappa = o_omega = "na"
            o_dn = o_ds = "na"
            o_rn = o_rs = "na"
            o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(
                map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
def prepareGrammar(xgram, mali, tree, map_old2new, blocks, options):
    """Build, parameterize and train a block-wise codon grammar.

    Arguments
    ---------
    xgram :
        Object whose ``train(model, filename)`` method runs the training.
    mali :
        The multiple alignment; it is clipped in place to the columns
        annotated with the codes of ``blocks``.
    tree :
        A tree, or a false value.  Without a tree the alignment must
        contain exactly two sequences.
    map_old2new :
        Mapping whose values are used as leaf names of the two-sequence
        tree when no tree is given.
    blocks :
        Sequence of ``(prefix, code)`` tuples, one per block.
    options :
        Option container.  Attributes read in this function:
        ``shared_frequencies``, ``shared_rates``, ``insert_frequencies``,
        ``fix_frequencies``, ``dump`` and ``stdlog`` (plus whatever
        ``writeModel`` reads).

    Returns
    -------
    tuple
        ``(result, mali, ids)``: the training result, the clipped
        alignment and the alignment identifiers.

    Raises
    ------
    ValueError
        If no tree is given and the alignment does not contain exactly
        two sequences.
    """
    # a real list is needed: labels are counted, indexed and joined.
    labels = [block[1] for block in blocks]
    nblocks = len(blocks)

    annotate_terminals = {}
    for block_index, label in enumerate(labels):
        terminals = tuple("B%i_COD%i" % (block_index, position)
                          for position in range(3))
        annotate_terminals[terminals] = [
            Annotation(row="STATE", column=terminal, label=label)
            for terminal in terminals]

    input_model = Codons.buildCodonML(
        codon_model="f3x4-fourproducts",
        num_blocks=nblocks,
        grammar_type="linear-blocks",
        annotate_terminals=annotate_terminals,
        shared_frequencies=options.shared_frequencies,
        shared_rates=False,
    )

    # manually share rates between blocks: the listed parameters lose
    # their block prefix, in the given order.
    shared_parameters = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs", ),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }
    for block_index in range(nblocks):
        for name in shared_parameters.get(options.shared_rates, ()):
            input_model.renameParameter(
                "B%i_%s" % (block_index, name), name)

    writeModel(input_model, "input", options)

    ids = mali.getIdentifiers()

    # clip mali by supplied blocks
    mali.clipByAnnotation("STATE", "".join(labels))

    # The tree line is determined before the temporary file is created,
    # so that nothing is left behind when no tree is available.
    if tree:
        tree.rescaleBranchLengths(1.0)
        tree_options = "#=GF NH %s" % tree.to_string(
            branchlengths_only=True, format="nh")
    elif mali.getNumSequences() == 2:
        tree_options = "#=GF NH (%s:1.0)%s;" % tuple(map_old2new.values())
    else:
        raise ValueError("Please supply a tree.")

    # NOTE(review): the temporary file is not removed in this function -
    # confirm whether it is still needed after training.
    fh, filename = tempfile.mkstemp()
    with os.fdopen(fh, "w") as outfile:
        mali.writeToFile(outfile,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    # prefix, code
    if options.shared_frequencies:
        frequency_codes = (("", ""), )
    else:
        frequency_codes = blocks

    if options.insert_frequencies:
        for prefix, code in frequency_codes:
            temp_mali = mali.getClone()
            temp_mali.clipByAnnotation("STATE", code)
            RateEstimation.setFrequencies(input_model, temp_mali, prefix)

    if options.fix_frequencies:
        for prefix, code in frequency_codes:
            for char in ('a', 'c', 'g', 't'):
                for position in (0, 1, 2):
                    param = "%sp%s%i" % (prefix, char, position)
                    input_model.mGrammar.moveVariableToConst(param)

    # written a second time: frequencies may have been changed above
    writeModel(input_model, "input", options)

    result = xgram.train(input_model, filename)

    if options.dump:
        options.stdlog.write("".join(result.mData))
        options.stdlog.write("".join(result.mLog))
        mali.writeToFile(options.stdlog,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    trained_model = result.getModel()
    writeModel(trained_model, "trained", options)

    return result, mali, ids
def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Output the results of running the Xrate four parameter grammar.

    One tab-separated line is written to ``options.stdout``: the ids of
    the first two alignment entries, dN, dS, omega, N, S, two "na"
    columns, kappa, the log-likelihood and another "na" column, followed
    by optional rho and count columns and the message.

    Arguments
    ---------
    mali :
        The alignment; its first two identifiers are reported.
    result :
        Training result providing ``getModel()`` and
        ``getLogLikelihood()``.
    rsi, rsv, rni, rnv :
        The four rates passed to ``RateEstimation.getQMatrix`` as
        ``Rsi``, ``Rsv``, ``Rni`` and ``Rnv``.  If ``rsi`` is None, all
        estimates are reported as "na".
    msg :
        Message appended to the line; replaced when ``rsi`` is None.
    """
    # NOTE(review): ``options`` is not a parameter; this function relies
    # on a module-level ``options`` object - confirm that one exists.
    ids = mali.getIdentifiers()

    pi, _ = RateEstimation.getRateMatrix(result.getModel(),
                                         terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn, o_ds, o_omega = "na", "na", "na"
        o_rn, o_rn0, o_rs, o_rs0 = "na", "na", "na", "na"
        o_t, o_t0 = "na", "na"
        o_N, o_S = "na", "na"
        # plain string: a trailing comma here would create a tuple and
        # print "('na',)" in the kappa column.
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(pi,
                                         Rsi=rsi,
                                         Rsv=rsv,
                                         Rni=rni,
                                         Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rni) / 2.0,
                                           Rsv=(rsv + rnv) / 2.0,
                                           Rni=(rsi + rni) / 2.0,
                                           Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, _ = RateEstimation.getQMatrix(pi,
                                          Rsi=(rsi + rsv) / 2.0,
                                          Rsv=(rsi + rsv) / 2.0,
                                          Rni=(rni + rnv) / 2.0,
                                          Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1, _, _ = RateEstimation.countSubstitutions(pi, Q1)

        # NOTE(review): an earlier comment here referred to a 64.0/61.0
        # factor (xrate not normalizing the terminals); no such factor
        # is applied below - confirm whether it is still required.
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by
        # the sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV,
            rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    options.stdout.write("\t".join(map(str, (
        mali.getEntry(ids[0]).mId,
        mali.getEntry(ids[1]).mId,
        o_dn, o_ds, o_omega,
        o_N, o_S,
        "na", "na",
        o_kappa,
        result.getLogLikelihood(),
        "na"))))

    if options.with_rho:
        options.stdout.write("\t" + "\t".join(
            map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()
def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Output the results of running the Xrate four parameter grammar.

    One tab-separated line is written to ``options.stdout``: the ids of
    the first two alignment entries, dN, dS, omega, N, S, two "na"
    columns, kappa, the log-likelihood and another "na" column, followed
    by optional rho and count columns and the message.

    Arguments
    ---------
    mali :
        The alignment; its first two identifiers are reported.
    result :
        Training result providing ``getModel()`` and
        ``getLogLikelihood()``.
    rsi, rsv, rni, rnv :
        The four rates passed to ``RateEstimation.getQMatrix`` as
        ``Rsi``, ``Rsv``, ``Rni`` and ``Rnv``.  If ``rsi`` is None, all
        estimates are reported as "na".
    msg :
        Message appended to the line; replaced when ``rsi`` is None.
    """
    # NOTE(review): ``options`` is not a parameter; this function relies
    # on a module-level ``options`` object - confirm that one exists.
    ids = mali.getIdentifiers()

    pi, _ = RateEstimation.getRateMatrix(result.getModel(),
                                         terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn = o_ds = o_omega = "na"
        o_rn = o_rn0 = o_rs = o_rs0 = "na"
        o_t = o_t0 = "na"
        o_N = o_S = "na"
        # plain string: a trailing comma here would create a tuple and
        # print "('na',)" in the kappa column.
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(pi,
                                         Rsi=rsi,
                                         Rsv=rsv,
                                         Rni=rni,
                                         Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rni) / 2.0,
                                           Rsv=(rsv + rnv) / 2.0,
                                           Rni=(rsi + rni) / 2.0,
                                           Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, _ = RateEstimation.getQMatrix(pi,
                                          Rsi=(rsi + rsv) / 2.0,
                                          Rsv=(rsi + rsv) / 2.0,
                                          Rni=(rni + rnv) / 2.0,
                                          Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1, _, _ = RateEstimation.countSubstitutions(pi, Q1)

        # NOTE(review): an earlier comment here referred to a 64.0/61.0
        # factor (xrate not normalizing the terminals); no such factor
        # is applied below - confirm whether it is still required.
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by
        # the sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV,
            rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    columns = (
        mali.getEntry(ids[0]).mId,
        mali.getEntry(ids[1]).mId,
        o_dn, o_ds, o_omega,
        o_N, o_S,
        "na", "na",
        o_kappa,
        result.getLogLikelihood(),
        "na")
    options.stdout.write("\t".join(map(str, columns)))

    if options.with_rho:
        options.stdout.write("\t" + "\t".join(
            map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write them out.

    For every pair a two-sequence stockholm file is written into a
    temporary directory and a DNA model is trained on it (K80, REV) or
    used to build a tree (JC69).  One line per pair is written to
    ``options.stdout``.  With ``options.test_xrate`` set, the model is
    instead trained from a grid of alpha/beta start values and one line
    per start value is written.

    Arguments
    ---------
    mali :
        The multiple alignment.
    pairs :
        Iterable of ``(x, y)`` index pairs into the alignment
        identifiers.
    options :
        Option container.  Attributes read in this function:
        ``xrate_min_increment``, ``distance``, ``output_format``,
        ``with_counts``, ``test_xrate``, ``format``, ``is_codons`` and
        ``stdout``.

    Raises
    ------
    ValueError
        If ``options.distance`` is not one of "K80", "JC69" or "REV".
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # validate the distance before any temporary files are created
    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        raise ValueError("distance %s not implemented for xrate" %
                         (options.distance))
    model = DNA.buildModel(
        substitution_model=substitution_models[options.distance])

    writeModel(model, "input", options)

    if options.output_format == "list":
        options.stdout.write("\t".join(
            ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))
        if options.with_counts:
            options.stdout.write(
                "\t%s" % Genomics.SequencePairInfo().getHeader())
        options.stdout.write("\n")

    # ``pairs`` holds positions in the list of alignment identifiers
    ids = mali.getIdentifiers()

    # NOTE(review): values are formatted with ``options.format`` here,
    # while other output routines in this file use
    # ``options.value_format`` - confirm that both options exist.
    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"
    try:
        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a minimum-overlap filter (options.min_overlap) used
            # to be applied here; it is disabled.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # train from a grid of start values and report each run
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter(
                            'alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the
                        # input is normalized to 1; this is the
                        # normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)
                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)
                        options.stdout.write("\t".join((
                            "%f" % alpha,
                            "%f" % beta,
                            o_distance,
                            options.format % result.getLogLikelihood(),
                            o_alpha,
                            o_kappa,
                            msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1; this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = [
                    trained_model.mGrammar.getParameter(name)
                    for name in ('alpha', 'beta', 'gamma',
                                 'delta', 'epsilon', 'theta')]

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(
                    ('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            else:
                # JC69: the distance is read from the tree built by xrate
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(re.search(r"\(\S+:([0-9.]+)\)",
                                          tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join((
                o_distance,
                options.format % result.getLogLikelihood(),
                o_alpha,
                o_kappa,
                msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # remove the temporary directory even if a pair fails
        shutil.rmtree(tempdir)
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write them out.

    For every pair a two-sequence stockholm file is written into a
    temporary directory and a DNA model is trained on it (K80, REV) or
    used to build a tree (JC69).  One line per pair is written to
    ``options.stdout``.  With ``options.test_xrate`` set, the model is
    instead trained from a grid of alpha/beta start values and one line
    per start value is written.

    Arguments
    ---------
    mali :
        The multiple alignment.
    pairs :
        Iterable of ``(x, y)`` index pairs into the alignment
        identifiers.
    options :
        Option container.  Attributes read in this function:
        ``xrate_min_increment``, ``distance``, ``output_format``,
        ``with_counts``, ``test_xrate``, ``format``, ``is_codons`` and
        ``stdout``.

    Raises
    ------
    ValueError
        If ``options.distance`` is not one of "K80", "JC69" or "REV".
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # validate the distance before any temporary files are created
    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        raise ValueError("distance %s not implemented for xrate" %
                         (options.distance))
    model = DNA.buildModel(
        substitution_model=substitution_models[options.distance])

    writeModel(model, "input", options)

    if options.output_format == "list":
        header = ("seq1", "seq2", "distance", "lnL", "alpha", "kappa",
                  "msg")
        options.stdout.write("\t".join(header))
        if options.with_counts:
            options.stdout.write(
                "\t%s" % Genomics.SequencePairInfo().getHeader())
        options.stdout.write("\n")

    # ``pairs`` holds positions in the list of alignment identifiers
    ids = mali.getIdentifiers()

    # NOTE(review): values are formatted with ``options.format`` here,
    # while other output routines in this file use
    # ``options.value_format`` - confirm that both options exist.
    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"
    try:
        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a minimum-overlap filter (options.min_overlap) used
            # to be applied here; it is disabled.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # train from a grid of start values and report each run
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter(
                            'alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the
                        # input is normalized to 1; this is the
                        # normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)
                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)
                        options.stdout.write("\t".join((
                            "%f" % alpha,
                            "%f" % beta,
                            o_distance,
                            options.format % result.getLogLikelihood(),
                            o_alpha,
                            o_kappa,
                            msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1; this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = [
                    trained_model.mGrammar.getParameter(name)
                    for name in ('alpha', 'beta', 'gamma',
                                 'delta', 'epsilon', 'theta')]

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(
                    ('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            else:
                # JC69: the distance is read from the tree built by xrate
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(re.search(r"\(\S+:([0-9.]+)\)",
                                          tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join((
                o_distance,
                options.format % result.getLogLikelihood(),
                o_alpha,
                o_kappa,
                msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # remove the temporary directory even if a pair fails
        shutil.rmtree(tempdir)
def processMali(mali, options):
    """Train a block-wise codon grammar on an alignment and report rates.

    The alignment is optionally annotated with a "STATE" row consisting of
    "N" columns followed by "C" columns, identifiers are shortened at
    ``options.separator``, and the grammar is trained through
    ``prepareGrammar``.  For every block one tab-separated line (code,
    block prefix, dN, dS, omega, ..., kappa, log-likelihood, ..., number
    of columns, optional rho columns, message) is written to
    ``options.stdout``.

    Arguments
    ---------
    mali :
        The multiple alignment.  It is modified here (annotation added,
        identifiers renamed, gaps removed).
    options :
        Option container.  Attributes read in this function:
        ``block_size``, ``separator``, ``xrate_min_increment``,
        ``clean_mali``, ``input_filename_tree``, ``loglevel``, ``stdlog``,
        ``shared_rates``, ``value_format``, ``with_rho`` and ``stdout``.

    Raises
    ------
    ValueError
        If the alignment has no columns.
    """
    ncols = mali.getNumColumns()
    if ncols == 0:
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # values below 1 are used as a fraction of the alignment,
            # truncated to a multiple of three columns.
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            size = int(options.block_size) * 3
        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids: keep only the part before the separator
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                make_unique=True)

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # close the tree file once the first tree has been extracted
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    codes = set(annotation)
    for c in codes:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(codes) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set cannot be indexed - fetch its only member explicitly
        blocks = (("B0_", next(iter(codes))), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, _ = prepareGrammar(xgram, mali, tree, map_old2new,
                                     blocks, options)

    trained_model = result.getModel()
    pis, _ = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # Parameters that carry no block prefix for each sharing mode.  This
    # mirrors the renaming done in prepareGrammar; any other mode keeps
    # all four parameters block specific.
    shared_parameters = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs", ),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }
    shared = shared_parameters.get(options.shared_rates, ())

    for block, code in blocks:
        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)
        pi = pis[terminals]

        rs, rn, ri, rv = [
            trained_model.mGrammar.getParameter(
                ("" if name in shared else block) + name)
            for name in ("Rs", "Rn", "Ri", "Rv")]

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # rate matrix with synonymous and non-synonymous rates
            # replaced by their average
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # rate matrix with Ri and Rv replaced by their average
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            # NOTE(review): kappa is normalized with the Q0 counts here,
            # while outputXRateResult uses the Q1 counts (which are
            # computed above but otherwise unused) - confirm which one
            # is intended.
            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            # any zero rate/count makes the estimates undefined
            o_kappa = o_omega = "na"
            o_dn = o_ds = "na"
            o_rn = o_rs = "na"
            o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        columns = (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars)
        options.stdout.write("\t".join(map(str, columns)))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(
                map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
def prepareGrammar(xgram, mali, tree, map_old2new, blocks, options):
    """Build, parameterize and train a block-wise codon grammar.

    Arguments
    ---------
    xgram :
        Object whose ``train(model, filename)`` method runs the training.
    mali :
        The multiple alignment; it is clipped in place to the columns
        annotated with the codes of ``blocks``.
    tree :
        A tree, or a false value.  Without a tree the alignment must
        contain exactly two sequences.
    map_old2new :
        Mapping whose values are used as leaf names of the two-sequence
        tree when no tree is given.
    blocks :
        Sequence of ``(prefix, code)`` tuples, one per block.
    options :
        Option container.  Attributes read in this function:
        ``shared_frequencies``, ``shared_rates``, ``insert_frequencies``,
        ``fix_frequencies``, ``dump`` and ``stdlog`` (plus whatever
        ``writeModel`` reads).

    Returns
    -------
    tuple
        ``(result, mali, ids)``: the training result, the clipped
        alignment and the alignment identifiers.

    Raises
    ------
    ValueError
        If no tree is given and the alignment does not contain exactly
        two sequences.
    """
    # a real list is needed: labels are counted, indexed and joined.
    labels = [block[1] for block in blocks]
    nblocks = len(blocks)

    annotate_terminals = {}
    for block_index, label in enumerate(labels):
        terminals = tuple("B%i_COD%i" % (block_index, position)
                          for position in range(3))
        annotate_terminals[terminals] = [
            Annotation(row="STATE", column=terminal, label=label)
            for terminal in terminals]

    input_model = Codons.buildCodonML(
        codon_model="f3x4-fourproducts",
        num_blocks=nblocks,
        grammar_type="linear-blocks",
        annotate_terminals=annotate_terminals,
        shared_frequencies=options.shared_frequencies,
        shared_rates=False,
    )

    # manually share rates between blocks: the listed parameters lose
    # their block prefix, in the given order.
    shared_parameters = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs", ),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }
    names_to_share = shared_parameters.get(options.shared_rates, ())
    for block_index in range(nblocks):
        for name in names_to_share:
            input_model.renameParameter(
                "B%i_%s" % (block_index, name), name)

    writeModel(input_model, "input", options)

    ids = mali.getIdentifiers()

    # clip mali by supplied blocks
    mali.clipByAnnotation("STATE", "".join(labels))

    # The tree line is determined before the temporary file is created,
    # so that nothing is left behind when no tree is available.
    if tree:
        tree.rescaleBranchLengths(1.0)
        tree_options = "#=GF NH %s" % tree.to_string(
            branchlengths_only=True, format="nh")
    elif mali.getNumSequences() == 2:
        tree_options = "#=GF NH (%s:1.0)%s;" % tuple(map_old2new.values())
    else:
        raise ValueError("Please supply a tree.")

    # NOTE(review): the temporary file is not removed in this function -
    # confirm whether it is still needed after training.
    fh, filename = tempfile.mkstemp()
    with os.fdopen(fh, "w") as outfile:
        mali.writeToFile(outfile,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    # prefix, code
    if options.shared_frequencies:
        frequency_codes = (("", ""), )
    else:
        frequency_codes = blocks

    if options.insert_frequencies:
        for prefix, code in frequency_codes:
            temp_mali = mali.getClone()
            temp_mali.clipByAnnotation("STATE", code)
            RateEstimation.setFrequencies(input_model, temp_mali, prefix)

    if options.fix_frequencies:
        for prefix, code in frequency_codes:
            for char in ('a', 'c', 'g', 't'):
                for position in (0, 1, 2):
                    param = "%sp%s%i" % (prefix, char, position)
                    input_model.mGrammar.moveVariableToConst(param)

    # written a second time: frequencies may have been changed above
    writeModel(input_model, "input", options)

    result = xgram.train(input_model, filename)

    if options.dump:
        options.stdlog.write("".join(result.mData))
        options.stdlog.write("".join(result.mLog))
        mali.writeToFile(options.stdlog,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    trained_model = result.getModel()
    writeModel(trained_model, "trained", options)

    return result, mali, ids