def printPairs(pairs, mali, map_new2old, options):
    """Write one tab-separated row per codeml pair to ``options.stdout``.

    Each row starts with the ``mId`` of the entries looked up for
    ``pair.mName2`` and ``pair.mName1`` (in that order), followed by the
    ka/ks statistics stored on the pair.  Rate columns are appended when
    ``options.with_rho`` is set and alignment counts when
    ``options.with_counts`` is set.  The row always ends with the pair's
    error message and the stream is flushed after every row.

    ``map_new2old`` is accepted but not used by this function.

    Returns the number of rows written.
    """
    written = 0
    for entry in pairs:
        out = options.stdout

        core_columns = (
            mali.getEntry(entry.mName2).mId,
            mali.getEntry(entry.mName1).mId,
            entry.mKa,
            entry.mKs,
            entry.mKaks,
            entry.mN,
            entry.mS,
            "na",
            "na",
            entry.mKappa,
            entry.mLogLikelihood,
            entry.mTau,
        )
        out.write("\t".join([str(value) for value in core_columns]))

        if options.with_rho:
            rate_columns = (
                entry.mRn,
                entry.mRs,
                entry.mBranchLength,
                entry.mRn0,
                entry.mRs0,
                "na",
            )
            out.write("\t" + "\t".join([str(value) for value in rate_columns]))

        if options.with_counts:
            counts = Genomics.CalculatePairIndices(mali[entry.mName1],
                                                   mali[entry.mName2])
            out.write("\t%s" % (str(counts)))

        out.write("\t" + entry.mError + "\n")
        out.flush()
        written += 1

    return written
def printPair(pair, mali, map_new2old, options, msg="", noutput=0):
    """Write the result for a single sequence pair to ``options.stdout``.

    Arguments
    ---------
    pair :
        Result object.  ``mDistanceMatrix``, ``mLogLikelihood``, ``mAlpha``
        and ``mKappa`` are read for the ``list`` format, ``mTree`` for the
        ``tree`` format.
    mali :
        Alignment holding the two sequences; its first two identifiers
        name the pair.
    map_new2old : dict
        Maps the identifiers used in ``mali`` to the names that are output.
    options :
        Option object providing ``stdout``, ``output_format``, ``format``,
        ``with_counts`` and ``is_codons``.
    msg : str
        Free-text message appended as the last ``list`` column.
    noutput : int
        Number of pairs written before this one; only used to label the
        record (``>pair<noutput + 1>``) in ``tree`` format.  The original
        code referenced an undefined ``noutput`` here, which raised a
        NameError as soon as the ``tree`` format was selected.

    Returns
    -------
    int
        Always 1 (the number of pairs written).
    """
    ids = mali.getIdentifiers()
    out = options.stdout

    if options.output_format == "list":
        out.write("\t".join(
            (map_new2old[ids[0]],
             map_new2old[ids[1]],
             options.format % pair.mDistanceMatrix[ids[0]][ids[1]],
             options.format % pair.mLogLikelihood,
             printValue(pair.mAlpha, options.format),
             printValue(pair.mKappa, options.format),
             msg)))
    elif options.output_format == "tree":
        out.write(">pair%i" % (noutput + 1))
        out.write("%s\n" % pair.mTree)

    if options.with_counts:
        # note: the sequences are passed in reverse order (second, first)
        info = Genomics.CalculatePairIndices(mali[ids[1]],
                                             mali[ids[0]],
                                             with_codons=options.is_codons)
        out.write("\t%s" % (str(info)))

    out.write("\n")

    return 1
def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Output the results of running the Xrate four parameter grammar.

    Writes one tab-separated row to ``options.stdout``.  ``options`` is
    not a parameter of this function: it is resolved as a module-level
    name, so it must be set before this function is called.

    Arguments
    ---------
    mali :
        Alignment of the pair; its first two identifiers name the row.
    result :
        Xrate result providing ``getModel()`` and ``getLogLikelihood()``.
    rsi, rsv, rni, rnv :
        The four estimated rates, passed to ``RateEstimation.getQMatrix``
        as ``Rsi``, ``Rsv``, ``Rni`` and ``Rnv``.  If ``rsi`` is None, all
        derived columns are written as ``na``.
    msg : str
        Message appended as last column.  It is extended with rate ratios
        on success and replaced by an explanatory text if ``rsi`` is None.
    """
    ids = mali.getIdentifiers()

    pi, matrix = RateEstimation.getRateMatrix(
        result.getModel(), terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn, o_ds, o_omega = "na", "na", "na"
        o_rn, o_rn0, o_rs, o_rs0 = "na", "na", "na", "na"
        o_t, o_t0 = "na", "na"
        o_N, o_S = "na", "na"
        # A trailing comma here used to turn this into the tuple ("na",),
        # which was then printed as "('na',)".
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(
            pi, Rsi=rsi, Rsv=rsv, Rni=rni, Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rni) / 2.0,
                                           Rsv=(rsv + rnv) / 2.0,
                                           Rni=(rsi + rni) / 2.0,
                                           Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, t1 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rsv) / 2.0,
                                           Rsv=(rsi + rsv) / 2.0,
                                           Rni=(rni + rnv) / 2.0,
                                           Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

        # NOTE(review): the original comment here said "64.0/61.0 results
        # from the fact that xrate does not normalize the terminals", but
        # no such factor is applied in the formulas below.
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by
        # the sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV,
            rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    options.stdout.write("\t".join(
        map(str, (mali.getEntry(ids[0]).mId,
                  mali.getEntry(ids[1]).mId,
                  o_dn, o_ds, o_omega,
                  o_N, o_S, "na", "na",
                  o_kappa,
                  result.getLogLikelihood(),
                  "na"))))

    if options.with_rho:
        options.stdout.write(
            "\t" + "\t".join(map(str, (o_rn, o_rs, o_t,
                                       o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()
def runXrateSN(xgram, mali, options):
    """Run xrate using Ian's sn.eg grammar and write one result row.

    The grammar is trained via ``prepareGrammar``; the trained parameters
    ``s``, ``n``, ``k`` and ``not_k`` are read from the model and turned
    into dS, dN, omega and kappa columns written to ``options.stdout``.

    If ``s`` or ``n`` is None, all derived columns are written as ``na``.
    The original code only tested for None *after* using ``s`` and ``n``
    in the rate-matrix arithmetic, so that case could never be reached
    without an exception; the arithmetic is now done only when both
    parameters are available.
    """
    result, mali, ids = prepareGrammar(xgram, mali, options)
    trained_model = result.getModel()

    pi, matrix = evaluateGrammar(trained_model)

    def getQMatrix(pi, k, s, n):
        """build a q matrix.

        Diagonal elements are set to the negative of the row sums.
        The matrix is normalized such that trace of the matrix is -1.

        Returns the normalized matrix and the normalization constant.
        """
        codons = list(
            Bio.Data.CodonTable.standard_dna_table.forward_table.keys())

        Q = initializeQMatrix(codons)

        trace = 0.0
        for codon_i in codons:
            row_sum = 0.0
            for codon_j in codons:
                if codon_i == codon_j:
                    continue

                is_single, is_synonymous, is_transition = \
                    RateEstimation.evaluateCodonPair(codon_i, codon_j)

                # only single-nucleotide changes get a non-zero rate
                if not is_single:
                    continue

                # base rate: s for synonymous, n for non-synonymous
                # changes; transversions are additionally scaled by k
                v = s if is_synonymous else n
                if not is_transition:
                    v = v * k

                v *= pi[codon_j]
                Q[codon_i][codon_j] = v
                row_sum += v

            Q[codon_i][codon_i] = -row_sum
            trace += pi[codon_i] * row_sum

        for codon_i in codons:
            for codon_j in codons:
                Q[codon_i][codon_j] /= trace

        return Q, trace

    s = trained_model.mGrammar.getParameter('s')
    n = trained_model.mGrammar.getParameter('n')
    k = trained_model.mGrammar.getParameter('k')
    not_k = trained_model.mGrammar.getParameter('not_k')

    if s is None or n is None:
        o_dn, o_ds, o_omega = "na", "na", "na"
        o_rn, o_rn0, o_rs, o_rs0 = "na", "na", "na", "na"
        o_t, o_t0 = "na", "na"
        # A trailing comma here used to turn this into the tuple ("na",),
        # which was then printed as "('na',)".
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = getQMatrix(pi, k, s, n)
        # reference matrix with s = n = 1
        Q0, t0 = getQMatrix(pi, k, 1, 1)

        ri, rv, rS, rN = countSubstitutions(pi, Q)
        ri0, rv0, rS0, rN0 = countSubstitutions(pi, Q0)

        kappa = ri / rv

        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (n / s)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_kappa = options.value_format % kappa

        msg = "iter=%i s=%6.4f n=%6.4f k=%6.4f ~k=%6.4f" % (
            result.getNumIterations(), s, n, k, not_k)

    options.stdout.write("\t".join(
        map(str, (mali.getEntry(ids[0]).mId,
                  mali.getEntry(ids[1]).mId,
                  o_dn, o_ds, o_omega,
                  "na", "na", "na", "na",
                  o_kappa,
                  result.getLogLikelihood(),
                  "na",
                  o_rn, o_rs, o_t,
                  o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
def main(argv=None):
    """script main.

    Reads FASTA records from ``options.stdin`` two at a time, treats each
    consecutive couple as an aligned pair and writes, for every filter in
    ``--filters``, the requested ``--fields`` as one tab-separated row per
    pair.  A trailing unpaired record is ignored.

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not forwarded
    to ``E.Start`` here - confirm that ``E.Start`` reads ``sys.argv``
    itself.

    Raises
    ------
    ValueError
        If the two sequences of a pair differ in length, if the length is
        not a multiple of three, or if an unknown field is requested.
        (These used to be string exceptions, which are themselves a
        TypeError in Python 3.)
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: fasta2distances.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--filters", dest="filters", type="string",
                      help="Filters to use for filtering sequences [all|codon1|codon2|codon3|d4].")
    # help text fixed: the accepted field names are unaligned1/unaligned2
    parser.add_option("--fields", dest="fields", type="string",
                      help="Fields to output [aligned|unaligned1|unaligned2|identical|transitions|transversions|jc69|t92].")

    parser.set_defaults(
        filename_map=None,
        filters="all,codon1,codon2,codon3,d4",
        gap_char="-",
        fields="aligned,unaligned1,unaligned2,identical,transitions,transversions,jc69,t92",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.filters = options.filters.split(",")
    options.fields = options.fields.split(",")

    iterator = FastaIterator.FastaIterator(options.stdin)

    headers = ["id1", "id2"]
    for f in options.filters:
        headers += ["%s_%s" % (f, x) for x in options.fields]

    options.stdout.write("\t".join(headers) + "\n")

    # fields that are plain integer counters on the pair info object
    count_attributes = {
        "aligned": "mNAligned",
        "unaligned1": "mNUnaligned1",
        "unaligned2": "mNUnaligned2",
        "transversions": "mNTransversions",
        "transitions": "mNTransitions",
        "identical": "mNIdentical",
    }

    while True:
        # fetch the next two records; stop on exhaustion or a None record
        try:
            first_record = next(iterator)
            if first_record is None:
                break
            second_record = next(iterator)
            if second_record is None:
                break
        except StopIteration:
            break

        if len(first_record.sequence) != len(second_record.sequence):
            raise ValueError(
                "sequences %s and %s of unequal length" % (
                    first_record.title, second_record.title))

        if len(first_record.sequence) % 3 != 0:
            raise ValueError(
                "sequence %s not multiple of 3" % first_record.title)

        result = []
        for f in options.filters:
            s1, s2 = FilterAlignedPairForPositions(first_record.sequence,
                                                   second_record.sequence,
                                                   f)

            info = Genomics.CalculatePairIndices(s1, s2, options.gap_char)

            for field in options.fields:
                if field in count_attributes:
                    c = "%i" % getattr(info, count_attributes[field])
                elif field == "jc69":
                    # a ValueError from the distance function is reported
                    # as "nan" rather than aborting the run
                    try:
                        c = "%6.4f" % CalculateDistanceJC69(info)[0]
                    except ValueError:
                        c = "nan"
                elif field == "t92":
                    try:
                        c = "%6.4f" % CalculateDistanceT92(info)[0]
                    except ValueError:
                        c = "nan"
                else:
                    raise ValueError("Unknown field %s" % field)

                result.append(c)

        options.stdout.write("%s\t%s\t%s\n" % (first_record.title,
                                               second_record.title,
                                               "\t".join(result)))

    E.Stop()
def main(argv=None):
    """script main.

    Reads a multiple alignment (or, with ``--pairwise``, a collection of
    consecutive sequence pairs), determines which pairs to compare and
    computes a distance for every pair with the selected method.

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not forwarded
    to ``E.Start`` here - confirm that ``E.Start`` reads ``sys.argv``
    itself.

    Raises
    ------
    ValueError
        For an odd number of sequences with pairwise iteration, for an
        unknown ``--sites`` value, and for a ``--distance`` that the
        selected code path does not implement.  (The first two used to be
        string exceptions, which are themselves a TypeError in Python 3;
        the last one used to surface as an unbound local variable.)
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=("plain", "fasta", "clustal",
                               "stockholm", "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-s", "--sites", dest="sites", type="string",
                      help="sites to use [default=%default].")

    parser.add_option("-f", "--file", dest="filename", type="string",
                      help="filename of multiple alignment (- for stdin) [default=%default].",
                      metavar="FILE")

    parser.add_option("-o", "--format", dest="format", type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option("-d", "--distance", dest="distance", type="choice",
                      choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet",
                               "K80", "F81", "HKY85", "TN93", "REV", "UNREST",
                               "REVU", "UNRESTU", "JTT", "PMB", "PAM",
                               "Kimura", "CategoriesModel"),
                      help="method to use for distance calculation [default=%default].")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format", dest="output_format", type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option("-m", "--min-sites", dest="min_sites", type="int",
                      help="minimum number of sites for output[default=%default].")

    parser.add_option("-a", "--alphabet", dest="alphabet", type="choice",
                      choices=("aa", "na", "auto"),
                      help="alphabet to use.")

    parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha", dest="alpha", type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha", dest="fix_alpha", action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa", dest="kappa", type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa", dest="fix_kappa", action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump output.")

    parser.add_option("--test", dest="test", action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise", dest="pairwise", action="store_true",
                      help="force pairwise comparison.")

    parser.add_option("--set-clean-data", dest="clean_data", type="choice",
                      choices=("0", "1"),
                      help="PAML should cleanup data: 0=only gaps within pair are removed, 1=columns in the mali with gaps are removed.")

    parser.add_option("--with-counts", dest="with_counts", action="store_true",
                      help="output counts of aligned positions, transitions and transversions.")

    parser.add_option("-w", "--write", dest="write", type="choice", action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern", dest="output_pattern", type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment", dest="xrate_min_increment", type="float",
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults(
        input_format="fasta",
        filename_tree=None,
        with_counts=False,
        sites="d4",
        distance="T92",
        min_sites=1,
        filename="-",
        alphabet="auto",
        format="%6.4f",
        method="phylip",
        kappa=None,
        fix_kappa=False,
        alpha=None,
        fix_alpha=False,
        dump=False,
        clean_data=None,
        output_format="list",
        iteration="all-vs-all",
        pairwise=False,
        report_step=1000,
        output_pattern="%s.eg",
        write=[],
        test_xrate=False,
        xrate_min_increment=None,
        is_codons=False,
    )

    (options, args) = E.Start(parser)

    if options.pairwise:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
        options.iteration = "pairwise"
    else:
        mali = Mali.Mali()

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    # read multiple alignment; the file is closed even if reading fails
    try:
        mali.readFromFile(infile, format=options.input_format)

        ids = mali.getIdentifiers()

        if options.alphabet == "auto":
            # call the input nucleic acid if fewer than 10% of all
            # characters remain after removing a, c, g, t, x and n
            s = "".join(map(lambda x: x.mString, mali.values())).lower()
            ss = re.sub("[acgtxn]", "", s)
            if float(len(ss)) < (len(s) * 0.1):
                options.alphabet = "na"
                if mali.getNumColumns() % 3 == 0:
                    options.is_codons = True
            else:
                options.alphabet = "aa"

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# autodetected alphabet: %s\n" % options.alphabet)
    finally:
        if options.filename != "-":
            infile.close()

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    # collect the index pairs to compare
    pairs = []
    if options.iteration == "all-vs-all":
        pairs = [(x, y)
                 for x in range(len(ids) - 1)
                 for y in range(x + 1, len(ids))]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, len(ids))]
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise" % len(ids))
        pairs = [(x, x + 1) for x in range(0, len(ids), 2)]

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and \
                options.distance in ("F84", "K80", "JC69", "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()
            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1
                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        # NOTE(review): the degenerate sites are only used
                        # for the minimum-length filter; the distance below
                        # is computed from ``info``, i.e. from the full
                        # sequences - confirm this is intended.
                        seq1, seq2 = Genomics.GetDegenerateSites(
                            mali[id_x], mali[id_y],
                            position=3,
                            degeneracy=4)

                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        raise ValueError(
                            "unknown sites %s" % options.sites)

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # previously fell through with ``distance`` unbound
                    raise ValueError(
                        "distance %s not implemented for method %s on nucleotide sequences" % (
                            options.distance, options.method))

                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y,
                                  options.format % distance,
                                  options.format % variance,
                                  info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura", "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            # number of "D" entries sent to protdist per model; JTT gets
            # none.  The original compared against "PMG", which is not an
            # accepted --distance choice, so "PMB" never got its "D".
            model_toggles = {
                "PMB": 1,
                "PAM": 2,
                "Kimura": 3,
                "CategoriesModel": 4,
            }
            phylip_options = ["D"] * model_toggles.get(options.distance, 0)
            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            # iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1
                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    # percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # previously fell through with ``distance`` unbound,
                    # e.g. for the default T92 on protein input
                    raise ValueError(
                        "distance %s not implemented for protein sequences" %
                        options.distance)

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y,
                         options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n" % (
                len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write them to stdout.

    Arguments
    ---------
    mali :
        Alignment / sequence collection holding all sequences.
    pairs : iterable of (int, int)
        Index pairs into ``mali.getIdentifiers()`` to compare.
    options :
        Option object; ``distance`` selects the substitution model
        (``K80``, ``JC69`` or ``REV``).

    Each pair is written into a temporary stockholm file on which the
    model is trained (``K80``, ``REV``) or a tree is built (``JC69``).
    With ``options.test_xrate`` a grid of starting values for alpha and
    beta is trained instead and one row per starting point is written.

    Raises
    ------
    ValueError
        If ``options.distance`` is not implemented for xrate.
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # ``ids`` used to be an undefined name in this function
    ids = mali.getIdentifiers()

    # select the model before creating any temporary files
    if options.distance == "K80":
        model = DNA.buildModel(substitution_model="k80")
    elif options.distance == "JC69":
        model = DNA.buildModel(substitution_model="jc69")
    elif options.distance == "REV":
        model = DNA.buildModel(substitution_model="gtr")
    else:
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    # the temporary directory is removed even if a pair fails
    try:
        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:

            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            # build a two-sequence alignment for this pair
            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # the "#=GF NH" line passes a two-leaf tree along with the
            # alignment; the first branch length is fixed to 1.0
            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # train from a grid of starting values and report each run
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha,
                             "%f" % beta,
                             o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha,
                             o_kappa,
                             msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()

                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()

                grammar = trained_model.mGrammar
                alpha = grammar.getParameter('alpha')
                beta = grammar.getParameter('beta')
                gamma = grammar.getParameter('gamma')
                delta = grammar.getParameter('delta')
                epsilon = grammar.getParameter('epsilon')
                theta = grammar.getParameter('theta')

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(
                    ('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)

                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(re.search(r"\(\S+:([0-9.]+)\)",
                                          tree).group(1)))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance,
                 options.format % result.getLogLikelihood(),
                 o_alpha,
                 o_kappa,
                 msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)