def processMali(mali, options):
    """Train the block grammar on an alignment and write one line per block.

    The alignment is optionally annotated with a ``STATE`` track made of
    ``N`` columns followed by ``C`` columns, identifiers are shortened to
    the part before ``options.separator``, the grammar is trained through
    ``prepareGrammar`` and, for every block, a tab-separated line is written
    to ``options.stdout``.

    Output columns: annotation code, block prefix, dN, dS, omega, four
    ``na`` placeholders, kappa, log-likelihood, ``na`` and the number of
    annotated columns.  With ``options.with_rho`` the fields rN, rS, t, rN0,
    rS0 and t0 are appended.  A free-text message always ends the line.

    Arguments:
        mali: alignment object.  It must provide ``getNumColumns``,
            ``addAnnotation``, ``getAnnotation``, ``getIdentifiers``,
            ``rename``, ``mapIdentifiers``, ``removeGaps`` and ``mGapChars``.
        options: option container.  The attributes read here are
            ``block_size``, ``separator``, ``xrate_min_increment``,
            ``clean_mali``, ``input_filename_tree``, ``loglevel``,
            ``stdlog``, ``shared_rates``, ``value_format``, ``with_rho``
            and ``stdout``.

    Raises:
        ValueError: if the alignment has no columns.
        AssertionError: if the ``STATE`` annotation contains characters
            other than ``N`` and ``C``.
    """
    ncols = mali.getNumColumns()

    if ncols == 0:
        # A bare string cannot be raised; use a real exception class.
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # values below 1 are a fraction of the codons in the alignment
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # otherwise the value is a number of codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids; iterate over a copy because rename() may modify the
    # identifier list that is being walked
    for identifier in list(mali.getIdentifiers()):
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # NOTE(review): assumes Newick2Nexus consumes the handle before
        # returning, so that it can be closed right away - confirm.
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # sets cannot be indexed; take the only element
        blocks = (("B0_", next(iter(chars))),)
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    # prepareGrammar returns the (possibly modified) alignment as well
    result, mali, _ids = prepareGrammar(
        xgram, mali, tree, map_old2new, blocks, options)
    trained_model = result.getModel()

    pis, _matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # For each sharing mode: which of (Rs, Rn, Ri, Rv) carry the block
    # prefix.  Any mode not listed uses the block prefix for all four.
    block_specific = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }
    use_block = block_specific.get(
        options.shared_rates, (True, True, True, True))

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)
        pi = pis[terminals]

        prefix_rs, prefix_rn, prefix_ri, prefix_rv = [
            block if specific else "" for specific in use_block]

        grammar = trained_model.mGrammar
        rs = grammar.getParameter('%sRs' % prefix_rs)
        rn = grammar.getParameter('%sRn' % prefix_rn)
        ri = grammar.getParameter('%sRi' % prefix_ri)
        rv = grammar.getParameter('%sRv' % prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(
                pi, Rsi=rs * ri, Rsv=rs * rv, Rni=rn * ri, Rnv=rn * rv)

            # rate matrix with Rs and Rn replaced by their mean
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(
                pi,
                Rsi=ri * avg_omega, Rsv=rv * avg_omega,
                Rni=ri * avg_omega, Rnv=rv * avg_omega)

            # rate matrix with Ri and Rv replaced by their mean; the
            # results are not used below, but the calls are kept so that
            # a ZeroDivisionError raised here still yields an "na" line
            avg_kappa = (ri + rv) / 2.0
            Q1, _t1 = RateEstimation.getQMatrix(
                pi,
                Rsi=rs * avg_kappa, Rsv=rs * avg_kappa,
                Rni=rn * avg_kappa, Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            # NOTE(review): kappa is normalised with the averaged-omega
            # matrix (rI0, rV0) while the averaged-kappa results are
            # unused - confirm that this is intended.
            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            o_kappa = o_omega = o_dn = o_ds = "na"
            o_rn = o_rs = o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(map(str, (
                o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Write one result line for a pair fitted with the four parameter grammar.

    The line is tab-separated and goes to ``options.stdout``; ``options``
    is not a parameter but is looked up in the enclosing (module) scope.
    Columns: the ``mId`` of the first two alignment entries, dN, dS, omega,
    N, S, two ``na`` placeholders, kappa, log-likelihood and ``na``.  With
    ``options.with_rho`` the fields rN, rS, t, rN0, rS0 and t0 follow, with
    ``options.with_counts`` the result of ``Genomics.CalculatePairIndices``
    follows, and the message always ends the line.

    Arguments:
        mali: alignment object; ``getIdentifiers``, ``getEntry``,
            ``getNumColumns`` and item access by identifier are used.
        result: training result; ``getModel`` and ``getLogLikelihood``
            are used.
        rsi, rsv, rni, rnv: the four rates handed to
            ``RateEstimation.getQMatrix`` as ``Rsi``, ``Rsv``, ``Rni`` and
            ``Rnv``.  If ``rsi`` is None every estimate is written as "na".
        msg: text for the last column.  It is replaced when ``rsi`` is None
            and extended with rate ratios otherwise.
    """
    ids = mali.getIdentifiers()

    pi, _matrix = RateEstimation.getRateMatrix(
        result.getModel(), terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn = o_ds = o_omega = "na"
        o_rn = o_rn0 = o_rs = o_rs0 = "na"
        o_t = o_t0 = "na"
        o_N = o_S = "na"
        # plain string: a trailing comma here would create the tuple
        # ("na",), which is printed as "('na',)"
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(
            pi, Rsi=rsi, Rsv=rsv, Rni=rni, Rnv=rnv)

        # get rate matrix as if omega was set to 1
        mean_i = (rsi + rni) / 2.0
        mean_v = (rsv + rnv) / 2.0
        Q0, t0 = RateEstimation.getQMatrix(
            pi, Rsi=mean_i, Rsv=mean_v, Rni=mean_i, Rnv=mean_v)

        # get rate matrix as if kappa was set to 1
        mean_s = (rsi + rsv) / 2.0
        mean_n = (rni + rnv) / 2.0
        Q1, _t1 = RateEstimation.getQMatrix(
            pi, Rsi=mean_s, Rsv=mean_s, Rni=mean_n, Rnv=mean_n)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1, _rS1, _rN1 = RateEstimation.countSubstitutions(pi, Q1)

        # NOTE(review): an earlier comment here referred to a 64.0/61.0
        # correction because xrate does not normalize the terminals; no
        # such factor is applied below - confirm whether one is needed.
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by the
        # sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV,
            rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    options.stdout.write("\t".join(map(str, (
        mali.getEntry(ids[0]).mId,
        mali.getEntry(ids[1]).mId,
        o_dn, o_ds, o_omega,
        o_N, o_S,
        "na", "na",
        o_kappa,
        result.getLogLikelihood(),
        "na"))))

    if options.with_rho:
        options.stdout.write("\t" + "\t".join(map(str, (
            o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()
def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Report the estimates of a four parameter grammar run for one pair.

    A single tab-separated line is written to ``options.stdout`` and the
    stream is flushed.  ``options`` is resolved from the enclosing (module)
    scope; it is not passed in.

    The line contains, in order: the ``mId`` of the first and of the second
    alignment entry, dN, dS, omega, N, S, "na", "na", kappa, the
    log-likelihood and "na".  If ``options.with_rho`` is set, rN, rS, t,
    rN0, rS0 and t0 are added; if ``options.with_counts`` is set, the
    value returned by ``Genomics.CalculatePairIndices`` is added.  The
    message is the final column.

    Arguments:
        mali: alignment object (uses ``getIdentifiers``, ``getEntry``,
            ``getNumColumns`` and lookup by identifier).
        result: training result (uses ``getModel`` and
            ``getLogLikelihood``).
        rsi, rsv, rni, rnv: rates passed to ``RateEstimation.getQMatrix``
            as ``Rsi``, ``Rsv``, ``Rni`` and ``Rnv``; when ``rsi`` is None
            all estimates are reported as "na".
        msg: message column; overwritten when ``rsi`` is None, otherwise
            extended with rate ratios.
    """
    ids = mali.getIdentifiers()

    pi, _matrix = RateEstimation.getRateMatrix(
        result.getModel(), terminals=('COD0', 'COD1', 'COD2'))

    # identity test instead of "== None"
    if rsi is None:
        o_dn, o_ds, o_omega = "na", "na", "na"
        o_rn, o_rn0, o_rs, o_rs0 = "na", "na", "na", "na"
        o_t, o_t0 = "na", "na"
        o_N, o_S = "na", "na"
        # must be a plain string; with a trailing comma this became the
        # tuple ("na",) and was written out as "('na',)"
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(
            pi, Rsi=rsi, Rsv=rsv, Rni=rni, Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(
            pi,
            Rsi=(rsi + rni) / 2.0,
            Rsv=(rsv + rnv) / 2.0,
            Rni=(rsi + rni) / 2.0,
            Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, _t1 = RateEstimation.getQMatrix(
            pi,
            Rsi=(rsi + rsv) / 2.0,
            Rsv=(rsi + rsv) / 2.0,
            Rni=(rni + rnv) / 2.0,
            Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1, _rS1, _rN1 = RateEstimation.countSubstitutions(pi, Q1)

        # NOTE(review): the previous comment mentioned a 64.0/61.0 factor
        # (xrate not normalizing the terminals), but the code applies no
        # such factor - confirm whether it is still required.
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        fmt = options.value_format
        o_omega = fmt % (dN / dS)
        o_dn = fmt % dN
        o_ds = fmt % dS
        o_rn = fmt % rN
        o_rs = fmt % rS
        o_rn0 = fmt % rN0
        o_rs0 = fmt % rS0
        o_t = fmt % t
        o_t0 = fmt % t0
        o_S = fmt % (mali.getNumColumns() * rS0)
        o_N = fmt % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = fmt % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by the
        # sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV,
            rI0 / rV0,
            fmt % ((rsi + rni) / (rsv + rnv)))

    fields = (
        mali.getEntry(ids[0]).mId,
        mali.getEntry(ids[1]).mId,
        o_dn, o_ds, o_omega,
        o_N, o_S,
        "na", "na",
        o_kappa,
        result.getLogLikelihood(),
        "na",
    )
    options.stdout.write("\t".join(map(str, fields)))

    if options.with_rho:
        rho_fields = (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0)
        options.stdout.write("\t" + "\t".join(map(str, rho_fields)))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()
def processMali(mali, options):
    """Estimate per-block rates for an alignment and print them.

    Steps performed: optionally add a ``STATE`` annotation consisting of
    ``N`` columns followed by ``C`` columns, cut identifiers at
    ``options.separator``, optionally remove gapped/masked columns, train
    the grammar via ``prepareGrammar`` and write one tab-separated line per
    block to ``options.stdout``.

    Each line holds: annotation code, block prefix, dN, dS, omega, four
    "na" placeholders, kappa, log-likelihood, "na" and the count of columns
    carrying the annotation code.  If ``options.with_rho`` is set the
    fields rN, rS, t, rN0, rS0 and t0 follow.  The line is terminated by a
    message.

    Arguments:
        mali: alignment object providing ``getNumColumns``,
            ``addAnnotation``, ``getAnnotation``, ``getIdentifiers``,
            ``rename``, ``mapIdentifiers``, ``removeGaps`` and
            ``mGapChars``.
        options: option container; reads ``block_size``, ``separator``,
            ``xrate_min_increment``, ``clean_mali``,
            ``input_filename_tree``, ``loglevel``, ``stdlog``,
            ``shared_rates``, ``value_format``, ``with_rho`` and
            ``stdout``.

    Raises:
        ValueError: if the alignment has no columns.
        AssertionError: if the ``STATE`` annotation contains a character
            other than ``N`` or ``C``.
    """
    ncols = mali.getNumColumns()

    if ncols == 0:
        # strings are not exceptions; raise a proper exception instance
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # a value below 1 is taken as a fraction of all codons
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # a value of 1 or more is taken as a number of codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids; work on a snapshot since rename() may change the
    # identifier list during the loop
    for identifier in list(mali.getIdentifiers()):
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # NOTE(review): assumes Newick2Nexus reads the whole handle before
        # returning, so closing it afterwards is safe - confirm.
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set does not support indexing; fetch its single member
        blocks = (("B0_", next(iter(chars))),)
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    # the alignment is rebound to the one returned by prepareGrammar
    result, mali, _ids = prepareGrammar(
        xgram, mali, tree, map_old2new, blocks, options)
    trained_model = result.getModel()

    pis, _matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # Sharing mode -> flags telling whether (Rs, Rn, Ri, Rv) are looked up
    # with the block prefix.  Unlisted modes use the prefix for all four.
    block_specific = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }
    use_block = block_specific.get(
        options.shared_rates, (True, True, True, True))

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)
        pi = pis[terminals]

        prefix_rs, prefix_rn, prefix_ri, prefix_rv = [
            block if specific else "" for specific in use_block]

        grammar = trained_model.mGrammar
        rs = grammar.getParameter('%sRs' % prefix_rs)
        rn = grammar.getParameter('%sRn' % prefix_rn)
        ri = grammar.getParameter('%sRi' % prefix_ri)
        rv = grammar.getParameter('%sRv' % prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(
                pi, Rsi=rs * ri, Rsv=rs * rv, Rni=rn * ri, Rnv=rn * rv)

            # rate matrix using the mean of Rs and Rn for both classes
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(
                pi,
                Rsi=ri * avg_omega, Rsv=rv * avg_omega,
                Rni=ri * avg_omega, Rnv=rv * avg_omega)

            # rate matrix using the mean of Ri and Rv for both classes;
            # its results are unused, but the calls stay so that a
            # ZeroDivisionError raised by them still produces "na" output
            avg_kappa = (ri + rv) / 2.0
            Q1, _t1 = RateEstimation.getQMatrix(
                pi,
                Rsi=rs * avg_kappa, Rsv=rs * avg_kappa,
                Rni=rn * avg_kappa, Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            # NOTE(review): kappa uses the averaged-omega counts (rI0, rV0)
            # and ignores the averaged-kappa counts - confirm intent.
            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            o_kappa = o_omega = o_dn = o_ds = "na"
            o_rn = o_rs = o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(map(str, (
                o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)