def test_populate_parameters():
    """populate_parameters should make a General model reproduce the Psubs
    of the GTR model it is populated from"""
    gtr_lf = nest.inflate_likelihood_function(_GTR)
    general_lf = nest.inflate_likelihood_function(_General)

    def psub_pairs():
        # Yield (GTR Psub, General Psub) for every tip edge of the GTR tree
        for tip in gtr_lf.tree.getTipNames():
            yield gtr_lf.getPsubForEdge(tip), general_lf.getPsubForEdge(tip)

    # Sanity check: the two models must differ before populating
    for gtr_psub, general_psub in psub_pairs():
        assert not allclose(gtr_psub, general_psub), 'models started close'

    nest.populate_parameters(general_lf, gtr_lf)

    for gtr_psub, general_psub in psub_pairs():
        assert_array_almost_equal(gtr_psub, general_psub)
def test_inflate_deflate_likelihood_function():
    """deflate/inflate_likelihood_function are reciprocal maps"""
    first_lf = nest.inflate_likelihood_function(_GTRplusGamma)
    alignment = get_aln('GTRplusGamma', _GTRplusGamma['aln_length'])
    first_lf.setAlignment(alignment)
    deflated_once = nest.deflate_likelihood_function(first_lf)

    # Round trip: rebuild from the deflated document and deflate again
    second_lf = nest.inflate_likelihood_function(deflated_once)
    second_lf.setAlignment(alignment)
    deflated_twice = nest.deflate_likelihood_function(second_lf)

    assert_equal(deflated_once, deflated_twice)
def ml_bootstraps(empirical, num_bootstraps=100):
    """Parametric bootstrap of the G statistic of a fitted codon model.

    empirical is the document of an empirical fit; this function reads
    empirical['model'], empirical['gc'] and empirical['lf'] (keys 'gs',
    'tree' and 'aln_length'). num_bootstraps alignments are simulated from
    the inflated empirical likelihood function, each is refitted with ml()
    using the empirical document's entries as keyword arguments, and the
    resulting G statistics are collected.

    Returns a dict with keys:
      'gstats' -- list of bootstrap G statistics
      'gstat'  -- the empirical G statistic
      'pvalue' -- number of bootstrap G statistics strictly greater than
                  the empirical one, divided by (num_bootstraps + 1)
    """
    # NOTE(review): this whitelist is the only thing protecting the eval()
    # below, and asserts are stripped under ``python -O``.
    assert empirical['model'] in \
        ('NG', 'NFG', 'MG94G', 'GNC', 'Y98GTR', 'CNFGTR', 'MG94GTR', 'Y98')
    gc = get_genetic_code(empirical['gc'].encode('utf-8'))
    model = lambda **kw: eval(empirical['model'])(gc=gc, **kw)
    elf = nest.inflate_likelihood_function(empirical['lf'], model)

    aln_length = empirical['lf']['aln_length']
    if empirical['model'] != 'NG':
        # for unexpected simulateAlignment behaviour
        aln_length = int(aln_length / 3)
        assert empirical['lf']['aln_length'] == 3 * aln_length

    def bootstrap(empdoc):
        # Simulate one alignment, refit it, and return its G statistic
        aln = elf.simulateAlignment(aln_length)
        simdoc = {'aln': str(aln), 'tree': empdoc['lf']['tree']}
        return ml(simdoc, **empdoc)['lf']['gs']

    # Build a real list: the values are both stored and counted, which a
    # lazy map object would not survive.
    gstats = [bootstrap(empirical) for _ in range(num_bootstraps)]

    egs = empirical['lf']['gs']
    num_greater = sum(1 for g in gstats if g > egs)
    return {
        'gstats': gstats,
        'gstat': egs,
        # float() keeps the ratio from truncating to 0 under Python 2
        # integer division.
        'pvalue': float(num_greater) / (num_bootstraps + 1),
    }
def test_deflate_likelihood_function():
    """the EN entry written by deflate_likelihood_function agrees with
    get_expected_no_subs on the same likelihood function"""
    general_lf = nest.inflate_likelihood_function(_General)
    general_lf.setAlignment(get_aln('General', _General['aln_length']))
    deflated = nest.deflate_likelihood_function(general_lf)
    assert_equal(deflated['EN'], nest.get_expected_no_subs(general_lf))
def split_ens(doc):
    """Compute expected numbers of substitutions for a fitted codon model.

    doc must provide 'model' (one of 'GNC', 'Y98GTR', 'CNFGTR'), 'gc' (the
    genetic code identifier handed to get_genetic_code) and 'lf' (the
    deflated likelihood function).

    Returns a dict with the single key 'ENS' holding the result of
    get_expected_no_subs(lf, gc).
    """
    # NOTE(review): this whitelist is the only guard on the eval() below,
    # and asserts are stripped under ``python -O``.
    assert doc['model'] in ('GNC', 'Y98GTR', 'CNFGTR')
    gc = get_genetic_code(doc['gc'].encode('utf-8'))
    model = lambda **kw: eval(doc['model'])(gc=gc, **kw)
    lf = inflate_likelihood_function(doc['lf'], model)
    # Computed once; the original evaluated this twice and discarded the
    # first result.
    return {'ENS': get_expected_no_subs(lf, gc)}
def test_makeContinuousPsubDefn():
    """a GeneralBen model populated from a General fit should match it on
    scaled rate matrices and ENS, with lengths equal to ENS"""
    free_params = {'is_independent': True, 'is_constant': False}
    general_lf = inflate_likelihood_function(_General)
    ben_model = GeneralBen(DNA.Alphabet, recode_gaps=True, model_gaps=False,
                           optimise_motif_probs=True, name='GeneralBen')
    ben_lf = ben_model.makeLikelihoodFunction(general_lf.tree)
    populate_parameters(ben_lf, general_lf, **free_params)

    ens_ben = get_expected_no_subs(ben_lf)
    lengths_ben = ben_lf.getParamValueDict(['edge'])['length']
    ens_general = get_expected_no_subs(general_lf)

    def scaled_rate_matrix(lf, edge):
        # Rate matrix on the edge multiplied by that edge's length parameter
        rates = np.array(lf.getRateMatrixForEdge(edge))
        return rates * lf.getParamValue('length', edge)

    for tip in general_lf.tree.getTipNames():
        assert_array_almost_equal(scaled_rate_matrix(ben_lf, tip),
                                  scaled_rate_matrix(general_lf, tip))
        assert_almost_equal(ens_ben[tip], ens_general[tip])
        assert_almost_equal(lengths_ben[tip], ens_ben[tip])
def param_bootstrap(stats, num_reps=None, model_pos=None, fitter=None, **kw):
    """Parametric bootstrap for one fitted model of one gene.

    stats is a (gene, f_stats, g_stats) triple:
      gene    -- gene label, used only in log messages
      f_stats -- sequence of fitted-model rows; row model_pos is bootstrapped
                 and must provide 'name', 'tip_names', 'aln_length', 'gs'
                 and 'll', plus whatever inflate_likelihood_function needs
      g_stats -- mapping from model position to accumulated bootstrap rows
                 ('gs_samples', 'll_samples', 'en_samples' and the saved
                 random 'state'); it is created/extended in place so that
                 interrupted runs can be resumed

    num_reps is the number of bootstrap samples wanted; at most
    10 * num_reps simulations are attempted. fitter is called as
    fitter(aln, tree, return_lfs=model, **kw) and must return a sequence
    of likelihood functions indexable by model_pos.

    On completion f_row['gs_p'] and f_row['ll_p'] are set to
    (number of samples below the observed value, number of samples + 1).

    Returns (gene, f_stats, g_stats), or None when model_pos is not a valid
    index into f_stats.
    """
    gene, f_stats, g_stats = stats
    try:
        f_row = f_stats[model_pos]
    except IndexError:
        # f_stats may be empty, so do not assume row 0 exists
        tips = '/'.join(f_stats[0]['tip_names']) if f_stats else '<unknown>'
        # model_pos is an index, not a string: convert before concatenating
        logging.error(' Skipping ' + tips + ' in ' + gene + ': position ' +
                      str(model_pos) + ' invalid')
        return None

    model = f_row['name']
    tip_label = '/'.join(f_row['tip_names'])

    if model_pos in g_stats:
        g_row = g_stats[model_pos]
    else:
        g_row = {
            'name': model,
            'tip_names': f_row['tip_names'],
            'gs_samples': [],
            'll_samples': [],
            'en_samples': [],
        }
        g_stats[model_pos] = g_row
    gs_samples = g_row['gs_samples']
    ll_samples = g_row['ll_samples']
    en_samples = g_row['en_samples']

    if 'state' in g_row:
        # NOTE(review): eval() of stored text; only safe while g_stats
        # comes from a trusted source (it is written by repr() below).
        random.setstate(eval(g_row['state']))

    lf = nest.inflate_likelihood_function(f_row)
    aln_length = f_row['aln_length']
    start = time.time()

    # Allow up to ten attempts per requested sample
    for _ in range(10 * num_reps):
        if len(gs_samples) >= num_reps:
            break
        try:
            aln = lf.simulateAlignment(aln_length, random_series=random)
            lfs = fitter(aln, lf.tree, return_lfs=model, **kw)
            fitted_lf = lfs[model_pos]
            # Compute every statistic before recording any of them so a
            # failure part-way cannot leave the sample lists out of step.
            ll = fitted_lf.getLogLikelihood()
            gs = fitted_lf.getGStatistic()
            has_q = 'Q' in fitted_lf.defn_for
            en = nest.get_expected_no_subs(fitted_lf) if has_q else None
        except Exception:
            # Best effort: log and try another replicate. Exception (not a
            # bare except) so Ctrl-C / SystemExit still stop the run.
            logging.warning(' Missed a G stat for ' + model + ' and ' +
                            tip_label + ' in ' + gene + ':\n' + format_exc())
            continue
        ll_samples.append(ll)
        gs_samples.append(gs)
        if has_q:
            en_samples.append(en)

    # Checked explicitly (rather than with for/else) so that reaching the
    # quota on the very last attempt is not reported as a failure.
    if len(gs_samples) < num_reps:
        logging.error(
            ' Failed to compile sufficient bootstrap repetitions for ' +
            model + ' and ' + tip_label + ' in ' + gene)

    g_row['state'] = repr(random.getstate())
    f_row['gs_p'] = (sum(1 for g in gs_samples if g < f_row['gs']),
                     len(gs_samples) + 1)
    f_row['ll_p'] = (sum(1 for l in ll_samples if l < f_row['ll']),
                     len(ll_samples) + 1)
    logging.info(' Done ' + model + ' and ' + tip_label + ' in ' + gene +
                 ' in ' + str(time.time() - start) + ' secs')
    return gene, f_stats, g_stats
def test_hetero_fit():
    """hetero_fit should fit GTR plus Gamma models"""
    source_lf = nest.inflate_likelihood_function(_GTRplusGamma)
    expected = nest.get_expected_no_subs(source_lf)
    alignment = get_aln('GTRplusGamma', 100000)
    fitted_lfs = nest.hetero_fit(alignment, source_lf.tree, param_limit=20,
                                 return_lfs=True)
    # The last returned likelihood function is the one compared
    observed = nest.get_expected_no_subs(fitted_lfs[-1])
    for taxon in expected:
        assert_almost_equal(observed[taxon], expected[taxon], decimal=2)
def check(self, row):
    """Return allRateMatricesUnique() for the likelihood function inflated
    from row, or False if inflating/checking raises ArithmeticError,
    NotImplementedError (logged at debug level) or AssertionError (logged
    as a warning)."""
    try:
        lf = inflate_likelihood_function(row)
        unique = lf.allRateMatricesUnique()
    except AssertionError:
        logging.warning(traceback.format_exc())
        return False
    except (ArithmeticError, NotImplementedError):
        logging.debug(traceback.format_exc())
        return False
    else:
        return unique
def fit(aln, tree, result, model, omega_indep, genetic_code, format):
    """
    Fit the selected model to the input fasta ALN with the selected TREE
    and output the RESULT.
    """
    # Raw alignment is passed through _decompress_if_zipped before fitting
    alignment_text = _decompress_if_zipped(aln.read())
    request = {'tree': tree.read().strip(), 'aln': alignment_text}
    fitted = ml.ml(request, model=model, omega_indep=omega_indep,
                   gc=genetic_code)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    def make_model():
        # Model factory: instantiate the class named `model` from ml
        return getattr(ml, model)()

    # Any other format: write the string form of the inflated function
    lf = nest.inflate_likelihood_function(fitted['lf'], make_model)
    result.write(str(lf) + '\n')
    return 0
def test_hetero_clock_fit():
    """hetero_clock_fit should fit a molecular clock constrained GTR plus
    Gamma model nested in a GTR plus Gamma model"""
    source_lf = nest.inflate_likelihood_function(_GTRplusGammaClockTest)
    expected = nest.get_expected_no_subs(source_lf)
    alignment = get_aln('GTRplusGammaClockTest', 100000)
    clock_lf, free_lf = nest.hetero_clock_fit(
        alignment, source_lf.tree, outgroup='Opossum', param_limit=20,
        return_lfs=True)

    # The constrained model is expected to have the lower log-likelihood
    assert_less(clock_lf.getLogLikelihood(), free_lf.getLogLikelihood())

    observed_free = nest.get_expected_no_subs(free_lf)
    observed_clock = nest.get_expected_no_subs(clock_lf)
    for taxon in expected:
        for observed in (observed_free, observed_clock):
            assert_almost_equal(observed[taxon], expected[taxon], decimal=2)
def test_seq_fit():
    """seq_fit should fit nested GTR and General models"""
    for model in ('GTR', 'General'):
        # Module-level fixtures are named after the model with a leading '_'
        source_lf = nest.inflate_likelihood_function(eval('_'+model))
        expected = nest.get_expected_no_subs(source_lf)
        alignment = get_aln(model, 100000)
        fitted_lfs = nest.seq_fit(alignment, source_lf.tree, param_limit=20,
                                  return_lfs=model)
        if model == 'General':
            # First fit should have lower log-likelihood than the second
            first_ll = fitted_lfs[0].getLogLikelihood()
            second_ll = fitted_lfs[1].getLogLikelihood()
            assert_less(first_ll, second_ll)
        observed = nest.get_expected_no_subs(fitted_lfs[-1])
        for taxon in expected:
            assert_almost_equal(observed[taxon], expected[taxon], decimal=2)
def generate_alignments(alns=None):
    """Simulate alignments from stored models and save them as gzipped fasta.

    alns is an optional iterable of (model_name, alignment_length) pairs.
    For each pair the module-level document named '_' + model_name is
    inflated, an alignment of the given length is simulated from it, and
    the result is written to '<model_name>_<length>.fasta.gz' in the
    directory returned by data.get_data_dir().

    When alns is omitted only ('GTRClockTest', 100000) and
    ('GeneralBen', 100000) are generated. That is what this function
    effectively did before: a longer list was built and then immediately
    overwritten. The entries of that longer list can be regenerated by
    passing them explicitly:
        ('GTRplusGamma', _GTRplusGamma['aln_length']),
        ('General', _General['aln_length']),
        ('GTR', 100000), ('General', 100000), ('GTRplusGamma', 100000),
        ('GTRplusGammaClockTest', 100000)

    Returns 0.
    """
    from gzip import GzipFile
    from data import get_data_dir
    from os.path import join

    if alns is None:
        alns = [('GTRClockTest', 100000), ('GeneralBen', 100000)]

    for model, aln_len in alns:
        # Look the fixture up by name instead of eval()-ing caller input
        lf = nest.inflate_likelihood_function(globals()['_' + model])
        aln = lf.simulateAlignment(aln_len)
        filename = '_'.join((model, str(aln_len))) + '.fasta.gz'
        with GzipFile(join(get_data_dir(), filename), 'w') as aln_file:
            aln_file.write(aln.toFasta())
    return 0
def omega(aln, tree, result, model, genetic_code, outgroup, neutral, format):
    """
    Fit the selected model to the input fasta ALN with the selected TREE
    and output the RESULT, with specific constraints on omega.
    """
    # Raw alignment is passed through _decompress_if_zipped before fitting
    alignment_text = _decompress_if_zipped(aln.read())
    request = {'tree': tree.read().strip(), 'aln': alignment_text}
    fitted = omega_module.ml(request, model=model, gc=genetic_code,
                             outgroup=outgroup, neutral=neutral)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    def make_model():
        # Model factory: instantiate the class named `model` from ml
        return getattr(ml, model)()

    # Any other format: write the string form of the inflated function
    lf = nest.inflate_likelihood_function(fitted['lf'], make_model)
    result.write(str(lf) + '\n')
    return 0
def rooted(aln, tree, result, genetic_code, format):
    """
    Fit GNC to the input fasta ALN with the selected TREE and output the
    RESULT. Parameters other than the scale parameter are constrained to be
    equal on branches connected to the root.
    """
    alignment_text = _decompress_if_zipped(aln.read())
    treestring = tree.read().strip()

    # Parse the tree only to find the two edges attached to the root
    root_children = LoadTree(treestring=treestring).Children
    assert len(root_children) == 2, 'Tree must be edge-rooted'
    rooted_edges = [child.Name for child in root_children]

    request = {'tree': treestring, 'aln': alignment_text}
    fitted = ml.rooted(request, rooted_edges=rooted_edges, gc=genetic_code)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    # Any other format: write the string form of the inflated function
    lf = nest.inflate_likelihood_function(fitted['lf'], ml.GNC)
    result.write(str(lf) + '\n')
    return 0
def clock(aln, tree, outgroup, result, model, omega_indep, genetic_code,
          format):
    """
    Fit the selected model to the input fasta ALN with the input TREE with
    genetic distance constrained to be equal on all branches but the
    OUTGROUP and output the RESULT.
    """
    # Raw alignment is passed through _decompress_if_zipped before fitting
    alignment_text = _decompress_if_zipped(aln.read())
    request = {'tree': tree.read().strip(), 'aln': alignment_text}
    fitted = clock_module.ml(request, model=model, gc=genetic_code,
                             outgroup=outgroup, omega_indep=omega_indep)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    def make_model():
        # Model factory: instantiate the class named `model` from ml
        return getattr(ml, model)()

    # Any other format: write the string form of the inflated function
    lf = nest.inflate_likelihood_function(fitted['lf'], make_model)
    result.write(str(lf) + '\n')
    return 0
def test_clock_fit():
    """clock_fit should fit nested GTR, General, and GeneralBen models, some
    with equal branch lengths"""
    for modelname in ('GTRClockTest', 'GeneralBen'):
        is_gtr = modelname.startswith('GTR')
        source_doc = eval('_' + modelname)
        source_lf = nest.inflate_likelihood_function(source_doc)
        expected = nest.get_expected_no_subs(source_lf)
        alignment = get_aln(modelname, 100000)

        if is_gtr:
            wanted = 'GTR'
        else:
            wanted = 'General'
        fitted_lfs = nest.clock_fit(alignment, source_lf.tree,
                                    outgroup='Opossum', param_limit=20,
                                    return_lfs=wanted)

        # GTR runs use the first pair of results; others use the rest
        if is_gtr:
            clock_lf, free_lf = fitted_lfs[:2]
        else:
            clock_lf, free_lf = fitted_lfs[2:]

        assert_less(clock_lf.getLogLikelihood(), free_lf.getLogLikelihood())
        if modelname == 'GeneralBen':
            assert_less(fitted_lfs[0].getLogLikelihood(),
                        clock_lf.getLogLikelihood())

        observed_free = nest.get_expected_no_subs(free_lf)
        observed_clock = nest.get_expected_no_subs(clock_lf)
        for taxon in expected:
            for observed in (observed_free, observed_clock):
                assert_almost_equal(observed[taxon], expected[taxon],
                                    decimal=2)
def test_constrain_lengths():
    """a GeneralBen model with a shared length on all edges except Opossum
    should optimise to equal Mouse/Human lengths, with lengths equal to
    ENS on every tip"""
    general_lf = inflate_likelihood_function(_General)
    alignment = get_aln('General', _General['aln_length'])
    ben_model = GeneralBen(DNA.Alphabet, recode_gaps=True, model_gaps=False,
                           optimise_motif_probs=True)
    ben_lf = ben_model.makeLikelihoodFunction(general_lf.tree)

    # Free every parameter whose name contains '/'
    slash_params = [name for name in ben_lf.getParamNames() if '/' in name]
    for name in slash_params:
        ben_lf.setParamRule(name, is_independent=True, is_constant=False)

    # One shared length everywhere, then release the Opossum edge
    ben_lf.setParamRule('length', is_independent=False)
    ben_lf.setParamRule('length', edge='Opossum', is_independent=True)

    ben_lf.setAlignment(alignment)
    ben_lf.optimise(local=True, show_progress=False)

    ens = get_expected_no_subs(ben_lf)
    lengths = ben_lf.getParamValueDict(['edge'])['length']
    assert_almost_equal(lengths['Mouse'], lengths['Human'])
    for tip in ben_lf.tree.getTipNames():
        assert_almost_equal(lengths[tip], ens[tip])
def check(self, row):
    """Return allPsubsDLC() for the likelihood function inflated from row."""
    lf = inflate_likelihood_function(row)
    return lf.allPsubsDLC()
def test_get_expected_no_subs():
    """expected_no_subs should return dictionary of ENS by edge"""
    stationary_lf = nest.inflate_likelihood_function(_GeneralStationary)
    ens_by_edge = nest.get_expected_no_subs(stationary_lf)
    # For this fixture each tip's ENS should equal its length parameter
    for tip in stationary_lf.tree.getTipNames():
        tip_length = stationary_lf.getParamValue('length', tip)
        assert_almost_equal(ens_by_edge[tip], tip_length)