def test_prior_model(start_tree, sim_length=100000, summaries=None, thinning_coef=1): posterior = initialize_prior_as_posterior() if summaries is None: summaries = [ s_variable('posterior'), s_variable('mhr'), s_no_admixes() ] proposal = adaptive_proposal() #basic_meta_proposal() sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries} sample_verbose_scheme['posterior'] = (1, 1000) final_tree, final_posterior, results, _ = basic_chain( start_tree, summaries, posterior, proposal, post=None, N=sim_length, sample_verbose_scheme=sample_verbose_scheme, overall_thinning=int(thinning_coef + sim_length / 60000), i_start_from=0, temperature=1.0, proposal_update=None, check_trees=True) print results save_to_csv(results, summaries) return results
def test_prior_model_no_admixes(start_tree, sim_length=100000, summaries=None,
                                thinning_coef=1):
    """Run one prior-targeting chain with adaptive_proposal_no_admix().

    Same flow as a plain prior run, except that the proposal comes from
    adaptive_proposal_no_admix(), tree checking is switched off and nothing
    is printed by this function itself.

    Args:
        start_tree: initial state, passed unchanged to basic_chain.
        sim_length: forwarded to basic_chain as N.
        summaries: summary objects to record; None selects
            s_variable('posterior'), s_variable('mhr') and s_no_admixes().
        thinning_coef: added to sim_length / 60000 to form the
            overall_thinning argument.

    Returns:
        The third element of the tuple returned by basic_chain (after it has
        been passed to save_to_csv).
    """
    posterior = initialize_prior_as_posterior()
    if summaries is None:
        summaries = [s_variable('posterior'), s_variable('mhr'),
                     s_no_admixes()]
    proposal = adaptive_proposal_no_admix()
    # Historical hack, left disabled:
    #   proposal.props=proposal.props[2:]
    #   proposal.params=proposal.params[2:]

    sample_verbose_scheme = dict((summ.name, (1, 0)) for summ in summaries)
    thinning = int(thinning_coef + sim_length / 60000)
    _last_tree, _last_posterior, results, _ = basic_chain(
        start_tree, summaries, posterior, proposal,
        post=None,
        N=sim_length,
        sample_verbose_scheme=sample_verbose_scheme,
        overall_thinning=thinning,
        i_start_from=0,
        temperature=1.0,
        proposal_update=None,
        check_trees=False)
    save_to_csv(results, summaries)
    return results
def run_test(): from Rtree_operations import get_trivial_nodes, create_trivial_tree, get_number_of_ghost_populations, get_max_distance_to_root, get_min_distance_to_root, get_average_distance_to_root from posterior import initialize_prior_as_posterior, initialize_posterior from meta_proposal import basic_meta_proposal from copy import deepcopy from Rtree_to_covariance_matrix import make_covariance N = 3 true_tree = create_trivial_tree(N) proposal_function = basic_meta_proposal() post_fun = initialize_posterior(make_covariance(true_tree)) tree = create_trivial_tree(N) n = 6 import summary summaries = [ summary.s_posterior(), summary.s_variable('mhr'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_basic_tree_statistics(get_number_of_ghost_populations, 'ghost_pops', output='integer'), summary.s_basic_tree_statistics(get_max_distance_to_root, 'max_root'), summary.s_basic_tree_statistics(get_min_distance_to_root, 'min_root'), summary.s_basic_tree_statistics(get_average_distance_to_root, 'average_root'), summary.s_variable('proposal_type', output='string') ] from temperature_scheme import fixed_geometrical sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries} sample_verbose_scheme['posterior'] = (1, 100) #sample_verbose_scheme['min_root']=(1,100) ad = MCMCMC(starting_trees=[deepcopy(tree) for _ in range(n)], posterior_function=post_fun, summaries=summaries, temperature_scheme=fixed_geometrical(10.0, n), printing_schemes=[sample_verbose_scheme for _ in range(n)], iteration_scheme=[40] * 200, overall_thinnings=5, proposal_scheme=[proposal_function for _ in range(n)], cores=n, no_chains=n) ad[0].to_csv(path_or_buf='findme.csv') print set(map(tuple, ad[1])) return ad
def run_posterior_grid(tree_files, alpha, wishart_df):
    """Run simulation_sanity.test_posterior_model once per tree file.

    One worker process is created per entry of tree_files. Each worker loads
    the tree stored in its file, builds a trivial start tree with the same
    number of leaves and writes its samples to
    '<file name without its last suffix>-results.csv'.

    Args:
        tree_files: file names; also determines the pool size.
        alpha: forwarded as resimulate_regrafted_branch_length.
        wishart_df: forwarded as wishart_df.
    """
    # true_trees= [tree_generation_laboratory.load_tree(tree_file) for tree_file in tree_files]
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr', output='double_missing'),
        summary.s_no_admixes(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations, 'ghost_pops',
            output='integer'),
    ]
    for statistic, label in [
            (Rtree_operations.get_max_distance_to_root, 'max_root'),
            (Rtree_operations.get_min_distance_to_root, 'min_root'),
            (Rtree_operations.get_average_distance_to_root, 'average_root')]:
        summaries.append(summary.s_basic_tree_statistics(statistic, label))
    summaries.append(summary.s_basic_tree_statistics(
        tree_statistics.unique_identifier_and_branch_lengths, 'tree',
        output='string'))
    summaries.append(summary.s_variable('proposal_type', output='string'))
    for adaptive_name in ['sliding_regraft_adap_param', 'rescale_adap_param']:
        summaries.append(
            summary.s_variable(adaptive_name, output='double_missing'))
    summaries.append(summary.s_likelihood())
    summaries.append(summary.s_prior())

    def analyse_file(tree_file):
        # Everything before the last '.' (empty when there is no dot at all).
        name_pieces = tree_file.split('.')
        unsuffixed_filename = '.'.join(name_pieces[:-1])
        loaded = tree_generation_laboratory.load_tree(tree_file)
        true_tree = tree_generation_laboratory.identifier_to_tree_clean_wrapper(
            loaded)
        no_leaves = Rtree_operations.get_no_leaves(true_tree)
        s_tree = Rtree_operations.create_trivial_tree(no_leaves)
        simulation_sanity.test_posterior_model(
            true_tree, s_tree, 100,
            summaries=summaries,
            thinning_coef=30,
            wishart_df=wishart_df,
            resimulate_regrafted_branch_length=alpha,
            filename=unsuffixed_filename + '-results.csv')

    from pathos.multiprocessing import Pool
    pool = Pool(len(tree_files))
    pool.map(analyse_file, tree_files)
def run_c(): n = 3 s_trees = [ Rtree_operations.create_trivial_tree(n), Rtree_operations.create_burled_leaved_tree(n, 1.0), Rtree_operations.create_balanced_tree(n, 1.0) ] summaries = [ summary.s_variable('posterior'), summary.s_variable('mhr'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_basic_tree_statistics( Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'), summary.s_basic_tree_statistics( Rtree_operations.get_max_distance_to_root, 'max_root'), summary.s_basic_tree_statistics( Rtree_operations.get_min_distance_to_root, 'min_root'), summary.s_basic_tree_statistics( Rtree_operations.get_average_distance_to_root, 'average_root'), summary.s_variable('proposal_type', output='string'), summary.s_tree_identifier_new_tree() ] + [ summary.s_variable(s, output='double') for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior'] ] simulation_sanity.test_prior_model_several_chains(s_trees, 100000, summaries=summaries, thinning_coef=3) print 'finished mcmc chains' list_of_summaries = summaries[2:10] nsim = 100000 prior_distribution = generate_prior_trees.get_distribution_under_prior( leaves=n, sim_length=nsim, list_of_summaries=list_of_summaries) #, thinning_criteria=max_two) analyse_results.save_to_csv( [tuple(range(nsim))] + [tuple(prior_distribution[summ.name]) for summ in list_of_summaries], list_of_summaries, filename='sim_prior.csv', origin_layer=None) analyse_results.generate_summary_csv(summaries)
def marginalize_out_data_in_posterior(no_leaves, no_trees=100, nsim=50000,
                                      wishart_df=None, prefix='',
                                      dest_folder='', sap_sim=False,
                                      sap_ana=False):
    """Call test_posterior_model no_trees times, one result file per call.

    Call number k (1-based) writes to
    os.path.join(dest_folder, 'results_' + prefix + str(k) + '.csv').
    No true tree is passed, and sim_from_wishart is always True.

    Args:
        no_leaves: forwarded as no_leaves_true_tree.
        no_trees: number of calls.
        nsim: forwarded as sim_length.
        wishart_df: forwarded as wishart_df.
        prefix: inserted in the result file name before the running number.
        dest_folder: directory part of the result file name.
        sap_sim: forwarded as sap_sim.
        sap_ana: forwarded as sap_ana.
    """
    summaries = [
        s_posterior(),
        s_no_admixes(),
        s_average_branch_length(),
        s_total_branch_length(),
        s_basic_tree_statistics(get_number_of_ghost_populations, 'ghost_pops',
                                output='integer'),
    ]
    for statistic, label in [(get_max_distance_to_root, 'max_root'),
                             (get_min_distance_to_root, 'min_root'),
                             (get_average_distance_to_root, 'average_root')]:
        summaries.append(s_basic_tree_statistics(statistic, label))
    for prior_name in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']:
        summaries.append(s_variable(prior_name, output='double'))

    # The same summary objects are reused for every run.
    for run_number in xrange(1, no_trees + 1):
        file_name = 'results_' + prefix + str(run_number) + '.csv'
        result_file = os.path.join(dest_folder, file_name)
        test_posterior_model(thinning_coef=49,
                             summaries=summaries,
                             filename=result_file,
                             sim_from_wishart=True,
                             sim_length=nsim,
                             wishart_df=wishart_df,
                             no_leaves_true_tree=no_leaves,
                             sap_sim=sap_sim,
                             sap_ana=sap_ana)
def mcmcmc(observed_covariance, df, outgroup=False, chains=8, its=None):
    """Run MCMCMC on an observed covariance and return the layer-0 rows.

    Args:
        observed_covariance: object with a .shape attribute; one node label
            's1', 's2', ... is generated per row.
        df: forwarded to posterior_class as M.
        outgroup: forwarded to options_object.
        chains: number of chains; also used as the number of cores and as the
            length of the temperature scheme.
        its: iteration scheme handed to MCMCMC. None (the default) means
            [50] * 100. A fresh list is built on every call so that the
            default can never be shared between calls or altered through the
            callee.

    Returns:
        The columns 'iteration', 'posterior', 'tree' and 'no_admixes' of the
        rows whose layer equals 0 in the table returned by MCMCMC.
    """
    if its is None:
        its = [50] * 100

    nodes = ['s' + str(i + 1) for i in range(observed_covariance.shape[0])]
    # Not used below. Kept because simulate_tree presumably draws random
    # numbers, so removing the call would shift the random stream --
    # TODO confirm and drop.
    start_x = identifier_to_tree_clean(simulate_tree(4, 0)), 0
    summaries = [
        summary.s_posterior(),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths, 'tree',
            output='string'),
        summary.s_variable('add', output='double'),
        summary.s_no_admixes(),
    ]
    options = options_object(outgroup, chains=chains)
    proposal = make_proposal(options)
    posterior_function = posterior_class(observed_covariance, M=df,
                                         nodes=nodes)

    # The first chain has its own scheme; the remaining chains all share one
    # dict object.
    first_scheme = {'posterior': (1, 200), 'tree': (1, 0), 'add': (1, 200),
                    'no_admixes': (1, 200)}
    other_scheme = {s.name: (1, 0) for s in summaries}
    sample_verbose_scheme = [first_scheme] + [other_scheme] * (chains - 1)

    # NOTE(review): the start trees are always simulate_tree(4, 0) while the
    # node labels follow observed_covariance.shape[0]; confirm that this is
    # only meant for 4-population data.
    starting_trees = [(identifier_to_tree_clean(simulate_tree(4, 0)), 0)
                      for _ in range(chains)]
    res = MCMCMC(starting_trees=starting_trees,
                 posterior_function=posterior_function,
                 summaries=summaries,
                 temperature_scheme=fixed_geometrical(800, chains),
                 printing_schemes=sample_verbose_scheme,
                 iteration_scheme=its,
                 overall_thinnings=40,
                 proposal_scheme=proposal,
                 cores=chains,
                 no_chains=chains,
                 multiplier=None,
                 result_file=None,
                 store_permuts=False,
                 stop_criteria=None)
    res = res.loc[res.layer == 0,
                  ['iteration', 'posterior', 'tree', 'no_admixes']]
    return res
def analyse_data_single_chained(filename): emp_cov = load_data.read_data( filename, nodes=['French', 'Han', 'Karitiana', 'Sardinian', 'Yoruba'], noss=True) print emp_cov df = 100 summaries = [ summary.s_posterior(), summary.s_variable('mhr'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_basic_tree_statistics( Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'), summary.s_basic_tree_statistics( Rtree_operations.get_max_distance_to_root, 'max_root'), summary.s_basic_tree_statistics( Rtree_operations.get_min_distance_to_root, 'min_root'), summary.s_basic_tree_statistics( Rtree_operations.get_average_distance_to_root, 'average_root'), summary.s_basic_tree_statistics( tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'), summary.s_basic_tree_statistics( tree_statistics.majority_tree, 'majority_tree', output='string'), summary.s_variable('proposal_type', output='string'), summary.s_variable('sliding_regraft_adap_param'), summary.s_variable('rescale_adap_param'), summary.s_tree_identifier_new_tree() ] + [ summary.s_variable(s, output='double_missing') for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior'] ] r = simulation_sanity.test_posterior_model(None, None, 300000, summaries=summaries, thinning_coef=20, wishart_df=df, emp_cov=emp_cov, no_leaves_true_tree=5)
def run_a():
    """Run one prior chain from a 4-leaf "burled leaved" start tree.

    Builds the summary list and calls simulation_sanity.test_prior_model with
    50000 as second positional argument and thinning_coef=3.

    The earlier version ended with a nested helper (max_two) and two locals
    (list_of_summaries, nsim) that were never used; they had no effect and
    have been removed.
    """
    n = 4
    s_tree = Rtree_operations.create_burled_leaved_tree(n, 1)
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations, 'ghost_pops',
            output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_tree_identifier_new_tree(),
    ] + [
        summary.s_variable(s, output='double')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    simulation_sanity.test_prior_model(s_tree, 50000, summaries=summaries,
                                       thinning_coef=3)
def test_prior_model_several_chains(start_trees, sim_length=100000,
                                    summaries=None, thinning_coef=1):
    """Run one independent prior-targeting chain per start tree, in parallel.

    A pool with len(start_trees) workers is created and every worker runs
    basic_chain from its own start tree. Chain number k (1-based) saves its
    samples to 'results_<k>.csv' with origin_layer=(k, 1).

    The file name used to be assembled without the dot before the extension
    ('results_1csv'); it now ends in '.csv'.
    NOTE(review): check that no reader elsewhere relies on the old
    dot-less names.

    Args:
        start_trees: one initial state per chain.
        sim_length: forwarded to basic_chain as N.
        summaries: summary objects to record; None selects
            s_variable('posterior'), s_variable('mhr') and s_no_admixes().
        thinning_coef: added to sim_length / 60000 to form the
            overall_thinning argument.
    """
    posterior = initialize_prior_as_posterior()
    if summaries is None:
        summaries = [s_variable('posterior'), s_variable('mhr'),
                     s_no_admixes()]
    proposal = basic_meta_proposal()
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    p = Pool(len(start_trees))

    def func(nstart_tree):
        # nstart_tree is one (index, start tree) pair produced by enumerate.
        n, start_tree = nstart_tree
        final_tree, final_posterior, results, _ = basic_chain(
            start_tree, summaries, posterior, proposal,
            post=None,
            N=sim_length,
            sample_verbose_scheme=sample_verbose_scheme,
            overall_thinning=int(thinning_coef + sim_length / 60000),
            i_start_from=0,
            temperature=1.0,
            proposal_update=None,
            check_trees=True)
        save_to_csv(results, summaries,
                    filename='results_' + str(n + 1) + '.csv',
                    origin_layer=(n + 1, 1))

    p.map(func, enumerate(start_trees))
def get_summaries(true_tree, df=10):
    """Build a summary list whose posterior and prior entries are recalculated.

    A posterior function is created from the covariance of true_tree and
    handed as pks_function to the s_variable_recalculated summaries
    ('posterior', 'prior', 'branch_prior', 'no_admix_prior', 'top_prior').

    Args:
        true_tree: tree passed to make_covariance.
        df: second argument of initialize_posterior.

    Returns:
        The list of summary objects.
    """
    covariance = make_covariance(true_tree)
    posterior = initialize_posterior(covariance, df)

    summaries = [
        summary.s_variable_recalculated('posterior', output='double',
                                        pks_function=posterior),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations, 'ghost_pops',
            output='integer'),
    ]
    for statistic, label in [
            (Rtree_operations.get_max_distance_to_root, 'max_root'),
            (Rtree_operations.get_min_distance_to_root, 'min_root'),
            (Rtree_operations.get_average_distance_to_root, 'average_root')]:
        summaries.append(summary.s_basic_tree_statistics(statistic, label))
    summaries.append(summary.s_basic_tree_statistics(
        tree_statistics.unique_identifier_and_branch_lengths, 'tree',
        output='string'))
    summaries.append(summary.s_variable('proposal_type', output='string'))
    for adaptive_name in ['sliding_regraft_adap_param', 'rescale_adap_param']:
        summaries.append(
            summary.s_variable(adaptive_name, output='double_missing'))
    summaries.append(summary.s_tree_identifier_new_tree())
    for prior_name in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']:
        summaries.append(summary.s_variable_recalculated(
            prior_name, output='double', pks_function=posterior))
    return summaries
def mcmc(observed_covariance, df, outgroup=False):
    """Run a single 5000-step chain on an observed covariance.

    Args:
        observed_covariance: object with a .shape attribute; one node label
            's1', 's2', ... is generated per row.
        df: forwarded to posterior_class as M.
        outgroup: forwarded to options_object.

    Returns:
        The third element of the tuple returned by basic_chain.
    """
    no_nodes = observed_covariance.shape[0]
    nodes = ['s' + str(number) for number in range(1, no_nodes + 1)]
    # NOTE(review): the start tree is always simulate_tree(4, 0), whatever
    # the size of observed_covariance -- confirm this is intended.
    start_x = (identifier_to_tree_clean(simulate_tree(4, 0)), 0)

    tree_summary = summary.s_basic_tree_statistics(
        tree_statistics.unique_identifier_and_branch_lengths, 'tree',
        output='string')
    summaries = [summary.s_posterior()]
    summaries.append(tree_summary)
    summaries.append(summary.s_variable('add', output='double'))
    summaries.append(summary.s_no_admixes())

    options = options_object(outgroup)
    # Only the first element of what make_proposal returns is used.
    proposal = make_proposal(options)[0]
    posterior_function = posterior_class(observed_covariance, M=df,
                                         nodes=nodes)
    sample_verbose_scheme = {
        'posterior': (1, 200),
        'tree': (1, 0),
        'add': (1, 200),
        'no_admixes': (1, 200),
    }
    chain_output = basic_chain(start_x, summaries, posterior_function,
                               proposal,
                               post=None,
                               N=5000,
                               sample_verbose_scheme=sample_verbose_scheme,
                               overall_thinning=100,
                               i_start_from=0,
                               temperature=1.0,
                               proposal_update=None,
                               multiplier=None,
                               check_trees=False,
                               appending_result_file=None,
                               appending_result_frequency=10)
    return chain_output[2]
def run_d(true_tree_as_file=None): #true_tree=generate_prior_trees.generate_phylogeny(8,2) if true_tree_as_file is None: true_tree = tree_statistics.identifier_to_tree_clean( 'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485' ) #true_tree=Rcatalogue_of_trees.tree_good s_tree = tree_statistics.identifier_to_tree_clean( 'w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.3.w.w-c.w.c.2.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23' ) print Rtree_operations.pretty_string(s_tree) print Rtree_operations.pretty_string(true_tree) else: with open(true_tree_as_file, 'r') as f: s = f.readline().rstrip() true_tree = tree_statistics.identifier_to_tree_clean(s) no_leaves = Rtree_operations.get_number_of_leaves(true_tree) s_tree = Rtree_operations.create_trivial_tree(no_leaves) summaries = [ summary.s_posterior(), summary.s_variable('mhr', output='double_missing'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_basic_tree_statistics( Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'), summary.s_basic_tree_statistics( Rtree_operations.get_max_distance_to_root, 'max_root'), summary.s_basic_tree_statistics( Rtree_operations.get_min_distance_to_root, 'min_root'), summary.s_basic_tree_statistics( Rtree_operations.get_average_distance_to_root, 'average_root'), summary.s_basic_tree_statistics( tree_statistics.get_admixture_proportion_string, 'admixtures', output='string'), summary.s_basic_tree_statistics( tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'), summary.s_basic_tree_statistics( 
tree_statistics.majority_tree, 'majority_tree', output='string'), summary.s_variable('add', output='double'), summary.s_variable('sliding_rescale_adap_param', output='double_missing'), summary.s_variable('cutoff_distance', output='double_missing'), summary.s_variable('number_of_pieces', output='double_missing'), summary.s_variable('proposal_type', output='string'), summary.s_variable('sliding_regraft_adap_param', output='double_missing'), summary.s_variable('rescale_constrained_adap_param', output='double_missing'), summary.s_variable('rescale_adap_param', output='double_missing'), summary.s_tree_identifier_new_tree() ] + [ summary.s_variable(s, output='double_missing') for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior'] ] r = simulation_sanity.test_posterior_model( true_tree, s_tree, 100000, summaries=summaries, thinning_coef=20, wishart_df=10000, resimulate_regrafted_branch_length=False) #, #admixtures_of_true_tree=2, no_leaves_true_tree=8, rescale_empirical_cov=True) print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r) analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
def get_summary_scheme(majority_tree=False, light_newick_tree_summaries=False,
                       full_tree=True, proposals=None,
                       acceptance_rate_information=False,
                       admixture_proportion_string=False, priors=False,
                       no_chains=1, nodes=None, verbose_level='normal',
                       only_coldest_chain=True):
    """Assemble the summaries and the per-chain printing schemes.

    Args:
        majority_tree: not read by this function.
        light_newick_tree_summaries: add 'Zero_Ntree', 'random_Ntree' and
            'mode_Ntree'.
        full_tree: add the 'tree' summary.
        proposals: object with a .props list; every entry must have
            .proposal_name and .adaption. Only consulted for the
            '<name>_adap_param' summaries.
        acceptance_rate_information: add 'mhr', 'proposal_type' and, when
            proposals is given, one '<name>_adap_param' per adaptive proposal.
        admixture_proportion_string: add the 'admixtures' summary.
        priors: not read by this function.
        no_chains: number of schemes to return.
        nodes: not read by this function.
        verbose_level: with 'silent' the first scheme gets no (1, 1) entries.
        only_coldest_chain: if true, chains after the first get an empty
            scheme.

    Returns:
        (list of scheme dicts, one per chain; list of summary objects).
    """
    if proposals is not None:
        props = proposals.props
        prop_names = [one_prop.proposal_name for one_prop in props]
        adaption = [one_prop.adaption for one_prop in props]

    summaries = [
        summary.s_posterior(),
        summary.s_likelihood(),
        summary.s_prior(),
        summary.s_no_admixes(),
        summary.s_variable('add', output='double'),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations, 'ghost_pops',
            output='integer'),
    ]
    for statistic, label in [
            (Rtree_operations.get_max_distance_to_root, 'max_root'),
            (Rtree_operations.get_min_distance_to_root, 'min_root'),
            (Rtree_operations.get_average_distance_to_root, 'average_root')]:
        summaries.append(summary.s_basic_tree_statistics(statistic, label))
    summaries.append(summary.s_basic_tree_statistics(
        Rtree_to_covariance_matrix.get_populations_string, 'descendant_sets',
        output='string'))

    # Optional string-valued tree summaries, in this fixed order.
    optional_statistics = []
    if full_tree:
        optional_statistics.append(
            (tree_statistics.unique_identifier_and_branch_lengths, 'tree'))
    if admixture_proportion_string:
        optional_statistics.append(
            (tree_statistics.get_admixture_proportion_string, 'admixtures'))
    if light_newick_tree_summaries:
        optional_statistics.append(
            (tree_statistics.tree_to_0ntree, 'Zero_Ntree'))
        optional_statistics.append(
            (tree_statistics.tree_to_random_ntree, 'random_Ntree'))
        optional_statistics.append(
            (tree_statistics.tree_to_mode_ntree, 'mode_Ntree'))
    for statistic, label in optional_statistics:
        summaries.append(
            summary.s_basic_tree_statistics(statistic, label, output='string'))

    if acceptance_rate_information:
        summaries.append(summary.s_variable('mhr', output='double_missing'))
        summaries.append(summary.s_variable('proposal_type', output='string'))
        if proposals is not None:
            for prop_name, adapt in zip(prop_names, adaption):
                if not adapt:
                    continue
                summaries.append(summary.s_variable(
                    prop_name + "_adap_param", output='double_missing'))

    sample_verbose_scheme = dict((summ.name, (1, 0)) for summ in summaries)
    sample_verbose_scheme_first = deepcopy(sample_verbose_scheme)
    if verbose_level != 'silent' and 'posterior' in sample_verbose_scheme:
        sample_verbose_scheme_first['posterior'] = (1, 1)
        # sample_verbose_scheme_first['prior']=(1,1)
        sample_verbose_scheme_first['no_admixes'] = (1, 1)

    schemes = [sample_verbose_scheme_first]
    if no_chains == 1:
        return schemes, summaries
    if only_coldest_chain:
        # The extra chains all refer to one and the same empty dict.
        return schemes + [{}] * (no_chains - 1), summaries
    return schemes + [sample_verbose_scheme] * (no_chains - 1), summaries
def test_posterior_model(true_tree=None, start_tree=None, sim_length=100000, summaries=None, thinning_coef=19, admixtures_of_true_tree=None, no_leaves_true_tree=4, filename='results.csv', sim_from_wishart=False, wishart_df=None, sap_sim=False, sap_ana=False, resimulate_regrafted_branch_length=False, emp_cov=None, big_posterior=False, rescale_empirical_cov=False): if true_tree is None: if admixtures_of_true_tree is None: admixtures_of_true_tree = geom.rvs(p=0.5) - 1 true_tree = generate_phylogeny(no_leaves_true_tree, admixtures_of_true_tree, skewed_admixture_prior=sap_sim) else: no_leaves_true_tree = get_no_leaves(true_tree) admixtures_of_true_tree = get_number_of_admixes(true_tree) true_x = (true_tree, 0) m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree)) if start_tree is None: start_tree = true_tree start_x = (start_tree, 0) if wishart_df is None: wishart_df = n_mark(m) if sim_from_wishart: r = m.shape[0] print m m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df)) print m if emp_cov is not None: m = emp_cov if big_posterior: posterior = initialize_big_posterior(m, wishart_df, use_skewed_distr=sap_ana) else: posterior = initialize_posterior(m, wishart_df, use_skewed_distr=sap_ana, rescale=rescale_empirical_cov) print 'true_tree=', unique_identifier_and_branch_lengths(true_tree) post_ = posterior(true_x) print 'likelihood(true_tree)', post_[0] print 'prior(true_tree)', post_[1] print 'posterior(true_tree)', sum(post_[:2]) if summaries is None: summaries = [s_posterior(), s_variable('mhr'), s_no_admixes()] proposal = adaptive_proposal( resimulate_regrafted_branch_length=resimulate_regrafted_branch_length) #proposal.props=proposal.props[2:] #a little hack under the hood #proposal.params=proposal.params[2:] #a little hack under the hood. 
sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries} sample_verbose_scheme['posterior'] = (1, 1) sample_verbose_scheme['no_admixes'] = (1, 1) final_tree, final_posterior, results, _ = basic_chain( start_x, summaries, posterior, proposal, post=None, N=sim_length, sample_verbose_scheme=sample_verbose_scheme, overall_thinning=int(max(thinning_coef, sim_length / 60000)), i_start_from=0, temperature=1.0, proposal_update=None, check_trees=False) save_to_csv(results, summaries, filename=filename) return true_tree
def test_posterior_model_multichain(true_tree=None, start_tree=None, sim_lengths=[250] * 800, summaries=None, thinning_coef=1, admixtures_of_true_tree=None, no_leaves_true_tree=4, wishart_df=None, sim_from_wishart=False, no_chains=8, result_file='results_mc3.csv', emp_cov=None, emp_remove=-1, rescale_empirical_cov=False): if true_tree is None: if admixtures_of_true_tree is None: admixtures_of_true_tree = geom.rvs(p=0.5) - 1 true_tree = generate_phylogeny(no_leaves_true_tree, admixtures_of_true_tree) else: no_leaves_true_tree = get_no_leaves(true_tree) admixtures_of_true_tree = get_number_of_admixes(true_tree) true_x = (true_tree, 0) m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree)) if start_tree is None: start_tree = true_tree start_x = (start_tree, 0) if wishart_df is None: wishart_df = n_mark(m) if sim_from_wishart: r = m.shape[0] print m m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df)) print m if emp_cov is not None: m = emp_cov if rescale_empirical_cov: posterior, multiplier = initialize_posterior( m, wishart_df, use_skewed_distr=True, rescale=rescale_empirical_cov) else: posterior = initialize_posterior(m, wishart_df, use_skewed_distr=True, rescale=rescale_empirical_cov) multiplier = None print 'true_tree=', unique_identifier_and_branch_lengths(true_tree) if rescale_empirical_cov: post_ = posterior( (scale_tree_copy(true_x[0], 1.0 / multiplier), true_x[1] / multiplier)) else: post_ = posterior(true_x) print 'likelihood(true_tree)', post_[0] print 'prior(true_tree)', post_[1] print 'posterior(true_tree)', sum(post_) if summaries is None: summaries = [ s_variable('posterior'), s_variable('mhr'), s_no_admixes() ] proposal = basic_meta_proposal() #proposal.props=proposal.props[2:] #a little hack under the hood #proposal.params=proposal.params[2:] #a little hack under the hood. 
sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries} sample_verbose_scheme_first = deepcopy(sample_verbose_scheme) if 'posterior' in sample_verbose_scheme: sample_verbose_scheme_first['posterior'] = (1, 1) #(1,1) sample_verbose_scheme_first['no_admixes'] = (1, 1) #if 'likelihood' in sample_verbose_scheme: #sample_verbose_scheme_first['likelihood']=(1,1) print sample_verbose_scheme_first MCMCMC(starting_trees=[deepcopy(start_x) for _ in range(no_chains)], posterior_function=posterior, summaries=summaries, temperature_scheme=fixed_geometrical(800.0, no_chains), printing_schemes=[sample_verbose_scheme_first] + [sample_verbose_scheme for _ in range(no_chains - 1)], iteration_scheme=sim_lengths, overall_thinnings=int(thinning_coef), proposal_scheme=[adaptive_proposal() for _ in range(no_chains)], cores=no_chains, no_chains=no_chains, multiplier=multiplier, result_file=result_file, store_permuts=False) print 'finished MC3' #save_pandas_dataframe_to_csv(results, result_file) #save_permuts_to_csv(permuts, get_permut_filename(result_file)) return true_tree
def run_analysis_of_proposals(): #true_tree=generate_prior_trees.generate_phylogeny(8,2) true_tree = tree_statistics.identifier_to_tree_clean( 'w.w.c.w.w.w.2.w-w.w.a.w.w.w.w-w.c.1.w.c.w.w.4-w.c.1.w.w.w-w.c.1.w.w-c.0.w.w-c.w.0-a.w-c.0.w-c.0;0.091-1.665-0.263-0.821-0.058-0.501-0.141-0.868-5.064-0.153-0.372-3.715-1.234-0.913-2.186-0.168-0.542-0.056-2.558-0.324;0.367-0.451' ) true_tree = Rcatalogue_of_trees.tree_good s_tree = Rtree_operations.create_trivial_tree(4) summaries = [ summary.s_posterior(), summary.s_variable('mhr'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_basic_tree_statistics( Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'), summary.s_basic_tree_statistics( Rtree_operations.get_max_distance_to_root, 'max_root'), summary.s_basic_tree_statistics( Rtree_operations.get_min_distance_to_root, 'min_root'), summary.s_basic_tree_statistics( Rtree_operations.get_average_distance_to_root, 'average_root'), summary.s_basic_tree_statistics( tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'), summary.s_basic_tree_statistics( tree_statistics.majority_tree, 'majority_tree', output='string'), summary.s_bposterior_difference(lambda x: x[0], 'likelihood_difference'), summary.s_bposterior_difference(lambda x: x[1], 'prior_difference'), summary.s_bposterior_difference(lambda x: x[2][0], 'branch_prior_difference'), summary.s_bposterior_difference(lambda x: x[2][1], 'no_admix_prior_difference'), summary.s_bposterior_difference(lambda x: x[2][2], 'adix_prop_prior_difference'), summary.s_bposterior_difference(lambda x: x[2][3], 'top_prior_difference'), summary.s_variable('proposal_type', output='string'), summary.s_variable('sliding_regraft_adap_param', output='double_missing'), summary.s_variable('rescale_adap_param', output='double_missing'), summary.s_tree_identifier_new_tree() ] + [ summary.s_variable(s, 
output='double_missing') for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior'] ] r = simulation_sanity.test_posterior_model( true_tree, true_tree, 100000, summaries=summaries, thinning_coef=2, wishart_df=1000, resimulate_regrafted_branch_length=False, admixtures_of_true_tree=2, no_leaves_true_tree=4, big_posterior=True, rescale_empirical_cov=True) print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r) analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
def run_posterior_multichain(wishart_df=1000, true_tree_as_identifier=None, result_file='result_mc3.csv', emp_cov_file=None, emp_remove=-1, remove_outgroup=False, make_emp_cov_file=True): if true_tree_as_identifier is None: true_tree = Rcatalogue_of_trees.tree_good else: true_tree = tree_statistics.identifier_to_tree_clean( 'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485' ) #with open(true_tree_as_identifier, 'r') as f: # s=f.readline().rstrip() # true_tree=tree_statistics.identifier_to_tree_clean(s) if remove_outgroup: true_tree = Rtree_operations.remove_outgroup(true_tree) true_tree = Rtree_operations.simple_reorder_the_leaves_after_removal_of_s1( true_tree) if make_emp_cov_file: cov = tree_to_data.get_empirical_matrix(s, factor=0.01, reps=400) tree_to_data.emp_cov_to_file(cov, filename=emp_cov_file) print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths( true_tree) no_leaves = Rtree_operations.get_no_leaves(true_tree) #s_tree=tree_statistics.identifier_to_tree_clean('w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.3.w.w-c.w.c.2.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23') s_tree = Rtree_operations.create_burled_leaved_tree(no_leaves, 1.0) print 'no_leaves', no_leaves summaries = [ summary.s_posterior(), summary.s_variable('mhr'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_basic_tree_statistics( Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'), summary.s_basic_tree_statistics( Rtree_operations.get_max_distance_to_root, 'max_root'), summary.s_basic_tree_statistics( 
Rtree_operations.get_min_distance_to_root, 'min_root'), summary.s_basic_tree_statistics( Rtree_operations.get_average_distance_to_root, 'average_root'), summary.s_basic_tree_statistics( tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'), summary.s_basic_tree_statistics( tree_statistics.majority_tree, 'majority_tree', output='string'), summary.s_variable('add', output='double'), summary.s_variable('proposal_type', output='string'), summary.s_variable('sliding_regraft_adap_param', output='double_missing'), summary.s_variable('rescale_adap_param', output='double_missing'), summary.s_likelihood(), summary.s_prior(), summary.s_tree_identifier_new_tree() ] + [ summary.s_variable(s, output='double_missing') for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior'] ] if emp_cov_file is not None: if emp_remove < 0: emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file) else: emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file, emp_remove) else: emp_cov = None print 'emp_cov', emp_cov r = simulation_sanity.test_posterior_model_multichain( true_tree, s_tree, [50] * 20000, summaries=summaries, thinning_coef=24, wishart_df=wishart_df, result_file=result_file, emp_cov=emp_cov, rescale_empirical_cov=False) print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r) analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
#call_notebook() def call_notebook(): ## DOESNT WORK IN THE MAC dir_path = os.path.dirname(os.path.realpath(__file__)) cmd = ['Rscript', dir_path + os.path.sep + 'order_report.R'] print cmd call(cmd) if __name__ == '__main__': import summary summaries = [ summary.s_variable('posterior'), summary.s_variable('mhr'), summary.s_no_admixes(), summary.s_tree_identifier(), summary.s_average_branch_length(), summary.s_total_branch_length(), summary.s_tree_identifier_new_tree() ] + [ summary.s_variable(s) for s in [ 'backward_choices', 'backward_density', 'forward_density', 'forward_choices', 'proposal_type', 'prior', 'branch_prior', 'no_admix_prior', 'top_prior' ] ] from generate_prior_trees import get_distribution_under_prior prior_distribution = get_distribution_under_prior(