Ejemplo n.º 1
0
def run_test():
    from Rtree_operations import get_trivial_nodes, create_trivial_tree, get_number_of_ghost_populations, get_max_distance_to_root, get_min_distance_to_root, get_average_distance_to_root
    from posterior import initialize_prior_as_posterior, initialize_posterior
    from meta_proposal import basic_meta_proposal
    from copy import deepcopy
    from Rtree_to_covariance_matrix import make_covariance

    N = 3
    true_tree = create_trivial_tree(N)
    proposal_function = basic_meta_proposal()
    post_fun = initialize_posterior(make_covariance(true_tree))
    tree = create_trivial_tree(N)

    n = 6
    import summary
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(get_number_of_ghost_populations,
                                        'ghost_pops',
                                        output='integer'),
        summary.s_basic_tree_statistics(get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(get_average_distance_to_root,
                                        'average_root'),
        summary.s_variable('proposal_type', output='string')
    ]

    from temperature_scheme import fixed_geometrical
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme['posterior'] = (1, 100)
    #sample_verbose_scheme['min_root']=(1,100)

    ad = MCMCMC(starting_trees=[deepcopy(tree) for _ in range(n)],
                posterior_function=post_fun,
                summaries=summaries,
                temperature_scheme=fixed_geometrical(10.0, n),
                printing_schemes=[sample_verbose_scheme for _ in range(n)],
                iteration_scheme=[40] * 200,
                overall_thinnings=5,
                proposal_scheme=[proposal_function for _ in range(n)],
                cores=n,
                no_chains=n)

    ad[0].to_csv(path_or_buf='findme.csv')
    print set(map(tuple, ad[1]))
    return ad
Ejemplo n.º 2
0
def run_c():
    n = 3
    s_trees = [
        Rtree_operations.create_trivial_tree(n),
        Rtree_operations.create_burled_leaved_tree(n, 1.0),
        Rtree_operations.create_balanced_tree(n, 1.0)
    ]
    summaries = [
        summary.s_variable('posterior'),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations,
            'ghost_pops',
            output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s, output='double')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]

    simulation_sanity.test_prior_model_several_chains(s_trees,
                                                      100000,
                                                      summaries=summaries,
                                                      thinning_coef=3)
    print 'finished mcmc chains'
    list_of_summaries = summaries[2:10]
    nsim = 100000
    prior_distribution = generate_prior_trees.get_distribution_under_prior(
        leaves=n, sim_length=nsim,
        list_of_summaries=list_of_summaries)  #, thinning_criteria=max_two)
    analyse_results.save_to_csv(
        [tuple(range(nsim))] +
        [tuple(prior_distribution[summ.name]) for summ in list_of_summaries],
        list_of_summaries,
        filename='sim_prior.csv',
        origin_layer=None)
    analyse_results.generate_summary_csv(summaries)
Ejemplo n.º 3
0
def analyse_data_single_chained(filename):
    emp_cov = load_data.read_data(
        filename,
        nodes=['French', 'Han', 'Karitiana', 'Sardinian', 'Yoruba'],
        noss=True)
    print emp_cov
    df = 100
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations,
            'ghost_pops',
            output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths,
            'tree',
            output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.majority_tree, 'majority_tree', output='string'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param'),
        summary.s_variable('rescale_adap_param'),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s, output='double_missing')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    r = simulation_sanity.test_posterior_model(None,
                                               None,
                                               300000,
                                               summaries=summaries,
                                               thinning_coef=20,
                                               wishart_df=df,
                                               emp_cov=emp_cov,
                                               no_leaves_true_tree=5)
Ejemplo n.º 4
0
def run_a():
    """Sample the prior with one chain started from a four-leaf tree.

    The chain is run through ``simulation_sanity.test_prior_model``.
    NOTE(review): ``max_two``, ``list_of_summaries`` and ``nsim`` are set up
    afterwards but never used in the visible body -- confirm whether the
    function is meant to continue with a direct prior simulation.
    """
    no_leaves = 4
    start_tree = Rtree_operations.create_burled_leaved_tree(no_leaves, 1)

    tree_stat = summary.s_basic_tree_statistics
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        tree_stat(Rtree_operations.get_number_of_ghost_populations,
                  'ghost_pops', output='integer'),
        tree_stat(Rtree_operations.get_max_distance_to_root, 'max_root'),
        tree_stat(Rtree_operations.get_min_distance_to_root, 'min_root'),
        tree_stat(Rtree_operations.get_average_distance_to_root,
                  'average_root'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_tree_identifier_new_tree(),
    ]
    prior_terms = ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    summaries.extend(
        [summary.s_variable(term, output='double') for term in prior_terms])

    simulation_sanity.test_prior_model(
        start_tree, 50000, summaries=summaries, thinning_coef=3)

    def max_two(tree):
        """Return False when ``tree`` has more than two admixes, else True."""
        return not (Rtree_operations.get_number_of_admixes(tree) > 2)

    list_of_summaries = summaries[2:10]
    nsim = 100000
Ejemplo n.º 5
0
def get_summaries(true_tree, df=10):
    """Build the summary list with recalculated posterior and prior terms.

    A posterior function is initialised from the covariance matrix of
    ``true_tree`` and passed as ``pks_function`` to the recalculated
    summaries ('posterior', 'prior', 'branch_prior', 'no_admix_prior' and
    'top_prior').

    Args:
        true_tree: Tree passed to ``make_covariance``.
        df: Second argument passed to ``initialize_posterior``.

    Returns:
        A list of summary objects.
    """
    posterior = initialize_posterior(make_covariance(true_tree), df)

    def recalculated(name):
        # All recalculated summaries share the same output type and
        # posterior function.
        return summary.s_variable_recalculated(
            name, output='double', pks_function=posterior)

    tree_stat = summary.s_basic_tree_statistics
    summaries = [
        recalculated('posterior'),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        tree_stat(Rtree_operations.get_number_of_ghost_populations,
                  'ghost_pops', output='integer'),
        tree_stat(Rtree_operations.get_max_distance_to_root, 'max_root'),
        tree_stat(Rtree_operations.get_min_distance_to_root, 'min_root'),
        tree_stat(Rtree_operations.get_average_distance_to_root,
                  'average_root'),
        tree_stat(tree_statistics.unique_identifier_and_branch_lengths,
                  'tree', output='string'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param',
                           output='double_missing'),
        summary.s_variable('rescale_adap_param', output='double_missing'),
        summary.s_tree_identifier_new_tree(),
    ]
    for term in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']:
        summaries.append(recalculated(term))
    return summaries
Ejemplo n.º 6
0
def run_posterior_multichain(wishart_df=1000,
                             true_tree_as_identifier=None,
                             result_file='result_mc3.csv',
                             emp_cov_file=None,
                             emp_remove=-1,
                             remove_outgroup=False,
                             make_emp_cov_file=True):
    if true_tree_as_identifier is None:
        true_tree = Rcatalogue_of_trees.tree_good
    else:
        true_tree = tree_statistics.identifier_to_tree_clean(
            'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485'
        )

        #with open(true_tree_as_identifier, 'r') as f:
        #    s=f.readline().rstrip()
        #    true_tree=tree_statistics.identifier_to_tree_clean(s)
    if remove_outgroup:
        true_tree = Rtree_operations.remove_outgroup(true_tree)
        true_tree = Rtree_operations.simple_reorder_the_leaves_after_removal_of_s1(
            true_tree)
    if make_emp_cov_file:
        cov = tree_to_data.get_empirical_matrix(s, factor=0.01, reps=400)
        tree_to_data.emp_cov_to_file(cov, filename=emp_cov_file)
    print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(
        true_tree)
    no_leaves = Rtree_operations.get_no_leaves(true_tree)
    #s_tree=tree_statistics.identifier_to_tree_clean('w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.3.w.w-c.w.c.2.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23')
    s_tree = Rtree_operations.create_burled_leaved_tree(no_leaves, 1.0)
    print 'no_leaves', no_leaves
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations,
            'ghost_pops',
            output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths,
            'tree',
            output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.majority_tree, 'majority_tree', output='string'),
        summary.s_variable('add', output='double'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param',
                           output='double_missing'),
        summary.s_variable('rescale_adap_param', output='double_missing'),
        summary.s_likelihood(),
        summary.s_prior(),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s, output='double_missing')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    if emp_cov_file is not None:
        if emp_remove < 0:
            emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file)
        else:
            emp_cov = tree_to_data.file_to_emp_cov(emp_cov_file, emp_remove)
    else:
        emp_cov = None
    print 'emp_cov', emp_cov
    r = simulation_sanity.test_posterior_model_multichain(
        true_tree,
        s_tree, [50] * 20000,
        summaries=summaries,
        thinning_coef=24,
        wishart_df=wishart_df,
        result_file=result_file,
        emp_cov=emp_cov,
        rescale_empirical_cov=False)
    print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r)
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
Ejemplo n.º 7
0
def run_analysis_of_proposals():
    #true_tree=generate_prior_trees.generate_phylogeny(8,2)
    true_tree = tree_statistics.identifier_to_tree_clean(
        'w.w.c.w.w.w.2.w-w.w.a.w.w.w.w-w.c.1.w.c.w.w.4-w.c.1.w.w.w-w.c.1.w.w-c.0.w.w-c.w.0-a.w-c.0.w-c.0;0.091-1.665-0.263-0.821-0.058-0.501-0.141-0.868-5.064-0.153-0.372-3.715-1.234-0.913-2.186-0.168-0.542-0.056-2.558-0.324;0.367-0.451'
    )
    true_tree = Rcatalogue_of_trees.tree_good
    s_tree = Rtree_operations.create_trivial_tree(4)
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations,
            'ghost_pops',
            output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths,
            'tree',
            output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.majority_tree, 'majority_tree', output='string'),
        summary.s_bposterior_difference(lambda x: x[0],
                                        'likelihood_difference'),
        summary.s_bposterior_difference(lambda x: x[1], 'prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][0],
                                        'branch_prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][1],
                                        'no_admix_prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][2],
                                        'adix_prop_prior_difference'),
        summary.s_bposterior_difference(lambda x: x[2][3],
                                        'top_prior_difference'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param',
                           output='double_missing'),
        summary.s_variable('rescale_adap_param', output='double_missing'),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s, output='double_missing')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    r = simulation_sanity.test_posterior_model(
        true_tree,
        true_tree,
        100000,
        summaries=summaries,
        thinning_coef=2,
        wishart_df=1000,
        resimulate_regrafted_branch_length=False,
        admixtures_of_true_tree=2,
        no_leaves_true_tree=4,
        big_posterior=True,
        rescale_empirical_cov=True)
    print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r)
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
Ejemplo n.º 8
0
def run_d(true_tree_as_file=None):
    #true_tree=generate_prior_trees.generate_phylogeny(8,2)
    if true_tree_as_file is None:
        true_tree = tree_statistics.identifier_to_tree_clean(
            'w.w.w.w.w.w.a.a.w-c.w.c.c.w.c.5.0.w.3.2-c.w.w.0.c.4.w-c.w.0.c.3-w.c.1-c.0;0.07-0.974-1.016-0.089-0.81-0.086-1.499-0.052-1.199-2.86-0.403-0.468-0.469-1.348-1.302-1.832-0.288-0.18-0.45-0.922-2.925-3.403;0.388-0.485'
        )
        #true_tree=Rcatalogue_of_trees.tree_good
        s_tree = tree_statistics.identifier_to_tree_clean(
            'w.w.a.w.w.a.a.a.w-c.w.c.c.w.w.c.0.w.w.6.3.2-c.w.w.0.w.c.5.w.w-c.w.0.c.3.w.w-c.w.c.2.0-w.c.1-c.0;0.828-0.21-0.197-0.247-0.568-1.06-0.799-1.162-2.632-2.001-0.45-1.048-0.834-0.469-0.191-2.759-0.871-1.896-0.473-0.019-1.236-0.287-0.179-0.981-0.456-0.91-2.114-3.368;0.655-0.506-0.389-0.23'
        )
        print Rtree_operations.pretty_string(s_tree)
        print Rtree_operations.pretty_string(true_tree)
    else:
        with open(true_tree_as_file, 'r') as f:
            s = f.readline().rstrip()
            true_tree = tree_statistics.identifier_to_tree_clean(s)
            no_leaves = Rtree_operations.get_number_of_leaves(true_tree)
            s_tree = Rtree_operations.create_trivial_tree(no_leaves)
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr', output='double_missing'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations,
            'ghost_pops',
            output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(
            tree_statistics.get_admixture_proportion_string,
            'admixtures',
            output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths,
            'tree',
            output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.majority_tree, 'majority_tree', output='string'),
        summary.s_variable('add', output='double'),
        summary.s_variable('sliding_rescale_adap_param',
                           output='double_missing'),
        summary.s_variable('cutoff_distance', output='double_missing'),
        summary.s_variable('number_of_pieces', output='double_missing'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param',
                           output='double_missing'),
        summary.s_variable('rescale_constrained_adap_param',
                           output='double_missing'),
        summary.s_variable('rescale_adap_param', output='double_missing'),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s, output='double_missing')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    r = simulation_sanity.test_posterior_model(
        true_tree,
        s_tree,
        100000,
        summaries=summaries,
        thinning_coef=20,
        wishart_df=10000,
        resimulate_regrafted_branch_length=False)  #,
    #admixtures_of_true_tree=2, no_leaves_true_tree=8, rescale_empirical_cov=True)
    print 'true_tree', tree_statistics.unique_identifier_and_branch_lengths(r)
    analyse_results.generate_summary_csv(summaries, reference_tree=true_tree)
Ejemplo n.º 9
0
def call_notebook():
    ## DOESNT WORK IN THE MAC
    dir_path = os.path.dirname(os.path.realpath(__file__))
    cmd = ['Rscript', dir_path + os.path.sep + 'order_report.R']
    print cmd
    call(cmd)


if __name__ == '__main__':

    import summary
    summaries = [
        summary.s_variable('posterior'),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s) for s in [
            'backward_choices', 'backward_density', 'forward_density',
            'forward_choices', 'proposal_type', 'prior', 'branch_prior',
            'no_admix_prior', 'top_prior'
        ]
    ]
    from generate_prior_trees import get_distribution_under_prior
    prior_distribution = get_distribution_under_prior(
        leaves=4, sim_length=1000, list_of_summaries=[summaries[2]])
    print prior_distribution
    full_analysis(summaries,