def get_nodes(arguments,
              input_file,
              outgroup_name,
              reduce_node,
              backup_number=8):

    if not arguments[
            0]:  #this means that we should use the input file for nodes
        if ';' in input_file:
            nodes = get_trivial_nodes(len(input_file.split('-')[0].split('.')))
        elif '.' in input_file:
            nodes = read_one_line(input_file)
        elif ',' in input_file:
            nodes = get_trivial_nodes(int(input_file[1:].split(',')[0]))
        else:
            nodes = get_trivial_nodes(int(input_file))
    else:
        nodes = arguments
    before_added_outgroup = deepcopy(nodes)
    reduced_nodes = deepcopy(nodes)
    if outgroup_name in nodes:
        before_added_outgroup.remove(outgroup_name)
    elif outgroup_name:
        nodes.append(outgroup_name)
    if reduce_node in reduced_nodes:
        reduced_nodes.remove(reduce_node)
    if reduce_node and reduce_node not in nodes:
        nodes.append(reduce_node)
    return before_added_outgroup, nodes, reduced_nodes
Example #2
0
def emp_cov_to_file(m, filename='emp_cov.txt', nodes=None):
    if nodes is None:
        n=m.shape[0]
        nodes=get_trivial_nodes(n)
    with open(filename, 'w') as f:
        f.write(' '.join(nodes)+'\n')
        for i, node in enumerate(nodes):
            f.write(node+ ' '+ ' '.join(map(str, m[i]))+'\n')
Example #3
0
def ms_to_treemix3(filename='tmp.txt', samples_per_pop=20, no_pops=4, n_reps=1, filename2='tmp.treemix_in', nodes=None, 
                   convert_to_gz=True):
    if nodes is None:
        nodes=get_trivial_nodes(no_pops)
    total_sum=0
    total_number_of_genes=0
    with open(filename, 'r') as f:
        with open(filename2, 'w') as e:
            e.write(' '.join(nodes)+'\n')
            pop_count=0
            rep_count=0
            count=0
            data=[]
            s_vecs=[]
            for r in f.readlines():
                
                data.append(map(int,list(r.rstrip())))
                count+=1
                
                if count==samples_per_pop:
                    
                    count=0
                    pop_count+=1
                    
                    s_vec=sum(array(data), axis=0)
                    s_vecs.append(s_vec)
                    total_sum+=sum(s_vec)
                    total_number_of_genes+=len(s_vec)*samples_per_pop
                    data=[]
                    
                    #print rep_count, pop_count
                    
                    if pop_count==no_pops:
                        
                        pop_count=0
                        rep_count+=1
                        
                        for s in zip(*s_vecs):
                            e.write(' '.join([str(a)+','+str(samples_per_pop-a) for a in s])+'\n')
                        
                        s_vecs=[]
                        
                        if rep_count>=n_reps:
                            break
    muhat=float(total_sum)/float(total_number_of_genes)
    #print 'muhat', muhat
    if not convert_to_gz:
        return filename2
    return gzip(filename2,overwrite=True)
Example #4
0
def ms_to_treemix(filename='tmp.txt', samples_per_pop=20, no_pops=4, n_reps=1, filename2='tmp.treemix_in', treemix_files='tmp'):
    data=[]
    with open(filename, 'r') as f:
        for r in f.readlines():
            #print r[:5]
            data.append(map(int,list(r.rstrip())))
    m= array(data)
    if n_reps>1:#reorder the data so that there are more SNPs in stead of more samples/populations
        #print m.shape
        #print 'samples, pops, reps', samples_per_pop, no_pops, n_reps
        m=hstack(vsplit(m, n_reps))
        
    #print samples_per_pop
    sums=tuple([sum(m[(i*samples_per_pop):((i+1)*samples_per_pop), : ], axis=0) for i in xrange(no_pops)])
    #print sums, 'sums'
    with open(filename2, 'w') as f:
        f.write(' '.join(get_trivial_nodes(no_pops))+'\n')
        for s_vec in zip(*sums):
            f.write(' '.join([str(s)+','+str(samples_per_pop-s) for s in s_vec])+'\n')
    filename2_gz=filename2+'.gz'
    subprocess.call(['gzip','-f', filename2])
    return read_data(filename2_gz, blocksize=10000 ,outgroup='s3', noss=True, outfile=treemix_files)
def test_posterior_model_multichain(true_tree=None,
                                    start_tree=None,
                                    sim_lengths=[250] * 800,
                                    summaries=None,
                                    thinning_coef=1,
                                    admixtures_of_true_tree=None,
                                    no_leaves_true_tree=4,
                                    wishart_df=None,
                                    sim_from_wishart=False,
                                    no_chains=8,
                                    result_file='results_mc3.csv',
                                    emp_cov=None,
                                    emp_remove=-1,
                                    rescale_empirical_cov=False):
    if true_tree is None:
        if admixtures_of_true_tree is None:
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree,
                                       admixtures_of_true_tree)
    else:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)
    true_x = (true_tree, 0)

    m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree))
    if start_tree is None:
        start_tree = true_tree

    start_x = (start_tree, 0)
    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        r = m.shape[0]
        print m
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print m
    if emp_cov is not None:
        m = emp_cov
    if rescale_empirical_cov:
        posterior, multiplier = initialize_posterior(
            m,
            wishart_df,
            use_skewed_distr=True,
            rescale=rescale_empirical_cov)
    else:
        posterior = initialize_posterior(m,
                                         wishart_df,
                                         use_skewed_distr=True,
                                         rescale=rescale_empirical_cov)
        multiplier = None
    print 'true_tree=', unique_identifier_and_branch_lengths(true_tree)
    if rescale_empirical_cov:
        post_ = posterior(
            (scale_tree_copy(true_x[0],
                             1.0 / multiplier), true_x[1] / multiplier))
    else:
        post_ = posterior(true_x)
    print 'likelihood(true_tree)', post_[0]
    print 'prior(true_tree)', post_[1]
    print 'posterior(true_tree)', sum(post_)
    if summaries is None:
        summaries = [
            s_variable('posterior'),
            s_variable('mhr'),
            s_no_admixes()
        ]
    proposal = basic_meta_proposal()
    #proposal.props=proposal.props[2:] #a little hack under the hood
    #proposal.params=proposal.params[2:] #a little hack under the hood.
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme_first = deepcopy(sample_verbose_scheme)
    if 'posterior' in sample_verbose_scheme:
        sample_verbose_scheme_first['posterior'] = (1, 1)  #(1,1)
        sample_verbose_scheme_first['no_admixes'] = (1, 1)
    #if 'likelihood' in sample_verbose_scheme:
    #sample_verbose_scheme_first['likelihood']=(1,1)
    print sample_verbose_scheme_first
    MCMCMC(starting_trees=[deepcopy(start_x) for _ in range(no_chains)],
           posterior_function=posterior,
           summaries=summaries,
           temperature_scheme=fixed_geometrical(800.0, no_chains),
           printing_schemes=[sample_verbose_scheme_first] +
           [sample_verbose_scheme for _ in range(no_chains - 1)],
           iteration_scheme=sim_lengths,
           overall_thinnings=int(thinning_coef),
           proposal_scheme=[adaptive_proposal() for _ in range(no_chains)],
           cores=no_chains,
           no_chains=no_chains,
           multiplier=multiplier,
           result_file=result_file,
           store_permuts=False)
    print 'finished MC3'
    #save_pandas_dataframe_to_csv(results, result_file)
    #save_permuts_to_csv(permuts, get_permut_filename(result_file))
    return true_tree
def test_posterior_model(true_tree=None,
                         start_tree=None,
                         sim_length=100000,
                         summaries=None,
                         thinning_coef=19,
                         admixtures_of_true_tree=None,
                         no_leaves_true_tree=4,
                         filename='results.csv',
                         sim_from_wishart=False,
                         wishart_df=None,
                         sap_sim=False,
                         sap_ana=False,
                         resimulate_regrafted_branch_length=False,
                         emp_cov=None,
                         big_posterior=False,
                         rescale_empirical_cov=False):
    if true_tree is None:
        if admixtures_of_true_tree is None:
            admixtures_of_true_tree = geom.rvs(p=0.5) - 1
        true_tree = generate_phylogeny(no_leaves_true_tree,
                                       admixtures_of_true_tree,
                                       skewed_admixture_prior=sap_sim)
    else:
        no_leaves_true_tree = get_no_leaves(true_tree)
        admixtures_of_true_tree = get_number_of_admixes(true_tree)

    true_x = (true_tree, 0)

    m = make_covariance(true_tree, get_trivial_nodes(no_leaves_true_tree))
    if start_tree is None:
        start_tree = true_tree

    start_x = (start_tree, 0)
    if wishart_df is None:
        wishart_df = n_mark(m)
    if sim_from_wishart:
        r = m.shape[0]
        print m
        m = wishart.rvs(df=r * wishart_df - 1, scale=m / (r * wishart_df))
        print m
    if emp_cov is not None:
        m = emp_cov
    if big_posterior:
        posterior = initialize_big_posterior(m,
                                             wishart_df,
                                             use_skewed_distr=sap_ana)
    else:
        posterior = initialize_posterior(m,
                                         wishart_df,
                                         use_skewed_distr=sap_ana,
                                         rescale=rescale_empirical_cov)
    print 'true_tree=', unique_identifier_and_branch_lengths(true_tree)
    post_ = posterior(true_x)
    print 'likelihood(true_tree)', post_[0]
    print 'prior(true_tree)', post_[1]
    print 'posterior(true_tree)', sum(post_[:2])
    if summaries is None:
        summaries = [s_posterior(), s_variable('mhr'), s_no_admixes()]
    proposal = adaptive_proposal(
        resimulate_regrafted_branch_length=resimulate_regrafted_branch_length)
    #proposal.props=proposal.props[2:] #a little hack under the hood
    #proposal.params=proposal.params[2:] #a little hack under the hood.
    sample_verbose_scheme = {summary.name: (1, 0) for summary in summaries}
    sample_verbose_scheme['posterior'] = (1, 1)
    sample_verbose_scheme['no_admixes'] = (1, 1)
    final_tree, final_posterior, results, _ = basic_chain(
        start_x,
        summaries,
        posterior,
        proposal,
        post=None,
        N=sim_length,
        sample_verbose_scheme=sample_verbose_scheme,
        overall_thinning=int(max(thinning_coef, sim_length / 60000)),
        i_start_from=0,
        temperature=1.0,
        proposal_update=None,
        check_trees=False)
    save_to_csv(results, summaries, filename=filename)
    return true_tree
Example #7
0
def identifier_to_tree(identifier, leaves=None, inner_nodes=None, branch_lengths=None, admixture_proportions=None):
    '''
    Transforms an identifier of the form qwert-uio-asdfg-jk into a dictionary tree using the generators of leaves, inner_nodes, branch_lengths and admixture_proportions.
    '''
    
    levels=identifier.split('-')
    n_leaves=len(levels[0].split('.'))
    
    #initiate leaves
    if leaves is None:
        leaf_values=sorted(get_trivial_nodes(n_leaves))
    else:
        leaf_values=[leaves() for _ in range(n_leaves)]
    tree={leaf:[None]*5 for leaf in leaf_values}
    trace_lineages=[(leaf,0) for leaf in leaf_values]
    
    #initiate generators
    if inner_nodes is None:
        inner_nodes=generate_numbered_nodes('n')
    if branch_lengths is None:
        def f(): 
            return 1.0
        branch_lengths= f
    if admixture_proportions is None:
        def g():
            return 0.4
        admixture_proportions=g
    for level in levels:
        identifier_lineages=level.split('.')
        assert len(trace_lineages)==len(identifier_lineages), 'the number of traced lineages did not match the number of lineages in the identifier '+\
                                                               '\n\n'+'trace_lineages:'+'\n'+str(trace_lineages)+\
                                                               '\n\n'+'identifier_lineages:'+'\n'+str(identifier_lineages)
        parent_index={}
        indexes_to_be_removed=[]
        for n,identifier_lineage in enumerate(identifier_lineages):
            if identifier_lineage=='c':
                ##there is a coalecence for the n'th lineage, and it should be replaced by a new lineage
                new_key=inner_nodes()
                old_key,old_branch=trace_lineages[n]
                new_branch_length=branch_lengths()
                tree=update_parent_and_branch_length(tree, old_key, old_branch, new_key, new_branch_length)
                tree[new_key]=[None]*5
                parent_index[n]=new_key
                trace_lineages[n]=(new_key,0)
            elif identifier_lineage=='w':
                pass
            elif identifier_lineage=='a':
                new_key=inner_nodes(admixture=True)
                old_key,old_branch=trace_lineages[n]
                new_branch_length=branch_lengths()
                tree=update_parent_and_branch_length(tree, old_key, old_branch, new_key, new_branch_length)
                new_admixture_proportion=admixture_proportions()
                tree[new_key]=[None,None,new_admixture_proportion,None,None]
                trace_lineages[n]=(new_key,0)
                trace_lineages.append((new_key,1))
            else:
                ##there is a coalescence but this lineage disappears
                try:
                    new_key=parent_index[int(identifier_lineage)]
                except KeyError as e:
                    print e
                    print 'new_key', new_key
                    print 'parent_index', parent_index
                    print 'identifier_lineage', identifier_lineage
                    print pretty_string(insert_children_in_tree(tree))
                
                old_key,old_branch=trace_lineages[n]
                new_branch_length=branch_lengths()
                tree=update_parent_and_branch_length(tree, old_key, old_branch, new_key, new_branch_length)
                indexes_to_be_removed.append(n)
        
        ##remove lineages
        trace_lineages=[trace_lineage for n,trace_lineage in enumerate(trace_lineages) if n not in indexes_to_be_removed]
    root_key=new_key
    del tree[root_key]
    tree=rename_root(tree, new_key)
    
    return insert_children_in_tree(tree)