Python get_leaf_keys Examples, Rtree_operations.get_leaf_keys Python Examples

Example #1

0

Show file

File: Rtree_to_covariance_matrix.py Project: Tmacme/AdmixtureBayes

def get_admixtured_populations(tree):
    """Collect the admixed populations reported while sweeping ``tree``.

    The sweep starts at the sorted leaf keys and repeatedly hands each
    ready node to ``leave_node_and_check_admixtures``; whatever that helper
    reports as admixed is accumulated in visiting order.

    Returns the accumulated list once the only ready node is the one keyed
    ``"r"`` (presumably the root -- confirm against the tree format), or
    ``None`` if no node becomes ready before that happens.
    """
    leaves = sorted(get_leaf_keys(tree))
    frontier = zip(leaves, [Population([1.0], [leaf]) for leaf in leaves])
    pending = {}
    visited = []
    covmat = dummy_covmat()
    collected = []
    while True:
        for key, pop in frontier:
            updates, admixed = leave_node_and_check_admixtures(
                key, tree[key], pop, covmat)
            collected.extend(admixed)
            for update in updates:
                pending = _add_to_waiting(pending, update, tree)
            visited.append(key)
        # A copy of the visited list is passed so the helper cannot alter
        # the bookkeeping kept here.
        pending, frontier = _thin_out_dic(pending, list(visited))
        n_ready = len(frontier)
        if n_ready == 0:
            return None
        if n_ready == 1 and frontier[0][0] == "r":
            return collected

Example #2

0

Show file

File: tree_to_data.py Project: Tmacme/AdmixtureBayes

def tree_to_ms_command(rtree, sample_per_pop=50, nreps=2,
                       theta=0.4, sites=500000, recomb_rate=1,
                       leaf_keys=None, final_pop_size=100.0):
    """Assemble an ``ms`` command string for ``rtree``.

    ``rtree`` is deep-copied first, so the caller's tree is not touched.
    The command starts with sample counts, ``-t theta`` and either
    ``-r recomb_rate sites`` or (when ``recomb_rate`` is None) ``-s sites``,
    followed by ``-I`` with ``sample_per_pop`` repeated once per leaf, and
    ends with whatever ``construct_ej_en_es_string`` produces.

    Branch lengths and node times are rescaled by the ratio of the summed
    original branch lengths to the summed second component of the extended
    branch lengths.  ``leaf_keys`` defaults to ``get_leaf_keys`` of the
    rescaled tree.
    """
    tree = deepcopy(rtree)
    # Measured before the branches are extended; numerator of the rescaling.
    drift_sum = sum(get_all_branch_lengths(tree))

    if recomb_rate is not None:
        rec_part = ' -r ' + str(recomb_rate) + ' ' + str(sites)
    else:
        rec_part = ' -s ' + str(sites)

    n = get_no_leaves(tree)
    # The doubled spaces between sections are part of the emitted string.
    callstring = ''.join([
        'ms ', str(sample_per_pop * n), ' ', str(nreps),
        ' -t ', str(theta), ' ', rec_part, ' ',
        ' -I ', str(n), ' ', ' '.join([str(sample_per_pop)] * n), ' ',
    ])

    times = get_timing(tree)
    tree = extend_branch_lengths(tree, times)
    # After the extension each branch entry is indexed with [1] here.
    count_sum = sum(entry[1] for entry in get_all_branch_lengths(tree))
    tree = scaled_tupled_branches(tree, drift_sum / count_sum)
    # Multiply first, then divide -- same evaluation order as the tree scaling
    # is NOT assumed; this mirrors the original expression exactly.
    times = dict((node, t * drift_sum / count_sum)
                 for node, t in times.items())

    if leaf_keys is None:
        leaf_keys = get_leaf_keys(tree)
    callstring += construct_ej_en_es_string(tree,
                                            times,
                                            leaf_keys=leaf_keys,
                                            final_pop_size=final_pop_size)
    return callstring

Example #3

0

Show file

def get_branches_to_keep(tree, subgraph_keys):
    """Sweep ``tree`` from its leaves and return the collected target nodes.

    A ``follow_branch_class`` instance built from ``subgraph_keys`` and an
    initially empty ``target_nodes`` list are handed to ``leave_node`` for
    every visited node; the list is returned as-is at the end (presumably
    ``leave_node`` fills it in -- confirm in that helper).

    Returns ``None`` if no node becomes ready before the sweep reaches the
    single node keyed ``"r"``.
    """
    leaves = get_leaf_keys(tree)
    leaf_pops = [Population([1.0], [leaf]) for leaf in leaves]
    follow_branch = follow_branch_class(subgraph_keys)
    frontier = zip(leaves, leaf_pops)
    pending = {}
    visited = []
    target_nodes = []
    while True:
        for key, pop in frontier:
            updates = leave_node(key, tree[key], pop, target_nodes,
                                 follow_branch)
            for update in updates:
                pending = _add_to_waiting(pending, update, tree)
            visited.append(key)
        pending, frontier = _thin_out_dic(pending, list(visited))
        n_ready = len(frontier)
        if n_ready == 0:
            return None
        if n_ready == 1 and frontier[0][0] == "r":
            return target_nodes

Example #4

0

Show file

def addmix_with_correction(tree,
                           new_node_names=None,
                           pks=None,
                           fixed_sink_source=None,
                           new_branch_length=None,
                           new_to_root_length=None):
    """Propose an added admixture and print branch-length diagnostics.

    ``addadmix`` is called on ``tree`` (with ``check_opposite`` and
    ``preserve_root_distance`` switched off).  Coefficient matrices ``A``
    (for ``tree``) and ``B`` (for the proposed tree) are then built on the
    same sorted leaf keys, and branch-length vectors are mapped between the
    two trees through the pseudo-inverses of those matrices.

    The function currently only prints the intermediate vectors and their
    images under ``A``/``B``; it has no return statement and therefore
    returns ``None``.

    Parameters
    ----------
    tree : the current tree.
    new_node_names, fixed_sink_source, new_branch_length,
    new_to_root_length : forwarded unchanged to ``addadmix``.
    pks : dict forwarded to ``addadmix``.  ``None`` (the default) means a
        fresh dict per call, so state can no longer leak between calls
        through a shared default object.
    """
    if pks is None:
        pks = {}

    added_tree, forward, backward = addadmix(
        tree,
        new_node_names=new_node_names,
        pks=pks,
        fixed_sink_source=fixed_sink_source,
        new_branch_length=new_branch_length,
        new_to_root_length=new_to_root_length,
        check_opposite=False,
        preserve_root_distance=False)

    node_keys = sorted(get_leaf_keys(tree))

    A, _, bi1 = make_coefficient_matrix(tree, node_keys=node_keys)
    B, _, bi2 = make_coefficient_matrix(added_tree, node_keys=node_keys)

    x_A = get_specific_branch_lengths(tree, reverse_dic_to_list(bi1))
    x_B = get_specific_branch_lengths(added_tree, reverse_dic_to_list(bi2))

    Binverse = pinv(B)
    Ainverse = pinv(A)

    # Old branch lengths carried to the new tree and back again.
    tilde_x_B = Binverse.dot(A.dot(x_A))
    tilde_x_A = Ainverse.dot(B.dot(tilde_x_B))

    # Single-argument prints are parenthesised: identical output under
    # Python 2, and the block also parses under Python 3.
    print(array(x_B))
    print(array(tilde_x_B))
    print(array(x_A))
    print(array(tilde_x_A))

    print(B.dot(x_B))
    print(A.dot(x_A))
    print(B.dot(tilde_x_B))
    print(A.dot(tilde_x_A))

    # Same round trip, this time starting from the new tree's branch lengths.
    tilde2_x_A = Ainverse.dot(B.dot(x_B))
    tilde2_x_B = Binverse.dot(A.dot(tilde2_x_A))

    print(B.dot(x_B))
    print(A.dot(x_A))
    print(B.dot(tilde2_x_B))
    print(A.dot(tilde2_x_A))

Example #5

0

Show file

def make_coefficient_matrix(tree, node_keys=None, branch_keys=None):
    '''
    Build the coefficient matrix C of the linear system

    w=Cx

    instead of the covariance matrix itself.  Per the original author's
    note, w is the diagonal of the covariance matrix and x is the vector of
    branch lengths, so C depends on the admixture proportions and the
    topology.

    node_keys defaults to the sorted leaf keys of the tree and branch_keys
    to get_all_branches(tree).

    Returns (matrix, node_index, branch_index), where the two index dicts
    map each node key / branch to its position in the supplied key lists.
    Returns None if no node becomes ready before the sweep reaches the
    single node keyed "r".
    '''
    if node_keys is None:
        node_keys = sorted(get_leaf_keys(tree))
    if branch_keys is None:
        branch_keys = get_all_branches(tree)

    leaf_pops = [Population([1.0], [leaf]) for leaf in node_keys]
    frontier = zip(node_keys, leaf_pops)
    node_index = dict((key, pos) for pos, key in enumerate(node_keys))
    branch_index = dict((branch, pos) for pos, branch in enumerate(branch_keys))
    cofmat = Coefficient_Matrix(node_index, branch_index,
                                get_all_pairs(node_keys))

    pending = {}
    visited = []
    while True:
        for key, pop in frontier:
            for update in leave_node(key, tree[key], pop, cofmat):
                pending = _add_to_waiting(pending, update, tree)
            visited.append(key)
        pending, frontier = _thin_out_dic(pending, list(visited))
        n_ready = len(frontier)
        if n_ready == 0:
            return None
        if n_ready == 1 and frontier[0][0] == "r":
            break

    return cofmat.get_matrix(), node_index, branch_index

Example #6

0

Show file

File: Rtree_to_covariance_matrix.py Project: Tmacme/AdmixtureBayes

def make_covariance(tree, node_keys=None, old_cov=False):
    """Return the covariance matrix accumulated while sweeping ``tree``.

    ``node_keys`` fixes the row/column order of the matrix and defaults to
    the sorted leaf keys.  Every visited node is passed to ``leave_node``
    together with a ``Covariance_Matrix2`` accumulator, whose
    ``get_matrix()`` result is returned.

    Returns ``None`` if no node becomes ready before the sweep reaches the
    single node keyed ``"r"``.

    NOTE(review): ``old_cov=True`` only rebuilds the same
    ``Covariance_Matrix2`` accumulator a second time, so the flag has no
    visible effect here -- confirm whether another class was intended.
    """
    if node_keys is None:
        node_keys = sorted(get_leaf_keys(tree))

    def _new_accumulator():
        # A fresh index dict per accumulator, as in the original code.
        positions = dict((key, pos) for pos, key in enumerate(node_keys))
        return Covariance_Matrix2(positions)

    leaf_pops = [Population([1.0], [leaf]) for leaf in node_keys]
    frontier = zip(node_keys, leaf_pops)
    covmat = _new_accumulator()
    if old_cov:
        covmat = _new_accumulator()

    pending = {}
    visited = []
    while True:
        for key, pop in frontier:
            for update in leave_node(key, tree[key], pop, covmat):
                pending = _add_to_waiting(pending, update, tree)
            visited.append(key)
        pending, frontier = _thin_out_dic(pending, list(visited))
        n_ready = len(frontier)
        if n_ready == 0:
            return None
        if n_ready == 1 and frontier[0][0] == "r":
            break

    return covmat.get_matrix()

Example #7

0

Show file

def getcorrection(old_tree, new_tree, sigma):
    """Redraw the branch lengths of ``new_tree`` and return a proposal ratio.

    Coefficient matrices are built for both trees on the sorted leaf keys of
    ``old_tree`` (``B`` for the old tree, ``A`` for the new one).  Both
    branch-length vectors are read using the branch list derived from the
    OLD tree's branch index.

    A mean vector is obtained from ``mm``, Gaussian noise with scale
    ``sigma`` is added to it, and the result is written into ``new_tree``
    via ``update_specific_branch_lengths``.  The forward and backward
    log-densities of the move are summed separately.

    Returns ``(new_tree, 1.0, exp(q_backward - q_forward))``.
    """
    leaves = sorted(get_leaf_keys(old_tree))

    B, _, old_branch_index = make_coefficient_matrix(old_tree,
                                                     node_keys=leaves)
    A, _, _ = make_coefficient_matrix(new_tree, node_keys=leaves)

    branches = reverse_dic_to_list(old_branch_index)
    n_branches = len(branches)

    x_A = array(get_specific_branch_lengths(old_tree, branches))
    x_B = array(get_specific_branch_lengths(new_tree, branches))
    x_old = deepcopy(x_A)  # not read below; retained from the original

    # Forward move: mean from mm, then one Gaussian draw per branch.
    forward_upper = x_A.dot(B.T.dot(A) + identity(n_branches))
    forward_lower = A.T.dot(A) + identity(n_branches)
    mu_new = mm(U=forward_upper, L=forward_lower, initial_value=x_B)

    noise = norm.rvs(scale=sigma, size=len(mu_new))
    x_new = mu_new + noise
    q_forward = sum(norm.logpdf(mu_new - x_new, scale=sigma))

    # Reverse move: same construction with the roles of A and B swapped,
    # evaluated at the old branch lengths.
    backward_upper = x_new.dot(A.T.dot(B) + identity(n_branches))
    backward_lower = B.T.dot(B) + identity(n_branches)
    mu_reverse = mm(U=backward_upper,
                    L=backward_lower,
                    initial_value=array(x_new))
    q_backward = sum(norm.logpdf(mu_reverse - x_A, scale=sigma))

    new_tree = update_specific_branch_lengths(new_tree, branches, x_new)

    return new_tree, 1.0, exp(q_backward - q_forward)

Example #8

0

Show file

File: df_testing.py Project: Tmacme/AdmixtureBayes

def simulate_P_from_tree(tree, no_individuals_per_population, no_snps):
    """Simulate data for ``tree`` and return the transposed ``x / n`` table.

    ``ms_simulate_wrapper`` is run with ``no_snps // 200`` replicates and
    fixed settings (theta 0.4, 500000 sites, recombination rate 1.0, final
    population size 100.0), writing ``ms.txt`` and ``treemix.txt``.  The
    file name it returns is read back with
    ``get_xs_and_ns_from_treemix_file`` using a filter of type ``'none'``.

    Returns ``(p.T, None)`` where ``p = x / n``.
    NOTE(review): under Python 2, ``x / n`` truncates if both are integer
    typed -- confirm the dtypes returned by the reader.
    """
    snp_filter = make_filter(filter_type='none')
    simulation_settings = dict(
        sample_per_pop=no_individuals_per_population,
        nreps=no_snps // 200,
        theta=0.4,
        sites=500000,
        recomb_rate=1.0,
        full_nodes=get_leaf_keys(tree),
        final_pop_size=100.0,
        ms_file='ms.txt',
        treemix_file='treemix.txt',
        time_adjust=False)
    filename_gz = ms_simulate_wrapper(tree, **simulation_settings)
    x, n, _ = get_xs_and_ns_from_treemix_file(filename_gz, snp_filter)
    frequencies = x / n
    return frequencies.T, None

Example #9

0

Show file

File: Rproposal_admix_with_correction2.py Project: Tmacme/AdmixtureBayes

def getcorrection_adding(old_tree, new_tree, sigma, branches, U_matrix):

    node_keys = sorted(get_leaf_keys(old_tree))

    A, _, _ = make_coefficient_matrix(old_tree,
                                      node_keys=node_keys,
                                      branch_keys=branches[:-3])
    B, _, _ = make_coefficient_matrix(new_tree,
                                      node_keys=node_keys,
                                      branch_keys=branches)

    x_A = get_specific_branch_lengths(old_tree, branches[:-3])
    x_B = get_specific_branch_lengths(new_tree, branches)

    B2 = B.dot(U_matrix)

    lambd = pinv(B2.dot(B2.T)).dot(A - B2).dot(x_A)

    mu_new = (B2.T).dot(lambd) + x_A

    x_new_reduced = mu_new + norm.rvs(scale=sigma, size=len(mu_new))

    q_forward = reduce(mul, norm.pdf(mu_new - x_new_reduced, scale=sigma))

    x_new = U_matrix.dot(x_new_reduced)
    print 'x_A', x_A
    print 'x_B', x_B
    print 'mu_new', mu_new
    print 'x_new_reduced', x_new_reduced
    print 'x_new', x_new

    reverse_lambd = pinv(A.dot(A.T)).dot(B2 - A).dot(x_new_reduced)
    reverse_mu_new = (A.T).dot(reverse_lambd) + x_new_reduced

    print 'matrix_rank , dimension (A)', matrix_rank(A), A.shape
    print 'matrix_rank , dimension (B)', matrix_rank(B), B.shape
    print 'mu_reverse', reverse_mu_new

    q_backward = reduce(mul, norm.pdf(reverse_mu_new - x_A, scale=sigma))

    #wear the new values
    #print branches

    new_tree = update_specific_branch_lengths(new_tree, branches, x_new)

    return new_tree, q_forward, q_backward

Example #10

0

Show file

File: downstream_analysis_tool.py Project: Tmacme/AdmixtureBayes

    def __call__(self, Rtree=None, add=None, **kwargs):
        """Return ``({'Rcov': matrix}, False)`` for one sampled tree.

        With an ``Rtree``: the covariance of ``Rtree`` over ``self.nodes``
        plus ``float(add) * self.add_multiplier``.

        Without one: ``kwargs['full_tree']`` is used instead.  A leaf of the
        full tree that is not in ``self.nodes`` is taken as the outgroup,
        placed first in the node order, and the resulting covariance is
        passed through ``reduce_covariance(cov, 0)``.
        """
        if Rtree is not None:
            base = make_covariance(Rtree, node_keys=self.nodes)
            shift = float(add) * self.add_multiplier
            return {'Rcov': base + shift}, False

        full_tree = kwargs['full_tree']
        # Leaves outside self.nodes; the first one found is used (raises
        # IndexError when there is none).
        extra_leaves = set(get_leaf_keys(full_tree)) - set(self.nodes)
        outgroup_name = list(extra_leaves)[0]
        cov = make_covariance(full_tree,
                              node_keys=[outgroup_name] + self.nodes)
        return {'Rcov': reduce_covariance(cov, 0)}, False

Example #11

0

Show file

File: downstream_analysis_tool.py Project: Tmacme/AdmixtureBayes

 def __call__(self, Rtree=None, **kwargs):
     """Return ``({'topology': identifier}, False)`` for one sampled tree.

     If ``kwargs`` holds a ``'string_tree'``, the identifier is the text
     after its last ``'='`` and before the first following ``';'``.

     Otherwise the identifier comes from
     ``admixture_sorted_unique_identifier`` applied to ``Rtree``.  When
     ``Rtree`` is None it is derived from ``kwargs['full_tree']``: a leaf
     not in ``self.nodes`` is taken as the outgroup, a deep copy of the
     full tree is re-rooted on it and the outgroup is removed.
     """
     if 'string_tree' in kwargs:
         after_equals = kwargs['string_tree'].rsplit('=', 1)[-1]
         return {'topology': after_equals.partition(';')[0]}, False

     if Rtree is None:
         full_tree = kwargs['full_tree']
         extra_leaves = set(get_leaf_keys(full_tree)) - set(self.nodes)
         outgroup = list(extra_leaves)[0]
         # Per the original author: re-rooting removes the admixtures
         # between the outgroup and the root.
         rerooted = rearrange_root_foolproof(deepcopy(full_tree), outgroup)
         Rtree = remove_outgroup(rerooted, outgroup)

     identifier = admixture_sorted_unique_identifier(Rtree,
                                                     leaf_order=self.nodes,
                                                     not_opposite=True)
     return {'topology': identifier}, False

Example #12

0

Show file

File: tree_to_data.py Project: Tmacme/AdmixtureBayes

def time_adjusted_tree_to_ms_command(time_adjusted_tree, sample_per_pop=50, nreps=2,
                       theta=0.4, sites=500000, recomb_rate=1,
                       leaf_keys=None, final_pop_size=100.0,  verbose_level='normal'):
    """Assemble an ``ms`` command string for an already time-adjusted tree.

    The tree is deep-copied, so the caller's object is not touched.  The
    command starts with sample counts, ``-t theta`` and either
    ``-r recomb_rate sites`` or (when ``recomb_rate`` is None) ``-s sites``,
    followed by ``-I`` with ``sample_per_pop`` repeated once per leaf, and
    ends with whatever ``construct_ej_es_string`` produces from the tree
    extended with ``get_max_timing`` times.

    ``leaf_keys`` defaults to ``get_leaf_keys`` of the extended tree.
    ``verbose_level`` is accepted but not used in this function.
    """
    tree = deepcopy(time_adjusted_tree)

    if recomb_rate is not None:
        rec_part = ' -r ' + str(recomb_rate) + ' ' + str(sites)
    else:
        rec_part = ' -s ' + str(sites)

    n = get_no_leaves(tree)
    # The doubled spaces between sections are part of the emitted string.
    callstring = ''.join([
        'ms ', str(sample_per_pop * n), ' ', str(nreps),
        ' -t ', str(theta), ' ', rec_part, ' ',
        ' -I ', str(n), ' ', ' '.join([str(sample_per_pop)] * n), ' ',
    ])

    times = get_max_timing(tree)
    tree = extend_branch_lengths(tree, times)
    if leaf_keys is None:
        leaf_keys = get_leaf_keys(tree)
    return callstring + construct_ej_es_string(tree,
                                               times,
                                               leaf_keys=leaf_keys,
                                               final_pop_size=final_pop_size)

Example #13

0

Show file

File: Rtree_to_covariance_matrix.py Project: Tmacme/AdmixtureBayes

def get_populations(tree, min_w=0.0, keys_to_include=None):
    """Return the sorted, de-duplicated population strings found in ``tree``.

    The tree is swept from its sorted leaf keys.  For every visited node,
    and finally for the single remaining node keyed ``"r"``, the
    population's ``get_population_string(min_w, keys_to_remove)`` is
    recorded.

    Parameters
    ----------
    tree : the tree to sweep.
    min_w : passed through to ``get_population_string``.
    keys_to_include : optional collection of leaf keys; every leaf NOT in
        it is passed to ``get_population_string`` as a key to remove.
        ``None`` removes nothing.

    Returns
    -------
    A sorted list of the distinct non-empty population strings, or ``None``
    if no node becomes ready before the sweep reaches ``"r"``.

    Empty strings are dropped entirely.  The previous code called
    ``list.remove('')``, which deletes only the first occurrence, so an
    empty string survived whenever more than one was produced.
    """
    node_keys = sorted(get_leaf_keys(tree))
    if keys_to_include is None:
        keys_to_remove = []
    else:
        keys_to_remove = list(set(node_keys) - set(keys_to_include))
    pops = [Population([1.0], [node]) for node in node_keys]
    ready_nodes = zip(node_keys, pops)
    waiting_nodes = {}
    taken_nodes = []
    covmat = dummy_covmat()
    pop_strings = []
    while True:
        for key, pop in ready_nodes:
            pop_strings.append(pop.get_population_string(
                min_w, keys_to_remove))
            upds = leave_node(key, tree[key], pop, covmat)
            for upd in upds:
                waiting_nodes = _add_to_waiting(waiting_nodes, upd, tree)
            taken_nodes.append(key)
        waiting_nodes, ready_nodes = _thin_out_dic(waiting_nodes,
                                                   taken_nodes[:])
        if len(ready_nodes) == 0:
            return None
        if len(ready_nodes) == 1 and ready_nodes[0][0] == "r":
            # The last remaining population is recorded as well.
            big_pop = ready_nodes[0][1]
            pop_strings.append(
                big_pop.get_population_string(min_w, keys_to_remove))
            break
    unique_strings = set(pop_strings)
    # discard() is a no-op when '' is absent and removes it for good otherwise.
    unique_strings.discard('')
    return sorted(unique_strings)

Example #14

0

Show file

File: find_true_trees.py Project: Tmacme/AdmixtureBayes

def get_unique_plottable_tree(tree, nodes=None):
    """Return the first entry, in sorted order, of ``get_possible_strees``.

    ``nodes`` defaults to the sorted leaf keys of ``tree``.  Raises
    ``IndexError`` when ``get_possible_strees`` yields nothing.
    """
    leaf_order = sorted(get_leaf_keys(tree)) if nodes is None else nodes
    candidates = list(get_possible_strees(tree, leaf_order))
    candidates.sort()
    return candidates[0]

Example #15

0

Show file

File: remove_populations_from_dataset.py Project: Tmacme/AdmixtureBayes

                    default='',
                    help='The file where the populations should be saved.')

options = parser.parse_args()

# Tree input: read the tree, optionally attach an outgroup, then write the
# subtree spanned by the requested populations.
if options.input_type == 'tree':
    tree = identifier_file_to_tree_clean(options.input_file)
    if options.input_add:
        # The first line of the add-file is parsed as a float and used as
        # the length from the inserted node to the new root.
        with open(options.input_add, 'r') as f:
            add = float(f.readline())
        tree = add_outgroup(tree,
                            inner_node_name='new_node',
                            to_new_root_length=float(add),
                            to_outgroup_length=0,
                            outgroup_name=options.outgroup_name)
    nodes = get_leaf_keys(tree)
    # NOTE(review): this validation is an assert, so it is skipped when
    # Python runs with -O.
    assert all((a in nodes for a in options.populations
                )), 'Requested population was not found in the tree'
    subtree = get_subtree(tree, options.populations)
    if not options.output_file:
        # Default output name: the input file name followed directly by the
        # population names joined with '_' (no separator in between).
        options.output_file = options.input_file + '_'.join(
            options.populations)
    with open(options.output_file, 'w') as f:
        # First line: the sorted population names; then the subtree text.
        f.write(' '.join(sorted(options.populations)) + '\n')
        f.write(unique_identifier_and_branch_lengths(subtree))
if options.input_type == 'snps':
    if options.input_file.endswith('.gz'):
        options.input_file = unzip(options.input_file, overwrite=False)
    df = pd.read_csv(options.input_file, usecols=options.populations, sep=' ')
    if not options.output_file:
        options.output_file = options.input_file + '_'.join(

Example #16

0

Show file

File: Treemix_to_AdmixtureBayes.py Project: Tmacme/AdmixtureBayes

def treemix_file_to_admb_files(filename_treeout,
                               filename_vertices,
                               filename_edges,
                               outgroup=None,
                               snodes=None,
                               prefix='',
                               force=True,
                               return_format=[
                                   'None', 'arbitrary_rooted',
                                   'outgroup_rooted', 'outgroup_removed',
                                   'outgroup_removed_tuple'
                               ]):
    """Convert treemix output files into saved tree files, in stages.

    The tree read from the three treemix files is saved as
    ``<prefix>_treemix_arbitrary_rooted_tree.txt``.  When ``outgroup`` is
    given, the tree is re-rooted on it (``rearrange_root_foolproof`` if
    ``force`` is truthy, otherwise ``rearrange_root``) and saved, then the
    outgroup is removed and both the reduced tree and the returned add
    distance are saved.

    ``snodes``, when supplied, must contain the same keys as the tree's
    leaves.  The list is modified in place: a missing ``outgroup`` is
    appended (with a warning) and the outgroup is removed again once it has
    been stripped from the tree.  Without ``snodes`` the tree's own leaf
    keys are used.

    ``return_format`` is normalised with ``initor`` and selects what is
    returned: the arbitrary-rooted tree, the outgroup-rooted tree, the
    outgroup-removed tree, or that tree together with the add distance.
    Any other value returns ``None``.  The outgroup-based formats require
    ``outgroup`` to be given; otherwise the corresponding local name is
    unbound.
    """
    def _dump(obj, stage_code, suffix):
        # Every save shares the same placeholder arguments; only the object,
        # the stage code and the file-name suffix differ.
        save_stage(obj,
                   stage_code,
                   prefix='not_needed',
                   full_nodes=snodes,
                   before_added_outgroup_nodes=['not_needed'],
                   after_reduce_nodes=['not_needed'],
                   filename=prefix + suffix)

    return_format = initor(return_format)
    tree = read_treemix_file2(filename_treeout, filename_vertices,
                              filename_edges)

    arbitrary_rooted = deepcopy(tree)
    nodes = get_leaf_keys(tree)
    if snodes is None:
        snodes = nodes
    else:
        # snodes_set is never read afterwards; the two set constructions are
        # kept so the same errors surface for unsuitable inputs.
        snodes_set = set(snodes)
        if outgroup is not None:
            snodes_set = set(snodes + [outgroup])
            if outgroup not in snodes:
                warnings.warn(
                    'outgroup added to the beginning of the admbayes realization of the treemix mle, even though it is not requested in snodes.'
                )
                snodes.append(outgroup)
        assert set(nodes) == set(
            snodes
        ), 'the nodes of the treemix file does not match, the supplied nodes'

    _dump(tree, 4, '_treemix_arbitrary_rooted_tree.txt')

    if outgroup is not None:
        reroot = rearrange_root_foolproof if force else rearrange_root
        tree = reroot(tree, outgroup)
        _dump(tree, 4, '_treemix_outgroup_rooted_tree.txt')
        outgroup_rooted = deepcopy(tree)

        tree, add = remove_outgroup(tree,
                                    remove_key=outgroup,
                                    return_add_distance=True)
        snodes.remove(outgroup)
        _dump(tree, 4, '_treemix_outgroup_rooted_removed_tree.txt')
        _dump(add, 2, '_treemix_outgroup_rooted_removed_add.txt')
        outgroup_removed = deepcopy(tree)

    if return_format == 'arbitrary_rooted':
        return arbitrary_rooted
    elif return_format == 'outgroup_rooted':
        return outgroup_rooted
    elif return_format == 'outgroup_removed':
        return outgroup_removed
    elif return_format == 'outgroup_removed_tuple':
        return outgroup_removed, add

Example #17

0

Show file

File: all_rappers.py Project: Tmacme/AdmixtureBayes

def tree_to_covariance(stree):
    """Parse the tree identifier ``stree`` and return its covariance matrix.

    Rows and columns follow the sorted leaf keys of the parsed tree.
    """
    parsed = identifier_to_tree_clean(stree)
    leaf_order = sorted(get_leaf_keys(parsed))
    return make_covariance(parsed, node_keys=leaf_order)

Example #18

0

Show file

File: downstream_analysis_parser.py Project: Tmacme/AdmixtureBayes

def run_posterior_main(args):

    possible_summaries = {
        'Rtree': make_Rtree,
        'full_tree': make_full_tree,
        'string_tree': make_string_tree,
        'Rcov': make_Rcovariance,
        'cov_dist': cov_truecov,
        'topology': topology,
        'subgraph': subgraph,
        'subsets': subsets,
        'top_identity': topology_identity,
        'pops': get_pops,
        'set_differences': compare_pops,
        'no_sadmixes': extract_number_of_sadmixes
    }
    possible_summaries.update(all_custom_summaries())
    print possible_summaries

    parser = ArgumentParser(usage='pipeline for post analysis',
                            version='1.0.0')

    parser.add_argument('--input_file',
                        required=True,
                        type=str,
                        help='The output file from an AdmixtureBayes run.')
    parser.add_argument(
        '--covariance_matrix_file',
        required=True,
        type=str,
        help=
        'file containing the covariance matrix with a header with all the population names and a line with the multiplier. It has the ending covariance_and_multiplier.txt.'
    )
    parser.add_argument(
        '--subnodes',
        default=[],
        type=str,
        nargs='+',
        help=
        'The subset of populations to perform the analysis on. If not declared, the analysis will be done on the full dataset.'
    )
    parser.add_argument(
        '--result_file',
        default='posterior_distributions.csv',
        type=str,
        help=
        'The resulting file. It will be comma-separated and contain one column per summary plus a header.'
    )
    parser.add_argument('--prefix',
                        default='',
                        type=str,
                        help='place to put the temporary files')
    parser.add_argument(
        '--total',
        default=886,
        type=int,
        help=
        'an upper limit on the number of rows to reduce computational pressure'
    )
    parser.add_argument(
        '--burn_in_fraction',
        default=0.5,
        type=float,
        help='the proportion of the rows that are discarded as burn in period')
    parser.add_argument(
        '--calculate_summaries',
        default=['Rtree', 'pops', 'full_tree', 'string_tree', 'topology'],
        choices=possible_summaries.keys(),
        nargs='*',
        type=str,
        help='The summaries to calculate')
    parser.add_argument(
        '--save_summaries',
        default=['no_admixes', 'topology', 'pops', 'string_tree'],
        nargs='*',
        type=str,
        help='The list of summaries to save')
    parser.add_argument(
        '--custom_summaries',
        default=[],
        nargs='*',
        choices=possible_summaries.keys(),
        help=
        'This will add summaries (to both calculate_summaries and save_summaries). They are defined in the class custom_summary.py.'
    )
    parser.add_argument(
        '--summarize_posterior_distributions',
        default=False,
        help=
        'If set to true, the posterior distibutions will be summarized even further.'
    )
    parser.add_argument(
        '--min_w',
        default=0.0,
        type=float,
        help=
        'a lower threshold of which descendants matter when the consensus_method is descendant_frequencies.'
    )
    parser.add_argument(
        '--constrain_number_of_admixes',
        default='',
        type=str,
        choices=['', 'true_val'] + map(str, range(21)),
        help=
        'The number of admixture events that there are constrained on in the data set. If negative there are no constraints'
    )
    parser.add_argument(
        '--constrain_number_of_effective_admixes',
        default='',
        choices=['', 'true_val'] + map(str, range(21)),
        type=str,
        help=
        'The number of effective(visible)_admixture events that there are constrained on in the data set. If negative there are no constraints.'
    )
    parser.add_argument(
        '--constrain_sadmix_trees',
        default=False,
        action='store_true',
        help=
        'this will remove the graphs which has invisible admixtures. This will produce worse, but more easily interpretable results.'
    )
    parser.add_argument(
        '--no_sort',
        default=False,
        action='store_true',
        help=
        'often the tree is sorted according to the leaf names. no_sort willl assumed that they are not sorted according to this but sorted according to '
    )
    parser.add_argument('--use_cols',
                        default=['tree', 'add', 'layer', 'no_admixes'],
                        type=str,
                        nargs='+',
                        help='The columns to load from the input file')
    parser.add_argument('--outgroup_name',
                        default='',
                        type=str,
                        help='name of the outgroup')
    parser.add_argument('--emp_m_scale', type=str, default='')
    parser.add_argument('--emp_variance_correction', type=str, default='')
    parser.add_argument('--emp_df', type=str, default='')
    parser.add_argument('--emp_covariance_and_multiplier',
                        default='',
                        type=str)
    parser.add_argument('--emp_covariance_reduced', default='', type=str)

    parser.add_argument(
        '--choice_if_no_thinned_graphs',
        default='error',
        choices=['error', 'nearest_admixture_events'],
        help=
        'If the thinning leaves no graphs left, this is what will be done in stead. error will throw an error and nearest_admixture_events will expand the band of allowed number of admixture events(if the chain has been thinned on number of admixture events).'
    )
    parser.add_argument(
        '--test_run',
        default=False,
        action='store_true',
        help='will overwrite everything and run a test function')

    parser.add_argument(
        '--summary_summaries',
        default=['mean'],
        nargs='*',
        type=str,
        help=
        'How each list is summarized as a single, numerical value. If it doesnt have the same length as save summaries the arguments will be repeated until it does'
    )
    parser.add_argument(
        '--number_of_top_pops',
        default=10,
        type=int,
        help=
        'if top_pops is added to summary_summaries this is the number of set topologies saved. negative values means all topologies are saved.'
    )
    parser.add_argument('--true_scaled_tree', type=str, default='')
    parser.add_argument('--true_tree', type=str, default='')
    parser.add_argument('--true_add', type=str, default='')
    parser.add_argument('--true_covariance_reduced', type=str, default='')
    parser.add_argument('--true_covariance_and_multiplier',
                        type=str,
                        default='')
    parser.add_argument('--true_no_admix', type=str, default='')
    parser.add_argument(
        '--treemix_post_analysis',
        action='store_true',
        default=False,
        help=
        'this will convert the treemix input fil ../../../../Dropbox/Bioinformatik/AdmixtureBayes/test_final_grid/ai_2_5true/_true_tree.txtes into a suitable csv file for '
    )
    parser.add_argument('--treemix_tree', default='', type=str, help='')
    parser.add_argument('--treemix_add', default='', type=str, help='')
    parser.add_argument('--treemix_full_tree', default='')
    parser.add_argument('--treemix_csv_output',
                        default='treemix.csv',
                        type=str,
                        help='')
    parser.add_argument(
        '--subgraph_file',
        default='',
        type=str,
        help=
        'file where each line has a space separated list of leaf labels to calculate subtrees from. If a double underscore(__) occurs, it means that the following two arguments are max number of sub topologies and total posterior probability.'
    )

    options = parser.parse_args(args)

    assert not ('string_tree' in options.calculate_summaries
                and not 'full_tree' in options.calculate_summaries
                ), 'The full tree flag is needed for the string tree'
    if 'full_tree' in options.calculate_summaries:
        assert options.outgroup_name, 'The outgroup is specified to calculate the full tree'

    if options.subnodes:
        assert options.outgroup_name, 'when '
        if options.outgroup_name in options.subnodes:
            subnodes_with_outgroup = options.subnodes
            subnodes_wo_outgroup = deepcopy(options.subnodes)
            subnodes_wo_outgroup.remove(options.outgroup_name)
        else:
            subnodes_with_outgroup = deepcopy(
                options.subnodes) + [options.outgroup_name]
            subnodes_wo_outgroup = options.subnodes
    else:
        subnodes_with_outgroup = []
        subnodes_wo_outgroup = []

    outp = read_true_values(
        true_scaled_tree=options.true_scaled_tree,
        true_tree=options.true_tree,
        true_add=options.true_add,
        true_covariance_reduced=options.true_covariance_reduced,
        true_covariance_and_multiplier=options.true_covariance_and_multiplier,
        true_no_admix=options.true_no_admix,
        subnodes_with_outgroup=subnodes_with_outgroup,
        subnodes_wo_outgroup=subnodes_wo_outgroup)
    true_scaled_tree, true_tree, true_add, true_covariance_reduced, (
        true_covariance_scaled, true_multiplier), true_no_admix, _, _, _ = outp
    outp = read_true_values(
        true_covariance_reduced=options.emp_covariance_reduced,
        true_covariance_and_multiplier=options.covariance_matrix_file,
        true_m_scale=options.emp_m_scale,
        subnodes_with_outgroup=subnodes_with_outgroup,
        subnodes_wo_outgroup=subnodes_wo_outgroup)
    _, _, _, emp_covariance_reduced, (
        emp_covariance_scaled, multiplier), _, emp_m_scale, vc, df = outp

    if options.treemix_post_analysis:
        if not options.treemix_full_tree:
            outp = read_true_values(
                true_tree=options.treemix_tree,
                true_add=options.treemix_add,
                subnodes_with_outgroup=subnodes_with_outgroup,
                subnodes_wo_outgroup=subnodes_wo_outgroup)
            _, treemix_tree, treemix_add, _, _, _, _, _, _ = outp
            create_treemix_csv_output(treemix_tree, treemix_add * multiplier,
                                      emp_m_scale, options.treemix_csv_output)
        elif options.treemix_full_tree:
            outp = read_true_values(
                true_scaled_tree=options.treemix_full_tree,
                subnodes_with_outgroup=subnodes_with_outgroup,
                subnodes_wo_outgroup=subnodes_wo_outgroup)
            full_treemix_tree, _, _, _, _, _, _, _, _ = outp
            create_treemix_sfull_tree_csv_output(full_treemix_tree,
                                                 emp_m_scale,
                                                 options.treemix_csv_output)
            full_nodes = sorted(get_leaf_keys(full_treemix_tree))

    if options.constrain_number_of_admixes:
        if options.constrain_number_of_admixes == 'true_val':
            thinner = thinning_on_admixture_events(
                burn_in_fraction=options.burn_in_fraction,
                total=options.total,
                no_admixes=true_no_admix,
                if_no_trees=options.choice_if_no_thinned_graphs)
        else:
            thinner = thinning_on_admixture_events(
                burn_in_fraction=options.burn_in_fraction,
                total=options.total,
                no_admixes=options.constrain_number_of_admixes,
                if_no_trees=options.choice_if_no_thinned_graphs)
    else:
        thinner = thinning(burn_in_fraction=options.burn_in_fraction,
                           total=options.total)

    nodes = read_one_line(options.covariance_matrix_file).split()
    if not options.no_sort:
        nodes = sorted(nodes)

    row_sums = []

    class pointers(object):
        """Registry that hands out consecutive integer indices to names.

        Calling the instance with a name stores the current counter value
        under that name and then advances the counter; indexing the instance
        with a name returns the stored value. Registering the same name twice
        overwrites the earlier index.
        """

        def __init__(self):
            self.dic = {}
            self.count = 0

        def __call__(self, name):
            # Assign the next free index to ``name`` and advance the counter.
            index = self.count
            self.dic[name] = index
            self.count = index + 1

        def __getitem__(self, key):
            # Raises KeyError for names that were never registered.
            return self.dic[key]

    name_to_rowsum_index = pointers()
    possible_summary_summaries = {'mean': float_mean}

    #print 'subnodes_wo_outgroup', subnodes_wo_outgroup
    special_summaries = [
        'Rtree', 'full_tree', 'string_tree', 'subgraph', 'Rcov', 'cov_dist',
        'topology', 'top_identity', 'pops', 'subsets', 'set_differences',
        'no_admixes'
    ]
    if 'Rtree' in options.calculate_summaries:
        row_sums.append(possible_summaries['Rtree'](
            deepcopy(nodes),
            options.constrain_sadmix_trees,
            subnodes=subnodes_wo_outgroup))
        name_to_rowsum_index('Rtree')
    if 'full_tree' in options.calculate_summaries:
        if multiplier is None:
            add_multiplier = 1.0
        else:
            add_multiplier = 1.0 / multiplier
        row_sums.append(possible_summaries['full_tree'](
            add_multiplier=add_multiplier,
            outgroup_name=options.outgroup_name,
            remove_sadtrees=options.constrain_sadmix_trees,
            subnodes=subnodes_with_outgroup))
        name_to_rowsum_index('full_tree')
    if 'string_tree' in options.calculate_summaries:
        if options.subnodes:
            row_sums.append(possible_summaries['string_tree'](
                deepcopy(subnodes_wo_outgroup), options.outgroup_name,
                tree_unifier()))
        else:
            row_sums.append(possible_summaries['string_tree'](
                deepcopy(nodes), options.outgroup_name, tree_unifier()))
        name_to_rowsum_index('string_tree')
    if options.subnodes:
        nodes = subnodes_wo_outgroup
        full_nodes = sorted(list(set(nodes[:] + [options.outgroup_name])))
    if 'subgraph' in options.calculate_summaries:
        subgraph_dicts = read_subgraphing_dict(options.subgraph_file,
                                               types=['full'])
        for dic in subgraph_dicts:
            skeys = dic['subgraph_keys']
            identifier = '.'.join(skeys)
            code = 'subgraph_' + identifier
            sum_func = possible_summaries['subgraph'](skeys, identifier)
            row_sums.append(sum_func)
            name_to_rowsum_index(code)
            options.save_summaries.append(code)
            options.summary_summaries.append(code)
            possible_summary_summaries[code] = sum_func.summarise
    if 'Rcov' in options.calculate_summaries:
        row_sums.append(possible_summaries['Rcov'](deepcopy(nodes),
                                                   add_multiplier=1.0 /
                                                   multiplier))
        name_to_rowsum_index('Rcov')
    if 'cov_dist' in options.calculate_summaries:
        row_sums.append(
            possible_summaries['cov_dist'](true_covariance_reduced))
        name_to_rowsum_index('cov_dist')
    if 'topology' in options.calculate_summaries:
        row_sums.append(possible_summaries['topology'](nodes=nodes))
        name_to_rowsum_index('topology')
    if 'top_identity' in options.calculate_summaries:
        row_sums.append(possible_summaries['top_identity'](true_tree,
                                                           nodes=nodes))
        name_to_rowsum_index('top_identity')
    if 'pops' in options.calculate_summaries:
        row_sums.append(possible_summaries['pops'](min_w=options.min_w,
                                                   keys_to_include=nodes))
        name_to_rowsum_index('pops')
    if 'subsets' in options.calculate_summaries:
        subgraph_dicts = read_subgraphing_dict(options.subgraph_file,
                                               types=['topological'])
        for dic in subgraph_dicts:
            skeys = dic['subgraph_keys']
            identifier = '.'.join(skeys)
            code = 'subsets_' + identifier
            sum_func = possible_summaries['subsets'](identifier=identifier,
                                                     **dic)
            row_sums.append(sum_func)
            name_to_rowsum_index(code)
            options.save_summaries.append(code)
            options.summary_summaries.append(code)
            possible_summary_summaries[code] = sum_func.summarise
    if 'set_differences' in options.calculate_summaries:
        row_sums.append(possible_summaries['set_differences'](
            true_tree, min_w=options.min_w, keys_to_include=nodes))
        name_to_rowsum_index('set_differences')
    if 'no_sadmixes' in options.calculate_summaries:
        if options.constrain_number_of_effective_admixes:
            no_effective_admixes = int(
                options.constrain_number_of_effective_admixes)
        else:
            no_effective_admixes = None
        row_sums.append(
            possible_summaries['no_sadmixes'](no_effective_admixes))
        name_to_rowsum_index('no_sadmixes')

    for summary in possible_summaries:
        if summary not in special_summaries:
            if summary in options.calculate_summaries or summary in options.custom_summaries:
                row_sums.append(possible_summaries[summary]())
                name_to_rowsum_index(summary)

    print row_sums

    def save_thin_columns(d_dic):
        """Return the sub-dictionary of ``d_dic`` that should be saved.

        The kept keys are the union of ``options.save_summaries`` and
        ``options.custom_summaries``. A KeyError is raised if any of those
        names is absent from ``d_dic``.
        """
        wanted = set(options.save_summaries + options.custom_summaries)
        kept = {}
        for summ in wanted:
            kept[summ] = d_dic[summ]
        return kept

    if options.treemix_post_analysis:
        if options.treemix_full_tree:
            constant_kwargs = {'full_nodes': full_nodes}
        else:
            constant_kwargs = {}
        all_results, _ = iterate_over_output_file(
            options.treemix_csv_output,
            cols=options.use_cols,
            pre_thin_data_set_function=thinner,
            while_thin_data_set_function=always_true,
            row_summarize_functions=row_sums,
            thinned_d_dic=save_thin_columns,
            full_summarize_functions=[],
            **constant_kwargs)
    else:
        all_results, _ = iterate_over_output_file(
            options.input_file,
            cols=options.use_cols,
            pre_thin_data_set_function=thinner,
            while_thin_data_set_function=always_true,
            row_summarize_functions=row_sums,
            thinned_d_dic=save_thin_columns,
            full_summarize_functions=[])

    if not options.summarize_posterior_distributions:
        summaries = all_results[0].keys()
        with open(options.result_file, 'w') as f:
            f.write(','.join(summaries) + '\n')
            for row in all_results:
                s_summs = [str(row[summ]) for summ in summaries]
                f.write(','.join(s_summs) + '\n')
        sys.exit()

    #print 'all_results:'
    #print all_results

    def save_wrapper(filename):
        """Build a saver bound to ``filename``.

        Args:
            filename: path of the file the returned callable writes to.

        Returns:
            A function ``save(listi)`` that overwrites ``filename`` with
            ``str(ele)`` for every element of ``listi``, written back to back
            with no separator or trailing newline.
        """
        def save(listi):
            with open(filename, 'w') as f:
                for ele in listi:
                    f.write(str(ele))

        # Hand the closure back to the caller; without this the factory
        # evaluates to None and the saver can never be invoked.
        return save

    if 'mode_topology_compare' in options.summary_summaries or 'mode_topology' in options.summary_summaries:

        def mode_topology_compare(v):
            """Score the mode of ``v`` with the registered 'top_identity' summary.

            The value returned by ``mode(v)`` is passed to the row summary
            stored under 'top_identity'; the 'top_identity' entry of the first
            element of that summary's result is returned.
            """
            modal_topology = mode(v)
            top_identity_summary = row_sums[name_to_rowsum_index['top_identity']]
            outcome = top_identity_summary(modal_topology)
            return outcome[0]['top_identity']

        def mode_topology(v):
            """Return ``mode(v)`` unchanged.

            ``mode`` is defined elsewhere; presumably it yields the most
            frequent element of ``v`` -- confirm against its definition.
            """
            return mode(v)

        possible_summary_summaries['mode_topology'] = mode_topology
        possible_summary_summaries[
            'mode_topology_compare'] = mode_topology_compare
    if 'mode_pops_compare' in options.summary_summaries or 'mode_pops' in options.summary_summaries:

        def mode_pops(v):
            """Return the mode of the '-'-joined, sorted form of each item in ``v``."""
            joined = []
            for members in v:
                joined.append('-'.join(sorted(members)))
            return mode(joined)

        def mode_pops_compare(v):
            """Score the modal population set with the 'set_differences' summary.

            Each item of ``v`` is sorted and joined with '-'; the mode of those
            strings is split on '-' again and passed to the row summary stored
            under 'set_differences'. The 'set_differences' entry of the first
            element of that summary's result is returned.
            """
            joined = ['-'.join(sorted(members)) for members in v]
            modal_members = mode(joined).split('-')
            set_diff_summary = row_sums[name_to_rowsum_index['set_differences']]
            outcome = set_diff_summary(modal_members)
            return outcome[0]['set_differences']

        possible_summary_summaries['mode_pops_compare'] = mode_pops_compare
        possible_summary_summaries['mode_pops'] = mode_pops
    if 'node_count' in options.summary_summaries:

        def count_measure(v):
            """Write element frequencies across ``v`` to 'node_counts.txt'.

            Every element of every item in ``v`` is tallied. For at most the
            4000 most common elements one line ``<element> <count / len(v)>``
            is written, most common first. The file is overwritten and
            nothing is returned.
            """
            tally = Counter(a for k in v for a in k)
            n_items = len(v)
            with open('node_counts.txt', 'w') as f:
                for key, count_num in tally.most_common(4000):
                    frequency = float(count_num) / n_items
                    f.write(key + ' ' + str(frequency) + '\n')

        possible_summary_summaries['node_count'] = count_measure
    if 'top_pops' in options.summary_summaries:

        def write_top_pops(v):
            """Write the most frequent population sets in ``v`` to 'top_pops.txt'.

            Each item of ``v`` is sorted and joined with '_'. The resulting
            strings are ranked by frequency; all of them are kept when
            ``options.number_of_top_pops`` is zero or negative, otherwise only
            that many. Each output line is ``<rank>,<count / len(v)>,<key>``
            with ranks starting at 1. The file is overwritten and nothing is
            returned.
            """
            tally = Counter('_'.join(sorted(vi)) for vi in v)
            if options.number_of_top_pops <= 0:
                ranked = tally.most_common()
            else:
                ranked = tally.most_common(options.number_of_top_pops)
            n_items = len(v)
            with open('top_pops.txt', 'w') as f:
                for rank, (key, count_num) in enumerate(ranked, 1):
                    share = float(count_num) / n_items
                    f.write(str(rank) + ',' + str(share) + ',' + key + '\n')

        possible_summary_summaries['top_pops'] = write_top_pops
    if 'subgraph' in options.summary_summaries:
        subgraph_dicts = read_subgraphing_dict(options.subgraph_file)

        def subgraphing(trees):
            """Compute and save the top subgraphs for every subgraph specification.

            For each dictionary in ``subgraph_dicts`` the subgraphs over its
            'subgraph_keys' are obtained from ``trees`` via
            ``get_most_likely_subgraphs_list`` and handed to
            ``save_top_subgraphs`` together with all entries of that
            dictionary as keyword arguments. Nothing is returned.
            """
            for spec in subgraph_dicts:
                keys = spec['subgraph_keys']
                sgraphs = get_most_likely_subgraphs_list(strees=trees,
                                                         nodes=nodes,
                                                         subgraph_keys=keys)
                # The whole specification is forwarded as keyword arguments
                # alongside the explicit topologies/nodes arguments.
                save_top_subgraphs(topologies=sgraphs, nodes=keys, **spec)

        possible_summary_summaries['subgraph'] = subgraphing

    n = len(options.save_summaries)
    summary_summaries = options.summary_summaries
    while len(
            summary_summaries
    ) < n:  #repeat arguments until the number of arguments is correct
        summary_summaries += options.summary_summaries

    summary_summaries_functions = [
        possible_summary_summaries[summ] for summ in summary_summaries
    ]

    summ_results = summarize_all_results(all_results, options.save_summaries,
                                         summary_summaries_functions)
    res = []
    header = []
    with open(options.result_file, 'w') as f:
        for n, (summ_func_name, summ_name) in enumerate(
                zip(summary_summaries, options.save_summaries)):
            res.append(summ_results[n])
            header.append(summ_name + '_' + summ_func_name)
        f.write(','.join(['input_file'] + header) + '\n')
        f.write(','.join([options.input_file] + map(str, res)))

Example #19

0

Show file

File: downstream_analysis_tool.py Project: Tmacme/AdmixtureBayes

def get_list_of_turned_topologies(trees, true_tree):
    """Convert trees to identifiers built on the leaf keys of ``true_tree``.

    Returns a pair: the list of ``admixture_sorted_unique_identifier`` values
    for each tree in ``trees``, and the identifier of ``true_tree`` itself.
    All identifiers are computed with the leaf keys of ``true_tree``.
    """
    nodes = get_leaf_keys(true_tree)
    turned = []
    for tree in trees:
        turned.append(admixture_sorted_unique_identifier(tree, nodes))
    true_identifier = admixture_sorted_unique_identifier(true_tree, nodes)
    return turned, true_identifier

Example #20

0

Show file

File: Rtree_to_covariance_matrix.py Project: Tmacme/AdmixtureBayes

def make_newicks(tree, node_keys=None):
    if node_keys is None:
        node_keys = sorted(get_leaf_keys(tree))
    pops = [Population([1.0], [node]) for node in node_keys]
    ready_nodes = zip(node_keys, pops)