def get_admixtured_populations(tree):
    """Collect the admixed populations met while sweeping the tree leaf-to-root.

    Every leaf starts out as ``Population([1.0], [leaf])``. In each round the
    currently ready nodes are handed to ``leave_node_and_check_admixtures``;
    the updates it returns are queued through ``_add_to_waiting`` and
    ``_thin_out_dic`` then decides which nodes are ready for the next round.

    Returns the accumulated list of admixed populations once the only ready
    node is the one keyed ``"r"`` (presumably the root), or ``None`` if no
    node is ready before that happens.
    """
    leaf_keys = sorted(get_leaf_keys(tree))
    leaf_pops = [Population([1.0], [leaf]) for leaf in leaf_keys]
    frontier = zip(leaf_keys, leaf_pops)
    pending = {}
    visited = []
    covmat = dummy_covmat()
    admixed_populations = []

    while True:
        for node_key, population in frontier:
            updates, newly_admixed = leave_node_and_check_admixtures(
                node_key, tree[node_key], population, covmat)
            admixed_populations.extend(newly_admixed)
            for update in updates:
                pending = _add_to_waiting(pending, update, tree)
            visited.append(node_key)

        # A copy of the visited list is passed on, never the list itself.
        pending, frontier = _thin_out_dic(pending, list(visited))

        if len(frontier) == 0:
            return None
        only_root_left = len(frontier) == 1 and frontier[0][0] == "r"
        if only_root_left:
            return admixed_populations
def tree_to_ms_command(rtree, sample_per_pop=50, nreps=2, theta=0.4, sites=500000, recomb_rate=1, leaf_keys=None, final_pop_size=100.0):
    """Assemble the ``ms`` command string for a deep copy of ``rtree``.

    The command consists of the sample/replicate/theta header, either
    ``-r <recomb_rate> <sites>`` or (when ``recomb_rate`` is None)
    ``-s <sites>``, the ``-I`` population layout with ``sample_per_pop``
    samples for each of the tree's leaves, and finally whatever
    ``construct_ej_en_es_string`` produces for the rescaled tree.

    Before that last step the branch lengths are extended with the node
    times from ``get_timing`` and both tree and times are rescaled by
    ``drift_sum / count_sum``, where ``drift_sum`` is the sum of the original
    branch lengths and ``count_sum`` sums element ``[1]`` of the extended ones.

    ``leaf_keys`` defaults to ``get_leaf_keys`` of the rescaled tree.
    Returns the command as a single string; ``rtree`` itself is not modified.
    """
    tree = deepcopy(rtree)
    drift_sum = sum(get_all_branch_lengths(tree))

    if recomb_rate is not None:
        rec_part = ' -r ' + str(recomb_rate) + ' ' + str(sites)
    else:
        rec_part = ' -s ' + str(sites)

    n = get_no_leaves(tree)
    pieces = [
        'ms ', str(sample_per_pop * n), ' ', str(nreps),
        ' -t ', str(theta), ' ', rec_part, ' ',
        ' -I ', str(n), ' ', ' '.join([str(sample_per_pop)] * n), ' ',
    ]

    times = get_timing(tree)
    tree = extend_branch_lengths(tree, times)
    tuple_branch_lengths = get_all_branch_lengths(tree)
    count_sum = sum((length[1] for length in tuple_branch_lengths))
    tree = scaled_tupled_branches(tree, drift_sum / count_sum)

    rescaled_times = {}
    for node, time in times.items():
        rescaled_times[node] = time * drift_sum / count_sum

    if leaf_keys is None:
        leaf_keys = get_leaf_keys(tree)
    pieces.append(
        construct_ej_en_es_string(tree,
                                  rescaled_times,
                                  leaf_keys=leaf_keys,
                                  final_pop_size=final_pop_size))
    return ''.join(pieces)
def get_branches_to_keep(tree, subgraph_keys):
    """Sweep the tree leaf-to-root and return what ``leave_node`` collected.

    Each leaf (in the order given by ``get_leaf_keys``) starts as
    ``Population([1.0], [leaf])``. ``leave_node`` receives the shared
    ``target_nodes`` list and a ``follow_branch_class(subgraph_keys)``
    instance; this function itself never appends to ``target_nodes``, so its
    content comes from ``leave_node`` (presumably the branches to keep).

    Returns ``target_nodes`` once the only ready node is the one keyed
    ``"r"``, or ``None`` if the sweep runs out of ready nodes first.
    """
    leaf_keys = get_leaf_keys(tree)
    leaf_pops = [Population([1.0], [leaf]) for leaf in leaf_keys]
    follow_branch = follow_branch_class(subgraph_keys)
    frontier = zip(leaf_keys, leaf_pops)
    pending = {}
    visited = []
    target_nodes = []

    while True:
        for node_key, population in frontier:
            for update in leave_node(node_key, tree[node_key], population,
                                     target_nodes, follow_branch):
                pending = _add_to_waiting(pending, update, tree)
            visited.append(node_key)

        pending, frontier = _thin_out_dic(pending, list(visited))

        if len(frontier) == 0:
            return None
        if len(frontier) == 1 and frontier[0][0] == "r":
            return target_nodes
def addmix_with_correction(tree, new_node_names=None, pks=None, fixed_sink_source=None, new_branch_length=None, new_to_root_length=None):
    """Diagnostic: add an admixture and print how branch lengths map between trees.

    Calls ``addadmix`` (with ``check_opposite=False`` and
    ``preserve_root_distance=False``), builds the coefficient matrices ``A``
    (for ``tree``) and ``B`` (for the tree with the added admixture) over the
    same sorted leaf keys, and prints the branch-length vectors and their
    images under ``A``, ``B`` and the pseudo-inverses. Nothing is returned.

    Parameters
    ----------
    tree : the tree handed to ``addadmix``.
    new_node_names, fixed_sink_source, new_branch_length, new_to_root_length :
        forwarded unchanged to ``addadmix``.
    pks : dict or None
        Forwarded to ``addadmix``. ``None`` (the default) means a fresh empty
        dict per call; previously the default was one shared ``{}`` object,
        so anything ``addadmix`` stored in it would leak into later calls.
    """
    if pks is None:
        pks = {}

    # The proposal ratios returned by addadmix are not used by this diagnostic.
    added_tree, _, _ = addadmix(
        tree,
        new_node_names=new_node_names,
        pks=pks,
        fixed_sink_source=fixed_sink_source,
        new_branch_length=new_branch_length,
        new_to_root_length=new_to_root_length,
        check_opposite=False,
        preserve_root_distance=False)

    node_keys = sorted(get_leaf_keys(tree))
    A, _, bi1 = make_coefficient_matrix(tree, node_keys=node_keys)
    B, _, bi2 = make_coefficient_matrix(added_tree, node_keys=node_keys)

    x_A = get_specific_branch_lengths(tree, reverse_dic_to_list(bi1))
    x_B = get_specific_branch_lengths(added_tree, reverse_dic_to_list(bi2))

    Binverse = pinv(B)
    Ainverse = pinv(A)

    # Map the old branch lengths into the new tree and back again.
    tilde_x_B = Binverse.dot(A.dot(x_A))
    tilde_x_A = Ainverse.dot(B.dot(tilde_x_B))

    # print(x) with a single argument prints the same on Python 2 and 3.
    print(array(x_B))
    print(array(tilde_x_B))
    print(array(x_A))
    print(array(tilde_x_A))

    print(B.dot(x_B))
    print(A.dot(x_A))
    print(B.dot(tilde_x_B))
    print(A.dot(tilde_x_A))

    # Same round trip, this time starting from the new tree's branch lengths.
    tilde2_x_A = Ainverse.dot(B.dot(x_B))
    tilde2_x_B = Binverse.dot(A.dot(tilde2_x_A))

    print(B.dot(x_B))
    print(A.dot(x_A))
    print(B.dot(tilde2_x_B))
    print(A.dot(tilde2_x_A))
def make_coefficient_matrix(tree, node_keys=None, branch_keys=None):
    '''
    Compute the coefficient matrix C of the linear relation w=Cx instead of
    the covariance matrix itself. Per the original author's description, w
    holds covariance entries and x is the vector of branch lengths, so C is
    determined by the admixture proportions and the topology.

    node_keys defaults to the sorted leaf keys of the tree and branch_keys to
    get_all_branches(tree).

    Returns (matrix, ni, bi) where ni maps each node key and bi each branch
    to its position in node_keys / branch_keys. Returns None if the
    leaf-to-root sweep runs out of ready nodes before only "r" is left.
    '''
    if node_keys is None:
        node_keys = sorted(get_leaf_keys(tree))
    if branch_keys is None:
        branch_keys = get_all_branches(tree)

    leaf_pops = [Population([1.0], [leaf]) for leaf in node_keys]
    frontier = zip(node_keys, leaf_pops)

    ni = dict((leaf, position) for position, leaf in enumerate(node_keys))
    bi = dict((branch, position) for position, branch in enumerate(branch_keys))
    cofmat = Coefficient_Matrix(ni, bi, get_all_pairs(node_keys))

    pending = {}
    visited = []
    while True:
        for node_key, population in frontier:
            for update in leave_node(node_key, tree[node_key], population, cofmat):
                pending = _add_to_waiting(pending, update, tree)
            visited.append(node_key)

        pending, frontier = _thin_out_dic(pending, list(visited))

        if len(frontier) == 0:
            return None
        if len(frontier) == 1 and frontier[0][0] == "r":
            return cofmat.get_matrix(), ni, bi
def make_covariance(tree, node_keys=None, old_cov=False):
    """Fill a ``Covariance_Matrix2`` by sweeping the tree from leaves to root.

    ``node_keys`` (default: the sorted leaf keys of ``tree``) fixes the
    position of each leaf in the matrix. Every leaf starts as
    ``Population([1.0], [leaf])`` and ``leave_node`` is given the matrix
    object on each step.

    Returns ``covmat.get_matrix()`` once the only ready node is the one keyed
    ``"r"``, or ``None`` if the sweep runs out of ready nodes first.
    """
    if node_keys is None:
        node_keys = sorted(get_leaf_keys(tree))

    leaf_pops = [Population([1.0], [leaf]) for leaf in node_keys]
    frontier = zip(node_keys, leaf_pops)

    def _leaf_positions():
        # A fresh mapping for every matrix instance, as before.
        return dict((leaf, position) for position, leaf in enumerate(node_keys))

    covmat = Covariance_Matrix2(_leaf_positions())
    if old_cov:
        # NOTE(review): this rebuilds the same Covariance_Matrix2, so the flag
        # has no visible effect here - presumably an older matrix class was
        # intended; confirm before relying on old_cov.
        covmat = Covariance_Matrix2(_leaf_positions())

    pending = {}
    visited = []
    while True:
        for node_key, population in frontier:
            for update in leave_node(node_key, tree[node_key], population, covmat):
                pending = _add_to_waiting(pending, update, tree)
            visited.append(node_key)

        pending, frontier = _thin_out_dic(pending, list(visited))

        if len(frontier) == 0:
            return None
        if len(frontier) == 1 and frontier[0][0] == "r":
            return covmat.get_matrix()
def getcorrection(old_tree, new_tree, sigma):
    """Propose corrected branch lengths for ``new_tree`` and return the proposal ratio.

    Coefficient matrices are built for both trees over the sorted leaf keys
    of ``old_tree`` (``B`` for the old tree, ``A`` for the new one). ``mm`` is
    called with the numerator/denominator matrices assembled below and the
    new tree's current branch lengths as the starting value; its result is
    the proposal mean, to which independent normal noise with scale ``sigma``
    is added. The reverse mean is computed the same way with the roles of
    ``A`` and ``B`` swapped.

    Returns ``(new_tree, 1.0, ratio)`` where ``new_tree`` has had the listed
    branches set to the drawn values and ``ratio`` is
    ``exp(q_backward - q_forward)`` of the two normal log-densities.
    """
    leaf_keys = sorted(get_leaf_keys(old_tree))
    B, _, old_branch_index = make_coefficient_matrix(old_tree, node_keys=leaf_keys)
    A, _, _ = make_coefficient_matrix(new_tree, node_keys=leaf_keys)

    # Both trees are read and written through the old tree's branch list.
    branches = reverse_dic_to_list(old_branch_index)
    x_A = array(get_specific_branch_lengths(old_tree, branches))
    x_B = array(get_specific_branch_lengths(new_tree, branches))
    x_old = deepcopy(x_A)  # snapshot retained from the original; not read below
    n_branches = len(branches)

    # Forward move: old branch lengths -> mean of the proposal.
    forward_upper = x_A.dot(B.T.dot(A) + identity(n_branches))
    forward_lower = A.T.dot(A) + identity(n_branches)
    mu_new = mm(U=forward_upper, L=forward_lower, initial_value=x_B)
    noise = norm.rvs(scale=sigma, size=len(mu_new))
    x_new = mu_new + noise
    q_forward = sum(norm.logpdf(mu_new - x_new, scale=sigma))

    # Reverse move: drawn branch lengths -> mean of the way back.
    reverse_upper = x_new.dot((A.T.dot(B) + identity(n_branches)))
    reverse_lower = B.T.dot(B) + identity(n_branches)
    mu_reverse = mm(U=reverse_upper,
                    L=reverse_lower,
                    initial_value=array(x_new))
    q_backward = sum(norm.logpdf(mu_reverse - x_A, scale=sigma))

    # Write the drawn values into the new tree.
    new_tree = update_specific_branch_lengths(new_tree, branches, x_new)
    ratio = exp(q_backward - q_forward)
    return new_tree, 1.0, ratio
def simulate_P_from_tree(tree, no_individuals_per_population, no_snps):
    """Simulate data for ``tree`` with ``ms_simulate_wrapper`` and return frequencies.

    The wrapper is run with ``no_snps // 200`` replicates and otherwise fixed
    settings, writing to 'ms.txt' / 'treemix.txt'. The file it reports is
    read back with a ``'none'`` filter and the first returned array is
    divided by the second.

    Returns ``(p.T, None)`` where ``p`` is that quotient.
    """
    snp_filter = make_filter(filter_type='none')
    simulation_settings = dict(
        sample_per_pop=no_individuals_per_population,
        nreps=no_snps // 200,
        theta=0.4,
        sites=500000,
        recomb_rate=1.0,
        full_nodes=get_leaf_keys(tree),
        final_pop_size=100.0,
        ms_file='ms.txt',
        treemix_file='treemix.txt',
        time_adjust=False,
    )
    treemix_path = ms_simulate_wrapper(tree, **simulation_settings)
    counts, totals, _ = get_xs_and_ns_from_treemix_file(treemix_path, snp_filter)
    frequencies = counts / totals
    return frequencies.T, None
def getcorrection_adding(old_tree, new_tree, sigma, branches, U_matrix): node_keys = sorted(get_leaf_keys(old_tree)) A, _, _ = make_coefficient_matrix(old_tree, node_keys=node_keys, branch_keys=branches[:-3]) B, _, _ = make_coefficient_matrix(new_tree, node_keys=node_keys, branch_keys=branches) x_A = get_specific_branch_lengths(old_tree, branches[:-3]) x_B = get_specific_branch_lengths(new_tree, branches) B2 = B.dot(U_matrix) lambd = pinv(B2.dot(B2.T)).dot(A - B2).dot(x_A) mu_new = (B2.T).dot(lambd) + x_A x_new_reduced = mu_new + norm.rvs(scale=sigma, size=len(mu_new)) q_forward = reduce(mul, norm.pdf(mu_new - x_new_reduced, scale=sigma)) x_new = U_matrix.dot(x_new_reduced) print 'x_A', x_A print 'x_B', x_B print 'mu_new', mu_new print 'x_new_reduced', x_new_reduced print 'x_new', x_new reverse_lambd = pinv(A.dot(A.T)).dot(B2 - A).dot(x_new_reduced) reverse_mu_new = (A.T).dot(reverse_lambd) + x_new_reduced print 'matrix_rank , dimension (A)', matrix_rank(A), A.shape print 'matrix_rank , dimension (B)', matrix_rank(B), B.shape print 'mu_reverse', reverse_mu_new q_backward = reduce(mul, norm.pdf(reverse_mu_new - x_A, scale=sigma)) #wear the new values #print branches new_tree = update_specific_branch_lengths(new_tree, branches, x_new) return new_tree, q_forward, q_backward
def __call__(self, Rtree=None, add=None, **kwargs):
    """Return ``({'Rcov': matrix}, False)`` for one sampled tree.

    With ``Rtree`` given, the matrix is ``make_covariance`` over
    ``self.nodes`` plus ``float(add) * self.add_multiplier``.

    Without it, ``kwargs['full_tree']`` is used instead: the first leaf that
    is not in ``self.nodes`` is taken as the outgroup, the covariance is
    computed with that leaf placed first, and the result is passed through
    ``reduce_covariance(cov, 0)``.
    """
    if Rtree is not None:
        tree_cov = make_covariance(Rtree, node_keys=self.nodes)
        offset = float(add) * self.add_multiplier
        return {'Rcov': tree_cov + offset}, False

    full_tree = kwargs['full_tree']
    extra_leaves = set(get_leaf_keys(full_tree)) - set(self.nodes)
    outgroup_name = list(extra_leaves)[0]
    full_cov = make_covariance(full_tree,
                               node_keys=[outgroup_name] + self.nodes)
    return {'Rcov': reduce_covariance(full_cov, 0)}, False
def __call__(self, Rtree=None, **kwargs):
    """Return ``({'topology': identifier}, False)`` for one sampled tree.

    If a ``string_tree`` keyword is present, the topology is the text after
    its last ``'='`` and before the first ``';'`` that follows; no tree is
    inspected in that case.

    Otherwise the identifier is computed from ``Rtree``. When ``Rtree`` is
    None it is first derived from ``kwargs['full_tree']``: the first leaf
    not in ``self.nodes`` is treated as the outgroup, a deep copy of the full
    tree is re-rooted on it and the outgroup is removed.
    """
    if 'string_tree' in kwargs:
        after_equals = kwargs['string_tree'].rpartition('=')[2]
        return {'topology': after_equals.partition(';')[0]}, False

    if Rtree is None:
        full_tree = kwargs['full_tree']
        extra_leaves = set(get_leaf_keys(full_tree)) - set(self.nodes)
        outgroup = list(extra_leaves)[0]
        # Re-rooting removes the admixtures between the outgroup and the root.
        rerooted = rearrange_root_foolproof(deepcopy(full_tree), outgroup)
        Rtree = remove_outgroup(rerooted, outgroup)

    identifier = admixture_sorted_unique_identifier(Rtree,
                                                    leaf_order=self.nodes,
                                                    not_opposite=True)
    return {'topology': identifier}, False
def time_adjusted_tree_to_ms_command(time_adjusted_tree, sample_per_pop=50, nreps=2, theta=0.4, sites=500000, recomb_rate=1, leaf_keys=None, final_pop_size=100.0, verbose_level='normal'):
    """Assemble the ``ms`` command string for a deep copy of ``time_adjusted_tree``.

    The command consists of the sample/replicate/theta header, either
    ``-r <recomb_rate> <sites>`` or (when ``recomb_rate`` is None)
    ``-s <sites>``, the ``-I`` population layout with ``sample_per_pop``
    samples for each leaf, and the output of ``construct_ej_es_string`` for
    the tree after its branch lengths were extended with the node times from
    ``get_max_timing``.

    ``leaf_keys`` defaults to ``get_leaf_keys`` of the extended tree.
    ``verbose_level`` is accepted but not used here.
    Returns the command as a single string.
    """
    tree = deepcopy(time_adjusted_tree)

    if recomb_rate is not None:
        rec_part = ' -r ' + str(recomb_rate) + ' ' + str(sites)
    else:
        rec_part = ' -s ' + str(sites)

    n = get_no_leaves(tree)
    pieces = [
        'ms ', str(sample_per_pop * n), ' ', str(nreps),
        ' -t ', str(theta), ' ', rec_part, ' ',
        ' -I ', str(n), ' ', ' '.join([str(sample_per_pop)] * n), ' ',
    ]

    times = get_max_timing(tree)
    tree = extend_branch_lengths(tree, times)

    if leaf_keys is None:
        leaf_keys = get_leaf_keys(tree)
    pieces.append(
        construct_ej_es_string(tree,
                               times,
                               leaf_keys=leaf_keys,
                               final_pop_size=final_pop_size))
    return ''.join(pieces)
def get_populations(tree, min_w=0.0, keys_to_include=None):
    """Return the sorted, de-duplicated population strings found in ``tree``.

    The tree is swept from its leaves towards the root. Every population
    that becomes ready along the way - including the final one at the node
    keyed ``"r"`` - contributes ``pop.get_population_string(min_w,
    keys_to_remove)``.

    Parameters
    ----------
    tree : the tree to sweep.
    min_w : float
        Forwarded to ``get_population_string``.
    keys_to_include : iterable or None
        When given, every leaf key of the tree that is not listed here is
        passed to ``get_population_string`` as a key to remove. ``None``
        means nothing is removed.

    Returns
    -------
    list of str or None
        The unique non-empty population strings in sorted order, or ``None``
        if the sweep runs out of ready nodes before only ``"r"`` is left.
    """
    node_keys = sorted(get_leaf_keys(tree))
    if keys_to_include is None:
        keys_to_remove = []
    else:
        keys_to_remove = list(set(node_keys) - set(keys_to_include))

    pops = [Population([1.0], [node]) for node in node_keys]
    ready_nodes = zip(node_keys, pops)
    waiting_nodes = {}
    taken_nodes = []
    covmat = dummy_covmat()
    pop_strings = []

    while True:
        for key, pop in ready_nodes:
            pop_strings.append(
                pop.get_population_string(min_w, keys_to_remove))
            upds = leave_node(key, tree[key], pop, covmat)
            for upd in upds:
                waiting_nodes = _add_to_waiting(waiting_nodes, upd, tree)
            taken_nodes.append(key)

        waiting_nodes, ready_nodes = _thin_out_dic(waiting_nodes,
                                                   taken_nodes[:])

        if len(ready_nodes) == 0:
            return None
        if len(ready_nodes) == 1 and ready_nodes[0][0] == "r":
            big_pop = ready_nodes[0][1]
            pop_strings.append(
                big_pop.get_population_string(min_w, keys_to_remove))
            break

    # Drop empty strings AFTER de-duplicating. The previous list.remove('')
    # only deleted the first empty entry, so '' was still returned whenever
    # more than one population rendered as an empty string.
    unique_strings = set(pop_strings)
    unique_strings.discard('')
    return sorted(unique_strings)
def get_unique_plottable_tree(tree, nodes=None):
    """Return the smallest (in sort order) stree from ``get_possible_strees``.

    ``nodes`` defaults to the sorted leaf keys of ``tree``.
    """
    leaf_order = nodes if nodes is not None else sorted(get_leaf_keys(tree))
    candidates = list(get_possible_strees(tree, leaf_order))
    candidates.sort()
    return candidates[0]
default='', help='The file where the populations should be saved.') options = parser.parse_args() if options.input_type == 'tree': tree = identifier_file_to_tree_clean(options.input_file) if options.input_add: with open(options.input_add, 'r') as f: add = float(f.readline()) tree = add_outgroup(tree, inner_node_name='new_node', to_new_root_length=float(add), to_outgroup_length=0, outgroup_name=options.outgroup_name) nodes = get_leaf_keys(tree) assert all((a in nodes for a in options.populations )), 'Requested population was not found in the tree' subtree = get_subtree(tree, options.populations) if not options.output_file: options.output_file = options.input_file + '_'.join( options.populations) with open(options.output_file, 'w') as f: f.write(' '.join(sorted(options.populations)) + '\n') f.write(unique_identifier_and_branch_lengths(subtree)) if options.input_type == 'snps': if options.input_file.endswith('.gz'): options.input_file = unzip(options.input_file, overwrite=False) df = pd.read_csv(options.input_file, usecols=options.populations, sep=' ') if not options.output_file: options.output_file = options.input_file + '_'.join(
def treemix_file_to_admb_files(filename_treeout,
                               filename_vertices,
                               filename_edges,
                               outgroup=None,
                               snodes=None,
                               prefix='',
                               force=True,
                               return_format=[
                                   'None', 'arbitrary_rooted',
                                   'outgroup_rooted', 'outgroup_removed',
                                   'outgroup_removed_tuple'
                               ]):
    """Read a treemix result, save it at several stages and return one of them.

    The tree is read with ``read_treemix_file2`` and saved as
    ``<prefix>_treemix_arbitrary_rooted_tree.txt``. If ``outgroup`` is given,
    the tree is re-rooted on it (``rearrange_root_foolproof`` when ``force``
    is true, ``rearrange_root`` otherwise) and saved, then the outgroup is
    removed and both the remaining tree and the returned ``add`` distance are
    saved as well.

    Parameters
    ----------
    filename_treeout, filename_vertices, filename_edges :
        passed straight to ``read_treemix_file2``.
    outgroup : leaf name to root on and remove, or None to skip those steps.
    snodes : list of node names or None. When given it must match the tree's
        leaf keys (after the outgroup has been added to it, see below);
        when None the tree's own leaf keys are used.
    prefix : prepended to every output filename.
    force : selects the re-rooting function, see above.
    return_format : which value to return; the result of ``initor`` on this
        argument is compared against 'arbitrary_rooted', 'outgroup_rooted',
        'outgroup_removed' and 'outgroup_removed_tuple'. Any other value
        falls through and the function returns None.
        (presumably ``initor`` picks the first entry when the default list is
        left untouched - TODO confirm against its definition)

    NOTE(review): ``snodes`` is modified in place - the outgroup is appended
    when missing and removed again after the outgroup has been stripped - so
    the caller's list changes.
    NOTE(review): with ``outgroup=None`` the names ``outgroup_rooted``,
    ``outgroup_removed`` and ``add`` are never assigned; asking for one of
    those return formats then fails with a NameError.
    """
    return_format = initor(return_format)
    tree = read_treemix_file2(filename_treeout, filename_vertices,
                              filename_edges)
    # Keep an untouched copy before any re-rooting happens.
    arbitrary_rooted = deepcopy(tree)
    nodes = get_leaf_keys(tree)
    if snodes is not None:
        # NOTE(review): snodes_set is assigned here and below but never read.
        snodes_set = set(snodes)
        if outgroup is not None:
            snodes_set = set(snodes + [outgroup])
            if outgroup not in snodes:
                # The message says "beginning", but append() puts it last.
                warnings.warn(
                    'outgroup added to the beginning of the admbayes realization of the treemix mle, even though it is not requested in snodes.'
                )
                snodes.append(outgroup)
        assert set(nodes) == set(
            snodes
        ), 'the nodes of the treemix file does not match, the supplied nodes'
    else:
        snodes = nodes
    # Stage 1: the tree exactly as read from the treemix files.
    save_stage(tree,
               4,
               prefix='not_needed',
               full_nodes=snodes,
               before_added_outgroup_nodes=['not_needed'],
               after_reduce_nodes=['not_needed'],
               filename=prefix + '_treemix_arbitrary_rooted_tree.txt')
    if outgroup is not None:
        if force:
            tree = rearrange_root_foolproof(tree, outgroup)
        else:
            tree = rearrange_root(tree, outgroup)
        # Stage 2: the tree rooted on the outgroup.
        save_stage(tree,
                   4,
                   prefix='not_needed',
                   full_nodes=snodes,
                   before_added_outgroup_nodes=['not_needed'],
                   after_reduce_nodes=['not_needed'],
                   filename=prefix + '_treemix_outgroup_rooted_tree.txt')
        outgroup_rooted = deepcopy(tree)
        tree, add = remove_outgroup(tree,
                                    remove_key=outgroup,
                                    return_add_distance=True)
        # The outgroup is gone from the tree, so drop it from the node list too.
        snodes.remove(outgroup)
        # Stage 3: the tree without the outgroup, and the add distance.
        save_stage(tree,
                   4,
                   prefix='not_needed',
                   full_nodes=snodes,
                   before_added_outgroup_nodes=['not_needed'],
                   after_reduce_nodes=['not_needed'],
                   filename=prefix +
                   '_treemix_outgroup_rooted_removed_tree.txt')
        save_stage(add,
                   2,
                   prefix='not_needed',
                   full_nodes=snodes,
                   before_added_outgroup_nodes=['not_needed'],
                   after_reduce_nodes=['not_needed'],
                   filename=prefix +
                   '_treemix_outgroup_rooted_removed_add.txt')
        outgroup_removed = deepcopy(tree)
    if return_format == 'arbitrary_rooted':
        return arbitrary_rooted
    if return_format == 'outgroup_rooted':
        return outgroup_rooted
    if return_format == 'outgroup_removed':
        return outgroup_removed
    if return_format == 'outgroup_removed_tuple':
        return outgroup_removed, add
def tree_to_covariance(stree):
    """Parse ``stree`` with ``identifier_to_tree_clean`` and return its covariance.

    The covariance is computed by ``make_covariance`` with the leaves in
    sorted order.
    """
    parsed_tree = identifier_to_tree_clean(stree)
    leaf_order = sorted(get_leaf_keys(parsed_tree))
    return make_covariance(parsed_tree, node_keys=leaf_order)
def run_posterior_main(args):
    """Command-line driver for the post analysis of an output file.

    ``args`` is the argument list handed to ``ArgumentParser.parse_args``.
    The function

    1. parses the options,
    2. reads the "true" and empirical reference values via
       ``read_true_values``,
    3. optionally converts treemix results into a csv file that is then
       analysed instead of ``--input_file``,
    4. builds one row-summary object per requested summary,
    5. runs ``iterate_over_output_file`` with those summaries, and
    6. writes the result to ``--result_file``.

    Unless ``--summarize_posterior_distributions`` is truthy, step 6 writes
    one csv row per retained sample and then calls ``sys.exit()``. Otherwise
    every saved summary is collapsed with its "summary summary" function and
    a header line plus a single value line is written.

    Several summary-summary functions write additional files with fixed
    names in the working directory ('node_counts.txt', 'top_pops.txt').
    """
    # Name -> factory for every row summary that can be requested.
    possible_summaries = {
        'Rtree': make_Rtree,
        'full_tree': make_full_tree,
        'string_tree': make_string_tree,
        'Rcov': make_Rcovariance,
        'cov_dist': cov_truecov,
        'topology': topology,
        'subgraph': subgraph,
        'subsets': subsets,
        'top_identity': topology_identity,
        'pops': get_pops,
        'set_differences': compare_pops,
        'no_sadmixes': extract_number_of_sadmixes
    }
    possible_summaries.update(all_custom_summaries())
    print possible_summaries
    parser = ArgumentParser(usage='pipeline for post analysis',
                            version='1.0.0')

    parser.add_argument('--input_file', required=True, type=str, help='The output file from an AdmixtureBayes run.')
    parser.add_argument('--covariance_matrix_file', required=True, type=str, help='file containing the covariance matrix with a header with all the population names and a line with the multiplier. It has the ending covariance_and_multiplier.txt.')
    parser.add_argument('--subnodes', default=[], type=str, nargs='+', help='The subset of populations to perform the analysis on. If not declared, the analysis will be done on the full dataset.')
    parser.add_argument('--result_file', default='posterior_distributions.csv', type=str, help='The resulting file. It will be comma-separated and contain one column per summary plus a header.')
    parser.add_argument('--prefix', default='', type=str, help='place to put the temporary files')
    parser.add_argument('--total', default=886, type=int, help='an upper limit on the number of rows to reduce computational pressure')
    parser.add_argument('--burn_in_fraction', default=0.5, type=float, help='the proportion of the rows that are discarded as burn in period')
    parser.add_argument('--calculate_summaries', default=['Rtree', 'pops', 'full_tree', 'string_tree', 'topology'], choices=possible_summaries.keys(), nargs='*', type=str, help='The summaries to calculate')
    parser.add_argument('--save_summaries', default=['no_admixes', 'topology', 'pops', 'string_tree'], nargs='*', type=str, help='The list of summaries to save')
    parser.add_argument('--custom_summaries', default=[], nargs='*', choices=possible_summaries.keys(), help='This will add summaries (to both calculate_summaries and save_summaries). They are defined in the class custom_summary.py.')
    # NOTE(review): no type= or action= is given, so a value supplied on the
    # command line arrives as a string and any non-empty string (including
    # 'False') is truthy in the check further down.
    parser.add_argument('--summarize_posterior_distributions', default=False, help='If set to true, the posterior distibutions will be summarized even further.')
    parser.add_argument('--min_w', default=0.0, type=float, help='a lower threshold of which descendants matter when the consensus_method is descendant_frequencies.')
    # list + map(...) relies on map() returning a list (Python 2).
    # NOTE(review): the help text mentions negative values, but the choices
    # below only allow '', 'true_val' and '0'..'20'.
    parser.add_argument('--constrain_number_of_admixes', default='', type=str, choices=['', 'true_val'] + map(str, range(21)), help='The number of admixture events that there are constrained on in the data set. If negative there are no constraints')
    parser.add_argument('--constrain_number_of_effective_admixes', default='', choices=['', 'true_val'] + map(str, range(21)), type=str, help='The number of effective(visible)_admixture events that there are constrained on in the data set. If negative there are no constraints.')
    parser.add_argument('--constrain_sadmix_trees', default=False, action='store_true', help='this will remove the graphs which has invisible admixtures. This will produce worse, but more easily interpretable results.')
    parser.add_argument('--no_sort', default=False, action='store_true', help='often the tree is sorted according to the leaf names. no_sort willl assumed that they are not sorted according to this but sorted according to ')
    parser.add_argument('--use_cols', default=['tree', 'add', 'layer', 'no_admixes'], type=str, nargs='+', help='The columns to load from the input file')
    parser.add_argument('--outgroup_name', default='', type=str, help='name of the outgroup')
    parser.add_argument('--emp_m_scale', type=str, default='')
    parser.add_argument('--emp_variance_correction', type=str, default='')
    parser.add_argument('--emp_df', type=str, default='')
    parser.add_argument('--emp_covariance_and_multiplier', default='', type=str)
    parser.add_argument('--emp_covariance_reduced', default='', type=str)
    parser.add_argument('--choice_if_no_thinned_graphs', default='error', choices=['error', 'nearest_admixture_events'], help='If the thinning leaves no graphs left, this is what will be done in stead. error will throw an error and nearest_admixture_events will expand the band of allowed number of admixture events(if the chain has been thinned on number of admixture events).')
    parser.add_argument('--test_run', default=False, action='store_true', help='will overwrite everything and run a test function')
    parser.add_argument('--summary_summaries', default=['mean'], nargs='*', type=str, help='How each list is summarized as a single, numerical value. If it doesnt have the same length as save summaries the arguments will be repeated until it does')
    parser.add_argument('--number_of_top_pops', default=10, type=int, help='if top_pops is added to summary_summaries this is the number of set topologies saved. negative values means all topologies are saved.')
    parser.add_argument('--true_scaled_tree', type=str, default='')
    parser.add_argument('--true_tree', type=str, default='')
    parser.add_argument('--true_add', type=str, default='')
    parser.add_argument('--true_covariance_reduced', type=str, default='')
    parser.add_argument('--true_covariance_and_multiplier', type=str, default='')
    parser.add_argument('--true_no_admix', type=str, default='')
    # NOTE(review): the help text below has a file path pasted into the
    # middle of the sentence; it is shown to users as-is by --help.
    parser.add_argument('--treemix_post_analysis', action='store_true', default=False, help='this will convert the treemix input fil ../../../../Dropbox/Bioinformatik/AdmixtureBayes/test_final_grid/ai_2_5true/_true_tree.txtes into a suitable csv file for ')
    parser.add_argument('--treemix_tree', default='', type=str, help='')
    parser.add_argument('--treemix_add', default='', type=str, help='')
    parser.add_argument('--treemix_full_tree', default='')
    parser.add_argument('--treemix_csv_output', default='treemix.csv', type=str, help='')
    parser.add_argument('--subgraph_file', default='', type=str, help='file where each line has a space separated list of leaf labels to calculate subtrees from. If a double underscore(__) occurs, it means that the following two arguments are max number of sub topologies and total posterior probability.')

    options = parser.parse_args(args)

    # 'string_tree' may only be requested together with 'full_tree'.
    assert not ('string_tree' in options.calculate_summaries
                and not 'full_tree' in options.calculate_summaries
                ), 'The full tree flag is needed for the string tree'
    if 'full_tree' in options.calculate_summaries:
        assert options.outgroup_name, 'The outgroup is specified to calculate the full tree'

    # Build the subnode list in two flavours: with and without the outgroup.
    if options.subnodes:
        assert options.outgroup_name, 'when '
        if options.outgroup_name in options.subnodes:
            subnodes_with_outgroup = options.subnodes
            subnodes_wo_outgroup = deepcopy(options.subnodes)
            subnodes_wo_outgroup.remove(options.outgroup_name)
        else:
            subnodes_with_outgroup = deepcopy(
                options.subnodes) + [options.outgroup_name]
            subnodes_wo_outgroup = options.subnodes
    else:
        subnodes_with_outgroup = []
        subnodes_wo_outgroup = []

    # Reference ("true") values; only some of the nine returned fields are kept.
    outp = read_true_values(
        true_scaled_tree=options.true_scaled_tree,
        true_tree=options.true_tree,
        true_add=options.true_add,
        true_covariance_reduced=options.true_covariance_reduced,
        true_covariance_and_multiplier=options.true_covariance_and_multiplier,
        true_no_admix=options.true_no_admix,
        subnodes_with_outgroup=subnodes_with_outgroup,
        subnodes_wo_outgroup=subnodes_wo_outgroup)
    true_scaled_tree, true_tree, true_add, true_covariance_reduced, (
        true_covariance_scaled, true_multiplier), true_no_admix, _, _, _ = outp
    # Empirical values, read through the same helper; `multiplier` from here
    # is used for scaling further down.
    outp = read_true_values(
        true_covariance_reduced=options.emp_covariance_reduced,
        true_covariance_and_multiplier=options.covariance_matrix_file,
        true_m_scale=options.emp_m_scale,
        subnodes_with_outgroup=subnodes_with_outgroup,
        subnodes_wo_outgroup=subnodes_wo_outgroup)
    _, _, _, emp_covariance_reduced, (
        emp_covariance_scaled, multiplier), _, emp_m_scale, vc, df = outp

    # Treemix mode: write the csv file that is analysed instead of --input_file.
    if options.treemix_post_analysis:
        if not options.treemix_full_tree:
            outp = read_true_values(
                true_tree=options.treemix_tree,
                true_add=options.treemix_add,
                subnodes_with_outgroup=subnodes_with_outgroup,
                subnodes_wo_outgroup=subnodes_wo_outgroup)
            _, treemix_tree, treemix_add, _, _, _, _, _, _ = outp
            create_treemix_csv_output(treemix_tree, treemix_add * multiplier,
                                      emp_m_scale, options.treemix_csv_output)
        elif options.treemix_full_tree:
            outp = read_true_values(
                true_scaled_tree=options.treemix_full_tree,
                subnodes_with_outgroup=subnodes_with_outgroup,
                subnodes_wo_outgroup=subnodes_wo_outgroup)
            full_treemix_tree, _, _, _, _, _, _, _, _ = outp
            create_treemix_sfull_tree_csv_output(full_treemix_tree,
                                                 emp_m_scale,
                                                 options.treemix_csv_output)
            full_nodes = sorted(get_leaf_keys(full_treemix_tree))

    # Choose the object that thins the rows before they are summarised.
    if options.constrain_number_of_admixes:
        if options.constrain_number_of_admixes == 'true_val':
            thinner = thinning_on_admixture_events(
                burn_in_fraction=options.burn_in_fraction,
                total=options.total,
                no_admixes=true_no_admix,
                if_no_trees=options.choice_if_no_thinned_graphs)
        else:
            # NOTE(review): the option is declared with type=str, so
            # no_admixes is passed as a string here - confirm that
            # thinning_on_admixture_events expects that.
            thinner = thinning_on_admixture_events(
                burn_in_fraction=options.burn_in_fraction,
                total=options.total,
                no_admixes=options.constrain_number_of_admixes,
                if_no_trees=options.choice_if_no_thinned_graphs)
    else:
        thinner = thinning(burn_in_fraction=options.burn_in_fraction,
                           total=options.total)

    # Population names come from the first line of the covariance file.
    nodes = read_one_line(options.covariance_matrix_file).split()
    if not options.no_sort:
        nodes = sorted(nodes)

    row_sums = []

    class pointers(object):
        # Remembers, for every registered name, the running count at the
        # time of registration. It is called right after each
        # row_sums.append(...), so the stored number is that entry's index.

        def __init__(self):
            self.count = 0
            self.dic = {}

        def __call__(self, name):
            self.dic[name] = self.count
            self.count += 1

        def __getitem__(self, key):
            return self.dic[key]

    name_to_rowsum_index = pointers()

    # Name -> function that collapses one saved column into a single value.
    possible_summary_summaries = {'mean': float_mean}

    # Summaries handled explicitly below; everything else in
    # possible_summaries is constructed without arguments in the generic loop.
    special_summaries = [
        'Rtree', 'full_tree', 'string_tree', 'subgraph', 'Rcov', 'cov_dist',
        'topology', 'top_identity', 'pops', 'subsets', 'set_differences',
        'no_admixes'
    ]
    if 'Rtree' in options.calculate_summaries:
        row_sums.append(possible_summaries['Rtree'](
            deepcopy(nodes),
            options.constrain_sadmix_trees,
            subnodes=subnodes_wo_outgroup))
        name_to_rowsum_index('Rtree')
    if 'full_tree' in options.calculate_summaries:
        if multiplier is None:
            add_multiplier = 1.0
        else:
            add_multiplier = 1.0 / multiplier
        row_sums.append(possible_summaries['full_tree'](
            add_multiplier=add_multiplier,
            outgroup_name=options.outgroup_name,
            remove_sadtrees=options.constrain_sadmix_trees,
            subnodes=subnodes_with_outgroup))
        name_to_rowsum_index('full_tree')
    if 'string_tree' in options.calculate_summaries:
        if options.subnodes:
            row_sums.append(possible_summaries['string_tree'](
                deepcopy(subnodes_wo_outgroup), options.outgroup_name,
                tree_unifier()))
        else:
            row_sums.append(possible_summaries['string_tree'](
                deepcopy(nodes), options.outgroup_name, tree_unifier()))
        name_to_rowsum_index('string_tree')
    # From here on `nodes` is the subnode list when subnodes were requested.
    if options.subnodes:
        nodes = subnodes_wo_outgroup
        full_nodes = sorted(list(set(nodes[:] + [options.outgroup_name])))
    if 'subgraph' in options.calculate_summaries:
        subgraph_dicts = read_subgraphing_dict(options.subgraph_file,
                                               types=['full'])
        for dic in subgraph_dicts:
            skeys = dic['subgraph_keys']
            identifier = '.'.join(skeys)
            code = 'subgraph_' + identifier
            sum_func = possible_summaries['subgraph'](skeys, identifier)
            row_sums.append(sum_func)
            name_to_rowsum_index(code)
            # Each subgraph is saved under its own column and summarised by
            # its own summarise method.
            options.save_summaries.append(code)
            options.summary_summaries.append(code)
            possible_summary_summaries[code] = sum_func.summarise
    if 'Rcov' in options.calculate_summaries:
        # NOTE(review): unlike the 'full_tree' branch above there is no guard
        # for multiplier being None here.
        row_sums.append(possible_summaries['Rcov'](deepcopy(nodes),
                                                   add_multiplier=1.0 /
                                                   multiplier))
        name_to_rowsum_index('Rcov')
    if 'cov_dist' in options.calculate_summaries:
        row_sums.append(
            possible_summaries['cov_dist'](true_covariance_reduced))
        name_to_rowsum_index('cov_dist')
    if 'topology' in options.calculate_summaries:
        row_sums.append(possible_summaries['topology'](nodes=nodes))
        name_to_rowsum_index('topology')
    if 'top_identity' in options.calculate_summaries:
        row_sums.append(possible_summaries['top_identity'](true_tree,
                                                           nodes=nodes))
        name_to_rowsum_index('top_identity')
    if 'pops' in options.calculate_summaries:
        row_sums.append(possible_summaries['pops'](min_w=options.min_w,
                                                   keys_to_include=nodes))
        name_to_rowsum_index('pops')
    if 'subsets' in options.calculate_summaries:
        subgraph_dicts = read_subgraphing_dict(options.subgraph_file,
                                               types=['topological'])
        for dic in subgraph_dicts:
            skeys = dic['subgraph_keys']
            identifier = '.'.join(skeys)
            code = 'subsets_' + identifier
            sum_func = possible_summaries['subsets'](identifier=identifier,
                                                     **dic)
            row_sums.append(sum_func)
            name_to_rowsum_index(code)
            options.save_summaries.append(code)
            options.summary_summaries.append(code)
            possible_summary_summaries[code] = sum_func.summarise
    if 'set_differences' in options.calculate_summaries:
        row_sums.append(possible_summaries['set_differences'](
            true_tree, min_w=options.min_w, keys_to_include=nodes))
        name_to_rowsum_index('set_differences')
    if 'no_sadmixes' in options.calculate_summaries:
        # NOTE(review): 'true_val' is an allowed choice for this option but
        # int('true_val') raises ValueError.
        if options.constrain_number_of_effective_admixes:
            no_effective_admixes = int(
                options.constrain_number_of_effective_admixes)
        else:
            no_effective_admixes = None
        row_sums.append(
            possible_summaries['no_sadmixes'](no_effective_admixes))
        name_to_rowsum_index('no_sadmixes')
    # Generic loop for every summary that has no dedicated branch above.
    # 'no_sadmixes' is not listed in special_summaries ('no_admixes' is), so
    # when requested it is also constructed here, without arguments.
    for summary in possible_summaries:
        if summary not in special_summaries:
            if summary in options.calculate_summaries or summary in options.custom_summaries:
                row_sums.append(possible_summaries[summary]())
                name_to_rowsum_index(summary)

    print row_sums

    def save_thin_columns(d_dic):
        # Keep only the columns that are to be saved; reads
        # options.save_summaries at call time, so the codes appended above
        # are included.
        return {
            summ: d_dic[summ]
            for summ in list(
                set(options.save_summaries + options.custom_summaries))
        }

    if options.treemix_post_analysis:
        # full_nodes is only defined when a full treemix tree or subnodes
        # were supplied, which is why it is passed on in this case only.
        if options.treemix_full_tree:
            constant_kwargs = {'full_nodes': full_nodes}
        else:
            constant_kwargs = {}
        all_results, _ = iterate_over_output_file(
            options.treemix_csv_output,
            cols=options.use_cols,
            pre_thin_data_set_function=thinner,
            while_thin_data_set_function=always_true,
            row_summarize_functions=row_sums,
            thinned_d_dic=save_thin_columns,
            full_summarize_functions=[],
            **constant_kwargs)
    else:
        all_results, _ = iterate_over_output_file(
            options.input_file,
            cols=options.use_cols,
            pre_thin_data_set_function=thinner,
            while_thin_data_set_function=always_true,
            row_summarize_functions=row_sums,
            thinned_d_dic=save_thin_columns,
            full_summarize_functions=[])

    if not options.summarize_posterior_distributions:
        # Plain output: one csv row per retained sample, then stop the
        # interpreter. all_results[0] raises IndexError when nothing is left.
        summaries = all_results[0].keys()
        with open(options.result_file, 'w') as f:
            f.write(','.join(summaries) + '\n')
            for row in all_results:
                s_summs = [str(row[summ]) for summ in summaries]
                f.write(','.join(s_summs) + '\n')
        sys.exit()

    def save_wrapper(filename):
        # NOTE(review): the inner `save` is defined but never returned, and
        # save_wrapper is not called anywhere in this function.

        def save(listi):
            with open(filename, 'w') as f:
                for ele in listi:
                    f.write(str(ele))

    # The blocks below register optional summary-summary functions; each is
    # only defined when its name was requested.
    if 'mode_topology_compare' in options.summary_summaries or 'mode_topology' in options.summary_summaries:

        def mode_topology_compare(v):
            # Feed the most common topology to the registered 'top_identity'
            # row summary; raises KeyError if that summary was not calculated.
            a = mode(v)
            return row_sums[name_to_rowsum_index['top_identity']](
                a)[0]['top_identity']

        def mode_topology(v):
            a = mode(v)
            return (a)

        possible_summary_summaries['mode_topology'] = mode_topology
        possible_summary_summaries[
            'mode_topology_compare'] = mode_topology_compare
    if 'mode_pops_compare' in options.summary_summaries or 'mode_pops' in options.summary_summaries:

        def mode_pops(v):
            # Each entry is joined into one '-'-separated string so that
            # whole population sets can be compared by mode().
            v2 = ['-'.join(sorted(vi)) for vi in v]
            vmax_s = mode(v2)
            return vmax_s

        def mode_pops_compare(v):
            v2 = ['-'.join(sorted(vi)) for vi in v]
            vmax_s = mode(v2)
            vmax = vmax_s.split('-')
            return row_sums[name_to_rowsum_index['set_differences']](
                vmax)[0]['set_differences']

        possible_summary_summaries['mode_pops_compare'] = mode_pops_compare
        possible_summary_summaries['mode_pops'] = mode_pops
    if 'node_count' in options.summary_summaries:

        def count_measure(v):
            # Writes "<key> <fraction of rows containing it>" for up to 4000
            # keys to node_counts.txt; returns None.
            ad = Counter([a for k in v for a in k])
            l = ad.most_common(4000)
            total = len(v)
            with open('node_counts.txt', 'w') as f:
                for key, count_num in l:
                    f.write(key + ' ' + str(float(count_num) / total) + '\n')

        possible_summary_summaries['node_count'] = count_measure
    if 'top_pops' in options.summary_summaries:

        def write_top_pops(v):
            # Writes "<rank>,<frequency>,<joined set>" lines to top_pops.txt;
            # a non-positive --number_of_top_pops means "all". Returns None.
            v2 = ['_'.join(sorted(vi)) for vi in v]
            ad = Counter(v2)
            if options.number_of_top_pops <= 0:
                l = ad.most_common()
            else:
                l = ad.most_common(options.number_of_top_pops)
            total = len(v)
            with open('top_pops.txt', 'w') as f:
                for n, (key, count_num) in enumerate(l):
                    f.write(
                        str(n + 1) + ',' + str(float(count_num) / total) +
                        ',' + key + '\n')

        possible_summary_summaries['top_pops'] = write_top_pops
    if 'subgraph' in options.summary_summaries:
        subgraph_dicts = read_subgraphing_dict(options.subgraph_file)

        def subgraphing(trees):
            for dic in subgraph_dicts:
                sgraphs = get_most_likely_subgraphs_list(
                    strees=trees,
                    nodes=nodes,
                    subgraph_keys=dic['subgraph_keys'])
                save_top_subgraphs(topologies=sgraphs,
                                   nodes=dic['subgraph_keys'],
                                   **dic)

        possible_summary_summaries['subgraph'] = subgraphing

    n = len(options.save_summaries)
    # summary_summaries is the same list object as options.summary_summaries,
    # so += below extends that list with itself (doubling it each round).
    # NOTE(review): an empty --summary_summaries with a non-empty
    # --save_summaries would make this loop run forever.
    summary_summaries = options.summary_summaries
    while len(
            summary_summaries
    ) < n:  # repeat the arguments until there are at least as many as saved summaries
        summary_summaries += options.summary_summaries
    summary_summaries_functions = [
        possible_summary_summaries[summ] for summ in summary_summaries
    ]
    summ_results = summarize_all_results(all_results, options.save_summaries,
                                         summary_summaries_functions)
    res = []
    header = []
    with open(options.result_file, 'w') as f:
        # zip() stops at the shorter list, so surplus summary-summary names
        # are ignored here.
        for n, (summ_func_name, summ_name) in enumerate(
                zip(summary_summaries, options.save_summaries)):
            res.append(summ_results[n])
            header.append(summ_name + '_' + summ_func_name)
        f.write(','.join(['input_file'] + header) + '\n')
        # list + map(...) relies on map() returning a list (Python 2).
        f.write(','.join([options.input_file] + map(str, res)))
def get_list_of_turned_topologies(trees, true_tree):
    """Return the topology identifiers of ``trees`` and of ``true_tree``.

    All identifiers come from ``admixture_sorted_unique_identifier`` and use
    the leaf keys of ``true_tree`` as the common leaf order.

    Returns ``(identifiers_of_trees, identifier_of_true_tree)``.
    """
    leaf_order = get_leaf_keys(true_tree)
    identifiers = []
    for candidate in trees:
        identifiers.append(
            admixture_sorted_unique_identifier(candidate, leaf_order))
    true_identifier = admixture_sorted_unique_identifier(true_tree, leaf_order)
    return identifiers, true_identifier
def make_newicks(tree, node_keys=None): if node_keys is None: node_keys = sorted(get_leaf_keys(tree)) pops = [Population([1.0], [node]) for node in node_keys] ready_nodes = zip(node_keys, pops)