def test_replacing_hgt(self):
    """Check leaf and color counts of an HGT-only simulation with replacement.

    A species tree with ``N`` leaves is simulated, then a gene tree with
    ``dupl_rate=0.0``, ``loss_rate=0.0``, ``hgt_rate=1.0`` and
    ``replace_prob=1.0``. The test requires the observable gene tree to have
    exactly ``N`` leaves carrying ``N`` distinct ``color`` values.
    """
    N = 20
    S = te.simulate_species_tree(N, model='innovation')

    # true gene tree (with losses)
    TGT = te.simulate_dated_gene_tree(
        S,
        dupl_rate=0.0,
        loss_rate=0.0,
        hgt_rate=1.0,
        prohibit_extinction='per_species',
        replace_prob=1.0,
    )

    # observable gene tree
    OGT = te.observable_tree(TGT)

    leaves = list(OGT.leaves())
    colors = {v.color for v in leaves}

    # Two separate equality assertions so that a failure reports which of
    # the two counts deviates from N, and by how much.
    self.assertEqual(len(colors), N)
    self.assertEqual(len(leaves), N)
def test_rs_edges(self):
    """Check that the edges from ``rs_transfer_edges`` are all true transfer edges.

    Both edge collections are computed on the same observable gene tree; the
    result of ``true_transfer_edges`` must be a superset of the result of
    ``rs_transfer_edges``.
    """
    species_tree = te.simulate_species_tree(10)
    true_gene_tree = te.simulate_dated_gene_tree(
        species_tree,
        dupl_rate=1.0,
        loss_rate=0.5,
        hgt_rate=0.5,
    )
    observable_tree = te.observable_tree(true_gene_tree)

    true_edges = analysis.true_transfer_edges(observable_tree)
    rs_edges = analysis.rs_transfer_edges(observable_tree, species_tree)

    self.assertTrue(true_edges.issuperset(rs_edges))
def generate_solutions_unique_species(n, i_p=0.5, d_p=0.5):
    """Run exact LDT editing on 100 simulated graphs with ``n`` nodes.

    Species/gene tree pairs are simulated repeatedly; only pairs whose LDT
    graph has exactly ``n`` nodes are used. Each such graph is perturbed via
    ``InvestigateGraph.perturb_graph(i_p, d_p)`` and edited with
    ``LDTEditor``. A solution is stored through ``_save_ILP_data`` only if it
    is properly colored, a cograph and compatible.

    Args:
        n: Required number of nodes of the LDT graph.
        i_p: First argument handed to ``perturb_graph`` (presumably the
            insertion probability -- confirm against ``InvestigateGraph``).
        d_p: Second argument handed to ``perturb_graph`` (presumably the
            deletion probability -- confirm against ``InvestigateGraph``).
    """
    # NOTE(review): assumes every graph of the requested size counts toward
    # the 100 runs, whether or not its solution was saved -- confirm.
    solved = 0
    while solved < 100:
        species_tree = te.simulate_species_tree(10, model='innovation')
        true_gene_tree = te.simulate_dated_gene_tree(
            species_tree,
            dupl_rate=0.5,
            loss_rate=0.5,
            hgt_rate=0.5,
            prohibit_extinction="per_family",
            replace_prob=0.0)
        observable_tree = te.observable_tree(true_gene_tree)
        ldt = ldt_graph(observable_tree, species_tree)

        # Graphs of any other size are discarded and a new pair is simulated.
        if len(ldt.nodes()) != n:
            continue

        investigator = InvestigateGraph(ldt)
        investigator.perturb_graph(i_p, d_p)
        perturbed_graph = investigator._G_perturbed

        solver = LDTEditor(perturbed_graph)
        solver.build_model()
        solver.optimize(time_limit=None)
        solution, _ = solver.get_solution()

        # All three checks are evaluated before anything is reported.
        is_colored = is_properly_colored(solution)
        is_co = is_cograph(solution)
        is_compat = is_compatible(solution)

        distance = gt.symmetric_diff(perturbed_graph, solution)
        print("Runtime: {}".format(solver.get_solve_time()))

        if not (is_colored and is_co and is_compat):
            print("No solution found!")
        else:
            print("Saving data...")
            solver._save_ILP_data(
                perturbed_graph,
                solution,
                solver.get_solve_time(),
                distance,
                only_add=False,
                only_delete=False,
                filename="{}nodes/LDTEdit_exact_solution".format(n))

        solved += 1
def generate_trees(n=100, m=10, model='innovation', dupl_rate=0.5,
                   loss_rate=0.5, hgt_rate=0.5,
                   prohibit_extinction="per_family", replace_prob=0.0,
                   size=10):
    """Simulate and serialize tree pairs whose LDT graph has ``size`` nodes.

    Pairs are written to ``exact_solutions/trees/<size>trees``. If that
    directory does not exist it is created and numbering starts at 0;
    otherwise numbering continues from ``find_next_ID``.

    Args:
        n: Number of tree pairs to store.
        m: First argument of ``te.simulate_species_tree``.
        model: Species tree model, forwarded to ``te.simulate_species_tree``.
        dupl_rate: Forwarded to ``te.simulate_dated_gene_tree``.
        loss_rate: Forwarded to ``te.simulate_dated_gene_tree``.
        hgt_rate: Forwarded to ``te.simulate_dated_gene_tree``.
        prohibit_extinction: Forwarded to ``te.simulate_dated_gene_tree``.
        replace_prob: Forwarded to ``te.simulate_dated_gene_tree``.
        size: Required number of nodes of the LDT graph.
    """
    target_dir = 'exact_solutions/trees/{}trees'.format(size)

    # Start numbering at 0 for a fresh folder, else continue after the
    # files that are already there.
    if os.path.exists(target_dir):
        ID = find_next_ID('exact_solutions/trees/{}trees/'.format(size))
    else:
        os.makedirs(target_dir)
        ID = 0

    # NOTE(review): assumes only stored pairs count toward ``n``; pairs with
    # a different graph size are simulated again -- confirm.
    stored = 0
    while stored < n:
        species_tree = te.simulate_species_tree(m, model=model)
        true_gene_tree = te.simulate_dated_gene_tree(
            species_tree,
            dupl_rate=dupl_rate,
            loss_rate=loss_rate,
            hgt_rate=hgt_rate,
            prohibit_extinction=prohibit_extinction,
            replace_prob=replace_prob)
        observable_tree = te.observable_tree(true_gene_tree)
        ldt = ldt_graph(observable_tree, species_tree)

        if len(ldt.nodes()) != size:
            continue

        # The file names encode the simulation parameters and the shared ID.
        species_file = 'exact_solutions/trees/{}trees/species_{}_{}_{}.json'.format(
            size, m, model, ID)
        gene_file = 'exact_solutions/trees/{}trees/gene_{}_{}_{}_{}_{}_{}.json'.format(
            size, dupl_rate, loss_rate, hgt_rate, prohibit_extinction,
            replace_prob, ID)
        species_tree.serialize(species_file)
        true_gene_tree.serialize(gene_file)

        ID += 1
        stored += 1
def test_ldt_fitch(self):
    """Check the LDT graph against the undirected Fitch graph and for a cotree.

    The test passes when the LDT graph is a subgraph of the undirected Fitch
    graph (built from the ``rs_transfer_edges``) and ``to_cotree`` returns a
    truthy value for the LDT graph.
    """
    species_tree = te.simulate_species_tree(20, model='innovation')

    # true gene tree (with losses)
    true_gene_tree = te.simulate_dated_gene_tree(
        species_tree,
        dupl_rate=1.0,
        loss_rate=0.5,
        hgt_rate=0.2,
    )

    # observable gene tree
    observable_tree = te.observable_tree(true_gene_tree)

    # extract the LDT graph and the Fitch graph
    ldt = analysis.ldt_graph(observable_tree, species_tree)
    fitch = analysis.undirected_fitch(
        observable_tree,
        analysis.rs_transfer_edges(observable_tree, species_tree),
    )
    cotree = to_cotree(ldt)

    self.assertTrue(gt.is_subgraph(ldt, fitch))
    self.assertTrue(cotree)
def simulate_gene_trees(self, N, **kwargs):
    """Simulate ``N`` gene families along ``self.S`` and derive observable trees.

    Sets ``number_of_families``, ``true_gene_trees`` and
    ``observable_gene_trees``. An existing ``sequence_dicts`` attribute is
    cleared. When ``self.outdir`` is truthy, every true gene tree is
    serialized as JSON to ``true_gene_trees/gene_tree<i>.json`` (path
    resolved through ``self._path``).

    Args:
        N: Number of gene families.
        **kwargs: Forwarded unchanged to ``te.simulate_gene_trees``.
    """
    self.number_of_families = N

    simulated = te.simulate_gene_trees(self.S, N=N, **kwargs)
    # For a single family the simulation result is wrapped in a list so that
    # the attribute can always be indexed and iterated.
    self.true_gene_trees = [simulated] if N == 1 else simulated

    observable = []
    for true_tree in self.true_gene_trees:
        observable.append(te.observable_tree(true_tree))
    self.observable_gene_trees = observable

    # sequences from earlier runs no longer match the new trees
    if hasattr(self, 'sequence_dicts'):
        self.sequence_dicts.clear()

    if not self.outdir:
        return

    for index in range(N):
        target = self._path('true_gene_trees',
                            'gene_tree{}.json'.format(index))
        self.true_gene_trees[index].serialize(target, mode='json')
# -*- coding: utf-8 -*- import tralda.tools.GraphTools as gt import asymmetree.treeevolve as te from asymmetree.analysis import (undirected_fitch, rs_transfer_edges, below_equal_above, ldt_graph, RsScenarioConstructor,) from asymmetree.tools.PhyloTreeTools import (to_newick,) S = te.simulate_species_tree(10) TGT = te.simulate_dated_gene_tree(S, dupl_rate=1.0, loss_rate=0.5, hgt_rate=0.5) OGT = te.observable_tree(TGT) print('--- S ---\n', to_newick(S)) print(to_newick(S, distance=False, label_inner=False)) print('--- OGT ---\n', to_newick(OGT)) ldt, above, equal = below_equal_above(OGT, S) fitch = undirected_fitch(OGT, rs_transfer_edges(OGT, S)) n = ldt.order() print('Genes:', n, 'Total relations:', int(n * (n-1) / 2)) print('< {}\n= {}\n> {}'.format(ldt.size(), equal.size(), above.size())) rs_scen_constr = RsScenarioConstructor(ldt) result = rs_scen_constr.run() if result:
# Simulate and serialize one species tree / gene tree pair per row of
# parameter_Df.
# NOTE(review): the range stops at len(parameter_Df.index) - 1, so the last
# row is never simulated -- confirm this is intended.
for ind in range(len(parameter_Df.index) - 1):
    # species tree of type 'PhyloTree'
    s = te.simulate_species_tree(
        int(parameter_Df.loc[ind, 'num_of_leaves']),
        model=parameter_Df.loc[ind, 'model'],
        non_binary_prob=parameter_Df.loc[ind, 'non_binary_prob'],
        planted=parameter_Df.loc[ind, 'planted'],
        remove_extinct=parameter_Df.loc[ind, 'remove_extinct'],
        rescale_to_height=parameter_Df.loc[ind, 'rescale_to_height'],
    )

    # true gene tree (contains losses) of type 'PhyloTree'
    tgt = te.simulate_dated_gene_tree(
        s,
        dupl_rate=parameter_Df.loc[ind, 'dupl_rate'],
        loss_rate=parameter_Df.loc[ind, 'loss_rate'],
        hgt_rate=parameter_Df.loc[ind, 'hgt_rate'],
        dupl_polytomy=0.0,
        prohibit_extinction=parameter_Df.loc[ind, 'prohibit_extinction'],
        replace_prob=parameter_Df.loc[ind, 'replace_prob'],
    )

    # serialization
    # The ID is converted to text BEFORE the suffix is appended; converting
    # the concatenation instead raises a TypeError for numeric IDs.
    tree_id = str(parameter_Df.loc[ind, 'ID'])
    s.serialize(wk_dir / '01_Data' / (tree_id + '_species_tree.pickle'))
    tgt.serialize(wk_dir / '01_Data' / (tree_id + '_gene_tree.pickle'))

    print('Simulating Tree :', ind)

    ogt = te.observable_tree(tgt)
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark cograph editing and triples editing on perturbed LDT graphs.

    Tree files are read from ``<filename>/<n>trees``. Files whose name starts
    with ``'species'`` are species trees, all others are gene trees. Both
    groups are sorted by the trailing ``_<ID>`` of the file name so that the
    i-th species tree is paired with the i-th gene tree independently of the
    (arbitrary) order in which ``os.walk`` lists the directory.

    In every iteration the four ``InvestigateGraph`` instances are perturbed
    with ``perturb_graph_terminate(p1, p2)`` and handed to ``cograph_editing``
    and to ``triples_editing`` (``n=100``, ``deletion=True``,
    ``insertion=True``). The values returned by the editing helpers are not
    part of the result and are discarded.

    Args:
        n: Number used to build the ``<n>trees`` directory name.
        p1: First argument of ``perturb_graph_terminate``.
        p2: Second argument of ``perturb_graph_terminate``.
        filename: Base directory that contains the ``<n>trees`` folder.

    Returns:
        list: ``[cograph_freq, triples1_freq, triples2_freq, triples3_freq]``
        as taken from ``get_freq`` of the four ``InvestigateGraph`` instances
        (second element for cograph editing, first element for the three
        triples-editing variants).
    """
    def by_tree_id(file_name):
        """Sort key: numeric trailing ``_<ID>`` of the stem; other names last."""
        tail = os.path.splitext(file_name)[0].rsplit('_', 1)[-1]
        if tail.isdecimal():
            return (0, int(tail), file_name)
        return (1, 0, file_name)

    # load species+gene trees
    name = filename + '/{}trees'.format(n)
    tree_files = [f for _, _, files in os.walk(name) for f in files]

    species_trees = sorted(
        (f for f in tree_files if f.startswith('species')), key=by_tree_id)
    gene_trees = sorted(
        (f for f in tree_files if not f.startswith('species')), key=by_tree_id)

    IG1 = None
    for i, species_file in enumerate(species_trees):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the investigators are created once, from the first
        # tree pair, and reused afterwards. Assuming an InvestigateGraph
        # instance is truthy, the LDT graphs of later pairs are computed but
        # never used -- confirm this is intended.
        if not IG1:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)
            IG4 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)
        IG4.perturb_graph_terminate(p1, p2)

        cograph_editing(IG1)                 # cograph editing
        triples_editing(IG2, n=100)          # insertion and deletion
        triples_editing(IG3, deletion=True)  # deletion only
        triples_editing(IG4, insertion=True)  # insertion only

    _, cograph_freq, _ = get_freq(IG1)
    triples1_freq, _, _ = get_freq(IG2)
    triples2_freq, _, _ = get_freq(IG3)
    triples3_freq, _, _ = get_freq(IG4)

    return [cograph_freq, triples1_freq, triples2_freq, triples3_freq]
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark three LDT-editing variants on perturbed LDT graphs.

    Tree files are read from ``<filename>/<n>trees``. Files whose name starts
    with ``'species'`` are species trees, all others are gene trees. Both
    groups are sorted by the trailing ``_<ID>`` of the file name so that the
    i-th species tree is paired with the i-th gene tree independently of the
    (arbitrary) order in which ``os.walk`` lists the directory.

    In every iteration the three ``InvestigateGraph`` instances are perturbed
    with ``perturb_graph_terminate(p1, p2)`` and handed to ``LDT_editing``
    (``n=100``, ``deletion=True``, ``insertion=True``). The values returned
    by ``LDT_editing`` are not part of the result and are discarded.

    Args:
        n: Number used to build the ``<n>trees`` directory name.
        p1: First argument of ``perturb_graph_terminate``.
        p2: Second argument of ``perturb_graph_terminate``.
        filename: Base directory that contains the ``<n>trees`` folder.

    Returns:
        list: ``[ldt1_freq, ldt2_freq, ldt3_freq]``, the third element of
        ``get_freq`` for each of the three ``InvestigateGraph`` instances.
    """
    def by_tree_id(file_name):
        """Sort key: numeric trailing ``_<ID>`` of the stem; other names last."""
        tail = os.path.splitext(file_name)[0].rsplit('_', 1)[-1]
        if tail.isdecimal():
            return (0, int(tail), file_name)
        return (1, 0, file_name)

    # load species+gene trees
    name = filename + '/{}trees'.format(n)
    tree_files = [f for _, _, files in os.walk(name) for f in files]

    species_trees = sorted(
        (f for f in tree_files if f.startswith('species')), key=by_tree_id)
    gene_trees = sorted(
        (f for f in tree_files if not f.startswith('species')), key=by_tree_id)

    IG1 = None
    for i, species_file in enumerate(species_trees):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the investigators are created once, from the first
        # tree pair, and reused afterwards. Assuming an InvestigateGraph
        # instance is truthy, the LDT graphs of later pairs are computed but
        # never used -- confirm this is intended.
        if not IG1:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)

        # ldt editing with triples editing allowing both deletions and
        # insertions for n = 100
        LDT_editing(IG1, n=100)
        # ldt editing with triples editing allowing only deletions
        LDT_editing(IG2, deletion=True)
        # ldt editing with triples editing allowing only insertions
        LDT_editing(IG3, insertion=True)

    _, _, ldt1_freq = get_freq(IG1)
    _, _, ldt2_freq = get_freq(IG2)
    _, _, ldt3_freq = get_freq(IG3)

    return [ldt1_freq, ldt2_freq, ldt3_freq]
def generate_solutions_fromTrees(n, filename):
    """Solve perturbed LDT graphs of stored tree pairs exactly and save them.

    Tree files are read from ``<filename>/<n>trees``. Files whose name starts
    with ``'species'`` are species trees, all others are gene trees. Both
    groups are sorted by the trailing ``_<ID>`` of the file name so that the
    i-th species tree is paired with the i-th gene tree independently of the
    (arbitrary) order in which ``os.walk`` lists the directory.

    For every tree pair and every combination ``(p1, p2)`` of the
    probabilities 0.15, 0.30 and 0.50 the LDT graph is perturbed with
    ``perturb_graph_terminate(p1, p2)``. A perturbed graph is solved three
    times with ``LDTEditor``: unrestricted, ``only_delete=True`` and
    ``only_add=True``. Each solution that is properly colored, a cograph and
    compatible is stored through ``_save_ILP_data`` in
    ``exact_solutions/<p1 tag>_<p2 tag>_<n>nodes<suffix>/``.

    Args:
        n: Number used to build the ``<n>trees`` directory and the names of
            the output folders.
        filename: Base directory that contains the ``<n>trees`` folder.
    """
    def by_tree_id(file_name):
        """Sort key: numeric trailing ``_<ID>`` of the stem; other names last."""
        tail = os.path.splitext(file_name)[0].rsplit('_', 1)[-1]
        if tail.isdecimal():
            return (0, int(tail), file_name)
        return (1, 0, file_name)

    def prob_tag(p):
        """Folder tag of a probability: '.' removed, one '0' appended if short."""
        tag = str(p).replace('.', '')
        if len(tag) < 3:
            tag = tag + '0'
        return tag

    # load species+gene trees
    name = filename + '/{}trees'.format(n)
    probabilities = [0.15, 0.30, 0.50]
    folderName = 'exact_solutions/{}_{}_{}nodes{}/'

    # One entry per editing restriction:
    # (folder suffix, LDTEditor keyword arguments, only_add, only_delete,
    #  message on success, message on failure)
    variants = [
        ('', {}, False, False,
         "Saving data...", "No solution found!"),
        ('_deletion', {'only_delete': True}, False, True,
         "Saving data (deletion)...", "No solution found for deletion only!"),
        ('_insertion', {'only_add': True}, True, False,
         "Saving data (insertion)...", "No solution found for insertion only!"),
    ]

    tree_files = [f for _, _, files in os.walk(name) for f in files]
    species_trees = sorted(
        (f for f in tree_files if f.startswith('species')), key=by_tree_id)
    gene_trees = sorted(
        (f for f in tree_files if not f.startswith('species')), key=by_tree_id)

    ID = 0
    for i, species_file in enumerate(species_trees):
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # perturb using p = 0.15, 0.3, 0.5
        # for each p, solve using ILP with deletion, insertion and both
        for p1 in probabilities:
            p_i = prob_tag(p1)
            for p2 in probabilities:
                p_d = prob_tag(p2)

                IG = InvestigateGraph(ldt)
                perturbed = IG.perturb_graph_terminate(p1, p2)
                if not perturbed:
                    print("failed")
                    continue

                # Solve all three variants first ...
                solvers = []
                for _, editor_kwargs, _, _, _, _ in variants:
                    solver = LDTEditor(IG._G_perturbed, **editor_kwargs)
                    solver.build_model()
                    solver.optimize(time_limit=None)
                    solvers.append(solver)

                # ... then validate and store every solution.
                for variant, solver in zip(variants, solvers):
                    suffix, _, only_add, only_delete, ok_msg, fail_msg = variant
                    sol_graph, sol_distance = solver.get_solution()

                    properly_colored = is_properly_colored(sol_graph)
                    cograph = is_cograph(sol_graph)
                    compatible = is_compatible(sol_graph)

                    if properly_colored and cograph and compatible:
                        print(ok_msg)
                        solver._save_ILP_data(
                            IG._G_perturbed,
                            sol_graph,
                            solver.get_solve_time(),
                            sol_distance,
                            i_p=p1,
                            d_p=p2,
                            only_add=only_add,
                            only_delete=only_delete,
                            saveFolder=folderName.format(p_i, p_d, n, suffix),
                            ID=ID)
                    else:
                        print(fail_msg)

                # NOTE(review): assumes the ID advances once per successfully
                # perturbed graph, so the three variants of one instance
                # share an ID -- confirm.
                ID += 1