def generate_solutions_unique_species(n, i_p=0.5, d_p=0.5):
    """Solve 100 perturbed LDT graphs with exactly ``n`` nodes via the ILP.

    Species and gene trees are simulated repeatedly; only LDT graphs with
    exactly ``n`` nodes are used.  Each accepted graph is perturbed with
    ``perturb_graph(i_p, d_p)`` and handed to ``LDTEditor``.  A solution is
    written with ``_save_ILP_data`` only if it is properly colored, a
    cograph and compatible; otherwise a message is printed.

    Args:
        n: required number of nodes of the simulated LDT graph.
        i_p: first argument of ``perturb_graph`` (presumably the edge
            insertion probability -- confirm against ``InvestigateGraph``).
        d_p: second argument of ``perturb_graph`` (presumably the edge
            deletion probability -- confirm against ``InvestigateGraph``).
    """
    # NOTE(review): the counter is assumed to advance once per graph that
    # reached the solver, whether or not its solution was saved -- confirm
    # against the original indentation.
    handled = 0
    while handled < 100:
        species_tree = te.simulate_species_tree(10, model='innovation')
        dated_gene_tree = te.simulate_dated_gene_tree(
            species_tree,
            dupl_rate=0.5,
            loss_rate=0.5,
            hgt_rate=0.5,
            prohibit_extinction="per_family",
            replace_prob=0.0)
        observable = te.observable_tree(dated_gene_tree)
        graph = ldt_graph(observable, species_tree)

        # Graphs of the wrong size are discarded and a new pair is simulated.
        if len(graph.nodes()) != n:
            continue

        investigator = InvestigateGraph(graph)
        investigator.perturb_graph(i_p, d_p)

        editor = LDTEditor(investigator._G_perturbed)
        editor.build_model()
        editor.optimize(time_limit=None)
        sol_graph, sol_distance = editor.get_solution()

        checks = (
            is_properly_colored(sol_graph),
            is_cograph(sol_graph),
            is_compatible(sol_graph),
        )
        edit_dist = gt.symmetric_diff(investigator._G_perturbed, sol_graph)

        print("Runtime: {}".format(editor.get_solve_time()))

        if all(checks):
            print("Saving data...")
            editor._save_ILP_data(
                investigator._G_perturbed,
                sol_graph,
                editor.get_solve_time(),
                edit_dist,
                only_add=False,
                only_delete=False,
                filename="{}nodes/LDTEdit_exact_solution".format(n))
        else:
            print("No solution found!")

        handled += 1
def generate_trees(n=100, m=10, model='innovation', dupl_rate=0.5,
                   loss_rate=0.5, hgt_rate=0.5,
                   prohibit_extinction="per_family", replace_prob=0.0,
                   size=10):
    """Simulate and serialize ``n`` tree pairs whose LDT graph has ``size`` nodes.

    Trees are written to ``exact_solutions/trees/<size>trees``.  The folder
    is created when missing (IDs then start at 0); otherwise numbering
    continues from ``find_next_ID``.  Simulated pairs whose LDT graph does not
    have exactly ``size`` nodes are discarded.

    Args:
        n: number of tree pairs to save.
        m: first argument of ``te.simulate_species_tree``.
        model: ``model`` keyword of ``te.simulate_species_tree``.
        dupl_rate, loss_rate, hgt_rate, prohibit_extinction, replace_prob:
            forwarded unchanged to ``te.simulate_dated_gene_tree``.
        size: required number of nodes of the LDT graph.
    """
    out_dir = 'exact_solutions/trees/{}trees'.format(size)

    # create folder if it doesn't exist, otherwise continue the numbering
    if os.path.exists(out_dir):
        ID = find_next_ID(out_dir + '/')
    else:
        os.makedirs(out_dir)
        ID = 0

    # NOTE(review): both counters are assumed to advance only when a tree
    # pair was actually saved -- confirm against the original indentation.
    saved = 0
    while saved < n:
        species_tree = te.simulate_species_tree(m, model=model)
        dated_gene_tree = te.simulate_dated_gene_tree(
            species_tree,
            dupl_rate=dupl_rate,
            loss_rate=loss_rate,
            hgt_rate=hgt_rate,
            prohibit_extinction=prohibit_extinction,
            replace_prob=replace_prob)
        observable = te.observable_tree(dated_gene_tree)
        graph = ldt_graph(observable, species_tree)

        if len(graph.nodes()) != size:
            continue

        # save trees
        species_path = '{}/species_{}_{}_{}.json'.format(
            out_dir, m, model, ID)
        gene_path = '{}/gene_{}_{}_{}_{}_{}_{}.json'.format(
            out_dir, dupl_rate, loss_rate, hgt_rate, prohibit_extinction,
            replace_prob, ID)
        species_tree.serialize(species_path)
        dated_gene_tree.serialize(gene_path)

        ID += 1
        saved += 1
import asymmetree.treeevolve as te
from asymmetree.datastructures import PhyloTree
from asymmetree.hgt import ldt_graph
from tools.LDT_ILP import LDTEditor
import asymmetree.tools.GraphTools as gt
import os

# Module-level demo, executed on import: simulate one species/gene tree pair,
# build its LDT graph and print a few size statistics.
S = te.simulate_species_tree(20, model='innovation')
TGT = te.simulate_dated_gene_tree(S,
                                  dupl_rate=0.5,
                                  loss_rate=0.5,
                                  hgt_rate=0.5,
                                  prohibit_extinction="per_family",
                                  replace_prob=0.0)
OGT = te.observable_tree(TGT)
ldt = ldt_graph(OGT, S)
colors = gt.sort_by_colors(ldt)

#print("edges of G: \n{}".format(G._G.edges()))
#a, b, c = get_P3_data(G._G)
#print("\nThe regions of P3s: {}".format(a))
#print("\nThe amounts in the regions: {}".format(b))
#print("\nThe distance between regions: {}\n".format(c))

print("Amount of nodes: {}".format(len(ldt.nodes())))
print("Amount of colors: {}".format(len(colors)))
print("Amount of edges: {}".format(len(ldt.edges())))


def run_investigation():
    """Annotate the simulation-parameter table with per-tree graph statistics.

    Reads ``01_Data/01_Simulation_Parameters.csv`` below ``wk_dir``.  For
    every entry of its ``ID`` column the pickled species tree
    (``<ID>_species_tree.pickle``) and gene tree (``<ID>_gene_tree.pickle``)
    are loaded, the LDT graph and the true undirected Fitch graph are built,
    and the columns ``LDT_Edges``, ``Fitch_true_Edges``,
    ``Fraction_of_Xenologs`` and ``Number_of_Species`` are filled in.

    The dataframe is only modified in memory; the visible code neither
    returns it nor writes it back to disk.
    """
    # NOTE(review): `pd`, `Path`, `wk_dir` and `hgt` are not bound by the
    # imports above -- confirm they are provided elsewhere in the file.

    # %% load parameter csv
    data_dir = Path(wk_dir / '01_Data')
    parameter_Df = pd.read_csv(data_dir / '01_Simulation_Parameters.csv')

    # %% Iterate over all data
    # NOTE(review): rows are addressed with the enumerate position via `.loc`,
    # which assumes the dataframe has its default 0..n-1 index -- confirm.
    for index, item in enumerate(parameter_Df.ID):
        # Report the tree actually being processed (the former counter was
        # never incremented, so the message always showed 0).
        print('Working on Tree # ', index)

        path_s = str(item) + '_species_tree.pickle'
        path_tgt = str(item) + '_gene_tree.pickle'

        # load data
        s = PhyloTree.load(data_dir / path_s)
        tgt = PhyloTree.load(data_dir / path_tgt)

        # create graphs
        ogt = te.observable_tree(tgt)
        ldt = hgt.ldt_graph(ogt, s)

        # calculate some interesting parameters and store them in the dataframe
        transfer_edges_true = hgt.true_transfer_edges(ogt)
        fitch_true = hgt.undirected_fitch(ogt, transfer_edges_true)

        ldt_edge_count = len(ldt.edges())
        fitch_edge_count = fitch_true.number_of_edges()

        parameter_Df.loc[index, 'LDT_Edges'] = ldt_edge_count
        parameter_Df.loc[index, 'Fitch_true_Edges'] = fitch_edge_count

        # Ratio of LDT edges to true Fitch edges; defined as 0 when the Fitch
        # graph has no edges, to avoid a division by zero.
        if fitch_edge_count != 0:
            fraction = float(ldt_edge_count) / float(fitch_edge_count)
        else:
            fraction = 0.0
        parameter_Df.loc[index, 'Fraction_of_Xenologs'] = fraction

        parameter_Df.loc[index, 'Number_of_Species'] = s.number_of_species
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark cograph editing and three triples-editing variants.

    Tree files are read from ``<filename>/<n>trees``.  Files whose name starts
    with ``species`` are species trees, every other file is treated as a gene
    tree.  Both lists are ordered by the numeric ID at the end of the file
    name so that the i-th species tree is paired with the i-th gene tree of
    the same ID, independent of the directory listing order.

    Args:
        n: node count used to select the ``<n>trees`` folder.
        p1: first argument of ``perturb_graph_terminate``.
        p2: second argument of ``perturb_graph_terminate``.
        filename: directory that contains the ``<n>trees`` folders.

    Returns:
        list: ``[cograph_freq, triples1_freq, triples2_freq, triples3_freq]``,
        taken from ``get_freq`` of the four ``InvestigateGraph`` objects
        (second element for cograph editing, first element for the triples
        variants).

    Raises:
        IndexError: if there are fewer gene-tree files than species-tree files.
    """

    def _tree_id_key(file_name):
        # generate_trees writes names that end in "_<ID>.json"; sort on that
        # ID numerically.  Names without a numeric suffix are placed last so
        # stray files cannot shift the species/gene pairing.
        suffix = os.path.splitext(file_name)[0].rsplit('_', 1)[-1]
        if suffix.isdigit():
            return (0, int(suffix), file_name)
        return (1, 0, file_name)

    # load species+gene trees
    name = filename + '/{}trees'.format(n)
    tree_files = []
    #probabilities = [0.15, 0.30, 0.50]
    #probs = [(0.15, 0.15), (0.3, 0.3), (0.5, 0.5), (0.15, 0.5), (0.5, 0.15)]
    #nodes = [10, 14, 18]
    for _, _, files in os.walk(name):
        tree_files.extend(files)

    # os.walk yields files in arbitrary order, so sort before pairing by index.
    species_trees = sorted(
        (f for f in tree_files if f.startswith('species')),
        key=_tree_id_key)
    gene_trees = sorted(
        (f for f in tree_files if not f.startswith('species')),
        key=_tree_id_key)

    # cograph editing
    c1_graphs, c1_edge_count, c1_is_ldt, c1_edit_dist = (
        [] for _ in range(4))
    # triples editing with both insertion/deletion
    t1_graphs, t1_edge_count, t1_is_ldt, t1_edit_dist = (
        [] for _ in range(4))
    # triples editing with deletion only
    t2_graphs, t2_edge_count, t2_is_ldt, t2_edit_dist = (
        [] for _ in range(4))
    # triples editing with insertion only
    t3_graphs, t3_edge_count, t3_is_ldt, t3_edit_dist = (
        [] for _ in range(4))
    # ldt editing
    t4_graphs, t4_edge_count, t4_is_ldt, t4_edit_dist = (
        [] for _ in range(4))

    IG1 = None
    for i, species_file in enumerate(species_trees):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the investigators are assumed to be created once, from
        # the first tree pair, and reused afterwards; `ldt` of later pairs is
        # then not used by the visible code -- confirm this is intended.
        if not IG1:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)
            IG4 = copy.deepcopy(IG1)
            #IG5 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)
        IG4.perturb_graph_terminate(p1, p2)
        #IG5.perturb_graph_terminate(p1, p2)

        (cograph_edited_G, is_c1_ldt, c1_num_edges,
         c1_ldt_edit_dist) = cograph_editing(IG1)
        (triples1_edited_G, is_t1_ldt, t1_num_edges,
         t1_ldt_edit_dist) = triples_editing(IG2, n=100)
        (triples2_edited_G, is_t2_ldt, t2_num_edges,
         t2_ldt_edit_dist) = triples_editing(IG3, deletion=True)
        (triples3_edited_G, is_t3_ldt, t3_num_edges,
         t3_ldt_edit_dist) = triples_editing(IG4, insertion=True)
        #ldt_edited_G, is_t4_ldt, t4_num_edges, t4_ldt_edit_dist = LDT_editing(IG5, deletion = True)

        c1_graphs.append(cograph_edited_G)
        t1_graphs.append(triples1_edited_G)
        t2_graphs.append(triples2_edited_G)
        t3_graphs.append(triples3_edited_G)
        #t4_graphs.append(ldt_edited_G)

        c1_is_ldt.append(is_c1_ldt)
        t1_is_ldt.append(is_t1_ldt)
        t2_is_ldt.append(is_t2_ldt)
        t3_is_ldt.append(is_t3_ldt)
        #t4_is_ldt.append(is_t4_ldt)

        c1_edge_count.append(c1_num_edges)
        t1_edge_count.append(t1_num_edges)
        t2_edge_count.append(t2_num_edges)
        t3_edge_count.append(t3_num_edges)
        #t4_edge_count.append(t4_num_edges)

        c1_edit_dist.append(c1_ldt_edit_dist)
        t1_edit_dist.append(t1_ldt_edit_dist)
        t2_edit_dist.append(t2_ldt_edit_dist)
        t3_edit_dist.append(t3_ldt_edit_dist)
        #t4_edit_dist.append(t4_ldt_edit_dist)

    _, cograph_freq, _ = get_freq(IG1)
    triples1_freq, _, _ = get_freq(IG2)
    triples2_freq, _, _ = get_freq(IG3)
    triples3_freq, _, _ = get_freq(IG4)
    #_, _, ldt_freq = get_freq(IG5)

    frequencies = [cograph_freq, triples1_freq, triples2_freq, triples3_freq]
    return frequencies
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark three LDT-editing variants on stored tree pairs.

    Tree files are read from ``<filename>/<n>trees``.  Files whose name starts
    with ``species`` are species trees, every other file is treated as a gene
    tree.  Both lists are ordered by the numeric ID at the end of the file
    name so that the i-th species tree is paired with the i-th gene tree of
    the same ID, independent of the directory listing order.

    NOTE(review): this file holds an earlier function with the same name;
    the later definition replaces the earlier one at import time.

    Args:
        n: node count used to select the ``<n>trees`` folder.
        p1: first argument of ``perturb_graph_terminate``.
        p2: second argument of ``perturb_graph_terminate``.
        filename: directory that contains the ``<n>trees`` folders.

    Returns:
        list: ``[ldt1_freq, ldt2_freq, ldt3_freq]``, the third element of
        ``get_freq`` for each of the three ``InvestigateGraph`` objects.

    Raises:
        IndexError: if there are fewer gene-tree files than species-tree files.
    """

    def _tree_id_key(file_name):
        # generate_trees writes names that end in "_<ID>.json"; sort on that
        # ID numerically.  Names without a numeric suffix are placed last so
        # stray files cannot shift the species/gene pairing.
        suffix = os.path.splitext(file_name)[0].rsplit('_', 1)[-1]
        if suffix.isdigit():
            return (0, int(suffix), file_name)
        return (1, 0, file_name)

    # load species+gene trees
    name = filename + '/{}trees'.format(n)
    tree_files = []
    #probabilities = [0.15, 0.30, 0.50]
    #probs = [(0.15, 0.15), (0.3, 0.3), (0.5, 0.5), (0.15, 0.5), (0.5, 0.15)]
    #nodes = [10, 14, 18]
    for _, _, files in os.walk(name):
        tree_files.extend(files)

    # os.walk yields files in arbitrary order, so sort before pairing by index.
    species_trees = sorted(
        (f for f in tree_files if f.startswith('species')),
        key=_tree_id_key)
    gene_trees = sorted(
        (f for f in tree_files if not f.startswith('species')),
        key=_tree_id_key)

    # ldt editing
    t1_graphs, t1_edge_count, t1_is_ldt, t1_edit_dist = (
        [] for _ in range(4))
    # ldt editing (triples edit deletion)
    t2_graphs, t2_edge_count, t2_is_ldt, t2_edit_dist = (
        [] for _ in range(4))
    # ldt editing (triples edit insertion)
    t3_graphs, t3_edge_count, t3_is_ldt, t3_edit_dist = (
        [] for _ in range(4))

    IG1 = None
    for i, species_file in enumerate(species_trees):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the investigators are assumed to be created once, from
        # the first tree pair, and reused afterwards; `ldt` of later pairs is
        # then not used by the visible code -- confirm this is intended.
        if not IG1:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)

        # ldt editing with triples editing allowing both deletions and
        # insertions for n = 100.
        t1_edited_G, is_t1_ldt, _, t1_ldt_edit_dist = LDT_editing(
            IG1, n=100)
        # ldt editing with triples editing allowing only deletions.
        t2_edited_G, is_t2_ldt, _, t2_ldt_edit_dist = LDT_editing(
            IG2, deletion=True)
        # ldt editing with triples editing allowing only insertions.
        t3_edited_G, is_t3_ldt, _, t3_ldt_edit_dist = LDT_editing(
            IG3, insertion=True)

        t1_graphs.append(t1_edited_G)
        t2_graphs.append(t2_edited_G)
        t3_graphs.append(t3_edited_G)

        t1_is_ldt.append(is_t1_ldt)
        t2_is_ldt.append(is_t2_ldt)
        t3_is_ldt.append(is_t3_ldt)

        t1_edit_dist.append(t1_ldt_edit_dist)
        t2_edit_dist.append(t2_ldt_edit_dist)
        t3_edit_dist.append(t3_ldt_edit_dist)

    _, _, ldt1_freq = get_freq(IG1)
    _, _, ldt2_freq = get_freq(IG2)
    _, _, ldt3_freq = get_freq(IG3)

    frequencies = [ldt1_freq, ldt2_freq, ldt3_freq]
    return frequencies
def generate_solutions_fromTrees(n, filename):
    """Solve perturbed LDT graphs of stored tree pairs exactly and save them.

    Tree files are read from ``<filename>/<n>trees``.  Files whose name starts
    with ``species`` are species trees, every other file is treated as a gene
    tree.  Both lists are ordered by the numeric ID at the end of the file
    name so that the i-th species tree is paired with the i-th gene tree of
    the same ID, independent of the directory listing order.

    For every tree pair and every combination ``(p1, p2)`` drawn from
    ``0.15, 0.30, 0.50`` the LDT graph is perturbed with
    ``perturb_graph_terminate(p1, p2)`` and solved three times with
    ``LDTEditor``: unrestricted, ``only_delete=True`` and ``only_add=True``.
    A solution is stored via ``_save_ILP_data`` only if it is properly
    colored, a cograph and compatible.  Results go to
    ``exact_solutions/<p1>_<p2>_<n>nodes<suffix>/`` where the probabilities
    are written without the decimal point (e.g. ``015``) and ``<suffix>`` is
    empty, ``_deletion`` or ``_insertion``.

    Args:
        n: node count used to select the ``<n>trees`` folder and to name the
            output folders.
        filename: directory that contains the ``<n>trees`` folders.

    Raises:
        IndexError: if there are fewer gene-tree files than species-tree files.
    """

    def _tree_id_key(file_name):
        # generate_trees writes names that end in "_<ID>.json"; sort on that
        # ID numerically.  Names without a numeric suffix are placed last so
        # stray files cannot shift the species/gene pairing.
        suffix = os.path.splitext(file_name)[0].rsplit('_', 1)[-1]
        if suffix.isdigit():
            return (0, int(suffix), file_name)
        return (1, 0, file_name)

    def _prob_tag(p):
        # 0.15 -> "015", 0.3 -> "030", 0.5 -> "050"
        tag = str(p).replace('.', '')
        if len(tag) < 3:
            tag = tag + '0'
        return tag

    # load species+gene trees
    name = filename + '/{}trees'.format(n)
    tree_files = []
    probabilities = [0.15, 0.30, 0.50]
    #nodes = [10, 14, 18]
    for _, _, files in os.walk(name):
        tree_files.extend(files)

    # os.walk yields files in arbitrary order, so sort before pairing by index.
    species_trees = sorted(
        (f for f in tree_files if f.startswith('species')),
        key=_tree_id_key)
    gene_trees = sorted(
        (f for f in tree_files if not f.startswith('species')),
        key=_tree_id_key)

    folderName = 'exact_solutions/{}_{}_{}nodes{}/'

    # One entry per ILP variant:
    # (folder suffix, extra LDTEditor kwargs, only_add, only_delete,
    #  message on success, message on failure)
    variants = [
        ('', {}, False, False,
         "Saving data...",
         "No solution found!"),
        ('_deletion', {'only_delete': True}, False, True,
         "Saving data (deletion)...",
         "No solution found for deletion only!"),
        ('_insertion', {'only_add': True}, True, False,
         "Saving data (insertion)...",
         "No solution found for insertion only!"),
    ]

    ID = 0
    for i, species_file in enumerate(species_trees):
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # perturb using p = 0.15, 0.3, 0.5
        # for each p, solve using ILP with deletion, insertion and both
        for p1 in probabilities:
            for p2 in probabilities:
                p_i = _prob_tag(p1)
                p_d = _prob_tag(p2)

                IG = InvestigateGraph(ldt)
                perturbed = IG.perturb_graph_terminate(p1, p2)
                if not perturbed:
                    print("failed")
                    continue

                # Solve all three variants first, then validate and save, so
                # the save/failure messages appear after all solver runs.
                solvers = []
                for _, editor_kwargs, _, _, _, _ in variants:
                    solver = LDTEditor(IG._G_perturbed, **editor_kwargs)
                    solver.build_model()
                    solver.optimize(time_limit=None)
                    solvers.append(solver)

                solutions = [solver.get_solution() for solver in solvers]

                validity = [
                    (is_properly_colored(sol_graph),
                     is_cograph(sol_graph),
                     is_compatible(sol_graph))
                    for sol_graph, _ in solutions
                ]

                for variant, solver, solution, checks in zip(
                        variants, solvers, solutions, validity):
                    suffix, _, only_add, only_delete, ok_msg, fail_msg = variant
                    sol_graph, sol_distance = solution
                    if all(checks):
                        print(ok_msg)
                        solver._save_ILP_data(
                            IG._G_perturbed,
                            sol_graph,
                            solver.get_solve_time(),
                            sol_distance,
                            i_p=p1,
                            d_p=p2,
                            only_add=only_add,
                            only_delete=only_delete,
                            saveFolder=folderName.format(p_i, p_d, n, suffix),
                            ID=ID)
                    else:
                        print(fail_msg)

                # NOTE(review): the ID is assumed to advance once per
                # successfully perturbed graph -- confirm against the
                # original indentation.
                ID += 1