Example #1
0
def generate_solutions_unique_species(n, i_p=0.5, d_p=0.5):
    """Solve 100 perturbed LDT graphs with exactly ``n`` nodes via the ILP.

    Scenarios are simulated over and over (species tree from
    ``te.simulate_species_tree(10, model='innovation')`` plus a dated gene
    tree); a scenario is discarded unless its LDT graph has exactly ``n``
    nodes. Each accepted graph is perturbed, edited with ``LDTEditor``
    (no time limit) and the result is saved only when it is properly
    colored, a cograph and compatible. An accepted graph counts towards the
    100 runs whether or not its solution was saved.

    Args:
        n: required number of nodes of the simulated LDT graph; also used
            in the output file name ``"{n}nodes/LDTEdit_exact_solution"``.
        i_p: first argument forwarded to ``perturb_graph`` (presumably the
            edge insertion probability -- confirm against InvestigateGraph).
        d_p: second argument forwarded to ``perturb_graph`` (presumably the
            edge deletion probability -- confirm against InvestigateGraph).
    """
    runs_wanted = 100
    runs_done = 0
    while runs_done < runs_wanted:
        S = te.simulate_species_tree(10, model='innovation')
        TGT = te.simulate_dated_gene_tree(S,
                                          dupl_rate=0.5,
                                          loss_rate=0.5,
                                          hgt_rate=0.5,
                                          prohibit_extinction="per_family",
                                          replace_prob=0.0)
        graph = ldt_graph(te.observable_tree(TGT), S)

        # Only graphs of the requested size are of interest; try again.
        if len(graph.nodes()) != n:
            continue

        IG = InvestigateGraph(graph)
        IG.perturb_graph(i_p, d_p)

        solver = LDTEditor(IG._G_perturbed)
        solver.build_model()
        solver.optimize(time_limit=None)

        sol_graph, _ = solver.get_solution()

        # All three validity checks are evaluated up front, before any
        # output is produced.
        checks = (
            is_properly_colored(sol_graph),
            is_cograph(sol_graph),
            is_compatible(sol_graph),
        )

        edit_dist = gt.symmetric_diff(IG._G_perturbed, sol_graph)
        print("Runtime: {}".format(solver.get_solve_time()))

        if all(checks):
            print("Saving data...")
            solver._save_ILP_data(
                IG._G_perturbed,
                sol_graph,
                solver.get_solve_time(),
                edit_dist,
                only_add=False,
                only_delete=False,
                filename="{}nodes/LDTEdit_exact_solution".format(n))
        else:
            print("No solution found!")

        runs_done += 1
Example #2
0
def generate_trees(n=100,
                   m=10,
                   model='innovation',
                   dupl_rate=0.5,
                   loss_rate=0.5,
                   hgt_rate=0.5,
                   prohibit_extinction="per_family",
                   replace_prob=0.0,
                   size=10):
    """Simulate and store ``n`` tree pairs whose LDT graph has ``size`` nodes.

    Species/gene tree pairs are simulated until ``n`` of them yield an LDT
    graph with exactly ``size`` nodes. Each such pair is serialized into
    ``exact_solutions/trees/<size>trees/`` as ``species_<m>_<model>_<ID>.json``
    and ``gene_<rates...>_<ID>.json``. IDs start at 0 for a freshly created
    folder and otherwise continue from ``find_next_ID``.

    Args:
        n: number of tree pairs to store.
        m: first argument of ``te.simulate_species_tree``.
        model: species tree model name.
        dupl_rate, loss_rate, hgt_rate, prohibit_extinction, replace_prob:
            forwarded unchanged to ``te.simulate_dated_gene_tree``.
        size: required node count of the LDT graph.
    """
    out_dir = 'exact_solutions/trees/{}trees'.format(size)

    # A new folder starts numbering at 0; an existing one continues after
    # the files it already holds.
    if os.path.exists(out_dir):
        next_id = find_next_ID('exact_solutions/trees/{}trees/'.format(size))
    else:
        os.makedirs(out_dir)
        next_id = 0

    stored = 0
    while stored < n:
        S = te.simulate_species_tree(m, model=model)
        TGT = te.simulate_dated_gene_tree(
            S,
            dupl_rate=dupl_rate,
            loss_rate=loss_rate,
            hgt_rate=hgt_rate,
            prohibit_extinction=prohibit_extinction,
            replace_prob=replace_prob)

        graph = ldt_graph(te.observable_tree(TGT), S)
        if len(graph.nodes()) != size:
            # Wrong graph size: discard this scenario and simulate again.
            continue

        species_path = 'exact_solutions/trees/{}trees/species_{}_{}_{}.json'.format(
            size, m, model, next_id)
        gene_path = 'exact_solutions/trees/{}trees/gene_{}_{}_{}_{}_{}_{}.json'.format(
            size, dupl_rate, loss_rate, hgt_rate, prohibit_extinction,
            replace_prob, next_id)
        S.serialize(species_path)
        TGT.serialize(gene_path)

        next_id += 1
        stored += 1
Example #3
0
import asymmetree.treeevolve as te
from asymmetree.datastructures import PhyloTree
from asymmetree.hgt import ldt_graph
from tools.LDT_ILP import LDTEditor
import asymmetree.tools.GraphTools as gt
import os

# Simulate one evolutionary scenario at import time: a species tree
# (first argument 20, 'innovation' model) and a dated gene tree on it with
# duplication, loss and HGT rates of 0.5 each.
S = te.simulate_species_tree(20, model='innovation')
TGT = te.simulate_dated_gene_tree(S,
                                  dupl_rate=0.5,
                                  loss_rate=0.5,
                                  hgt_rate=0.5,
                                  prohibit_extinction="per_family",
                                  replace_prob=0.0)
# Reduce the gene tree to its observable part and derive the LDT graph
# from it together with the species tree.
OGT = te.observable_tree(TGT)
ldt = ldt_graph(OGT, S)

# Grouping of the LDT graph by color; only its length is reported below.
colors = gt.sort_by_colors(ldt)

# Disabled P3 diagnostics (reference names `G` and `get_P3_data` that are
# not defined in the visible part of this file).
#print("edges of G: \n{}".format(G._G.edges()))
#a, b, c = get_P3_data(G._G)
#print("\nThe regions of P3s: {}".format(a))
#print("\nThe amounts in the regions: {}".format(b))
#print("\nThe distance between regions: {}\n".format(c))

# Summary of the simulated LDT graph.
print("Amount of nodes: {}".format(len(ldt.nodes())))
print("Amount of colors: {}".format(len(colors)))
print("Amount of edges: {}".format(len(ldt.edges())))


def run_investigation():
Example #4
0
# %% load parameter csv
# One row per simulated scenario; the `ID` column is the file-name prefix of
# the pickled species/gene trees stored next to the CSV.
parameter_Df = pd.read_csv(
    Path(wk_dir / '01_Data') / '01_Simulation_Parameters.csv')

# %% Iterate over all data
# `ind` is the progress counter shown in the log line below.
ind = 0
# NOTE: `index` is the enumerate position and is used as a row label in
# `.loc`; this matches the rows only while the frame keeps the default
# 0..n-1 index that `read_csv` creates when no index column is given.
for index, item in enumerate(parameter_Df.ID):
    print('Working on Tree # ', ind)
    path_s = str(item) + '_species_tree.pickle'
    path_tgt = str(item) + '_gene_tree.pickle'
    # load data
    s = PhyloTree.load(Path(wk_dir / '01_Data' / path_s))
    tgt = PhyloTree.load(Path(wk_dir / '01_Data' / path_tgt))
    # create graphs
    ogt = te.observable_tree(tgt)
    ldt = hgt.ldt_graph(ogt, s)

    # calculate some interesting parameters and store them in the dataframe
    transfer_edges_true = hgt.true_transfer_edges(ogt)
    fitch_true = hgt.undirected_fitch(ogt, transfer_edges_true)

    parameter_Df.loc[index, ('LDT_Edges')] = len(ldt.edges())
    parameter_Df.loc[index,
                     ('Fitch_true_Edges')] = fitch_true.number_of_edges()

    a = np.array(len(ldt.edges()), dtype=np.float64)
    b = np.array(fitch_true.number_of_edges(), dtype=np.float64)

    # Ratio LDT edges / true Fitch edges; `where` + zero-filled `out` make
    # the result 0.0 instead of dividing when there are no Fitch edges.
    parameter_Df.loc[index, ('Fraction_of_Xenologs')] = np.divide(
        a, b, out=np.zeros_like(a), where=b != 0)
    parameter_Df.loc[index, ('Number_of_Species')] = s.number_of_species

    # Advance the progress counter; it was previously never incremented, so
    # every iteration reported tree 0.
    ind += 1
Example #5
0
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark cograph editing and three triples-editing variants.

    Loads every species/gene tree pair found below ``<filename>/<n>trees``,
    perturbs four ``InvestigateGraph`` objects once per pair and runs
    cograph editing, triples editing (``n=100``), deletion-only triples
    editing and insertion-only triples editing on them. The frequencies are
    read from the ``InvestigateGraph`` objects with ``get_freq`` afterwards.

    File names are sorted before pairing, so ``species_*`` file ``i`` and
    ``gene_*`` file ``i`` belong together as long as both kinds share a
    common name prefix and end in the same ID (the naming that
    ``generate_trees`` produces). Files that start with neither ``species``
    nor ``gene`` are ignored.

    Args:
        n: size label used in the ``<n>trees`` sub-directory name.
        p1: first argument forwarded to ``perturb_graph_terminate``
            (presumably the insertion probability -- confirm).
        p2: second argument forwarded to ``perturb_graph_terminate``
            (presumably the deletion probability -- confirm).
        filename: directory that contains the ``<n>trees`` folder.

    Returns:
        ``[cograph_freq, triples1_freq, triples2_freq, triples3_freq]``.

    Raises:
        ValueError: if no species trees are found, or if the numbers of
            species and gene tree files differ.
    """
    # load species+gene trees
    name = filename + '/{}trees'.format(n)

    # os.walk yields entries in arbitrary order; sort so that the index-wise
    # species/gene pairing below is reproducible.
    tree_files = sorted(
        file for _, _, files in os.walk(name) for file in files)
    species_trees = [f for f in tree_files if f.startswith('species')]
    gene_trees = [f for f in tree_files if f.startswith('gene')]

    if not species_trees:
        raise ValueError("no species tree files found in '{}'".format(name))
    if len(species_trees) != len(gene_trees):
        raise ValueError(
            "found {} species tree file(s) but {} gene tree file(s) in "
            "'{}'".format(len(species_trees), len(gene_trees), name))

    IG1 = IG2 = IG3 = IG4 = None
    for i, (species_file, gene_file) in enumerate(
            zip(species_trees, gene_trees)):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_file))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the InvestigateGraph objects are built from the
        # first LDT graph only; `ldt` of later pairs is computed but never
        # handed to them. Behavior kept as-is -- confirm this is intended.
        if IG1 is None:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)
            IG4 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)
        IG4.perturb_graph_terminate(p1, p2)

        # The per-pair return values (edited graph, LDT flag, edge count,
        # edit distance) are not needed: only the frequencies obtained from
        # the InvestigateGraph objects below are returned.
        cograph_editing(IG1)  # cograph editing
        triples_editing(IG2, n=100)  # both insertion/deletion
        triples_editing(IG3, deletion=True)  # deletion only
        triples_editing(IG4, insertion=True)  # insertion only

    _, cograph_freq, _ = get_freq(IG1)
    triples1_freq, _, _ = get_freq(IG2)
    triples2_freq, _, _ = get_freq(IG3)
    triples3_freq, _, _ = get_freq(IG4)

    return [cograph_freq, triples1_freq, triples2_freq, triples3_freq]
Example #6
0
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark three LDT-editing variants on stored tree pairs.

    Loads every species/gene tree pair found below ``<filename>/<n>trees``,
    perturbs three ``InvestigateGraph`` objects once per pair and runs
    ``LDT_editing`` on them with ``n=100``, ``deletion=True`` and
    ``insertion=True`` respectively. The frequencies are read from the
    ``InvestigateGraph`` objects with ``get_freq`` afterwards.

    File names are sorted before pairing, so ``species_*`` file ``i`` and
    ``gene_*`` file ``i`` belong together as long as both kinds share a
    common name prefix and end in the same ID. Files that start with
    neither ``species`` nor ``gene`` are ignored.

    Args:
        n: size label used in the ``<n>trees`` sub-directory name.
        p1: first argument forwarded to ``perturb_graph_terminate``
            (presumably the insertion probability -- confirm).
        p2: second argument forwarded to ``perturb_graph_terminate``
            (presumably the deletion probability -- confirm).
        filename: directory that contains the ``<n>trees`` folder.

    Returns:
        ``[ldt1_freq, ldt2_freq, ldt3_freq]``, the third component of
        ``get_freq`` for each of the three variants.

    Raises:
        ValueError: if no species trees are found, or if the numbers of
            species and gene tree files differ.
    """
    # load species+gene trees
    name = filename + '/{}trees'.format(n)

    # os.walk yields entries in arbitrary order; sort so that the index-wise
    # species/gene pairing below is reproducible.
    tree_files = sorted(
        file for _, _, files in os.walk(name) for file in files)
    species_trees = [f for f in tree_files if f.startswith('species')]
    gene_trees = [f for f in tree_files if f.startswith('gene')]

    if not species_trees:
        raise ValueError("no species tree files found in '{}'".format(name))
    if len(species_trees) != len(gene_trees):
        raise ValueError(
            "found {} species tree file(s) but {} gene tree file(s) in "
            "'{}'".format(len(species_trees), len(gene_trees), name))

    IG1 = IG2 = IG3 = None
    for i, (species_file, gene_file) in enumerate(
            zip(species_trees, gene_trees)):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_file))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the InvestigateGraph objects are built from the
        # first LDT graph only; `ldt` of later pairs is computed but never
        # handed to them. Behavior kept as-is -- confirm this is intended.
        if IG1 is None:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)

        # The per-pair return values are not needed: only the frequencies
        # obtained from the InvestigateGraph objects below are returned.
        # ldt editing with triples editing allowing both deletions and
        # insertions for n = 100.
        LDT_editing(IG1, n=100)
        # ldt editing with triples editing allowing only deletions.
        LDT_editing(IG2, deletion=True)
        # ldt editing with triples editing allowing only insertions.
        LDT_editing(IG3, insertion=True)

    _, _, ldt1_freq = get_freq(IG1)
    _, _, ldt2_freq = get_freq(IG2)
    _, _, ldt3_freq = get_freq(IG3)

    return [ldt1_freq, ldt2_freq, ldt3_freq]
Example #7
0
def _probability_label(p):
	"""Return ``p`` as text without its decimal point, e.g. 0.15 -> '015'.

	A single '0' is appended when the result has fewer than three
	characters, so 0.3 becomes '030' and 0.5 becomes '050'.
	"""
	label = str(p).replace('.', '')
	if len(label) < 3:
		label = label + '0'
	return label


def generate_solutions_fromTrees(n, filename):
	"""Solve perturbed LDT graphs of stored tree pairs exactly with the ILP.

	For every species/gene tree pair below ``<filename>/<n>trees`` the LDT
	graph is built and, for each combination of ``p1`` and ``p2`` taken from
	(0.15, 0.30, 0.50), perturbed with ``perturb_graph_terminate(p1, p2)``.
	A successfully perturbed graph is solved three times with ``LDTEditor``
	(unrestricted, ``only_delete=True``, ``only_add=True``; no time limit).
	A solution is saved to ``exact_solutions/<p1>_<p2>_<n>nodes<suffix>/``
	only when it is properly colored, a cograph and compatible.

	File names are sorted before pairing, so ``species_*`` file ``i`` and
	``gene_*`` file ``i`` belong together as long as both kinds share a
	common name prefix and end in the same ID. Files that start with
	neither ``species`` nor ``gene`` are ignored. An empty directory yields
	no work and no error.

	Args:
		n: size label used in the ``<n>trees`` sub-directory name and in
			the output folder names.
		filename: directory that contains the ``<n>trees`` folder.

	Raises:
		ValueError: if the numbers of species and gene tree files differ.
	"""
	# load species+gene trees
	name = filename + '/{}trees'.format(n)

	probabilities = [0.15, 0.30, 0.50]
	folderName = 'exact_solutions/{}_{}_{}nodes{}/'
	# (folder suffix, only_add, only_delete, message on save, message on failure)
	variants = [
		('', False, False,
			"Saving data...", "No solution found!"),
		('_deletion', False, True,
			"Saving data (deletion)...", "No solution found for deletion only!"),
		('_insertion', True, False,
			"Saving data (insertion)...", "No solution found for insertion only!"),
	]

	# os.walk yields entries in arbitrary order; sort so that the index-wise
	# species/gene pairing below is reproducible.
	tree_files = sorted(
		file for _, _, files in os.walk(name) for file in files)
	species_trees = [f for f in tree_files if f.startswith('species')]
	gene_trees = [f for f in tree_files if f.startswith('gene')]

	if len(species_trees) != len(gene_trees):
		raise ValueError(
			"found {} species tree file(s) but {} gene tree file(s) in "
			"'{}'".format(len(species_trees), len(gene_trees), name))

	ID = 0
	for species_file, gene_file in zip(species_trees, gene_trees):
		S = PhyloTree.load(name + '/{}'.format(species_file))
		TGT = PhyloTree.load(name + '/{}'.format(gene_file))
		OGT = te.observable_tree(TGT)
		ldt = ldt_graph(OGT, S)
		# perturb using p = 0.15, 0.3, 0.5
		# for each p, solve using ILP with deletion, insertion and both
		for p1 in probabilities:
			p_i = _probability_label(p1)
			for p2 in probabilities:
				p_d = _probability_label(p2)
				IG = InvestigateGraph(ldt)
				perturbed = IG.perturb_graph_terminate(p1, p2)
				if not perturbed:
					print("failed")
					continue

				# Phase 1: build and optimize all three models. A flag is
				# passed to LDTEditor only when it is set, as before.
				solvers = []
				for _, only_add, only_delete, _, _ in variants:
					solver_kwargs = {}
					if only_delete:
						solver_kwargs['only_delete'] = True
					if only_add:
						solver_kwargs['only_add'] = True
					solver = LDTEditor(IG._G_perturbed, **solver_kwargs)
					solver.build_model()
					solver.optimize(time_limit=None)
					solvers.append(solver)

				# Phase 2: validate each solution and save the valid ones.
				for solver, variant in zip(solvers, variants):
					suffix, only_add, only_delete, save_msg, fail_msg = variant
					sol_graph, sol_distance = solver.get_solution()

					properly_colored = is_properly_colored(sol_graph)
					cograph = is_cograph(sol_graph)
					compatible = is_compatible(sol_graph)

					if properly_colored and cograph and compatible:
						print(save_msg)
						solver._save_ILP_data(
							IG._G_perturbed,
							sol_graph,
							solver.get_solve_time(),
							sol_distance,
							i_p=p1,
							d_p=p2,
							only_add=only_add,
							only_delete=only_delete,
							saveFolder=folderName.format(p_i, p_d, n, suffix),
							ID=ID)
					else:
						print(fail_msg)

				# One ID per successfully perturbed graph, shared by the
				# three variants.
				ID += 1