Ejemplo n.º 1
0
def run_method(method, tree, m = 300, kappa = 2, mutation_rate=0.05, threshold = None, verbose = False):
    """Simulate HKY sequences on ``tree``, reconstruct a tree with ``method``, and score it.

    Args:
        method: Name of the reconstruction method. One of ``"RaXML"``,
            ``"SNJ"``, ``"NJ"``, ``"STR+NJ"``, ``"STR+SNJ"``, ``"STR+RaXML"``.
        tree: Reference tree. Used as the simulation model and as the ground
            truth passed to ``reconstruct_tree.compare_trees``.
        m: First positional argument of ``generation.simulate_sequences``
            (presumably the sequence length -- confirm against that API).
        kappa: Forwarded to ``generation.HKY``.
        mutation_rate: Forwarded to ``generation.simulate_sequences``.
        threshold: Forwarded as ``threshhold`` to the "STR+*" methods; the
            other methods ignore it. Printed when it is not ``None``.
        verbose: Forwarded to the "STR+*" methods.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time in seconds of the reconstruction call only (the
        simulation and the construction of the method object are excluded).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    supported = ("RaXML", "SNJ", "NJ", "STR+NJ", "STR+SNJ", "STR+RaXML")
    # Fail fast: without this check an unknown name would only surface as an
    # UnboundLocalError on ``tree_rec`` after the whole simulation had run.
    if method not in supported:
        raise ValueError(
            "Unknown method %r; expected one of: %s" % (method, ", ".join(supported)))

    start_time = time.time()
    observations, taxa_meta = generation.simulate_sequences(m, tree_model=tree, seq_model=generation.HKY(kappa = kappa), mutation_rate=mutation_rate, alphabet="DNA")
    runtime = time.time() - start_time
    print("Simulation took %s seconds" % runtime)

    similarity = reconstruct_tree.HKY_similarity_matrix
    if method == "RaXML":
        raxml_HKY = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = raxml_HKY(observations, taxa_meta, raxml_args="-T 2 --HKY85 -c 1")
    elif method == "SNJ":
        snj = reconstruct_tree.SpectralNeighborJoining(similarity)
        start_time = time.time()
        tree_rec = snj(observations, taxa_meta)
    elif method == "NJ":
        nj = reconstruct_tree.NeighborJoining(similarity)
        start_time = time.time()
        tree_rec = nj(observations, taxa_meta)
    else:
        # The three "STR+*" variants differ only in the base reconstruction
        # class, plus the extra RAxML arguments for "STR+RaXML".
        if method == "STR+NJ":
            base_method = reconstruct_tree.NeighborJoining
        elif method == "STR+SNJ":
            base_method = reconstruct_tree.SpectralNeighborJoining
        else:
            base_method = reconstruct_tree.RAxML
        extra_args = {"raxml_args": "-T 2 --HKY85 -c 1"} if method == "STR+RaXML" else {}
        spectral_method = reconstruct_tree.STDR(base_method, similarity)
        start_time = time.time()
        # ``threshhold`` (sic) is the keyword spelling the original call uses.
        tree_rec = spectral_method.deep_spectral_tree_reconstruction(
            observations, similarity,
            taxa_metadata = taxa_meta,
            threshhold = threshold, min_split = 5, verbose = verbose,
            **extra_args)
    runtime = time.time() - start_time

    RF,F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None: print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ",RF)
    print("F1% = ",F1) 
    return([method, str(threshold), runtime, RF, F1])
# Simulate characters on H3N2_tree under HKY85 (kappa = 2). N is the first
# argument of simulate_discrete_chars (presumably characters per taxon -- confirm).
N = 1000
data_HKY = simulate_discrete_chars(
    N,
    H3N2_tree,
    Hky85(kappa=2),
    mutation_rate=0.1,
)

# One row of state symbols per taxon, in taxon-namespace order.
ch_list = [[x.symbol for x in data_HKY[t]] for t in data_HKY.taxon_namespace]
ch_arr = np.array(ch_list)

# Fraction of matching positions for every ordered pair of rows (flattened).
identical = np.array([np.mean(a == b) for a, b in product(ch_arr, ch_arr)])

# (A cProfile run of HKY_similarity_matrix(ch_arr) used to live here; disabled.)

# Time STDR with RAxML as the base method; the timer deliberately starts
# before the STDR object is constructed, as in the original measurement.
threshold = 128
t1 = time.time()
spectral_method = reconstruct_tree.STDR(
    reconstruct_tree.RAxML,
    reconstruct_tree.HKY_similarity_matrix,
)
tree_rec = spectral_method.deep_spectral_tree_reconstruction(
    ch_arr,
    reconstruct_tree.HKY_similarity_matrix,
    taxon_namespace=H3N2_tree.taxon_namespace,
    threshhold=threshold,
    min_split=30,
)
runtime = time.time() - t1

# Compare the reconstructed tree with the reference tree and report.
# NOTE(review): the label says "SNJ" although the base method above is RAxML -- confirm.
Deep_nj_RF, Deep_nj_RF_F1 = reconstruct_tree.compare_trees(tree_rec, H3N2_tree)
print("SNJ: ")
print("RF = ", Deep_nj_RF)
print("F1% = ", Deep_nj_RF_F1)
print("runtime = ", runtime)
print("")


Ejemplo n.º 3
0
def run_method(method,
               size,
               run_num,
               tree,
               m=300,
               kappa=2,
               mutation_rate=0.05,
               threshold=None,
               verbose=False):
    """Simulate Jukes-Cantor sequences, run STDR+RAxML, and dump artefacts to disk.

    A fresh output folder (name derived from ``m``, ``method``, ``threshold``
    and ``run_num``) is created under a hard-coded scratch directory; any
    existing folder of the same name is deleted first. The folder receives:

    * ``true_tree.txt``    -- ``tree`` in Newick format
    * ``STDR_tree.txt``    -- the reconstructed tree in Newick format
    * ``taxa.txt``         -- one taxon label per line
    * ``HKY_distance.txt`` -- ``str(size)`` on the first line, followed by the
      tab-separated distance matrix with taxon labels as the row index
    * the ``subtree_%s.txt`` template is handed to the reconstruction call
      (presumably it writes the individual subtrees there -- confirm)

    Args:
        method: Label used in the folder name, printed, and returned. It does
            NOT select the algorithm: reconstruction is always STDR with RAxML.
        size: Written as the first line of the distance file (presumably the
            number of taxa -- confirm against the consumer of that file).
        run_num: Run identifier used in the folder name.
        tree: Reference tree; simulation model and ground truth for scoring.
        m: First positional argument of ``generation.simulate_sequences``;
            also appears in the folder name after ``seqlen_``.
        kappa: Unused in this function (the sequence model is Jukes-Cantor).
        mutation_rate: Forwarded to ``generation.simulate_sequences``.
        threshold: Forwarded as ``threshhold`` to the reconstruction call.
        verbose: Forwarded to the reconstruction call.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time in seconds of the STDR reconstruction call.
    """
    subtree_folder = "/gpfs/ysm/scratch60/morgan_levine/mw957/tree_merge_test/seqlen_" + str(
        m) + "_" + method + "_" + str(threshold) + "_" + str(run_num) + "/"
    if os.path.exists(subtree_folder):
        shutil.rmtree(subtree_folder)
    # makedirs (not mkdir) so a missing parent directory does not abort the run.
    os.makedirs(subtree_folder)
    tree.write(path=subtree_folder + "true_tree.txt", schema="newick")
    subtree_filename = subtree_folder + "subtree_%s.txt"

    start_time = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.Jukes_Cantor(),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    runtime = time.time() - start_time
    print("Simulation took %s seconds" % runtime)

    spectral_method = reconstruct_tree.STDR(
        reconstruct_tree.RAxML, reconstruct_tree.JC_similarity_matrix)
    start_time = time.time()
    # NOTE(review): RAxML is given "--HKY85" although the data are simulated
    # under Jukes-Cantor -- confirm this is intended.
    tree_rec = spectral_method.deep_spectral_tree_reconstruction(
        observations,
        reconstruct_tree.JC_similarity_matrix,
        taxa_metadata=taxa_meta,
        threshhold=threshold,
        raxml_args="-T 2 --HKY85 -c 1",
        min_split=5,
        verbose=verbose,
        subtree_filename=subtree_filename)
    runtime = time.time() - start_time
    tree_rec.write(path=subtree_folder + "STDR_tree.txt", schema="newick")

    distance = reconstruct_tree.JC_distance_matrix(observations, taxa_meta)
    distance_pd = pd.DataFrame(distance)
    taxa_list = [x.label for x in taxa_meta]

    with open(subtree_folder + 'taxa.txt', 'w') as f:
        f.writelines("%s\n" % item for item in taxa_list)

    distance_pd.index = taxa_list
    # The file is named "HKY_distance" but holds the JC distance matrix; the
    # name is kept because other tools may look for this exact path.
    distance_path = subtree_folder + "HKY_distance.txt"
    distance_pd.to_csv(distance_path, sep="\t", header=False)
    # Prepend the size line by rewriting the file with the header in front.
    with open(distance_path, 'r') as original:
        data = original.read()
    with open(distance_path, 'w') as modified:
        modified.write(str(size) + "\n" + data)

    # accuracy of the STDR method
    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)

    if threshold is not None: print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return ([method, str(threshold), runtime, RF, F1])