def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA", codon_table=None):
    """Build one tree from the dN distances and one from the dS distances.

    Arguments:
     - dn_ds_method - forwarded to ``self.get_dn_ds_matrix`` as ``method``
       (documented options: NG86, LWL85, YN00 and ML).
     - tree_method - "UPGMA" or "NJ".
     - codon_table - forwarded to ``self.get_dn_ds_matrix``; when None,
       ``CodonTable.generic_by_id[1]`` is used.

    Returns a ``(dn_tree, ds_tree)`` tuple.

    Raises RuntimeError when ``tree_method`` is neither "UPGMA" nor "NJ".
    """
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    table = CodonTable.generic_by_id[1] if codon_table is None else codon_table
    dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method, codon_table=table)
    dn_builder = DistanceTreeConstructor()
    ds_builder = DistanceTreeConstructor()
    if tree_method not in ("UPGMA", "NJ"):
        raise RuntimeError(f"Unknown tree method ({tree_method})."
                           " Only NJ and UPGMA are accepted.")
    # Both constructors expose the algorithm under the lower-case name.
    algorithm = "upgma" if tree_method == "UPGMA" else "nj"
    dn_tree = getattr(dn_builder, algorithm)(dn_dm)
    ds_tree = getattr(ds_builder, algorithm)(ds_dm)
    return dn_tree, ds_tree
def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA", codon_table=default_codon_table):
    """Construct a dn tree and a ds tree from the pairwise dN/dS matrices.

    Argument:
     - dn_ds_method - forwarded to ``self.get_dn_ds_matrix`` as ``method``
       (documented options: NG86, LWL85, YN00 and ML).
     - tree_method - "UPGMA" or "NJ".
     - codon_table - forwarded unchanged to ``self.get_dn_ds_matrix``.

    Returns the pair ``(dn_tree, ds_tree)``; raises RuntimeError for any
    other ``tree_method``.
    """
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method, codon_table=codon_table)
    dn_constructor = DistanceTreeConstructor()
    ds_constructor = DistanceTreeConstructor()
    # Pick the bound construction method once, then apply it to each matrix.
    if tree_method == "UPGMA":
        build_dn, build_ds = dn_constructor.upgma, ds_constructor.upgma
    elif tree_method == "NJ":
        build_dn, build_ds = dn_constructor.nj, ds_constructor.nj
    else:
        raise RuntimeError("Unknown tree method ({0}). Only NJ and UPGMA "
                           "are accepted.".format(tree_method))
    return build_dn(dn_dm), build_ds(ds_dm)
def make_newick_tree(dm, outgroup="KE136308.1"):
    """Build a UPGMA tree and an NJ tree from ``dm`` and root both on one taxon.

    :param dm: distance matrix accepted by ``DistanceTreeConstructor``
    :param outgroup: name of the clade used as outgroup for rooting; defaults
        to the accession that was previously hard-coded
    :return: tuple ``(upgma_tree, nj_tree)``, both rooted with ``outgroup``
    """
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    njtree = constructor.nj(dm)
    for tree in (upgmatree, njtree):
        tree.root_with_outgroup({'name': outgroup})
    return upgmatree, njtree
def printGeneTree(self):
    """
    Print the distance matrix and a UPGMA gene tree for the alignment in
    self.newPhylip.

    The PHYLIP alignment is read, pairwise distances are computed with the
    'identity' model, and a UPGMA tree is built from them.

    input: self.newPhylip, path (or handle) of the .phy alignment to read
    output: nothing is returned; the matrix and an ASCII tree are printed to
            the terminal and the tree is drawn with Phylo.draw (needs
            matplotlib)
    """
    alignment = AlignIO.read(self.newPhylip, 'phylip')
    distances = DistanceCalculator('identity').get_distance(alignment)

    matrix_banner = '\n======================================== DISTANCE MATRIX =======================================\n'
    print(matrix_banner)
    print(distances, "\n\n")

    # UPGMA clustering on the identity distances
    gene_tree = DistanceTreeConstructor().upgma(distances)

    tree_banner = '\n========================================= GENE TREE ===========================================\n'
    print(tree_banner)
    Phylo.draw(gene_tree)  # graphical rendering
    Phylo.draw_ascii(gene_tree)  # terminal rendering
def construct_tree(matrix, nj=True):
    """Build a tree from a distance matrix.

    Uses neighbor-joining when ``nj`` is true, otherwise UPGMA. Taxa are
    named "0", "1", ... after their row index in ``matrix``.

    Returns the tree with blank inner-node names, or None (after printing a
    message) when ``matrix`` is not a non-empty list.
    """
    # Check the type before the truth value so objects whose truthiness is
    # ambiguous (e.g. NumPy arrays) are rejected cleanly instead of raising.
    if not isinstance(matrix, list) or not matrix:
        print("matrix has invalid value")
        return None
    dm = _DistanceMatrix(names=[str(i) for i in range(len(matrix))], matrix=matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm) if nj else constructor.upgma(dm)
    # this will remove the names from the inner nodes
    # this is critical for seq-gen to read in the tree
    for clade in tree.get_nonterminals():
        clade.name = ''
    return tree
def dna(file_path, file_format, algorithm):
    """Read an alignment, print its identity distance matrix and draw a tree.

    :param file_path: alignment file passed to ``AlignIO.read``
    :param file_format: alignment format name passed to ``AlignIO.read``
    :param algorithm: 'upgma' or 'nj' (case-insensitive)

    For any other ``algorithm`` the message 'Invalid algorithm!' is echoed and
    the function returns without drawing anything.
    """
    # Read the sequences and align
    aln = AlignIO.read(file_path, file_format)
    # Print the alignment
    print(aln)
    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    # Print the distance matrix (the matrix itself, not the calculator object)
    print('\nDistance Matrix\n===================')
    print(dm)
    # Construct the phylogenetic tree using the chosen algorithm
    constructor = DistanceTreeConstructor()
    method = algorithm.lower()
    if method == 'upgma':
        tree = constructor.upgma(dm)
    elif method == 'nj':
        tree = constructor.nj(dm)
    else:
        click.echo('Invalid algorithm!')
        # No tree was built, so there is nothing to draw.
        return
    # Draw the phylogenetic tree
    Phylo.draw(tree)
    # Print the phylogenetic tree in the terminal
    print('\nPhylogenetic Tree\n===================')
    Phylo.draw_ascii(tree)
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        # Close the alignment file as soon as it has been parsed.
        with open('TreeConstruction/msa.phy') as handle:
            self.aln = AlignIO.read(handle, 'phylip')
        calculator = DistanceCalculator('blosum62')
        self.dm = calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(calculator)

    def _assert_matches_reference(self, tree, ref_path):
        """Check ``tree`` is a Tree whose newick text equals line 1 of ``ref_path``."""
        self.assertTrue(isinstance(tree, BaseTree.Tree))
        tree_file = StringIO.StringIO()
        Phylo.write(tree, tree_file, 'newick')
        # ``with`` closes the reference file even when the assertion fails.
        with open(ref_path) as ref_tree:
            self.assertEqual(tree_file.getvalue(), ref_tree.readline())

    def test_upgma(self):
        tree = self.constructor.upgma(self.dm)
        self._assert_matches_reference(tree, './TreeConstruction/upgma.tre')

    def test_nj(self):
        tree = self.constructor.nj(self.dm)
        self._assert_matches_reference(tree, './TreeConstruction/nj.tre')

    def test_built_tree(self):
        tree = self.constructor.build_tree(self.aln)
        self._assert_matches_reference(tree, './TreeConstruction/nj.tre')
def build_trees(filename, tree_name):
    """Align ``<filename>.fa`` with ClustalW and plot UPGMA and NJ trees.

    The alignment is read back from ``<filename>.aln``, distances use the
    'blosum62' model, and each tree is shown in its own matplotlib figure
    titled with ``tree_name`` and the method.
    """
    # Run the ClustalW command line wrapper
    run_clustalw = ClustalwCommandline("clustalw", infile="{}.fa".format(filename))
    run_clustalw()
    alignment = AlignIO.read("{}.aln".format(filename), format="clustal")

    dist_matrix = DistanceCalculator('blosum62').get_distance(alignment)

    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dist_matrix)
    nj_tree = constructor.nj(dist_matrix)

    def label_func(clade):
        # Hide labels of clades whose name starts with "Inner"
        if clade.name.startswith("Inner"):
            return ""
        return clade

    for tree, title in ((upgma_tree, "{} × upgma"), (nj_tree, "{} × nj")):
        Phylo.draw(tree, label_func=label_func, do_show=False)
        plt.title(title.format(tree_name))
        plt.show()
def main(argv):
    """Draw a UPGMA tree from a square CSV distance table.

    :param argv: argument list; ``argv[1]`` is the path of a comma-delimited
        square matrix with one row per label in ``M_labels``
    :raises ValueError: if the loaded table is not len(M_labels) x len(M_labels)
    """
    # Test table data and corresponding labels
    M_labels = [
        'Wuttagoonaspis', 'Romundina', 'Brindabellaspis', 'Eurycaraspis',
        'Entelognathus'
    ]
    print(M_labels)
    # loadtxt opens and closes the file itself when handed a path
    table = np.loadtxt(argv[1], delimiter=",")
    size = len(M_labels)
    if table.shape != (size, size):
        raise ValueError(
            "expected a {0}x{0} distance table, got shape {1}".format(size, table.shape)
        )
    # Lower-triangular rows: entries left of the diagonal plus a zero
    # self-distance. Slicing keeps genuine distances equal to 1, which the
    # previous "fill the upper triangle with 1 and filter 1 out" trick dropped.
    M = [table[i, :i].tolist() + [0.0] for i in range(size)]
    m = _Matrix(M_labels, M)
    print(type(m))
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(m)
    Phylo.draw(tree)
def build_phylogeny_trees():
    """Align every file in out/homologous_gene_sequences/ and build its tree.

    For each file: run Clustal Omega (via ``os.system``) writing the alignment
    to out/aligned_homologous_gene_sequences/, compute 'identity' distances,
    build a UPGMA tree, draw it (graphically and as ASCII) and write it in
    nexus format under out/phylogenetic_trees/.
    """
    source_dir = "out/homologous_gene_sequences/"
    aligned_dir = "out/aligned_homologous_gene_sequences/"
    for file_name in os.listdir(source_dir):
        unaligned_path = source_dir + file_name
        aligned_path = aligned_dir + file_name
        command = ClustalOmegaCommandline(infile=unaligned_path,
                                          outfile=aligned_path,
                                          verbose=True,
                                          auto=True)
        os.system(str(command))

        alignment = AlignIO.read(aligned_path, 'fasta')
        # Distance Matrix
        distances = DistanceCalculator('identity').get_distance(alignment)
        upgma_tree = DistanceTreeConstructor().upgma(distances)

        Phylo.draw(upgma_tree)
        print('\nPhylogenetic Tree\n', file_name)
        Phylo.draw_ascii(upgma_tree)
        Phylo.write([upgma_tree],
                    'out/phylogenetic_trees/{}_tree.nex'.format(file_name),
                    'nexus')
def main():
    """Time neighbor joining and UPGMA on the alignment in data/coding.fa.

    The FASTA records are collected into a gapped DNA alignment, 'identity'
    distances are computed once, then each algorithm is timed and its tree
    drawn with ``get_label`` as the label function.
    """
    file_name = "data/coding.fa"
    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    for seq_record in SeqIO.parse(file_name, "fasta"):
        alignment.extend([seq_record])
    print("Number of characters in alignment:", len(alignment[0]))

    dm = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()

    # (tree builder, report template) in the order they are run
    runs = (
        (constructor.nj, "Neighbor joining ran in {} seconds."),
        (constructor.upgma, "UPGMA ran in {} seconds."),
    )
    for build, report in runs:
        start = time.time()
        tree = build(dm)
        end = time.time()
        print(report.format(end - start))
        Phylo.draw(tree, label_func=get_label)
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
        blosum = DistanceCalculator('blosum62')
        self.dm = blosum.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum)

    def _check_topology(self, tree, reference_path):
        """Assert ``tree`` is a Tree with the topology stored at ``reference_path``."""
        self.assertTrue(isinstance(tree, BaseTree.Tree))
        expected = Phylo.read(reference_path, 'newick')
        self.assertTrue(Consensus._equal_topology(tree, expected))

    def test_upgma(self):
        self._check_topology(self.constructor.upgma(self.dm),
                             './TreeConstruction/upgma.tre')

    def test_nj(self):
        self._check_topology(self.constructor.nj(self.dm),
                             './TreeConstruction/nj.tre')

    def test_built_tree(self):
        self._check_topology(self.constructor.build_tree(self.aln),
                             './TreeConstruction/nj.tre')
def plot_phylo_tree(align: MultipleSeqAlignment, accession_numbers: dict):
    """
    Plots a phylogenetic tree

    :param align: MultipleSeqAlignment with the alignment result to be plotted
    :param accession_numbers: dict mapping accession numbers (terminal name up
        to its last dot) to human-understandable names
    :return: figure-handle of the plotted phylogenetic tree
    :raises KeyError: if a terminal's accession is missing from
        ``accession_numbers``
    """
    # calculate distance - https://biopython.org/wiki/Phylo
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(align)

    # construct a tree
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)

    # remove the names for the non-terminals for better visual appeal
    for non_terminal in tree.get_nonterminals():
        non_terminal.name = ''

    # change accession numbers into more understandable names
    for terminal in tree.get_terminals():
        # Raw string avoids invalid-escape warnings; the pattern is unchanged.
        match = re.match(r"(^\S*)(?=\.)", terminal.name)
        # A name without a dot has no suffix to strip: look it up as-is
        # instead of failing on ``None[0]``.
        accession = match[0] if match else terminal.name
        terminal.name = accession_numbers[accession]

    # draw_ascii prints the tree itself and returns None, so it is not
    # wrapped in print()
    Phylo.draw_ascii(tree)

    # plot the tree
    fig, ax = plt.subplots(1, 1)

    # draw the resulting tree
    Phylo.draw(tree, show_confidence=False, axes=ax, do_show=False)
    ax.set_xlim(right=0.8)

    return fig
def measure_D_net(G,qmod,qcon):
    """Build the network distance matrix of ``G`` and write its UPGMA tree.

    The distance between nodes u and v is ``1.0 - G.dmc_likelihood(u, v,
    qmod, qcon)``; only the lower triangle (v <= u, in sorted node order) is
    computed. The UPGMA tree is written to 'ph_dmc.nre' in newick format.

    Returns the ``_DistanceMatrix`` whose taxa are named 'Taxon<node>'.
    """
    nodes = sorted(G)
    D_net = []
    for u in nodes:
        tmp_row = []
        for v in nodes:
            if u < v:
                continue
            distance = 1.0 - G.dmc_likelihood(u,v,qmod,qcon)
            tmp_row.append(distance)
            print(distance, end=' ')
        D_net.append(tmp_row)
        print('\n')
    # Names must follow the same (sorted) order as the matrix rows; iterating
    # G directly could label rows with the wrong taxon.
    names = ['Taxon'+str(u) for u in nodes]
    print(names)
    print(D_net)
    D_net_final = _DistanceMatrix(names,D_net)
    constructor = DistanceTreeConstructor()
    tree_dmc = constructor.upgma(D_net_final)
    Phylo.write(tree_dmc,'ph_dmc.nre','newick')
    return D_net_final
def upgma_tree_constructor(x):
    """Print the UPGMA tree of alignment ``x`` (identity distances).

    The tree is printed twice: once via ``print`` and once as ASCII art.
    Nothing is returned.
    """
    tree_builder = DistanceTreeConstructor()
    distances = DistanceCalculator("identity").get_distance(x)
    tree = tree_builder.upgma(distances)
    print(tree)
    Phylo.draw_ascii(tree)
def construct_tree(gene_name, with_marburg=1, algorithm='UPGMA'):
    """Build and save a tree from a gene's stored edit-distance matrix.

    :param gene_name: gene whose matrix is read from
        ./Output/edit_matrices/<gene_name>.csv
    :param with_marburg: when equal to 1 the five base viruses are used;
        any other value adds Marburg, re-runs the alignment helpers and reads
        the '<gene_name>_with_marburg' matrix instead
    :param algorithm: 'NJ' for neighbor joining; anything else uses UPGMA
    :return: the constructed tree (also passed to ``save_tree``)

    NOTE(review): ``with_marburg == 1`` selects the branch *without* Marburg,
    which looks inverted relative to the parameter name - confirm with callers.
    """
    names = ['Bundibugyo', 'Reston', 'Sudan', 'TaiForest', 'Zaire']
    if with_marburg == 1:
        print('Constructing Tree with All Viruses without Marburg')
        filename = algorithm + '_' + gene_name
    else:
        print('Constructing {0}\'s Tree with All Viruses with Marburg'.format(gene_name))
        filename = algorithm + '_' + gene_name + '_with_Marburg'
        names.append('Marburg')
        marburg_genome = SeqIO.read("./Data/Marburg_genome.fasta", "fasta")
        Alignment.read_data()
        print('Aligning Genes for marburg_genome')
        gene_name += '_with_marburg'
        Alignment.read_genes(marburg_genome)

    print('Reading edit matrix and construct tree')
    csv_path = "./Output/edit_matrices/" + gene_name + ".csv"
    edit_matrix = pd.read_csv(csv_path, header=None)
    constructor = DistanceTreeConstructor()
    # DistanceMatrix needs the lower-triangular form
    lower_triangular = convert_tu_lower_triangular(edit_matrix)
    distance_matrix = DistanceMatrix(names=names, matrix=lower_triangular)

    build = constructor.nj if algorithm == 'NJ' else constructor.upgma
    tree = build(distance_matrix)
    save_tree(tree, filename)
    return tree
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
        calc = DistanceCalculator('blosum62')
        self.dm = calc.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(calc)

    def _expect_reference_topology(self, built, newick_path):
        """Assert ``built`` is a Tree matching the newick tree at ``newick_path``."""
        self.assertTrue(isinstance(built, BaseTree.Tree))
        reference = Phylo.read(newick_path, 'newick')
        same_shape = Consensus._equal_topology(built, reference)
        self.assertTrue(same_shape)

    def test_upgma(self):
        built = self.constructor.upgma(self.dm)
        self._expect_reference_topology(built, './TreeConstruction/upgma.tre')

    def test_nj(self):
        built = self.constructor.nj(self.dm)
        self._expect_reference_topology(built, './TreeConstruction/nj.tre')

    def test_built_tree(self):
        built = self.constructor.build_tree(self.aln)
        self._expect_reference_topology(built, './TreeConstruction/nj.tre')
def tree_reconstruction(phy_file, method, model, phyformat):
    '''Construct tree with given method and model

    Reads ``phy_file`` as 'phylip-<phyformat>', builds the tree, blanks the
    names of clades containing 'Inner', then writes <args.output>/tree.nwk
    and <args.output>/tree.svg (``args`` is a module-level object).

    method: 'upgma' or 'nj'
    model: distance model name passed to DistanceCalculator

    Raises ValueError for any other ``method``, before any file is read.
    '''
    # Fail fast with a clear message; previously an unknown method surfaced
    # later as an unbound local variable.
    if method not in ('upgma', 'nj'):
        raise ValueError(
            "Unknown tree construction method {!r}; expected 'upgma' or 'nj'.".format(method)
        )
    aln = AlignIO.read(phy_file, 'phylip-' + phyformat)
    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator(model)
    dm = calculator.get_distance(aln)
    if method == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    tree.ladderize()
    for c in tree.find_clades():
        # Guard against unnamed clades before the substring test
        if c.name and 'Inner' in c.name:
            c.name = ''
    Phylo.write(tree, args.output + '/tree.nwk', 'newick')
    plt.rcParams['font.style'] = 'italic'
    plt.rc('font', size=8)
    plt.rc('axes', titlesize=14)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('figure', titlesize=18)
    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
def D_seq_matrix(fasta_file):
    """Compute the identity distance matrix of a FASTA alignment.

    As a side effect the UPGMA tree of that matrix is written to
    'ph_seq.nre' in newick format and the taxon names are printed.

    Returns the distance matrix.
    """
    aln = AlignIO.read(fasta_file, 'fasta')
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    tree_seq = constructor.upgma(dm)
    Phylo.write(tree_seq,'ph_seq.nre','newick')
    # Function-call form works on Python 3 (and on Python 2 for one argument)
    print(dm.names)
    return dm
def createTree(file, output='new.xml'):
    """Build a UPGMA tree from a PHYLIP alignment and save it as phyloXML.

    :param file: PHYLIP alignment passed to ``AlignIO.read``
    :param output: destination passed to ``Phylo.write``; defaults to the
        previously hard-coded 'new.xml'
    """
    aln = AlignIO.read(file, 'phylip')
    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    # Construct the phylogenetic tree using UPGMA algorithm
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)
    Phylo.write(tree, output, 'phyloxml')
def print_trees(country, position_table):
    """Draw NJ and UPGMA trees for the 10 sequences furthest from consensus.

    :param country: label printed (upper-cased) above the trees
    :param position_table: DataFrame with a 'seqid' column; the remaining
        columns are compared position by position

    The per-column mode of the table is used as the consensus; the ten rows
    with the most mismatches against it are compared pairwise (mismatch
    counts) to form the distance matrix. Nothing is returned.
    """
    consensus = position_table.drop('seqid', axis=1).mode(axis=0).T[0]
    position_table = position_table.set_index('seqid')

    # Rank samples by their number of mismatches against the consensus
    mismatch_counts = position_table.apply(
        lambda row: sum(row != consensus), axis=1)
    ranked = mismatch_counts.sort_values(ascending=False)
    subset_seqs = ranked[:10].index

    # Symmetric pairwise mismatch counts, keyed by (seqid, seqid)
    distances = {}
    for i, first in enumerate(subset_seqs):
        distances[first, first] = 0
        first_row = position_table.loc[first]
        for second in subset_seqs[i + 1:]:
            n_diff = sum(first_row != position_table.loc[second])
            distances[first, second] = n_diff
            distances[second, first] = n_diff
    square = pd.Series(distances).unstack()

    # DistanceMatrix takes lower-triangular rows (row i has i + 1 entries)
    matrix = [
        row[:i + 1] for i, row in enumerate(np.tril(square.values).tolist())
    ]
    dm = DistanceMatrix(list(square.index), matrix)

    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    print(country.upper())
    print("Neighbor Joining Tree")
    tree.ladderize()  # Flip branches so deeper clades are displayed at top
    display(Phylo.draw(tree))

    if len(dm) > 1:
        # UPGMA: clustering with the Unweighted Pair Group Method with
        # Arithmetic Mean
        tree2 = constructor.upgma(dm)
        print("UPGMA Tree")
        tree2.ladderize()  # Flip branches so deeper clades are displayed at top
        display(Phylo.draw(tree2))
    return
def phyloxml_from_msa(msa, phyloxml):
    """Write the UPGMA tree of a FASTA alignment as phyloXML.

    :param msa: FASTA multiple sequence alignment passed to ``AlignIO.read``
    :param phyloxml: destination passed to ``Phylo.write``

    Distances are computed with the "ident" model.
    """
    from Bio import AlignIO
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
    from Bio import Phylo

    alignment = AlignIO.read(msa, "fasta")
    distances = DistanceCalculator("ident").get_distance(alignment)
    upgma_tree = DistanceTreeConstructor().upgma(distances)
    Phylo.write(upgma_tree, phyloxml, "phyloxml")
def draw(self):
    """
    visualize the phylo tree

    Builds a UPGMA tree from ``self.distMat`` / ``self.names`` and prints it
    as ASCII art.
    """
    # Keep only the strictly positive entries of every row.
    # NOTE(review): presumably this is meant to leave the lower-triangular
    # rows DistanceMatrix expects; it also drops zeros on the diagonal and
    # any genuine zero distances - confirm against the shape of distMat.
    rows = [[value for value in row if value > 0]
            for row in self.distMat.tolist()]
    builder = DistanceTreeConstructor()
    upgmatree = builder.upgma(DistanceMatrix(self.names, rows))
    Phylo.draw_ascii(upgmatree)
def build_tree_UPGMA(msa, distanceMatrix=None):
    """Return the UPGMA tree of an alignment as a rooted-newick string.

    :param msa: alignment used to compute 'identity' distances when no
        matrix is supplied
    :param distanceMatrix: optional precomputed distance matrix; when given,
        ``msa`` is not used
    :return: the newick text of the tree, stripped and prefixed with "[&R] "
    """
    # Test for None explicitly: truth-testing a matrix object depends on its
    # length and is ambiguous for array-like inputs.
    if distanceMatrix is None:
        distCalculator = DistanceCalculator("identity")
        distanceMatrix = distCalculator.get_distance(msa)

    # Construct the tree with the distance Matrix
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(distanceMatrix)

    # return newick format
    return "[&R] " + tree.format("newick").strip()
def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA"):
    """Method for constructing dn tree and ds tree.

    Argument:
     - dn_ds_method - Available methods include NG86, LWL85, YN00 and ML.
     - tree_method - Available methods include UPGMA and NJ.

    Returns a ``(dn_tree, ds_tree)`` tuple.

    Raises RuntimeError when ``tree_method`` is neither "UPGMA" nor "NJ".
    """
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method)
    dn_constructor = DistanceTreeConstructor()
    ds_constructor = DistanceTreeConstructor()
    if tree_method == "UPGMA":
        dn_tree = dn_constructor.upgma(dn_dm)
        ds_tree = ds_constructor.upgma(ds_dm)
    elif tree_method == "NJ":
        dn_tree = dn_constructor.nj(dn_dm)
        ds_tree = ds_constructor.nj(ds_dm)
    else:
        # Message spelling fixed ("Unkown" -> "Unknown")
        raise RuntimeError("Unknown tree method ({0}). Only NJ and UPGMA "
                           "are accepted.".format(tree_method))
    return dn_tree, ds_tree
def UPGMA_tree_reconstruction(dm):
    '''
    Build a UPGMA tree from a distance matrix, print it and return it.

    input:
        dm: distance matrix
    output:
        tree: reconstructed tree
    '''
    upgma_tree = DistanceTreeConstructor().upgma(dm)
    print(upgma_tree)
    return upgma_tree
def display(self):
    """Print ``self.distance_matrix`` and draw its UPGMA tree.

    Calls ``self.create_description_labels()`` first, then hands the tree to
    ``self.draw_tree``.
    """
    # Labels must exist before the tree is drawn
    self.create_description_labels()

    print('\nDistance Matrix\n===================')
    print(self.distance_matrix)

    # UPGMA tree of the stored distance matrix
    upgma_tree = DistanceTreeConstructor().upgma(self.distance_matrix)
    self.draw_tree(upgma_tree)
def construct_tree(align, ssr_regions, motifs, weights=(1, 0.1)):
    """
    Construct an upgma tree based on a pairwise Levenshtein distance matrix.

    For each pairwise comparison, the Levenshtein distances are calculated
    for sequences of non-SSR and SSR regions separately, and the weighted sum
    of them is used as the distance to construct an upgma tree. In SSR
    regions every occurrence of the repeat motif is replaced by a single
    character, so one repeat difference counts as one edit.

    Parameters
    ----------
    align: Bio.AlignIO.MultipleSeqAlignment
        input sequence alignment
    ssr_regions: list of tuple
        start and end positions of SSR regions in the alignment
    motifs: list
        repeat motifs, one per SSR region
    weights: sequence of two numbers
        weights for non-SSR and SSR regions to calculate pairwise distances
        (default: (1, 0.1))

    Returns
    -------
    The UPGMA tree; taxa are named "seq0", "seq1", ... in alignment order.
    """
    # Alignment columns covered by any SSR region; computed once because it
    # does not depend on the individual sequence.
    ssr_positions = {
        pos for region in ssr_regions for pos in range(*region)
    }
    non_ssr_seqs = []
    ssr_seqs = []
    for record in align:
        seq = str(record.seq.upper())
        # Walk the columns in order: the previous list(set - set) had no
        # guaranteed ordering and could scramble the non-SSR sequence.
        non_ssr_seqs.append(
            "".join(base for pos, base in enumerate(seq)
                    if pos not in ssr_positions)
        )
        ssr_seqs.append(
            "".join(
                seq[region[0]:region[1]].replace("-", "").replace(mot, "x")
                for region, mot in zip(ssr_regions, motifs)
            )
        )
    mat1 = pairwise_dist_Levenstein(non_ssr_seqs)
    mat2 = pairwise_dist_Levenstein(ssr_seqs)
    mat = [
        list(np.array(i) * weights[0] + np.array(j) * weights[1])
        for i, j in zip(mat1, mat2)
    ]
    names = ["seq{}".format(i) for i in range(len(align))]
    dmat = _DistanceMatrix(names, mat)
    constructor = DistanceTreeConstructor()
    return constructor.upgma(dmat)
def ex01():
    """Draw UPGMA and NJ trees (blosum62 distances) for every alignment.

    ``get_alignments()`` supplies ``(alignment, name)`` pairs; the name is
    printed before the two trees are drawn.
    """
    alignments = get_alignments()
    calculator = DistanceCalculator('blosum62')
    constructor = DistanceTreeConstructor()
    for entry in alignments:
        alignment, name = entry
        dist_matrix = calculator.get_distance(alignment)
        trees = (constructor.upgma(dist_matrix), constructor.nj(dist_matrix))
        print("\n\n>>> {}".format(name))
        for tree in trees:
            draw(tree)
def onClick5(self):
    """Draw the UPGMA tree of the alignment stored in Conjunto_fasta.aln.

    The tree is printed as ASCII art and then drawn with ``Phylo.draw``.
    """
    # Read the alignment file in 'clustal' format
    with open("Conjunto_fasta.aln", "r") as handle:
        alignment = align.read(handle, "clustal")
    # Compute the distance matrix
    calculator = DistanceCalculator('identity')
    distances = calculator.get_distance(alignment)
    # Build the phylogenetic tree from the distances
    tree_builder = DistanceTreeConstructor(calculator)
    upgma_tree = tree_builder.upgma(distances)
    Phylo.draw_ascii(upgma_tree)
    Phylo.draw(upgma_tree)
def run_optimization():
    '''
    Compare the curves from ``get_data()`` pairwise and build trees from them.

    The absolute value of ``compare_curves`` for every pair fills a symmetric
    distance array, which is shown as a heat map and turned into NJ and UPGMA
    trees printed as ASCII art.

    Returns the square NumPy distance array.
    '''
    params = get_data()
    labels = ['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'b3', 'b4',
              'c1', 'c2', 'c3', 'c4', 'd1', 'd2', 'd3', 'd4']
    num_samples = 16
    NUM_OF_VERTICES = 200

    distances = np.zeros((num_samples, num_samples))
    for i in range(num_samples):
        for j in range(i + 1, num_samples):
            print("working on the pair", (i, j))
            distances[i, j] = np.abs(compare_curves(params[i], params[j], num_of_verts=NUM_OF_VERTICES))
            distances[j, i] = distances[i, j]

    # Plot distance matrix and make phylogenetic tree
    plt.matshow(distances)
    plt.colorbar()
    # The call parentheses were missing, so the figure was never shown.
    plt.show()

    # Lower-triangular rows, sized by num_samples instead of a repeated
    # literal 16
    lower_rows = [list(distances[i, :i + 1]) for i in range(num_samples)]
    distance_matrix = DistanceMatrix(names=labels, matrix=lower_rows)
    constructor = DistanceTreeConstructor()
    tree_up = constructor.upgma(distance_matrix)
    tree_nj = constructor.nj(distance_matrix)

    Phylo.draw_ascii(tree_nj)
    Phylo.draw_ascii(tree_up)

    return distances
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor."""

    def setUp(self):
        self.aln = AlignIO.read("TreeConstruction/msa.phy", "phylip")
        blosum = DistanceCalculator("blosum62")
        self.dm = blosum.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum)

    def _check_topology(self, tree, reference_path):
        """Assert ``tree`` is a Tree with the topology stored at ``reference_path``."""
        self.assertIsInstance(tree, BaseTree.Tree)
        expected = Phylo.read(reference_path, "newick")
        self.assertTrue(Consensus._equal_topology(tree, expected))

    def test_upgma(self):
        self._check_topology(self.constructor.upgma(self.dm),
                             "./TreeConstruction/upgma.tre")

    def test_nj(self):
        self._check_topology(self.constructor.nj(self.dm),
                             "./TreeConstruction/nj.tre")

        # create a matrix of length 2 by repeatedly dropping the last taxon
        self.min_dm = DistanceCalculator("blosum62").get_distance(self.aln)
        while len(self.min_dm) > 2:
            del self.min_dm[len(self.min_dm) - 1]
        self._check_topology(self.constructor.nj(self.min_dm),
                             "./TreeConstruction/nj_min.tre")

    def test_built_tree(self):
        self._check_topology(self.constructor.build_tree(self.aln),
                             "./TreeConstruction/nj.tre")
def get_phylogenetic_tree(max_str_len=1, norm="JSD", cpc_function="Square25", joining_alg="nj"):
    """Build a tree from the pairwise matrix of all input files and save it.

    :param max_str_len: forwarded to ``pd_matrix``
    :param norm: forwarded to ``pd_matrix``
    :param cpc_function: forwarded to ``pd_matrix`` (previously ignored in
        favour of a hard-coded "Square25")
    :param joining_alg: "nj" or "upgma"
    :raises ValueError: for any other ``joining_alg``, before any work is done

    The tree is written in newick format to 'phylo-tree/result.xml'.
    """
    if joining_alg not in ("nj", "upgma"):
        raise ValueError(
            "Unknown joining algorithm {!r}; expected 'nj' or 'upgma'.".format(joining_alg)
        )
    desc, genes = iter_over_files()
    pm = pd_matrix(genes, max_str_len=max_str_len, norm=norm, cpc_function=cpc_function)
    pm = convert_triangle(pm)
    dm = DistanceMatrix(names=desc, matrix=pm)
    constructor = DistanceTreeConstructor()
    if joining_alg == "nj":
        tree = constructor.nj(dm)
    else:
        tree = constructor.upgma(dm)
    Phylo.write(tree, 'phylo-tree/result.xml', 'newick')
def main():
    """Build a tree from the matrix named by the module-level ``args``.

    Reads ``args.DISTMAT`` with ``ParseMatrix``, optionally prints the matrix
    (``args.distout``), builds the tree with ``args.method`` ('nj' or
    'upgma'), optionally draws it (``args.draw``) and writes it to
    ``args.out + '.tree'`` in format ``args.outfmt``.

    Raises ValueError when ``args.method`` is neither 'nj' nor 'upgma'.
    """
    dist_mat = ParseMatrix(args.DISTMAT)
    if args.distout is True:
        print('Distance Matrix:')
        print(dist_mat)
    tree_constructor = DistanceTreeConstructor()
    if args.method == 'nj':
        tree = tree_constructor.nj(dist_mat)
    elif args.method == 'upgma':
        tree = tree_constructor.upgma(dist_mat)
    else:
        # Previously fell through and failed later on an unbound ``tree``
        raise ValueError(
            "Unknown method {!r}; expected 'nj' or 'upgma'.".format(args.method)
        )
    if args.draw is True:
        Phylo.draw(tree)
    # Write the tree file
    Phylo.write(tree, args.out + '.tree', args.outfmt)
def tree_from_scores(list_with_scores):
    """Build a Guide_tree from pairwise scores via UPGMA clustering.

    Parameters
    ----------
    list_with_scores : scores from the pairwise alignments of the graphs.
                       Example for three graphs a, b, c:
                       [["a", "b", 2], ["a", "c", 4], ["b", "c", 3]]

    Output
    ------
    Guide_tree object wrapping the networkx form of the UPGMA tree
    """
    score_matrix = Guide_tree_Generator.score_to_matrix(list_with_scores)
    upgma_tree = DistanceTreeConstructor().upgma(score_matrix)
    as_graph = Phylo.to_networkx(upgma_tree)
    return Guide_tree(as_graph)
def tree_from_random(list_of_scores):
    """Build a Guide_tree for MGA from a random score matrix.

    Parameters
    ----------
    list_of_scores : scores from the pairwise alignments of the graphs; only
                     used to obtain the graph names.
                     Example for three graphs a, b, c:
                     [["a", "b", 2], ["a", "c", 4], ["b", "c", 3]]

    Output
    ------
    Guide_tree object wrapping the networkx form of the UPGMA tree
    """
    graph_names = Guide_tree_Generator.make_graph_list(list_of_scores)
    random_matrix = Guide_tree_Generator.random_score_matrix(graph_names)
    upgma_tree = DistanceTreeConstructor().upgma(random_matrix)
    as_graph = Phylo.to_networkx(upgma_tree)
    return Guide_tree(as_graph)
def build_tree(dist_matrix, names_list, clust):
    """Cluster a distance matrix into a ``Tree`` with NJ or UPGMA.

    :param dist_matrix: pairwise distances
    :param names_list: taxon names matching the matrix rows
    :param clust: 'nj' (uses the ``nj`` function on a ``DistanceMatrix``) or
        'upgma' (uses Biopython's ``DistanceTreeConstructor``)
    :return: ``Tree`` built from the newick text of the clustering result

    Any other ``clust`` prints a message and exits via ``sys.exit()``.
    """
    if clust == 'nj':
        square = DistanceMatrix(dist_matrix, names_list)
        newick_text = nj(square, result_constructor=str)
        return Tree(newick_text)

    if clust == 'upgma':
        condensed = _DistanceMatrix(names=names_list, matrix=condense_matrix(dist_matrix))
        upgma_tree = DistanceTreeConstructor().upgma(condensed)
        # remove InnerNode names
        for inner in upgma_tree.get_nonterminals():
            inner.name = None
        buffer = StringIO()
        Phylo.write(upgma_tree, buffer, "newick")
        return Tree(buffer.getvalue())

    print("Unknown tree clustering method ! Aborting")
    sys.exit()
def D_F_matrix(D_Seq,D_net,final_tree):
    """Average two distance matrices and write the UPGMA tree of the result.

    :param D_Seq: sequence-based distance matrix (has ``names``, indexable by
        a pair of names)
    :param D_net: network-based distance matrix with the same interface
    :param final_tree: destination of the newick tree
    :return: ``_DistanceMatrix`` holding ``0.5 * D_net + 0.5 * D_Seq`` for the
        taxa present in both inputs, in the order of ``D_net.names``
    """
    seq_names = set(D_Seq.names)
    # Only taxa known to both matrices can be combined. Filtering up front
    # keeps names and rows aligned; the old per-cell check tested the wrong
    # key and could append short or empty rows.
    shared = [name for name in D_net.names if name in seq_names]
    D_F = [
        [0.5*D_net[row, col] + 0.5*D_Seq[row, col] for col in shared[:i + 1]]
        for i, row in enumerate(shared)
    ]
    print(D_F)
    D_F_final = _DistanceMatrix(shared,D_F)
    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
def D_F_matrix(D_Seq,D_net,final_tree, alpha):
    """Blend two distance matrices and write the UPGMA tree of the result.

    :param D_Seq: sequence-based distance matrix (has ``names``, indexable by
        a pair of names)
    :param D_net: network-based distance matrix with the same interface
    :param final_tree: destination of the newick tree
    :param alpha: weight of ``D_Seq``; ``D_net`` is weighted ``1 - alpha``
        (the original comments describe it as a value between 0 and 1)
    :return: ``_DistanceMatrix`` of the blended distances for the taxa present
        in both inputs, in the order of ``D_net.names``
    """
    seq_names = set(D_Seq.names)
    # Only taxa known to both matrices can be combined. Filtering up front
    # keeps names and rows aligned; the old per-cell check tested the wrong
    # key and could append short or empty rows.
    shared = [name for name in D_net.names if name in seq_names]
    D_F = [
        [((1-alpha) * D_net[row, col]) + (alpha * D_Seq[row, col])
         for col in shared[:i + 1]]
        for i, row in enumerate(shared)
    ]
    print(D_F)
    D_F_final = _DistanceMatrix(shared,D_F)
    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
def NNIheuristic(FASTAFile, sampleSize, threshold, outputDir):
    """Search NNI neighbours of a UPGMA tree for a feasible, low-parsimony tree.

    Reads a FASTA alignment, builds a UPGMA tree from identity distances,
    converts it with NewicktoRLR and then repeatedly samples NNI neighbours.
    Feasible neighbours are preferred; when none exist the search moves
    through infeasible trees until `threshold` consecutive iterations have
    produced no feasible neighbour. Progress is logged to a ".out" file in
    `outputDir`; nothing is returned.

    Arguments:
     - FASTAFile  - path of the FASTA alignment (".align" becomes ".out"
       for the log file name).
     - sampleSize - number of NNI neighbours sampled per iteration; clamped
       to len(NNIs)-1.
     - threshold  - consecutive iterations without a feasible neighbour
       before giving up.
     - outputDir  - directory the log file is written to.

    NOTE(review): this copy of the function lost its indentation; the
    nesting below is the most plausible reading and should be confirmed
    against the original source.
    NOTE(review): `output` is never closed on any exit path.
    """
    # Fixed seed so the neighbour sampling is repeatable
    random.seed(0)

    outputFile = FASTAFile.replace(".align", ".out")
    if "/" in outputFile:
        # keeps the leading "/" of the last path component
        outputFile = outputFile[outputFile.rfind("/"):]
    output = open(outputDir + "/" + outputFile, 'w')

    output.write("*****************RUN STARTS HERE!*****************")
    # start time
    startTime = time.clock()
    output.write("\n" + "Filename: " + FASTAFile + "\n")
    output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n")
    output.write("Sample Size: " + str(sampleSize) + "\nThreshold: " + str(threshold) + "\n\n")

    # Import fasta alignment file
    myAlignment = AlignIO.read(FASTAFile, "fasta")

    # Create a tip mapping from the fasta file (record id -> sequence string)
    tipMapping = {}
    for record in myAlignment:
        tipMapping[record.id] = str(record.seq)

    # Compute a distance matrix and construct tree
    calculator = DistanceCalculator("identity")
    myMatrix = calculator.get_distance(myAlignment)
    output.write("matrix constructed here")
    constructor = DistanceTreeConstructor()
    upgmaTree = constructor.upgma(myMatrix)
    output.write("constructed upgma tree")

    # Convert phyloxml tree to newick
    # biopython does not provide a function to do this so it was necessary
    # to write to a buffer in newick to convert then get rid of unneeded info
    # Tip names are wrapped in double quotes so that, once the inner-node
    # labels and the trailing ";" are stripped, the Newick text can be parsed
    # by literal_eval as nested tuples of strings.
    for clade in upgmaTree.get_terminals():
        clade.name = "\"" + clade.name + "\""
    buf = cStringIO.StringIO()
    Phylo.write(upgmaTree, buf, 'newick', plain = True)
    tree = buf.getvalue()
    # NOTE(review): this also strips "Inner<digits>" from any tip name that
    # happens to contain it.
    tree = re.sub(r'Inner\d*', '', tree)
    tree = tree.replace(";", "")
    tree = literal_eval(tree) #newick format
    output.write("created the original tree into newick format")

    # RLR tree required for maxParsimony function
    tree = NewicktoRLR(tree)
    score = maxParsimony(tree, tipMapping)

    graph = nx.Graph()
    makeGraph(graph, tree)
    output.write("made a graph")
    leaves = getLeaves(tree)
    currentFeasible = isFeasible(graph,leaves)
    output.write("tested isFeasible")

    # Perform NNI heuristic
    # counter tracks consecutive iterations without a feasible neighbour
    counter = 0
    loopCounter = 0
    while True:
        output.write("in the while loop")
        loopCounter += 1
        output.write("Loop Iteration: " + str(loopCounter) + "\n")
        output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n")
        output.write("Current Tree\nFeasibility: " + str(currentFeasible) + "\nScore: " + str(score) + "\nTree:\n" + str(tree) + "\n\n")

        NNIs = allNNIs(tree)
        # The clamp persists across iterations because sampleSize is rebound
        if len(NNIs)-1 < sampleSize:
            sampleSize = len(NNIs)-1
        toScore = random.sample(NNIs, sampleSize)

        # add feasibility test
        output.write("starting feasibility test")
        feasible = []
        infeasible = []
        # NOTE(review): the loop variable rebinds `tree`, so after the loop
        # `tree` is the last sampled neighbour, not the current best tree.
        for tree in toScore:
            graph = nx.Graph()
            makeGraph(graph, tree)
            leaves = getLeaves(tree)
            if isFeasible(graph, leaves): #if this tree is possible
                feasible.append(tree)
            else:
                infeasible.append(tree) #if this tree is not possible
        output.write("Number of Feasible Neighbor Trees: " + str(len(feasible)) + "\n")
        output.write("Number of Infeasible Neighbor Trees: " + str(len(infeasible)) + "\n")

        # NOTE(review): this block re-tests the last sampled neighbour and
        # appends it a second time; it looks like a leftover from an earlier
        # version of the loop above -- confirm whether it is intended.
        if len(feasible) != 0: #if feasible trees were found
            if isFeasible(graph, leaves): #if this NNI is possible
                feasible.append(tree)
            else:
                infeasible.append(tree) #if this NNI is not possible

        if len(feasible) != 0: #if feasible NNIs were found
            scoredList = map(lambda x: (maxParsimony(x, tipMapping), x), feasible)
            sortedList = sorted(scoredList)
            counter = 0
            # Accept the best feasible neighbour if the current tree is
            # infeasible or the neighbour has a strictly lower score
            if not currentFeasible or sortedList[0][0] < score:
                score = sortedList[0][0]
                tree = sortedList[0][1]
                currentFeasible = True
                output.write("Found a New Feasible Tree!\n\n")
            else:
                output.write("Best Possible Feasible Tree Found\n" + str(tree) + "\n" + "Score: " + str(score) + "\n\n")
                break
        else: #if no possible trees were found
            if currentFeasible: #checks if the original tree was feasible
                output.write("No Feasible Neighbors, Best Possible Feasible Tree\n" + str(tree) + "\n\n")
                break
            counter += 1
            output.write("Threshold counter: " + str(counter) + "\n\n")
            if counter >= threshold:
                output.write("Threshold Met: No Feasible Tree Found\n")
                stopTime = (time.clock() - startTime)
                output.write("Program Stop: " + str(stopTime) + " seconds\n\n")
                return
            output.write("Searching Infeasible Space\n")
            scoredList = map(lambda x: (maxParsimony(x, tipMapping), x), infeasible)
            sortedList = sorted(scoredList)
            choseNeighbor = False
            # if the original tree was infeasible and no feasible neighbors
            # were found, take the next best infeasible tree and run again:
            # the lowest-scoring neighbour whose score exceeds the current one
            for neighbor in sortedList:
                if neighbor[0] > score:
                    score = neighbor[0]
                    tree = neighbor[1]
                    choseNeighbor = True
                    break
            # Otherwise fall back to the highest-scoring infeasible neighbour
            if not choseNeighbor:
                score = sortedList[-1][0]
                tree = sortedList[-1][1]
            currentFeasible = False
            output.write("Next Best Infeasible Tree\n\n")

    endTime = (time.clock() - startTime)
    output.write("Program End: " + str(endTime) + " seconds\n\n")
    #outputTree = RLRtoNewick(tree)
    #print "Final score", score
    return
## pad sequences so that they all have the same length #for record in records: # if len(record.seq) != maxlen: # sequence = str(record.seq).ljust(maxlen, '.') # record.seq = Seq.Seq(sequence) #assert all(len(record.seq) == maxlen for record in records) ## write to temporary file and do alignment #output_file = '{}_padded.fasta'.format(os.path.splitext(input_file)[0]) #with open(output_file, 'w') as f: # SeqIO.write(records, f, 'fasta') #alignment = AlignIO.read(output_file, "fasta") #cline = ClustalwCommandline("clustalw2", infile=input_file) #print(cline) #print type(cline) muscle_cline = MuscleCommandline(input=input_file) stdout, stderr = muscle_cline() alignment = AlignIO.read(StringIO(stdout), "fasta") print(alignment) #alignment = AlignIO.read('../data/ls_orchid.fasta', 'fasta') #print alignment calculator = DistanceCalculator('ident') dm = calculator.get_distance(alignment) constructor = DistanceTreeConstructor() tree = constructor.upgma(dm) Phylo.write(tree, 'phyloxml.xml', 'phyloxml')
def noFeasibleTest(FASTAFile, sampleSize, outputDir):
    """Hill-climb over NNI neighbours for the most parsimonious tree.

    Reads a FASTA alignment, builds a UPGMA tree from identity distances,
    converts it to RLR form and then repeatedly samples NNI neighbours,
    moving to the best-scoring neighbour while it strictly improves the
    parsimony score. No feasibility check is performed. Progress and the
    final tree are logged to a ".out" file in `outputDir`.

    Arguments:
     - FASTAFile  - path of the FASTA alignment (".align" becomes ".out"
       for the log file name).
     - sampleSize - number of NNI neighbours sampled per iteration; clamped
       to len(NNIs)-1.
     - outputDir  - directory the log file is written to.

    Returns None.
    """
    # Fixed seed so the neighbour sampling is repeatable
    random.seed(0)

    outputFile = FASTAFile.replace(".align", ".out")
    if "/" in outputFile:
        # keeps the leading "/" of the last path component
        outputFile = outputFile[outputFile.rfind("/"):]

    # The context manager closes (and flushes) the log on every exit path,
    # including exceptions raised while reading or scoring.
    with open(outputDir + "/" + outputFile, 'w') as output:
        output.write("*****************RUN STARTS HERE!*****************")
        # start time
        # NOTE(review): time.clock() was removed in Python 3.8; kept because
        # the function targets Python 2 (it also uses cStringIO).
        startTime = time.clock()
        output.write("\n" + "Filename: " + FASTAFile + "\n")
        output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n")
        output.write("Sample Size: " + str(sampleSize) + "\n\n")

        # Import fasta alignment file
        myAlignment = AlignIO.read(FASTAFile, "fasta")

        # Create a tip mapping from the fasta file (record id -> sequence)
        tipMapping = dict((record.id, str(record.seq)) for record in myAlignment)

        # Compute a distance matrix and construct tree
        calculator = DistanceCalculator("identity")
        myMatrix = calculator.get_distance(myAlignment)
        constructor = DistanceTreeConstructor()
        upgmaTree = constructor.upgma(myMatrix)

        # Convert the Biopython tree to nested tuples: quote the tip names,
        # write plain Newick to a buffer, strip inner-node labels and the
        # trailing ";" and let literal_eval parse what is left.
        for clade in upgmaTree.get_terminals():
            clade.name = "\"" + clade.name + "\""
        buf = cStringIO.StringIO()
        Phylo.write(upgmaTree, buf, 'newick', plain = True)
        tree = buf.getvalue()
        # NOTE(review): this also strips "Inner<digits>" from any tip name
        # that happens to contain it.
        tree = re.sub(r'Inner\d*', '', tree)
        tree = tree.replace(";", "")
        tree = literal_eval(tree) #newick format

        # RLR tree required for maxParsimony function
        tree = NNI.NewicktoRLR(tree)
        score = NNI.maxParsimony(tree, tipMapping)

        # Perform NNI heuristic
        loopCounter = 0
        while True:
            loopCounter += 1
            output.write("Loop Iteration: " + str(loopCounter) + "\n")
            output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n")
            output.write("Current Tree\nScore: " + str(score) + "\nTree:\n" + str(tree) + "\n\n")

            NNIs = NNI.allNNIs(tree)
            # The clamp persists across iterations, as before.
            sampleSize = min(sampleSize, len(NNIs) - 1)
            if sampleSize < 1:
                # Nothing to sample: stop cleanly instead of failing on an
                # empty (or negative-sized) sample.
                break
            toScore = random.sample(NNIs, sampleSize)

            # Lowest (score, tree) pair among the sampled neighbours
            bestScore, bestTree = min(
                (NNI.maxParsimony(x, tipMapping), x) for x in toScore)
            if bestScore < score:
                score = bestScore
                tree = bestTree
                output.write("Found A More Parsimonious Tree!\n\n")
            else:
                break

        output.write("No Neighbors With Better Scores Found\n\n")
        output.write("Final Tree:\n" + str(tree) + "\nScore: " + str(score) + "\n\n")
        endTime = (time.clock() - startTime)
        output.write("Program End: " + str(endTime) + " seconds\n\n")
    return
import copy

# Creates the distance matrices from the two alignments
calculator = DistanceCalculator('ident')
dm_ape = calculator.get_distance(alignApe)
dm_hiv = calculator.get_distance(alignHIV)

# Jukes Cantor corrections: d = -3/4 * ln(1 - 4/3 * p)
# The corrected matrices are deep copies, so dm_ape / dm_hiv keep the raw
# distances; a plain assignment would only alias the same object and the
# "uncorrected" trees below would silently use corrected values too.
# Float literals keep the constants at 0.75 and 1.33... under Python 2 as well.
# NOTE(review): entries >= 0.75 fall outside the domain of the logarithm and
# come out as inf/nan.
dm_ape_corrected = copy.deepcopy(dm_ape)
for d in dm_ape_corrected.matrix:
    d[:] = [-3.0 / 4.0 * np.log(1 - 4.0 / 3.0 * x) for x in d]
dm_hiv_corrected = copy.deepcopy(dm_hiv)
for d in dm_hiv_corrected.matrix:
    d[:] = [-3.0 / 4.0 * np.log(1 - 4.0 / 3.0 * x) for x in d]

# Constructs the trees using the upgma algorithm
constructor = DistanceTreeConstructor()
tree_ape = constructor.upgma(dm_ape)
tree_ape_corrected = constructor.upgma(dm_ape_corrected)
tree_hiv = constructor.upgma(dm_hiv)
tree_hiv_corrected = constructor.upgma(dm_hiv_corrected)

# Outputs the trees as xml
Phylo.write(tree_ape, 'treeApe.xml', 'phyloxml')
Phylo.write(tree_ape_corrected, 'treeApe_corrected.xml', 'phyloxml')
Phylo.write(tree_hiv, 'treeHIV.xml', 'phyloxml')
Phylo.write(tree_hiv_corrected, 'treeHIV_corrected.xml', 'phyloxml')
def compute_tree(options, mat, names):
    """ cluster the samples with upgma, then save the tree as newick, as a
    png drawing and as a graphviz dot file
    """
    def dodge(dist):
        # the tree constructor writes 0-distances as 1s for some reason, so
        # exact 0s and 1s are nudged to values that survive the round trip
        if dist == 0.:
            return 1e-10
        if dist == 1.:
            return 1.1
        return dist

    # biopython wants the lower triangle (diagonal included) as a list of rows
    lower = [[dodge(float(mat[names[r]][names[c]])) for c in xrange(r + 1)]
             for r in xrange(len(names))]
    dm = _DistanceMatrix(names, lower)

    # upgma tree, saved as newick
    upgma_tree = DistanceTreeConstructor().upgma(dm)
    robust_makedirs(os.path.dirname(tree_path(options)))
    Phylo.write(upgma_tree, tree_path(options), "newick")

    # png tree -- note : doesn't work in toil
    # inner nodes are drawn without a label
    Phylo.draw_graphviz(upgma_tree,
                        label_func=lambda x: "" if "Inner" in str(x) else x,
                        node_size=1000, node_shape="s", font_size=10)
    pylab.savefig(tree_path(options).replace("newick", "png"))

    # graphviz: undirected networkx graph with the clade names pushed into
    # a "label" attribute
    nxgraph = nx.Graph(Phylo.to_networkx(upgma_tree))
    nxgraph = nx.convert_node_labels_to_integers(nxgraph,
                                                 label_attribute="label")

    for node_id in nxgraph.nodes():
        attrs = nxgraph.node[node_id]
        if "Inner" not in str(attrs["label"]):
            attrs["fontsize"] = 18
        else:
            # shrink inner nodes to (almost) nothing and blank their label
            attrs["label"] = "\"\""
            attrs["width"] = 0.001
            attrs["height"] = 0.001

    for src, dst in nxgraph.edges():
        attrs = nxgraph.edge[src][dst]
        length = float(attrs["weight"])
        # undo hack from above: anything at or above 1, or at or below the
        # 1e-10 stand-in, is reported as zero
        if length >= 1 or length <= 1e-10:
            length = 0.
        # in graphviz, weight means something else, so show it as a label
        attrs["weight"] = None
        attrs["label"] = "{0:.3g}".format(length * 100.)
        attrs["fontsize"] = 14
        attrs["len"] = draw_len(length)

    nx.write_dot(nxgraph, tree_path(options).replace("newick", "dot"))