def writeNewick(species, distance, output):
    '''
    Build a neighbour-joining tree and write its topology as a Newick string.

    The tree is serialised by Bio.Phylo, then post-processed: labels of
    internal nodes ("Inner<N>") and all branch lengths are stripped, and
    underscores are replaced by spaces.

    species  -- list of species names, in the same order as the matrix rows
    distance -- triangular distance matrix in the layout accepted by
                Bio.Phylo.TreeConstruction._DistanceMatrix
    output   -- path of the file to write; when falsy the tree is written
                to standard out instead
    '''
    import re
    import sys
    try:  # Python 2
        from StringIO import StringIO
    except ImportError:  # Python 3
        from io import StringIO
    from Bio import Phylo
    import Bio.Phylo.TreeConstruction as TreeConstruction

    distanceMatrix = TreeConstruction._DistanceMatrix(species, distance)
    treeConstructor = TreeConstruction.DistanceTreeConstructor(method='nj')
    njTree = treeConstructor.nj(distanceMatrix)

    # Serialise into memory: no temporary file to name, race on, or leak.
    buf = StringIO()
    Phylo.write(njTree, buf, "newick")
    tree = buf.getvalue()

    # Drop internal node labels together with their branch lengths ...
    tree = re.sub(r"Inner[0-9]+:[-0-9\.]+", "", tree)
    # ... then the remaining (leaf) branch lengths. Neighbour joining can
    # yield negative lengths, so an optional sign must be consumed as well.
    tree = re.sub(r":-?[0-9\.]+", "", tree)
    tree = re.sub("_", " ", tree)

    # Decide on the destination from the parameter, not from global state.
    if output:
        with open(output, 'w') as outputFile:
            outputFile.write(tree)
    else:
        sys.stdout.write(tree)
def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Label each alignment with the tree-building method closest to the
    reference tree.

    :param files: sized collection of paths to FASTA alignments
    :param etalon_tree: file name of the reference (etalon) tree, resolved
        relative to ``tree_path``
    :param tree_path: directory in which the built trees are stored
    :param rebuild: when True, (re)build the UPGMA, NJ and RAxML trees for
        every alignment before comparing
    :return: array with one index per alignment, selecting the method with
        the smallest symmetric difference to the reference tree
        (0 = NJ, 1 = UPGMA, 2 = RAxML)
    """
    # RAxML is given this directory as its working dir and needs it absolute.
    tree_path = osp.abspath(tree_path)

    if rebuild:
        dist_calc = TreeConstruction.DistanceCalculator('blosum62')
        builder = TreeConstruction.DistanceTreeConstructor()
        for aln_path in files:
            alignment = AlignIO.read(aln_path, 'fasta')
            stem = aln_path.split("/")[-1].split(".")[0]

            # Distance-based trees.
            upgma_tree = builder.upgma(dist_calc.get_distance(alignment))
            upgma_file = osp.join(tree_path, 'upgma_{}.tre'.format(stem))
            Phylo.write(upgma_tree, upgma_file, 'newick')

            nj_tree = builder.nj(dist_calc.get_distance(alignment))
            nj_file = osp.join(tree_path, 'nj_{}.tre'.format(stem))
            Phylo.write(nj_tree, nj_file, 'newick')

            # Maximum-likelihood tree through the RAxML command line wrapper.
            run_raxml = RaxmlCommandline(
                sequences=osp.abspath(aln_path),
                model='PROTCATWAG',
                name='{}.tre'.format(stem),
                threads=3,
                working_dir=tree_path,
            )
            _, stderr = run_raxml()
            print(stderr)
            print('{} finished'.format(stem))

    # Compare every built tree with the reference tree.
    tns = dendropy.TaxonNamespace()
    act_tree = dendropy.Tree.get_from_path(
        osp.join(tree_path, etalon_tree), "newick", taxon_namespace=tns)
    act_tree.encode_bipartitions()

    # Column order defines the label values returned to the caller.
    templates = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), 3))
    for row, aln_path in enumerate(files):
        stem = aln_path.split("/")[-1].split(".")[0]
        # Load all three candidates first, then score them.
        candidates = [
            dendropy.Tree.get_from_path(
                osp.join(tree_path, template.format(stem)),
                "newick",
                taxon_namespace=tns)
            for template in templates
        ]
        for col, candidate in enumerate(candidates):
            distances[row, col] = \
                dendropy.calculate.treecompare.symmetric_difference(
                    candidate, act_tree)

    return distances.argmin(1)
def _make_nj_tree(self, treesams, dm):
    """
    **PRIVATE**

    Build a neighbour-joining tree for the given samples and return it as a
    newick string.

    Parameters
    ----------
    treesams: dict
        {sam name: samid, sam name: samid, ...}
    dm: str or None
        path of a file to write the distance matrix to (one comma separated
        line per sample: the sample name followed by the distances to all
        preceding samples); an existing file is replaced.
        None to skip writing the matrix.

    Returns
    -------
    nwkstring: str
        tree as newick string
    """
    # Materialise the names once: a fixed order is needed for the matrix
    # rows, and the distance matrix class requires a real list.
    aSampleNames = list(treesams)
    iNofSams = len(aSampleNames)
    logging.info("Calculating %i distances. Patience!",
                 ((iNofSams**2) - iNofSams) // 2)
    dist_mat = get_distance_matrix(self.cur, list(treesams.values()))

    # Lower triangular matrix: row i holds the distances to samples 0..i-1
    # followed by the zero on the diagonal.
    aSimpleMatrix = []
    for i, sample_1 in enumerate(aSampleNames):
        sid1 = treesams[sample_1]
        mat_line = [dist_mat[sid1][treesams[sample_2]]
                    for sample_2 in aSampleNames[:i]]
        mat_line.append(0)
        aSimpleMatrix.append(mat_line)

    if dm is not None:
        # Open once in write mode; this also replaces any previous file.
        with open(dm, 'w') as f:
            for sample, mat_line in zip(aSampleNames, aSimpleMatrix):
                # The diagonal zero is not written to the file.
                f.write("%s\n" % ','.join([sample] + [str(x) for x in mat_line[:-1]]))
        logging.info("Distance matrix written to file: %s", dm)

    logging.info("Bulding tree.")
    oDistMat = TreeConstruction._DistanceMatrix(aSampleNames, aSimpleMatrix)
    constructor = TreeConstruction.DistanceTreeConstructor()
    oTree = constructor.nj(oDistMat)

    # The tree is serialised through a scratch file; make sure the scratch
    # directory is removed even when writing or reading fails.
    td = tempfile.mkdtemp()
    try:
        tmpfile = os.path.join(td, 'tree.nwk')
        Phylo.write(oTree, tmpfile, 'newick')
        with open(tmpfile, 'r') as f:
            nwkstring = f.read()
    finally:
        shutil.rmtree(td)

    return nwkstring
def neighbor_joining_tree(aln, prot_model='blosum62'):
    """
    Build a neighbor-joining tree from an alignment.

    Distances are computed with a DistanceCalculator for ``prot_model`` and
    handed to a DistanceTreeConstructor configured for the 'nj' method.
    Returns the tree produced by ``build_tree``.
    """
    builder = TreeConstruction.DistanceTreeConstructor(
        TreeConstruction.DistanceCalculator(prot_model), 'nj')
    return builder.build_tree(aln)
def NJ(self, f=min):
    """builds a tree via neighbour joining and draws it as ASCII art; the
    passed in function reconciles asymmetric 'distances' m[i][j] / m[j][i]"""
    m = self.distanceMatrix()
    size = len(self.languages)

    # Symmetrise the matrix in place, visiting the upper triangle
    # (diagonal included) and mirroring every value.
    for row in range(size):
        for col in range(row, size):
            m[row][col] = f(m[row][col], m[col][row])
            m[col][row] = m[row][col]

    # The distance matrix class takes the lower triangle incl. diagonal.
    lower = []
    for row in range(size):
        lower.append([m[row][col] for col in range(row + 1)])

    names = [lang.name for lang in self.languages]
    dm = TreeConstruction._DistanceMatrix(names, lower)
    builder = TreeConstruction.DistanceTreeConstructor(method='nj')
    Phylo.draw_ascii(builder.nj(dm))
def __init__(self, fastas, labels, max_seqs=45):
    """
    Creates dataset object from given alignments.

    You can create an empty dataset (no fastas) to load your data later.
    It is more memory-friendly approach.

    NOTE(review): alignments with more than ``max_seqs`` sequences are NOT
    filtered out here; they do not fit the preallocated arrays, so filter
    them (and their labels) before constructing the dataset.

    :param fastas: alignment files
    :param labels: labels of each fasta
    :param max_seqs: maximum number of sequences in one file
    """
    self.is_sparse = False
    self.edge_indices = None
    self.edge_weights = None
    self.f = None  # holder for hdf5 file

    n_files = len(fastas)
    # One accumulated 3-component encoding per sequence.
    self.data = np.zeros((n_files, max_seqs, 3))
    # np.long is missing from NumPy 1.24-1.26; use an explicit 64-bit type.
    self.labels = np.array(labels, dtype=np.int64)
    # Pairwise distance matrix per alignment, zero-padded up to max_seqs.
    self.dist = np.zeros((n_files, max_seqs, max_seqs))

    calc = TreeConstruction.DistanceCalculator(
        'blosum62')  # calculator for distance matrices
    for file_index, file in enumerate(fastas):
        aln = AlignIO.read(file, "fasta")
        for seq_index, seq in enumerate(aln):
            # Sum the per-residue encodings over the whole sequence.
            for col in seq:
                self.data[file_index, seq_index] += aesnn3[res_to_index[col]]
        n_seqs = len(aln)
        self.dist[file_index, :n_seqs, :n_seqs] = np.array(
            calc.get_distance(aln))
def _read_matrix(filename): with open(filename) as f: header = f.readline().split() buf = list(map(lambda l: list(map(float, l[1:].split())), f.readlines())) if len(buf) != len(header): raise ValueError('Distance matrix must be square') buf = list(zip(*buf)) for i, l in enumerate(buf): if len(l) != len(header): raise ValueError('Distance matrix must be square') buf[i] = list(l[:i + 1:]) if not buf: return None res = TreeConstruction.DistanceMatrix(names=header, matrix=buf) return res
def evaluate_directory(data_dir, eval_limit=50000, lim=5, limited_expand=False):
    '''
    Processes a directory of FASTA files, generating and evaluating trees
    using branch and bound with early stopping and limited expansion
    (optional)

    Args:
        data_dir: str, the path to the data directory which stores fasta
            files
        eval_limit: int, max number of trees to evaluate for each file. Good
            setting depends on seq length and time you're willing to wait.
        lim: int, number of files in the data directory to process (the
            first `lim` entries in the order returned by os.listdir)
        limited_expand: bool, whether or not to use limited expansion

    Returns:
        List[Tree], the first element of every result returned by
        get_best_trees, collected over all processed files
    '''
    scorer = TreeConstruction.ParsimonyScorer()
    selected = os.listdir(data_dir)[:lim]

    all_best = []
    for i, fname in enumerate(selected):
        # Load and sort file
        print(f"Processing {fname} ({i+1}/{len(selected)})")
        # Close the handle as soon as the alignment has been parsed.
        with open(os.path.join(data_dir, fname)) as handle:
            aln = AlignIO.read(handle, 'fasta')
        aln.sort(key=lambda a: a.id)

        result_trees = get_best_trees(aln, scorer, eval_limit, limited_expand)
        print(f"Found {len(result_trees)} trees.")
        all_best.extend(result_trees)

    return [tr[0] for tr in all_best]
def make_nj_tree(dist_mat, dArgs, aSampleNames):
    '''
    Builds a neighbour joining tree with Biopython.Phylo from a distance
    matrix and writes it to a newick file

    Parameters
    ----------
    dist_mat: dict
        distance matrix as a dict of dicts
        distance_a_to_b = dist_mat[a][b]
    dArgs: dict
        input argument dictionary; dArgs['tree'] is the output file
    aSampleNames: list
        list of sample names

    Returns
    -------
    returns 0
    also writes tree file to dArgs['tree'] in newick format
    '''
    # Lower triangular matrix: row i lists the distances from sample i to
    # all samples before it, closed by the zero on the diagonal.
    lower_rows = []
    for idx, row_sample in enumerate(aSampleNames):
        row = [dist_mat[row_sample][col_sample]
               for col_sample in aSampleNames[:idx]]
        row.append(0)
        lower_rows.append(row)

    nj_builder = TreeConstruction.DistanceTreeConstructor()
    nj_tree = nj_builder.nj(
        TreeConstruction._DistanceMatrix(aSampleNames, lower_rows))

    Phylo.write(nj_tree, dArgs['tree'], 'newick')
    logging.info("Tree file written.")
    return 0
def UPGMA(self, f=min):
    """builds a tree via UPGMA and draws it as ASCII art; the passed in
    function reconciles asymmetric 'distances' m[i][j] / m[j][i].

    Returns the symmetrised distance matrix."""
    m = self.distanceMatrix()
    size = len(self.languages)

    # Symmetrise the matrix in place, visiting the upper triangle
    # (diagonal included) and mirroring every value.
    for row in range(size):
        for col in range(row, size):
            m[row][col] = f(m[row][col], m[col][row])
            m[col][row] = m[row][col]

    # The distance matrix class takes the lower triangle incl. diagonal.
    lower = []
    for row in range(size):
        lower.append([m[row][col] for col in range(row + 1)])

    names = [lang.name for lang in self.languages]
    dm = TreeConstruction._DistanceMatrix(names, lower)
    builder = TreeConstruction.DistanceTreeConstructor(method='upgma')
    Phylo.draw_ascii(builder.upgma(dm))

    return m
scores[pdbs[i]] = {} PhyloM.append([]) scoresM.append([]) scores[pdbs[i]][pdbs[j]] = score scoresM[i].append(score) else: print '{:->3.2%}\t{} {}\t{}'.format( s/total, pdbs[i], pdbs[j], "error") if pdbs[i] not in scores: scores[pdbs[i]] = {} PhyloM.append([]) scoresM.append([]) scores[pdbs[i]][pdbs[j]] = 0 scoresM[i].append(0) print_matrix("Scores", scores, pdbs) scoresM = TreeConstruction._Matrix([x[x.rfind('/')+1:] for x in pdbs], scoresM) distances = {} for i in range(leng): distances[pdbs[i]] = {} for j in range(i+1): distances[pdbs[i]][pdbs[j]] = (scores[pdbs[i]][pdbs[i]]+scores[pdbs[j]][pdbs[j]])/2.0 - scores[pdbs[i]][pdbs[j]] PhyloM[i].append(distances[pdbs[i]][pdbs[j]]) PhyloM = TreeConstruction._DistanceMatrix([x[x.rfind('/')+1:] for x in pdbs], PhyloM) print_matrix("Distances", distances, pdbs) tree = TreeConstruction.DistanceTreeConstructor().upgma(PhyloM) Phylo.draw_ascii(tree) tree.ladderize() #Phylo.draw_graphviz(tree, node_size=0) def hide_inner(node):
PhyloM.append([]) scoresM.append([]) scores[pdbs[i]][pdbs[j]] = score scoresM[i].append(score) else: print '{:->3.2%}\t{} {}\t{}'.format(s / total, pdbs[i], pdbs[j], "error") if pdbs[i] not in scores: scores[pdbs[i]] = {} PhyloM.append([]) scoresM.append([]) scores[pdbs[i]][pdbs[j]] = 0 scoresM[i].append(0) print_matrix("Scores", scores, pdbs) scoresM = TreeConstruction._Matrix([x[x.rfind('/') + 1:] for x in pdbs], scoresM) distances = {} for i in range(leng): distances[pdbs[i]] = {} for j in range(i + 1): distances[pdbs[i]][pdbs[j]] = (scores[pdbs[i]][pdbs[i]] + scores[ pdbs[j]][pdbs[j]]) / 2.0 - scores[pdbs[i]][pdbs[j]] PhyloM[i].append(distances[pdbs[i]][pdbs[j]]) PhyloM = TreeConstruction._DistanceMatrix([x[x.rfind('/') + 1:] for x in pdbs], PhyloM) print_matrix("Distances", distances, pdbs) tree = TreeConstruction.DistanceTreeConstructor().upgma(PhyloM) Phylo.draw_ascii(tree) tree.ladderize()
def find_good_tree(trees, data_dir, lim=25):
    '''
    Evaluates a list of trees against multiple alignments, returning the
    best scoring tree across all alignments. Requires tree terminal names
    and alignment names in data_dir are the same.

    Every score is divided by the highest score seen for its alignment and
    the tree with the lowest sum of these normalized scores is chosen.
    Alignments whose highest score is not positive are left out of the sum.

    Args:
        trees: List[Tree], a list of BioPython Phylo trees to evaluate
        data_dir: str, the path to the data directory
        lim: int, number of alignments to evaluate against (more is slower)

    Returns:
        Tree, the best scoring tree, or None when scoring fails or no tree
        received a score
    '''
    scorer = TreeConstruction.ParsimonyScorer()

    alns = []
    for fname in os.listdir(data_dir)[:lim]:
        # Close the handle as soon as the alignment has been parsed.
        with open(os.path.join(data_dir, fname)) as handle:
            aln = AlignIO.read(handle, 'fasta')
        # Record names are cut to five characters so that they line up with
        # the terminal names of the trees.
        for rec in aln:
            rec.name = rec.id = rec.name[:5]
        alns.append(aln)

    scores = {}
    max_aln = {}
    # Score trees and track highest score for each alignment to normalize later
    print(f"Processing {len(trees)} trees...")
    for i, tree in enumerate(trees):
        print(f"\t{i+1}/{len(trees)}")
        for j, aln in enumerate(alns):
            try:
                # Score a copy so that the caller's tree is left untouched.
                sco = scorer.get_score(copy.deepcopy(tree), aln)
            except Exception as e:
                print(e)
                print(
                    "Scoring failed. Did you ensure that terminal names and alignment names match?"
                )
                return None
            scores[(i, j)] = sco
            if sco > max_aln.get(j, 0):
                max_aln[j] = sco

    # Computes normalized scores for each tree
    fin_scores = {}
    for i in range(len(trees)):
        for j in range(len(alns)):
            m_aln = max_aln.get(j, 0)
            if m_aln <= 0:
                continue
            normalized_score = scores.get((i, j), 0) / m_aln
            if not normalized_score:
                print(f"Error for tree {i} and alignment {j}.")
            fin_scores[i] = fin_scores.get(i, 0) + normalized_score

    # Finds final best tree (lowest total; the earliest tree wins a tie)
    if not fin_scores:
        print("No best tree found.")
        return None
    return trees[min(fin_scores, key=fin_scores.get)]