def __init__(self, fastas, labels, max_seqs=45):
    """
    Creates dataset object from given alignments.

    Alignments holding more than ``max_seqs`` sequences are dropped together
    with their labels, so the stored arrays may have fewer rows than
    ``fastas`` has entries.
    You can create an empty dataset (empty ``fastas`` and ``labels``) to load
    your data later. It is a more memory-friendly approach.

    :param fastas: alignment files (read with ``AlignIO.read(..., "fasta")``)
    :param labels: labels of each fasta
    :param max_seqs: maximum number of sequences in one file
    """
    self.is_sparse = False
    self.edge_indices = None
    self.edge_weights = None
    self.f = None  # holder for hdf5 file

    n_files = len(fastas)
    # One 3-component feature vector per sequence; rows of alignments with
    # fewer than max_seqs sequences stay zero-padded.
    data = np.zeros((n_files, max_seqs, 3))
    dist = np.zeros((n_files, max_seqs, max_seqs))
    # np.long is not available in NumPy 1.24-1.26; a fixed-width integer type
    # works on every NumPy version and platform.
    all_labels = np.array(labels, dtype=np.int64)

    # calculator for distance matrices
    calc = TreeConstruction.DistanceCalculator('blosum62')

    kept = []  # positions in `fastas` of the alignments that were stored
    for file_index, file in enumerate(fastas):
        aln = AlignIO.read(file, "fasta")
        n_seqs = len(aln)
        if n_seqs > max_seqs:
            # Does not fit the preallocated arrays: drop it, as documented.
            continue

        row = len(kept)  # kept alignments are packed without gaps
        for seq_index, seq in enumerate(aln):
            # Sequence features are the sum of the per-residue vectors.
            for col in seq:
                data[row, seq_index] += aesnn3[res_to_index[col]]
        dist[row, :n_seqs, :n_seqs] = np.array(calc.get_distance(aln))
        kept.append(file_index)

    if len(kept) == n_files:
        # Nothing was dropped: keep the arrays exactly as allocated.
        self.data = data
        self.labels = all_labels
        self.dist = dist
    else:
        # Trim the unused trailing rows and keep labels aligned with data.
        self.data = data[:len(kept)]
        self.labels = all_labels[np.array(kept, dtype=np.intp)]
        self.dist = dist[:len(kept)]
def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Constructs labels for given files. (Best phylogeny reconstruction method)

    For every alignment the NJ, UPGMA and RAxML trees stored in ``tree_path``
    are compared with the etalon tree by symmetric difference. The label is
    the index of the closest tree: 0 - NJ, 1 - UPGMA, 2 - RAxML (on a tie the
    lowest index wins, as ``argmin`` does).

    :param files: an iterable with file paths to alignments
    :param etalon_tree: the path to etalon tree (joined onto ``tree_path``)
    :param tree_path: a directory, where built trees will be stored
    :param rebuild: set it True, if you need to rebuild trees or build them
        from scratch
    :return: numpy array with one integer label per file
    """
    # `files` is measured with len() and walked up to twice, so materialise
    # it once; this also makes generators usable.
    files = list(files)
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths
    # Tree files are keyed by the alignment's base name up to its first dot.
    names = [osp.basename(file).split(".")[0] for file in files]

    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()
        # construct all trees with UPGMA, NJ and raxml
        for file, name in zip(files, names):
            aln = AlignIO.read(file, 'fasta')
            # Computed once and shared: Biopython's upgma() and nj() work on
            # their own copy of the matrix, so reusing it is safe.
            dist_matrix = calculator.get_distance(aln)

            tree = dist_constructor.upgma(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'upgma_{}.tre'.format(name)),
                        'newick')

            tree = dist_constructor.nj(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'nj_{}.tre'.format(name)),
                        'newick')

            raxml = RaxmlCommandline(sequences=osp.abspath(file),
                                     model='PROTCATWAG',
                                     name='{}.tre'.format(name),
                                     threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))

    # get best tree
    tns = dendropy.TaxonNamespace()  # shared so that taxa match across trees
    act_tree = dendropy.Tree.get_from_path(osp.join(tree_path, etalon_tree),
                                           "newick", taxon_namespace=tns)
    act_tree.encode_bipartitions()

    # Order of the templates defines the meaning of the returned labels.
    tree_templates = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), len(tree_templates)))
    for i, name in enumerate(names):
        # Load all candidate trees first, then compare them with the etalon.
        candidates = [
            dendropy.Tree.get_from_path(
                osp.join(tree_path, template.format(name)),
                "newick", taxon_namespace=tns)
            for template in tree_templates
        ]
        for j, candidate in enumerate(candidates):
            distances[i, j] = \
                dendropy.calculate.treecompare.symmetric_difference(
                    candidate, act_tree)
    return distances.argmin(1)
def neighbor_joining_tree(aln, prot_model='blosum62'):
    """
    Build a neighbor-joining tree for an alignment.

    :param aln: alignment handed to ``DistanceTreeConstructor.build_tree``
    :param prot_model: model name given to
        ``TreeConstruction.DistanceCalculator``
    :return: the tree produced by ``build_tree``
    """
    nj_builder = TreeConstruction.DistanceTreeConstructor(
        TreeConstruction.DistanceCalculator(prot_model), 'nj')
    return nj_builder.build_tree(aln)