Example #1
0
def writeNewick(species, distance, output):
    '''
    Build a neighbor-joining tree and write its bare topology in Newick format.

    species  -- list of species names
    distance -- triangular distance matrix, in the layout accepted by
                Bio.Phylo.TreeConstruction._DistanceMatrix
    output   -- path of the file to write; when empty/None the tree is
                written to standard out instead

    Before writing, the Newick text is simplified: the "Inner<N>" labels and
    all branch lengths are removed and underscores in names become spaces.
    '''
    import re
    import tempfile
    import Bio.Phylo.TreeConstruction as TreeConstruction
    from Bio import Phylo

    distanceMatrix = TreeConstruction._DistanceMatrix(species, distance)
    treeConstructor = TreeConstruction.DistanceTreeConstructor(method='nj')
    njTree = treeConstructor.nj(distanceMatrix)

    # Phylo.write() wants a file, so round-trip through an anonymous temporary
    # file.  tempfile picks a unique name and removes the file on close, which
    # replaces the hand-rolled ".tempFile<random>" naming in the working
    # directory.
    with tempfile.TemporaryFile(mode='w+') as tempFile:
        Phylo.write(njTree, tempFile, "newick")
        tempFile.seek(0)
        tree = tempFile.read()

    # Strip internal node labels together with their branch lengths ...
    tree = re.sub(r"Inner[0-9]+:[-0-9\.]+", "", tree)
    # ... then every remaining branch length.  '-' is included because
    # neighbor joining can yield negative lengths on terminal branches too.
    tree = re.sub(r":[-0-9\.]+", "", tree)
    tree = re.sub("_", " ", tree)

    # The destination is opened only once the tree text exists, so a failure
    # above never leaves a truncated output file behind.
    if output:
        with open(output, 'w') as outputFile:
            outputFile.write(tree)
    else:
        sys.stdout.write(tree)
Example #2
0
def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Constructs labels for given files. (Best phylogeny reconstruction method)

    For every alignment, the NJ, UPGMA and RAxML trees stored in ``tree_path``
    are compared with the etalon (reference) tree using the symmetric
    difference; the label is the index of the closest tree.

    :param files: an iterable with file paths to alignments
    :param etalon_tree: the path to etalon tree, relative to ``tree_path``
    :param tree_path: a directory, where built trees will be stored
    :param rebuild: set it True, if you need to rebuild trees or build them from scratch
    :return: integer array with one label per file:
             0 = NJ, 1 = UPGMA, 2 = RAxML
    """
    # The paths are counted and walked more than once, so materialise them;
    # this keeps the "any iterable" promise even for generators.
    files = list(files)
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths
    # Tree files are keyed by the alignment's base name up to the first dot.
    names = [path.split("/")[-1].split(".")[0] for path in files]

    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()

        # construct all trees with UPGMA, NJ and raxml
        for path, name in zip(files, names):
            aln = AlignIO.read(path, 'fasta')
            # One distance matrix serves both distance methods (Biopython's
            # upgma()/nj() work on their own copy of the matrix).
            dist_matrix = calculator.get_distance(aln)
            tree = dist_constructor.upgma(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'upgma_{}.tre'.format(name)),
                        'newick')
            tree = dist_constructor.nj(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'nj_{}.tre'.format(name)),
                        'newick')
            raxml = RaxmlCommandline(sequences=osp.abspath(path),
                                     model='PROTCATWAG',
                                     name='{}.tre'.format(name),
                                     threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))

    # get best tree
    tns = dendropy.TaxonNamespace()
    act_tree = dendropy.Tree.get_from_path(osp.join(tree_path, etalon_tree),
                                           "newick",
                                           taxon_namespace=tns)
    act_tree.encode_bipartitions()

    # Column order of `distances`, and therefore the meaning of the labels.
    tree_templates = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), len(tree_templates)))
    for i, name in enumerate(names):
        for col, template in enumerate(tree_templates):
            candidate = dendropy.Tree.get_from_path(
                osp.join(tree_path, template.format(name)),
                "newick",
                taxon_namespace=tns)
            distances[i, col] = (
                dendropy.calculate.treecompare.symmetric_difference(
                    candidate, act_tree))
    return distances.argmin(1)
    def _make_nj_tree(self, treesams, dm):
        """
        **PRIVATE**

        Build a neighbour-joining tree for the given samples.

        Parameters
        ----------
        treesams: dict
            {sam name: samid, sam name: samid, ...}
        dm: str or None
            path of a file the distance matrix is written to, one comma
            separated line per sample (sample name followed by the distances
            to the samples before it); an existing file is replaced.
            None skips writing the matrix.

        Returns
        -------
        nwkstring: str
            tree as newick string
        """

        # A real list is needed: the names are sliced below and handed to
        # _DistanceMatrix, and on Python 3 dict.keys() is only a view.
        aSampleNames = list(treesams.keys())
        iNofSams = len(aSampleNames)
        logging.info("Calculating %i distances. Patience!", ((iNofSams**2) - iNofSams) // 2)

        dist_mat = get_distance_matrix(self.cur, treesams.values())

        # Lower-triangular matrix: row i holds the distances to samples 0..i-1
        # followed by the 0 on the diagonal.
        aSimpleMatrix = []
        for i, sample_1 in enumerate(aSampleNames):
            sid1 = treesams[sample_1]
            mat_line = [dist_mat[sid1][treesams[sample_2]]
                        for sample_2 in aSampleNames[:i]]
            mat_line.append(0)
            aSimpleMatrix.append(mat_line)

        if dm is not None:
            # Mode 'w' replaces any previous file; the diagonal 0 is left out
            # of every written row.
            with open(dm, 'w') as f:
                for sample_1, mat_line in zip(aSampleNames, aSimpleMatrix):
                    f.write("%s\n" % ','.join([sample_1] + [str(x) for x in mat_line[:-1]]))
            logging.info("Distance matrix written to file: %s", dm)

        logging.info("Bulding tree.")
        oDistMat = TreeConstruction._DistanceMatrix(aSampleNames, aSimpleMatrix)
        constructor = TreeConstruction.DistanceTreeConstructor()
        oTree = constructor.nj(oDistMat)

        # Phylo.write() needs a file, so go through a throw-away directory and
        # make sure it is removed even when writing or reading fails.
        td = tempfile.mkdtemp()
        try:
            tmpfile = os.path.join(td, 'tree.nwk')
            Phylo.write(oTree, tmpfile, 'newick')
            with open(tmpfile, 'r') as f:
                nwkstring = f.read()
        finally:
            shutil.rmtree(td)

        return nwkstring
Example #4
0
def neighbor_joining_tree(aln, prot_model='blosum62'):
    """Estimate a tree from an alignment using neighbor-joining.

    ``prot_model`` is the model name handed to the distance calculator;
    the tree built by the constructor's ``build_tree`` is returned.
    """
    nj_builder = TreeConstruction.DistanceTreeConstructor(
        TreeConstruction.DistanceCalculator(prot_model), 'nj')
    return nj_builder.build_tree(aln)
Example #5
0
 def NJ(self, f=min):
     """Draw a neighbor-joining tree of the languages as ASCII art.

     The matrix from ``self.distanceMatrix()`` is made symmetric in place:
     ``f`` receives both directed values of a pair and its result is stored
     in both cells.
     """
     matrix = self.distanceMatrix()
     count = len(self.languages)
     for row in range(count):
         for col in range(row, count):
             merged = f(matrix[row][col], matrix[col][row])
             matrix[row][col] = merged
             matrix[col][row] = merged

     # Lower triangle, diagonal included, one list per language.
     lower = []
     for row in range(count):
         lower.append([matrix[row][col] for col in range(row + 1)])

     labels = [language.name for language in self.languages]
     triangle = TreeConstruction._DistanceMatrix(labels, lower)
     builder = TreeConstruction.DistanceTreeConstructor(method='nj')
     Phylo.draw_ascii(builder.nj(triangle))
Example #6
0
 def __init__(self, fastas, labels, max_seqs=45):
     """
     Creates dataset object from given alignments.
     You can create an empty dataset to load your data later. It is more memory-friendly approach.

     For every alignment, the vectors ``aesnn3[res_to_index[residue]]`` of a
     sequence are summed over all its residues into ``self.data[file, seq]``,
     and the 'blosum62' distance matrix of the alignment is stored in the
     top-left corner of ``self.dist[file]``; unused rows stay zero.

     NOTE(review): alignments with more than ``max_seqs`` sequences are not
     dropped or truncated here - they do not fit the preallocated arrays, so
     filter them out before constructing the dataset.

     :param fastas: alignment files
     :param labels: labels of each fasta
     :param max_seqs: maximum number of sequences in one file
     """
     self.is_sparse = False
     self.edge_indices = None
     self.edge_weights = None
     self.f = None  # holder for hdf5 file
     self.data = np.zeros((len(fastas), max_seqs, 3))
     # `np.long` was only an alias of the builtin int; it was deprecated in
     # NumPy 1.20 and is missing from several later releases.
     self.labels = np.array(labels, dtype=int)
     self.dist = np.zeros((len(fastas), max_seqs, max_seqs))
     calc = TreeConstruction.DistanceCalculator(
         'blosum62')  # calculator for distance matrices
     for file_index, fasta_path in enumerate(fastas):
         aln = AlignIO.read(fasta_path, "fasta")
         for seq_index, seq in enumerate(aln):
             for residue in seq:
                 self.data[file_index,
                           seq_index] += aesnn3[res_to_index[residue]]
         n_seqs = len(aln)
         self.dist[file_index, :n_seqs, :n_seqs] = np.array(
             calc.get_distance(aln))
Example #7
0
def _read_matrix(filename):
	"""Read a square distance matrix from a text file.

	The first line holds the whitespace separated names; each following line
	holds one row of numbers.  Returns a ``TreeConstruction.DistanceMatrix``
	built from the names and the triangle ``row <= column`` of the file, or
	``None`` when the file contains neither names nor rows.

	Raises ``ValueError`` when the number of rows, or the number of values in
	any row, differs from the number of names.
	"""
	with open(filename) as f:
		header = f.readline().split()
		# NOTE(review): l[1:] skips only the first character of each row,
		# presumably a leading separator - confirm against the file format.
		rows = [[float(token) for token in l[1:].split()] for l in f]

	# Validate both dimensions before reshaping anything: checking the column
	# count after a zip()-transpose would silently truncate ragged rows.
	size = len(header)
	if len(rows) != size or any(len(row) != size for row in rows):
		raise ValueError('Distance matrix must be square')

	if not rows:
		return None

	# Entry i of the result holds column i of the file, rows 0..i
	# (diagonal included).
	lower = [[rows[r][c] for r in range(c + 1)] for c in range(size)]

	res = TreeConstruction.DistanceMatrix(names=header,
			matrix=lower)

	return res
Example #8
0
def evaluate_directory(data_dir,
                       eval_limit=50000,
                       lim=5,
                       limited_expand=False):
    '''
    Processes a directory of FASTA files, generating and evaluating trees
    using branch and bound with early stopping and limited expansion (optional)
    Args:
        data_dir: str, the path to the data directory which stores fasta files
        eval_limit: int, max number of trees to evaluate for each file. Good
                    setting depends on seq length and time you're willing to wait.
        lim: int, number of files in the data directory to process; the files
             are taken in the order returned by os.listdir
        limited_expand: bool, whether or not to use limited expansion
    Returns:
        List[Tree], the first element of every result produced by
        get_best_trees, collected over all processed files (presumably the
        BioPython Phylo trees with tied best scores)
    '''
    scorer = TreeConstruction.ParsimonyScorer()
    all_best = []

    selected = os.listdir(data_dir)[:lim]
    total = len(selected)

    for i, file in enumerate(selected):
        # Load and sort file
        print(f"Processing {file} ({i+1}/{total})")
        # Close the handle as soon as the alignment has been parsed.
        with open(os.path.join(data_dir, file)) as handle:
            aln = AlignIO.read(handle, 'fasta')
        aln.sort(key=lambda a: a.id)

        result_trees = get_best_trees(aln, scorer, eval_limit, limited_expand)
        print(f"Found {len(result_trees)} trees.")
        all_best.extend(result_trees)

    return [tr[0] for tr in all_best]
Example #9
0
def make_nj_tree(dist_mat, dArgs, aSampleNames):
    '''
    Build a neighbour joining tree with Biopython.Phylo and save it as newick.

    Parameters
    ----------
    dist_mat: dict
        distance matrix as a dict of dicts
        distance_a_to_b = dist_mat[a][b]
    dArgs: dict
        input argument dictionary; dArgs['tree'] is where the tree is written
    aSampleNames: list
        list of sample names

    Returns
    -------
    returns 0
    '''

    # Lower-triangular rows: distances to all earlier samples, then the 0 of
    # the diagonal.
    aLowerTriangle = []
    for iRow, sRowName in enumerate(aSampleNames):
        aRow = [dist_mat[sRowName][sColName]
                for iCol, sColName in enumerate(aSampleNames)
                if iCol < iRow]
        aRow.append(0)
        aLowerTriangle.append(aRow)

    oMatrix = TreeConstruction._DistanceMatrix(aSampleNames, aLowerTriangle)
    oNjTree = TreeConstruction.DistanceTreeConstructor().nj(oMatrix)
    Phylo.write(oNjTree, dArgs['tree'], 'newick')
    logging.info("Tree file written.")

    return 0
Example #10
0
    def UPGMA(self, f=min):
        """Draw a UPGMA tree of the languages and return the distance matrix.

        Asymmetric 'distances' are reconciled with ``f``: it receives both
        directed values of a pair and its result is written to both cells of
        the matrix from ``self.distanceMatrix()``, which is what is returned.
        """
        matrix = self.distanceMatrix()
        count = len(self.languages)
        for row in range(count):
            for col in range(row, count):
                merged = f(matrix[row][col], matrix[col][row])
                matrix[row][col] = merged
                matrix[col][row] = merged

        # Lower triangle, diagonal included, one list per language.
        lower = []
        for row in range(count):
            lower.append([matrix[row][col] for col in range(row + 1)])

        labels = [language.name for language in self.languages]
        triangle = TreeConstruction._DistanceMatrix(labels, lower)
        builder = TreeConstruction.DistanceTreeConstructor(method='upgma')
        Phylo.draw_ascii(builder.upgma(triangle))

        return matrix
Example #11
0
				scores[pdbs[i]] = {}
				PhyloM.append([])
				scoresM.append([])
			scores[pdbs[i]][pdbs[j]] = score
			scoresM[i].append(score)
		else:
			print '{:->3.2%}\t{} {}\t{}'.format( s/total, pdbs[i], pdbs[j], "error")
			if pdbs[i] not in scores:
				scores[pdbs[i]] = {}
				PhyloM.append([])
				scoresM.append([])
			scores[pdbs[i]][pdbs[j]] = 0
			scoresM[i].append(0)
		
# Report the raw pairwise scores, then wrap them in a Biopython matrix whose
# labels are the pdb paths with everything up to the last '/' removed.
print_matrix("Scores", scores, pdbs)
scoresM = TreeConstruction._Matrix([x[x.rfind('/')+1:] for x in pdbs], scoresM)

# Turn scores into distances, filling only the lower triangle (j <= i):
#   d(i, j) = (score(i, i) + score(j, j)) / 2 - score(i, j)
# NOTE(review): `leng` is defined outside this excerpt - presumably len(pdbs).
distances = {}
for i in range(leng):
	distances[pdbs[i]] = {}
	for j in range(i+1):
		distances[pdbs[i]][pdbs[j]] = (scores[pdbs[i]][pdbs[i]]+scores[pdbs[j]][pdbs[j]])/2.0 - scores[pdbs[i]][pdbs[j]]
		PhyloM[i].append(distances[pdbs[i]][pdbs[j]])
PhyloM = TreeConstruction._DistanceMatrix([x[x.rfind('/')+1:] for x in pdbs], PhyloM)
print_matrix("Distances", distances, pdbs)

# Build a UPGMA tree from the distances, draw it as ASCII, then ladderize it.
tree = TreeConstruction.DistanceTreeConstructor().upgma(PhyloM)
Phylo.draw_ascii(tree)
tree.ladderize()
#Phylo.draw_graphviz(tree, node_size=0)
def hide_inner(node):
                PhyloM.append([])
                scoresM.append([])
            scores[pdbs[i]][pdbs[j]] = score
            scoresM[i].append(score)
        else:
            print '{:->3.2%}\t{} {}\t{}'.format(s / total, pdbs[i], pdbs[j],
                                                "error")
            if pdbs[i] not in scores:
                scores[pdbs[i]] = {}
                PhyloM.append([])
                scoresM.append([])
            scores[pdbs[i]][pdbs[j]] = 0
            scoresM[i].append(0)

# Report the raw pairwise scores, then wrap them in a Biopython matrix whose
# labels are the pdb paths with everything up to the last '/' removed.
print_matrix("Scores", scores, pdbs)
scoresM = TreeConstruction._Matrix([x[x.rfind('/') + 1:] for x in pdbs],
                                   scoresM)

# Turn scores into distances, filling only the lower triangle (j <= i):
#   d(i, j) = (score(i, i) + score(j, j)) / 2 - score(i, j)
# NOTE(review): `leng` is defined outside this excerpt - presumably len(pdbs).
distances = {}
for i in range(leng):
    distances[pdbs[i]] = {}
    for j in range(i + 1):
        distances[pdbs[i]][pdbs[j]] = (scores[pdbs[i]][pdbs[i]] + scores[
            pdbs[j]][pdbs[j]]) / 2.0 - scores[pdbs[i]][pdbs[j]]
        PhyloM[i].append(distances[pdbs[i]][pdbs[j]])
PhyloM = TreeConstruction._DistanceMatrix([x[x.rfind('/') + 1:] for x in pdbs],
                                          PhyloM)
print_matrix("Distances", distances, pdbs)

# Build a UPGMA tree from the distances, draw it as ASCII, then ladderize it.
tree = TreeConstruction.DistanceTreeConstructor().upgma(PhyloM)
Phylo.draw_ascii(tree)
tree.ladderize()
Example #13
0
def find_good_tree(trees, data_dir, lim=25):
    '''
    Evaluates a list of trees against multiple alignments, returning the best
    scoring tree across all alignments. Requires tree terminal names and
    alignment names in data_dir are the same.

    Every tree is scored against every alignment; each score is divided by
    the highest score seen for that alignment, and the tree with the lowest
    sum of normalized scores wins (the first one on ties).
    Args:
        trees: List[Tree], a list of BioPython Phylo trees to evaluate
        data_dir: str, the path to the data directory
        lim: int, number of alignments to evaluate against (more is slower)
    Returns:
        the best tree from `trees`, or None when scoring fails or no tree
        could be scored
    '''
    scorer = TreeConstruction.ParsimonyScorer()

    alns = []
    for file in os.listdir(data_dir)[:lim]:
        # Close the handle as soon as the alignment has been parsed.
        with open(os.path.join(data_dir, file)) as handle:
            aln = AlignIO.read(handle, 'fasta')
        # Record names are cut to their first five characters.
        for rec in aln:
            rec.name = rec.id = rec.name[:5]
        alns.append(aln)
    scores = {}
    max_aln = {}

    # Score trees and track highest score for each alignment to normalize later
    print(f"Processing {len(trees)} trees...")
    for i, tree in enumerate(trees):
        print(f"\t{i+1}/{len(trees)}")
        for j, aln in enumerate(alns):
            try:
                # The scorer gets its own copy of the tree.
                sco = scorer.get_score(copy.deepcopy(tree), aln)
            except Exception as e:
                # Deliberate best effort: report the problem and give up.
                print(e)
                print(
                    "Scoring failed. Did you ensure that terminal names and alignment names match?"
                )
                return None

            scores[(i, j)] = sco
            if sco > max_aln.get(j, 0):
                max_aln[j] = sco

    # Computes normalized scores for each tree
    fin_scores = {}
    for i in range(len(trees)):
        for j in range(len(alns)):
            m_aln = max_aln.get(j, 0)
            if m_aln <= 0:
                # Nothing to normalize by for this alignment.
                continue
            normalized_score = scores.get((i, j), 0) / m_aln
            if not normalized_score:
                print(f"Error for tree {i} and alignment {j}.")
            fin_scores[i] = fin_scores.get(i, 0) + normalized_score

    # Finds final best tree
    if not fin_scores:
        print("No best tree found.")
        return None

    # min() keeps the first of several equally good trees.
    return trees[min(fin_scores, key=fin_scores.get)]