Example #1
0
 def __init__(self, fastas, labels, max_seqs=45):
     """
     Creates dataset object from given alignments.

     Every file is read as a FASTA alignment. For each sequence the
     ``aesnn3`` entries of all its residues (looked up via ``res_to_index``)
     are summed into one 3-component row of ``self.data``. The 'blosum62'
     distance matrix of the alignment is written to the top-left corner of
     the corresponding ``self.dist`` slice; the rest stays zero.
     You can create an empty dataset to load your data later. It is a more
     memory-friendly approach.

     NOTE: no filtering happens here. An alignment with more than
     ``max_seqs`` sequences does not fit the preallocated arrays, so the
     indexing below fails for it.

     :param fastas: alignment files
     :param labels: labels of each fasta
     :param max_seqs: maximum number of sequences in one file
     """
     self.is_sparse = False
     self.edge_indices = None
     self.edge_weights = None
     self.f = None  # holder for hdf5 file
     n_files = len(fastas)
     self.data = np.zeros((n_files, max_seqs, 3))
     # Explicit 64-bit integers: ``np.long`` is missing from NumPy 1.24-1.26
     # and is platform dependent in the releases that do provide it.
     self.labels = np.array(labels, dtype=np.int64)
     self.dist = np.zeros((n_files, max_seqs, max_seqs))
     calc = TreeConstruction.DistanceCalculator(
         'blosum62')  # calculator for distance matrices
     for file_index, file in enumerate(fastas):
         aln = AlignIO.read(file, "fasta")
         for seq_index, seq in enumerate(aln):
             # accumulate the per-residue encoding over the whole sequence
             for col in seq:
                 self.data[file_index,
                           seq_index] += aesnn3[res_to_index[col]]
         n_seqs = len(aln)
         self.dist[file_index, :n_seqs, :n_seqs] = np.array(
             calc.get_distance(aln))
Example #2
0
def _alignment_name(path):
    """Return the file name of ``path`` cut at its first dot (used in tree file names)."""
    return osp.basename(path).split(".")[0]


def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Constructs labels for given files. (Best phylogeny reconstruction method)

    For every alignment the NJ, UPGMA and RAxML trees stored in ``tree_path``
    are compared with the etalon tree by symmetric difference; the label is
    the index of the closest one: 0 - NJ, 1 - UPGMA, 2 - RAxML. On a tie the
    lowest index wins (``argmin``).

    :param files: an iterable with file paths to alignments
    :param etalon_tree: the path to etalon tree; it is joined onto ``tree_path``
    :param tree_path: a directory, where built trees will be stored
    :param rebuild: set it True, if you need to rebuild trees or build them from scratch
    :return: numpy array with one integer label per file
    """
    # len() and two passes over the paths are needed, so materialize any iterable
    files = list(files)
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths
    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()

        # construct all trees with UPGMA, NJ and raxml
        for file in files:
            aln = AlignIO.read(file, 'fasta')
            name = _alignment_name(file)
            # One matrix serves both methods; Biopython's upgma()/nj() work
            # on their own copy of the matrix they are given.
            dist_matrix = calculator.get_distance(aln)
            tree = dist_constructor.upgma(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'upgma_{}.tre'.format(name)),
                        'newick')
            tree = dist_constructor.nj(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'nj_{}.tre'.format(name)),
                        'newick')
            raxml = RaxmlCommandline(sequences=osp.abspath(file),
                                     model='PROTCATWAG',
                                     name='{}.tre'.format(name),
                                     threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))
    # get best tree
    tns = dendropy.TaxonNamespace()
    act_tree = dendropy.Tree.get_from_path(osp.join(tree_path, etalon_tree),
                                           "newick",
                                           taxon_namespace=tns)
    act_tree.encode_bipartitions()
    # column order defines the label values returned below
    tree_templates = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), len(tree_templates)))
    for i, file in enumerate(files):
        name = _alignment_name(file)
        for j, template in enumerate(tree_templates):
            built_tree = dendropy.Tree.get_from_path(
                osp.join(tree_path, template.format(name)),
                "newick",
                taxon_namespace=tns)
            distances[i, j] = (
                dendropy.calculate.treecompare.symmetric_difference(
                    built_tree, act_tree))
    return distances.argmin(1)
Example #3
0
def neighbor_joining_tree(aln, prot_model='blosum62'):
    """Build a neighbor-joining tree for an alignment.

    A ``DistanceCalculator`` configured with ``prot_model`` is handed to a
    ``DistanceTreeConstructor`` set to the 'nj' method, which then builds
    the tree from ``aln``.

    :param aln: alignment passed on to ``build_tree``
    :param prot_model: model name given to the distance calculator
    :return: the tree produced by the constructor
    """
    nj_builder = TreeConstruction.DistanceTreeConstructor(
        TreeConstruction.DistanceCalculator(prot_model), 'nj')
    return nj_builder.build_tree(aln)