Esempio n. 1
0
    def get_tree(self,
                 chrom,
                 start=1,
                 end=None,
                 samples=None,
                 return_format="tree_obj"):
        """Build a neighbour-joining tree for a region.

        The request is echoed to stdout, the distance data is fetched through
        ``self.get_matrix`` (asked for in "Phylo" format) and handed to the
        neighbour-joining constructor.

        Returns the tree object when ``return_format`` is "tree_obj", the
        stripped Newick text when it is "newick", and ``None`` for any other
        value.
        """
        message = "chrom: {} start: {} end: {} samples: {}"
        print(message.format(chrom, start, end, samples))

        labels, values = self.get_matrix(
            chrom, start=start, end=end, samples=samples,
            return_format="Phylo")
        distances = _DistanceMatrix(labels, values)
        nj_tree = DistanceTreeConstructor().nj(distances)

        if return_format == "tree_obj":
            return nj_tree
        if return_format != "newick":
            return None

        # Serialise through an in-memory buffer to obtain the Newick text.
        buffer = StringIO()
        Phylo.write(nj_tree, buffer, "newick")
        return buffer.getvalue().strip()
Esempio n. 2
0
def build_phylogeny_trees():
    """Align every homologous gene file and build a UPGMA tree for each.

    For each entry of ``out/homologous_gene_sequences/`` the Clustal Omega
    command line is run through the shell, the aligned FASTA output is read
    back, an identity distance matrix is computed, and the resulting UPGMA
    tree is drawn (graphically and as ASCII) and written in Nexus format.
    """
    source_dir = "out/homologous_gene_sequences/"
    aligned_dir = "out/aligned_homologous_gene_sequences/"

    for file_name in os.listdir(source_dir):
        unaligned = source_dir + file_name
        aligned = aligned_dir + file_name

        command = ClustalOmegaCommandline(
            infile=unaligned, outfile=aligned, verbose=True, auto=True)
        os.system(str(command))

        alignment = AlignIO.read(aligned, 'fasta')

        # Distance matrix, then the tree built from it
        distances = DistanceCalculator('identity').get_distance(alignment)
        upgma_tree = DistanceTreeConstructor().upgma(distances)

        Phylo.draw(upgma_tree)

        print('\nPhylogenetic Tree\n', file_name)
        Phylo.draw_ascii(upgma_tree)
        tree_path = 'out/phylogenetic_trees/{}_tree.nex'.format(file_name)
        Phylo.write([upgma_tree], tree_path, 'nexus')
Esempio n. 3
0
 def printGeneTree(self):
     """
     Print the gene tree for the alignment stored in ``self.newPhylip``.

     Reads the PHYLIP alignment, computes an identity distance matrix and
     prints it, then builds a UPGMA tree and shows it both with
     ``Phylo.draw`` and as ASCII art in the terminal.
     input: ``self.newPhylip``, a .phy file holding the aligned sequences
     output: the distance matrix and the tree printed to the terminal, plus
     the graphical rendering of the tree
     """
     alignment = AlignIO.read(self.newPhylip, 'phylip')
     distances = DistanceCalculator('identity').get_distance(alignment)

     matrix_banner = '\n======================================== DISTANCE MATRIX =======================================\n'
     print(matrix_banner)
     print(distances, "\n\n")

     # UPGMA clustering on the identity distances
     gene_tree = DistanceTreeConstructor().upgma(distances)

     tree_banner = '\n========================================= GENE TREE ===========================================\n'
     print(tree_banner)
     Phylo.draw(gene_tree)  # graphical rendering (needs matplotlib)
     Phylo.draw_ascii(gene_tree)  # text rendering in the terminal
def build_trees(filename, tree_name):
    """Align ``<filename>.fa`` with ClustalW and plot UPGMA and NJ trees.

    The alignment is read back from ``<filename>.aln``, distances use the
    'blosum62' model, and each tree is shown in its own matplotlib figure
    titled with ``tree_name`` and the method.
    """
    # Run the external aligner; it leaves its result in "<filename>.aln".
    ClustalwCommandline("clustalw", infile="{}.fa".format(filename))()
    msa = AlignIO.read("{}.aln".format(filename), format="clustal")

    distances = DistanceCalculator('blosum62').get_distance(msa)
    builder = DistanceTreeConstructor()

    def hide_inner(clade):
        # Clades whose name starts with "Inner" get a blank label.
        if clade.name.startswith("Inner"):
            return ""
        return clade

    trees = (
        (builder.upgma(distances), "{} × upgma"),
        (builder.nj(distances), "{} × nj"),
    )
    for tree, title in trees:
        Phylo.draw(tree, label_func=hide_inner, do_show=False)
        plt.title(title.format(tree_name))
        plt.show()
Esempio n. 5
0
def make_newick_tree(dm, outgroup="KE136308.1"):
    """Build UPGMA and neighbour-joining trees rooted on an outgroup.

    dm: distance matrix accepted by ``DistanceTreeConstructor``.
    outgroup: name of the clade used to root both trees; the default keeps
        the accession that used to be hard-coded here.

    Returns the tuple ``(upgma_tree, nj_tree)``.
    """
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    njtree = constructor.nj(dm)
    for tree in (upgmatree, njtree):
        tree.root_with_outgroup({'name': outgroup})
    return upgmatree, njtree
Esempio n. 6
0
def NJ(thatdm):
    """Reconstruct a neighbour-joining tree and wrap it in an ``XTree``.

    Every terminal clade is mapped to a one-element set holding its name.
    """
    nj_tree = DistanceTreeConstructor().nj(thatdm)
    leaf_labels = {leaf: {leaf.name} for leaf in nj_tree.get_terminals()}
    return XTree(nj_tree, leaf_labels)
Esempio n. 7
0
def construct_tree(matrix, nj=True):
    """Build a tree from a distance matrix.

    Can either use neighbor-joining (nj) or UPGMA.

    matrix: non-empty list of rows, passed unchanged to ``_DistanceMatrix``;
        taxa are named "0", "1", ... after their row index.
    nj: build with neighbor-joining when true, with UPGMA otherwise.

    Returns the tree with blank inner-node names, or ``None`` (after
    printing a message) when ``matrix`` is not a non-empty list.
    """
    # Test the type before the truthiness so that objects with ambiguous
    # truth values (e.g. numpy arrays) are rejected instead of raising.
    if not (isinstance(matrix, list) and matrix):
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print("matrix has invalid value")
        return None

    dm = _DistanceMatrix(names=[str(i) for i in range(len(matrix))],
                         matrix=matrix)

    constructor = DistanceTreeConstructor()
    if nj:
        tree = constructor.nj(dm)
    else:
        tree = constructor.upgma(dm)

    # this will remove the names from the inner nodes
    # this is critical for seq-gen to read in the tree
    for clade in tree.get_nonterminals():
        clade.name = ''

    return tree
Esempio n. 8
0
def fastaToNJTree(fastaFile, outputFile):
    """Build a neighbour-joining tree from a FASTA alignment and save it.

    fastaFile: alignment in FASTA format, read with ``AlignIO.read``.
    outputFile: destination handed to ``Phylo.write``; the tree is written
        in Newick format.
    """
    aln = AlignIO.read(fastaFile, 'fasta')
    # The constructor is given the calculator, so build_tree works straight
    # from the alignment; the separately computed matrix was never used.
    constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'nj')
    tree = constructor.build_tree(aln)
    Phylo.write(tree, outputFile, 'newick')
Esempio n. 9
0
def consensus(msa):
    """Print the identity-distance neighbour-joining tree built from ``msa``.

    msa: records accepted by ``MultipleSeqAlignment``.
    """
    alignment = MultipleSeqAlignment(msa)
    # The constructor is given the calculator, so build_tree works straight
    # from the alignment; the separately computed matrix was never used.
    constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'nj')
    tree = constructor.build_tree(alignment)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(tree)
Esempio n. 10
0
def get_tree(aln, kind='nj'):
    """Compute identity distances for ``aln`` and build a tree from them.

    aln: alignment accepted by ``DistanceCalculator.get_distance``.
    kind: 'upgma' builds a UPGMA tree; any other value (including the
        default 'nj') builds a neighbour-joining tree, which is what every
        call produced before this parameter was honoured.

    Returns the tuple ``(distance_matrix, tree)``.
    """
    from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
    dm = DistanceCalculator('identity').get_distance(aln)
    constructor = DistanceTreeConstructor()
    if kind == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    return dm, tree
Esempio n. 11
0
def plot_phylo_tree(align: MultipleSeqAlignment, accession_numbers: dict):
    """
    Plots a phylogenetic tree
    :param align: MultipleSeqAlignment with the alignment result to be plotted
    :param accession_numbers: dict of accession numbers and their translation to human-understandable names
    :return: figure-handle of the plotted phylogenetic tree
    :raises TypeError: if a terminal name contains no "." for the accession pattern to match
    :raises KeyError: if the extracted accession is missing from accession_numbers
    """
    # calculate distance - https://biopython.org/wiki/Phylo
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(align)

    # construct a tree
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)

    # remove the names for the non-terminals for better visual appeal
    for non_terminal in tree.get_nonterminals():
        non_terminal.name = ''

    # change accession numbers into human more understandable names;
    # the key is the leading non-whitespace run up to its last "."
    # (raw string so the regex escapes are not treated as string escapes)
    accession_pattern = re.compile(r"(^\S*)(?=\.)")
    for terminal in tree.get_terminals():
        terminal.name = accession_numbers[
            accession_pattern.match(terminal.name)[0]]

    # draw_ascii prints the tree itself and returns nothing, so it must not
    # be wrapped in print() (that emitted a stray "None" line)
    Phylo.draw_ascii(tree)

    # plot the tree
    fig, ax = plt.subplots(1, 1)
    # draw the resulting tree
    Phylo.draw(tree, show_confidence=False, axes=ax, do_show=False)
    ax.set_xlim(right=0.8)
    return fig
Esempio n. 12
0
def main(argv):
    """Draw a UPGMA tree from a comma-separated square distance table.

    argv: argument list; ``argv[1]`` is the path of the CSV file holding one
        row per entry of the label list below.
    """
    # Test table data and corresponding labels
    M_labels = [
        'Wuttagoonaspis', 'Romundina', 'Brindabellaspis', 'Eurycaraspis',
        'Entelognathus'
    ]
    print(M_labels)

    # Given a path, loadtxt opens and closes the file itself (no leaked handle).
    M = np.loadtxt(argv[1], delimiter=",")

    # Keep the lower triangle with a zero diagonal, one shortened row per
    # taxon. Slicing by position (rather than marking the upper triangle
    # with 1 and filtering out every 1) keeps genuine distances equal to 1.
    lower = np.tril(M)
    np.fill_diagonal(lower, 0)
    M = [row[:i + 1] for i, row in enumerate(lower.tolist())]

    m = _Matrix(M_labels, M)
    print(type(m))

    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(m)
    Phylo.draw(tree)
Esempio n. 13
0
class DistanceTreeConstructorTest(unittest.TestCase):
    """Compare DistanceTreeConstructor results with reference Newick trees."""

    def setUp(self):
        self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
        blosum = DistanceCalculator('blosum62')
        self.dm = blosum.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum)

    def _check_against(self, tree, reference_path):
        """Assert *tree* is a Tree sharing the topology stored at *reference_path*."""
        self.assertIsInstance(tree, BaseTree.Tree)
        expected = Phylo.read(reference_path, 'newick')
        self.assertTrue(Consensus._equal_topology(tree, expected))

    def test_upgma(self):
        self._check_against(self.constructor.upgma(self.dm),
                            './TreeConstruction/upgma.tre')

    def test_nj(self):
        self._check_against(self.constructor.nj(self.dm),
                            './TreeConstruction/nj.tre')

    def test_built_tree(self):
        # build_tree is compared against the same reference file as test_nj.
        self._check_against(self.constructor.build_tree(self.aln),
                            './TreeConstruction/nj.tre')
Esempio n. 14
0
def main():
    """Time neighbour joining and UPGMA on ``data/coding.fa`` and draw both trees.

    The FASTA records are collected into a gapped unambiguous-DNA alignment,
    identity distances are computed once, and each algorithm is timed with
    ``time.time()`` before its tree is drawn using ``get_label`` for labels.
    """
    file_name = "data/coding.fa"
    # Alternative input: "data/cons_noncode.fa"

    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    for record in SeqIO.parse(file_name, "fasta"):
        alignment.extend([record])

    print("Number of characters in alignment:", len(alignment[0]))

    distances = DistanceCalculator('identity').get_distance(alignment)
    builder = DistanceTreeConstructor()

    # Neighbour joining first, then UPGMA, each timed and drawn in turn.
    runs = (
        (builder.nj, "Neighbor joining ran in {} seconds."),
        (builder.upgma, "UPGMA ran in {} seconds."),
    )
    for build, report in runs:
        began = time.time()
        tree = build(distances)
        finished = time.time()
        print(report.format(finished - began))
        Phylo.draw(tree, label_func=get_label)
Esempio n. 15
0
def construct_tree(gene_name, with_marburg=1, algorithm='UPGMA'):
    """Build a tree from a stored edit-distance matrix and save it.

    gene_name: base name of the CSV under ``./Output/edit_matrices/``.
    with_marburg: the value 1 selects the five-virus set *without* Marburg;
        any other value adds Marburg, loading its genome and re-running the
        ``Alignment`` helpers first.
        NOTE(review): the flag reads inverted relative to its name - confirm
        against callers.
    algorithm: 'NJ' selects neighbor-joining; any other value (default
        'UPGMA') selects UPGMA.

    Returns the constructed tree after passing it to ``save_tree``.
    """
    names = ['Bundibugyo', 'Reston', 'Sudan', 'TaiForest', 'Zaire']
    if with_marburg == 1:
        print('Constructing Tree with All Viruses without Marburg')
        filename = algorithm + '_' + gene_name
    else:
        print('Constructing {0}\'s Tree with All Viruses with Marburg'.format(gene_name))
        filename = algorithm + '_' + gene_name + '_with_Marburg'
        names.append('Marburg')
        marburg_genome = SeqIO.read("./Data/Marburg_genome.fasta", "fasta")
        Alignment.read_data()
        print('Aligning Genes for marburg_genome')
        gene_name += '_with_marburg'
        Alignment.read_genes(marburg_genome)

    print('Reading edit matrix and construct tree')
    csv_path = "./Output/edit_matrices/" + gene_name + ".csv"
    raw_matrix = pd.read_csv(csv_path, header=None)
    builder = DistanceTreeConstructor()
    # The tree constructor is fed a lower-triangular matrix.
    lower = convert_tu_lower_triangular(raw_matrix)
    distance_matrix = DistanceMatrix(names=names, matrix=lower)

    build = builder.nj if algorithm == 'NJ' else builder.upgma
    tree = build(distance_matrix)
    save_tree(tree, filename)
    return tree
Esempio n. 16
0
def consensus(msa):
    """Print the identity-distance neighbour-joining tree built from ``msa``.

    msa: records accepted by ``MultipleSeqAlignment``.
    """
    alignment = MultipleSeqAlignment(msa)
    # The constructor is given the calculator, so build_tree works straight
    # from the alignment; the separately computed matrix was never used.
    constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'nj')
    tree = constructor.build_tree(alignment)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(tree)
def buildTree(FASTAFile):
    """Build a midpoint-rooted NJ tree from a FASTA alignment, as an RLR tree.

    FASTAFile: alignment in FASTA format.

    The tree is drawn, then serialised to plain Newick with quoted tip names
    so the text can be parsed into nested tuples, which ``NewicktoRLR``
    converts to the structure that is returned.
    """
    myAlignment = AlignIO.read(FASTAFile, "fasta")

    # Compute a distance matrix and construct the neighbour-joining tree
    calculator = DistanceCalculator("identity")
    myMatrix = calculator.get_distance(myAlignment)
    constructor = DistanceTreeConstructor()
    njTree = constructor.nj(myMatrix)
    njTree.root_at_midpoint()
    Phylo.draw(njTree)

    # Turn the tree into nested tuples: write plain Newick into a buffer,
    # with every tip name wrapped in double quotes so it parses as a string.
    for clade in njTree.get_terminals():
        clade.name = "\"" + clade.name + "\""
    buf = cStringIO.StringIO()
    Phylo.write(njTree, buf, 'newick', plain = True)
    tree = buf.getvalue()
    buf.close()
    # Drop the generated inner-node names and the terminating semicolon,
    # leaving only parentheses, commas and quoted tip names.
    tree = re.sub(r'Inner\d*', '', tree)
    tree = tree.replace(";", "")
    tree = literal_eval(tree)    #newick format

    # RLR tree required for maxParsimony function
    tree = NewicktoRLR(tree)
    return tree
Esempio n. 18
0
    def distance_matrix(cls, cluster_list):
        """Build a ladderized neighbour-joining tree for the given accessions.

        cluster_list: sequence of accession strings; pairwise distances are
            looked up in ``Distance`` rows in either orientation.

        Returns the tree as a Newick string.
        Raises ValueError when a pair has no stored distance.
        """
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(cluster_list)
        dists = Distance.objects.filter(rep_accnum1__in=cluster_list, rep_accnum2__in=cluster_list)

        # Key by the accession pair itself: a joined "a_b" string is
        # ambiguous as soon as an accession contains an underscore.
        distance_pairs = {(g.rep_accnum1, g.rep_accnum2): g.distance for g in dists.all()}

        # Lower-triangular matrix, each row ending with its zero diagonal.
        matrix = []
        for i in range(len(cluster_list)):
            first = cluster_list[i]
            matrix_iteration = []
            for j in range(i):
                second = cluster_list[j]
                if (first, second) in distance_pairs:
                    matrix_iteration.append(distance_pairs[(first, second)])
                elif (second, first) in distance_pairs:
                    matrix_iteration.append(distance_pairs[(second, first)])
                else:
                    # Raising a bare string is itself a TypeError; raise a
                    # real exception that names the missing pair.
                    raise ValueError(
                        "Error, can't find pair! ({0}, {1})".format(first, second))
            matrix_iteration.append(0)
            matrix.append(matrix_iteration)

        cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
        matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
        constructor = DistanceTreeConstructor()
        tree = constructor.nj(matrix_obj)
        tree.ladderize()
        output = StringIO.StringIO()
        Phylo.write(tree, output, 'newick')
        tree_str = output.getvalue()

        return tree_str
Esempio n. 19
0
def measure_D_net(G,qmod,qcon):
    D_net_dic = {}
    D_net_ret = {}
    D_net = []
    for u in G: D_net_dic[u] = {}

    for u in sorted(G):
        key1 = "Taxon" + str(u)
        tmp_row = []
        for v in sorted(G):
            key2 = "Taxon" + str(v)
            if u < v: continue
            D_net_dic[u][v] = 1.0 - G.dmc_likelihood(u,v,qmod,qcon)
            tmp_row.append(D_net_dic[u][v])

            print D_net_dic[u][v],
        D_net.append(tmp_row)
        print '\n'


    names = []
    for u in G: names.append('Taxon'+str(u))
    print names 
    print D_net
    D_net_final = _DistanceMatrix(names,D_net)
    #print D_net_final.names 

    constructor = DistanceTreeConstructor()
    tree_dmc = constructor.upgma(D_net_final)
    #print tree_dmc
    Phylo.write(tree_dmc,'ph_dmc.nre','newick')
    
    return D_net_final
Esempio n. 20
0
    def summarise_dist(self, rf_results: RfResults, dir_out):
        """Write an NJ tree and a clustered heatmap of the stored distances.

        Runs twice, first with the normalised value and then with the raw
        value of each entry in ``rf_results.data`` (keyed by an id pair and
        holding ``(rf, norm_rf)``). Each pass writes a Newick tree and an
        SVG heatmap into ``dir_out``.
        """
        variants = (
            (True, 'rf_normed.tree', 'rf_normed_heatmap.svg',
             'Normalised Robinson-Foulds Distance'),
            (False, 'rf_un_normed.tree', 'rf_un_normed_heatmap.svg',
             '(un)Normalised Robinson-Foulds Distance'),
        )

        for use_norm, tree_name, heatmap_name, plt_title in variants:
            path_out = os.path.join(dir_out, tree_name)
            path_hm = os.path.join(dir_out, heatmap_name)

            # Symmetric lookup of the selected metric, plus every id seen.
            metrics = defaultdict(dict)
            names = set()
            for (tid_a, tid_b), (rf, norm_rf) in rf_results.data.items():
                value = norm_rf if use_norm else rf
                metrics[tid_a][tid_b] = value
                metrics[tid_b][tid_a] = value
                names.update((tid_a, tid_b))

            labels = sorted(names)
            size = len(labels)
            mat = np.zeros((size, size))
            mat_vals = []
            # Fill the lower triangle: as row lists for the tree builder and
            # in the dense array used for the heatmap.
            for i, tid_a in enumerate(labels):
                cur_row = []
                for j, tid_b in enumerate(labels[:i + 1]):
                    if tid_a == tid_b:
                        cur_row.append(0.0)
                        continue
                    dist = metrics[tid_a][tid_b]
                    cur_row.append(dist)
                    mat[i, j] = dist
                mat_vals.append(cur_row)
            # Mirror the lower triangle to get the full symmetric matrix.
            mat = mat + mat.T

            # Newick
            dm = DistanceMatrix(names=labels, matrix=mat_vals)
            tree = DistanceTreeConstructor().nj(dm)
            Phylo.write(tree, path_out, 'newick')

            # Heatmap
            cmap = sns.cubehelix_palette(100, reverse=True)
            sns.set(font_scale=1)
            rf_df = pd.DataFrame(mat, columns=labels, index=labels)
            grid = sns.clustermap(rf_df,
                                  annot=True,
                                  fmt='.3f',
                                  cmap=cmap,
                                  figsize=(15, 15))
            grid.fig.suptitle(plt_title)
            plt.savefig(path_hm)
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""
    def setUp(self):
        # Read through a context manager so the alignment file is closed.
        with open('TreeConstruction/msa.phy') as handle:
            self.aln = AlignIO.read(handle, 'phylip')
        calculator = DistanceCalculator('blosum62')
        self.dm = calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(calculator)

    def _assert_newick_matches(self, tree, ref_path):
        """Assert *tree* is a Tree whose Newick text equals the first line of *ref_path*."""
        self.assertTrue(isinstance(tree, BaseTree.Tree))
        tree_file = StringIO.StringIO()
        Phylo.write(tree, tree_file, 'newick')
        # The reference is read inside a with-block so it is closed before
        # the comparison; a failing assertion used to skip the close() call.
        with open(ref_path) as ref_tree:
            expected = ref_tree.readline()
        self.assertEqual(tree_file.getvalue(), expected)

    def test_upgma(self):
        tree = self.constructor.upgma(self.dm)
        self._assert_newick_matches(tree, './TreeConstruction/upgma.tre')

    def test_nj(self):
        tree = self.constructor.nj(self.dm)
        self._assert_newick_matches(tree, './TreeConstruction/nj.tre')

    def test_built_tree(self):
        # build_tree is compared against the same reference file as test_nj.
        tree = self.constructor.build_tree(self.aln)
        self._assert_newick_matches(tree, './TreeConstruction/nj.tre')
Esempio n. 22
0
def dna(file_path, file_format, algorithm):
    """Print an alignment and its distance matrix, then draw its tree.

    file_path: alignment file to read.
    file_format: format name passed to ``AlignIO.read``.
    algorithm: 'upgma' or 'nj' (case-insensitive). Any other value reports
        'Invalid algorithm!' through ``click.echo`` and returns without
        drawing.
    """
    # Read the alignment
    aln = AlignIO.read(file_path, file_format)

    # Print the alignment
    print(aln)

    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)

    # Print the distance matrix (the matrix itself, not the calculator)
    print('\nDistance Matrix\n===================')
    print(dm)

    # Construct the phylogenetic tree using the chosen algorithm
    constructor = DistanceTreeConstructor()
    chosen = algorithm.lower()
    if chosen == 'upgma':
        tree = constructor.upgma(dm)
    elif chosen == 'nj':
        tree = constructor.nj(dm)
    else:
        click.echo('Invalid algorithm!')
        # No tree exists on this path; stop instead of failing on an
        # unbound variable below.
        return

    # Draw the phylogenetic tree
    Phylo.draw(tree)

    # Print the phylogenetic tree in the terminal
    print('\nPhylogenetic Tree\n===================')
    Phylo.draw_ascii(tree)
def upgma_tree_constructor(x):
    """Print the UPGMA tree of alignment ``x`` and draw it as ASCII art.

    Distances come from the 'identity' model.
    """
    builder = DistanceTreeConstructor()
    distances = DistanceCalculator("identity").get_distance(x)
    tree = builder.upgma(distances)
    print(tree)
    Phylo.draw_ascii(tree)
def nj_tree_constructor(x):
    """Print the neighbour-joining tree of alignment ``x`` and draw it as ASCII art.

    Distances come from the 'identity' model.
    """
    builder = DistanceTreeConstructor()
    distances = DistanceCalculator("identity").get_distance(x)
    tree = builder.nj(distances)
    print(tree)
    Phylo.draw_ascii(tree)
Esempio n. 25
0
class DistanceTreeConstructorTest(unittest.TestCase):
    """Topology checks for the trees built by DistanceTreeConstructor."""

    def setUp(self):
        self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
        blosum = DistanceCalculator('blosum62')
        self.dm = blosum.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum)

    def test_upgma(self):
        built = self.constructor.upgma(self.dm)
        self.assertIsInstance(built, BaseTree.Tree)
        reference = Phylo.read('./TreeConstruction/upgma.tre', 'newick')
        self.assertTrue(Consensus._equal_topology(built, reference))

    def test_nj(self):
        built = self.constructor.nj(self.dm)
        self.assertIsInstance(built, BaseTree.Tree)
        reference = Phylo.read('./TreeConstruction/nj.tre', 'newick')
        self.assertTrue(Consensus._equal_topology(built, reference))

    def test_built_tree(self):
        # build_tree is compared against the same reference file as test_nj.
        built = self.constructor.build_tree(self.aln)
        self.assertIsInstance(built, BaseTree.Tree)
        reference = Phylo.read('./TreeConstruction/nj.tre', 'newick')
        self.assertTrue(Consensus._equal_topology(built, reference))
Esempio n. 26
0
def tree_reconstruction(phy_file, method, model, phyformat):
    '''Construct tree with given method and model

    phy_file: PHYLIP alignment to read.
    method: 'upgma' or 'nj'.
    model: distance model name passed to DistanceCalculator.
    phyformat: suffix appended to "phylip-" to form the AlignIO format name.

    Writes tree.nwk and tree.svg under the module-level ``args.output``.
    Raises ValueError for an unknown method.
    '''
    # Reject an unknown method up front; it used to surface later as an
    # UnboundLocalError on ``tree``.
    if method not in ('upgma', 'nj'):
        raise ValueError(
            "Unknown tree construction method: {!r}".format(method))

    aln = AlignIO.read(phy_file, 'phylip-' + phyformat)

    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator(model)
    dm = calculator.get_distance(aln)

    if method == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)

    tree.ladderize()

    # Blank the generated inner-node names; tolerate clades without a name.
    for c in tree.find_clades():
        if c.name and 'Inner' in c.name:
            c.name = ''

    Phylo.write(tree, args.output + '/tree.nwk', 'newick')

    plt.rcParams['font.style'] = 'italic'
    plt.rc('font', size=8)
    plt.rc('axes', titlesize=14)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('figure', titlesize=18)

    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
Esempio n. 27
0
def create_tree_distance_impl(msa, algorithm):
    """Build a distance tree for ``msa`` and return the path of its PNG.

    The tree is built with identity distances and the given ``algorithm``,
    written as Nexus under a randomly numbered file name, and drawn to a PNG
    whose name includes ``algorithm``.
    """
    builder = DistanceTreeConstructor(
        distance_calculator=DistanceCalculator('identity'),
        method=algorithm)
    tree = builder.build_tree(msa)

    nexus_path = "../../data/created/tree" + str(random.randint(0,10000000)) + ".nex"
    Phylo.write(tree, nexus_path, "nexus")

    Phylo.draw(tree,do_show=False)
    image_path = "../../data/created/createdTree"+algorithm+".png"
    plt.savefig(image_path)
    return image_path
Esempio n. 28
0
def main():
    """Draw the ladderized UPGMA tree of the alignment in ``protein.fasta``.

    Distances use the 'identity' model.
    """
    # Read inside a with-block so the file handle is closed.
    with open("protein.fasta") as handle:
        alignment = AlignIO.read(handle, "fasta")
    # The constructor is given the calculator, so build_tree works straight
    # from the alignment; the separately computed matrix was never used.
    constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'upgma')
    tree = constructor.build_tree(alignment)
    tree.ladderize()
    Phylo.draw(tree)
Esempio n. 29
0
def get_tree():
    """Return the neighbour-joining tree of the Clustal alignment ``agc.aln``.

    Distances use the 'identity' model.
    """
    alignment = AlignIO.read('agc.aln', 'clustal')
    distances = DistanceCalculator('identity').get_distance(alignment)
    return DistanceTreeConstructor().nj(distances)
Esempio n. 30
0
def build_tree(aln, kind='nj'):
    """Build a tree with bio.phylo module

    aln: alignment accepted by ``DistanceCalculator.get_distance``.
    kind: 'upgma' builds a UPGMA tree; any other value (including the
        default 'nj') builds a neighbour-joining tree, which is what every
        call produced before this parameter was honoured.

    Returns the tuple ``(distance_matrix, tree)``.
    """

    from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
    dm = DistanceCalculator('identity').get_distance(aln)
    constructor = DistanceTreeConstructor()
    if kind == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    return dm, tree
Esempio n. 31
0
 def build_nj_tree(self):
     """Return the Newick text of the NJ tree built from ``self.distance_matrix()``."""
     matrix = self.distance_matrix()
     nj_tree = DistanceTreeConstructor().nj(matrix)
     # Serialise through an in-memory buffer, which is closed before returning.
     buffer = StringIO.StringIO()
     Phylo.write(nj_tree, buffer, 'newick')
     newick_text = buffer.getvalue()
     buffer.close()
     return newick_text
Esempio n. 32
0
 def build_nj_tree(self):
     """Return the Newick text of the NJ tree built from ``self.distance_matrix()``."""
     matrix = self.distance_matrix()
     nj_tree = DistanceTreeConstructor().nj(matrix)
     # Serialise through an in-memory buffer, which is closed before returning.
     buffer = StringIO.StringIO()
     Phylo.write(nj_tree, buffer, 'newick')
     newick_text = buffer.getvalue()
     buffer.close()
     return newick_text
Esempio n. 33
0
 def tree(self):
     """Build a midpoint-rooted tree from ``self.alignment``.

     Uses ``self._distance_model`` for the distances and
     ``self._tree_algorithm`` as construction method; the root clade is
     named 'Root'.
     """
     builder = DistanceTreeConstructor(
         DistanceCalculator(self._distance_model), self._tree_algorithm)
     rooted = builder.build_tree(self.alignment)
     # Root at the midpoint and label the root clade.
     rooted.root_at_midpoint()
     rooted.root.name = 'Root'
     return rooted
def createTree(file):
    """Write the UPGMA tree of a PHYLIP alignment to ``new.xml`` as PhyloXML.

    file: alignment in PHYLIP format; distances use the 'identity' model.
    """
    alignment = AlignIO.read(file, 'phylip')
    distances = DistanceCalculator('identity').get_distance(alignment)
    upgma_tree = DistanceTreeConstructor().upgma(distances)
    Phylo.write(upgma_tree, 'new.xml', 'phyloxml')
Esempio n. 35
0
def D_seq_matrix(fasta_file):
    """Compute the identity distance matrix of a FASTA alignment.

    As a side effect the UPGMA tree of the matrix is written to
    ``ph_seq.nre`` in Newick format and the taxon names are printed.

    Returns the distance matrix.
    """
    aln = AlignIO.read(fasta_file, 'fasta')
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    tree_seq = constructor.upgma(dm)
    Phylo.write(tree_seq,'ph_seq.nre','newick')
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(dm.names)
    return dm
Esempio n. 36
0
def print_trees(country, position_table):
    """Draw NJ and UPGMA trees for the ten rows furthest from the consensus.

    country: label printed (upper-cased) above the trees.
    position_table: DataFrame with a 'seqid' column; every other column is
        compared between rows and against the per-column mode.

    The distance between two rows is the number of columns in which they
    differ. Returns None.
    """
    ### Pull out the consensus sequence (per-column mode)
    concensus_seq = position_table.drop('seqid', axis=1).mode(axis=0).T[0]

    position_table = position_table.set_index('seqid')

    ### Determine which samples are farthest from the consensus sequence
    distance_from_concensus_seq = position_table.apply(
        lambda row: sum(row != concensus_seq), axis=1)
    distance_from_concensus_seq_sorted = distance_from_concensus_seq.sort_values(
        ascending=False)

    ### Select 10 sequences to do our first analysis
    subset_seqs = distance_from_concensus_seq_sorted[:10].index

    ### Construct a distance matrix for our sequences
    distances = {}
    for i, seqid1 in enumerate(subset_seqs):
        distances[seqid1, seqid1] = 0
        for j in range(i + 1, len(subset_seqs)):
            seqid2 = subset_seqs[j]
            distances[seqid1, seqid2] = sum(
                position_table.loc[seqid1] != position_table.loc[seqid2])
            distances[seqid2, seqid1] = distances[seqid1, seqid2]
    distances = pd.Series(distances).unstack()

    # Keep only the lower triangle (diagonal included), one shortened row
    # per sequence.
    matrix = np.tril(distances.values).tolist()
    for i in range(len(matrix)):
        matrix[i] = matrix[i][:i + 1]
    dm = DistanceMatrix(list(distances.index), matrix)

    ### Now construct our tree
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    print(country.upper())
    print("Neighbor Joining Tree")
    tree.ladderize()  # Flip branches so deeper clades are displayed at top
    # Phylo.draw renders the figure itself and returns nothing, so its
    # result is not passed on for display.
    Phylo.draw(tree)

    if (len(dm) > 1):
        # UPGMA: clustering with the Unweighted Pair Group Method with
        # Arithmetic Mean
        tree2 = constructor.upgma(dm)
        print("UPGMA Tree")
        tree2.ladderize()  # Flip branches so deeper clades are displayed at top
        Phylo.draw(tree2)
    return
Esempio n. 37
0
def phyloxml_from_msa(msa, phyloxml):
    """Write the UPGMA tree of a FASTA alignment as PhyloXML.

    msa: alignment in FASTA format.
    phyloxml: destination handed to ``Phylo.write``.

    Distances use the "ident" model name.
    """
    from Bio import AlignIO
    from Bio import Phylo
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    alignment = AlignIO.read(msa, "fasta")
    distances = DistanceCalculator("ident").get_distance(alignment)
    upgma_tree = DistanceTreeConstructor().upgma(distances)
    Phylo.write(upgma_tree, phyloxml, "phyloxml")
Esempio n. 38
0
File: phylo.py Progetto: xzy3/QuaSim
def build_phylogenetic_tree(seqs):
    """Build a tree from ``seqs`` using the module-level settings.

    ``DISTANCE_TYPE`` selects the distance model and
    ``TREE_CONSTRUCTION_ALGORITHM`` the construction method.
    """
    builder = DistanceTreeConstructor(DistanceCalculator(DISTANCE_TYPE),
                                      TREE_CONSTRUCTION_ALGORITHM)
    return builder.build_tree(seqs)
Esempio n. 39
0
def construct_tree(X_2d, acc, title):
    """Write the NJ tree of the pairwise distances of ``X_2d`` to ``<title>.nwk``.

    X_2d: data passed to ``pairwise_distances``.
    acc: iterable of taxon names, one per row.
    title: output file name without the ".nwk" suffix.

    NaN distances are replaced by 0 before the tree is built.
    """
    labels = list(acc)
    dense = pairwise_distances(X_2d).astype('float')
    dense[np.isnan(dense)] = 0
    # Lower-triangular rows, each ending on its diagonal entry.
    lower_rows = [list(dense[row, :row + 1])
                  for row in range(dense.shape[0])]
    dm = _DistanceMatrix(labels, matrix=lower_rows)
    nj_tree = DistanceTreeConstructor().nj(dm)
    Phylo.write(nj_tree, title + ".nwk", 'newick')
Esempio n. 40
0
def dendroNJ(inFile, model='identity', bootstrap=True, replicate=100):
    """
    Return a Neighbor Joining tree in newick format for a fasta alignment.

    With ``bootstrap`` true, the majority consensus over ``replicate``
    bootstrap trees is returned; otherwise a single tree is built directly.
    Names required in scope:
    - AlignIO (from Bio)
    - DistanceCalculator (from Bio.Phylo.TreeConstruction)
    - DistanceTreeConstructor (from Bio.Phylo.TreeConstruction)
    - bootstrap_consensus, majority_consensus (from Bio.Phylo.Consensus)
    Usage: <inFile> <model (default = 'identity')> <bootstrap (default = True)>
                           <replicate (default = 100)>
    """
    alignment = AlignIO.read(inFile, 'fasta')
    builder = DistanceTreeConstructor(DistanceCalculator(model), 'nj')
    if not bootstrap:
        return builder.build_tree(alignment).format('newick')
    consensus_tree = bootstrap_consensus(
        alignment, int(replicate), builder, majority_consensus)
    return consensus_tree.format('newick')
Esempio n. 41
0
def build_tree(dist_matrix, names_list, clust):
    """Build a ``Tree`` from a distance matrix with the requested clustering method.

    Args:
        dist_matrix: pairwise distances; passed as-is to ``DistanceMatrix``
            for 'nj' and through ``condense_matrix`` for 'upgma'.
        names_list: leaf names, in the same order as ``dist_matrix``.
        clust: clustering method, either 'nj' or 'upgma'.

    Returns:
        A ``Tree`` built from the newick representation of the result.

    Any other value of ``clust`` prints an error message and terminates the
    process with exit status 1.
    """
    if clust == 'nj':
        dm = DistanceMatrix(dist_matrix, names_list)
        tree_scikit = nj(dm, result_constructor=str)
        return Tree(tree_scikit)

    if clust == 'upgma':
        dm = _DistanceMatrix(names=names_list, matrix=condense_matrix(dist_matrix))
        tree_biopython = DistanceTreeConstructor().upgma(dm)
        # remove InnerNode names
        for inner in tree_biopython.get_nonterminals():
            inner.name = None
        output = StringIO()
        Phylo.write(tree_biopython, output, "newick")
        return Tree(output.getvalue())

    print("Unknown tree clustering method ! Aborting")
    # Abort with a failing status; a bare sys.exit() would report success.
    sys.exit(1)
Esempio n. 42
0
def D_F_matrix(D_Seq,D_net,final_tree, alpha):
    """Blend two distance matrices, build a UPGMA tree from the blend and write it.

    Every combined entry is ``(1 - alpha) * D_net[a, b] + alpha * D_Seq[a, b]``.
    Only names present in both matrices are used, in the order they appear in
    ``D_net.names``.

    Args:
        D_Seq: distance matrix exposing ``names`` and ``[name_a, name_b]`` lookup.
        D_net: distance matrix with the same interface.
        final_tree: target handed to ``Phylo.write`` for the newick output.
        alpha: weight given to ``D_Seq`` (``1 - alpha`` goes to ``D_net``);
            meant to lie between 0 and 1.

    Returns:
        The combined ``_DistanceMatrix``.
    """
    seq_names = set(D_Seq.names)
    # Restrict to taxa both matrices know, so every row stays a complete
    # lower-triangular row and lines up with the name list.
    shared_names = [name for name in D_net.names if name in seq_names]

    D_F = []
    for row_idx, key1 in enumerate(shared_names):
        D_F.append([
            ((1-alpha) * D_net[key1,key2]) + (alpha * D_Seq[key1,key2])
            for key2 in shared_names[:row_idx + 1]
        ])

    print(D_F)

    D_F_final = _DistanceMatrix(shared_names,D_F)

    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
Esempio n. 43
0
def D_F_matrix(D_Seq,D_net,final_tree):
    """Average two distance matrices, build a UPGMA tree from the result and write it.

    Every combined entry is ``0.5 * D_net[a, b] + 0.5 * D_Seq[a, b]``.  Only
    names present in both matrices are used, in the order they appear in
    ``D_net.names``.

    Args:
        D_Seq: distance matrix exposing ``names`` and ``[name_a, name_b]`` lookup.
        D_net: distance matrix with the same interface.
        final_tree: target handed to ``Phylo.write`` for the newick output.

    Returns:
        The combined ``_DistanceMatrix``.
    """
    seq_names = set(D_Seq.names)
    # Restrict to taxa both matrices know, so every row stays a complete
    # lower-triangular row and lines up with the name list.
    shared_names = [name for name in D_net.names if name in seq_names]

    D_F = []
    for row_idx, key1 in enumerate(shared_names):
        D_F.append([
            (0.5*D_net[key1,key2] + 0.5*D_Seq[key1,key2])
            for key2 in shared_names[:row_idx + 1]
        ])

    print(D_F)

    D_F_final = _DistanceMatrix(shared_names,D_F)

    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
Esempio n. 44
0
 def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA"):
     """Method for constructing dn tree and ds tree.

     Argument:
         -   dn_ds_method - Available methods include NG86, LWL85, YN00
                            and ML.
         -   tree_method  - Available methods include UPGMA and NJ.

     Returns a ``(dn_tree, ds_tree)`` tuple.  Raises RuntimeError when
     ``tree_method`` is neither "UPGMA" nor "NJ".
     """
     # Reject an unsupported tree method up front, before the dN/dS
     # matrices are computed.
     if tree_method not in ("UPGMA", "NJ"):
         raise RuntimeError("Unknown tree method ({0}). Only NJ and UPGMA "
                            "are accepted.".format(tree_method))
     from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
     dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method)
     constructor = DistanceTreeConstructor()
     build = constructor.upgma if tree_method == "UPGMA" else constructor.nj
     return build(dn_dm), build(ds_dm)
Esempio n. 45
0
# CAGTTCGCCACAA Gamma

# Several things can be done with the alignment: get a distance matrix from it:
dstcalc = DistanceCalculator('identity')
dm = dstcalc.get_distance(aln)
# DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon'], matrix=[[0], [0.23076923076923073, 0], [0.3846153846153846, 0.23076923076923073, 0], [0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0], [0.6153846153846154, 0.3846153846153846, 0.46153846153846156, 0.15384615384615385, 0]])
# Single-argument print(...) calls produce the same output on Python 2 and 3.
print("What's the get_distance(aln) from DistanceCalculator('identity') object?")
print(type(dm))
print(dm)
# Alpha   0
# Beta    0.230769230769  0
# Gamma   0.384615384615  0.230769230769  0
# Delta   0.538461538462  0.538461538462  0.538461538462  0
# Epsilon 0.615384615385  0.384615384615  0.461538461538  0.153846153846  0

# build a tree from it.
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

construc0 = DistanceTreeConstructor(dstcalc, 'nj')
tre0 = construc0.build_tree(aln)
print(type(tre0))
# As you can see above, dstcalc is needed for the constructor, and then the
# alignment is needed to build the tree: two inputs that have to originate
# from the same data, which is a bit of a tall order.
# You can also build the tree from a distance matrix only: leave out the
# alignment and call the constructor's .nj method instead of build_tree.

construc2 = DistanceTreeConstructor()
tre2 = construc2.nj(dm)
print(type(tre2))
Esempio n. 46
0
def main(argv):
	# Command-line entry point: reads a reconstructed abundance vector and the
	# common-kmer training data, builds a neighbour-joining tree from the kmer
	# distances, inserts "hypothetical" nodes along the leaf lineages, and
	# writes the decorated tree to an image file and to a PhyloXML file.
	#
	# argv: option list handed to getopt.getopt (see the usage strings below).
	input_file=''
	# title is never overridden by an option; it only names the PhyloXML phylogeny.
	title='Title'
	label_internal_nodes = False
	label_leaves = False
	out_file=''
	width=750
	out_file_xml=''
	plot_rectangular = False
	common_kmer_data_path=''
	taxonomic_names_on_leaves = False
	# NOTE(review): the short spec declares "h:" and every long option ends in
	# "=", i.e. getopt is told that -h and the long forms of the boolean flags
	# take an argument -- confirm this is intended.
	try:
		opts, args = getopt.getopt(argv,"h:i:lnrto:w:x:D:",["Help=","InputCommonKmerXFile=","LabelLeaves=", "LabelInternalNodes=","Rectangular=", "TaxonomicNamesOnLeaves=", "OutFile=","Width=","OutFileXML=","CommonKmerDataPath="])
	except getopt.GetoptError:
		print 'Unknown option, call using: ./PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
			sys.exit(2)
		elif opt in ("-i", "--InputCommonKmerXFile"):
			input_file = arg
		elif opt in ("-l", "--LabelLeaves"):
			label_leaves = True
		elif opt in ("-n","--LabelInternalNodes"):
			label_internal_nodes = True
		elif opt in ("-o", "--OutFile"):
			out_file = arg
		elif opt in ("-w", "--Width"):
			width = int(arg)
		elif opt in ("-x", "--OutFileXML"):
			out_file_xml = arg
		elif opt in ("-D", "--CommonKmerDataPath"):
			common_kmer_data_path = arg
		elif opt in ("-r", "--Rectangular"):
			plot_rectangular = True
		elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
			taxonomic_names_on_leaves = True
	
	
	#Read in the x vector
	# One float per line. x is indexed and sliced further down, so this relies
	# on map returning a list (Python 2).
	fid = open(input_file,'r')
	x = map(lambda y: float(y),fid.readlines())
	fid.close()
	
	#Normalize the x vector
	#x = map(lambda y: y/sum(x),x)
	
	#Read in the taxonomy
	# Keeps the first whitespace-separated field of each line, minus its
	# leading "_"-delimited token.
	taxonomy = list()
	fid = open(os.path.join(common_kmer_data_path,"Taxonomy.txt"),'r')
	for line in fid:
		taxonomy.append('_'.join(line.split()[0].split("_")[1:])) #Just take the first line of the taxonomy (erasing the taxID)
	fid.close()
	
	#Read in the basis for the ckm matrices
	x_file_names = list()
	fid = open(os.path.join(common_kmer_data_path,"FileNames.txt"),'r')
	for line in fid:
		x_file_names.append(os.path.basename(line.strip()))
	fid.close()
	
	#Read in the common kmer matrix
	f=h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-30mers.h5'),'r')
	ckm30=np.array(f['common_kmers'],dtype=np.float64)
	f.close()
	f=h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-50mers.h5'),'r')
	ckm50=np.array(f['common_kmers'],dtype=np.float64)
	f.close()
	# Scale entry [i, j] by 1/ckm[j, j] (assumes a 2-D square matrix -- TODO confirm).
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	num_cols = ckm30_norm.shape[1]  # not used below
	names = x_file_names
	# Lower-triangular distance rows: the mean over the 30-mer and 50-mer
	# matrices of 1 minus the symmetrised normalised count.
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])
	
	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	# Round-trip through newick to obtain a Tree object for the plotting code below.
	t=Tree(tree.format('newick'),format=1)
	#tree.format('newick')
	#Phylo.draw_ascii(tree)
	
	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		# Splits the branch above the node named insert_above: a new node called
		# name_to_insert is attached to the parent at distance dist_along, and the
		# original node is re-attached below it with the remaining branch length.
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length")
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)
	
	#Function to insert a node some % along a branch
	def insert_hyp_node(t, leaf_name, percent):
		# Target position is percent of the root-to-leaf distance; walk up from
		# the leaf until the ancestor is no farther from the root than that
		# target, then split the branch below that ancestor.
		total_dist = t.get_distance(t.name,leaf_name)
		percent_dist = percent*total_dist
		child_node = (t&leaf_name)
		ancestor_node = (t&child_node.name).up
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))
	
	#Insert hypothetical nodes
	# x is read as consecutive blocks of len(x_file_names) entries: xi[0] is the
	# entry for the organism itself and xi[j] the entry paired with cutoffs[j-1].
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	cutoffs = map(lambda y: y**1.5,cutoffs)
	for i in range(len(x_file_names)):
		xi = x[i:len(x):len(x_file_names)]
		for j in range(1,len(cutoffs)+1):
			if xi[j]>0:
				insert_hyp_node(t, x_file_names[i], cutoffs[j-1])
				hyp_node_names[x_file_names[i]+"_"+str(cutoffs[j-1])] = [x_file_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names
				#insert_hyp_node(t, x_file_names[i],.5/t.get_distance(t.name,t&x_file_names[i])*cutoffs[j])
	
	#Now put the bubbles on the nodes
	def layout(node):
		# Layout callback: draws a circle whose radius is 500*sqrt(x entry) on
		# training leaves and on inserted hypothetical nodes. The `percent`
		# local and the `size=0` assignments are not read afterwards.
		#print(node)
		if node.is_leaf():
			if node.name in x_file_names:
				#make reconstructed bubble
				size = x[x_file_names.index(node.name)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				if taxonomic_names_on_leaves:
					nameFace = AttrFace("name", fsize=25, fgcolor='black',text_suffix="_"+taxonomy[x_file_names.index(node.name)])
					faces.add_face_to_node(nameFace, node, 0, position="branch-right")
				else:
					nameFace = AttrFace("name", fsize=25, fgcolor='black')
					faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			percent = hyp_node_names[node.name][1]
			if node_base_name in x_file_names:
				idx = hyp_node_names[node.name][2]
				size = x[x_file_names.index(node_base_name)+(idx+1)*len(x_file_names)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#print node
				#print size
			else:
				size=0
		else:
			size=0
		#print(size)
	
	ts = TreeStyle()
	ts.layout_fn = layout
	if plot_rectangular:
		ts.mode = "r"
	else:
		ts.mode = "c"
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50

	#Export the tree to a png image
	t.render(out_file, w=width, units="mm", tree_style=ts)

	#Export the xml file
	# NOTE(review): the handle opened for export is never closed explicitly.
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	phylo.phyloxml_phylogeny.set_name(title)
	project.add_phylogeny(phylo)
	project.export(open(out_file_xml,'w'))
def NNIheuristic(FASTAFile, sampleSize, threshold, outputDir):
    """Search tree space with nearest-neighbor interchanges and log the run.

    A UPGMA tree built from the alignment in ``FASTAFile`` is the starting
    point.  Each iteration samples up to ``sampleSize`` NNI neighbours of the
    current tree and splits them into feasible and infeasible trees
    (``isFeasible``).  It then either moves to the best-scoring feasible
    neighbour, stops, or -- while the current tree is itself infeasible --
    moves on to an infeasible neighbour.  Scores come from ``maxParsimony``.

    Args:
        FASTAFile: path of the FASTA alignment; the log file name is derived
            from it by replacing ".align" with ".out".
        sampleSize: maximum number of NNI neighbours examined per iteration.
        threshold: number of consecutive iterations without a feasible
            neighbour after which the search gives up.
        outputDir: directory the log file is written to.

    Returns:
        None.  All results are written to the log file.
    """
    random.seed(0)
    outputFile = FASTAFile.replace(".align", ".out")
    if "/" in outputFile:
        outputFile = outputFile[outputFile.rfind("/"):]
    # The context manager closes the log on every exit path (normal end,
    # threshold return, or an exception).
    with open(outputDir + "/" + outputFile, 'w') as output:
        output.write("*****************RUN STARTS HERE!*****************")
        #start time
        startTime = time.clock()
        output.write("\n" + "Filename: " + FASTAFile + "\n")
        output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n")
        output.write("Sample Size: " + str(sampleSize) + "\nThreshold: " + str(threshold) + "\n\n")
        # Import fasta alignment file
        myAlignment = AlignIO.read(FASTAFile, "fasta")

        # Create a tip mapping from the fasta file
        tipMapping = {}
        for record in myAlignment:
            tipMapping[record.id] = str(record.seq)

        # Compute a distance matrix and construct tree
        calculator = DistanceCalculator("identity")
        myMatrix = calculator.get_distance(myAlignment)
        output.write("matrix constructed here")
        constructor = DistanceTreeConstructor()
        upgmaTree = constructor.upgma(myMatrix)

        output.write("constructed upgma tree")

        # Convert the tree to a nested-tuple structure: quote the leaf names,
        # write plain newick to a buffer, strip the inner-node labels and the
        # trailing ";", then evaluate the remaining text as a Python literal.
        for clade in upgmaTree.get_terminals():
            clade.name = "\"" + clade.name + "\""
        buf = cStringIO.StringIO()
        Phylo.write(upgmaTree, buf, 'newick', plain = True)
        tree = buf.getvalue()
        tree = re.sub(r'Inner\d*', '', tree)
        tree = tree.replace(";", "")
        tree = literal_eval(tree)    #newick format
        output.write("created the original tree into newick format")

        # RLR tree required for maxParsimony function
        tree = NewicktoRLR(tree)
        score = maxParsimony(tree, tipMapping)
        graph = nx.Graph()
        makeGraph(graph, tree)
        output.write("made a graph")
        leaves = getLeaves(tree)
        currentFeasible = isFeasible(graph,leaves)

        output.write("tested isFeasible")

        # Perform NNI heuristic
        counter = 0
        loopCounter = 0
        while True:
            output.write("in the while loop")
            loopCounter += 1
            output.write("Loop Iteration: " + str(loopCounter) + "\n")
            output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n")
            output.write("Current Tree\nFeasibility: " + str(currentFeasible) + "\nScore: " + str(score) + "\nTree:\n" + str(tree) + "\n\n")
            NNIs = allNNIs(tree)
            if len(NNIs)-1 < sampleSize:
                sampleSize = len(NNIs)-1
            toScore = random.sample(NNIs, sampleSize)

            # add feasibility test
            output.write("starting feasibility test")
            feasible = []
            infeasible = []
            # Use a dedicated loop variable so the current `tree` (reported
            # when the search stops) is not overwritten by the last candidate.
            for candidate in toScore:
                graph = nx.Graph()
                makeGraph(graph, candidate)
                leaves = getLeaves(candidate)
                if isFeasible(graph, leaves): #if this tree is possible
                    feasible.append(candidate)
                else:
                    infeasible.append(candidate) #if this tree is not possible
            output.write("Number of Feasible Neighbor Trees: " + str(len(feasible)) + "\n")
            output.write("Number of Infeasible Neighbor Trees: " + str(len(infeasible)) + "\n")
            if len(feasible) != 0: #if feasible NNIs were found
                scoredList = map(lambda x: (maxParsimony(x, tipMapping), x), feasible)
                sortedList = sorted(scoredList)
                counter = 0
                if not currentFeasible or sortedList[0][0] < score:
                    score = sortedList[0][0]
                    tree = sortedList[0][1]
                    currentFeasible = True
                    output.write("Found a New Feasible Tree!\n\n")
                else:
                    output.write("Best Possible Feasible Tree Found\n" + str(tree) + "\n" + "Score: " + str(score) + "\n\n")
                    break
            else: #if no possible trees were found
                if currentFeasible: #checks if the original tree was feasible
                    output.write("No Feasible Neighbors, Best Possible Feasible Tree\n" + str(tree) + "\n\n")
                    break
                counter += 1
                output.write("Threshold counter: " + str(counter) + "\n\n")
                if counter >= threshold:
                    output.write("Threshold Met: No Feasible Tree Found\n")
                    stopTime = (time.clock() - startTime)
                    output.write("Program Stop: " + str(stopTime) + " seconds\n\n")
                    return
                output.write("Searching Infeasible Space\n")
                scoredList = map(lambda x: (maxParsimony(x, tipMapping), x), infeasible)
                sortedList = sorted(scoredList)
                choseNeighbor = False
                # The current tree is infeasible and has no feasible neighbour:
                # continue from the first infeasible neighbour (ascending score
                # order) whose score exceeds the current one, falling back to
                # the last entry of the sorted list.
                for neighbor in sortedList:
                    if neighbor[0] > score:
                        score = neighbor[0]
                        tree = neighbor[1]
                        choseNeighbor = True
                        break
                if not choseNeighbor:
                    score = sortedList[-1][0]
                    tree = sortedList[-1][1]
                currentFeasible = False
                output.write("Next Best Infeasible Tree\n\n")
        endTime = (time.clock() - startTime)
        output.write("Program End: " + str(endTime) + " seconds\n\n")

    #outputTree = RLRtoNewick(tree)
    #print "Final score", score
    return
Esempio n. 48
0
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio.Phylo.TreeConstruction import _DistanceMatrix


def dm_to_tree(dm):
    """Build a neighbour-joining tree from a square distance table.

    Args:
        dm: table exposing ``astype``, ``values`` and ``columns`` (presumably
            a pandas DataFrame -- confirm against the caller); the column
            labels become the leaf names.

    Returns:
        The NJ tree, with the names of all non-terminal clades cleared.

    If the distance matrix cannot be constructed, diagnostic information
    about the labels and values is printed and the original exception is
    re-raised.
    """
    frame = dm.astype(float)
    # Lower triangle including the diagonal, one list per row.
    distance_triangular = [list(frame.values[i, : i + 1]) for i in range(len(frame))]
    try:
        distance_matrix = _DistanceMatrix(names=[str(i) for i in frame.columns],
                                          matrix=distance_triangular)
    except Exception:
        print(list(frame.columns))
        print([type(i) for i in frame.columns])
        print(type(distance_triangular))
        print(type(distance_triangular[0]))
        print(set([str(type(i)) for j in distance_triangular for i in j]))
        print(distance_triangular)
        # Bare raise keeps the original traceback.
        raise
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(distance_matrix)
    for c in tree.get_nonterminals():
        c.name = None
    return tree
Esempio n. 49
0
## pad sequences so that they all have the same length
#for record in records:
#    if len(record.seq) != maxlen:
#        sequence = str(record.seq).ljust(maxlen, '.')
#        record.seq = Seq.Seq(sequence)
#assert all(len(record.seq) == maxlen for record in records)

## write to temporary file and do alignment
#output_file = '{}_padded.fasta'.format(os.path.splitext(input_file)[0])
#with open(output_file, 'w') as f:
#    SeqIO.write(records, f, 'fasta')
#alignment = AlignIO.read(output_file, "fasta")

#cline = ClustalwCommandline("clustalw2", infile=input_file)
#print(cline)
#print type(cline)

# Align the sequences in input_file with MUSCLE; calling the command-line
# wrapper yields the (stdout, stderr) pair unpacked below.
muscle_cline = MuscleCommandline(input=input_file)
stdout, stderr = muscle_cline()
# Parse the FASTA alignment captured from stdout.
alignment = AlignIO.read(StringIO(stdout), "fasta")
print(alignment)

#alignment = AlignIO.read('../data/ls_orchid.fasta', 'fasta')
#print alignment
# Distance matrix under the 'ident' model, then a UPGMA tree written out as PhyloXML.
calculator = DistanceCalculator('ident')
dm = calculator.get_distance(alignment)
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.write(tree, 'phyloxml.xml', 'phyloxml')
Esempio n. 50
0
def compute_tree(options, mat, names):
    """ make upgma hierarchical clustering and write it as png and
    graphviz dot

    options is only passed on to tree_path(); mat is indexed as
    mat[name_a][name_b] and names gives the leaf order.  Three files are
    written: the newick tree at tree_path(options), plus a png and a dot
    file whose paths are obtained by replacing "newick" in that path.
    """
    # oops, convert to biopython matrix
    # (lower triangle including the diagonal, one row per name)
    matrix = []
    for i in xrange(len(names)):
        row = []
        for j in xrange(i + 1):
            # tree constructor writes 0-distances as 1s for some reason
            # so we hack around here
            val = float(mat[names[i]][names[j]])
            if val == 0.:
                val = 1e-10
            elif val == 1.:
                val = 1.1
            row.append(val)
        matrix.append(row)
    dm = _DistanceMatrix(names, matrix)

    # upgma tree
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)
    robust_makedirs(os.path.dirname(tree_path(options)))
    Phylo.write(tree, tree_path(options), "newick")

    # png tree -- note : doesn't work in toil
    # label function: blank out the labels of the "Inner..." nodes
    def f(x):
        if "Inner" in str(x):
            return ""
        else:
            return x
    Phylo.draw_graphviz(tree, label_func = f, node_size=1000, node_shape="s", font_size=10)
    pylab.savefig(tree_path(options).replace("newick", "png"))

    # graphviz
    # get networkx graph
    nxgraph = Phylo.to_networkx(tree)
    # make undirected
    nxgraph = nx.Graph(nxgraph)
    # push names to name labels
    nxgraph = nx.convert_node_labels_to_integers(nxgraph, label_attribute="label")
    for node_id in nxgraph.nodes():
        node = nxgraph.node[node_id]
        if "Inner" in str(node["label"]):
            # inner nodes: empty label and a near-zero size
            node["label"] = "\"\""
            node["width"] = 0.001
            node["height"] = 0.001
        else:
            node["fontsize"] = 18
    for edge_id in nxgraph.edges():
        edge = nxgraph.edge[edge_id[0]][edge_id[1]]
        # in graphviz, weight means something else, so make it a label
        weight = float(edge["weight"])
        # undo hack from above
        # NOTE(review): the two checks run in sequence, so a weight above 1 is
        # first clamped to 1. and then reset to 0. -- confirm that is intended.
        if weight > 1:
            weight = 1.
        if weight <= 1e-10 or weight == 1.:
            weight = 0.
        edge["weight"] = None
        # label shows the weight scaled by 100 with three significant digits
        edge["label"] = "{0:.3g}".format(float(weight) * 100.)
        edge["fontsize"] = 14
        edge["len"] = draw_len(weight)
    nx.write_dot(nxgraph, tree_path(options).replace("newick", "dot"))
def nj_tree(distanceMatrix):
    """Build a neighbour-joining tree and write it to ``geneContentTree.newick``.

    ``distanceMatrix`` is passed unchanged to ``DistanceTreeConstructor.nj``.
    Progress messages are printed before and after; nothing is returned.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Constructing Neighbor Joining Tree")
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(distanceMatrix)
    Phylo.write(tree, "geneContentTree.newick", "newick")
    print("Done constructing tree")
Esempio n. 52
0
# rosalind_ba7b
'''
Limb Length Problem

Find the limb length for a leaf in a tree.

Given: An integer n, followed by an integer j between 0 and n - 1, 
followed by a space-separated additive distance matrix D (whose elements are integers).

Return: The limb length of the leaf in Tree(D) corresponding to row j of this 
distance matrix (use 0-based indexing).

'''
import numpy as np
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

# Input layout: n on the first line, the leaf index j on the second, then the
# n x n distance matrix. The with-block closes the file once it has been read.
with open('rosalind_ba7b.txt') as f:
    n = int(f.readline().rstrip())
    j = int(f.readline().rstrip())

    D = np.fromfile(f, sep=' ', dtype=int).reshape(n, n)

#For the Phylo.TreeConstruction to work, integers must be Python int and not numpy.int64
# Lower triangle including the diagonal. The comprehension variables are named
# row/col so they can never clobber the leaf index j read above (Python 2
# leaks list-comprehension variables into the enclosing scope).
dm = [[int(D[row, col]) for col in range(row + 1)] for row in range(n)]
names = [str(idx) for idx in range(n)]

constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names, dm))

# Limb length of leaf j = length of the branch leading to it in the tree.
print(round(tree.find_any(str(j)).branch_length))
def noFeasibleTest(FASTAFile, sampleSize, outputDir):
    """Hill-climb over NNI neighbours by parsimony score, ignoring feasibility.

    A UPGMA tree is built from the alignment in ``FASTAFile`` and converted to
    RLR format.  Up to ``sampleSize`` NNI neighbours of the current tree are
    then sampled repeatedly; the search moves to the best-scoring neighbour
    while it improves on the current score and stops otherwise.

    Args:
        FASTAFile: path of the FASTA alignment; the log file name is derived
            from it by replacing ".align" with ".out".
        sampleSize: maximum number of NNI neighbours scored per iteration.
        outputDir: directory the log file is written to.

    Returns:
        None.  All results are written to the log file.
    """
    random.seed(0)
    outputFile = FASTAFile.replace(".align", ".out")
    if "/" in outputFile:
        outputFile = outputFile[outputFile.rfind("/"):]
    # The context manager closes the log even if the search raises.
    with open(outputDir + "/" + outputFile, 'w') as output:
        output.write("*****************RUN STARTS HERE!*****************")
        #start time
        startTime = time.clock()
        output.write("\n" + "Filename: " + FASTAFile + "\n")
        output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n")
        output.write("Sample Size: " + str(sampleSize) + "\n\n")
        # Import fasta alignment file
        myAlignment = AlignIO.read(FASTAFile, "fasta")

        # Create a tip mapping from the fasta file
        tipMapping = {}
        for record in myAlignment:
            tipMapping[record.id] = str(record.seq)

        # Compute a distance matrix and construct tree
        calculator = DistanceCalculator("identity")
        myMatrix = calculator.get_distance(myAlignment)
        constructor = DistanceTreeConstructor()
        upgmaTree = constructor.upgma(myMatrix)

        # Convert the tree to a nested-tuple structure: quote the leaf names,
        # write plain newick to a buffer, strip the inner-node labels and the
        # trailing ";", then evaluate the remaining text as a Python literal.
        for clade in upgmaTree.get_terminals():
            clade.name = "\"" + clade.name + "\""
        buf = cStringIO.StringIO()
        Phylo.write(upgmaTree, buf, 'newick', plain = True)
        tree = buf.getvalue()
        tree = re.sub(r'Inner\d*', '', tree)
        tree = tree.replace(";", "")
        tree = literal_eval(tree)    #newick format

        # RLR tree required for maxParsimony function
        tree = NNI.NewicktoRLR(tree)
        score = NNI.maxParsimony(tree, tipMapping)

        # Perform NNI heuristic
        loopCounter = 0
        while True:
            loopCounter += 1
            output.write("Loop Iteration: " + str(loopCounter) + "\n")
            output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n")
            output.write("Current Tree\nScore: " + str(score) + "\nTree:\n" + str(tree) + "\n\n")
            NNIs = NNI.allNNIs(tree)
            if len(NNIs)-1 < sampleSize:
                sampleSize = len(NNIs)-1
            toScore = random.sample(NNIs, sampleSize)

            scoredList = map(lambda x: (NNI.maxParsimony(x, tipMapping), x), toScore)
            sortedlist = sorted(scoredList)
            if sortedlist[0][0] < score:
                score = sortedlist[0][0]
                tree = sortedlist[0][1]
                output.write("Found A More Parsimonious Tree!\n\n")
            else:
                # Log the stop reason before leaving the loop.
                output.write("No Neighbors With Better Scores Found\n\n")
                break
        output.write("Final Tree:\n" + str(tree) + "\nScore: " + str(score) + "\n\n")
        endTime = (time.clock() - startTime)
        output.write("Program End: " + str(endTime) + " seconds\n\n")
    return
Esempio n. 54
0
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
	# Builds a neighbour-joining tree from the two common-kmer matrices, inserts
	# "hypothetical" nodes along the leaf lineages, draws abundance bubbles sized
	# from x, and writes the tree to an image (outfile) and a PhyloXML file
	# (outfilexml). sum_x is only used in the legend text.
	
	#Make sure names are unique
	# names is the SAME list object as org_names, so the renames below are also
	# visible through org_names (and in the caller's list). The list is
	# modified while it is being iterated.
	names = org_names
	for name in names:
		if names.count(name)>1:
			temp_name = name
			i=1
			for dummy in range(0,names.count(name)-1): #Don't change the last one, just to make sure we don't conflict with the outgroup
				names[names.index(temp_name)] = temp_name + "_" + str(i)
				i = i +1
		
	#Normalize the x vector
	# x is indexed and sliced further down, so this relies on map returning a
	# list (Python 2).
	x = map(lambda y: y/sum(x),x)
	# Scale entry [i, j] by 1/ckm[j, j] (assumes a 2-D square matrix -- TODO confirm).
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	num_cols = ckm30_norm.shape[1]  # not used below
	# Lower-triangular distance rows: the mean over the 30-mer and 50-mer
	# matrices of 1 minus the symmetrised normalised count.
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Make the list of distances (ave of the two ckm matrices)
	# Maps each organism name to its row of symmetrised averaged values.
	ckm_ave_train = .5*ckm30_norm+.5*ckm50_norm
	ckm_ave_train_dist = dict()
	for i in range(len(org_names)):
		ckm_ave_train_dist[org_names[i]] = [.5*ckm_ave_train[i,j]+.5*ckm_ave_train[j,i] for j in range(len(org_names))]

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	# Round-trip through newick to obtain a Tree object for the plotting code below.
	t=Tree(tree.format('newick'),format=1)
	#tree.format('newick')
	#Phylo.draw_ascii(tree)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		# Splits the branch above the node named insert_above: a new node called
		# name_to_insert is attached to the parent at distance dist_along, and the
		# original node is re-attached below it with the remaining branch length.
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
	def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
		dists = map(lambda y: abs(y-percent), ckm_ave_train_dist[leaf_name])
		nearby_indicies = list()
		#Add all the organisms that are within 0.05 of the given percent
	#	for i in range(len(dists)):
	#		if dists[i]<=.05:
	#			nearby_indicies.append(i)
		nearby_names = list()
		#If there are no nearby indicies, add the closest organism to the given percent
		# With the loop above commented out, nearby_indicies is always empty, so
		# only the first branch runs and nearby_names holds the single closest
		# organism.
		if nearby_indicies==[]:
			nearby_names.append(org_names[dists.index(min(dists))])
		else:
			for i in range(len(nearby_indicies)):
				nearby_names.append(org_names[i])
		mean_dist = np.mean(map(lambda y: ckm_ave_train_dist[leaf_name][org_names.index(y)],nearby_names))
		nearby_names.append(leaf_name)
		LCA = t.get_common_ancestor(nearby_names)
		LCA_to_leaf_dist = t.get_distance(LCA,leaf_name)
		#divide the dist to the right/left of the LCA node by the number of percentage points in there
		if LCA.name==t.name:
			percent_dist = percent*LCA_to_leaf_dist
			if mean_dist <= percent:
				child_node = (t&leaf_name)
			else:
				child_node = (t&nearby_names[0])#This means "go up from root" in the direction of the nearest guy
			ancestor_node = (t&child_node.name).up
		elif mean_dist <= percent:
			percent_dist = t.get_distance(LCA) + abs(percent-mean_dist)*(LCA_to_leaf_dist)/(1-mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		else:
			percent_dist = t.get_distance(LCA) - abs(percent-mean_dist)*(t.get_distance(LCA))/(mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		# Walk up until the ancestor is no farther from the root than the target
		# distance, then split the branch below that ancestor.
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Set outgroup
	if outgroup in names:
		t.set_outgroup(t&outgroup) #I will need to check that this outgroup is actually one of the names...
	else:
		print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
		print(names)
		print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

	#Insert hypothetical nodes
	# x is read as consecutive blocks of len(org_names) entries: xi[0] is the
	# entry for the organism itself and xi[j] the entry paired with cutoffs[j-1].
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	cutoffs = [-.5141*(val**3)+1.0932*(val**2)+0.3824*val for val in cutoffs]
	for i in range(len(org_names)):
		xi = x[i:len(x):len(org_names)]
		for j in range(1,len(cutoffs)+1):
			if xi[j]>0:
				insert_hyp_node(t, org_names[i], cutoffs[j-1],ckm_ave_train_dist, org_names)
				hyp_node_names[org_names[i]+"_"+str(cutoffs[j-1])] = [org_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names

	size_factor=250
	font_size=55

	#Now put the bubbles on the nodes
	def layout(node):
		# Layout callback: thickens the branch lines of every node and draws a
		# circle of radius size_factor*sqrt(x entry) on training leaves and on
		# inserted hypothetical nodes. The `percent` local and the `size=0`
		# assignments are not read afterwards.
		node_style = NodeStyle()
		node_style["hz_line_width"] = 10
		node_style["vt_line_width"] = 10
		node.set_style(node_style)
		#print(node)
		if node.is_leaf():
			if node.name in org_names:
				#make reconstructed bubble
				size = x[org_names.index(node.name)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#Denote that this was a training organism
				nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			percent = hyp_node_names[node.name][1]
			if node_base_name in org_names:
				idx = hyp_node_names[node.name][2]
				size = x[org_names.index(node_base_name)+(idx+1)*len(org_names)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#This is if I want the names of the hypothetical nodes to be printed as well
				#nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				#faces.add_face_to_node(nameFace, node, 0, position="branch-right")
			else:
				size=0
		else:
			size=0
	
	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r"
	#ts.mode = "c"
	ts.scale = 2*1000
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50
	# Legend: a reference bubble plus two text lines; sum_x is shown truncated
	# to the first 8 characters of its string form.
	F = CircleFace(radius=.87*size_factor, color="RoyalBlue", style="sphere")
	F.border.width = None
	F.opacity = 0.6
	ts.legend.add_face(F,0)
	ts.legend.add_face(TextFace("  Inferred relative abundance",fsize=1.5*font_size,fgcolor="Blue"),1)
	ts.legend.add_face(TextFace("  Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5*font_size,fgcolor="Black"),1)
	ts.legend_position=4
	#t.show(tree_style=ts)
	t.render(outfile, w=550, units="mm", tree_style=ts)
	
	#Render the XML file
	# NOTE(review): the handle opened for export is never closed explicitly.
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	project.add_phylogeny(phylo)
	project.export(open(outfilexml,'w'))
Esempio n. 55
0
import copy

# Creates the distance matrix
calculator = DistanceCalculator('ident')
dm_ape = calculator.get_distance(alignApe)
dm_hiv = calculator.get_distance(alignHIV)


# Jukes Cantor corrections: d -> -3/4 * ln(1 - 4/3 * d)
# The correction is applied to deep copies, so dm_ape / dm_hiv keep the
# uncorrected distances and the two kinds of tree really differ. The constants
# are written as floats so the formula does not truncate under Python 2
# integer division.
dm_ape_corrected = copy.deepcopy(dm_ape)
for d in dm_ape_corrected.matrix:
	d[:] = [-0.75*np.log(1-(4.0/3.0)*x) for x in d]

dm_hiv_corrected = copy.deepcopy(dm_hiv)
for d in dm_hiv_corrected.matrix:
	d[:] = [-0.75*np.log(1-(4.0/3.0)*x) for x in d]


# Constructs the tree using the upgma algorithm
constructor = DistanceTreeConstructor()

tree_ape = constructor.upgma(dm_ape)
tree_ape_corrected = constructor.upgma(dm_ape_corrected)

tree_hiv = constructor.upgma(dm_hiv)
tree_hiv_corrected = constructor.upgma(dm_hiv_corrected)

# Outputs the trees as a xml
Phylo.write(tree_ape, 'treeApe.xml', 'phyloxml')
Phylo.write(tree_ape_corrected, 'treeApe_corrected.xml', 'phyloxml')

Phylo.write(tree_hiv, 'treeHIV.xml', 'phyloxml')
Phylo.write(tree_hiv_corrected, 'treeHIV_corrected.xml', 'phyloxml')
 def setUp(self):
     """Load the test alignment and prepare a BLOSUM62 distance matrix and tree constructor."""
     # Read inside a with-block so the file handle is closed once parsed.
     with open('TreeConstruction/msa.phy') as handle:
         self.aln = AlignIO.read(handle, 'phylip')
     calculator = DistanceCalculator('blosum62')
     self.dm = calculator.get_distance(self.aln)
     self.constructor = DistanceTreeConstructor(calculator)
Esempio n. 57
0
def main():
    """Dispatch on the parsed sub-command and, for a run, execute the pipeline.

    With no sub-command the parser help is printed and the process exits; the
    'version' sub-command instantiates Version and exits.  Any other
    sub-command performs the run:

    Read in the MDU-IDs from file. For each ID, instantiate an object of
    class Isolate.  This class associates QC data with the ID tag.
    Move the contigs for all isolates into a tempdir, with a temp 9-character
    filename.  Run andi phylogenomics on all the contig sets.  Infer an NJ tree
    using Bio Phylo from the andi-calculated distance matrix.  Correct the
    negative branch lengths in the NJ tree using ETE3.  Export the tree to
    file. Gather and combine the metadata for each ID as a super-matrix.
    Optionally, add LIMS metadata to the super-matrix from a LIMS excel
    spreadsheet option (adds MALDI-ToF, Submitting Lab ID, Submitting Lab
    species guess) and/or use the flag-if-new to highlight
    'new' isolates.  Export the tree and metadata to .csv, .tsv/.tab file.
    Export the 'isolates not found' to text file too.

    Side effects: rebinds the module globals YIELD_FILE, MLST_FILE and
    FORCE_MLST_SCHEME, runs external tools through os.system, writes result
    files next to the request sheet / in the working directory, changes the
    working directory temporarily during the roary step, and may terminate
    the process with sys.exit().
    """
    global YIELD_FILE
    global MLST_FILE
    global FORCE_MLST_SCHEME
    #Default QC file names; replaced below when Nullarbor folders are selected
    YIELD_FILE = 'yield.tab'
    MLST_FILE = 'mlst.tab'


    #Add MLST schemes to force their usage if that species is encountered
    #Only force schemes if there are two (e.g., A baumannii and E coli)
    FORCE_MLST_SCHEME = {"Acinetobacter baumannii": "abaumannii_2",
                         "Campylobacter jejuni": "campylobacter",
                         #"Citrobacter freundii": "cfreundii",
                         #"Cronobacter": "cronobacter",
                         "Enterobacter cloacae": "ecloacae",
                         "Escherichia coli": "ecoli",
                         #"Klebsiella oxytoca": "koxytoca",
                         #"Klebsiella pneumoniae": "kpneumoniae",
                         #"Pseudomonas aeruginosa": "paeruginosa"
                         "Shigella sonnei": "ecoli",
                         "Salmonella enterica": "senterica",
                         "Vibrio cholerae": "vcholerae"
                        }


    if not ARGS.subparser_name:
        PARSER.print_help()
        sys.exit()


    elif ARGS.subparser_name == 'version':
        from .utils.version import Version
        Version()
        sys.exit()

    else:# ARGS.subparser_name == "run":
        if ARGS.Nullarbor_folders:
            print('Nullarbor folder structure selected.')
            YIELD_FILE = 'yield.clean.tab'
            MLST_FILE = 'mlst2.tab'

        EXCEL_OUT = (f"{os.path.splitext(os.path.basename(ARGS.LIMS_request_sheet))[0]}" \
                     f"_results.xlsx")

        if ARGS.threads > cpu_count():
            sys.exit(f'Number of requested threads must be less than {cpu_count()}.')

        print(str(ARGS.threads) +' CPU processors requested.')


        #Check if final slash in manually specified wgs_qc path.
        #endswith() also copes with an empty path, where [-1] would raise.
        if not ARGS.wgs_qc.endswith('/'):
            print('\n-wgs_qc path is entered as '+ARGS.wgs_qc)
            print('You are missing a final \'/\' on this path.')
            print('Exiting now.\n')
            sys.exit()



        #i) read in the IDs from file
        xls_table = get_isolate_request_IDs(ARGS.LIMS_request_sheet)
        IDs = list(set(xls_table.index.values))

        #base should be a global, given that it is used in other functions too.
        base = os.path.splitext(ARGS.LIMS_request_sheet)[0]

        #ii) Return a folder path to the QC data for each available ID
        #    using a wildcard search of the ID in IDs in ARGS.wgs_qc path.
        iso_paths = isolates_available(IDs)
        #Drop the path and keep the folder name
        isos = [i.split('/')[-1] for i in iso_paths]

        #iii) make tempdir to store the temp_contigs there for 'andi' analysis.
        assembly_tempdir = make_tempdir()

        #vi) Copy contigs to become temp_contigs into tempdir, only if andi
        #requested.
        #Translation dict to store {original filename: random short filename}
        iso_ID_trans = {}
        #Dict to store each isolate under each consensus species#####maybe delete
        from collections import defaultdict
        isos_grouped_by_cons_spp = defaultdict(list)
        for iso in isos:
            #Instantiate an Isolate class for each isolate in isolates
            sample = Isolate(iso)
            #Next, we could just use iso_path+/contigs.fa, but that would skip
            #the if os.path.exists() test in sample.assembly(iso).
            assembly_path = sample.assembly()
            short_id = shortened_ID()
            #Store key,value as original_name,short_id for later retrieval.
            iso_ID_trans[iso] = short_id
            if ARGS.andi_run:
                cmd = 'ln -s '+assembly_path+' '+assembly_tempdir+'/'+short_id+\
                      '_contigs.fa'
                os.system(cmd)
                print('Creating symlink:', cmd)
        if iso_ID_trans:
            with open(base+'_temp_names.txt', 'w') as tmp_names:
                print('\nTranslated isolate IDs:\nShort\tOriginal')
                for key, value in iso_ID_trans.items():
                    print(value+'\t'+key)
                    tmp_names.write(value+'\t'+key+'\n')
        if ARGS.metadata_run:
            #summary_frames will store all of the metaDataFrames herein
            summary_frames = []
            n_isos = len(isos)
            if n_isos == 0:
                print('\nNo isolates detected in the path '+ARGS.wgs_qc+'.')
                print('Exiting now.\n')
                sys.exit()
            #Kraken set at 2 threads, so 36 processes can run on 72 CPUs
            #Pool size is capped by the number of isolates and never drops
            #below one worker (threads//2 is 0 when a single thread is asked).
            print(f'\nRunning kraken on the assemblies ({ARGS.assembly_name} files):')
            with Pool(_pool_size(n_isos, ARGS.threads//2)) as p:
                results_k_cntgs = p.map(kraken_contigs_multiprocessing, isos)
            print(results_k_cntgs)
            #concat the dataframe objects
            res_k_cntgs = pd.concat(results_k_cntgs, axis=0, sort=False)
            print('\nKraken_contigs results gathered from kraken on contigs...')

            #The remaining retrievals use a single thread per job, so they
            #share one pool sized on the full thread count.
            with Pool(_pool_size(n_isos, ARGS.threads)) as p:
                #Multiprocessor retrieval of kraken results on reads.
                results_k_reads = p.map(kraken_reads_multiprocessing, isos)
                #concat the dataframe objects
                res_k_reads = pd.concat(results_k_reads, axis=0)
                print('Kraken_reads results gathered from kraken.tab files...')

                #Multiprocessor retrieval of contig metrics.
                results_metrics_contigs = p.map(metricsContigs_multiprocessing, isos)
                res_m_cntgs = pd.concat(results_metrics_contigs, axis=0)
                print('Contig metrics gathered using \'fa -t\'...')

                #Multiprocessor retrieval of read metrics.
                results_metrics_reads = p.map(metricsReads_multiprocessing, isos)
                res_m_reads = pd.concat(results_metrics_reads, axis=0)
                print('Read metrics gathered from '+YIELD_FILE+' files...')

                #Multiprocessor retrieval of abricate results.
                results_abricate = p.map(abricate_multiprocessing, isos)
                res_all_abricate = pd.concat(results_abricate, axis=0, sort=False)
                res_all_abricate.fillna('', inplace=True)
                print('Resistome hits gathered from abricate.tab files...')

            #append the dfs to the summary list of dfs
            summary_frames.append(res_k_cntgs)
            summary_frames.append(res_k_reads)
            summary_frames.append(res_m_cntgs)
            summary_frames.append(res_m_reads)
            summary_frames.append(res_all_abricate)

            #These next steps build up the metadata not yet obtained
            #(via mulitprocesses above)

            #Let's store the metadata for each isolate in summary_isos
            summary_isos = []

            #Let's populate summary_isos above, isolate by isolate (in series)
            for iso in isos:
                iso_df = []
                sample = Isolate(iso)
                species_cntgs = res_k_cntgs.loc[iso, 'sp_krkn1_cntgs']
                species_reads = res_k_reads.loc[iso, 'sp_krkn1_reads']
                #Only trust the species call when reads and contigs agree
                if species_cntgs == species_reads:
                    species = species_cntgs
                else:
                    species = 'indet'
                mlst_df = sample.mlst(species, sample.assembly())
                iso_df.append(mlst_df)
                species_consensus = {'sp_krkn_ReadAndContigConsensus':species}
                species_cons_df = pd.DataFrame([species_consensus], index=[iso])
                iso_df.append(species_cons_df)
                iso_df_pd = pd.concat(iso_df, axis=1)
                summary_isos.append(iso_df_pd)

            #Glue the isolate by isolate metadata into a single df
            summary_isos_df = pd.concat(summary_isos)
            #Glue the dataframes built during multiprocessing processes
            summary_frames_df = pd.concat(summary_frames, axis=1)
            #Finish up with everything in one table!
            metadata_overall = pd.concat([xls_table, summary_isos_df, summary_frames_df],
                                         axis=1, sort=False)

            metadata_overall.fillna('', inplace=True)
            metadata_overall.index.name = 'ISOLATE'
            print('\nMetadata super-matrix:')
            #Echo the supermatrix (metadata_overall) as csv and save it to xlsx
            metadata_overall.to_csv(sys.stdout)
            #The context manager saves and closes the workbook on exit
            #(ExcelWriter.save() no longer exists in current pandas).
            with pd.ExcelWriter(EXCEL_OUT) as writer:
                metadata_overall.to_excel(writer, sheet_name='Sheet 1',
                                          freeze_panes=(1, 1))
            print(f"\nResults written to {os.path.abspath(EXCEL_OUT)}")

            for k, v in zip(metadata_overall['sp_krkn_ReadAndContigConsensus'],
                            metadata_overall.index):
                isos_grouped_by_cons_spp[k.replace(' ', '_')].append(v)

        #Run andi?
        if ARGS.andi_run:
            #Run andi
            andi_mat = 'andi_'+ARGS.model_andi_distance+'dist_'+base+'.mat'
            andi_c = 'nice andi -j -m '+ARGS.model_andi_distance+' -t '+\
                      str(ARGS.threads)+' '+assembly_tempdir+'/*_contigs.fa > '+\
                      andi_mat
            print('\nRunning andi with: \''+andi_c+'\'')
            os.system(andi_c)

            #Read in the andi dist matrix, convert to lower triangle
            dm = read_file_lines(andi_mat)[1:]
            dm = lower_tri(dm)
            #Correct the names in the matrix: swap each short id back to the
            #original isolate name.  An index scan is kept here because the
            #author timed it as marginally faster than dm.names.index().
            for iso in isos:
                short_id = iso_ID_trans[iso]
                for i, name in enumerate(dm.names):
                    if name == short_id:
                        dm.names[i] = iso

            #From the distance matrix in dm, infer the NJ tree
            from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
            constructor = DistanceTreeConstructor()
            njtree = constructor.nj(dm)
            njtree.rooted = True
            from Bio import Phylo
            Phylo.write(njtree, 'temp.tre', 'newick')
            from ete3 import Tree
            t = Tree('temp.tre', format=1)
            #Get rid of negative branch lengths (an artefact, not an error, of NJ)
            for node in t.traverse():
                node.dist = abs(node.dist)
            t.set_outgroup(t.get_midpoint_outgroup())
            t_out = base+'_andi_NJ_'+ARGS.model_andi_distance+'dist.nwk.tre'
            t.write(format=1, outfile=t_out)
            print('Final tree (midpoint-rooted, NJ under '+\
                   ARGS.model_andi_distance+' distance) looks like this:')
            #Print the ascii tree
            print(t)
            #Remove the temp.tre
            os.remove('temp.tre')
            print('Tree (NJ under '+ARGS.model_andi_distance+\
                  ' distance, midpoint-rooted) written to '+t_out+'.')

        #Run roary?
        if ARGS.roary_run:
            #Names kept in each roary output folder; anything else is deleted
            roary_keepers = [
                            "accessory.header.embl",
                            "accessory.tab",
                            "accessory_binary_genes.fa",
                            "accessory_binary_genes.fa.newick",
                            "accessory_binary_genes_midpoint.nwk.tre",
                            "accessory_graph.dot",
                            "blast_identity_frequency.Rtab",
                            "clustered_proteins",
                            "core_accessory.header.embl",
                            "core_accessory.tab",
                            "core_accessory_graph.dot",
                            "core_gene_alignment.aln",
                            "gene_presence_absence.Ltab.csv",
                            "gene_presence_absence.Rtab",
                            "gene_presence_absence.csv",
                            "number_of_conserved_genes.Rtab",
                            "number_of_genes_in_pan_genome.Rtab",
                            "number_of_new_genes.Rtab",
                            "number_of_unique_genes.Rtab",
                            "pan_genome_reference.fa",
                            "pan_genome_sequences",
                            "summary_statistics.txt"
                            ]
            #Only annotate isolates that have no prokka folder yet
            params = [(i, 'prokka') for i in isos if not
                      os.path.exists('prokka/'+i)]
            if params:
                print('\nRunning prokka:')
                with Pool(_pool_size(len(params), ARGS.threads//2)) as p:
                    p.map(prokka, params)
            else:
                print('\nProkka files already exist. Let\'s move on to '+\
                      'the roary analysis...')

            #Run Roary on the species_consensus subsets.
            print('Now, let\'s run roary!')
            for k, v in isos_grouped_by_cons_spp.items():
                print(k, v)
                n_isos = len(v)
                if n_isos > 1:
                    shutil.rmtree(base+'_'+k+'_roary', ignore_errors=True)
                    roary(base, k,
                          ' '.join(['prokka/'+iso+'/*.gff' for iso in v]))
                    roary_genes = pd.read_table(base+'_'+k+
                                                '_roary/gene_presence_absence.' +\
                                                'Rtab',
                                                index_col=0, header=0)
                    roary_genes = roary_genes.transpose()
                    roary_genes.to_csv(base+'_'+k+
                                       '_roary/gene_presence_absence.Ltab.csv',
                                       mode='w', index=True, index_label='name')
                    if n_isos > 2:
                        from ete3 import Tree
                        t = Tree(base+'_'+k+
                                 '_roary/accessory_binary_genes.fa.newick',
                                 format=1)
                        #Get rid of negative branch lengths (an artefact,
                        #not an error, of NJ)
                        for node in t.traverse():
                            node.dist = abs(node.dist)
                        t.set_outgroup(t.get_midpoint_outgroup())
                        t_out = base+'_'+k+\
                                '_roary/accessory_binary_genes_midpoint.nwk.tre'
                        t.write(format=1, outfile=t_out)
                        print('\nWritten midpoint-rooted roary tree.\n')
                        wd = os.getcwd()
                        os.chdir(base+'_'+k+'_roary')
                        for f_name in glob.glob('*'):
                            if f_name not in roary_keepers:
                                #Pick the removal call by entry type: calling
                                #os.remove after rmtree has already deleted a
                                #directory raises FileNotFoundError.
                                if os.path.isdir(f_name) and \
                                   not os.path.islink(f_name):
                                    shutil.rmtree(f_name, ignore_errors=True)
                                else:
                                    os.remove(f_name)
                        os.chdir(wd)
                    else:
                        print('Need more than two isolates to have a meaningful '+\
                              'pangenome tree. No mid-point rooting of the ' +\
                              'pangenome tree performed.')
                    wd = os.getcwd()
                    os.chdir(base+'_'+k+'_roary')
                    os.system('python ../collapseSites.py -f core_gene_alignment.aln -i fasta -t '+str(ARGS.threads))
                    if os.path.exists('core_gene_alignment_collapsed.fasta'):
                        os.system('FastTree -nt -gtr < core_gene_alignment_collapsed.fasta > core_gene_FastTree_SNVs.tre')

                        #calc pairwise snp dist and write to file
                        from Bio import AlignIO
                        with open('core_gene_alignment_collapsed.fasta', 'r') as inf:
                            aln = AlignIO.read(inf, 'fasta')
                        #One work item per row i: the pairs (i, 0) .. (i, i)
                        pairs = [[(aln, i, j) for j in range(0, i+1)]
                                 for i in range(0, len(aln))]
                        print('Running pw comparisons in parallel...')
                        with Pool(_pool_size(len(pairs), ARGS.threads)) as p:
                            result = p.map(pw_calc, pairs)
                        summary = pd.concat(result, axis=0, sort=False)
                        summary.fillna('', inplace=True)
                        with open('core_gene_alignment_SNV_distances.tab', 'w') as distmat:
                            summary.to_csv(distmat, mode='w', sep='\t', index=True, index_label='name')

                    #convert roary output to fripan compatible
                    os.system('python ../roary2fripan.py '+base+'_'+k)
                    roary2fripan_strains_file = pd.read_table(base+'_'+k+
                                                              '.strains',
                                                              index_col=0,
                                                              header=0)
                    info_list = []
                    info_list.append(roary2fripan_strains_file)
                    info_list.append(metadata_overall.loc[v, :])
                    strains_info_out = pd.concat(info_list, axis=1, sort=False)
                    strains_info_out.to_csv(base+'_'+k+'.strains', mode='w',
                                            sep='\t', index=True,
                                            index_label='ID')
                    print('Updated '+base+'_'+k+'.strains with all metadata.')
                    os.system('cp '+base+'_'+k+'* ~/public_html/fripan')
                    os.chdir(wd)
                else:
                    print('Only one isolate in '+k+'. Need at least 2 isolates '+\
                          'to run roary.  Moving on...')

        #Keep the tempdirs created during the run
        if not ARGS.keep_tempdirs:
            shutil.rmtree(assembly_tempdir, ignore_errors=True)
            print('\nDeleted tempdir '+assembly_tempdir+'.')
        else:
            print('\nTempdir '+assembly_tempdir+' not deleted.')

        print('\nRun finished.')


def _pool_size(n_jobs, n_workers):
    """Return a valid multiprocessing.Pool size: min(n_jobs, n_workers), at least 1."""
    return max(1, min(n_jobs, n_workers))
def best_elements_order_tree(relations, elements = None, filter_order = None):
  """Return the best-scoring linear order of elements derived from an NJ tree.

  A pairwise distance matrix is built between elements (the number of
  properties that exactly one of the two elements has), a neighbour-joining
  tree is inferred from it, every order compatible with the tree is
  enumerated through all_orders / all_combinations, and the order with the
  highest score_order value is returned.

  relations    -- passed to relations_2_model to obtain elements, properties
                  and the property -> element -> relation mapping.
  elements     -- elements to order; defaults to all elements present in
                  relations.  Each element needs a unique .label.
  filter_order -- accepted for interface compatibility; not used here.

  The distance matrix, the tree and the number of candidate orders are
  printed to stderr.
  """
  present_elements, _, properties, _, _, property_2_element_2_relation = relations_2_model(relations)
  if not elements: elements = present_elements

  label_2_element = { element.label : element for element in elements }

  from Bio.Phylo.TreeConstruction import _DistanceMatrix as DistanceMatrix, DistanceTreeConstructor

  # Distance between two elements = number of properties held by exactly one
  # of them.  Each unordered pair is visited once (the id() comparison picks
  # one direction); the matrix assignment is symmetric.
  dm = DistanceMatrix([element.label for element in elements])
  for e1 in elements:
    for e2 in elements:
      if (e1 is e2) or (id(e1) > id(e2)): continue
      d = 0
      for prop in properties:
        holders = property_2_element_2_relation[prop]
        if (e1 in holders) != (e2 in holders):
          d += 1.0
      dm[e1.label, e2.label] = d

  print(dm, file = sys.stderr)

  treebuilder = DistanceTreeConstructor(None)
  tree = treebuilder.nj(dm)
  #tree = treebuilder.upgma(dm)

  print(tree, file = sys.stderr)

  def walker(clade):
    # Leaves yield a single one-element order; inner clades combine the
    # candidate orders of their children in every child arrangement.
    if not clade.clades:
      return [ [label_2_element[clade.name]] ]
    results = []
    partss  = [walker(child) for child in clade.clades]
    for ordered_parts in all_orders(partss):
      results.extend(all_combinations(ordered_parts))
    return results

  orders = walker(tree.root)
  print(len(orders), file = sys.stderr)

  def score_order(order):
    # Higher is better: fewer properties with holes first, then fewer and
    # shorter holes.  A hole is a run of elements lacking the property that
    # lies between two elements having it.
    nb_hole           = 0
    nb_prop_with_hole = 0
    total_hole_length = 0
    for prop in properties:
      holders = property_2_element_2_relation[prop]
      start   = None
      end     = None
      in_hole = False
      i       = None  # stays None for an empty order instead of being unbound
      for i, element in enumerate(order):
        if element in holders:
          if start is None: start = i
          end = i
          in_hole = False
        elif (start is not None) and (not in_hole):
          in_hole = True
          nb_hole += 1

      # Property absent from this order: no hole was counted, nothing to undo.
      if end is None: continue

      # After end, it is not a hole!
      if end != i: nb_hole -= 1

      length = end - start + 1
      if length > len(holders):
        total_hole_length += length - len(holders)
        nb_prop_with_hole += 1

    return (-nb_prop_with_hole, -nb_hole * 2 + -total_hole_length)

  order, _ = best(orders, score_order, score0 = (-sys.maxsize, -sys.maxsize))

  return order
Esempio n. 59
0
from Bio import Phylo
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from io import StringIO
import re

# hamming distance
def hamming(seq1, seq2):
    """Count the positions at which seq1 and seq2 hold different items.

    Items are paired with zip, so the lengths are not checked and the
    comparison stops at the end of the shorter sequence.  The count is
    returned as a plain Python int.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1
    return mismatches

# Read the input: the first line holds the species names, the remaining
# whitespace-separated rows are transposed so that table[i] is the string of
# characters belonging to species i.  The file is closed once parsed.
with open('rosalind_chbp.txt') as f:
    species = f.readline().rstrip().split()
    table = [''.join(i) for i in zip(*f.read().rstrip().split())]
n = len(table)

# For the Phylo.TreeConstruction to work, integers in the distance matrix
# must be Python int and not numpy.int64
# Lower-triangular matrix (diagonal included) of pairwise Hamming distances.
dm = [[hamming(table[i], table[j]) for j in range(i+1)] for i in range(n)]
constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names=species, matrix=dm))

# Serialise the tree as plain Newick and strip the auto-generated
# 'Inner<N>' labels of the internal nodes.
handle = StringIO()
Phylo.write(tree, handle, format='newick', plain=True)
result = handle.getvalue()
result = re.sub('Inner[0-9]+', '', result)
# Write through a context manager so the output is flushed and closed.
with open('rosalind_chbp_sub.txt', 'wt') as out:
    out.write(result)