Esempio n. 1
0
    def get_dn_ds_tree(self,
                       dn_ds_method="NG86",
                       tree_method="UPGMA",
                       codon_table=default_codon_table):
        """Build one tree from the dn matrix and one from the ds matrix.

        Arguments:

            - dn_ds_method - Forwarded to ``get_dn_ds_matrix`` as ``method``;
              available methods include NG86, LWL85, YN00 and ML.
            - tree_method  - Either "UPGMA" or "NJ".
            - codon_table  - Forwarded unchanged to ``get_dn_ds_matrix``.

        Returns a ``(dn_tree, ds_tree)`` tuple.

        Raises RuntimeError when ``tree_method`` is neither "UPGMA" nor "NJ".
        """
        from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

        matrices = self.get_dn_ds_matrix(method=dn_ds_method,
                                         codon_table=codon_table)
        dn_matrix, ds_matrix = matrices

        # Each matrix gets its own constructor instance.
        dn_builder = DistanceTreeConstructor()
        ds_builder = DistanceTreeConstructor()

        if tree_method == "UPGMA":
            return dn_builder.upgma(dn_matrix), ds_builder.upgma(ds_matrix)
        if tree_method == "NJ":
            return dn_builder.nj(dn_matrix), ds_builder.nj(ds_matrix)

        message = "Unknown tree method ({0}). Only NJ and UPGMA are accepted."
        raise RuntimeError(message.format(tree_method))
Esempio n. 2
0
    def get_dn_ds_tree(self,
                       dn_ds_method="NG86",
                       tree_method="UPGMA",
                       codon_table=None):
        """Build one tree from the dn matrix and one from the ds matrix.

        Arguments:
         - dn_ds_method - Forwarded to ``get_dn_ds_matrix`` as ``method``;
           available methods include NG86, LWL85, YN00 and ML.
         - tree_method  - Either "UPGMA" or "NJ".
         - codon_table  - Forwarded to ``get_dn_ds_matrix``; when None,
           ``CodonTable.generic_by_id[1]`` is used instead.

        Returns a ``(dn_tree, ds_tree)`` tuple.

        Raises RuntimeError when ``tree_method`` is neither "UPGMA" nor "NJ".

        """
        from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

        table = CodonTable.generic_by_id[1] if codon_table is None else codon_table
        matrices = self.get_dn_ds_matrix(method=dn_ds_method, codon_table=table)
        dn_matrix, ds_matrix = matrices

        # Each matrix gets its own constructor instance.
        dn_builder = DistanceTreeConstructor()
        ds_builder = DistanceTreeConstructor()

        if tree_method == "UPGMA":
            return dn_builder.upgma(dn_matrix), ds_builder.upgma(ds_matrix)
        if tree_method == "NJ":
            return dn_builder.nj(dn_matrix), ds_builder.nj(ds_matrix)

        raise RuntimeError(
            f"Unknown tree method ({tree_method}). Only NJ and UPGMA are accepted."
        )
Esempio n. 3
0
def dna(file_path, file_format, algorithm):
    """Read an alignment, print its distance matrix and draw a tree.

    Arguments:
     - file_path   - Alignment file, passed to ``AlignIO.read``.
     - file_format - Alignment format name, passed to ``AlignIO.read``.
     - algorithm   - "upgma" or "nj" (case-insensitive).

    When ``algorithm`` is not recognised, 'Invalid algorithm!' is echoed and
    the function returns without drawing anything.
    """
    # Read the sequences and align
    aln = AlignIO.read(file_path, file_format)

    # Print the alignment
    print(aln)

    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)

    # Print the distance matrix itself (not the calculator object)
    print('\nDistance Matrix\n===================')
    print(dm)

    # Construct the phylogenetic tree using the chosen algorithm
    constructor = DistanceTreeConstructor()
    chosen = algorithm.lower()
    if chosen == 'upgma':
        tree = constructor.upgma(dm)
    elif chosen == 'nj':
        tree = constructor.nj(dm)
    else:
        click.echo('Invalid algorithm!')
        # No tree was built, so there is nothing to draw.
        return

    # Draw the phylogenetic tree
    Phylo.draw(tree)

    # Print the phylogenetic tree in the terminal
    print('\nPhylogenetic Tree\n===================')
    Phylo.draw_ascii(tree)
def build_trees(filename, tree_name):
    """Align ``<filename>.fa`` with ClustalW and plot its UPGMA and NJ trees.

    Arguments:
     - filename  - Path stem; ``<filename>.fa`` is the ClustalW input and
       ``<filename>.aln`` is read back as the clustal alignment.
     - tree_name - Text placed in front of the method name in each plot title.
    """

    def label_func(clade):
        # Clades whose name starts with "Inner" are drawn without a label.
        if clade.name.startswith("Inner"):
            return ""
        return clade

    # Run ClustalW, then load the alignment file read back from disk.
    run_clustalw = ClustalwCommandline("clustalw",
                                       infile="{}.fa".format(filename))
    run_clustalw()
    msa = AlignIO.read("{}.aln".format(filename), format="clustal")

    # Distance matrix under the blosum62 model.
    distances = DistanceCalculator('blosum62').get_distance(msa)

    # Both trees are built before anything is drawn.
    builder = DistanceTreeConstructor()
    upgma_tree = builder.upgma(distances)
    nj_tree = builder.nj(distances)

    for tree, title in ((upgma_tree, "{} × upgma"), (nj_tree, "{} × nj")):
        Phylo.draw(tree, label_func=label_func, do_show=False)
        plt.title(title.format(tree_name))
        plt.show()
Esempio n. 5
0
def make_newick_tree(dm, outgroup="KE136308.1"):
    """Build UPGMA and NJ trees from ``dm`` and root both on an outgroup.

    Arguments:
     - dm       - Distance matrix handed to ``DistanceTreeConstructor``.
     - outgroup - Name of the clade used as outgroup for both trees;
       defaults to "KE136308.1", the value that used to be hard-coded.

    Returns a ``(upgma_tree, nj_tree)`` tuple.
    """
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    njtree = constructor.nj(dm)
    upgmatree.root_with_outgroup({'name': outgroup})
    njtree.root_with_outgroup({'name': outgroup})
    return upgmatree, njtree
def buildTree(FASTAFile):
    """Build a midpoint-rooted NJ tree from a FASTA alignment.

    The alignment is read from ``FASTAFile``, an identity distance matrix is
    computed, a neighbor-joining tree is built, rooted at its midpoint and
    drawn.  The tree is then serialised as plain Newick, turned into a Python
    literal and passed through ``NewicktoRLR``, whose result is returned.
    """
    myAlignment = AlignIO.read(FASTAFile, "fasta")

    # Compute a distance matrix and construct the (neighbor-joining) tree
    calculator = DistanceCalculator("identity")
    myMatrix = calculator.get_distance(myAlignment)
    constructor = DistanceTreeConstructor()
    njTree = constructor.nj(myMatrix)
    njTree.root_at_midpoint()
    Phylo.draw(njTree)

    # Turn the tree into a nested Python structure: quote the tip names,
    # write plain Newick into a buffer, then strip what literal_eval
    # cannot parse (inner-node names and the trailing semicolon).
    for clade in njTree.get_terminals():
        clade.name = "\"" + clade.name + "\""
    buf = cStringIO.StringIO()
    Phylo.write(njTree, buf, 'newick', plain = True)
    tree = buf.getvalue()
    tree = re.sub(r'Inner\d*', '', tree)
    tree = tree.replace(";", "")
    tree = literal_eval(tree)    #newick format

    # RLR tree required for maxParsimony function
    tree = NewicktoRLR(tree)
    return tree
Esempio n. 7
0
def construct_tree(matrix, nj=True):
    """Build a tree from a distance matrix

    Can either use neighbor-joining (nj) or UPGMA.

    Arguments:
     - matrix - Non-empty list of rows, passed to ``_DistanceMatrix``; the
       taxa are named "0", "1", ... by row index.
     - nj     - Use neighbor-joining when true, UPGMA otherwise.

    Returns the tree with all non-terminal clade names blanked, or None
    (after printing a message) when ``matrix`` is not a non-empty list.
    """

    if not (isinstance(matrix, list) and matrix):
        # print() with one argument behaves the same on Python 2 and 3.
        print("matrix has invalid value")
        return

    dm = _DistanceMatrix(names=[str(i) for i in range(len(matrix))],
                         matrix=matrix)

    constructor = DistanceTreeConstructor()
    if nj:
        tree = constructor.nj(dm)
    else:
        tree = constructor.upgma(dm)

    # this will remove the names from the inner nodes
    # this is critical for seq-gen to read in the tree
    for clade in tree.get_nonterminals():
        clade.name = ''

    return tree
Esempio n. 8
0
def main():
    """Time neighbor joining and UPGMA on one FASTA alignment and draw both.

    Reads ``data/coding.fa``, builds an identity distance matrix, then builds,
    times and draws the NJ tree followed by the UPGMA tree.
    """
    # Alternative input: "data/cons_noncode.fa"
    file_name = "data/coding.fa"

    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    for record in SeqIO.parse(file_name, "fasta"):
        alignment.extend([record])

    print("Number of characters in alignment:", len(alignment[0]))

    # Both methods share one distance matrix and one constructor.
    dm = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()

    # --- Neighbor joining ---
    began = time.time()
    nj_tree = constructor.nj(dm)
    finished = time.time()
    print("Neighbor joining ran in {} seconds.".format(finished - began))
    Phylo.draw(nj_tree, label_func=get_label)

    # --- UPGMA ---
    began = time.time()
    upgma_tree = constructor.upgma(dm)
    finished = time.time()
    print("UPGMA ran in {} seconds.".format(finished - began))
    Phylo.draw(upgma_tree, label_func=get_label)
Esempio n. 9
0
    def distance_matrix(cls, cluster_list):
        """Return a Newick NJ tree for the accessions in ``cluster_list``.

        Distances are looked up in ``Distance`` rows whose two accession
        fields are both in ``cluster_list``; a pair may be stored in either
        order.  A lower-triangular matrix (zero diagonal) is assembled, a
        neighbor-joining tree is built and ladderized, and its Newick text is
        returned.

        Raises ValueError when no stored distance exists for a pair.
        """
        print(cluster_list)
        dists = Distance.objects.filter(rep_accnum1__in=cluster_list, rep_accnum2__in=cluster_list)

        distance_pairs = {g.rep_accnum1 + '_' + g.rep_accnum2: g.distance for g in dists.all()}

        matrix = []
        for i in range(0, len(cluster_list)):
            matrix_iteration = []
            for j in range(0, i):
                forward = cluster_list[i] + '_' + cluster_list[j]
                backward = cluster_list[j] + '_' + cluster_list[i]
                if forward in distance_pairs:
                    matrix_iteration.append(distance_pairs[forward])
                elif backward in distance_pairs:
                    matrix_iteration.append(distance_pairs[backward])
                else:
                    # Raising a bare string is itself a TypeError; raise a
                    # real exception that names the missing pair.
                    raise ValueError("Error, can't find pair! ({0}, {1})".format(
                        cluster_list[i], cluster_list[j]))
            # Diagonal entry: distance of an item to itself.
            matrix_iteration.append(0)
            matrix.append(matrix_iteration)

        cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
        matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
        constructor = DistanceTreeConstructor()
        tree = constructor.nj(matrix_obj)
        tree.ladderize()
        output = StringIO.StringIO()
        Phylo.write(tree, output, 'newick')
        tree_str = output.getvalue()

        return tree_str
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        # Close the alignment file as soon as it has been parsed.
        with open('TreeConstruction/msa.phy') as handle:
            self.aln = AlignIO.read(handle, 'phylip')
        calculator = DistanceCalculator('blosum62')
        self.dm = calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(calculator)

    def _assert_matches_reference(self, tree, ref_path):
        """Check that ``tree`` in Newick equals the first line of ``ref_path``."""
        self.assertIsInstance(tree, BaseTree.Tree)
        tree_file = StringIO.StringIO()
        Phylo.write(tree, tree_file, 'newick')
        # The with-block closes the reference file even if the assertion fails.
        with open(ref_path) as ref_tree:
            self.assertEqual(tree_file.getvalue(), ref_tree.readline())

    def test_upgma(self):
        tree = self.constructor.upgma(self.dm)
        self._assert_matches_reference(tree, './TreeConstruction/upgma.tre')

    def test_nj(self):
        tree = self.constructor.nj(self.dm)
        self._assert_matches_reference(tree, './TreeConstruction/nj.tre')

    def test_built_tree(self):
        tree = self.constructor.build_tree(self.aln)
        self._assert_matches_reference(tree, './TreeConstruction/nj.tre')
Esempio n. 11
0
    def summarise_dist(self, rf_results: RfResults, dir_out):
        """Write an NJ tree and a clustered heatmap for the RF distances.

        ``rf_results.data`` is read as a mapping of ``(tid_a, tid_b)`` to
        ``(rf, norm_rf)``.  The work is done twice, first with the normalised
        values and then with the raw ones; each pass writes a Newick tree and
        an SVG heatmap into ``dir_out``.
        """
        # (use normalised value?, tree file, heatmap file, heatmap title)
        variants = (
            (True, 'rf_normed.tree', 'rf_normed_heatmap.svg',
             'Normalised Robinson-Foulds Distance'),
            (False, 'rf_un_normed.tree', 'rf_un_normed_heatmap.svg',
             '(un)Normalised Robinson-Foulds Distance'),
        )

        for use_norm, tree_file, heatmap_file, plt_title in variants:
            path_out = os.path.join(dir_out, tree_file)
            path_hm = os.path.join(dir_out, heatmap_file)

            # Symmetric lookup of the chosen metric, plus the set of tree ids.
            metrics = defaultdict(dict)
            names = set()
            for (tid_a, tid_b), (rf, norm_rf) in rf_results.data.items():
                value = norm_rf if use_norm else rf
                metrics[tid_a][tid_b] = value
                metrics[tid_b][tid_a] = value
                names.update((tid_a, tid_b))

            labels = sorted(names)
            size = len(labels)

            # Lower-triangular rows for the tree builder and, in parallel,
            # the lower triangle of a dense array for the heatmap.
            mat_vals = []
            mat = np.zeros((size, size))
            for i, tid_a in enumerate(labels):
                cur_row = []
                for j, tid_b in enumerate(labels[:i + 1]):
                    if tid_a == tid_b:
                        cur_row.append(0.0)
                        continue
                    cur_row.append(metrics[tid_a][tid_b])
                    mat[i, j] = metrics[tid_a][tid_b]
                mat_vals.append(cur_row)
            # Mirror the lower triangle into the upper one.
            mat = mat + mat.T

            # Newick
            dm = DistanceMatrix(names=labels, matrix=mat_vals)
            tree = DistanceTreeConstructor().nj(dm)
            Phylo.write(tree, path_out, 'newick')

            # Heatmap
            cmap = sns.cubehelix_palette(100, reverse=True)
            sns.set(font_scale=1)
            fig_size = (15, 15)

            rf_df = pd.DataFrame(mat, columns=labels, index=labels)
            grid = sns.clustermap(rf_df,
                                  annot=True,
                                  fmt='.3f',
                                  cmap=cmap,
                                  figsize=fig_size)
            grid.fig.suptitle(plt_title)
            plt.savefig(path_hm)
Esempio n. 12
0
def tree_reconstruction(phy_file, method, model, phyformat):
    """Construct a tree with the given method and model and save it.

    Arguments:
     - phy_file  - PHYLIP alignment to read.
     - method    - 'upgma' or 'nj'.
     - model     - Model name passed to ``DistanceCalculator``.
     - phyformat - Suffix appended to 'phylip-' to form the format name.

    The ladderized tree is written to ``<args.output>/tree.nwk`` and drawn to
    ``<args.output>/tree.svg`` (``args`` is a module-level name).

    Raises ValueError when ``method`` is neither 'upgma' nor 'nj'.
    """
    # Fail early with a clear message instead of hitting an unbound ``tree``.
    if method not in ('upgma', 'nj'):
        raise ValueError(
            "Unknown method {!r}; expected 'upgma' or 'nj'.".format(method))

    aln = AlignIO.read(phy_file, 'phylip-' + phyformat)

    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator(model)
    dm = calculator.get_distance(aln)

    if method == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)

    tree.ladderize()

    # Blank every clade name containing 'Inner'; unnamed clades are skipped.
    for c in tree.find_clades():
        if c.name and 'Inner' in c.name:
            c.name = ''

    Phylo.write(tree, args.output + '/tree.nwk', 'newick')

    plt.rcParams['font.style'] = 'italic'
    plt.rc('font', size=8)
    plt.rc('axes', titlesize=14)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('figure', titlesize=18)

    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
Esempio n. 13
0
def get_tree(aln, kind='nj'):
    """Return the identity distance matrix of ``aln`` and a tree built on it.

    Arguments:
     - aln  - Alignment passed to ``DistanceCalculator.get_distance``.
     - kind - 'upgma' (case-insensitive) selects UPGMA; any other value,
       including the default 'nj', selects neighbor joining.

    Returns a ``(distance_matrix, tree)`` tuple.
    """
    from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    # ``kind`` used to be ignored; honour it, keeping NJ as the fallback.
    if str(kind).lower() == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    return dm, tree
Esempio n. 14
0
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
        blosum_calculator = DistanceCalculator('blosum62')
        self.dm = blosum_calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum_calculator)

    def _check_against_reference(self, tree, reference_path):
        """Assert ``tree`` is a Tree with the topology stored at ``reference_path``."""
        self.assertTrue(isinstance(tree, BaseTree.Tree))
        expected = Phylo.read(reference_path, 'newick')
        self.assertTrue(Consensus._equal_topology(tree, expected))

    def test_upgma(self):
        self._check_against_reference(self.constructor.upgma(self.dm),
                                      './TreeConstruction/upgma.tre')

    def test_nj(self):
        self._check_against_reference(self.constructor.nj(self.dm),
                                      './TreeConstruction/nj.tre')

    def test_built_tree(self):
        # build_tree starts from the alignment rather than the matrix.
        self._check_against_reference(self.constructor.build_tree(self.aln),
                                      './TreeConstruction/nj.tre')
Esempio n. 15
0
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
        blosum_calculator = DistanceCalculator('blosum62')
        self.dm = blosum_calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum_calculator)

    def _check_against_reference(self, tree, reference_path):
        """Assert ``tree`` is a Tree with the topology stored at ``reference_path``."""
        self.assertTrue(isinstance(tree, BaseTree.Tree))
        expected = Phylo.read(reference_path, 'newick')
        self.assertTrue(Consensus._equal_topology(tree, expected))

    def test_upgma(self):
        self._check_against_reference(self.constructor.upgma(self.dm),
                                      './TreeConstruction/upgma.tre')

    def test_nj(self):
        self._check_against_reference(self.constructor.nj(self.dm),
                                      './TreeConstruction/nj.tre')

    def test_built_tree(self):
        # build_tree starts from the alignment rather than the matrix.
        self._check_against_reference(self.constructor.build_tree(self.aln),
                                      './TreeConstruction/nj.tre')
def nj_tree_constructor(x):
    """Build an NJ tree from alignment ``x`` and print it twice.

    The tree is built on an identity distance matrix, printed with ``print``
    and then rendered with ``Phylo.draw_ascii``.  Nothing is returned.
    """
    tree_builder = DistanceTreeConstructor()
    identity_distances = DistanceCalculator("identity").get_distance(x)
    tree = tree_builder.nj(identity_distances)
    print(tree)
    Phylo.draw_ascii(tree)
Esempio n. 17
0
    def get_tree(self,
                 chrom,
                 start=1,
                 end=None,
                 samples=None,
                 return_format="tree_obj"):
        """Build a neighbour-joining tree for a region.

        ``chrom``, ``start``, ``end`` and ``samples`` are forwarded to
        ``self.get_matrix`` (called with ``return_format="Phylo"``), whose
        ``(names, matrix)`` result feeds the tree constructor.

        Returns the tree object when ``return_format`` is "tree_obj", the
        stripped Newick string when it is "newick", and None otherwise.
        """
        summary = "chrom: {} start: {} end: {} samples: {}"
        print(summary.format(chrom, start, end, samples))

        names, matrix = self.get_matrix(chrom,
                                        start=start,
                                        end=end,
                                        samples=samples,
                                        return_format="Phylo")
        dm = _DistanceMatrix(names, matrix)
        nj_tree = DistanceTreeConstructor().nj(dm)

        if return_format == "tree_obj":
            return nj_tree
        if return_format == "newick":
            newick_buffer = StringIO()
            Phylo.write(nj_tree, newick_buffer, "newick")
            return newick_buffer.getvalue().strip()
        # Any other format name yields None.
        return None
Esempio n. 18
0
def construct_tree(gene_name, with_marburg=1, algorithm='UPGMA'):
    """Build a tree for ``gene_name`` from its stored edit-distance matrix.

    Arguments:
     - gene_name    - Gene whose CSV under ./Output/edit_matrices/ is read.
     - with_marburg - When equal to 1 the five-virus set is used; any other
       value adds Marburg and aligns its genome first.
       NOTE(review): that reads as inverted relative to the parameter name --
       confirm against the callers before changing it.
     - algorithm    - 'NJ' selects neighbor joining; anything else UPGMA.

    The tree is passed to ``save_tree`` and then returned.
    """
    names = ['Bundibugyo', 'Reston', 'Sudan', 'TaiForest', 'Zaire']
    if with_marburg == 1:
        print('Constructing Tree with All Viruses without Marburg')
        filename = algorithm + '_' + gene_name
    else:
        print("Constructing {0}'s Tree with All Viruses with Marburg".format(gene_name))
        filename = algorithm + '_' + gene_name + '_with_Marburg'
        names.append('Marburg')
        marburg_genome = SeqIO.read("./Data/Marburg_genome.fasta", "fasta")
        Alignment.read_data()
        print('Aligning Genes for marburg_genome')
        # From here on the CSV with the Marburg suffix is the one to read.
        gene_name += '_with_marburg'
        Alignment.read_genes(marburg_genome)

    print('Reading edit matrix and construct tree')
    csv_path = "./Output/edit_matrices/" + gene_name + ".csv"
    raw_matrix = pd.read_csv(csv_path, header=None)
    tree_builder = DistanceTreeConstructor()
    lower_triangular = convert_tu_lower_triangular(raw_matrix)
    distance_matrix = DistanceMatrix(names=names, matrix=lower_triangular)

    build = tree_builder.nj if algorithm == 'NJ' else tree_builder.upgma
    tree = build(distance_matrix)

    save_tree(tree, filename)
    return tree
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor."""

    def setUp(self):
        self.aln = AlignIO.read("TreeConstruction/msa.phy", "phylip")
        blosum_calculator = DistanceCalculator("blosum62")
        self.dm = blosum_calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum_calculator)

    def _check_against_reference(self, tree, reference_path):
        """Assert ``tree`` is a Tree with the topology stored at ``reference_path``."""
        self.assertIsInstance(tree, BaseTree.Tree)
        expected = Phylo.read(reference_path, "newick")
        self.assertTrue(Consensus._equal_topology(tree, expected))

    def test_upgma(self):
        self._check_against_reference(self.constructor.upgma(self.dm),
                                      "./TreeConstruction/upgma.tre")

    def test_nj(self):
        self._check_against_reference(self.constructor.nj(self.dm),
                                      "./TreeConstruction/nj.tre")

        # Shrink a fresh matrix to two entries by repeatedly deleting the
        # last one, then make sure NJ still copes with the minimal case.
        self.min_dm = DistanceCalculator("blosum62").get_distance(self.aln)
        while len(self.min_dm) > 2:
            del self.min_dm[len(self.min_dm) - 1]

        self._check_against_reference(self.constructor.nj(self.min_dm),
                                      "./TreeConstruction/nj_min.tre")

    def test_built_tree(self):
        # build_tree starts from the alignment rather than the matrix.
        self._check_against_reference(self.constructor.build_tree(self.aln),
                                      "./TreeConstruction/nj.tre")
Esempio n. 20
0
def get_tree():
    """Return the NJ tree of the clustal alignment stored in ``agc.aln``.

    Distances are computed with the 'identity' model.
    """
    alignment = AlignIO.read('agc.aln', 'clustal')
    identity_distances = DistanceCalculator('identity').get_distance(alignment)
    return DistanceTreeConstructor().nj(identity_distances)
Esempio n. 21
0
 def build_nj_tree(self):
     """Return the NJ tree of ``self.distance_matrix()`` as Newick text."""
     matrix = self.distance_matrix()
     nj_tree = DistanceTreeConstructor().nj(matrix)
     # Serialise into an in-memory buffer and hand back its contents.
     newick_buffer = StringIO.StringIO()
     Phylo.write(nj_tree, newick_buffer, 'newick')
     newick_text = newick_buffer.getvalue()
     newick_buffer.close()
     return newick_text
Esempio n. 22
0
def build_tree(aln, kind='nj'):
    """Build a tree with bio.phylo module

    Arguments:
     - aln  - Alignment passed to ``DistanceCalculator.get_distance``.
     - kind - 'upgma' (case-insensitive) selects UPGMA; any other value,
       including the default 'nj', selects neighbor joining.

    Returns a ``(distance_matrix, tree)`` tuple; distances use the
    'identity' model.
    """

    from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    # ``kind`` used to be ignored; honour it, keeping NJ as the fallback.
    if str(kind).lower() == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    return dm, tree
Esempio n. 23
0
 def build_nj_tree(self):
     """Return the NJ tree of ``self.distance_matrix()`` as Newick text."""
     matrix = self.distance_matrix()
     nj_tree = DistanceTreeConstructor().nj(matrix)
     # Serialise into an in-memory buffer and hand back its contents.
     newick_buffer = StringIO.StringIO()
     Phylo.write(nj_tree, newick_buffer, 'newick')
     newick_text = newick_buffer.getvalue()
     newick_buffer.close()
     return newick_text
Esempio n. 24
0
def print_trees(country, position_table):
    """Draw NJ and UPGMA trees for the ten most divergent sequences.

    Arguments:
     - country        - Label printed (upper-cased) above the trees.
     - position_table - DataFrame with a 'seqid' column; every other column
       is compared value-by-value between rows.

    The consensus is the per-column mode.  The ten rows differing from it in
    the most columns are selected, their pairwise mismatch counts form the
    distance matrix, and the resulting trees are displayed.
    """
    # Consensus sequence: first mode of every non-id column.
    consensus = position_table.drop('seqid', axis=1).mode(axis=0).T[0]

    position_table = position_table.set_index('seqid')

    # Rank the samples by how many positions differ from the consensus.
    mismatches = position_table.apply(
        lambda row: sum(row != consensus), axis=1)
    ranked = mismatches.sort_values(ascending=False)

    # Keep the ten most divergent sequences.
    subset_seqs = ranked[:10].index

    # Symmetric pairwise mismatch counts, keyed by (seqid, seqid).
    pair_counts = {}
    for i, seqid1 in enumerate(subset_seqs):
        pair_counts[seqid1, seqid1] = 0
        for seqid2 in subset_seqs[i + 1:]:
            differing = sum(
                position_table.loc[seqid1] != position_table.loc[seqid2])
            pair_counts[seqid1, seqid2] = differing
            pair_counts[seqid2, seqid1] = differing
    distance_table = pd.Series(pair_counts).unstack()

    # Lower-triangular row lists, each row truncated at its diagonal entry.
    full_rows = np.tril(distance_table.values).tolist()
    matrix = [row[:i + 1] for i, row in enumerate(full_rows)]
    dm = DistanceMatrix(list(distance_table.index), matrix)

    # Neighbor-joining tree.
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    print(country.upper())
    print("Neighbor Joining Tree")
    tree.ladderize()  # Flip branches so deeper clades are displayed at top
    display(Phylo.draw(tree))

    # UPGMA tree, only when there is more than one sequence.
    if len(dm) > 1:
        tree2 = constructor.upgma(dm)
        print("UPGMA Tree")
        tree2.ladderize()  # Flip branches so deeper clades are displayed at top
        display(Phylo.draw(tree2))
Esempio n. 25
0
def build_tree_NJ(msa, distanceMatrix=None):
    """Return an NJ tree as a Newick string prefixed with "[&R] ".

    Arguments:
     - msa            - Alignment; only used when no usable matrix is given.
     - distanceMatrix - Precomputed distance matrix.  When it is falsy, an
       identity distance matrix is computed from ``msa`` instead.

    The tree itself is not re-rooted here; only the prefix is added.
    """
    matrix = distanceMatrix
    if not matrix:
        matrix = DistanceCalculator("identity").get_distance(msa)

    nj_tree = DistanceTreeConstructor().nj(matrix)
    newick_text = nj_tree.format("newick").strip()
    return "[&R] " + newick_text
Esempio n. 26
0
def generar_arbol(file, indice):
    """Build an NJ tree from a clustal alignment and save it as a PNG.

    Arguments:
     - file   - Path of the clustal alignment file.
     - indice - String appended to the output file name.

    The image is written to
    ``./static/assets/arbol_filogenetico<indice>.png``.
    """
    with open(file, "r") as aln:
        alineamiento = AlignIO.read(aln, "clustal")

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(alineamiento)

    constructor = DistanceTreeConstructor(calculator)
    nj = constructor.nj(dm)  # Neighbor Joining
    # Draw without showing: on interactive backends show() blocks and the
    # figure is gone afterwards, so savefig would write an empty image.
    Phylo.draw(nj, do_show=False)
    path = './static/assets/arbol_filogenetico' + indice + '.png'
    pylab.savefig(path, format='png')
Esempio n. 27
0
 def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA"):
     """Construct a dn tree and a ds tree.

     Arguments:
         -   dn_ds_method - Forwarded to ``get_dn_ds_matrix`` as ``method``;
                            available methods include NG86, LWL85, YN00
                            and ML.
         -   tree_method  - Available methods include UPGMA and NJ.

     Returns a ``(dn_tree, ds_tree)`` tuple.

     Raises RuntimeError when ``tree_method`` is neither "UPGMA" nor "NJ".
     """
     from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
     dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method)
     dn_constructor = DistanceTreeConstructor()
     ds_constructor = DistanceTreeConstructor()
     if tree_method == "UPGMA":
         dn_tree = dn_constructor.upgma(dn_dm)
         ds_tree = ds_constructor.upgma(ds_dm)
     elif tree_method == "NJ":
         dn_tree = dn_constructor.nj(dn_dm)
         ds_tree = ds_constructor.nj(ds_dm)
     else:
         # Message spelling fixed ("Unkown" -> "Unknown").
         raise RuntimeError("Unknown tree method ({0}). Only NJ and UPGMA "
                            "are accepted.".format(tree_method))
     return dn_tree, ds_tree
Esempio n. 28
0
def createNJPhyloTree(align, distanceModel="identity", alignName="anonymous"):
    """Return the neighbor-joining tree of ``align``.

    Arguments:
     - align         - Alignment whose pairwise distances are computed.
     - distanceModel - Model name passed to ``DistanceCalculator``.
     - alignName     - Label used only in the progress messages.
    """
    matrix_message = ("[INFO] Calculating distance matrix for {} alignment "
                      "and {} distance model")
    print(matrix_message.format(alignName, distanceModel))
    distances = DistanceCalculator(distanceModel).get_distance(align)

    tree_message = "[INFO] Constructing NJ phylogenetic tree for {} alignment"
    print(tree_message.format(alignName))
    return DistanceTreeConstructor().nj(distances)
Esempio n. 29
0
def construct_tree(X_2d, acc, title):
    """Write the NJ tree of the rows of ``X_2d`` to ``<title>.nwk``.

    Arguments:
     - X_2d  - Input handed to ``pairwise_distances``.
     - acc   - Iterable of names, one per row, used as the taxon labels.
     - title - Output path without the ".nwk" extension.

    NaN distances are replaced by 0 before the tree is built.
    """
    labels = list(acc)
    distances = pairwise_distances(X_2d).astype('float')
    distances[np.isnan(distances)] = 0

    # Row i keeps columns 0..i (lower triangle including the diagonal).
    lower_rows = [list(distances[i, :i + 1])
                  for i in range(distances.shape[0])]

    dm = _DistanceMatrix(labels, matrix=lower_rows)
    nj_tree = DistanceTreeConstructor().nj(dm)
    Phylo.write(nj_tree, title + ".nwk", 'newick')
Esempio n. 30
0
def GenerarArbol():
    """Build and display a phylogenetic tree for ``protsec.aln``.

    The clustal alignment is read, its identity distance matrix is printed,
    and a tree is built with neighbor joining and shown both as ASCII art
    and as a plot.

    NOTE(review): the original comments and variable name spoke of UPGMA,
    but the code has always called ``nj``; the behaviour is left unchanged --
    confirm which method was intended.
    """
    alineamientos = AlignIO.read("protsec.aln","clustal")

    # Compute the distance matrix.
    calculo_matriz = DistanceCalculator('identity')

    matriz_distancia = calculo_matriz.get_distance(alineamientos)
    # print() with one argument behaves the same on Python 2 and 3.
    print(matriz_distancia)
    # Build the tree (neighbor joining).
    creador_arbol = DistanceTreeConstructor()
    arbol_nj = creador_arbol.nj(matriz_distancia)
    Phylo.draw_ascii(arbol_nj)
    Phylo.draw(arbol_nj)
Esempio n. 31
0
def create_NJ_tree(alignment):
    """Build, print and return the neighbor-joining tree of ``alignment``.

    Distances are computed with the 'identity' model; the tree is rendered
    as ASCII art before being returned.
    """
    # Import Phylo library for tree constructor and draw methods
    from Bio import Phylo
    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
    # Create simpler names for tree-constructing methods
    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator('identity')
    # Calculate the distances between the sequences in alignment
    dists = calculator.get_distance(alignment)
    # Create the phylo tree
    tree = constructor.nj(dists)
    # draw_ascii writes the tree itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    Phylo.draw_ascii(tree)
    # Return the phylo tree
    return tree
Esempio n. 32
0
def nj_tree(fichero_clw):
    """Read an alignment file, print its distance matrix and draw an NJ tree.

    NOTE(review): the parameter name and the original comment suggest a
    ClustalW file, yet the parser is told the data is "fasta" -- confirm
    which format the callers actually supply.
    """
    with open(fichero_clw, "r") as handle:
        # Parse the alignment from the open file.
        alignment = AlignIO.read(handle, "fasta")

    # Identity-based distance matrix.
    identity_calculator = DistanceCalculator('identity')
    distances = identity_calculator.get_distance(alignment)
    print(distances)

    # The constructor is initialised with the same calculator object.
    tree_builder = DistanceTreeConstructor(identity_calculator)
    tree = tree_builder.nj(distances)
    Phylo.draw(tree)
Esempio n. 33
0
def build_guide_trees(distance_matrix):
    """Build a neighbour-joining guide tree and write it to ``njtree.dnd``.

    ``distance_matrix`` is indexed as ``distance_matrix[i, :i + 1]``, i.e. it
    is expected to support 2-D slicing whose result has ``tolist()``
    (presumably a NumPy array -- TODO confirm).  Taxa are named "S0", "S1",
    ... by row index.

    NOTE(review): the remainder of this function is not visible in this
    excerpt, so only the neighbour-joining part is described here.
    """
    # build distance matrix biopython object
    # (row i keeps columns 0..i: the lower triangle including the diagonal)
    matrix = [distance_matrix[i, :i + 1].tolist() for i in range(len(distance_matrix))]
    names = ['S' + str(i) for i in range(len(distance_matrix))]
    dm = _DistanceMatrix(names, matrix)
    print('Constructed matrix')
    constructor = DistanceTreeConstructor()

    # construct neighbour joining tree
    t = time.time()
    tree = constructor.nj(dm)
    print('Constructed nj tree in {:.4f}s'.format(time.time() - t))
    # Write the tree as Newick, then post-process the file in place with the
    # project helper remove_inner_nodes_tree.
    Phylo.write(tree, "njtree.dnd", "newick")
    remove_inner_nodes_tree("njtree.dnd")

    """
Esempio n. 34
0
def ex01():
    """Draw the UPGMA and NJ trees of every alignment from ``get_alignments``.

    Each item is unpacked as ``(alignment, name)``; distances use the
    'blosum62' model.
    """
    named_alignments = get_alignments()
    blosum_calculator = DistanceCalculator('blosum62')
    tree_builder = DistanceTreeConstructor()

    for alignment, name in named_alignments:
        distances = blosum_calculator.get_distance(alignment)
        # Both trees are built before the header is printed.
        trees = (tree_builder.upgma(distances), tree_builder.nj(distances))

        print("\n\n>>> {}".format(name))
        for tree in trees:
            draw(tree)
Esempio n. 35
0
    def test_correct_res(self):
        """The fitted tree must be isomorphic to Biopython's NJ tree."""
        # Tree under test, fitted on the CSV distance matrix.
        csv_distances = pd.read_csv("data/wiki_tree.csv", index_col=0)
        self.tree.set_distance_matrix(csv_distances)
        self.tree.fit()

        # Reference tree built by the library from a five-taxon matrix.
        taxa = ['a', 'b', 'c', 'd', 'e']
        lower_triangle = [[0], [5, 0], [9, 10, 0],
                          [9, 10, 8, 0], [8, 9, 7, 3, 0]]
        reference_dm = _DistanceMatrix(names=taxa, matrix=lower_triangle)
        lib_tree = DistanceTreeConstructor().nj(reference_dm)

        expected_graph = Phylo.to_networkx(lib_tree).to_undirected()
        actual_graph = Phylo.to_networkx(self.tree.get_tree()).to_undirected()
        self.assertTrue(is_isomorphic(expected_graph, actual_graph))
def run_optimization():
    """Compute pairwise curve distances, plot them and print two trees.

    The parameter sets returned by ``get_data()`` are compared pairwise with
    ``compare_curves``; the absolute value of each result fills a symmetric
    distance matrix. The matrix is shown as an image, then a UPGMA and an NJ
    tree are built from it and printed as ASCII art.

    Returns:
        The ``num_samples`` x ``num_samples`` numpy array of distances.
    """
    params = get_data()

    # One label per sample; the number of samples is derived from the labels
    # so that the matrix size and the label list can never disagree.
    sample_names = ['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'b3', 'b4',
                    'c1', 'c2', 'c3', 'c4', 'd1', 'd2', 'd3', 'd4']
    num_samples = len(sample_names)

    NUM_OF_VERTICES = 200

    # Fill the upper triangle and mirror it; the diagonal stays zero.
    distances = np.zeros((num_samples, num_samples))
    for i in range(num_samples):
        for j in range(i + 1, num_samples):
            print("working on the pair", (i, j))
            distances[i, j] = np.abs(compare_curves(params[i], params[j], num_of_verts=NUM_OF_VERTICES))
            distances[j, i] = distances[i, j]

    # Plot distance matrix and make phylogenetic tree
    plt.matshow(distances)
    plt.colorbar()
    # The original referenced plt.show without calling it, so the figure was
    # never displayed.
    plt.show()

    # Biopython expects the lower triangle (diagonal included) as row lists.
    distaceMat = [list(distances[i, :i + 1]) for i in range(num_samples)]

    distaceMatrix = DistanceMatrix(names=sample_names, matrix=distaceMat)

    constructor = DistanceTreeConstructor()

    tree_up = constructor.upgma(distaceMatrix)

    tree_nj = constructor.nj(distaceMatrix)

    Phylo.draw_ascii(tree_nj)

    Phylo.draw_ascii(tree_up)

    return distances
Esempio n. 37
0
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
	"""Draw the reconstructed abundances as bubbles on a neighbour-joining tree.

	A distance matrix is derived from the two normalised ckm matrices, an NJ
	tree is built from it, hypothetical internal nodes are inserted for every
	positive entry of x beyond the first len(org_names) ones, and the tree is
	rendered to an image and exported as PhyloXML.

	Arguments:
		x          - abundance vector; it is normalised to sum to 1 here. Entry
		             i + k*len(org_names) belongs to organism i (k=0: the leaf
		             itself, k=1..9: the k-th cutoff), so it presumably has
		             10*len(org_names) entries -- TODO confirm.
		org_names  - list of organism names. NOTE: modified in place, duplicate
		             names get a "_<n>" suffix.
		ckm30, ckm50 - square matrices supporting [i,j] indexing, np.diag and
		             .shape (presumably numpy arrays).
		outgroup   - name of the leaf to root the tree on; if it is not among
		             the names a warning is printed and no outgroup is set.
		outfile    - path the tree image is rendered to.
		outfilexml - path the PhyloXML export is written to.
		sum_x      - value shown (first 8 characters of its str) in the legend.
	"""
	#Make sure names are unique.
	#NOTE: names is the same list object as org_names, so the renaming below is
	#visible through org_names too; the code further down relies on org_names
	#matching the leaf names of the tree built from names.
	names = org_names
	for name in names:
		if names.count(name)>1:
			temp_name = name
			i=1
			for dummy in range(0,names.count(name)-1): #Don't change the last one, just to make sure we don't conflict with the outgroup
				names[names.index(temp_name)] = temp_name + "_" + str(i)
				i = i +1

	#Normalize the x vector. A real list is needed (x is sliced and indexed
	#below, which a Python 3 map object does not support) and the total is
	#computed once instead of once per element.
	total_x = sum(x)
	x = [y/total_x for y in x]
	#Scale column j of each ckm matrix by 1/diagonal[j]
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	#Lower-triangular distance matrix: average over both ckm matrices of
	#1 minus the symmetrised similarity
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Make the list of distances (ave of the two ckm matrices)
	ckm_ave_train = .5*ckm30_norm+.5*ckm50_norm
	ckm_ave_train_dist = dict()
	for i in range(len(org_names)):
		ckm_ave_train_dist[org_names[i]] = [.5*ckm_ave_train[i,j]+.5*ckm_ave_train[j,i] for j in range(len(org_names))]

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	#Hand the Biopython tree over to ete via its newick representation
	t=Tree(tree.format('newick'),format=1)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
		#Split the branch: parent --dist_along--> new node --remainder--> old node
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
	def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
		#A list (not a lazy map) because .index() is used on it below
		dists = [abs(y-percent) for y in ckm_ave_train_dist[leaf_name]]
		nearby_indicies = list()
		#Add all the organisms that are within 0.05 of the given percent
	#	for i in range(len(dists)):
	#		if dists[i]<=.05:
	#			nearby_indicies.append(i)
		nearby_names = list()
		#If there are no nearby indicies, add the closest organism to the given percent
		if nearby_indicies==[]:
			nearby_names.append(org_names[dists.index(min(dists))])
		else:
			#Only reachable if the loop above is re-enabled; use the stored
			#organism index, not the position inside nearby_indicies.
			for idx in nearby_indicies:
				nearby_names.append(org_names[idx])
		mean_dist = np.mean([ckm_ave_train_dist[leaf_name][org_names.index(y)] for y in nearby_names])
		nearby_names.append(leaf_name)
		LCA = t.get_common_ancestor(nearby_names)
		LCA_to_leaf_dist = t.get_distance(LCA,leaf_name)
		#divide the dist to the right/left of the LCA node by the number of percentage points in there
		if LCA.name==t.name:
			percent_dist = percent*LCA_to_leaf_dist
			if mean_dist <= percent:
				child_node = (t&leaf_name)
			else:
				child_node = (t&nearby_names[0])#This means "go up from root" in the direction of the nearest guy
			ancestor_node = (t&child_node.name).up
		elif mean_dist <= percent:
			percent_dist = t.get_distance(LCA) + abs(percent-mean_dist)*(LCA_to_leaf_dist)/(1-mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		else:
			percent_dist = t.get_distance(LCA) - abs(percent-mean_dist)*(t.get_distance(LCA))/(mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		#Walk towards the root until the ancestor is no further from it than percent_dist
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Set outgroup
	if outgroup in names:
		t.set_outgroup(t&outgroup) #I will need to check that this outgroup is actually one of the names...
	else:
		print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
		print(names)
		print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

	#Insert hypothetical nodes
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	#Map every nominal cutoff through a fixed cubic polynomial
	cutoffs = [-.5141*(val**3)+1.0932*(val**2)+0.3824*val for val in cutoffs]
	for i in range(len(org_names)):
		#Entries of x belonging to organism i: index 0 is the leaf, 1.. are the cutoffs
		xi = x[i:len(x):len(org_names)]
		for j in range(1,len(cutoffs)+1):
			if xi[j]>0:
				insert_hyp_node(t, org_names[i], cutoffs[j-1],ckm_ave_train_dist, org_names)
				hyp_node_names[org_names[i]+"_"+str(cutoffs[j-1])] = [org_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names

	size_factor=250
	font_size=55

	#Now put the bubbles on the nodes
	def layout(node):
		node_style = NodeStyle()
		node_style["hz_line_width"] = 10
		node_style["vt_line_width"] = 10
		node.set_style(node_style)
		if node.is_leaf():
			if node.name in org_names:
				#make reconstructed bubble
				size = x[org_names.index(node.name)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#Denote that this was a training organism
				nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			if node_base_name in org_names:
				idx = hyp_node_names[node.name][2]
				size = x[org_names.index(node_base_name)+(idx+1)*len(org_names)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#This is if I want the names of the hypothetical nodes to be printed as well
				#nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				#faces.add_face_to_node(nameFace, node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r"
	#ts.mode = "c"
	ts.scale = 2*1000
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50
	F = CircleFace(radius=.87*size_factor, color="RoyalBlue", style="sphere")
	F.border.width = None
	F.opacity = 0.6
	ts.legend.add_face(F,0)
	ts.legend.add_face(TextFace("  Inferred relative abundance",fsize=1.5*font_size,fgcolor="Blue"),1)
	ts.legend.add_face(TextFace("  Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5*font_size,fgcolor="Black"),1)
	ts.legend_position=4
	#t.show(tree_style=ts)
	t.render(outfile, w=550, units="mm", tree_style=ts)

	#Render the XML file; the with-block makes sure the file is flushed and closed
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	project.add_phylogeny(phylo)
	with open(outfilexml,'w') as xml_handle:
		project.export(xml_handle)
Esempio n. 38
0
# CAGTTCGCCACAA Gamma

# Several things can be done with the alignment, e.g. get a distance matrix from it:
dstcalc = DistanceCalculator('identity')
dm = dstcalc.get_distance(aln)
# DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon'], matrix=[[0], [0.23076923076923073, 0], [0.3846153846153846, 0.23076923076923073, 0], [0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0], [0.6153846153846154, 0.3846153846153846, 0.46153846153846156, 0.15384615384615385, 0]])
print "What's the get_distance(aln) from DistanceCalculator('identity') object?"
print type(dm)
print dm
# Alpha   0
# Beta    0.230769230769  0
# Gamma   0.384615384615  0.230769230769  0
# Delta   0.538461538462  0.538461538462  0.538461538462  0
# Epsilon 0.615384615385  0.384615384615  0.461538461538  0.153846153846  0

# Build a tree from it.
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

construc0 = DistanceTreeConstructor(dstcalc, 'nj')
tre0 = construc0.build_tree(aln)
print type(tre0)
# As shown above, dstcalc is needed for the constructor, and then the alignment
# is needed to build the tree: two inputs that have to originate from the same data.
# That is a bit of a tall order.
# You can instead build the tree from a distance matrix alone: create the
# constructor without arguments and call its .nj method on the matrix rather
# than calling build_tree(aln).

construc2 = DistanceTreeConstructor()
tre2 = construc2.nj(dm)
print type(tre2)
def nj_tree(distanceMatrix, out_path="geneContentTree.newick"):
    """Build a neighbour-joining tree and write it to disk in Newick format.

    Arguments:
     - distanceMatrix - matrix passed unchanged to DistanceTreeConstructor.nj.
     - out_path       - destination file; defaults to the file name that was
                        previously hard-coded.

    Progress messages are printed before and after the work.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Constructing Neighbor Joining Tree")
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(distanceMatrix)
    Phylo.write(tree, out_path, "newick")
    print("Done constructing tree")
Esempio n. 40
0
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio.Phylo.TreeConstruction import _DistanceMatrix


def dm_to_tree(dm):
    """Build a neighbour-joining tree from a square, labelled distance table.

    Arguments:
     - dm - table offering ``astype``, ``values`` and ``columns``
            (presumably a pandas DataFrame); its column labels, converted
            to str, become the leaf names.

    Returns the NJ tree with the names of all non-terminal clades cleared.

    If the Biopython distance matrix cannot be constructed, the inputs are
    printed for diagnosis and the original exception is re-raised.
    """
    dm = dm.astype(float)
    # Lower triangle, diagonal included, as required by _DistanceMatrix.
    distance_triangular = [list(dm.values[i, : i + 1]) for i in range(len(dm))]
    try:
        bio_dm = _DistanceMatrix(names=[str(i) for i in dm.columns], matrix=distance_triangular)
    except Exception:
        # Diagnostic dump only; the bare raise below keeps the original
        # traceback intact.
        print(list(dm.columns))
        print([type(i) for i in dm.columns])
        print(type(distance_triangular))
        print(type(distance_triangular[0]))
        print(set([str(type(i)) for j in distance_triangular for i in j]))
        print(distance_triangular)
        raise
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(bio_dm)
    # Drop the names of the internal nodes created by the constructor.
    for c in tree.get_nonterminals():
        c.name = None
    return tree
def best_elements_order_tree(relations, elements = None, filter_order = None):
  """Pick the best ordering of elements among those allowed by an NJ tree.

  The distance between two elements is the number of properties that apply to
  exactly one of them. A neighbour-joining tree is built from these distances,
  the candidate orders are enumerated from the tree with ``all_orders`` and
  ``all_combinations``, and the candidate with the highest ``score_order``
  value (as selected by ``best``) is returned.

  Arguments:
   - relations    - input handed to ``relations_2_model``.
   - elements     - elements to order; when empty or None, all elements
                    present in the relations are used. Each element needs a
                    ``label`` attribute (used as leaf name).
   - filter_order - unused by this function.

  The distance matrix, the tree and the number of candidates are printed to
  stderr.
  """
  present_elements, present_element_groups, properties, property_groups, element_2_property_2_relation, property_2_element_2_relation = relations_2_model(relations)
  if not elements: elements = present_elements

  # Each unordered pair is visited once; the id() comparison just picks one of
  # the two orientations. (An alternative similarity-based distance used to be
  # kept here as commented-out code.)
  distances = {}
  for e1 in elements:
    for e2 in elements:
      if (e1 is e2) or (id(e1) > id(e2)): continue
      d = 0
      for prop in properties:
        if (e1 in property_2_element_2_relation[prop]) != (e2 in property_2_element_2_relation[prop]):
          d += 1.0
      distances[e1, e2] = distances[e2, e1] = d

  label_2_element = { element.label : element for element in elements }

  from Bio.Phylo.TreeConstruction import _DistanceMatrix as DistanceMatrix, DistanceTreeConstructor

  dm = DistanceMatrix([element.label for element in elements])
  for e1 in elements:
    for e2 in elements:
      if (e1 is e2) or (id(e1) > id(e2)): continue
      dm[e1.label, e2.label] = distances[e1, e2]

  print(dm, file = sys.stderr)

  treebuilder = DistanceTreeConstructor(None)
  tree = treebuilder.nj(dm)
  #tree = treebuilder.upgma(dm)

  print(tree, file = sys.stderr)

  def walker(clade):
    # Leaves yield their single element; inner clades combine the candidate
    # orders of their children.
    if clade.clades:
      results = []
      partss  = [walker(child) for child in clade.clades]
      for ordered_parts in all_orders(partss):
        combinations = all_combinations(ordered_parts)
        results.extend(combinations)
      return results
    else:
      element = label_2_element[clade.name]
      return [ [element] ]

  orders = walker(tree.root)
  print(len(orders), file = sys.stderr)

  def score_order(order):
    # Higher is better: fewer properties with holes first, then fewer and
    # shorter holes.
    nb_hole           = 0
    nb_prop_with_hole = 0
    total_hole_length = 0
    for prop in properties:
      start   = None
      end     = None
      in_hole = False
      for i, element in enumerate(order):
        if element in property_2_element_2_relation[prop]:
          if start is None: start = i
          end = i
          in_hole = False
        else:
          if (not start is None) and (not in_hole):
            in_hole = True
            nb_hole += 1

      # After end, it is not a hole! Only undo a hole that was actually
      # counted: the former test (end != i) also fired when the property
      # matched no element at all, and failed on an empty order.
      if in_hole: nb_hole -= 1

      if not end is None:
        length = end - start + 1

        if length > len(property_2_element_2_relation[prop]):
          total_hole_length += length - len(property_2_element_2_relation[prop])
          nb_prop_with_hole += 1

    return (-nb_prop_with_hole, -nb_hole * 2 + -total_hole_length)

  order, score = best(orders, score_order, score0 = (-sys.maxsize, -sys.maxsize))

  return order
Esempio n. 42
0
from Bio import Phylo
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from io import StringIO
import re

def hamming(seq1, seq2):
    """Return the number of positions at which seq1 and seq2 differ.

    Positions are compared pairwise with zip, so any surplus tail of the
    longer sequence is ignored. The result is a plain Python int.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1
    return mismatches

# Input: the first line lists the species, the remaining whitespace-separated
# tokens are transposed so that table[k] holds the k-th character of every token.
with open('rosalind_chbp.txt') as f:
    species = f.readline().rstrip().split()
    table = [''.join(i) for i in zip(*f.read().rstrip().split())]
n = len(table)

# For the Phylo.TreeConstruction to work, integers in the distance matrix
# must be Python int and not numpy.int64
dm = [[hamming(table[i], table[j]) for j in range(i+1)] for i in range(n)]
constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names=species, matrix=dm))

# Serialise to plain Newick and strip the names the constructor gave to the
# inner nodes.
handle = StringIO()
Phylo.write(tree, handle, format='newick', plain=True)
result = handle.getvalue()
result = re.sub('Inner[0-9]+', '', result)
# Write through a context manager so the output is flushed and closed.
with open('rosalind_chbp_sub.txt', 'wt') as out:
    out.write(result)
Esempio n. 43
0
def main():
    global YIELD_FILE
    global MLST_FILE
    global FORCE_MLST_SCHEME
    #Set up the file names for Nullarbor folder structure
    YIELD_FILE = 'yield.tab'
    MLST_FILE = 'mlst.tab'


    #Add MLST schemes to force their usage if that species is encountered
    #Only force schemes if there are two (e.g., A baumannii and E coli)
    FORCE_MLST_SCHEME = {"Acinetobacter baumannii": "abaumannii_2",
                         "Campylobacter jejuni": "campylobacter",
                         #"Citrobacter freundii": "cfreundii",
                         #"Cronobacter": "cronobacter",
                         "Enterobacter cloacae": "ecloacae",
                         "Escherichia coli": "ecoli",
                         #"Klebsiella oxytoca": "koxytoca",
                         #"Klebsiella pneumoniae": "kpneumoniae",
                         #"Pseudomonas aeruginosa": "paeruginosa"
                         "Shigella sonnei": "ecoli",
                         "Salmonella enterica": "senterica",
                         "Vibrio cholerae": "vcholerae"
                        }


    '''
    Read in the MDU-IDs from file. For each ID, instantiate an object of
    class Isolate.  This class associates QC data with the ID tag.
    Move the contigs for all isolates into a tempdir, with a temp 9-character
    filename.  Run andi phylogenomics on all the contig sets.  Infer an NJ tree
    using Bio Phylo from the andi-calculated distance matrix.  Correct the
    negative branch lengths in the NJ tree using ETE3.  Export the tree to
    file. Gather and combine the metadata for each ID as a super-matrix.
    Optionally, add LIMS metadata to the super-matrix from a LIMS excel
    spreadsheet option (adds MALDI-ToF, Submitting Lab ID, Submitting Lab
    species guess) and/or use the flag-if-new to highlight
    'new' isolates.  Export the tree and metadata to .csv, .tsv/.tab file.
    Export the 'isolates not found' to text file too.
    '''
    if not ARGS.subparser_name:
        PARSER.print_help()
        sys.exit()


    elif ARGS.subparser_name == 'version':
        from .utils.version import Version
        Version()
        sys.exit()

    else:# ARGS.subparser_name == "run":
        if ARGS.Nullarbor_folders:
            print('Nullarbor folder structure selected.')
            YIELD_FILE = 'yield.clean.tab'
            MLST_FILE = 'mlst2.tab'

        EXCEL_OUT = (f"{os.path.splitext(os.path.basename(ARGS.LIMS_request_sheet))[0]}" \
                     f"_results.xlsx")

        if ARGS.threads > cpu_count():
            sys.exit(f'Number of requested threads must be less than {cpu_count()}.')

        print(str(ARGS.threads) +' CPU processors requested.')


        #Check if final slash in manually specified wgs_qc path
        if ARGS.wgs_qc[-1] != '/':
            print('\n-wgs_qc path is entered as '+ARGS.wgs_qc)
            print('You are missing a final \'/\' on this path.')
            print('Exiting now.\n')
            sys.exit()



        #i) read in the IDs from file
        xls_table = get_isolate_request_IDs(ARGS.LIMS_request_sheet)
        IDs = list(set(xls_table.index.values))

        #base should be a global, given that it is used in other functions too.
        base = os.path.splitext(ARGS.LIMS_request_sheet)[0]

        #ii) Return a folder path to the QC data for each available ID
        #    using a wildcard search of the ID in IDs in ARGS.wgs_qc path.
        iso_paths = isolates_available(IDs)
        #Drop the path and keep the folder name
        isos = [i.split('/')[-1] for i in iso_paths]

        #iii) make tempdir to store the temp_contigs there for 'andi' analysis.
        assembly_tempdir = make_tempdir()

        #vi) Copy contigs to become temp_contigs into tempdir, only if andi
        #requested.
        #Translation dict to store {random 9-character filename: original filename}
        iso_ID_trans = {}
        #Dict to store each isolate under each consensus species#####maybe delete
        from collections import defaultdict
        isos_grouped_by_cons_spp = defaultdict(list)
        for iso in isos:
            #Instantiate an Isolate class for each isolate in isolates
            sample = Isolate(iso)
            #Next, we could just use iso_path+/contigs.fa, but that would skip
            #the if os.path.exists() test in sample.assembly(iso).
            assembly_path = sample.assembly()
            short_id = shortened_ID()
            #Store key,value as original_name,short_id for later retrieval.
            iso_ID_trans[iso] = short_id
            if ARGS.andi_run:
                cmd = 'ln -s '+assembly_path+' '+assembly_tempdir+'/'+short_id+\
                      '_contigs.fa'
                os.system(cmd)
                print('Creating symlink:', cmd)
        if len(list(iso_ID_trans.items())) > 0:
            with open(base+'_temp_names.txt', 'w') as tmp_names:
                print('\nTranslated isolate IDs:\nShort\tOriginal')
                for key, value in list(iso_ID_trans.items()):
                    print(value+'\t'+key)
                    tmp_names.write(value+'\t'+key+'\n')
        if ARGS.metadata_run:
           #summary_frames will store all of the metaDataFrames herein
            summary_frames = []
            n_isos = len(isos)
            if n_isos == 0:
                print('\nNo isolates detected in the path '+ARGS.wgs_qc+'.')
                print('Exiting now.\n')
                sys.exit()
            #Kraken set at 2 threads, so 36 processes can run on 72 CPUs
            #Create a pool 'p' of size based on number of isolates (n_isos)
            if n_isos <= ARGS.threads//2:
                p = Pool(n_isos)
            else:
                p = Pool(ARGS.threads//2)
            print(f'\nRunning kraken on the assemblies ({ARGS.assembly_name} files):')
            results_k_cntgs = p.map(kraken_contigs_multiprocessing, isos)
            print(results_k_cntgs)
            #concat the dataframe objects
            res_k_cntgs = pd.concat(results_k_cntgs, axis=0, sort=False)
            print('\nKraken_contigs results gathered from kraken on contigs...')

            #Multiprocessor retrieval of kraken results on reads.  Single thread
            #per job.
            if n_isos <= ARGS.threads:
                p = Pool(n_isos)
            else:
                p = Pool(ARGS.threads)
            results_k_reads = p.map(kraken_reads_multiprocessing, isos)
            #concat the dataframe objects
            res_k_reads = pd.concat(results_k_reads, axis=0)
            print('Kraken_reads results gathered from kraken.tab files...')

            #Multiprocessor retrieval of contig metrics.  Single process
            #per job.
            results_metrics_contigs = p.map(metricsContigs_multiprocessing, isos)
            res_m_cntgs = pd.concat(results_metrics_contigs, axis=0)
            print('Contig metrics gathered using \'fa -t\'...')

            #Multiprocessor retrieval of read metrics.  Single process
            #per job.
            results_metrics_reads = p.map(metricsReads_multiprocessing, isos)
            res_m_reads = pd.concat(results_metrics_reads, axis=0)
            print('Read metrics gathered from '+YIELD_FILE+' files...')

            #Multiprocessor retrieval of abricate results. Single process
            #per job.
            results_abricate = p.map(abricate_multiprocessing, isos)
            res_all_abricate = pd.concat(results_abricate, axis=0, sort=False)
            res_all_abricate.fillna('', inplace=True)
            print('Resistome hits gathered from abricate.tab files...')

            #append the dfs to the summary list of dfs
            summary_frames.append(res_k_cntgs)
            summary_frames.append(res_k_reads)
            summary_frames.append(res_m_cntgs)
            summary_frames.append(res_m_reads)
            summary_frames.append(res_all_abricate)

            #These next steps build up the metadata not yet obtained
            #(via mulitprocesses above), also replace the dm-matrix short names
            #with original names

            #Let's store the metadata for each isolate in summary_isos
            summary_isos = []

            #Let's populate summary_isos above, isolate by isolate (in series)
            c = 0
            for iso in isos:
                iso_df = []
                sample = Isolate(iso)
                short_id = iso_ID_trans[iso]
                species_cntgs = res_k_cntgs.loc[iso, 'sp_krkn1_cntgs']
                species_reads = res_k_reads.loc[iso, 'sp_krkn1_reads']
                if species_cntgs == species_reads:
                    species = species_cntgs
                else:
                    species = 'indet'
                mlst_df = sample.mlst(species, sample.assembly())
                iso_df.append(mlst_df)
                species_consensus = {'sp_krkn_ReadAndContigConsensus':species}
                species_cons_df = pd.DataFrame([species_consensus], index=[iso])
                iso_df.append(species_cons_df)
                iso_df_pd = pd.concat(iso_df, axis=1)
                summary_isos.append(iso_df_pd)

            #Glue the isolate by isolate metadata into a single df
            summary_isos_df = pd.concat(summary_isos)
            #Glue the dataframes built during multiprocessing processes
            summary_frames_df = pd.concat(summary_frames, axis=1)
            #Finish up with everything in one table!
            metadata_overall = pd.concat([xls_table, summary_isos_df, summary_frames_df],
                                         axis=1, sort=False)

            metadata_overall.fillna('', inplace=True)
            metadata_overall.index.name = 'ISOLATE'
            print('\nMetadata super-matrix:')
            #Write this supermatrix (metadata_overall) to csv and tab/tsv
            csv = os.path.abspath(base+'_metadataAll.csv')
            tsv = os.path.abspath(base+'_metadataAll.tab')
            json = os.path.abspath(base+'_metadataAll.json')
            metadata_overall.to_csv(sys.stdout)
            writer = pd.ExcelWriter(EXCEL_OUT)
            metadata_overall.to_excel(writer,'Sheet 1', freeze_panes=(1, 1))
            writer.save()
            print(f"\nResults written to {os.path.abspath(EXCEL_OUT)}")

            for k, v in zip(metadata_overall['sp_krkn_ReadAndContigConsensus'],
                            metadata_overall.index):
                isos_grouped_by_cons_spp[k.replace(' ', '_')].append(v)

        #Run andi?
        if ARGS.andi_run:
            #Run andi
            andi_mat = 'andi_'+ARGS.model_andi_distance+'dist_'+base+'.mat'
            andi_c = 'nice andi -j -m '+ARGS.model_andi_distance+' -t '+\
                      str(ARGS.threads)+' '+assembly_tempdir+'/*_contigs.fa > '+\
                      andi_mat
            print('\nRunning andi with: \''+andi_c+'\'')
            os.system(andi_c)

            #Read in the andi dist matrix, convert to lower triangle
            dm = read_file_lines(andi_mat)[1:]
            dm = lower_tri(dm)
            #Correct the names in the matrix
            for iso in isos:
                #Could do it this way, but this is slower than a nested loop
                #dm.names[dm.names.index(iso_ID_trans[iso])] = iso
                #real	0m9.417s
                #user	1m18.576s
                #sys	0m2.620s
                #Nested loop is faster
                for i in range(0, len(dm.names)):
                    #iso_ID_trans[iso] is the short_id
                    if dm.names[i] == iso_ID_trans[iso]:
                        dm.names[i] = iso
                #real	0m8.789s
                #user	1m14.637s
                #sys	0m2.420s

            #From the distance matrix in dm, infer the NJ tree
            from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
            constructor = DistanceTreeConstructor()
            njtree = constructor.nj(dm)
            njtree.rooted = True
            from Bio import Phylo
            Phylo.write(njtree, 'temp.tre', 'newick')
            from ete3 import Tree
            t = Tree('temp.tre', format=1)
            #Get rid of negative branch lengths (an artefact, not an error, of NJ)
            for node in t.traverse():
                node.dist = abs(node.dist)
            t.set_outgroup(t.get_midpoint_outgroup())
            t_out = base+'_andi_NJ_'+ARGS.model_andi_distance+'dist.nwk.tre'
            t.write(format=1, outfile=t_out)
            print('Final tree (midpoint-rooted, NJ under '+\
                   ARGS.model_andi_distance+' distance) looks like this:')
            #Print the ascii tree
            print(t)
            #Remove the temp.tre
            os.remove('temp.tre')
            print('Tree (NJ under '+ARGS.model_andi_distance+\
                  ' distance, midpoint-rooted) written to '+t_out+'.')

        #Run roary?
        if ARGS.roary_run:
            roary_keepers = [
                            "accessory.header.embl",
                            "accessory.tab",
                            "accessory_binary_genes.fa",
                            "accessory_binary_genes.fa.newick",
                            "accessory_binary_genes_midpoint.nwk.tre",
                            "accessory_graph.dot",
                            "blast_identity_frequency.Rtab",
                            "clustered_proteins",
                            "core_accessory.header.embl",
                            "core_accessory.tab",
                            "core_accessory_graph.dot",
                            "core_gene_alignment.aln",
                            "gene_presence_absence.Ltab.csv",
                            "gene_presence_absence.Rtab",
                            "gene_presence_absence.csv",
                            "number_of_conserved_genes.Rtab",
                            "number_of_genes_in_pan_genome.Rtab",
                            "number_of_new_genes.Rtab",
                            "number_of_unique_genes.Rtab",
                            "pan_genome_reference.fa",
                            "pan_genome_sequences",
                            "summary_statistics.txt"
                            ]
            params = [(i, 'prokka') for i in isos if not
                      os.path.exists('prokka/'+i)]
            if len(params) > 0:
                print('\nRunning prokka:')
                if len(params) <= ARGS.threads//2:
                    p = Pool(len(params))
                else:
                    p = Pool(ARGS.threads//2)
                p.map(prokka, params)
            else:
                print('\nProkka files already exist. Let\'s move on to '+\
                      'the roary analysis...')

            #Run Roary on the species_consensus subsets.
            print('Now, let\'s run roary!')
            for k, v in list(isos_grouped_by_cons_spp.items()):
                print(k, v)
                n_isos = len(v)
                if n_isos > 1:
                    shutil.rmtree(base+'_'+k+'_roary', ignore_errors=True)
                    roary(base, k,
                          ' '.join(['prokka/'+iso+'/*.gff' for iso in v]))
                    roary_genes = pd.read_table(base+'_'+k+
                                                '_roary/gene_presence_absence.' +\
                                                'Rtab',
                                                index_col=0, header=0)
                    roary_genes = roary_genes.transpose()
                    roary_genes.to_csv(base+'_'+k+
                                       '_roary/gene_presence_absence.Ltab.csv',
                                       mode='w', index=True, index_label='name')
                    if n_isos > 2:
                        from ete3 import Tree
                        t = Tree(base+'_'+k+
                                 '_roary/accessory_binary_genes.fa.newick',
                                 format=1)
                        #Get rid of negative branch lengths (an artefact,
                        #not an error, of NJ)
                        for node in t.traverse():
                            node.dist = abs(node.dist)
                        t.set_outgroup(t.get_midpoint_outgroup())
                        t_out = base+'_'+k+\
                                '_roary/accessory_binary_genes_midpoint.nwk.tre'
                        t.write(format=1, outfile=t_out)
                        print('\nWritten midpoint-rooted roary tree.\n')
                        wd = os.getcwd()
                        os.chdir(base+'_'+k+'_roary')
                        for f_name in glob.glob('*'):
                            if f_name not in roary_keepers:
                                shutil.rmtree(f_name, ignore_errors=True)
                                os.remove(f_name)
                        os.chdir(wd)
                    if n_isos <= 2:
                        print('Need more than two isolates to have a meaningful '+\
                              'pangenome tree. No mid-point rooting of the ' +\
                              'pangenome tree performed.')
                    wd = os.getcwd()
                    os.chdir(base+'_'+k+'_roary')
                    os.system('python ../collapseSites.py -f core_gene_alignment.aln -i fasta -t '+str(ARGS.threads))
                    if os.path.exists('core_gene_alignment_collapsed.fasta'):
                        os.system('FastTree -nt -gtr < core_gene_alignment_collapsed.fasta > core_gene_FastTree_SNVs.tre')

                        #calc pairwise snp dist and write to file
                        with open('core_gene_alignment_collapsed.fasta', 'r') as inf:
                            from Bio import AlignIO
                            aln = AlignIO.read(inf, 'fasta')
                            pairs = []
                            for i in range(0,len(aln)):
                                lst = [(aln, i, j) for j in range(0, i+1)]
                                pairs.append(lst)
                            if len(pairs) <= ARGS.threads:
                                p = Pool(len(pairs))
                            else:
                                p = Pool(ARGS.threads)
                            print('Running pw comparisons in parallel...')
                            result = p.map(pw_calc, pairs)
                            summary = pd.concat(result, axis=0, sort=False)
                            summary.fillna('', inplace=True)
                            with open('core_gene_alignment_SNV_distances.tab', 'w') as distmat:
                                summary.to_csv(distmat, mode='w', sep='\t', index=True, index_label='name')

                    #convert roary output to fripan compatible
                    os.system('python ../roary2fripan.py '+base+'_'+k)
                    roary2fripan_strains_file = pd.read_table(base+'_'+k+
                                                              '.strains',
                                                              index_col=0,
                                                              header=0)
                    info_list = []
                    info_list.append(roary2fripan_strains_file)
                    info_list.append(metadata_overall.loc[v, :])
                    strains_info_out = pd.concat(info_list, axis=1, sort=False)
                    strains_info_out.to_csv(base+'_'+k+'.strains', mode='w',
                                            sep='\t', index=True,
                                            index_label='ID')
                    print('Updated '+base+'_'+k+'.strains with all metadata.')
                    os.system('cp '+base+'_'+k+'* ~/public_html/fripan')
                    os.chdir(wd)
                else:
                    print('Only one isolate in '+k+'. Need at least 2 isolates '+\
                          'to run roary.  Moving on...')

        #Keep the tempdirs created during the run
        if not ARGS.keep_tempdirs:
            shutil.rmtree(assembly_tempdir, ignore_errors=True)
            print('\nDeleted tempdir '+assembly_tempdir+'.')
        else:
            print('\nTempdir '+assembly_tempdir+' not deleted.')

        print('\nRun finished.')
Esempio n. 44
0
# rosalind_ba7b
'''
Limb Length Problem

Find the limb length for a leaf in a tree.

Given: An integer n, followed by an integer j between 0 and n - 1, 
followed by a space-separated additive distance matrix D (whose elements are integers).

Return: The limb length of the leaf in Tree(D) corresponding to row j of this 
distance matrix (use 0-based indexing).

'''
import numpy as np
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

# Input layout: line 1 is n, line 2 is the leaf index j, and the rest is the
# n x n distance matrix as whitespace-separated integers.
with open('rosalind_ba7b.txt') as f:
    n = int(f.readline().rstrip())
    j = int(f.readline().rstrip())
    D = np.fromfile(f, sep=' ', dtype=int).reshape(n, n)

# Only the lower triangle (diagonal included) is handed to _DistanceMatrix.
# For Phylo.TreeConstruction to work, the entries must be Python int and not
# numpy.int64. The comprehension variables are deliberately not called i/j so
# they can never clobber the leaf index j read above.
dm = [[int(D[row, col]) for col in range(row + 1)] for row in range(n)]
names = [str(idx) for idx in range(n)]

constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names, dm))

# The limb length of leaf j is reported as the (rounded) length of the branch
# that attaches leaf j to the neighbor-joining tree.
print(round(tree.find_any(str(j)).branch_length))
Esempio n. 45
0
def main(argv):
	"""Plot a neighbor-joining tree of the training genomes with abundance bubbles.

	Steps performed:
	  1. Parse the command-line options in ``argv``.
	  2. Read the x vector (one float per line) from the input file.
	  3. Read ``Taxonomy.txt``, ``FileNames.txt`` and the 30-mer / 50-mer common
	     kmer matrices from the common kmer data directory.
	  4. Build a lower-triangular distance matrix from the normalized kmer
	     matrices and construct a neighbor-joining tree from it.
	  5. Insert "hypothetical" internal nodes on the root-to-leaf paths wherever
	     the corresponding entry of x is positive.
	  6. Render the tree to ``out_file`` and export it as PhyloXML to
	     ``out_file_xml``.

	Args:
		argv: list of command-line arguments as accepted by ``getopt.getopt``
			(i.e. without the program name).

	Note:
		The long options are declared with a trailing ``=`` and therefore all
		require a value (e.g. ``--LabelLeaves=1``); for the flag-style options
		that value is ignored. The process exits with status 2 after printing
		the usage text, both for ``-h`` and for an unknown option.
	"""
	input_file=''
	title='Title'
	label_internal_nodes = False
	label_leaves = False
	out_file=''
	width=750
	out_file_xml=''
	plot_rectangular = False
	common_kmer_data_path=''
	taxonomic_names_on_leaves = False
	usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
	try:
		# "-h" is a plain flag (no trailing colon), so it works without an argument.
		opts, args = getopt.getopt(argv,"hi:lnrto:w:x:D:",["Help=","InputCommonKmerXFile=","LabelLeaves=", "LabelInternalNodes=","Rectangular=", "TaxonomicNamesOnLeaves=", "OutFile=","Width=","OutFileXML=","CommonKmerDataPath="])
	except getopt.GetoptError:
		print('Unknown option, call using: ' + usage)
		sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--Help"):
			print(usage)
			sys.exit(2)
		elif opt in ("-i", "--InputCommonKmerXFile"):
			input_file = arg
		elif opt in ("-l", "--LabelLeaves"):
			label_leaves = True
		elif opt in ("-n","--LabelInternalNodes"):
			label_internal_nodes = True
		elif opt in ("-o", "--OutFile"):
			out_file = arg
		elif opt in ("-w", "--Width"):
			width = int(arg)
		elif opt in ("-x", "--OutFileXML"):
			out_file_xml = arg
		elif opt in ("-D", "--CommonKmerDataPath"):
			common_kmer_data_path = arg
		elif opt in ("-r", "--Rectangular"):
			plot_rectangular = True
		elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
			taxonomic_names_on_leaves = True

	#Read in the x vector (a real list: it is indexed and sliced below)
	with open(input_file,'r') as fid:
		x = [float(line) for line in fid]

	#Read in the taxonomy
	taxonomy = list()
	with open(os.path.join(common_kmer_data_path,"Taxonomy.txt"),'r') as fid:
		for line in fid:
			#Keep the first whitespace-separated field, minus its leading "<taxID>_" part
			taxonomy.append('_'.join(line.split()[0].split("_")[1:]))

	#Read in the basis for the ckm matrices
	x_file_names = list()
	with open(os.path.join(common_kmer_data_path,"FileNames.txt"),'r') as fid:
		for line in fid:
			x_file_names.append(os.path.basename(line.strip()))

	#Read in the common kmer matrices
	with h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-30mers.h5'),'r') as f:
		ckm30=np.array(f['common_kmers'],dtype=np.float64)
	with h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-50mers.h5'),'r') as f:
		ckm50=np.array(f['common_kmers'],dtype=np.float64)
	#Scale each column by the reciprocal of its diagonal entry
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	names = x_file_names
	#Lower-triangular (diagonal included) distance matrix: the average over both
	#kmer sizes of one minus the symmetrized normalized common kmer count.
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	#Hand the tree over to ete via its newick representation
	t=Tree(tree.format('newick'),format=1)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		"""Split the branch above node ``insert_above`` by a new node.

		The new node ``name_to_insert`` is placed ``dist_along`` below the
		parent; the original node keeps the remainder of the branch length.
		Raises ValueError if ``dist_along`` exceeds the branch length.
		"""
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length")
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch
	def insert_hyp_node(t, leaf_name, percent):
		"""Insert a node at fraction ``percent`` of the root-to-leaf distance."""
		total_dist = t.get_distance(t.name,leaf_name)
		percent_dist = percent*total_dist
		child_node = (t&leaf_name)
		ancestor_node = (t&child_node.name).up
		#Climb towards the root until the target distance lies on the branch
		#between ancestor_node and child_node
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Insert hypothetical nodes
	#hyp_node_names maps inserted node name -> [leaf file name, cutoff, cutoff index]
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	cutoffs = [y**1.5 for y in cutoffs]
	num_names = len(x_file_names)
	for i, file_name in enumerate(x_file_names):
		#Entries of x belonging to this file: one per block of num_names values
		xi = x[i::num_names]
		for level, cutoff in enumerate(cutoffs):
			#xi[0] is the leaf itself; xi[level+1] pairs with cutoffs[level]
			if xi[level+1]>0:
				insert_hyp_node(t, file_name, cutoff)
				hyp_node_names[file_name+"_"+str(cutoff)] = [file_name, cutoff, level] #in case there are "_" in the file names

	#Now put the bubbles on the nodes
	def layout(node):
		"""ete layout function: draw a bubble sized by x on leaves and hypothetical nodes."""
		if node.is_leaf():
			if node.name in x_file_names:
				#make reconstructed bubble
				leaf_index = x_file_names.index(node.name)
				size = x[leaf_index]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				if taxonomic_names_on_leaves:
					nameFace = AttrFace("name", fsize=25, fgcolor='black',text_suffix="_"+taxonomy[leaf_index])
				else:
					nameFace = AttrFace("name", fsize=25, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name, _, idx = hyp_node_names[node.name]
			if node_base_name in x_file_names:
				size = x[x_file_names.index(node_base_name)+(idx+1)*len(x_file_names)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	if plot_rectangular:
		ts.mode = "r"
	else:
		ts.mode = "c"
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50

	#Export the tree to a png image
	t.render(out_file, w=width, units="mm", tree_style=ts)

	#Export the xml file (the handle is closed, and thus flushed, on exit)
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	phylo.phyloxml_phylogeny.set_name(title)
	project.add_phylogeny(phylo)
	with open(out_file_xml,'w') as xml_out:
		project.export(xml_out)