def test_good_manipulation(self):
     """Walk a DistanceMatrix through lookup, assignment, deletion and insertion.

     The later steps mutate ``dm`` in place, so every state check depends on
     the operations that ran before it.
     """
     dm = DistanceMatrix(self.names, self.matrix)

     def check_state(expected_names, expected_matrix):
         # Compare both the label list and the lower-triangular storage.
         self.assertEqual(dm.names, expected_names)
         self.assertEqual(dm.matrix, expected_matrix)

     # Reads: whole rows by position or name, single cells by pair key or
     # by chained indexing, in either index order.
     lookups = (
         (dm[1], [1, 0, 3, 5]),
         (dm[2, 1], 3),
         (dm[2][1], 3),
         (dm[1, 2], 3),
         (dm[1][2], 3),
         (dm['Alpha'], [0, 1, 2, 4]),
         (dm['Gamma', 'Delta'], 6),
     )
     for actual, expected in lookups:
         self.assertEqual(actual, expected)

     # Row assignment by name.
     replacement = [0, 10, 20, 40]
     dm['Alpha'] = replacement
     self.assertEqual(dm['Alpha'], [0, 10, 20, 40])

     # Delete by position, then put the row back at the same position.
     del dm[1]
     check_state(['Alpha', 'Gamma', 'Delta'], [[0], [20, 0], [40, 6, 0]])
     dm.insert('Beta', [1, 0, 3, 5], 1)
     check_state(self.names, [[0], [1, 0], [20, 3, 0], [40, 5, 6, 0]])

     # Delete by name, then insert without an index (lands at the end).
     del dm['Alpha']
     check_state(['Beta', 'Gamma', 'Delta'], [[0], [3, 0], [5, 6, 0]])
     dm.insert('Alpha', [1, 2, 4, 0])
     check_state(['Beta', 'Gamma', 'Delta', 'Alpha'],
                 [[0], [3, 0], [5, 6, 0], [1, 2, 4, 0]])
Esempio n. 2
0
 def test_format_phylip(self):
     """Check the PHYLIP dump: a count header followed by one row per name."""
     matrix = DistanceMatrix(self.names, self.matrix)
     buffer = StringIO()
     matrix.format_phylip(buffer)
     rows = buffer.getvalue().splitlines()

     # One header line plus one line per taxon.
     self.assertEqual(len(rows), len(matrix) + 1)
     header = rows[0]
     self.assertTrue(header.endswith(str(len(matrix))))

     # Each data row must begin with its taxon label, in order.
     for expected_name, row in zip(self.names, rows[1:]):
         self.assertTrue(row.startswith(expected_name))
 def test_format_phylip(self):
     """Verify that format_phylip writes a size header and name-prefixed rows."""
     dist = DistanceMatrix(self.names, self.matrix)
     out = StringIO()
     dist.format_phylip(out)
     written = out.getvalue().splitlines()

     self.assertEqual(len(written), len(dist) + 1)
     # The first line ends with the number of taxa.
     size_text = str(len(dist))
     self.assertTrue(written[0].endswith(size_text))
     # Remaining lines follow the order of self.names.
     body = written[1:]
     for label, text in zip(self.names, body):
         self.assertTrue(text.startswith(label))
Esempio n. 4
0
    def score_to_matrix(list_with_scores):
        """Build a DistanceMatrix for guide tree generation from pairwise scores.

        Each entry is indexed as ``[graph_a, graph_b, score]``.  The entries
        and the list itself are reordered in place.
        """
        # Put the lexicographically larger graph name first in every pair.
        for entry in list_with_scores:
            pair = [entry[0], entry[1]]
            pair.sort()
            entry[0] = pair[1]
            entry[1] = pair[0]

        # Descending order over the whole list.
        list_with_scores[:] = reversed(sorted(list_with_scores))

        # One score row per distinct first graph; consecutive entries with an
        # already-seen first graph extend the most recent row.
        names = []
        scores = []
        for entry in list_with_scores:
            if entry[0] in names:
                scores[-1].append(entry[2])
            else:
                names.append(entry[0])
                scores.append([0, entry[2]])

        # The second graph of the final entry gets the single-element row.
        last_graph = list_with_scores[-1][1]
        names = (names + [last_graph])[::-1]
        scores = [row[::-1] for row in reversed(scores + [[0]])]

        return DistanceMatrix(names, scores)
def construct_tree(gene_name, with_marburg=1, algorithm='UPGMA'):
    """Build, save and return a distance tree for one gene.

    Reads the gene's edit matrix from ``./Output/edit_matrices/``, converts it
    to lower-triangular form and runs Neighbor-Joining when ``algorithm`` is
    ``'NJ'``, UPGMA for any other value.

    NOTE(review): ``with_marburg == 1`` selects the five-virus set and prints
    "without Marburg"; any other value adds Marburg.  The flag name reads
    inverted -- confirm with callers before changing it.
    """
    add_marburg = with_marburg != 1
    if add_marburg:
        print('Constructing {0}\'s Tree with All Viruses with Marburg'.format(gene_name))
    else:
        print('Constructing Tree with All Viruses without Marburg')

    filename = algorithm + '_' + gene_name
    names = ['Bundibugyo', 'Reston', 'Sudan', 'TaiForest', 'Zaire']
    if add_marburg:
        filename += '_with_Marburg'
        names.append('Marburg')
        marburg_genome = SeqIO.read("./Data/Marburg_genome.fasta", "fasta")
        Alignment.read_data()
        print('Aligning Genes for marburg_genome')
        # The edit matrix for this variant lives under a suffixed file name.
        gene_name += '_with_marburg'
        Alignment.read_genes(marburg_genome)

    print('Reading edit matrix and construct tree')
    csv_path = "./Output/edit_matrices/" + gene_name + ".csv"
    raw_matrix = pd.read_csv(csv_path, header=None)
    constructor = DistanceTreeConstructor()
    lower_triangle = convert_tu_lower_triangular(raw_matrix)
    distance_matrix = DistanceMatrix(names=names, matrix=lower_triangle)

    build = constructor.nj if algorithm == 'NJ' else constructor.upgma
    tree = build(distance_matrix)

    save_tree(tree, filename)
    return tree
Esempio n. 6
0
    def summarise_dist(self, rf_results: RfResults, dir_out):
        """Write an NJ tree and a clustered heatmap for the RF distances.

        Two passes are made over ``rf_results.data``: one with the normalised
        value and one with the raw value of each ``(tid_a, tid_b)`` pair.  Each
        pass writes a Newick tree and an SVG heatmap into ``dir_out``.
        """
        variants = (
            (True, 'rf_normed.tree', 'rf_normed_heatmap.svg',
             'Normalised Robinson-Foulds Distance'),
            (False, 'rf_un_normed.tree', 'rf_un_normed_heatmap.svg',
             '(un)Normalised Robinson-Foulds Distance'),
        )

        for use_norm, tree_file, heatmap_file, plt_title in variants:
            path_out = os.path.join(dir_out, tree_file)
            path_hm = os.path.join(dir_out, heatmap_file)

            # Symmetric lookup: metrics[a][b] == metrics[b][a].
            metrics = defaultdict(dict)
            for (tid_a, tid_b), (rf, norm_rf) in rf_results.data.items():
                value = norm_rf if use_norm else rf
                metrics[tid_a][tid_b] = value
                metrics[tid_b][tid_a] = value

            # Every id seen in a pair is a key of ``metrics``.
            labels = sorted(metrics)
            size = len(labels)

            # mat_vals is the ragged lower triangle for Biopython; mat is the
            # dense square array used for the heatmap.
            mat = np.zeros((size, size))
            mat_vals = []
            for i, tid_a in enumerate(labels):
                cur_row = []
                for j in range(i + 1):
                    tid_b = labels[j]
                    if tid_a == tid_b:
                        cur_row.append(0.0)
                        continue
                    dist = metrics[tid_a][tid_b]
                    cur_row.append(dist)
                    mat[i, j] = dist
                mat_vals.append(cur_row)
            # Mirror the lower triangle into the upper one.
            mat = mat + mat.T

            # Newick
            dm = DistanceMatrix(names=labels, matrix=mat_vals)
            tree = DistanceTreeConstructor().nj(dm)
            Phylo.write(tree, path_out, 'newick')

            # Heatmap
            cmap = sns.cubehelix_palette(100, reverse=True)
            sns.set(font_scale=1)
            rf_df = pd.DataFrame(mat, columns=labels, index=labels)
            grid = sns.clustermap(rf_df,
                                  annot=True,
                                  fmt='.3f',
                                  cmap=cmap,
                                  figsize=(15, 15))
            grid.fig.suptitle(plt_title)
            plt.savefig(path_hm)
Esempio n. 7
0
def buildPhyloDM(groups):
    """Build a DistanceMatrix over ``groups`` with descriptive row labels.

    A label is the group's position; when the group has ``types`` the label
    also carries the type counts whose share of the group exceeds 20%,
    ordered by descending count and rendered by ``List2String``.
    Distances come from ``distanceij``.
    """
    pcut = 0.2

    names = []
    for position, group in enumerate(groups):
        label = str(position)
        if group.types is not None:
            total = len(group.types)
            frequent = [
                [kind, group.types.count(kind)]
                for kind in set(group.types)
                if group.types.count(kind) * 1.0 / total > pcut
            ]
            frequent = sorted(frequent, key=lambda pair: pair[1], reverse=True)
            label = label + "|" + List2String(frequent)
        names.append(label)

    # Lower triangle: pairwise distances followed by the zero diagonal.
    matrix = [
        [distanceij(groups[i], groups[j]) for j in range(i)] + [0]
        for i in range(len(groups))
    ]
    return DistanceMatrix(names, matrix)
Esempio n. 8
0
 def _convert(m, names):
     """Turn a square array into a Biopython DistanceMatrix.

     Parameters
     ----------
     m : 2-D array supporting ``m[i, :j]`` slicing and ``.tolist()``
         (assumed to be a numpy array -- confirm with callers).
     names : list of str or None
         Row labels.  When ``None``, labels ``S1`` .. ``Sn`` are generated.

     Returns
     -------
     DistanceMatrix
         Built from the lower triangle of ``m`` (diagonal included).
     """
     # Row i keeps columns 0..i, which is the ragged layout DistanceMatrix expects.
     lower_triangle = [m[i, :i + 1].tolist() for i in range(len(m))]
     if names is None:
         names = [f"S{k}" for k in range(1, len(m) + 1)]
     # Previously the labels were only bound when ``names`` was None, so any
     # caller-supplied names crashed with UnboundLocalError.
     return DistanceMatrix(names=names, matrix=lower_triangle)
Esempio n. 9
0
def score_to_distance(score_matrix):
    """Convert a score matrix into a DistanceMatrix with the same names.

    Every score ``v`` is mapped to ``(v + max - 2 * v) / max`` where ``max``
    is the largest score, and only the lower triangle (diagonal included) of
    the result is kept.
    """
    scores = np.array(list(score_matrix))
    top = np.max(scores)

    # Element-wise, in the same operation order as the scalar formula.
    flipped = (scores + top - 2 * scores) / top

    triangle = [
        [float(value) for value in row[:idx + 1]]
        for idx, row in enumerate(flipped)
    ]
    return DistanceMatrix(score_matrix.names, matrix=triangle)
Esempio n. 10
0
 def test_good_construction(self):
     """Check type, contents, length and repr of a freshly built matrix."""
     dm = DistanceMatrix(self.names, self.matrix)
     expected_repr = (
         "DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], "
         "matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]])"
     )

     self.assertIsInstance(dm, TreeConstruction.DistanceMatrix)
     first_name = dm.names[0]
     self.assertEqual(first_name, 'Alpha')
     cell = dm.matrix[2][1]
     self.assertEqual(cell, 3)
     self.assertEqual(len(dm), 4)
     self.assertEqual(repr(dm), expected_repr)
Esempio n. 11
0
    def test_correct_answer(self):
        """Compare NJ_tree against Biopython's NJ on the six fixture matrices.

        The two trees must be isomorphic as undirected graphs.
        """
        for case in range(6):
            names, matrix = read_matrix('tests/test{}.txt'.format(case))

            reference = DistanceTreeConstructor().nj(DistanceMatrix(names, matrix))
            candidate = NJ_tree().create_tree(names, matrix)

            reference_graph = Phylo.to_networkx(reference).to_undirected()
            candidate_graph = Phylo.to_networkx(candidate).to_undirected()
            same_shape = nx.is_isomorphic(reference_graph, candidate_graph)
            self.assertTrue(same_shape)
Esempio n. 12
0
def print_trees(country, position_table):
    """Draw NJ and UPGMA trees for the ten sequences furthest from consensus.

    ``position_table`` needs a ``seqid`` column; the remaining columns are
    compared value by value.  Distance between two sequences is the number of
    columns in which they differ.
    """
    # Consensus: the most frequent value of every non-id column.
    consensus = position_table.drop('seqid', axis=1).mode(axis=0).T[0]
    position_table = position_table.set_index('seqid')

    def mismatches(row):
        return sum(row != consensus)

    # Rank samples by how far they sit from the consensus, furthest first.
    divergence = position_table.apply(mismatches, axis=1)
    ranked = divergence.sort_values(ascending=False)
    subset_seqs = ranked[:10].index

    # Symmetric pairwise distances keyed by (seqid, seqid).
    distances = {}
    for i, seqid1 in enumerate(subset_seqs):
        distances[seqid1, seqid1] = 0
        for seqid2 in subset_seqs[i + 1:]:
            differing = sum(
                position_table.loc[seqid1] != position_table.loc[seqid2])
            distances[seqid1, seqid2] = differing
            distances[seqid2, seqid1] = differing
    distances = pd.Series(distances).unstack()

    # Biopython wants a ragged lower triangle: row i keeps columns 0..i.
    square = np.tril(distances.values).tolist()
    matrix = [row[:i + 1] for i, row in enumerate(square)]
    dm = DistanceMatrix(list(distances.index), matrix)

    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    print(country.upper())
    print("Neighbor Joining Tree")
    tree.ladderize()  # deeper clades are displayed at the top
    display(Phylo.draw(tree))

    if len(dm) > 1:
        # Unweighted Pair Group Method with Arithmetic Mean clustering.
        tree2 = constructor.upgma(dm)
        print("UPGMA Tree")
        tree2.ladderize()  # deeper clades are displayed at the top
        display(Phylo.draw(tree2))
Esempio n. 13
0
    def draw(self):
        """Print an ASCII rendering of the UPGMA tree for this object's distances.

        Each row of ``self.distMat`` is reduced to its strictly positive
        entries before being handed to ``DistanceMatrix``.

        NOTE(review): dropping every non-positive value also removes the zero
        diagonal -- confirm this yields the row lengths DistanceMatrix expects.
        """
        constructor = DistanceTreeConstructor()
        rows = [
            [value for value in row if value > 0]
            for row in self.distMat.tolist()
        ]
        tree = constructor.upgma(DistanceMatrix(self.names, rows))
        Phylo.draw_ascii(tree)
Esempio n. 14
0
 def test_good_manipulation(self):
     """Exercise getitem, setitem, delitem and insert on one DistanceMatrix.

     The matrix is mutated step by step, so the order of the checks matters.
     """
     dm = DistanceMatrix(self.names, self.matrix)

     def expect(names, matrix):
         # Assert the current labels and lower-triangular contents.
         self.assertEqual(dm.names, names)
         self.assertEqual(dm.matrix, matrix)

     # getitem: rows by index or name, cells by tuple key or chained index.
     reads = [
         (dm[1], [1, 0, 3, 5]),
         (dm[2, 1], 3),
         (dm[2][1], 3),
         (dm[1, 2], 3),
         (dm[1][2], 3),
         (dm['Alpha'], [0, 1, 2, 4]),
         (dm['Gamma', 'Delta'], 6),
     ]
     for got, wanted in reads:
         self.assertEqual(got, wanted)

     # setitem: overwrite a whole row by name.
     dm['Alpha'] = [0, 10, 20, 40]
     self.assertEqual(dm['Alpha'], [0, 10, 20, 40])

     # delitem by index, then re-insert at that index.
     del dm[1]
     expect(['Alpha', 'Gamma', 'Delta'], [[0], [20, 0], [40, 6, 0]])
     dm.insert('Beta', [1, 0, 3, 5], 1)
     expect(self.names, [[0], [1, 0], [20, 3, 0], [40, 5, 6, 0]])

     # delitem by name, then insert with no index so the row is appended.
     del dm['Alpha']
     expect(['Beta', 'Gamma', 'Delta'], [[0], [3, 0], [5, 6, 0]])
     dm.insert('Alpha', [1, 2, 4, 0])
     expect(['Beta', 'Gamma', 'Delta', 'Alpha'],
            [[0], [3, 0], [5, 6, 0], [1, 2, 4, 0]])
Esempio n. 15
0
def dendrogram_biopython(condensed_distance_matrix_jaccard, organisms):
    """
    Create a lower triangle matrix. Then create a biopython dendrogram.

    The Neighbor-Joining tree is printed as ASCII art and the distance matrix
    is written in PHYLIP format to ``test.phy`` in the working directory.

    Parameters
    ----------
    condensed_distance_matrix_jaccard: ndarray
        Condensed Jaccard distance matrix
    organisms: list
        organisms names

    """
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix
    from Bio.Phylo import draw_ascii

    # squareform expands the condensed vector; row i keeps columns 0..i.
    lower_triangle_matrix = [
        list(v[:i + 1])
        for i, v in enumerate(squareform(condensed_distance_matrix_jaccard))
    ]
    constructor = DistanceTreeConstructor()
    dm = DistanceMatrix(organisms, lower_triangle_matrix)
    tree = constructor.nj(dm)
    draw_ascii(tree)
    # Close the file deterministically; the handle used to be left open, so
    # the output was only flushed whenever the object happened to be collected.
    with open('test.phy', 'w') as handle:
        dm.format_phylip(handle)
Esempio n. 16
0
    def random_score_matrix(list_of_graph_names):
        """Return a DistanceMatrix over the given names filled with random scores.

        Row ``i`` holds ``i`` random values drawn as ``randint(1, 100) / 100``
        followed by the zero diagonal.
        """
        scores = []
        for row_index in range(len(list_of_graph_names)):
            drawn = [randint(1, 100) / 100 for _ in range(row_index)]
            # The values end up in reverse draw order, with the diagonal last.
            drawn.reverse()
            drawn.append(0)
            scores.append(drawn)
        return DistanceMatrix(list_of_graph_names, scores)
def run_optimization():
    """Compare all sample curves pairwise, plot the result and print two trees.

    Loads the parameters with ``get_data``, fills a symmetric matrix with
    ``abs(compare_curves(...))`` for every pair of the first 16 samples, shows
    it with matplotlib, and prints the Neighbor-Joining and UPGMA trees as
    ASCII art.

    Returns
    -------
    numpy.ndarray
        The square ``(num_samples, num_samples)`` distance matrix.
    """
    params = get_data()

    num_samples = 16
    num_of_vertices = 200
    # One label per sample; must stay in step with num_samples.
    sample_names = ['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'b3', 'b4',
                    'c1', 'c2', 'c3', 'c4', 'd1', 'd2', 'd3', 'd4']

    distances = np.zeros((num_samples, num_samples))
    for i in range(num_samples):
        for j in range(i + 1, num_samples):
            print("working on the pair", (i, j))
            distances[i, j] = np.abs(
                compare_curves(params[i], params[j], num_of_verts=num_of_vertices))
            distances[j, i] = distances[i, j]

    # Plot distance matrix and make phylogenetic tree
    plt.matshow(distances)
    plt.colorbar()
    # The original referenced ``plt.show`` without calling it, so the figure
    # was never displayed.
    plt.show()

    # Row i keeps columns 0..i; sized by num_samples instead of a literal 16.
    lower_triangle = [list(distances[i, :i + 1]) for i in range(num_samples)]
    distance_matrix = DistanceMatrix(names=sample_names, matrix=lower_triangle)

    constructor = DistanceTreeConstructor()
    tree_up = constructor.upgma(distance_matrix)
    tree_nj = constructor.nj(distance_matrix)

    Phylo.draw_ascii(tree_nj)
    Phylo.draw_ascii(tree_up)

    return distances
Esempio n. 18
0
def get_phylogenetic_tree(max_str_len=1,
                          norm="JSD",
                          cpc_function="Square25",
                          joining_alg="nj"):
    """Build a tree from the input files and write it to disk in Newick format.

    Parameters
    ----------
    max_str_len, norm, cpc_function
        Forwarded unchanged to ``pd_matrix``.
    joining_alg : {"nj", "upgma"}
        Tree construction method.

    Raises
    ------
    ValueError
        If ``joining_alg`` is not one of the supported names.

    Notes
    -----
    The output path is ``phylo-tree/result.xml`` although the content is
    Newick; the path is kept as-is for existing consumers.
    """
    # Reject unknown algorithms up front; previously this surfaced as an
    # UnboundLocalError on ``tree`` after all the work had been done.
    if joining_alg not in ("nj", "upgma"):
        raise ValueError(
            "joining_alg must be 'nj' or 'upgma', got {!r}".format(joining_alg))

    desc, genes = iter_over_files()
    # cpc_function used to be ignored in favour of a hard-coded "Square25".
    pm = pd_matrix(genes,
                   max_str_len=max_str_len,
                   norm=norm,
                   cpc_function=cpc_function)
    pm = convert_triangle(pm)
    dm = DistanceMatrix(names=desc, matrix=pm)
    constructor = DistanceTreeConstructor()
    if joining_alg == "nj":
        tree = constructor.nj(dm)
    else:
        tree = constructor.upgma(dm)
    Phylo.write(tree, 'phylo-tree/result.xml', 'newick')
Esempio n. 19
0
def main():
    """Script entry point: scores -> distances -> NJ tree -> matplotlib drawing.

    The sequence file path is taken from the first command line argument.
    """
    sequences = read_files(sys.argv[1])
    gen_score_file_to_distance_file("scores.txt", sequences)
    matrix, seq_ids = gen_matrix_from_pair_ids_and_value(path='distances.txt')

    # Diagnostic output: the matrix itself and the two sizes.
    for item in (matrix, len(matrix), len(seq_ids)):
        print(item)

    dm = DistanceMatrix(names=seq_ids, matrix=matrix)
    print(dm)

    tree = DistanceTreeConstructor().nj(dm)

    figure = plt.figure(figsize=(12, 5), dpi=100)
    Phylo.draw(tree, axes=figure.add_subplot(1, 1, 1))
Esempio n. 20
0
def leaf_distance_from_tree(tree):
    """Return a DistanceMatrix of path lengths between every pair of leaves.

    The distance between two leaves is the sum of each leaf's
    ``node_distance`` to their most recent common ancestor.
    """
    labels = [leaf.taxon.label for leaf in tree.leaf_nodes()]

    def path_length(label_a, label_b):
        node_a = tree.find_node_with_taxon_label(label_a)
        node_b = tree.find_node_with_taxon_label(label_b)
        ancestor = tree.mrca(taxon_labels=[label_a, label_b])
        to_a = node_distance(node_a, ancestor)
        to_b = node_distance(node_b, ancestor)
        return to_a + to_b

    # Lower triangle: distances to all earlier leaves, then the zero diagonal.
    rows = [
        [path_length(labels[i], labels[j]) for j in range(i)] + [0]
        for i in range(len(labels))
    ]
    return DistanceMatrix(labels, rows)
Esempio n. 21
0
    def run(self):
        """Build a Neighbor-Joining tree from the required matrix and save it.

        The upstream task supplies ``(labels, mat)``; the tree is written to
        this task's output in Newick format.
        """
        self.output().makedirs()
        labels, mat = self.requires().as_matrix()

        # DistanceMatrix takes a ragged lower triangle: row i keeps columns 0..i.
        lower_triangle = [
            [mat[i][j] for j in range(i + 1)]
            for i in range(len(labels))
        ]

        dm = DistanceMatrix(names=labels, matrix=lower_triangle)
        tree = DistanceTreeConstructor().nj(dm)

        with self.output().open('w') as fh:
            Phylo.write(tree, fh, 'newick')
def process_input_matrix(input_matrix):
    """Convert an array-of-arrays of sample IDs and distances into a
    BioPython DistanceMatrix object.

    The first row (header) and the first cell of every remaining row (the
    sample ID) are removed from ``input_matrix`` in place.
    """
    input_matrix.pop(0)
    sample_names = [row[0] for row in input_matrix]
    for row in input_matrix:
        del row[0]
    numeric_rows = [[float(cell) for cell in row] for row in input_matrix]

    # np.tril() zeroes everything above the diagonal:
    #     [[0 1 2]          [[0 0 0]
    #      [1 0 1]    ->     [1 0 0]
    #      [2 1 0]]          [2 1 0]]
    # but DistanceMatrix() needs the ragged form [[0], [1, 0], [2, 1, 0]].
    lower = np.tril(np.array(numeric_rows))
    num_rows = lower.shape[0]

    # Indexing with tril_indices flattens the triangle row by row
    # ([0 1 0 2 1 0]); cut it back into rows of length 1, 2, 3, ...
    flat = lower[np.tril_indices(num_rows)]
    ragged = []
    start = 0
    for row_length in range(1, num_rows + 1):
        stop = start + row_length
        ragged.append(flat[start:stop].tolist())
        start = stop

    return DistanceMatrix(names=sample_names, matrix=ragged)
Esempio n. 23
0
 def test_bad_manipulation(self):
     """Invalid keys and values must raise the documented exception types."""
     dm = DistanceMatrix(self.names, self.matrix)

     # getitem: unknown names, mixed or non-integer key types, out of range.
     bad_reads = (
         (ValueError, 'A'),
         (ValueError, ('Alpha', 'A')),
         (TypeError, (1, 'A')),
         (TypeError, (1, 1.2)),
         (IndexError, 6),
         (IndexError, (10, 10)),
     )
     for error, key in bad_reads:
         self.assertRaises(error, dm.__getitem__, key)

     # setitem: first the same bad keys, then bad values for good keys.
     bad_writes = (
         (ValueError, 'A', [1, 3, 4]),
         (ValueError, ('Alpha', 'A'), 4),
         (TypeError, (1, 'A'), 3),
         (TypeError, (1, 1.2), 2),
         (IndexError, 6, [1, 3, 4]),
         (IndexError, (10, 10), 1),
         (ValueError, 0, [1, 2]),
         (TypeError, ('Alpha', 'Beta'), 'a'),
         (TypeError, 'Alpha', ['a', 'b', 'c']),
     )
     for error, key, value in bad_writes:
         self.assertRaises(error, dm.__setitem__, key, value)
Esempio n. 24
0
def estimate_parameters4(kmer_distance_matrices, mu=None):
    """Estimate pairwise branch lengths from two k-mer distance matrices.

    Uses the expected k-mer distance formula that does not rely on the
    coalescent model to account for variation in divergence time.

    Parameters
    ----------
    kmer_distance_matrices : dict
        Exactly two entries mapping a k value to a matrix with a ``names``
        attribute.
    mu
        Unused; kept for interface compatibility.

    Returns
    -------
    DistanceMatrix
        Lower-triangular matrix of the optimised values, one per pair.
    """
    # Materialise the keys: dict views cannot be indexed on Python 3.
    k = list(kmer_distance_matrices.keys())
    assert (len(k) == 2)
    names = kmer_distance_matrices[k[0]].names
    n = len(names)
    # One free parameter per unordered pair.  Floor division keeps the count
    # an int on Python 3 (``/`` would give a float and break list repetition).
    n_pairs = (n**2 - n) // 2
    bnds = tuple([(0, None)] * n_pairs)
    opt_result = minimize(objfn4_T,
                          tuple([0.5] * n_pairs),
                          args=(kmer_distance_matrices[k[0]],
                                kmer_distance_matrices[k[1]], k[0], k[1]),
                          bounds=bnds,
                          method='SLSQP')
    # Pair (i, j) with j < i lives at offset i*(i-1)/2 + j of the solution.
    that = [[opt_result.x[i * (i - 1) // 2 + j] for j in range(i)] + [0.0]
            for i in range(n)]
    thatdm = DistanceMatrix(names, that)
    return (thatdm)
Esempio n. 25
0
 def test_bad_manipulation(self):
     """Check that bad keys and bad values are rejected with the right error."""
     dm = DistanceMatrix(self.names, self.matrix)

     def rejects(error, method, *args):
         self.assertRaises(error, method, *args)

     # getitem
     for error, key in [
         (ValueError, "A"),
         (ValueError, ("Alpha", "A")),
         (TypeError, (1, "A")),
         (TypeError, (1, 1.2)),
         (IndexError, 6),
         (IndexError, (10, 10)),
     ]:
         rejects(error, dm.__getitem__, key)

     # setitem: item or index test
     for error, key, value in [
         (ValueError, "A", [1, 3, 4]),
         (ValueError, ("Alpha", "A"), 4),
         (TypeError, (1, "A"), 3),
         (TypeError, (1, 1.2), 2),
         (IndexError, 6, [1, 3, 4]),
         (IndexError, (10, 10), 1),
     ]:
         rejects(error, dm.__setitem__, key, value)

     # setitem: value test
     for error, key, value in [
         (ValueError, 0, [1, 2]),
         (TypeError, ("Alpha", "Beta"), "a"),
         (TypeError, "Alpha", ["a", "b", "c"]),
     ]:
         rejects(error, dm.__setitem__, key, value)
    def get_distance(self, msa):
        """Return a DistanceMatrix of ``self._pairwise`` values for an alignment.

        Every record is tagged with its position through an ``index``
        attribute, and every unordered pair of records is scored once.

        Raises TypeError when ``msa`` is not a MultipleSeqAlignment.
        """
        if not isinstance(msa, MultipleSeqAlignment):
            raise TypeError("Must provide a MultipleSeqAlignment object.")

        for position, record in enumerate(msa):
            record.index = position

        dm = DistanceMatrix([record.id for record in msa])
        for first, second in itertools.combinations(msa, 2):
            dm[first.id, second.id] = self._pairwise(first, second)
        return dm
Esempio n. 27
0
def make_score_matrix(records):
    """Fill a matrix with per-fragment BLAST bit scores between records.

    For every record a blastp search is run against the ``db`` database
    (query written by ``prepare_query``, XML results read back from
    ``./result.txt``).  For each hit, the highest-scoring HSP's bit score
    divided by its number of fragments is stored at ``[i, j]``, where ``j`` is
    the position of the hit's id in ``records``.

    Hits whose id is not among the input records are skipped.

    Returns
    -------
    DistanceMatrix
        Named after the record ids.
    """
    record_ids = [record.id for record in records]
    matrix = DistanceMatrix(names=record_ids)
    for i, sequence_a in enumerate(tqdm(records)):
        prepare_query(sequence_a.seq)
        blastp_cline = NcbiblastpCommandline(query='query.txt',
                                             db='db',
                                             outfmt=5,
                                             out='./result.txt')
        blastp_cline()
        results = SearchIO.read('./result.txt', format='blast-xml')
        for hit in results:
            highest_scoring_pair = max(list(hit), key=lambda hsp: hsp.bitscore)
            score = highest_scoring_pair.bitscore
            length = len(list(highest_scoring_pair.fragments))
            try:
                j = record_ids.index(highest_scoring_pair.hit_id)
                matrix[i, j] = score / length
            except (ValueError, ZeroDivisionError):
                # Best effort: ignore hits outside the record set (or with no
                # fragments).  A bare ``except`` used to hide every other
                # failure here as well, including KeyboardInterrupt.
                continue
    return matrix
def build_distance_matrix(ids: List[str],
                          distance_file: TextIO) -> DistanceMatrix:
    r"""Create a distance matrix and fill it from a pairwise distance file.

    Parameters
    ----------
    ids : List[str]
        Sequence IDs used as the matrix names.
    distance_file : TextIO
        Iterable of lines, each holding three whitespace-separated fields:
        f"{id1}\t{id2}\t{dist}\n".

    Returns
    -------
    DistanceMatrix
        Matrix over ``ids`` with every listed pair set to its distance.

    """
    result = DistanceMatrix(names=ids)
    for record in distance_file:
        first_id, second_id, raw_distance = record.split()
        result[first_id, second_id] = float(raw_distance)
    return result
Esempio n. 29
0
def JukesCantorDistanceMatrix(msa):
    """Return a lower-triangular DistanceMatrix of Jukes-Cantor distances.

    For each pair, positions where either sequence has ``'-'`` are not counted
    as differences; the mismatch count is divided by the row sequence's full
    length and transformed with ``-0.75 * log(1 - 4/3 * p)``.
    """
    names = [seq.id for seq in msa]
    matrix = []
    for row in msa:
        current = []
        for col in msa:
            if col.id == row.id:
                # Reached the diagonal: close the row and stop.
                current.append(0)
                break
            length = len(row.seq)
            mismatches = sum(
                1 for pos in range(length)
                if row.seq[pos] != '-' and col.seq[pos] != '-'
                and row.seq[pos] != col.seq[pos]
            )
            current.append(
                -0.75 * np.log(1 - 4. / 3 * (1.0 * mismatches / length)))
        matrix.append(current)
    return DistanceMatrix(names, matrix)
Esempio n. 30
0
 def distance_matrix(self):
     """Return a DistanceMatrix over the query and all of its BLAST subjects.

     The query comes first, then the subjects in the order returned by
     ``self.blastsubject_set.all()``.  Pairs are measured with ``prot_dist``
     when ``self.is_prot()`` is true and with ``nucl_dist`` otherwise.
     """
     names = [str(self.query_id)]
     seqs = ["".join(self.query_seq)]
     for subject in self.blastsubject_set.all():
         names.append(str(subject.subject_id))
         seqs.append("".join(subject.subject_seq))

     matrix = []
     for i in range(len(names)):
         row = []
         for j in range(i):
             measure = prot_dist if self.is_prot() else nucl_dist
             row.append(measure(seqs[i], seqs[j]))
         # Diagonal entry.
         row.append(0.0)
         matrix.append(row)
     return DistanceMatrix(names=names, matrix=matrix)
Esempio n. 31
0
def tree_from_distance_matrix(X):
    """Build a Neighbor-Joining tree from a table of observations.

    Parameters
    ----------
    X : pandas.DataFrame
        One row per item.  The index supplies the leaf names and the values
        are passed to ``Bio.Cluster.distancematrix``.

    Returns
    -------
    The tree produced by ``DistanceTreeConstructor().nj``.
    """
    from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor
    from Bio.Cluster import distancematrix

    names = list(X.index)
    # isinstance also unwraps DataFrame subclasses, which ``type(X) is`` missed.
    if isinstance(X, pd.DataFrame):
        X = X.values
    mat = distancematrix(X)

    # distancematrix returns row i as the i distances to items 0..i-1 with no
    # diagonal, while DistanceMatrix expects that same row followed by the
    # zero self-distance.  The zero therefore goes at the END of each row;
    # putting it first (as before) shifted every distance one column right.
    lower_triangle = [[float(value) for value in row] + [0.0] for row in mat]

    dm = DistanceMatrix(names, lower_triangle)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    return tree
def ArgMinSumOfDistancesFromCoalescentJCExpectedKmerPairDistanceParameterizationMap(
        kmer_distance_matrices, mu=None):
    """Fit pairwise branch lengths and theta to two k-mer distance matrices.

    Parameters
    ----------
    kmer_distance_matrices : dict
        Exactly two entries mapping a k value to a matrix with a ``names``
        attribute.
    mu
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(thatdm, thetahat)``: a lower-triangular DistanceMatrix of the
        optimised per-pair values and the last optimised parameter.
    """
    # Materialise the keys: dict views cannot be indexed on Python 3.
    k = list(kmer_distance_matrices.keys())
    assert (len(k) == 2)
    names = kmer_distance_matrices[k[0]].names
    n = len(names)
    # One parameter per unordered pair plus a trailing one for theta.  Floor
    # division keeps the count an int on Python 3.
    n_pairs = n * (n - 1) // 2
    bnds = tuple([(0, None)] * n_pairs + [(0, None)])
    opt_result = minimize(
        SumOfDistancesFromCoalescentJCExpectedKmerPairDistanceParameterizationMap,
        tuple([0.5] * n_pairs + [1]),
        args=(kmer_distance_matrices[k[0]], kmer_distance_matrices[k[1]], k[0],
              k[1]),
        bounds=bnds,
        method='SLSQP')
    thetahat = opt_result.x[-1]
    # Pair (i, j) with j < i lives at offset i*(i-1)/2 + j of the solution.
    that = [[opt_result.x[i * (i - 1) // 2 + j] for j in range(i)] + [0.0]
            for i in range(n)]
    thatdm = DistanceMatrix(names, that)
    return (thatdm, thetahat)