Example #1
0
def Neigborjoin(Matrice):
    """Build a neighbor-joining tree from ``Matrice`` and return it as Newick.

    The tip labels are taken from ``list(Matrice)`` and passed, together with
    ``Matrice`` itself, to ``DistanceMatrix``.  An ASCII rendering of the tree
    is printed to stdout, followed by a blank separator.

    Returns the string produced by ``nj(..., result_constructor=str)``.
    """
    distance_matrix = DistanceMatrix(Matrice, list(Matrice))

    # First pass: tree object, needed only for the console rendering.
    ascii_rendering = nj(distance_matrix).ascii_art()
    print(ascii_rendering)
    print('\n')

    # Second pass: same input, but the result is built directly as a string.
    return nj(distance_matrix, result_constructor=str)
Example #2
0
def construct_tree(X_2d, acc, title):
    """Write a neighbor-joining tree of the rows of ``X_2d`` to ``<title>.nwk``.

    Parameters
    ----------
    X_2d:
        Input handed to ``pairwise_distances``.
    acc:
        Identifiers for the rows, passed to ``DistanceMatrix`` as ids.
    title:
        Output path without extension; ``".nwk"`` is appended.

    The function returns ``None``; its result is the Newick file on disk.
    """
    data = pairwise_distances(X_2d)
    # NaN distances are treated as zero.
    data[np.isnan(data)] = 0

    # Mirror the upper triangle (diagonal included) onto the lower triangle so
    # the matrix is exactly symmetric.
    data = np.triu(data) + np.triu(data, k=1).T

    dm = DistanceMatrix(data, acc)
    # Neighbor joining is run once; only the Newick string is needed.
    newick_str = nj(dm, result_constructor=str)

    with open(title + ".nwk", "w") as f:
        f.write(newick_str)
Example #3
0
    def collect(self, column: List[List[Token]]) -> Dict:
        """Group the rows of ``column`` by building a neighbor-joining tree.

        Pairwise distances between the rows are computed with
        ``self._compute_pairwise_distance``; the tree tips are labelled with
        row indices rather than with the row contents.  The tree is passed
        through ``serialize`` / ``deserialize`` and the first element of the
        resulting order is returned.

        According to the original documentation the order is a nested
        structure of row indices such as ``[[2, (3, 1)], 0]``, standing for a
        tree in which rows 3 and 1 are joined first, then row 2, then row 0.

        Parameters
        ----------
        column: list of list[Token]
            the column to align

        Returns
        -------
            a dict with the single key ``0`` mapped to the first element of
            the deserialized order
        """
        pairwise = self._compute_pairwise_distance(column)

        # tips are identified by row position, not by the row's tokens
        row_ids = list(range(len(column)))

        nj_tree = nj(DistanceMatrix(pairwise, row_ids))
        _, order = deserialize(serialize(nj_tree))
        return {0: order[0]}
Example #4
0
def find_tree(npop: int,
              numerical_label: 'np.ndarray[int]',
              arr: 'np.ndarray[float]',
              ) -> TreeNode:
    """Find the tree topology from the centers of mass of the clusters.

    For ``npop == 2`` a fixed two-tip tree ``(0:0.1, 1:0.1);`` is returned.
    Otherwise the first ``npop + OFFSET`` columns of ``arr`` are averaged per
    label in ``numerical_label``, the Euclidean distances between those
    centers are fed to neighbor joining, and the tree is rooted at its
    midpoint.  The rooted tree is printed (ASCII art, then its string form)
    and returned.
    """
    if npop == 2:
        return read(StringIO('(0:0.1, 1:0.1);'), format='newick', into=TreeNode)

    width = npop + OFFSET
    arr = arr[:, :width]

    # One row per population: mean coordinates of the samples with that label.
    centers = np.zeros((npop, width))
    for label in set(numerical_label):
        members = np.where(numerical_label == label)[0]
        centers[label, :] = np.mean(arr[members, :], axis=0)

    # Full (symmetric) matrix of Euclidean distances between the centers.
    ds = np.zeros((npop, npop))
    for i, center_i in enumerate(centers):
        for j, center_j in enumerate(centers):
            ds[i, j] = np.sqrt(np.sum((center_i - center_j)**2))

    ids = [str(k) for k in range(npop)]
    new_tree = nj(DistanceMatrix(ds, ids)).root_at_midpoint()
    print(new_tree.ascii_art())
    print(new_tree)
    return new_tree
Example #5
0
def drawTree(MS_distDict, Methyl_distDict, filtered_samples, ratio, outgroup):
    '''
    Merge the MS and methylation distance matrices and build a
    neighbor-joining tree from the result.

    MS_distDict / Methyl_distDict: nested mappings indexed as
        d[sample1][sample2] giving the pairwise distance.
    filtered_samples: samples to include; they are used in sorted order.
    ratio: weight of the MS distance; the methylation distance gets
        (1 - ratio).
    outgroup: "NA" to leave the tree as built, "Midpoint" to root at the
        midpoint outgroup, anything else is passed to set_outgroup().

    Returns the ete tree.
    '''
    # Sort once so rows, columns and ids are guaranteed to share one order.
    ordered_samples = sorted(filtered_samples)
    # The methyl term alone is divided by 100: PD is calculated on a 0-100
    # scale while the MS distance is on a 0-1 scale.
    merged_distMatrix = [
        [
            (MS_distDict[sample1][sample2] * ratio) + (
                Methyl_distDict[sample1][sample2] * (1 - ratio)
            ) / 100
            for sample2 in ordered_samples
        ]
        for sample1 in ordered_samples
    ]

    # Run the neighbor-joining algorithm on the merged pairwise distances.
    distObj = DistanceMatrix(merged_distMatrix, ordered_samples)
    print(distObj.data)
    skbio_tree = nj(distObj, result_constructor=str)
    # skbio builds the tree from the distance matrix; the Newick string is
    # then converted to an ete tree.
    ete_tree = Tree(skbio_tree)

    # Compare by value: identity comparison against a string literal is not
    # reliable.
    if outgroup == "NA":
        return ete_tree
    if outgroup == "Midpoint":
        tree_midpoint = ete_tree.get_midpoint_outgroup()
        ete_tree.set_outgroup(tree_midpoint)
    else:
        ete_tree.set_outgroup(outgroup)
    return ete_tree
Example #6
0
    def __init__(self, dist_matrix):
        """Build a neighbor-joining tree from ``dist_matrix`` and flatten it
        into parallel lists of node and edge attributes.

        ``dist_matrix`` must provide ``nr_elements``, ``labels`` and
        ``get_distance(i, j)``.  After construction:

        * ``ids``, ``colors`` and ``node_size`` hold one entry per tree node,
          in preorder;
        * ``sources``, ``targets`` and ``weights`` hold one entry per
          parent-child edge (all stored as strings);
        * ``shown_labels`` maps a node name to the label to display (empty
          for unnamed internal nodes);
        * ``virtual_nodes`` counts the unnamed internal nodes.
        """
        self.dist_matrix = dist_matrix
        nr_elements = self.dist_matrix.nr_elements
        # Materialise the full square matrix as a list of rows.
        self.matrix = []
        for i in range(nr_elements):
            row = []
            for j in range(nr_elements):
                row.append(self.dist_matrix.get_distance(i, j))
            self.matrix.append(row)
        # Here self.ids temporarily holds the stringified labels used as tip ids.
        self.ids = list(map(str, self.dist_matrix.labels))
        self.nj_dm = DistanceMatrix(self.matrix, self.ids)
        tree = nj(self.nj_dm)
        # self.ids is reset and refilled below with the names of ALL tree
        # nodes (tips and internal nodes), in preorder.
        self.ids = []
        self.sources = []
        self.targets = []
        self.weights = []
        self.colors = []
        self.node_size = []
        self.virtual_nodes = 0
        self.shown_labels = {}
        self.font_colors = []

        # Colour legend kept from the original: true #00A693 -- false #CC3333
        # First pass: give every node a name and record its display attributes.
        for node in tree.preorder():
            name_str = ''
            if node.name is None:
                # Unnamed (internal) node: assign a synthetic name v1, v2, ...
                self.virtual_nodes = self.virtual_nodes + 1
                name_str = 'v' + str(self.virtual_nodes)
                node.name = name_str
                self.ids.append(node.name)
                self.colors.append("black")
                self.node_size.append(20)
                self.shown_labels[str(name_str)] = ""
                # NOTE(review): font_colors is appended only in this branch, so
                # it ends up shorter than ids/colors/node_size - confirm intent.
                self.font_colors.append('k')
            else:
                # Shorten names containing spaces: keep the last token, or the
                # last two tokens concatenated (without a space) when there
                # are at least three tokens.
                name = node.name.rsplit(' ', 1)
                if len(name) > 1:
                    node.name = name[1]
                    name2 = name[0].rsplit(' ', 1)
                    if len(name2) > 1:
                        node.name = name2[1] + name[1]
                name = node.name
                # NOTE(review): the list is empty, so this condition is always
                # False and the "#CC3333" branch never runs - presumably a
                # placeholder for a set of names to highlight.
                if name in []:
                    self.ids.append(node.name)
                    self.colors.append("#CC3333")
                    self.node_size.append(800)
                    name_str = node.name
                    self.shown_labels[str(name_str)] = name_str
                else:
                    self.ids.append(node.name)
                    self.colors.append("#00A693")
                    self.node_size.append(800)
                    name_str = node.name
                    self.shown_labels[str(name_str)] = name_str

        # Second pass: record every parent -> child edge.  It relies on the
        # node names assigned/shortened in the first pass.
        for node in tree.preorder():
            for child in node.children:
                self.sources.append(str(node.name))
                self.targets.append(str(child.name))
                self.weights.append(str(child.length))
Example #7
0
    def fromSequences(cls, labels, sequences, findParams=None, **kwargs):
        """
        Build an NJTree instance from a set of sequences.

        The distance between two sequences is one minus their affinity, as
        computed by C{affinityMatrix}.

        @param cls: Our class.
        @param labels: An iterable producing C{str} labels for the sequences.
        @param sequences: Either a C{str} filename of sequences to consider
            (read as FASTA) or a C{light.reads.Reads} instance.
        @param findParams: An instance of C{FindParameters}. A default
            instance is created when this is not given.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        @return: An C{NJTree} instance.
        """
        if isinstance(sequences, str):
            # A filename was given: load the reads from it.
            sequences = FastaReads(
                sequences, readClass=AAReadWithX, upperCase=True)

        instance = cls()
        instance.sequences = list(sequences)
        instance.labels = labels

        if not findParams:
            findParams = FindParameters()

        similarity = affinityMatrix(
            instance.sequences, findParams=findParams, **kwargs)
        affinity = np.array(similarity)
        instance.distance = np.ones(affinity.shape) - affinity

        matrix = DistanceMatrix(instance.distance, labels)
        instance.tree = nj(matrix)
        return instance
Example #8
0
    def get_tree_and_order(self,
                           words: List[List[Token]]) -> Tuple[TreeNode, List]:
        """Build a neighbor-joining tree over ``words`` and derive the order
        in which the inputs should be aligned.

        According to the original documentation the order has the form
        ``[s1, (s2, s3), s4]``.

        Parameters
        ----------
        words: list
            list of inputs

        Returns
        -------
            tuple of the tree returned by ``deserialize`` and the first
            element of the order it returned
        """
        pairwise = self._compute_pairwise_distance(words)

        # tips are labelled with the (stringified) row index, not the row itself
        tip_ids = [str(position) for position in range(len(words))]

        nj_tree = nj(DistanceMatrix(pairwise, tip_ids))
        rebuilt_tree, order = deserialize(serialize(nj_tree), words)
        return rebuilt_tree, order[0]
Example #9
0
def NJ(names, matrix):
    """
        Run neighbor joining on a distance matrix.

        names: ids for the rows/columns of the matrix
        matrix: the distances (documented by the original as a numpy matrix)
        return: the neighbor-joining tree as a newick string
    """
    return nj(DistanceMatrix(matrix, names), result_constructor=str)
Example #10
0
def single_file_nj(input_file, output_file):
    """Read a distance matrix from ``input_file``, run neighbor joining on it
    and write the tree, with branch lengths, to ``output_file`` as Newick.
    """
    dm = DistanceMatrix.read(input_file)

    tree = nj(dm)

    # Serialise before opening the output so a failure here does not leave a
    # truncated file behind.
    newick = tree.to_newick(with_distances=True)

    # write output; the context manager closes the file even if write() fails
    with open(output_file, 'w') as f:
        f.write(newick)
Example #11
0
def single_file_nj(input_file, output_file):
    """Read a distance matrix from ``input_file``, run neighbor joining on it
    and write the tree, with branch lengths, to ``output_file`` as Newick.
    """
    dm = DistanceMatrix.read(input_file)

    tree = nj(dm)

    # Serialise before opening the output so a failure here does not leave a
    # truncated file behind.
    newick = tree.to_newick(with_distances=True)

    # write output; the context manager closes the file even if write() fails
    with open(output_file, 'w') as f:
        f.write(newick)
Example #12
0
def tree_from_distmatrix(D):
    """Return the neighbor-joining tree for distance table ``D``.

    ``D.index`` supplies the tip ids and ``D.values`` the distances.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj

    labels = list(D.index)
    return nj(DistanceMatrix(D.values, labels))
Example #13
0
def get_tree(core=False, newick=False):
    """Build and print a neighbor-joining tree of the species that have a
    record in the '16S' collection.

    core: when true only the 'core' collection is used; otherwise the species
        of the 'other' collection are added.
    newick: when true the Newick string of the tree is printed and nothing is
        returned; otherwise ``(tree, tips)`` is returned, where ``tips`` are
        the tip names with spaces replaced by underscores.
    """
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))
    ssu_species = [n for n in all_species if kv.db['16S'].find_one({'species': n})]

    dm = DistanceMatrix(get_distance_matrix(core=core, to_file=False), ssu_species)
    t = tree.nj(dm)
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and is also valid Python 3.
    print(t.ascii_art())
    tips = []
    for node in t.tips():
        print('%s %s' % (node.name, node.length))
        tips.append(node.name.replace(' ', '_'))
    if newick:
        n = tree.nj(dm, result_constructor=str)
        print(n)
    else:
        return (t, tips)
Example #14
0
def reconstruct_neighborjoining(df_mutation_table, path_out_newick):
    """Reconstruct a neighbor-joining tree from a mutation table.

    The ``'root'`` column is dropped and the table transposed; pairwise
    distances between the resulting rows are computed with
    ``scipy.spatial.distance.pdist`` (default metric).  The tree is written to
    ``path_out_newick`` in Newick format and also returned.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj

    samples = df_mutation_table.drop(columns='root').transpose()

    # pdist yields the condensed form; squareform expands it to a full matrix.
    condensed = scipy.spatial.distance.pdist(samples)
    square = scipy.spatial.distance.squareform(condensed)

    nj_tree = nj(DistanceMatrix(square, samples.index))
    nj_tree.write(path_out_newick, format='newick')
    return nj_tree
Example #15
0
    def run_nj_get_dist_matrix(self, dist_matrix):
        """Run neighbor joining on ``dist_matrix`` and return the tip-to-tip
        distances of the resulting tree as a 2-D array.

        No ids are passed to ``DistanceMatrix``; the tip labels of the tree
        are converted to integers and used to sort both the rows and the
        columns of the returned matrix.
        """
        dm = DistanceMatrix(dist_matrix)

        # run neighbor join and get dist matrix from the tree
        nj_tree = nj(dm)
        df = nj_tree.tip_tip_distances().to_data_frame()
        df.index = df.index.astype(int)

        # sort rows and cols numerically
        df.sort_index(inplace=True)
        df.columns = df.columns.values.astype(np.int32)
        df = df[sorted(df.columns)]

        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
        # same array and exists in every pandas version.
        return df.values
Example #16
0
    def fromDistanceMatrix(cls, labels, distance):
        """
        Build an NJTree instance from an existing distance matrix.

        @param cls: Our class.
        @param labels: An iterable producing C{str} labels corresponding to the
            rows (equivalently, columns) of the distance matrix.
        @param distance: A square matrix of numeric distances.
        @return: An C{NJTree} instance with its C{labels}, C{distance} and
            C{tree} attributes set.
        """
        instance = cls()
        instance.labels = labels
        instance.distance = distance
        matrix = DistanceMatrix(distance, labels)
        instance.tree = nj(matrix)
        return instance
Example #17
0
def drawTree(distDict, alleleDict, sample_list, outgroup, prefix, bootstrap):
    '''
    Run the neighbor-joining phylogenetic tree building algorithm on the
    pairwise cell distances saved in distDict.

    distDict: distDict["sampleComp"][(a, b)] holds "dist" and "num_targets"
        for every sorted sample pair (a, b), including (a, a).
    alleleDict: not used by this function.
    sample_list: samples to include; the matrix uses their sorted order.
    outgroup: "NA" to leave the tree as built, "Midpoint" to root at the
        midpoint outgroup, anything else is passed to set_outgroup().
    prefix: path prefix of the statistics and pickle outputs.
    bootstrap: statistics and the pickle are written only when this is
        exactly False (i.e. for the original tree, not for resampled ones).

    Returns the ete tree, or None when midpoint rooting was requested but no
    midpoint outgroup was found.
    '''
    # Sort once; rows, columns, ids and the stats output all share this order.
    ordered_samples = sorted(sample_list)
    distMatrix = []
    targetMatrix = []
    pairwise_numTargets = []
    sample_numTargets = []
    for sample1 in ordered_samples:
        sample1_dist = []
        sample1_targets = []
        for sample2 in ordered_samples:
            comparison = distDict["sampleComp"][tuple(sorted([sample1, sample2]))]
            num_targets = comparison["num_targets"]
            sample1_dist.append(comparison["dist"])
            sample1_targets.append(num_targets)
            if sample1 != sample2:
                pairwise_numTargets.append(num_targets)
            else:
                sample_numTargets.append(num_targets)
        distMatrix.append(sample1_dist)
        targetMatrix.append(sample1_targets)

    if bootstrap is False:
        with open(prefix + ".buildPhylo.stats.txt", 'w') as statsOutput:
            statsOutput.write("Number of Samples Analyzed:\t" + str(len(sample_list)) + "\n" + ','.join(sample_list) + "\n")
            statsOutput.write("Avg targets shared per pair of cells:\t" + str(float(sum(pairwise_numTargets) / len(pairwise_numTargets))) + "\t[" + str(min(pairwise_numTargets)) + "," + str(max(pairwise_numTargets)) + "]\n")
            statsOutput.write("Avg targets captured per single cell:\t" + str(float(sum(sample_numTargets) / len(sample_numTargets))) + "\t[" + str(min(sample_numTargets)) + "," + str(max(sample_numTargets)) + "]\n")
            # Matrix of distances, one row per sample
            for sample, dist_list in zip(ordered_samples, distMatrix):
                statsOutput.write(sample + "," + ",".join(str(round(i, 3)) for i in dist_list) + "\n")
            # Matrix of the number of targets shared between each pair
            for sample, target_list in zip(ordered_samples, targetMatrix):
                statsOutput.write(sample + "," + ",".join(str(j) for j in target_list) + "\n")
        # Keep the per-pair distance information used to build the tree; it is
        # useful for downstream statistics.
        with open(prefix + ".buildPhylo.distDict.pkl", "wb") as pickleOutput:
            pickle.dump(distDict, pickleOutput)

    distObj = DistanceMatrix(distMatrix, ordered_samples)
    skbio_tree = nj(distObj, result_constructor=str)
    # skbio builds the tree from the distance matrix; the Newick string is
    # then converted to an ete tree.
    ete_tree = Tree(skbio_tree)

    # Compare by value: identity comparison against a string literal is not
    # reliable.
    if outgroup == "NA":
        return ete_tree
    if outgroup == "Midpoint":
        tree_midpoint = ete_tree.get_midpoint_outgroup()
        if tree_midpoint is None:
            # The tree is thrown out if no midpoint was found
            print(ete_tree.write(format = 0))
            return None
        ete_tree.set_outgroup(tree_midpoint)
    else:
        ete_tree.set_outgroup(outgroup)
    return ete_tree
Example #18
0
    def __load_distance_matrix(self, data):
        """Build a neighbor-joining tree from ``data`` and derive the state
        that depends on it.

        ``self.dist_matrix`` is set to the tip-to-tip distances of the tree,
        with rows and columns sorted by the integer value of the tip labels.
        The tree is then bifurcated and handed to ``__post_order`` and
        ``__build_genotype``.
        """
        dm = DistanceMatrix(data)
        nj_tree = nj(dm)

        df = nj_tree.tip_tip_distances().to_data_frame()

        # Sort rows and columns numerically by tip label.
        df.index = df.index.astype(int)
        df.sort_index(inplace=True)
        df.columns = df.columns.values.astype(np.int32)
        df = df[sorted(df.columns)]

        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
        # same array and exists in every pandas version.
        self.dist_matrix = df.values

        nj_tree.bifurcate()
        self.__post_order(nj_tree)
        self.__build_genotype(nj_tree)
Example #19
0
def main_vec():
	"""Compute pairwise SNP distances between genomes and build an NJ tree.

	Reads the alignment named by the parsed arguments, writes one line per
	genome pair (ids, raw count, normalised count) to ``snp_dist.tsv`` and the
	neighbor-joining tree as Newick to ``snp_dist.tree``, both in the output
	directory.

	NOTE(review): ``genomes[id][0]`` is presumably a 2-D NumPy character
	array (the reductions use ``axis=1``) - confirm against ``parse_msa``.
	"""
	args = parse_args()

	genomes = parse_msa(args['msa'], args['max_samples'])

	try:
		os.makedirs(args['out_dir'])
	except OSError:
		# An already existing directory is fine; anything else is a real
		# error and would make the writes below fail anyway.
		if not os.path.isdir(args['out_dir']):
			raise

	print("Count SNPs")
	dist_path = os.path.join(args['out_dir'], 'snp_dist.tsv')
	print("   path: %s" % dist_path)
	matrix = []

	# Positions that are not a gap ('-'), one entry per genome
	occurs = np.array([genomes[gid][0] != '-' for gid in genomes])

	with open(dist_path, 'w') as dist_file:
		for i, gid in enumerate(genomes):
			# Positions present in both this genome and each other genome
			cooccs = occurs[i] & occurs

			diffs = np.array([genomes[gid][0] != genomes[sid][0] for sid in genomes])

			# Only differences at co-occurring positions are counted
			raw_counts = np.sum(diffs & cooccs, axis=1)
			norm_counts = raw_counts / np.sum(cooccs, axis=1)

			for j, sid in enumerate(genomes):
				dist_file.write('\t'.join([gid, sid, str(raw_counts[j]), str(norm_counts[j])])+'\n')

			matrix.append(norm_counts)

	print("Build SNP tree")
	tree_path = os.path.join(args['out_dir'], 'snp_dist.tree')
	print("   path: %s" % tree_path)
	dm = DistanceMatrix(matrix, list(genomes))
	tree = nj(dm, result_constructor=str)
	with open(tree_path, 'w') as tree_file:
		tree_file.write(tree)

	print("\nDone!")
Example #20
0
def njWithRoot(dis_matrix, muestraPmid):
    """Build a midpoint-rooted neighbor-joining tree.

    ``dis_matrix`` already contains the distances; nothing is computed here
    beyond pairing them with the stringified ids in ``muestraPmid``.

    Returns the rooted tree after a round trip through its Newick text
    (written with ``format=3`` and re-read with ``format=1``).
    """
    # Only reshapes the input: distances as nested lists, ids as strings.
    labels = list(map(str, muestraPmid))
    dm = DistanceMatrix(dis_matrix.tolist(), labels)
    unrooted_newick = nj(dm, result_constructor=str)

    # Root the tree at its midpoint outgroup.
    rooted = TreeEte(unrooted_newick)
    rooted.set_outgroup(rooted.get_midpoint_outgroup())

    # Serialise to Newick and parse it again.
    return TreeEte(rooted.write(format=3), format=1)
Example #21
0
def export_tree_for_all(all_patterns, matrixoutput, treeoutput):
    """Write the pairwise distance matrix of mutation patterns as CSV.

    ``all_patterns`` yields ``(samplename, pattern, count, pcnt)`` tuples,
    where ``pattern`` is a collection of ``(pos, na, ref)`` triples.  A triple
    whose ``na`` is ``'.'`` marks its position as unknown: every possible
    triple at that position is excluded from comparisons involving that
    pattern.  The distance between two patterns is the size of the symmetric
    difference of what remains.

    With fewer than three patterns only the empty tree ``();`` is written to
    ``treeoutput``.  Otherwise the matrix is written to ``matrixoutput``.
    Tree construction is currently switched off unconditionally, so in that
    case ``treeoutput`` is not written.
    """
    wildcard_alleles = ['A', 'C', 'G', 'T', 'ins', 'del', '.']

    patterns = []
    for idx, (samplename, pattern, count, pcnt) in enumerate(all_patterns):
        removes = set()
        for pos, na, ref in pattern:
            if na != '.':
                continue
            removes.update((pos, ntmp, ref) for ntmp in wildcard_alleles)
        label = '{}_{}_{:.1f}%'.format(samplename, idx + 1, pcnt * 100)
        patterns.append([idx, set(pattern) - removes, removes, label, count])

    num_patterns = len(patterns)
    if num_patterns < 3:
        with open(treeoutput, 'w') as fp:
            fp.write('();')
        return

    patternstrs = [entry[3] for entry in patterns]
    dist_matrix = np.zeros((num_patterns, num_patterns), dtype=float)
    for first, second in combinations(patterns, 2):
        idx1, ptn1, rm1 = first[:3]
        idx2, ptn2, rm2 = second[:3]
        # symmetric difference, ignoring positions unknown in the other pattern
        distance = len((ptn1 - rm2) ^ (ptn2 - rm1))
        dist_matrix[idx1, idx2] = dist_matrix[idx2, idx1] = distance

    with open(matrixoutput, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['##'] + patternstrs)
        writer.writerows(dist_matrix)

    if True or num_patterns > 10000:
        # TODO: add a switch to this
        # Too many patterns, unable to calculate dist_matrix
        return

    tree = nj(DistanceMatrix(dist_matrix, patternstrs))
    with open(treeoutput, 'w') as fp:
        fp.write(str(tree.root_at_midpoint()))
Example #22
0
def main():
	"""Compute pairwise SNP distances between genomes and build an NJ tree.

	Reads the alignment named by the parsed arguments, writes one line per
	genome pair (ids, raw count, normalised count) to ``snp_dist.tsv`` and the
	neighbor-joining tree as Newick to ``snp_dist.tree``, both in the output
	directory.

	NOTE(review): the values of ``genomes`` are presumably NumPy character
	arrays (element-wise ``!=``, ``&`` and ``.sum()`` are used) - confirm
	against ``parse_msa``.
	"""
	args = parse_args()

	genomes = parse_msa(args['msa'], args['max_samples'])

	try:
		os.makedirs(args['out_dir'])
	except OSError:
		# An already existing directory is fine; anything else is a real
		# error and would make the writes below fail anyway.
		if not os.path.isdir(args['out_dir']):
			raise

	print("Count SNPs")
	dist_path = os.path.join(args['out_dir'], 'snp_dist.tsv')
	print("   path: %s" % dist_path)
	matrix = []

	with open(dist_path, 'w') as dist_file:
		for id1 in genomes:
			array = []
			is_present1 = genomes[id1] != '-'
			for id2 in genomes:
				is_present2 = genomes[id2] != '-'
				is_diff = genomes[id1] != genomes[id2]
				# Only positions present (non-gap) in both genomes count
				co_occur = is_present1 & is_present2
				raw_count = (is_diff & co_occur).sum()

				# The distance stays 0 when there is no difference or no
				# shared position, which also avoids dividing by zero.
				norm_count = 0
				co_sum = co_occur.sum()

				if raw_count != 0 and co_sum != 0:
					norm_count = float(raw_count) / co_sum

				array.append(norm_count)
				dist_file.write('\t'.join([id1, id2, str(raw_count), str(norm_count)])+'\n')
			matrix.append(array)

	print("Build SNP tree")
	tree_path = os.path.join(args['out_dir'], 'snp_dist.tree')
	print("   path: %s" % tree_path)
	dm = DistanceMatrix(matrix, list(genomes))
	tree = nj(dm, result_constructor=str)
	with open(tree_path, 'w') as tree_file:
		tree_file.write(tree)

	print("\nDone!")
Example #23
0
def get_guide_tree(seqs, random=False):
    """
    Build a midpoint-rooted neighbour-joining guide tree for a set of sequences
    :param seqs: Sequences to create a tree for; each must have a ``name``
    :param random: If true, distances come from ``calc_random_distances``
        instead of ``calc_distances``
    :return: Guide tree with its internal nodes labelled
    """

    # Pick the distance source, then compute distances and collect the ids
    distance_fn = calc_random_distances if random else calc_distances
    distances = distance_fn(seqs)
    names = [seq.name for seq in seqs]

    # Neighbour-Joining tree from the distance matrix, rooted at its midpoint
    rooted = nj(DistanceMatrix(distances, names)).root_at_midpoint()

    # Label the internal nodes of the rooted tree
    label_internal_nodes(rooted)

    return rooted
Example #24
0
def nj_tree(feature_matrix):
    """Build a neighbor-joining tree from the rows of ``feature_matrix``.

    Pairwise Hamming distances are computed on ``feature_matrix.values``.  The
    elapsed time of each of the three stages (distances, distance matrix,
    tree) is printed.  No ids are passed to ``DistanceMatrix``.

    Returns the tree produced by ``nj``.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.metrics`` is loaded.
    import sklearn.metrics
    import time
    t = time.time()

    data = sklearn.metrics.pairwise_distances(feature_matrix.values,
                                              metric='hamming')
    print(time.time() - t)
    t = time.time()

    dm = DistanceMatrix(data)

    print('distance matrix', time.time() - t)
    t = time.time()

    tree = nj(dm)

    print('tree build', time.time() - t)

    return tree
Example #25
0
def build_tree(dist_matrix, names_list, clust):
    """Build a tree from a distance matrix with the requested method.

    ``clust == 'nj'`` runs scikit-bio neighbor joining; ``clust == 'upgma'``
    runs the UPGMA constructor on the condensed matrix and strips the names of
    the non-terminal nodes.  In both cases the result is parsed from its
    Newick text into a ``Tree``.  Any other value prints a message and exits
    the process.
    """
    if clust == 'nj':
        newick = nj(DistanceMatrix(dist_matrix, names_list),
                    result_constructor=str)
        return Tree(newick)

    if clust == 'upgma':
        upgma_dm = _DistanceMatrix(names=names_list,
                                   matrix=condense_matrix(dist_matrix))
        upgma_tree = DistanceTreeConstructor().upgma(upgma_dm)
        # remove InnerNode names
        for clade in upgma_tree.get_nonterminals():
            clade.name = None
        buffer = StringIO()
        Phylo.write(upgma_tree, buffer, "newick")
        return Tree(buffer.getvalue())

    print("Unknown tree clustering method ! Aborting")
    sys.exit()
Example #26
0
def build_tree(dist_matrix, names_list, clust):
    """Construct a ``Tree`` from ``dist_matrix`` using ``clust``.

    Supported methods are ``'nj'`` (scikit-bio neighbor joining) and
    ``'upgma'`` (UPGMA on the condensed matrix, with the names of the
    non-terminal nodes removed before export).  The tree is always created
    from Newick text.  An unknown method prints a message and exits the
    process.
    """
    if clust not in ('nj', 'upgma'):
        print("Unknown tree clustering method ! Aborting")
        sys.exit()
        return None

    if clust == 'nj':
        scikit_dm = DistanceMatrix(dist_matrix, names_list)
        newick_text = nj(scikit_dm, result_constructor=str)
    else:
        condensed = condense_matrix(dist_matrix)
        biopython_dm = _DistanceMatrix(names=names_list, matrix=condensed)
        biopython_tree = DistanceTreeConstructor().upgma(biopython_dm)
        # remove InnerNode names
        for inner in biopython_tree.get_nonterminals():
            inner.name = None
        handle = StringIO()
        Phylo.write(biopython_tree, handle, "newick")
        newick_text = handle.getvalue()

    return Tree(newick_text)
def run_nj_weighted(cm_uniq, prior_probs=None, verbose=True):
    """Run neighbor joining with weighted scoring on a character matrix.

    cm_uniq: table with one row per unique cell; its index supplies the tip
        ids and its rows the character strings.
    prior_probs: passed to ``compute_distance_mat`` as ``priors``.
    verbose: when true, print a one-line progress message.

    Returns a ``Cassiopeia_Tree`` wrapping the network built from the
    neighbor-joining result.  Leaves whose character string occurs in
    ``cm_uniq`` are flagged ``is_target`` and renamed ``'state-node'``.
    """
    if verbose:
        print("Running Neighbor-Joining with Weighted Scoring on " +
              str(cm_uniq.shape[0]) + " Unique Cells")

    # Character strings of the observed cells; a set keeps the membership
    # test in the node loop below constant-time.
    cm_lookup = set(cm_uniq.apply(lambda x: "|".join(x.values), axis=1))

    # ``np.str`` was only an alias of the builtin ``str`` and was removed in
    # NumPy 1.24.
    dm = compute_distance_mat(cm_uniq.values.astype(str),
                              cm_uniq.shape[0],
                              priors=prior_probs)

    ids = cm_uniq.index
    dm = sp.spatial.distance.squareform(dm)

    dm = DistanceMatrix(dm, ids)

    newick_str = nj(dm, result_constructor=str)

    tree = newick_to_network(newick_str, cm_uniq)

    nj_net = fill_in_tree(tree, cm_uniq)

    for n in nj_net:
        # Only leaves (no outgoing edges) can be targets.
        if nj_net.out_degree(n) == 0 and n.char_string in cm_lookup:
            n.is_target = True
            n.name = 'state-node'
        else:
            n.is_target = False

    state_tree = nj_net
    ret_tree = Cassiopeia_Tree(method='neighbor-joining',
                               network=state_tree,
                               name='Cassiopeia_state_tree')

    return ret_tree
Example #28
0
def create_tree(df, column):
    """Build a midpoint-rooted neighbor-joining tree from one column of ``df``.

    Rows whose value in ``column`` is NaN are skipped.  The number of
    remaining rows is printed.  With more than three of them, the pairwise
    Euclidean distances between their values are used to build the tree;
    otherwise ``None`` is returned.
    """
    # keep only the samples that have a value (not NaN) in this column
    values = []
    valid_samples = []
    for label in df.index.tolist():
        value = df[column][label]
        if np.isnan(value):
            continue
        values.append(value)
        valid_samples.append(label)

    size = len(valid_samples)
    print(size)
    if size <= 3:
        return None

    # fill both triangles of the symmetric distance matrix in one sweep
    dist_matrix = np.zeros((size, size))
    for i in range(size):
        for j in range(i, size):
            pair_distance = distance.euclidean(values[i], values[j])
            dist_matrix[i][j] = pair_distance
            dist_matrix[j][i] = pair_distance

    dmat = DistanceMatrix(dist_matrix, valid_samples)
    return nj(dmat).root_at_midpoint()
Example #29
0
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()
import pandas as pd
from skbio import tree, DistanceMatrix
import numpy as np
import sys

# Read the matrix from the CSV file named by the first command-line argument
# and treat missing entries as zero.
m = pd.read_csv(sys.argv[1])
m[m.isnull()] = 0
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# array and exists in every pandas version.
arr = m.values
# Add the transpose so the matrix handed to DistanceMatrix is symmetric.
M = arr + arr.T
dm = DistanceMatrix(M)
# NOTE: this rebinds the name ``tree`` from the skbio module to the result.
tree = tree.nj(dm)
Example #30
0
    graphList.append(g)

print('Done')

# Pair every label with its graph in a two-column table.
GL = pd.DataFrame(list(zip(label, graphList)), columns=['organism', 'graph'])

# Compute the graph-kernel (GK) similarity matrix with a normalized
# Weisfeiler-Lehman kernel over vertex histograms.
kernel = gk.WeisfeilerLehman(base_kernel=gk.VertexHistogram, normalize=True)
GK = pd.DataFrame(kernel.fit_transform(GL['graph'].values))
# Rows and columns are both labelled with the organism labels.
GK.columns = GK.index = label

# Use 1-K as measure of Distance
# NOTE(review): no ids are passed here, so the labels set on GK above are not
# carried into the distance matrix - confirm this is intended.
DM_GK = DistanceMatrix(1 - GK.values)

# Make the GK tree: neighbor joining to a Newick string, then parse it.
sktree = nj(DM_GK, result_constructor=str)
GK_tree = Tree(sktree)
GK_tree.name = 'AGORA network similarity tree'
# Tree style: circular layout ("c") spanning a full 360 degrees, with leaf
# names shown.
ts = TreeStyle()
ts.show_leaf_name = True
ts.mode = "c"
ts.arc_start = -180
ts.arc_span = 360

#plot tree
#GK_tree.render(file_name='/home/acabbia/Documents/Muscle_Model/GSMM-distance/figures/GK_tree_AGORA.png', tree_style=ts)
#GK_tree.show(tree_style=ts)

#%%
####
Example #31
0
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file handle afterwards."""
    with open(path, "wb") as fh:
        pic.dump(obj, fh)


def _remove_file(path):
    """Delete ``path``; report (but do not raise on) failure, like a plain ``rm``."""
    try:
        os.remove(path)
    except OSError as err:
        print("Could not remove " + path + ": " + str(err))


def main():
    """
    Command-line entry point: reconstruct a tree from a pickled network.

    Reads the network at ``netfp`` with ``nx.read_gpickle``, collects its
    leaves (de-duplicated by ``char_string``), runs the algorithm selected by
    the first matching flag (``--greedy``, ``--hybrid``, ``--ilp``,
    ``--neighbor-joining``, ``--neighbor_joining_weighted``,
    ``--camin-sokal``) and pickles the result to ``--out_fp``. When no output
    path is given, the input file name with ``"true"`` replaced by an
    algorithm tag is used.

    Raises
    ------
    Exception
        If none of the algorithm flags was supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="character_matrix")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=-1,
                        help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--no_triplets", action="store_true", default=False)
    # Parsed as int so a value given on the command line has the same type as
    # the integer default (it was previously parsed as str).
    parser.add_argument("--max_neighborhood_size", type=int, default=3000)
    parser.add_argument("--out_fp",
                        type=str,
                        default=None,
                        help="optional output file")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Random seed for ILP solver")

    args = parser.parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    # `cutoff` is used by the hybrid/ILP log messages and by the ILP solver
    # call; it was previously never assigned, which raised NameError.
    cutoff = args.cutoff
    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = cutoff
        cell_cutoff = None
    else:
        cell_cutoff = cutoff
        lca_cutoff = None
    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size
    seed = args.seed

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    score_triplets = not args.no_triplets

    prior_probs = None
    if args.mutation_map != "":
        # NOTE(security): unpickling executes arbitrary code; only pass
        # mutation maps from a trusted source.
        with open(args.mutation_map, "rb") as map_fh:
            prior_probs = pic.load(map_fh)

    name = netfp.split("/")[-1]
    stem = ".".join(name.split(".")[:-1])

    true_network = nx.read_gpickle(netfp)

    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    target_nodes = get_leaves_of_tree(true_network)

    # Keep the first node seen for every distinct character string; a set
    # gives constant-time membership tests.
    target_nodes_uniq = []
    seen_charstrings = set()
    for t in target_nodes:
        if t.char_string not in seen_charstrings:
            seen_charstrings.add(t.char_string)
            target_nodes_uniq.append(t)

    if args.greedy:

        if verbose:
            print("Running Greedy Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq,
            method="greedy",
            prior_probabilities=prior_probs)

        net = reconstructed_network_greedy[0]

        if outfp is None:
            outfp = name.replace("true", "greedy")
        _dump_pickle(net, outfp)

    elif args.hybrid:

        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_hybrid[0]

        if outfp is None:
            outfp = name.replace("true", "hybrid")
        _dump_pickle(net, outfp)

    elif args.ilp:

        if verbose:
            # This branch runs the ILP solver; the message used to say "Hybrid".
            print("Running ILP Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            hybrid_subset_cutoff=cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_ilp[0]
        if outfp is None:
            outfp = name.replace("true", "ilp")
        _dump_pickle(net, outfp)

    elif args.neighbor_joining:

        if verbose:
            print("Running Neighbor-Joining on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        infile = "".join(name.split(".")[:-1]) + "infile.txt"
        fn = "".join(name.split(".")[:-1]) + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        # Run the binarization script and wait for it; an argument list
        # avoids shell interpretation of the file names.
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        subprocess.call(["python3.6", str(script), fn, infile, "--relaxed"])

        aln = AlignIO.read(infile, "phylip-relaxed")

        aln = unique_alignments(aln)

        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")

        tree = constructor.build_tree(aln)

        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # Replace every clade with a Node; unnamed clades become "state-node".
        rndict = {}
        for n in nj_net:

            if n.name is None:
                rndict[n] = Node("state-node", [])
            else:
                rndict[n] = Node(n.name, [])

        nj_net = nx.relabel_nodes(nj_net, rndict)

        cm = pd.read_csv(fn, sep="\t", index_col=0)

        # Map each row's "|"-joined character string to its index label.
        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        nj_net = fill_in_tree(nj_net, cm)

        nj_net = tree_collapse(nj_net)

        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj")
        _dump_pickle(nj_net, outfp)

        _remove_file(infile)
        _remove_file(fn)

    elif args.neighbor_joining_weighted:

        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids
        dm = sp.spatial.distance.squareform(dm)

        dm = DistanceMatrix(dm, ids)

        newick_str = nj(dm, result_constructor=str)

        tree = newick_to_network(newick_str, cm_uniq)

        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        # Map each row's "|"-joined character string to its index label.
        cm_lookup = dict(
            zip(
                list(
                    cm_uniq.apply(
                        lambda x: "|".join([str(k) for k in x.values]),
                        axis=1)),
                cm_uniq.index.values,
            ))

        for n in nj_net:
            n.is_target = n.char_string in cm_lookup

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj_weighted")
        _dump_pickle(nj_net, outfp)

    elif args.camin_sokal:

        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        # Rename every node to its position, remembering the original names.
        samples_to_cells = {}
        indices = []
        for i, n in enumerate(target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = "".join(name.split(".")[:-1]) + "_cs_infile.txt"
        fn = "".join(name.split(".")[:-1]) + "_cs_phylo.txt"
        weights_fn = "".join(name.split(".")[:-1]) + "_cs_weights.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        subprocess.call(["python3.6", str(script), fn, infile])

        # Called for its side effect of writing weights_fn; the return value
        # was never used.
        construct_weights(infile, weights_fn)

        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"
        # run phylip mix with camin-sokal
        responses = "." + stem + ".temp.txt"
        with open(responses, "w") as FH:
            FH.write(infile + "\n")
            FH.write("F\n" + outfile + "\n")
            FH.write("P\n")
            FH.write("W\n")
            FH.write("Y\n")
            FH.write(weights_fn + "\n")
            FH.write("F\n" + outtree + "\n")

        t0 = time.time()
        # The shell is kept here because the command relies on "~" expansion
        # and on stdin/stdout redirection.
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        subprocess.call(cmd, shell=True)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        with open(responses, "w") as FH:
            FH.write(outtree + "\n")
            FH.write("F\n" + consense_outfile + "\n")
            FH.write("Y\n")
            FH.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        subprocess.call(cmd, shell=True)

        # Concatenate the stripped lines of the consensus tree file.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(line.strip() for line in f)

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)

        cs_net = tree_collapse2(cs_net)

        # Map each row's "|"-joined character string to its index label.
        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        for n in cs_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)
        if outfp is None:
            outfp = name.replace("true", "cs")
        _dump_pickle(cs_net, outfp)

        _remove_file(outfile)
        _remove_file(responses)
        _remove_file(outtree)
        _remove_file(consense_outfile)
        _remove_file(infile)
        _remove_file(fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )
Example #32
0
#plt.tick_params(axis='both', which='minor', labelsize=6)

plt.xticks(rotation=45, horizontalalignment='right', fontweight='light', fontsize=6 )

#fig.text(0.5, 0.04, 'Sequence', ha='center')
fig.text(0.04, 0.5, 'Distances', va='center', rotation='vertical', fontsize=6 )

plt.savefig( results_file, dpi = 200, bbox_inches='tight')


###################################################################################################################
##########                   phylogenetics              ###########################################################
###################################################################################################################

# Neighbor-joining tree for the first distance matrix, written as Newick text.
dm = DistanceMatrix(DIST_proposed, sequences)
tree = nj(dm)
#print(tree.ascii_art())
newick_str_fos = nj(dm, result_constructor=str)
t = PhyloTree(newick_str_fos)
# The context manager closes the file even if the write fails.
with open(current_dir + "/results/" + path_netwick + "/fos.txt", "w") as f:
    f.write(newick_str_fos)
#t.show()

# Same steps for the GLCM distance matrix.
dm = DistanceMatrix(DIST_proposed_glcm, sequences)
tree = nj(dm)
#print(tree.ascii_art())
newick_str_glcm = nj(dm, result_constructor=str)
t = PhyloTree(newick_str_glcm)
# This file handle was previously left open; it is now closed on exit.
with open(current_dir + "/results/" + path_netwick + "/glcm.txt", "w") as f:
    f.write(newick_str_glcm)
Example #33
0
from ete3 import PhyloTree, TreeStyle

from skbio import DistanceMatrix
from skbio.tree import nj

# Small hand-written 4x4 distance table with one id per row/column.
data = [[0, 8, 4, 6], [8, 0, 8, 8], [4, 8, 0, 6], [6, 8, 6, 0]]
ids = list('abcd')
dm = DistanceMatrix(data, ids)
# First nj() call: the result is used for its ASCII rendering.
tree = nj(dm)
print(tree.ascii_art())
# Second nj() call on the same matrix, this time asking for a string result.
newick_str = nj(dm, result_constructor=str)
print(newick_str)
#print(newick_str[:55], "...")
# Hand the string to ete3's PhyloTree and display it.
t = PhyloTree(newick_str)
t.show()

alg = """
 >Dme_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEAL--YYASQTDDIKDRREEAH
 >Dme_002
 MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
 >Cfa_001
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
 >Mms_001
 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
 >Hsa_001
 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
 >Ptr_002
 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH
 >Mmu_002
 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
def construct_cluster(args, dm):
    """Run neighbor joining on ``dm`` and return the result.

    ``args`` is part of the signature but is not read by this function.
    """
    # Function-scope import, as in the original implementation.
    from skbio.tree import nj as neighbor_joining

    return neighbor_joining(dm)
Example #35
0
    for j in range(len(my_data[i])):
        if (math.isnan(my_data[i][j])):
            my_data[i][j] = 0

my_data = numpy.array(my_data)

# Sum the array with its transpose so the matrix passed on is symmetric.
data = my_data.T + my_data

# Echo the matrix, one row per line, values separated by three spaces.
for i in data:
    for j in i:
        print(j, end='   ')
    print()

dm = DistanceMatrix(data, ids)

# This tree is built with negative branch lengths allowed.
tree = nj(dm, disallow_negative_branch_length=False)
print(tree.ascii_art())

# Write the ASCII rendering. `tree_file` stays a path (it used to be rebound
# to the file object) and the handle is closed even if the write fails.
with open(tree_file, 'w+') as tree_out:
    tree_out.write(tree.ascii_art())

# Second nj() call with default options, asking for a string result.
nws = nj(dm, result_constructor=str)
print(nws)

with open(nws_file, 'w+') as nws_file_l:
    nws_file_l.write(nws)

bio_tree = Phylo.read("work/NLP/Trees/output_data.txt", 'newick')
Example #36
0
    log_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    parser.add_argument(
        '--log-level', '-l', default="INFO", choices=log_choices,
        help="Set logging level. Default is info."
    )

    return parser

if __name__ == '__main__':
    # Parse the command line.
    parser = get_argument_parser()
    args = parser.parse_args()

    # Resolve the requested log level by name, falling back to INFO.
    level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(level=level)

    sequences = SequenceCollection.read(args.infile, format=args.format)

    # One worker per CPU only when args.parallel is 0 and there are more than
    # 16 sequences; a single worker otherwise.
    use_all_cores = args.parallel == 0 and len(sequences) > 16
    pool_size = multiprocessing.cpu_count() if use_all_cores else 1

    dmatrix = create_distance_matrix(
        sequences,
        d2.distance,
        pool_size,
        statistic=d2.d2_neighbourhood_dna,
    )
    print(dmatrix)

    # Build the neighbor-joining tree, show it, and write it out.
    phylo_tree = nj(dmatrix)
    print(phylo_tree.ascii_art())
    phylo_tree.write(args.outfile, format=args.target)
Example #37
0
trees_file.close()
print("\n%d/%d trees were < 4." % (count, len(matrix_df.columns.tolist())))




# Missing values become 0.0 before the distances are computed.
matrix_df = matrix_df.fillna(0.0)
sps = matrix_df.index.tolist()
size = len(sps)
dmat = np.zeros((size, size))
# Fill the upper triangle and mirror each value into the lower triangle.
for i in range(size):
    for j in range(i, size):
        dmat[i][j] = distance.euclidean(matrix_df.loc[sps[i]], matrix_df.loc[sps[j]])
        dmat[j][i] = dmat[i][j]


dmat = DistanceMatrix(dmat, sps)
# Root the neighbor-joining tree at its midpoint and drop the 'root' label
# from its string form.
nw = str(nj(dmat).root_at_midpoint()).replace('root','')
# The context manager closes the file even if the write fails.
with open(os.path.join(trees_path, 'nj_tree_euclidean.txt'),'w') as njf:
    njf.write(nw)



all_annotations = pd.read_csv(os.path.join(current_path, 'dataset/all_prostate_cancer_annotations.csv'), index_col=0, header=0)

# Export the annotation rows selected by `up_proteins`.
selected_proteins_annotations = all_annotations.loc[up_proteins]
selected_proteins_annotations.to_csv(os.path.join(current_path, 'dataset/selected_prostate_cancer_annotations_up_regulated.csv'), header=True, index=True)

# Export the annotation rows selected by `down_proteins`.
selected_proteins_annotations = all_annotations.loc[down_proteins]
selected_proteins_annotations.to_csv(os.path.join(current_path, 'dataset/selected_prostate_cancer_annotations_down_regulated.csv'), header=True, index=True)