Example #1
 def __init__(self, dm, link):
     self._nodes = list(dm.index)
     self._newick = None
     if link == "single":
         self._linkage = fastcluster.single(squareform(dm.distance))
     elif link == "average":
         self._linkage = fastcluster.average(squareform(dm.distance))
     else:
         raise AttributeError("Invalid value {} for link in Dendrogram.".format(link))
     self._tree = hierarchy.to_tree(self._linkage, False)
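Example #1 relies on the usual SciPy pattern: a square distance matrix is condensed with squareform, handed to a fastcluster linkage function, and converted into a tree with hierarchy.to_tree. A minimal, self-contained sketch of that pattern (using a made-up 3x3 distance matrix rather than the distance object used above):

import numpy as np
import fastcluster
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

# Hypothetical symmetric distance matrix with a zero diagonal.
dm = np.array([[0.0, 0.2, 0.9],
               [0.2, 0.0, 0.8],
               [0.9, 0.8, 0.0]])

# squareform() condenses the square matrix into the 1-D vector fastcluster expects.
linkage = fastcluster.single(squareform(dm))

# to_tree() turns the linkage matrix into a ClusterNode tree; rd=False returns only the root.
root = hierarchy.to_tree(linkage, rd=False)
print(root.get_count())  # 3 leaves under the root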
Example #2
 def make_tree(self, profile_file, names=None):
     profiles = pd.read_csv(profile_file, sep="\t", index_col=0)
     new_names = {}
     if names:
         for i in names:
             new_names[i] = re.sub(r"_S\d+_L\d{3}[\d\w_-]+", "",
                                   names[i]).replace("-", ".")
     # Fall back to the original column name when no replacement was supplied
     # (avoids a KeyError when names is None or incomplete).
     profiles.columns = [new_names.get(x, x) for x in profiles.columns]
     self._nodes = list(profiles.columns)
     distances = distance_matrix(profiles)
     self._linkage = fastcluster.average(squareform(distances))
     self._tree = hierarchy.to_tree(self._linkage, False)
Example #3
def set_threshold(arr, CLUSTERING='single'):
    print("starting clustering")
    arr = arr.reshape(-1)
    arr = arr[arr > settings.MIN_TH]
    N_CLUSTER = 2
    target_cluster = 1
    print("max, min: ", arr.max(), arr.min())

    # NOTE: iqr() here is assumed to be a helper that returns an index array or
    # boolean mask (scipy.stats.iqr returns a scalar, which would not index arr).
    arr = arr[iqr(arr)]

    if CLUSTERING == 'kmeans':
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=N_CLUSTER,
                        init=np.array([settings.MIN_TH, arr.max()]).reshape(-1, 1))

        labels = kmeans.fit_predict(arr.reshape(-1, 1))
    else:
        import fastcluster
        from scipy.cluster.hierarchy import fcluster
        from scipy.spatial.distance import pdist

        Z = pdist(arr.reshape(-1, 1))
        if CLUSTERING == 'single':
            X = fastcluster.single(Z)
        elif CLUSTERING == 'average':
            X = fastcluster.average(Z)
        elif CLUSTERING == 'centroid':
            X = fastcluster.centroid(Z)
        else:
            return settings.THRESHOLD

        labels = N_CLUSTER - fcluster(X, N_CLUSTER, 'maxclust')

    # setting 0 for the minimum cluster
    # np.ma.masked_array returns only values where the mask is 0
    index = {}
    for i, l in enumerate(labels):
        index[l] = arr[i]
        if len(index.keys()) == N_CLUSTER:
            break

    index = sorted(index.items(), key=lambda kv: kv[1]) # list of tuples sorted by values
    target_label = index[target_cluster - 1][0] # the label of the desired cluster
    th = np.max(arr[np.flatnonzero(labels == target_label)]) # max of the down cluster
    print("found threshold: " + str(th))
    # print(str(np.ma.masked_array(arr, 1 - labels).min()))

    return th
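Example #3 cuts a hierarchical tree into two flat clusters with fcluster(..., 'maxclust') and uses the maximum of the lower cluster as the threshold. A stripped-down sketch of the same idea on made-up 1-D data (no settings module, no IQR filtering):

import numpy as np
import fastcluster
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import pdist

# Hypothetical intensity values with a clear gap between two groups.
arr = np.array([0.11, 0.12, 0.13, 0.80, 0.85, 0.90])

# Cluster the values themselves (as a single-column matrix) with single linkage.
Z = fastcluster.single(pdist(arr.reshape(-1, 1)))
labels = fcluster(Z, 2, criterion='maxclust')

# The threshold is the largest value in the cluster with the smaller mean.
low_label = min(set(labels), key=lambda lab: arr[labels == lab].mean())
threshold = arr[labels == low_label].max()
print(threshold)  # 0.13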
Example #4
def average_linkage_clustering(pairwise_estimates):
    """
    Perform average linkage clustering using ``fastcluster``. The first two
    columns of the output contain the node indices which are joined in each
    step. The input nodes are labeled 0, . . . , N - 1, and the newly generated
    nodes have the labels N, . . . , 2N - 2. The third column contains the
    distance between the two nodes at each step, ie. the current minimal
    distance at the time of the merge. The fourth column counts the number of
    points which comprise each new node.

    :param pairwise_estimates: dictionary with data frames with pairwise
        estimates of Ks, Ka and Ka/Ks (or at least Ks), as returned by
        :py:func:`analyse_family`.
    :return: average linkage clustering as performed with
        ``fastcluster.average``.
    """
    clustering = fastcluster.average(pairwise_estimates)

    return clustering
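The docstring above describes the standard SciPy linkage-matrix layout. A tiny worked example of what fastcluster.average returns for three points (hypothetical condensed distances, not actual Ks estimates):

import numpy as np
import fastcluster

# Condensed distances for 3 points: d(0,1) = 0.1, d(0,2) = 0.9, d(1,2) = 0.8.
dm = np.array([0.1, 0.9, 0.8])
Z = fastcluster.average(dm)
print(Z)
# Each row is [node_i, node_j, distance, size]. Nodes 0 and 1 merge first,
# creating node 3; node 3 then joins node 2 at the average distance
# (0.9 + 0.8) / 2 = 0.85:
# [[0.   1.   0.1  2. ]
#  [2.   3.   0.85 3. ]]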
Example #5
def average_linkage_clustering(pairwise_estimates):
    """
    Perform average linkage clustering using ``fastcluster``. The first two
    columns of the output contain the node indices which are joined in each
    step. The input nodes are labeled 0, . . . , N - 1, and the newly generated
    nodes have the labels N, . . . , 2N - 2. The third column contains the
    distance between the two nodes at each step, ie. the current minimal
    distance at the time of the merge. The fourth column counts the number of
    points which comprise each new node.

    :param pairwise_estimates: dictionary with data frames with pairwise
        estimates of Ks, Ka and Ka/Ks (or at least Ks), as returned by
        :py:func:`analyse_family`.
    :return: average linkage clustering as performed with
        ``fastcluster.average``.
    """
    # fill NaN values with something larger than all the rest, not a foolproof
    # approach, but should be reasonable in most cases
    if np.any(np.isnan(pairwise_estimates)):
        logging.warning("Ks matrix contains NaN values, replaced with 1000")
        pairwise_estimates.fillna(1000, inplace=True)
    clustering = fastcluster.average(pairwise_estimates)

    return clustering
Example #6
def cluster(dataset, new_matrix = None):

    genes, conditions, matrix = dataset
    print 'shape of original matrix:'
    print matrix.shape

    # Get the indices of NaNs in the matrix
    nan_inds = np.isnan(matrix)
    num_nans = np.sum(nan_inds)

    # Replace the NaN values with extremely small noise values
    # (noise has standard deviation of 1 million times less than the data)
    np.random.seed(96857463)
    data_sd = np.nanstd(matrix)
    noise = np.random.randn(num_nans) * data_sd / 1e6
    matrix[nan_inds] = noise

    ## Remove rows and columns that do not have any values - they kill the clustering process!
    ## print 'row_nansum: {}'.format(np.nansum(matrix, axis = 1))
    ## print 'col_nansum: {}'.format(np.nansum(matrix, axis = 0))
    #good_rows = np.nansum(matrix, axis = 1).astype(np.bool)
    #good_cols = np.nansum(matrix, axis = 0).astype(np.bool)

    #print 'number of good rows: {}'.format(np.nansum(good_rows))
    #print 'number of good cols: {}'.format(np.nansum(good_cols))

    #genes = np.array(genes)[good_rows]
    #conditions = np.array(conditions)[good_cols]
    #matrix = matrix[np.ix_(good_rows, good_cols)]

    #print 'shape of good matrix:'
    #print matrix.shape

    num_genes = len(genes)
    num_conds = len(conditions)

    # Compute distance matrices
    cols_dist = pdist(matrix.transpose(), 'cosine')
    rows_dist = pdist(matrix, 'cosine')

    ## Get the names of rows and columns that have NaN dissimilarity values
    #rows_dist_nan_inds_1, rows_dist_nan_inds_2 = [x[np.isnan(rows_dist)] for x in np.triu_indices(matrix.shape[0], 1)]
    #cols_dist_nan_inds_1, cols_dist_nan_inds_2 = [x[np.isnan(cols_dist)] for x in np.triu_indices(matrix.shape[1], 1)]

    #row_names_nan_dist_1, row_names_nan_dist_2 = genes[rows_dist_nan_inds_1], genes[rows_dist_nan_inds_2]
    #col_names_nan_dist_1, col_names_nan_dist_2 = conditions[cols_dist_nan_inds_1], conditions[cols_dist_nan_inds_2]

    ## And print out the rows (strains) and columns (conditions) with NaN dissimilarity values
    #print "Strain pairs with NaN dissimilarity values:"
    #for i, row_name_1 in enumerate(row_names_nan_dist_1):
    #    print row_name_1, row_names_nan_dist_2[i]
    #print ""
    #
    #print "Condition pairs with NaN dissimilarity values:"
    #for i, col_name_1 in enumerate(col_names_nan_dist_1):
    #    print col_name_1, col_names_nan_dist_2[i]
    #print ""

    

    # Cluster the matrix using fastcluster!
    print 'clustering columns...'
    cols_clust_mat = fastcluster.average(cols_dist)
    print 'clustering rows...'
    rows_clust_mat = fastcluster.average(rows_dist)

    # Transform the values in the clustering matrices so they can be used with Bio.Cluster
    for i in range(num_genes - 1):
        if rows_clust_mat[i, 0] > (num_genes - 1):
            rows_clust_mat[i, 0] = -(rows_clust_mat[i, 0] - (num_genes - 1))
        if rows_clust_mat[i, 1] > (num_genes - 1):
            rows_clust_mat[i, 1] = -(rows_clust_mat[i, 1] - (num_genes - 1))


    for i in range(num_conds - 1):
        if cols_clust_mat[i, 0] > (num_conds - 1):
            cols_clust_mat[i, 0] = -(cols_clust_mat[i, 0] - (num_conds - 1))
        if cols_clust_mat[i, 1] > (num_conds - 1):
            cols_clust_mat[i, 1] = -(cols_clust_mat[i, 1] - (num_conds - 1))

    # Turn into lists of nodes
    cols_nodes_list = [Node(int(cols_clust_mat[i, 0]), int(cols_clust_mat[i, 1]), cols_clust_mat[i, 2]) for i in range(cols_clust_mat.shape[0])]
    rows_nodes_list = [Node(int(rows_clust_mat[i, 0]), int(rows_clust_mat[i, 1]), rows_clust_mat[i, 2]) for i in range(rows_clust_mat.shape[0])]

    # Create trees
    cols_tree = Tree(cols_nodes_list)
    rows_tree = Tree(rows_nodes_list)

    # Add the NaNs back into the matrix, so it can be visualized properly
    matrix[nan_inds] = np.nan

    # If a "new_matrix" was specified, that means that we wanted to use the original dataset to
    # get the clustering but then actually use a different matrix for the data. So, at this point
    # we set the variable "matrix" to be the values of "new_matrix"
    if new_matrix is not None:
        matrix = new_matrix

    # Create a giant text string so that the input data can be turned into a "record" object
    row1 = 'ORF\tNAME\tGWEIGHT\t' + '\t'.join(conditions)
    row2 = 'EWEIGHT\t\t\t' + '\t'.join(['1' for i in range(len(conditions))])
    rows_rest = [['' for i in range(len(conditions) + 3)] for j in range(len(genes))]
    for i in range(len(genes)):
        rows_rest[i][0:2] = [genes[i] for j in range(2)]
        rows_rest[i][2] = '1'
        for j in range(len(conditions)):
            rows_rest[i][j+3] = str(matrix[i, j])
    rows_rest_intermed = ['\t'.join(x) for x in rows_rest]
    rows_rest_final = '\n'.join(rows_rest_intermed)
    final_string = '%s\n%s\n%s' % (row1, row2, rows_rest_final)

    # Read in as a "record" object
    handle = StringIO.StringIO(final_string)
    record = Bio.Cluster.read(handle)

    return record, rows_tree, cols_tree
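The index shuffling in Example #6 exists because fastcluster/SciPy label internal nodes N, N+1, ... while Bio.Cluster expects them as -1, -2, ... in order of creation. A minimal sketch of that translation (assuming Biopython's Bio.Cluster is installed; the data here is illustrative):

import numpy as np
import fastcluster
from Bio.Cluster import Node, Tree
from scipy.spatial.distance import pdist

matrix = np.random.rand(4, 3)
Z = fastcluster.average(pdist(matrix, 'cosine'))
n = matrix.shape[0]

def to_bio_index(i, n):
    # Leaves keep their index; the k-th internal node (n + k) becomes -(k + 1).
    return int(i) if i < n else -(int(i) - n + 1)

nodes = [Node(to_bio_index(a, n), to_bio_index(b, n), d) for a, b, d, _ in Z]
tree = Tree(nodes)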
Example #7
import random

import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hcluster
import fastcluster

import dgw
import dgw.data.parsers
import dgw.dtw.parallel

random.seed(42)
np.random.seed(42)

regions = dgw.data.parsers.read_bed('encode_regions_around_tss.bed')
random_regions = regions.loc[random.sample(list(regions.index), 1000)]

data = dgw.read_bam('/Users/saulius/dev/coursework/proj/data/interesting/broad/K562/wgEncodeBroadHistoneK562H3k4me3StdAlnRep1.bam', random_regions)
data = data.to_log_scale()

dm = dgw.dtw.parallel.parallel_pdist(data)

single = fastcluster.single(dm)
complete = fastcluster.complete(dm)
average = fastcluster.average(dm)

hcluster.dendrogram(single, no_labels=True, color_threshold=0)
plt.title('Single linkage')
# plt.savefig('single.pdf')
# plt.close('all')
#
# hcluster.dendrogram(complete, no_labels=True, color_threshold=0)
# plt.title('Complete linkage')
# plt.savefig('complete.pdf')
# plt.close('all')
#
# hcluster.dendrogram(average, no_labels=True, color_threshold=0)
# plt.title('Average linkage')
# plt.savefig('average.pdf')
# plt.close('all')
Example #8
def cluster_weights(
    X: "npt.ArrayLike",
    y: "npt.ArrayLike",
    grouping: "Optional[npt.ArrayLike]" = None,
) -> np.ndarray:
    """ Compute clusters on the X values based on Manhattan distance, then
    weight by cluster size.

    This function ignores information in the y-values.

    Examples:

    >>> import numpy as np
    >>> from selectml.sk.weighting import cluster_weights
    >>> from selectml.data import basic
    >>> X, y, indivs = basic()
    >>> cluster_weights(X, y)
    array([4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
           4., 4., 4., 4., 4., 4., 4., 4.])
    """

    from fastcluster import average
    from scipy.cluster.hierarchy import cut_tree
    from scipy.cluster.hierarchy import cophenet
    from scipy.spatial.distance import pdist, squareform

    X_ = np.array(X)

    x = pd.DataFrame({
        "index": np.arange(X_.shape[0]),
        "genotypes": np.apply_along_axis(
            lambda z: "".join(str(z_i) for z_i in z), 1, X_)
    })
    firsts = pd.DataFrame(X_).groupby(x["genotypes"]).first()
    groups = (
        x
        .groupby("genotypes")["index"]
        .unique()
        .apply(pd.Series)
        .unstack()
        .reset_index(level=0, drop=True)
        .reset_index()
        .rename(columns={0: "index"})
    )

    dist = pdist(firsts.values, "cityblock")
    hier = average(dist)
    coph = squareform(cophenet(hier))

    height = np.percentile(coph[coph > 0], 0.5)
    clusters = pd.DataFrame({
        "genotypes": firsts.index.values,
        "clusters": cut_tree(hier, height=height)[:, 0]
    })
    clusters = (
        pd.merge(groups, clusters, left_on="genotypes", right_on="genotypes")
        .drop(columns="genotypes")
    )

    cluster_counts = (
        clusters.groupby("clusters").count()["index"]
        .apply(lambda x: (clusters.shape[0] - x) / x)
        .reset_index()
        .rename(columns={"index": "weight"})
    )

    clusters = pd.merge(
        clusters,
        cluster_counts,
        on="clusters"
    ).set_index("index")
    clusters = clusters.loc[np.arange(X_.shape[0]), "weight"]

    return clusters.values
Example #9
                kmerdist[total:total+position] = temp[r][remaining:]
                total+=position
                position-=1
                remaining+=1
            del(kmerdist)
            gc.collect()
        kmerdist=np.memmap(filename,dtype='float32',mode='r')
    else:
        print("Writing distance matrix to disk. {}".format(time.asctime()))
        kmerdist[:] = pdist(final,'hamming')
else:
    # if neither memory-saving strategy is selected
    kmerdist = pdist(final,'hamming')

print("Building kmer tree using average linkage with an average number of allowed based of: {} {}".format(degen_base_num,time.asctime()))
Z = fastcluster.average(kmerdist)
kmer_length=final.shape[1]
maxdist=round((degen_base_num/kmer_length), 2)
clusters = fcluster(Z,maxdist,criterion='distance')
myclusters = {key:[] for key in set(clusters)}
for index, clust in enumerate(clusters):
    myclusters[clust].append(index)

clustergroups = []
for amp in Counter(clusters).keys():
    clustergroups.append(final.iloc[myclusters[amp]])

print("Building alignments for kmer motifs. {}".format(time.asctime()))
#group resulting clusters into de facto alignment objects
alignments = []
for c in clustergroups: