Example #1
0
 def __init__(self, dm, link):
     """Build the linkage and tree for a distance matrix.

     Args:
         dm: Object exposing ``index`` (iterated to obtain the node list)
             and ``distance`` (passed through ``squareform`` before
             clustering).
         link: Linkage method, either ``"single"`` or ``"average"``.

     Raises:
         AttributeError: If ``link`` is not a supported method.
     """
     self._nodes = list(dm.index)
     # Starts unset; presumably populated on demand elsewhere in the class.
     self._newick = None

     # Guard clause: reject unknown methods before touching the distances.
     if link not in ("single", "average"):
         raise AttributeError("Invalid value {} for link in Dendrogram.".format(link))

     build_linkage = fastcluster.single if link == "single" else fastcluster.average
     self._linkage = build_linkage(squareform(dm.distance))
     self._tree = hierarchy.to_tree(self._linkage, False)
def _rsl_large_kdtree_fastcluster(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2):
    """Cluster ``X`` through a single-linkage tree over mutual reachability.

    The mutual reachability is obtained from
    ``kdtree_pdist_mutual_reachability(X, metric, p, k, alpha)``, linked
    with ``single`` and wrapped in a ``SingleLinkageTree``.

    Args:
        X: Input data forwarded to the mutual-reachability helper.
        cut: Forwarded to ``SingleLinkageTree.get_clusters``.
        k: Forwarded to the mutual-reachability helper.
        alpha: Forwarded to the mutual-reachability helper.
        gamma: Forwarded to ``SingleLinkageTree.get_clusters``.
        metric: Metric name forwarded to the mutual-reachability helper.
        p: Metric parameter; ``None`` is replaced by 2.

    Returns:
        Tuple ``(labels, single_linkage_tree)``.
    """
    minkowski_p = 2 if p is None else p

    reachability = kdtree_pdist_mutual_reachability(X, metric,
                                                    minkowski_p, k, alpha)
    tree = SingleLinkageTree(single(reachability))

    return tree.get_clusters(cut, gamma), tree
Example #3
0
def set_threshold(arr, CLUSTERING='single'):
    """Derive a threshold by splitting the values of ``arr`` into two clusters.

    The values are flattened, restricted to those greater than
    ``settings.MIN_TH``, indexed by the result of ``iqr`` and then split
    into two clusters. The threshold is the largest value in the cluster
    whose representative value is the smaller of the two.

    Args:
        arr: NumPy array of values; it is flattened internally.
        CLUSTERING: ``'kmeans'`` to use scikit-learn's ``KMeans``, or one of
            ``'single'``, ``'average'``, ``'centroid'`` to use the matching
            ``fastcluster`` linkage.

    Returns:
        The computed threshold, or ``settings.THRESHOLD`` when
        ``CLUSTERING`` names no supported method.
    """
    print("starting clustering")
    arr = arr.reshape(-1)
    arr = arr[arr > settings.MIN_TH]
    N_CLUSTER = 2
    target_cluster = 1
    print("max, min: ", arr.max(), arr.min())

    # NOTE(review): iqr() presumably returns a mask or index array that
    # keeps the inliers -- confirm against its definition.
    arr = arr[iqr(arr)]

    if CLUSTERING == 'kmeans':
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=N_CLUSTER,
                        init=np.array([settings.MIN_TH, arr.max()]).reshape(-1, 1))

        labels = kmeans.fit_predict(arr.reshape(-1, 1))
    else:
        import fastcluster
        from scipy.cluster.hierarchy import fcluster
        from scipy.spatial.distance import pdist

        # Resolve the linkage first so that an unsupported name returns the
        # fallback without paying for the O(n^2) pairwise distances.
        if CLUSTERING == 'single':
            linkage = fastcluster.single
        elif CLUSTERING == 'average':
            linkage = fastcluster.average
        elif CLUSTERING == 'centroid':
            linkage = fastcluster.centroid
        else:
            return settings.THRESHOLD

        merges = linkage(pdist(arr.reshape(-1, 1)))

        # Shift the flat-cluster ids so they are expressed relative to
        # N_CLUSTER, as the rest of the function expects.
        labels = N_CLUSTER - fcluster(merges, N_CLUSTER, 'maxclust')

    # Record one representative value per label, stopping as soon as every
    # label has been seen.
    representatives = {}
    for label, value in zip(labels, arr):
        representatives[label] = value
        if len(representatives) == N_CLUSTER:
            break

    # (label, value) pairs ordered by value: the first entry is the lower cluster.
    ordered = sorted(representatives.items(), key=lambda kv: kv[1])
    target_label = ordered[target_cluster - 1][0]  # the label of the desired cluster
    th = np.max(arr[np.flatnonzero(labels == target_label)])  # max of the lower cluster
    print("found threshold: " + str(th))

    return th
Example #4
0
def _hdbscan_large_kdtree_fastcluster(X, min_cluster_size=5, min_samples=None,
                                      metric='minkowski', p=2):
    """Label ``X`` from a condensed single-linkage tree over mutual reachability.

    Args:
        X: Input data; ``X.shape[0]`` sets the length of the label array.
        min_cluster_size: Forwarded to ``condense_tree``.
        min_samples: Forwarded to the mutual-reachability helper.
        metric: Metric name forwarded to the mutual-reachability helper.
        p: Metric parameter; ``None`` is replaced by 2.

    Returns:
        Tuple ``(labels, condensed_tree, single_linkage_tree, None)``.
        Entries of ``labels`` not covered by any selected cluster stay -1.
    """
    p = 2 if p is None else p

    reachability = kdtree_pdist_mutual_reachability(X, metric, p, min_samples)

    linkage_tree = single(reachability)
    condensed = condense_tree(linkage_tree, min_cluster_size)
    stability = compute_stability(condensed)
    selected = get_clusters(condensed, stability)

    # Start with every point unassigned (-1), then stamp each cluster's id.
    labels = -np.ones(X.shape[0], dtype=int)
    for cluster_id, members in enumerate(selected):
        labels[members] = cluster_id
    return labels, condensed, linkage_tree, None
Example #5
0
def _rsl_large_kdtree_fastcluster(X,
                                  cut,
                                  k=5,
                                  alpha=1.4142135623730951,
                                  gamma=5,
                                  metric='minkowski',
                                  p=2):
    """Return cluster labels and the single-linkage tree built for ``X``.

    ``kdtree_pdist_mutual_reachability`` supplies the mutual reachability,
    ``single`` links it, and the result is wrapped in a
    ``SingleLinkageTree`` whose ``get_clusters(cut, gamma)`` gives the
    labels.

    Args:
        X: Input data forwarded to the mutual-reachability helper.
        cut: Forwarded to ``get_clusters``.
        k: Forwarded to the mutual-reachability helper.
        alpha: Forwarded to the mutual-reachability helper.
        gamma: Forwarded to ``get_clusters``.
        metric: Metric name forwarded to the mutual-reachability helper.
        p: Metric parameter; defaults to 2 when given as ``None``.

    Returns:
        Tuple ``(labels, single_linkage_tree)``.
    """
    p = p if p is not None else 2

    tree = SingleLinkageTree(
        single(kdtree_pdist_mutual_reachability(X, metric, p, k, alpha)))

    cluster_labels = tree.get_clusters(cut, gamma)
    return cluster_labels, tree
Example #6
0
def _hdbscan_large_kdtree_fastcluster(X, min_cluster_size=5, min_samples=None, alpha=1.0,
                                      metric='minkowski', p=2, gen_min_span_tree=False):
    """Label ``X`` and attach per-point probabilities from a condensed tree.

    Args:
        X: Input data; ``X.shape[0]`` sets the length of the output arrays.
        min_cluster_size: Forwarded to ``condense_tree``.
        min_samples: Forwarded to the mutual-reachability helper.
        alpha: Forwarded to the mutual-reachability helper.
        metric: Metric name forwarded to the mutual-reachability helper.
        p: Metric parameter; ``None`` is replaced by 2.
        gen_min_span_tree: Accepted but not used by this implementation.

    Returns:
        Tuple ``(labels, probabilities, condensed_tree,
        single_linkage_tree, None)``. Points outside every selected
        cluster keep label -1 and probability 0.
    """
    p = 2 if p is None else p

    reachability = kdtree_pdist_mutual_reachability(X, metric,
                                                    p, min_samples, alpha)

    linkage_tree = single(reachability)
    condensed = condense_tree(linkage_tree, min_cluster_size)
    selected = get_clusters(condensed, compute_stability(condensed))

    n_points = X.shape[0]
    labels = -np.ones(n_points, dtype=int)
    probabilities = np.zeros(n_points, dtype=float)
    # Each entry pairs a cluster's member indices with its probabilities.
    for cluster_id, (members, member_prob) in enumerate(selected):
        labels[members] = cluster_id
        probabilities[members] = member_prob
    return labels, probabilities, condensed, linkage_tree, None
Example #7
0
def _hdbscan_large_kdtree_fastcluster(X,
                                      min_cluster_size=5,
                                      min_samples=None,
                                      metric='minkowski',
                                      p=2):
    """Assign cluster labels to ``X`` via a condensed single-linkage tree.

    Args:
        X: Input data; ``X.shape[0]`` sets the length of the label array.
        min_cluster_size: Forwarded to ``condense_tree``.
        min_samples: Forwarded to the mutual-reachability helper.
        metric: Metric name forwarded to the mutual-reachability helper.
        p: Metric parameter; defaults to 2 when given as ``None``.

    Returns:
        Tuple ``(labels, condensed_tree, single_linkage_tree, None)``,
        where unassigned points carry the label -1.
    """
    p = p if p is not None else 2

    slt = single(kdtree_pdist_mutual_reachability(X, metric, p, min_samples))
    condensed = condense_tree(slt, min_cluster_size)
    clusters = get_clusters(condensed, compute_stability(condensed))

    # Everything is unassigned (-1) until a selected cluster claims it.
    labels = -np.ones(X.shape[0], dtype=int)
    for label, members in enumerate(clusters):
        labels[members] = label
    return labels, condensed, slt, None
Example #8
0
def main():
    """Single-linkage cluster a CSV of points and write the result as CSV.

    The input path is read from ``sys.argv[1]`` and the output path from
    ``sys.argv[2]``. When the input holds no data, an empty output file is
    written.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]

    data = np.genfromtxt(infile, delimiter=',')
    print('Received {} points, clustering...'.format(data.shape[0]))

    if data.size > 0:
        clusters = fastcluster.single(data)
        print('Finished clustering')
    else:
        clusters = []
        print('Insufficient data to cluster')

    # Form of the output: an (N-1)*4 matrix where each row is the 2 joined
    # indices along with the distance and number of points.
    with open(outfile, 'w') as f:
        # Join once instead of growing a string row by row.
        f.write(''.join(','.join(map(str, row)) + '\n' for row in clusters))
Example #9
0
def _hdbscan_large_kdtree_fastcluster(X,
                                      min_cluster_size=5,
                                      min_samples=None,
                                      alpha=1.0,
                                      metric='minkowski',
                                      p=2,
                                      gen_min_span_tree=False):
    """Compute labels and probabilities for ``X`` from a condensed tree.

    Args:
        X: Input data; ``X.shape[0]`` sets the length of the output arrays.
        min_cluster_size: Forwarded to ``condense_tree``.
        min_samples: Forwarded to the mutual-reachability helper.
        alpha: Forwarded to the mutual-reachability helper.
        metric: Metric name forwarded to the mutual-reachability helper.
        p: Metric parameter; defaults to 2 when given as ``None``.
        gen_min_span_tree: Accepted but not used by this implementation.

    Returns:
        Tuple ``(labels, probabilities, condensed_tree,
        single_linkage_tree, None)``. Unassigned points have label -1 and
        probability 0.
    """
    p = p if p is not None else 2

    slt = single(
        kdtree_pdist_mutual_reachability(X, metric, p, min_samples, alpha))
    condensed = condense_tree(slt, min_cluster_size)
    stability = compute_stability(condensed)
    clusters = get_clusters(condensed, stability)

    size = X.shape[0]
    labels = -np.ones(size, dtype=int)
    probabilities = np.zeros(size, dtype=float)
    # Unpack each (member indices, probabilities) pair and stamp both arrays.
    for label, (members, member_prob) in enumerate(clusters):
        labels[members] = label
        probabilities[members] = member_prob
    return labels, probabilities, condensed, slt, None
Example #10
0
# id_series = pd.Series(url_list)
# cluster_series = pd.Series(km.labels_)
# results = (pd.concat([id_series,cluster_series], axis=1))
# results.columns = ['id', 'cluster']
# results.to_csv("clustering_f.txt", sep=',', columns=['id', 'cluster'], header=False, index=False, encoding='utf-8')
# print("Time taken for storing results of flat clustering: ", time.time() - start_time)

# Apply Hierarchical Clustering (Single link)
dist = 1 - cosine_similarity(X)
print("Time taken for computing cosine similarity: ", time.time() - start_time)

# NOTE(review): `dist` is a square matrix; passed as a 2-D array it is
# presumably treated as observation vectors (Euclidean distance between its
# rows) rather than as precomputed distances -- confirm this is intended.
agg_d = fastcluster.linkage(dist, method='single', metric='euclidean')
print("Time taken for single linkage: ", time.time() - start_time)

fig, ax = plt.subplots()
# `agg_d` is already the linkage matrix, so plot it directly. Re-clustering
# its rows would describe one item fewer than `url_list` provides labels for.
# Note that `ax` is rebound here to the dict returned by dendrogram().
ax = dendrogram(agg_d, orientation="right", labels=url_list)
print("Time taken for applying hierarchical clustering: ", time.time() - start_time)

# Get labels
ward_key = ax["ivl"]
# Map every distinct colour to a 1-based cluster id, in sorted colour order.
ward_dict = {color: rank + 1 for rank, color in enumerate(sorted(set(ax["color_list"])))}
ward_value = [ward_dict[color] for color in ax["color_list"]]
print("Time taken for getting labels: ", time.time() - start_time)

# Store hierarchical clustering results in a file
# NOTE(review): "ivl" has one entry per leaf while "color_list" has one entry
# per link, so the two series presumably differ in length -- confirm.
ward_cluster_series = pd.Series(ward_value)
ward_id_series = pd.Series(ward_key)
ward_results = (pd.concat([ward_id_series,ward_cluster_series], axis=1))
import fastcluster
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hcluster

# Fix both RNGs so the sampled regions are reproducible.
random.seed(42)
np.random.seed(42)

regions = dgw.data.parsers.read_bed('encode_regions_around_tss.bed')
# random.sample() needs a real sequence on Python 3, so materialise the index
# first. The sampled values are index labels, so select them with `.loc`;
# the `.ix` indexer was removed in pandas 1.0.
random_regions = regions.loc[random.sample(list(regions.index), 1000)]

data = dgw.read_bam('/Users/saulius/dev/coursework/proj/data/interesting/broad/K562/wgEncodeBroadHistoneK562H3k4me3StdAlnRep1.bam', random_regions)
data = data.to_log_scale()

# Pairwise distance matrix shared by all three linkages below.
dm = dgw.dtw.parallel.parallel_pdist(data)

single = fastcluster.single(dm)
complete = fastcluster.complete(dm)
average = fastcluster.average(dm)

hcluster.dendrogram(single, no_labels=True, color_threshold=0)
plt.title('Single linkage')
# plt.savefig('single.pdf')
# plt.close('all')
#
# hcluster.dendrogram(complete, no_labels=True, color_threshold=0)
# plt.title('Complete linkage')
# plt.savefig('complete.pdf')
# plt.close('all')
#
# hcluster.dendrogram(average, no_labels=True, color_threshold=0)
# plt.title('Average linkage')