def time_subcluster(self, locs):
    # Getting subclusters at Mapzen's limit
    cluster_linkage = linkage(locs, method='ward')
    clusters = fcluster(cluster_linkage, 50, criterion='maxclust')
    cluster_means = np.array([np.mean(locs[np.where(clusters == i)], axis=0)
                              for i in range(1, 51)])
    mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
    mapzen_matrix = self.mapzen_matrix(mapzen_locs)

    # Cluster labels used for mapping back together
    # Subtracting one to use 0 index
    cl = clusters - 1

    # Get a matching distance matrix of lat/lon distance, get ratios
    cluster_km_dist = squareform(pdist(cluster_means,
                                       (lambda u, v: haversine(u, v))))
    dist_ratio_matrix = np.nan_to_num(np.divide(mapzen_matrix, cluster_km_dist))
    # Divide items by mean to normalize a bit
    dist_ratio_matrix = np.nan_to_num(np.divide(dist_ratio_matrix,
                                                dist_ratio_matrix.mean()))

    locs_km_dist = squareform(pdist(locs, (lambda u, v: haversine(u, v))))

    # Iterate through each, updating by ratio in dist_ratio_matrix
    it = np.nditer(locs_km_dist, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        it[0] = it[0] * dist_ratio_matrix[cl[it.multi_index[0]]][cl[it.multi_index[1]]]
        it.iternext()

    return locs_km_dist
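# Hedged usage sketch for time_subcluster, assuming it is a method on some
# routing class whose mapzen_matrix() returns a pairwise time/distance matrix
# for up to 50 points; the Router name below is hypothetical, and locs is
# assumed to hold (lon, lat) pairs given how cluster_means is unpacked above.
import numpy as np

locs = np.column_stack([np.random.uniform(-122.5, -122.3, 200),   # lon
                        np.random.uniform(37.7, 37.8, 200)])      # lat
router = Router()  # hypothetical owner of time_subcluster / mapzen_matrix
adjusted = router.time_subcluster(locs)  # 200x200 ratio-adjusted distances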
def writeClusters(results):
    threshold = 0.9

    results = numpy.fromiter(results, dtype=[('pairs', 'i8', 2),
                                             ('score', 'f4', 1)])

    i_to_id, condensed_distances, N = condensedDistance(results)
    linkages = fastcluster.linkage(condensed_distances, method='ward')
    partition = hcluster.fcluster(linkages, threshold, criterion='inconsistent')

    clusters = {}
    for (i, cluster_id) in enumerate(partition):
        clusters.setdefault(cluster_id, []).append(i_to_id[i])

    i = 0
    for cluster in clusters.values():
        for index in cluster:
            image_name = all_images[index]
            image_path = os.path.join(imagedir, image_name)
            cluster_path = 'clustered_images/{0}'.format(str(i))
            # There must be a better way to do this
            try:
                os.mkdir(cluster_path)
            except OSError:
                for f in os.listdir(cluster_path):
                    try:
                        # join with the directory; a bare filename would be
                        # resolved against the working directory instead
                        os.remove(os.path.join(cluster_path, f))
                    except OSError:
                        pass
            print('writing %s' % image_name)
            with open(image_path, 'rb') as inp:
                with open(os.path.join('clustered_images', str(i), image_name), 'wb') as outp:
                    outp.write(inp.read())
        i += 1
def time_series_clusters(Y, ct=0.5, return_clusters=False):
    D = pdist(transpose(Y), 'correlation')
    D = abs(D)
    if return_clusters:
        L = linkage(D, method='single', metric='cosine')
        C = fcluster(L, ct, criterion='distance')
        return cluster_sets(C)
    plot_clusters(D, ct)
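# Minimal sketch of calling time_series_clusters on a (samples x series)
# matrix; pdist/linkage/fcluster are assumed to be the scipy functions and
# cluster_sets/plot_clusters module helpers, as the body above implies.
import numpy as np

Y = np.random.randn(500, 12)  # 12 series, 500 samples each
groups = time_series_clusters(Y, ct=0.4, return_clusters=True)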
def __call__(self):
    # Can continue to play around with these
    self.cluster_linkage = linkage(self.point_arr, method='ward')
    self.clusters = fcluster(self.cluster_linkage, self.num_clusters,
                             criterion='maxclust')
    # Attach a group label to each location dict
    for loc, group in zip(self.locations, self.clusters.tolist()):
        loc.update({'group': group})
    return self.locations
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            (i_to_id, condensed_distances) = condensedDistance(sub_graph)
            N = max(i_to_id) + 1

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            clusters = {}
            for (i, sub_cluster_id) in enumerate(partition):
                clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

            cophenetic_distances = hcluster.cophenet(linkage)

            for cluster_id, items in clusters.items():
                if len(items) > 1:
                    score = clusterConfidence(items, cophenetic_distances, N)
                    clustering[cluster_id] = (tuple(i_to_id[item]
                                                    for item in items),
                                              1 - score)

            cluster_id += max(partition) + 1
        else:
            ids, score = sub_graph[0]
            clustering[cluster_id] = tuple(ids), score
            cluster_id += 1

    return clustering.values()
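# Hedged usage sketch for the dedupe-style cluster() variants in this file:
# scores are match probabilities, and the functions cut the tree at
# 1 - threshold because fcluster's 'distance' criterion works on
# dissimilarity. The helpers (connected_components, condensedDistance,
# clusterConfidence/confidences) are assumed to come from the surrounding
# module, and the exact dupes container they expect is an assumption here.
import numpy

pairs = [((1, 2), 0.86), ((2, 3), 0.91), ((4, 5), 0.75)]
dupes = numpy.fromiter(pairs, dtype=[('pairs', 'i4', 2), ('score', 'f4')])
for record_ids, score in cluster(dupes, threshold=0.5):
    print(record_ids, score)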
def cluster(dupes, threshold=.5):
    '''
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in dupes)

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))

            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id, []).append(i_to_id[i])

            cluster_id += max(partition)
        else:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    return clusters
def cluster(dupes, threshold=.5):
    """
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    """
    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from(((x[0], x[1], y) for x, y in dupes))
    del dupes

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))

            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id, []).append(i_to_id[i])

            cluster_id += max(partition)
        else:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    return clusters
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensedDistance(sub_graph)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)

            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            clusters = {}
            for i, partition_id in enumerate(partition):
                clusters.setdefault(partition_id, []).append(i)

            for items in viewvalues(clusters):
                if len(items) > 1:
                    items = tuple(items)
                    scores = confidences(items, condensed_distances, N)
                    clustering[cluster_id] = (tuple(i_to_id[i] for i in items),
                                              scores)
                    cluster_id += 1
        else:
            ids, score = sub_graph[0]
            clustering[cluster_id] = (tuple(ids), tuple([score] * 2))
            cluster_id += 1

    return clustering.values()
def cluster_analysis_hcluster(self, vectors):
    from hcluster import linkage, fcluster
    import numpy

    params = self.params.multiple_lattice_search.cluster_analysis.hcluster
    X = numpy.array(vectors)
    linkage_method = params.linkage.method
    linkage_metric = params.linkage.metric
    criterion = params.cutoff_criterion
    Z = linkage(X, method=linkage_method, metric=linkage_metric)
    cutoff = params.cutoff
    i_cluster = fcluster(Z, cutoff, criterion=criterion)
    i_cluster = flex.int(i_cluster.astype(numpy.int32))
    return i_cluster
def wavelet_clusters(Y, ct=0.5, weights=False, return_clusters=False, swt=False):
    if weights:
        D = abs(c_dists(Y, level_weights=True, use_swt=False))
        Dr = []
        for i in range(D.shape[0] - 1):
            Dr += list(D[i, i + 1:])
    else:
        Dr = c_dists(Y, use_swt=swt)
    if return_clusters:
        L = linkage(Dr, method='single', metric='cosine')
        C = fcluster(L, ct, criterion='distance')
        return cluster_sets(C)
    plot_clusters(Dr, ct)
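# Analogous sketch for wavelet_clusters; the shape c_dists expects for Y is an
# assumption (a set of equal-length signals), and cluster_sets is the same
# assumed module helper as above.
import numpy as np

Y = np.random.randn(256, 8)  # 8 signals of length 256, layout assumed
sets = wavelet_clusters(Y, ct=0.6, return_clusters=True)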
def cluster_struct_affinities(affinities, ctype='mlpy', k=None):
    '''
    Return a list of cluster memberships in the form of an N-array having
    k unique elements.
    '''
    if ctype == 'hcluster':
        # fcluster expects a linkage matrix, not raw affinities, and takes no
        # 'method' argument; build the complete-linkage tree first
        Z = hcluster.linkage(affinities, method='complete')
        return hcluster.fcluster(Z, 1.1, criterion='inconsistent')
    elif ctype == 'mlpy':
        import mlpy
        HC = mlpy.HCluster(method='euclidean', link='complete')
        clusts = HC.compute(affinities)
        cut = HC.cut(HC.heights[-k])
        return cut
    else:
        raise Exception()
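# Sketch of the hcluster branch on a scipy-style condensed distance vector;
# the 1.1 inconsistency cutoff is inherited from the function above.
import numpy as np
from scipy.spatial.distance import pdist

points = np.random.randn(20, 3)
memberships = cluster_struct_affinities(pdist(points), ctype='hcluster')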
def cluster(dupes: numpy.ndarray,
            threshold: float = .5,
            max_components: int = 30000) -> Clusters:
    '''
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    distance_threshold = 1 - threshold
    dupe_sub_graphs = connected_components(dupes, max_components)

    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensedDistance(sub_graph)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)

            partition = hcluster.fcluster(linkage,
                                          distance_threshold,
                                          criterion='distance')

            clusters: Dict[int, List[int]] = defaultdict(list)

            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)

            squared_distances = condensed_distances**2
            for cluster in clusters.values():
                if len(cluster) > 1:
                    scores = confidences(cluster, squared_distances, N)
                    yield tuple(i_to_id[i] for i in cluster), scores
        else:
            (ids, score), = sub_graph
            if score > threshold:
                yield tuple(ids), (score,) * 2
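# The annotated cluster() variants are generators: nothing runs until the
# caller iterates. A hedged consumption sketch (scored_pairs stands in for a
# real structured array of id pairs and match probabilities, as above):
for record_ids, confidence_scores in cluster(scored_pairs, threshold=0.5):
    print(record_ids, confidence_scores)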
def cluster(self):
    """Cluster strokes"""
    # the purpose of this step is to cluster strokes using
    # the previously calculated distance matrix
    matrix = numpy.load(self.DTW_DATA)
    Y = hcluster.squareform(matrix)
    Z = hcluster.linkage(Y, method=self.CLUSTERING_METHOD)
    T = hcluster.fcluster(Z, 1.15)
    clusters = self.get_cluster_dict_from_array(T)
    if self.verbose:
        self.print_clusters(clusters)
    if not os.path.exists(self.CLUSTER_ROOT):
        os.makedirs(self.CLUSTER_ROOT)
    # pickle needs a binary-mode file handle
    with open(self.CLUSTER_DATA, "wb") as f:
        pickle.dump(clusters, f)
def hierarchical_cluster(clusters, threshold):
    threshold = 1 - threshold
    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]
    # lclassifier.predict_proba(distances)[0][1] > threshold
    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in clusters)
    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_scores = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((sorted(x[0:2]), x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))
            pairs = np.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id, []).append(i_to_id[i])

            cluster_id += max(partition)
        elif len(sub_graph) == 2:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) >= 2]

    return (clusters, cluster_scores)
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    distance_threshold = 1 - threshold
    dupe_sub_graphs = connected_components(dupes, max_components)

    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensedDistance(sub_graph)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)

            partition = hcluster.fcluster(linkage, distance_threshold,
                                          criterion='distance')

            clusters = defaultdict(list)
            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)

            for cluster in viewvalues(clusters):
                if len(cluster) > 1:
                    scores = confidences(cluster, condensed_distances, N)
                    yield tuple(i_to_id[i] for i in cluster), scores
        else:
            (ids, score), = sub_graph
            if score > threshold:
                yield tuple(ids), (score,) * 2
def cluster(dupes, threshold=.5):
    """
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    """
    (i_to_id, condensed_distances) = condensedDistance(dupes)
    linkage = fastcluster.linkage(numpy.array(condensed_distances),
                                  method='centroid')
    partition = hcluster.fcluster(linkage, threshold)

    clustering = {}
    for (i, cluster_id) in enumerate(partition):
        clustering.setdefault(cluster_id, []).append(i_to_id[i])

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    return clusters
import hcluster

# For using with published example:
# url = "http://examples.obspy.org/dissimilarities.pkl"
# dissimilarity = pickle.load(urllib.urlopen(url))
dissimilarity = pickle.load(open("dissimilarities.pkl", "rb"))

# Storing local pickle object:
# dis_temp = pickle.dumps(dissimilarity)
# pickle.dump(dissimilarity, open("dissimilarities.pkl", "wb"))

plt.subplot(121)
plt.imshow(dissimilarity, interpolation="nearest")

dissimilarity = hcluster.squareform(dissimilarity)
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.show()

# (see print(hcluster.__doc__) for the library's reference table of
# flat-clustering functions)
def maxclust_dists(dists, k, method='complete'):
    d2 = hcluster.squareform(dists)
    Z = hcluster.linkage(d2, method=method)
    fcl = hcluster.fcluster(Z, t=k, criterion='maxclust')
    return fcl
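# Hedged usage sketch for maxclust_dists: start from a square symmetric
# distance matrix and ask for exactly k flat clusters.
import numpy as np
from scipy.spatial.distance import pdist, squareform

pts = np.random.rand(30, 2)
dists = squareform(pdist(pts))       # square form, as maxclust_dists expects
labels = maxclust_dists(dists, k=4)  # array of cluster ids 1..4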
users = pd.read_csv('data/mindmatch_example.csv').to_dict(orient='records')
n_users = len(users)
print('Number of registered users: {}'.format(n_users))

users_df = pd.DataFrame(users).fillna('')
users_dict = {r['user_id']: dict(r) for _, r in users_df.iterrows()}  # map of user id to details

persons_1 = list(map(preprocess, list(users_df['abstracts'])))
persons_2 = list(map(preprocess, list(users_df['abstracts'])))

A = affinity_computation(persons_1, persons_2,
                         n_components=30, min_df=2, max_df=0.8,
                         weighting='tfidf', projection='svd')

cois_list = compute_conflicts(users_df)
for i, j in cois_list:
    A[i, j] = -1

A_cluster = -A
A_cluster[A_cluster == 1000] = 1
A_rand = np.random.randn(n_users, n_users) * 0.01 * A_cluster.var()  # add randomness
z = linkage(A_cluster + A_rand,
            method='average', metric='euclidean', optimal_ordering=True)
cluster = hcluster.fcluster(z, t=0.01, criterion='distance')  # distance

# assumed to be users_df; the original referenced an otherwise
# undefined users_group_df
users_group_df = users_df
users_group_df['cluster'] = cluster
users_sorted_df = users_group_df.sort_values('cluster')
cluster_numbers = generate_pod_numbers(n_users=len(users_sorted_df), n_per_group=5)
users_sorted_df['cluster'] = cluster_numbers
users_sorted_df.to_csv('group_matching_users.csv', index=False)
import hcluster
import matplotlib.pyplot as plt
import pickle
from urllib.request import urlopen  # Python 3 replacement for urllib.urlopen

url = "http://examples.obspy.org/dissimilarities.pkl"
dissimilarity = pickle.load(urlopen(url))

plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

dissimilarity = hcluster.squareform(dissimilarity)
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
def fclusters(Z, seed_pts, nclusters):
    flats = hcluster.fcluster(Z, t=nclusters, criterion='maxclust')
    clusters = [set() for _ in range(max(flats))]
    for idx, f in enumerate(flats):
        clusters[f - 1].add(seed_pts[idx])
    return clusters
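# Sketch of driving fclusters from a linkage matrix; seed_pts is whatever
# sequence of point objects the flat cluster ids should be mapped back onto,
# and hcluster is assumed to expose the scipy.cluster.hierarchy API.
import numpy as np

seed_pts = [tuple(p) for p in np.random.rand(25, 2)]
Z = hcluster.linkage(np.array(seed_pts), method='ward')
groups = fclusters(Z, seed_pts, nclusters=5)  # list of 5 sets of seed points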
def cluster(dupes: numpy.ndarray,
            cluster_threshold: float = 0.5,
            max_components: int = 30000,
            id_to_match: Optional[str] = None) -> Clusters:
    """
    Takes in a list of duplicate pairs and clusters them into a list of
    records that all refer to the same entity based on a given threshold

    `https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html`

    Args:
        dupes: (np.array)[tuple(list[str], float)]
            A list of tuples, where each tuple contains an id pair and a
            probability that they are a match:
                id_pair_tuple: ([record_id_1, record_id_2], prob)
                dtype: np.dtype([('pairs', '<U256', 2), ('score', 'f4', 1)])
        cluster_threshold: (float) number between 0 and 1 (default is .5).
            Lowering the number will increase precision, raising it will
            increase recall
    """
    distance_threshold = cluster_threshold
    score_threshold = 1 - cluster_threshold
    dupe_sub_graphs = connected_components(dupes, max_components)
    # logger.info(f"Dupes: {dupes}")

    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensed_distance(sub_graph)
            logger.debug(f"{condensed_distances}")
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)
            partition = hcluster.fcluster(linkage, distance_threshold,
                                          criterion='distance')

            clusters: Dict[int, List[int]] = defaultdict(list)
            logger.debug(f"Partition: {partition}")
            logger.debug(f"Linkage: {linkage}")
            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)
            logger.info(f"Clusters: {clusters}")

            for cluster in clusters.values():
                if len(cluster) > 1:
                    scores = confidences(cluster, condensed_distances, N)
                    logger.info(
                        f"Cluster ids and scores: {tuple(i_to_id[i] for i in cluster)}, {scores}"
                    )
                    ids = [i_to_id[i] for i in cluster]
                    if id_to_match is not None and id_to_match in ids:
                        yield tuple(ids), scores
                    elif id_to_match is None:
                        yield tuple(ids), scores
        else:
            (ids, score), = sub_graph
            if score > score_threshold and id_to_match is not None and id_to_match in ids:
                yield tuple(ids), (score,) * 2
            elif score > score_threshold and id_to_match is None:
                yield tuple(ids), (score,) * 2
import numpy
from numpy.random import rand

from main.util.common import dataPath

# assumed imports: fcluster/dendrogram come from the hcluster
# (scipy.cluster.hierarchy) API used throughout this file; show() is pylab's
from hcluster import fcluster, dendrogram
from pylab import show

# load distance matrix
# Z = linkage(distanceMatrix)
# numpy.save("dendrogram.npy", Z)
# dendrogram(Z)
# show()

Z = numpy.load(dataPath("dendrogram.npy"))
dendrogram(Z)
show()

clu = fcluster(Z, 2, depth=5000, criterion='distance')

cluInstances = {}
for i in clu:
    cluInstances[i] = cluInstances.get(i, 0) + 1

# numpy.save(dataPath("clusters.npy"), clu)
# clu = numpy.load(dataPath("clusters.npy"))

"""
hist1 = numpy.histogram(list(cluInstances.itervalues()))
print len(cluInstances.keys())
print hist1[0]
print map(lambda x: int(x), hist1[1])
"""
# (fragment: the enclosing row-parsing loop that builds water_list was
# truncated in the source; the orphaned int-cast line is kept as a comment)
#     row[i] = int(row[i])
for i in [6, 7, 8, 9, 10]:
    row[i] = float(row[i])
water_dict[(row[12], row[5])] = row
water_list.append(row)

X = [row[6:9] for row in water_list]
print(X)

Y = pdist(X, 'euclidean')
Z = linkage(Y, 'average')
print(Z)
dendrogram(Z)

fclust = fcluster(Z, 2, criterion='distance')

clust_dict = defaultdict(list)
for i, row in enumerate(water_list):
    # print(fclust[i], str(fclust[i]))
    clust_dict[str(fclust[i])].append(row)

# print(clust_dict)
for c in clust_dict:
    print('select water and ('
          + ' or '.join(['(~' + w[12] + ' and ' + str(w[5]) + ')'
                         for w in clust_dict[c]])
          + '); isosurface id "foo' + c + '" color lightblue center {selected}'
          ' SPHERE @{ [ {selected}.x.stddev, {selected}.y.stddev,'
          ' {selected}.z.stddev, 0.5 ].max *2 } translucent' + ';')

sys.exit(0)