def main():
    """Cluster the sequences of a FASTA file, one chunk at a time.

    Reads the command-line arguments, configures logging, then for every
    chunk of sequences hashes their character n-grams and fits affinity
    propagation on the hashed vectors, logging the number of distinct
    labels found for the chunk.

    Called without arguments: ``main()``.
    """
    args = parse_args()
    setup_logging(args.log, verbose=args.verbose)

    chunks = sequence_chunk_generator(
        args.fasta_file,
        chunk_size=args.chunk_size,
    )
    hasher = HashingVectorizer(
        analyzer='char',
        n_features=2 ** 18,
        ngram_range=(args.ngram_min, args.ngram_max),
    )
    # A single estimator instance is re-fitted for each chunk.
    estimator = AffinityPropagation()

    for chunk in chunks:
        logging.info('hashing chunk')
        sequences = [str(record.seq) for record in chunk]
        chunk_vector = hasher.transform(sequences)

        logging.info('clustering')
        estimator.fit(chunk_vector)
        distinct_labels = set(estimator.labels_)
        logging.info('got %s clusters' % len(distinct_labels))
def run_affinity_propagation(affinities, preference):
    """Return the number of clusters affinity propagation finds.

    ``affinities`` is handed to the estimator as a precomputed affinity
    matrix and ``preference`` is forwarded unchanged.
    """
    model = AffinityPropagation(affinity='precomputed', preference=preference)
    fitted = model.fit(affinities)
    # One exemplar index is stored per cluster, so the length is the count.
    return len(fitted.cluster_centers_indices_)
def test_affinity_propagation():
    """Check that the functional and estimator interfaces agree.

    Both must recover ``n_clusters`` clusters on the module-level ``X`` and
    produce the same labels, with and without copying the similarities.
    """
    # Similarities are the negative squared Euclidean distances.
    similarities = -euclidean_distances(X, squared=True)
    pref = np.median(similarities) * 10

    # Functional interface.
    exemplar_idx, labels = affinity_propagation(
        similarities, preference=pref, random_state=39
    )
    assert n_clusters == len(exemplar_idx)

    # Estimator fitted on the precomputed similarities.
    precomputed_model = AffinityPropagation(
        preference=pref, affinity="precomputed", random_state=28
    )
    labels_precomputed = precomputed_model.fit(similarities).labels_

    # Estimator fitted on the raw samples.
    raw_model = AffinityPropagation(preference=pref, verbose=True, random_state=37)
    labels = raw_model.fit(X).labels_
    assert_array_equal(labels, labels_precomputed)

    n_found = len(raw_model.cluster_centers_indices_)
    assert np.unique(labels).size == n_found
    assert n_clusters == n_found

    # Functional interface again, this time without copying the input.
    _, labels_no_copy = affinity_propagation(
        similarities, preference=pref, copy=False, random_state=74
    )
    assert_array_equal(labels, labels_no_copy)
def affinity_propagation(crime_rows, column_names):
    """Cluster crime rows by their first two fields with affinity propagation.

    Each row is split into its first two values (used as the clustering
    coordinates) and the remaining values (carried along as extra info).
    The estimator is created with its default hyper-parameters, fitted on
    a sample drawn by ``random_sampling`` (``num_samples=5000``) and then
    used to predict a label for every row.

    Returns whatever ``_format_clustering`` builds from the labels, the
    coordinates, the extra info and ``column_names``.
    """
    crime_xy = []
    crime_info = []
    for row in crime_rows:
        crime_xy.append(row[0:2])
        crime_info.append(row[2:])

    print("Running Affinity Propagation")
    # TODO: Parameterize this
    model = AffinityPropagation()
    # Fit on a sample only, then label the full data set.
    sample = random_sampling(crime_xy, num_samples=5000)
    model.fit(sample)
    labels = model.predict(crime_xy)

    print("formatting....")
    return _format_clustering(labels, crime_xy, crime_info, column_names)
def cluster(scope):
    """Cluster play-type data per team or per player and return it as JSON.

    Args:
        scope: ``'Team'`` to average the features per team, or ``'Player'``
            to cluster individual player rows.

    Returns:
        The result of ``clusters_to_json`` for the clustered frame.

    Raises:
        ValueError: if ``scope`` is neither ``'Team'`` nor ``'Player'``.
    """
    # Setup data
    df = pd.read_sql('playtype_data', db_engine)

    # Manipulate data into scope
    if scope == 'Team':
        # ``axis`` is passed by keyword: the positional form was removed
        # from ``DataFrame.drop`` in recent pandas releases.
        df = df.drop('Player', axis=1).groupby('Team', as_index=False).mean()
    elif scope == 'Player':
        df = df.drop('Team', axis=1)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('This is never supposed to happen')

    # Normalize the data: centre on the mean, scale by the value range.
    features = df[FEATURES]
    df[FEATURES] = (features - features.mean()) / (features.max() - features.min())

    # Run clustering
    clstr = AffinityPropagation()
    clstr.fit(df[FEATURES])

    # Clump results; ``DataFrame.sort`` no longer exists, use sort_values.
    df['cluster'] = clstr.labels_
    df = df.sort_values('cluster')

    # Convert results to JSON for frontend
    return clusters_to_json(df, scope)
def clusterAffinityPropagation(self):
    """Cluster both embeddings with affinity propagation.

    For ``self.emb1`` and ``self.emb2`` in turn, the same estimator is
    fitted on the embedding's ``m`` attribute. The results are stored on
    ``self`` as ``cluster1``/``cluster2`` (label -> list of entries taken
    from the embedding's ``rd``) and ``word2cluster1``/``word2cluster2``
    (entry -> label).

    :return: None
    """
    affin = AffinityPropagation()

    def _group(embedding):
        # Fit, then bucket the embedding's entries by their label.
        affin.fit(embedding.m)
        members_by_label = dict()
        for row, label in enumerate(affin.labels_):
            members_by_label.setdefault(label, list()).append(embedding.rd[row])
        # Invert the buckets in a second pass, in bucket order.
        label_by_word = dict()
        for label, members in members_by_label.items():
            for word in members:
                label_by_word[word] = label
        return members_by_label, label_by_word

    self.cluster1, self.word2cluster1 = _group(self.emb1)
    self.cluster2, self.word2cluster2 = _group(self.emb2)
def saxcluster(self, preference=None, lookup=True):
    """Cluster the SAX representations with affinity propagation.

    Args:
        preference: forwarded to ``AffinityPropagation``.
        lookup: when true the estimator treats its input as a precomputed
            affinity matrix (built by ``self.__saxDists()`` if not cached);
            otherwise it is fitted on the values of ``self.avdata``.

    Side effects:
        Caches the fitted input in ``self.dists`` and fills
        ``self.cluster_sax``, ``self.cluster_centers``, ``self.clusters``,
        ``self.asax_data``, ``self.ass`` and ``self.n_clusters``.
    """
    if lookup:
        cls = AffinityPropagation(preference=preference, affinity='precomputed')
    else:
        cls = AffinityPropagation(preference=preference)

    if self.dists is None:
        if lookup:
            self.dists = self.__saxDists()
        else:
            # Materialise the view: on Python 3 ``dict.values()`` is not a
            # sequence and cannot be handed to the estimator as data.
            self.dists = list(self.avdata.values())
    data = self.dists
    cls.fit(data)

    # ``dict.keys()`` / ``dict.values()`` are not indexable on Python 3,
    # so take list snapshots once instead of indexing the views.
    reps = list(self.indexes.keys())
    members = list(self.indexes.values())

    self.cluster_sax = [reps[i] for i in cls.cluster_centers_indices_]
    self.cluster_centers = [self.avdata[sax] for sax in self.cluster_sax]

    # Collect, per exemplar, the indices of every representation assigned
    # to it.
    self.clusters = collections.defaultdict(list)
    for ind, label in enumerate(cls.labels_):
        sax = self.cluster_sax[label]
        self.clusters[sax] += members[ind]

    # Mean of the data rows belonging to each cluster.
    self.asax_data = dict()
    for sax in self.clusters:
        self.asax_data[sax] = self.data[self.clusters[sax], :].mean(axis=0)

    # Per-sample assignment: position of the sample's exemplar in
    # ``cluster_sax`` (exemplars are distinct dict keys, so the position
    # from enumerate equals the original ``list.index`` lookup).
    self.ass = [0] * self.N
    for v, sax in enumerate(self.cluster_sax):
        for ind in self.clusters[sax]:
            self.ass[ind] = v

    self.n_clusters = len(self.clusters)
def affinity_descriptor(descriptor_list):
    """Return the affinity-propagation cluster centres of the descriptors.

    Progress messages are printed before and after the fit.
    """
    print("Affinity Propagation starting...")
    model = AffinityPropagation().fit(descriptor_list)
    centres = model.cluster_centers_
    print("Visual words are ready.")
    return centres
def clustering(self):
    """Cluster ``self.goal_list`` in two layers and return the clusters.

    The TF-IDF vectors are reduced with PCA, turned into a cosine
    similarity matrix and clustered with affinity propagation. Each
    resulting cluster is then split further by
    ``self.subcluster_by_editdistance``.

    Returns:
        defaultdict(list): sub-goal -> items of that sub-cluster.
    """
    # Calculate similarity matrix
    X = self.create_tfidf_vector().toarray()
    pca = PCA(n_components=300, copy=False)
    # With copy=False the data passed to fit() is overwritten, so
    # fit(X).transform(X) would transform already-modified data;
    # fit_transform is the form the scikit-learn docs prescribe.
    X = pca.fit_transform(X)
    S = cosine_similarity(X, X)

    # Run affinity propagation
    # NOTE(review): S is a similarity matrix but the estimator uses its
    # default affinity, so rows of S are treated as feature vectors.
    # Confirm whether affinity='precomputed' was intended.
    af = AffinityPropagation()
    af.fit(S)

    # Formulate result: group every goal under its exemplar's goal.
    tmp_clusters = defaultdict(list)
    goal_clusters = defaultdict(list)
    cluster_centers_indices = af.cluster_centers_indices_
    for position, label in enumerate(af.labels_):
        exemplar_goal = self.goal_list[cluster_centers_indices[label]]
        tmp_clusters[exemplar_goal].append(self.goal_list[position])

    # 2nd-layer clustering of each cluster
    for goal, item_list in tmp_clusters.items():
        subclusters = self.subcluster_by_editdistance(goal, item_list)
        for subgoal, items in subclusters.items():
            goal_clusters[subgoal] = items
    return goal_clusters
def make_cluster_map(damping=0.992):
    """Group the 121 fine classes into coarse clusters and pickle the maps.

    Loads ``(test_labels, prediction)`` from ``f_path_pred``, builds the
    per-class mean prediction matrix, converts it to a symmetric
    dissimilarity matrix and clusters its rows with affinity propagation.
    Writes ``coarse_to_fine.p`` and ``fine_to_coarse.p`` into ``curdir``.

    Args:
        damping: damping factor of the estimator; it determines the number
            of clusters found.
    """
    n_classes = 121

    # NOTE: unpickling runs arbitrary code; only load trusted files.
    with open(f_path_pred, 'rb') as pred_file:
        test_labels, prediction = pickle.load(pred_file)

    # Row l holds the mean predicted distribution of samples labelled l.
    prob_conf = np.zeros((n_classes, n_classes))
    for l in range(n_classes):
        # flatnonzero always yields a 1-D index array, so a class with a
        # single sample still selects a 2-D slice and averages to a row
        # (squeezing the index collapsed that case to a scalar mean).
        inds = np.flatnonzero(test_labels == l)
        prob_conf[l, :] = prediction[inds, :].mean(axis=0)

    D = 1 - prob_conf
    np.fill_diagonal(D, 0)
    D_p = 0.5 * (D + D.T)  # symmetrise

    clst = AP(damping=damping,  # damping determines # of clusters
              max_iter=500,
              convergence_iter=15,
              affinity='euclidean',
              verbose=False)
    clst.fit(D_p)
    # Single-argument print call: same output on Python 2 and Python 3.
    print('Number of cluster: %s' % len(clst.cluster_centers_))

    membership = np.c_[range(n_classes), clst.labels_]
    fine_to_coarse = dict(membership)
    coarse_to_fine = {l: [] for l in clst.labels_}
    for k, v in fine_to_coarse.items():
        coarse_to_fine[v].append(k)

    # Context managers make sure the pickles are flushed and closed.
    with open(os.path.join(curdir, 'coarse_to_fine.p'), 'wb') as out_file:
        pickle.dump(coarse_to_fine, out_file)
    with open(os.path.join(curdir, 'fine_to_coarse.p'), 'wb') as out_file:
        pickle.dump(fine_to_coarse, out_file)
def affinity():
    """Demonstrate affinity propagation on a synthetic 2-D data set.

    Generates 1000 samples, prints them, fits the model, predicts a
    cluster for every sample and shows a scatter plot with one series per
    cluster.
    """
    from numpy import unique
    from sklearn.datasets import make_classification
    from sklearn.cluster import AffinityPropagation
    from matplotlib import pyplot

    # Synthetic data set; the class labels are not needed.
    X, _ = make_classification(
        n_samples=1000,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=4,
    )
    print(X)

    # Fit, then assign every sample to a cluster.
    model = AffinityPropagation(damping=0.9)
    model.fit(X)
    yhat = model.predict(X)

    # One scatter series per distinct cluster label.
    for label in unique(yhat):
        in_cluster = yhat == label
        pyplot.scatter(X[in_cluster, 0], X[in_cluster, 1])
    pyplot.show()
def doAffinity(X):
    """Fit affinity propagation on ``X`` and return the predicted labels."""
    estimator = AffinityPropagation(
        damping=0.5,
        max_iter=250,
        affinity='euclidean',
    )
    # fit() returns the estimator itself, so the calls can be chained.
    return estimator.fit(X).predict(X)
def optimize_recommend(self, param_set, max_recommend=3, gamma=1.0, delta=1.0, gpr=None, Xd=None, return_data=False):
    """Optimize the GPR model and recommend one parameter set per cluster.

    ``self.optimize`` is run using each data point as initial value. The
    optima are clustered with affinity propagation (the number of clusters
    is chosen automatically) and the row with the lowest ``gpr_optimum``
    of every cluster is kept as its representative. The representatives
    are decoded into parameter sets.

    Args:
        param_set: forwarded to ``self.decode_dummies``.
        max_recommend: maximum number of representatives to return; a
            value below 1 means "no limit".
        gamma, delta, gpr, Xd: forwarded to ``self.optimize``.
        return_data: when true, also return the representative rows.

    Returns:
        The decoded parameter dictionaries, or a tuple of them and the
        representative data frame when ``return_data`` is true.
    """
    x = self.optimize(gamma=gamma, delta=delta, gpr=gpr, Xd=Xd)
    aff = AffinityPropagation()
    aff.fit(x)

    # Attach the labels positionally; concatenating a freshly indexed
    # frame would misalign rows whenever ``x`` has a non-default index.
    x_rec = x.assign(cluster_id=aff.labels_)

    # Select the lowest validation loss from each cluster.
    x_rec = x_rec.sort_values(by=['cluster_id', 'gpr_optimum'])
    x_rec = x_rec.groupby('cluster_id').first()
    x_rec = x_rec.sort_values(by=['gpr_optimum'])

    if max_recommend < 1:
        max_recommend = x.shape[0]
    # Slice the cluster representatives. The previous code sliced ``x``
    # here, which threw the clustering away and returned raw rows.
    x_rec = x_rec.iloc[:max_recommend]

    paramdictlist = self.decode_dummies(x_rec, param_set)
    if return_data:
        return paramdictlist, x_rec
    else:
        return paramdictlist
def get_region2label_table(X, clutter, damping, metric='cosine'):
    '''
    Cluster regions with affinity propagation and return the label table.

    X: region features handed to ``cosine_similarity``.
    clutter: percentile of the affinity values used as the preference.
    damping: damping factor of the estimator.
    metric: cosine | iou (``iou`` is not implemented)

    Returns p(l|r) as a matrix of N_label x N_region: each region's
    affinities to the exemplars, normalised to sum to one.

    Raises RuntimeError for ``metric='iou'`` and ValueError for any other
    unsupported metric.
    '''
    # compute affinity
    if metric == 'cosine':
        # Map the cosine similarity from [-1, 1] to [0, 1].
        A = cosine_similarity(X) / 2. + .5
    elif metric == 'iou':
        raise RuntimeError("metric 'iou' is not implemented")
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('unsupported metric: %r' % (metric,))

    pref = np.percentile(A, clutter)

    # bbox clustering
    af = AffinityPropagation(preference=pref, affinity='precomputed', damping=damping)
    af.fit(A)

    # p(l|r): fancy indexing copies, so the in-place division below does
    # not modify A.
    Tcr = A[:, af.cluster_centers_indices_]
    Tcr /= Tcr.sum(axis=1, keepdims=True)
    return Tcr.T
def affinity_propagation(words, algo="word2vec", use_model=False):
    """
    Cluster the words of a sentence by pairwise semantic similarity.

    :param words: input sentence
    :param algo: ``"word2vec"`` or ``"wordnet"``
    :param use_model: forwarded to the word2vec similarity
    :return: ``(clusters, centroids)``: the word lists per cluster and the
        exemplar word of each cluster; ``[[], []]`` when there are fewer
        than two words or no labels were produced
    :raises ValueError: if ``algo`` is not supported
    """
    words = semantic_similarity.pos_filter(words, False, strict=False)
    words = np.asarray(words)  # So that indexing with a list will work

    if algo == "word2vec":
        lev_similarity = np.array(
            [[semantic_similarity.word2vec_distance(w1, w2, use_model=use_model)
              for w1 in words] for w2 in words])
    elif algo == "wordnet":
        # NOTE(review): this branch also calls word2vec_distance; confirm
        # whether a wordnet-based similarity was intended here.
        lev_similarity = np.array(
            [[semantic_similarity.word2vec_distance(w1, w2)
              for w1 in words] for w2 in words])
    else:
        # Previously an unknown algo fell through to an unbound variable.
        raise ValueError("unsupported algo: %r" % (algo,))

    if len(lev_similarity) < 2:
        return [[], []]

    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)
    labels = affprop.labels_
    exemplar_indices = affprop.cluster_centers_indices_

    # Without exemplars (labels NaN, or no exemplars at all) there is
    # nothing to index below.
    if np.isnan(np.sum(labels)) or len(exemplar_indices) == 0:
        print("No labels")
        return [[], []]

    clusters = []
    centroids = []
    for cluster_id in np.unique(labels):
        centroids.append(words[exemplar_indices[cluster_id]])
        members = np.unique(words[np.nonzero(labels == cluster_id)])
        clusters.append(list(members))
    return clusters, centroids
def get_labels(data_as_list, algorithm='meanshift'):
    """Cluster the data with the chosen algorithm and return the labels.

    Args:
        data_as_list: samples, converted with ``np.array``.
        algorithm: one of ``'dbscan'``, ``'kmeans'``, ``'meanshift'`` or
            ``'affinitypropagation'``.

    Returns:
        The fitted estimator's ``labels_``, or an empty list when
        ``algorithm`` is not one of the names above.
    """
    dt = np.array(data_as_list)
    labels = []
    print(' Algorithm =', algorithm)

    if algorithm == 'dbscan':
        dbs = DBSCAN(eps=0.1)
        dbs.fit(dt)
        labels = dbs.labels_
    elif algorithm == 'kmeans':
        kmeans = KMeans(n_clusters=10)
        kmeans.fit(dt)
        labels = kmeans.labels_
    elif algorithm == 'meanshift':
        # The bandwidth is estimated from the data when possible.
        try:
            bandwidth = estimate_bandwidth(dt, quantile=0.2, n_samples=len(dt))
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this no longer
            # swallows KeyboardInterrupt / SystemExit.
            bandwidth = 0.5
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(dt)
        labels = ms.labels_
    elif algorithm == 'affinitypropagation':
        af = AffinityPropagation()
        af.fit(dt)
        labels = af.labels_

    return labels
def APWithSimilaryMatrix(similaryMatrix):
    """Run affinity propagation on a precomputed similarity matrix.

    The preference is twice the mean of the matrix. Returns the tuple
    ``(cluster_centers_indices_, labels_)`` of the fitted estimator.
    """
    preference = np.mean(similaryMatrix) * 2
    model = AffinityPropagation(
        max_iter=2000,
        preference=preference,
        affinity='precomputed',
    )
    model.fit(similaryMatrix)
    exemplar_indices = model.cluster_centers_indices_
    labels = model.labels_
    return (exemplar_indices, labels)
def clusterSimilarityWithSklearnAPC(data_file,damping=0.9,max_iter=200,convergence_iter=15,preference='min'):
    """
    Compare Sparse Affinity Propagation (SAP) result with SKlearn Affinity Propagation (AP) Clustering result.

    Please note that the convergence condition for Sklearn AP is "no change in the number of
    estimated clusters", while for SAP it is "no change in the cluster assignment". So SAP may
    take more iterations and there may be a slight difference in the final cluster assignment
    (exemplars for each sample).

    ``preference`` may be ``'min'`` or ``'median'`` (computed from the similarity matrix);
    any other value is passed to both algorithms unchanged.

    Returns the similarity between the two exemplar assignments as computed by
    ``sparseAP_cy.arrSamePercent``.
    """
    # loading data
    simi_mat=loadMatrix(data_file)
    simi_mat_dense=simi_mat.todense()

    # get preference
    if preference=='min':
        preference=np.min(simi_mat_dense)
    elif preference=='median':
        preference=np.median(simi_mat_dense)

    print('{0}, start SKlearn Affinity Propagation'.format(datetime.now()))
    # Forward max_iter/convergence_iter so both algorithms run under the
    # same settings (the defaults equal scikit-learn's own defaults).
    af=AffinityPropagation(damping=damping, max_iter=max_iter, convergence_iter=convergence_iter,
                           preference=preference, affinity='precomputed',verbose=True)
    # Recent scikit-learn releases reject np.matrix input, so hand the
    # estimator a plain ndarray view of the dense matrix.
    af.fit(np.asarray(simi_mat_dense))
    cluster_centers_indices,labels = af.cluster_centers_indices_,af.labels_
    # Exemplar (sample index) chosen for every sample.
    sk_exemplars=np.asarray([cluster_centers_indices[i] for i in labels])

    print('{0}, start Fast Sparse Affinity Propagation Cluster'.format(datetime.now()))
    sap=SAP(preference=preference,convergence_iter=convergence_iter,max_iter=max_iter,damping=damping,verboseIter=100)
    sap_exemplars=sap.fit_predict(simi_mat_dense)

    # Calculate similarity between sk_exemplars and sap_exemplars
    exemplars_similarity=sparseAP_cy.arrSamePercent(np.array(sk_exemplars), np.array(sap_exemplars))
    return exemplars_similarity
def Affinity_Propagation(data, SBS, C, EP, CP, selected_products):
    """Split the dominant cluster of the selected products into EP and CP.

    ``data`` is clustered with affinity propagation (preference -200).
    The cluster holding most of ``selected_products`` is chosen and its
    sample indices are split: indices below ``len(EP)`` go to the first
    list, the rest to the second.

    ``SBS``, ``C`` and ``CP`` are accepted but not used.

    Returns ``(EP_New, CP_New, n_clusters)``.
    """
    model = AffinityPropagation(preference=-200)
    model.fit(data)
    n_clusters = len(model.cluster_centers_)
    n_existing = len(EP)

    # Sample indices grouped by cluster label (a list of lists).
    members = [[] for _ in range(n_clusters)]
    for sample_idx, label in enumerate(model.labels_):
        members[label].append(sample_idx)

    # Work on the cluster from which most products were selected so far.
    selected_labels = [model.labels_[i] for i in selected_products]
    cluster = max(set(selected_labels), key=selected_labels.count)

    chosen = members[cluster]
    EP_New = [i for i in chosen if i < n_existing]
    CP_New = [i for i in chosen if not i < n_existing]
    return EP_New, CP_New, n_clusters
def make_cluster_map(damping=0.992):
    """Group the 121 fine classes into coarse clusters and pickle the maps.

    Loads ``(test_labels, prediction)`` from ``f_path_pred``, builds the
    per-class mean prediction matrix, converts it to a symmetric
    dissimilarity matrix and clusters its rows with affinity propagation.
    Writes ``coarse_to_fine.p`` and ``fine_to_coarse.p`` into ``curdir``.

    Args:
        damping: damping factor of the estimator; it determines the number
            of clusters found.
    """
    n_classes = 121

    # NOTE: unpickling runs arbitrary code; only load trusted files.
    with open(f_path_pred, 'rb') as pred_file:
        test_labels, prediction = pickle.load(pred_file)

    # Row l holds the mean predicted distribution of samples labelled l.
    prob_conf = np.zeros((n_classes, n_classes))
    for l in range(n_classes):
        # flatnonzero always yields a 1-D index array, so a class with a
        # single sample still selects a 2-D slice and averages to a row
        # (squeezing the index collapsed that case to a scalar mean).
        inds = np.flatnonzero(test_labels == l)
        prob_conf[l, :] = prediction[inds, :].mean(axis=0)

    D = 1 - prob_conf
    np.fill_diagonal(D, 0)
    D_p = 0.5 * (D + D.T)  # symmetrise

    clst = AP(
        damping=damping,  # damping determines # of clusters
        max_iter=500,
        convergence_iter=15,
        affinity='euclidean',
        verbose=False)
    clst.fit(D_p)
    # Single-argument print call: same output on Python 2 and Python 3.
    print('Number of cluster: %s' % len(clst.cluster_centers_))

    membership = np.c_[range(n_classes), clst.labels_]
    fine_to_coarse = dict(membership)
    coarse_to_fine = {l: [] for l in clst.labels_}
    for k, v in fine_to_coarse.items():
        coarse_to_fine[v].append(k)

    # Context managers make sure the pickles are flushed and closed.
    with open(os.path.join(curdir, 'coarse_to_fine.p'), 'wb') as out_file:
        pickle.dump(coarse_to_fine, out_file)
    with open(os.path.join(curdir, 'fine_to_coarse.p'), 'wb') as out_file:
        pickle.dump(fine_to_coarse, out_file)
def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):
    """Cluster the x-velocity traces of a time window.

    Args:
        N_CLUSTERS: requested number of clusters (used by ``'kmeans'`` and
            ``'AgglomerativeClustering'``; recomputed from the labels for
            ``'affinity_propagation'`` and ``'DBSCAN'``).
        clusterType: name of the clustering algorithm.
        data_for_trial_type: array indexed as
            ``[:, frame_range, self.griddy.VEL_X]``.
        begin_time, end_time: window bounds, scaled by
            ``self.griddy.TIME_GRID_SPACING`` to get frame indices.

    Returns:
        ``(labels, N_CLUSTERS)``; ``labels`` is ``None`` when
        ``clusterType`` is not recognized.
    """
    BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
    END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING
    data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

    labels = None
    if clusterType == 'kmeans':
        kmeans = KMeans(n_clusters=N_CLUSTERS)
        kmeans.fit(data)
        labels = kmeans.labels_
    elif clusterType == 'affinity_propagation':
        ap = AffinityPropagation(damping=0.75)
        ap.fit(data)
        labels = ap.labels_
        # Count the clusters of THIS fit; the previous code read
        # ``self.labels``, which is unrelated to the labels just computed.
        N_CLUSTERS = np.max(labels)+1
    elif clusterType == 'DBSCAN':
        dbscan = DBSCAN()
        dbscan.fit(data)
        labels = dbscan.labels_
        N_CLUSTERS = np.max(labels)+1
        # Single-argument print calls behave the same on Python 2 and 3.
        print('N_CLUSTERS=' + str(N_CLUSTERS))
    elif clusterType == 'AgglomerativeClustering':
        ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
        ac.fit(data)
        labels = ac.labels_
    else:
        print('ERROR: clusterType: ' + clusterType + ' is not recognized')
    return (labels, N_CLUSTERS)
def test_sparse_input_for_predict():
    """Sparse input must be accepted by ``predict``.

    Non-regression test for issue #20049.
    """
    model = AffinityPropagation(affinity="euclidean", random_state=42)
    model.fit(X)
    # An all-zero 2x2 sparse matrix: both rows get the same label.
    sparse_samples = csr_matrix((2, 2))
    predicted = model.predict(sparse_samples)
    assert_array_equal(predicted, (2, 2))
def affinity_propagation(feature_matrix):
    """Cluster documents from the product of the feature matrix with itself.

    Args:
        feature_matrix: sparse document-feature matrix.

    Returns:
        ``(ap, clusters)``: the fitted estimator and its ``labels_``.
    """
    # Pairwise products of the document rows.
    sim = feature_matrix * feature_matrix.T
    # toarray() gives a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    sim = sim.toarray()
    # NOTE(review): sim is fitted with the default affinity, i.e. its rows
    # are treated as feature vectors; confirm that is intended.
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters
def get_clustered_data(data_matrix, clustering_algorithm=model_constants.KMEANS, distance_metric='euclidean', num_clusters=3):
    """Fit the requested clustering model and return its labels and itself.

    The algorithm name is compared case-insensitively against the
    ``model_constants`` names; any unrecognised name falls back to
    K-Means. ``distance_metric`` and ``num_clusters`` are only passed to
    the estimators that accept them.

    Returns ``(labels_, fitted_estimator)``.
    """
    requested = clustering_algorithm.lower()

    # Pick the estimator first; fitting is identical for all of them.
    if requested == model_constants.AFFINITY_PROP:
        model = AffinityPropagation(affinity=distance_metric)
    elif requested == model_constants.DBSCAN:
        model = DBSCAN(metric=distance_metric)
    elif requested == model_constants.OPTICS:
        model = OPTICS(metric=distance_metric)
    elif requested == model_constants.MEANSHIFT:
        model = MeanShift()
    elif requested == model_constants.BIRCH:
        model = Birch(n_clusters=num_clusters)
    elif requested == model_constants.AGGLOMERATIVE:
        model = AgglomerativeClustering(n_clusters=num_clusters, affinity=distance_metric)
    else:
        model = KMeans(n_clusters=num_clusters, random_state=42)

    model.fit(data_matrix)
    return model.labels_, model
def cluster(mat, doc_indices):
    """Pair each selected document index with its cluster label.

    The columns ``doc_indices`` of ``mat`` are transposed into rows and
    clustered with affinity propagation. Other clustering algorithms can
    easily be swapped in:
    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
    """
    samples = mat[:, doc_indices].T
    model = AffinityPropagation()
    model.fit(samples)
    return zip(doc_indices, model.labels_)
def ward_method_clustering(nodes):
    """
    Cluster user or transaction addresses with affinity propagation on
    their negated pairwise Levenshtein distances.

    Despite the function's name, the code below uses affinity propagation,
    not Ward/agglomerative clustering.

    :param nodes: The nodes of the network graph
    :return: dict: keys are the exemplar node of each cluster, values are
        arrays with the unique members of that cluster
    """
    # A list snapshot allows positional access for any iterable of nodes.
    node_list = list(nodes)

    # Similarity = negative edit distance (larger means more similar).
    levenshtein_distances = -1 * np.array(
        [[levenshtein_distance(w1, w2) for w1 in node_list] for w2 in node_list])

    model = AffinityPropagation(affinity="precomputed", damping=0.5)
    model.fit(levenshtein_distances)
    cluster_center_indices = model.cluster_centers_indices_
    labels = model.labels_

    # A dict, as documented: the previous list raised TypeError as soon as
    # it was indexed by an exemplar.
    result = {}
    for cluster_id in np.unique(labels):
        exemplar = node_list[cluster_center_indices[cluster_id]]
        members = [node for node, label in zip(node_list, labels)
                   if label == cluster_id]
        result[exemplar] = np.unique(members)
    return result
def affinity_propagation(principal_components, principal_df):
    """Cluster the principal components, plot the clusters and label the frame.

    A copy of ``principal_df`` gets a ``Segment`` column with the cluster
    labels and its columns renamed (0, 1, 2 -> PC1, PC2, PC3 and
    'y' -> 'Race'). One scatter series per cluster is drawn, the race
    labels are added and the silhouette is calculated.

    Returns the labelled data frame.
    """
    labelled_df = pd.concat([principal_df], axis=1)

    # Fit, then assign a cluster to each example.
    model = AffinityPropagation(damping=0.9, random_state=0)
    model.fit(principal_components)
    y_hat = model.predict(principal_components)
    cluster_ids = unique(y_hat)

    labelled_df['Segment'] = model.labels_

    # One scatter series per cluster, over the first two components.
    for cluster_id in cluster_ids:
        rows = where(y_hat == cluster_id)
        plt.scatter(
            principal_components[rows, 0],
            principal_components[rows, 1],
            s=75,
        )

    column_names = {0: 'PC1', 1: 'PC2', 2: 'PC3', 'y': 'Race'}
    labelled_df.rename(column_names, axis=1, inplace=True)

    plt.title("Affinity Propagation")
    add_race_labels(labelled_df)
    calc_silhouette(
        data=principal_components,
        prediction=y_hat,
        n_clusters=len(cluster_ids),
    )
    return labelled_df
def __dtw_clustering(self, seq_f):
    """Cluster sequences with affinity propagation on pattern distances.

    Args:
        seq_f: either a list of ``(sequence, frequency)`` tuples or a
            plain list of sequences. Plain sequences are given a
            frequency of one each.

    Returns:
        dict: cluster label -> list of the ``seq_f`` entries in it.
    """
    n_seq = len(seq_f)
    if isinstance(seq_f[0], tuple):
        seq = [item[0] for item in seq_f]
        freq = np.array([item[1] for item in seq_f])
    else:
        seq = seq_f
        # No frequencies supplied: weight every sequence equally. Without
        # this, ``freq`` was unbound and this branch always crashed.
        # NOTE(review): confirm uniform weighting is the intended default.
        freq = np.ones(n_seq)

    ### Computing the symmetric pairwise distance matrix
    p_dist = np.zeros((n_seq, n_seq))
    for i in range(n_seq):
        for j in range(i, n_seq):
            p_dist[i][j] = self.__pattern_distance(seq[i], seq[j])
            if i != j:
                p_dist[j][i] = p_dist[i][j]

    # Turn distances into similarities: the largest distance maps to 0.
    p_dist_max = np.max(p_dist)
    if p_dist_max == 0:
        p_dist_max = 2
    p_dist = p_dist_max - p_dist

    ### Affinity Propagation
    # Preferences scale with the relative frequency of each sequence.
    freq = 2 * p_dist_max * freq / max(freq)
    ap = AffinityPropagation(affinity='precomputed', preference=freq)
    ap.fit(p_dist)

    ### Arranging sequences by cluster label
    cluster_subseqs = dict()
    for entry, label in zip(seq_f, ap.labels_):
        cluster_subseqs.setdefault(label, []).append(entry)
    return cluster_subseqs
def affinitypropagation(params):
    """Cluster a distance matrix loaded from ``params["distance_path"]``.

    The distances are converted to affinities with a Gaussian kernel
    (delta = 2) and clustered with a precomputed-affinity estimator using
    default values otherwise. The path, matrix shape, estimator, labels
    and cluster count are printed along the way.

    Returns the labels as a tuple.
    """
    distance_path = '' + params["distance_path"]
    print(distance_path)

    distance = np.loadtxt(distance_path, dtype=np.float32)
    print(distance.shape)

    delta = 2
    affinity = np.exp(-distance ** 2 / (2. * delta ** 2))

    # Default values, with the metric set to 'precomputed'.
    aff = AffinityPropagation(affinity='precomputed')
    print(aff)
    aff.fit(affinity)

    labels = aff.labels_
    print(labels, labels.shape)

    # Number of clusters; a -1 label is not counted as a cluster.
    noise_labels = 1 if -1 in labels else 0
    no_clusters = len(set(labels)) - noise_labels
    print(no_clusters, "no_clusters")

    return tuple(labels.tolist())
def affinity_propagation(dataset, axis, preference, affinity, damping=0.5, max_iter=200, convergence_iter=15, copy=True, verbose=False):
    """
    Helper around the sk-learn AffinityPropagation estimator.

    With ``axis == 0`` the estimator is fitted on ``dataset.T``; with
    ``axis == 1`` on ``dataset`` itself. For any other ``axis`` the
    estimator is returned without being fitted.
    """
    estimator = AffinityPropagation(
        damping=damping,
        max_iter=max_iter,
        convergence_iter=convergence_iter,
        copy=copy,
        preference=preference,
        affinity=affinity,
        verbose=verbose,
    )
    if axis == 0:
        estimator.fit(dataset.T)
    elif axis == 1:
        estimator.fit(dataset)
    return estimator
def cluster(self, feat_mtx, df_lm_allusers):
    """Cluster artists and return the least-misery frame sorted by cluster.

    Args:
        feat_mtx: feature matrix the estimator is fitted on.
        df_lm_allusers: least-misery data frame; a copy of it receives the
            ``cluster`` column.

    Side effects:
        Stores the labels, the fitted estimator and the sorted frame on
        ``self`` (``labels``, ``af``, ``df_least_misery_clustered``) and
        prints progress information.

    Returns:
        The copy, sorted by cluster and score in descending order.
    """
    # clustering artists based on AffinityPropagation
    start = time.time()
    af = AffinityPropagation()
    af.fit(feat_mtx)
    self.labels = af.labels_
    self.af = af

    # Single-argument print calls give identical output on Python 2 and 3.
    print('number of labels:  %s' % len(self.labels))
    print('labels %s' % (self.labels,))

    # adding cluster labels to least misery dataframe and sorting by rank and cluster
    df_least_misery_clustered = df_lm_allusers.copy()
    print('len df least misery:  %s' % len(df_least_misery_clustered))
    df_least_misery_clustered['cluster'] = self.labels
    sort_columns = ['cluster', self.score_col]
    df_least_misery_clustered[sort_columns] = df_least_misery_clustered[sort_columns].astype(float)

    # Will do different sorting if not using rank.
    # Descending order, as the highest score is wanted. DataFrame.sort no
    # longer exists in pandas; sort_values is its replacement.
    df_least_misery_clustered = df_least_misery_clustered.sort_values(
        by=sort_columns, ascending=False)
    self.df_least_misery_clustered = df_least_misery_clustered

    end = time.time()
    print('clustering completed in:  %s' % (end - start))
    return df_least_misery_clustered
def reduce(lines):
    """Merge similar lines into their affinity-propagation cluster centres.

    The first entry along axis 1 of ``lines`` is divided column-wise by
    (300, 1) before clustering and the centres are multiplied back
    afterwards. Returns the centres with an extra axis inserted at
    position 1, or ``None`` when ``lines`` is ``None``.
    """
    if lines is None:
        return None

    scale = np.array([[300, 1]])
    model = AffinityPropagation(preference=-.01)
    model.fit(lines[:, 0] / scale)
    merged = model.cluster_centers_ * scale
    return np.expand_dims(merged, 1)
def cluster_analyze(dataframe, cluster_type='KMeans', n_clusters=None):
    """Cluster a data frame, store the labels in it and plot the result.

    Args:
        dataframe: numeric data frame; it is modified in place by adding a
            ``y_pred`` column with the cluster labels.
        cluster_type: ``'KMeans'``, ``'dbscan'``, ``'affinity_prob'``,
            ``'spectral'`` or ``'birch'``.
        n_clusters: mandatory for ``'KMeans'`` and ``'spectral'``; must be
            left unset for the other types.

    Raises:
        AssertionError: if ``n_clusters`` does not fit ``cluster_type``.
        ValueError: if ``cluster_type`` is unknown.
    """
    from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, SpectralClustering, Birch
    import matplotlib.pyplot as plt
    import numpy as np

    # ``.values`` replaces DataFrame.as_matrix(), removed in pandas 1.0.
    df_mat = dataframe.values

    if cluster_type == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    elif cluster_type == 'dbscan':
        assert not n_clusters, "Number of clusters irrelevant for cluster type : %s" % (
            cluster_type)
        clusterer = DBSCAN(eps=0.5)
    elif cluster_type == 'affinity_prob':
        assert not n_clusters, "Number of clusters irrelevant for cluster type : %s" % (
            cluster_type)
        clusterer = AffinityPropagation(damping=.9, preference=-200)
    elif cluster_type == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
    elif cluster_type == 'birch':
        assert not n_clusters, "Number of clusters irrelevant for cluster type : %s" % (
            cluster_type)
        clusterer = Birch(n_clusters=2)
    else:
        # Raising a plain string is itself a TypeError on Python 3.
        raise ValueError("Unknown clustering algorithm type")

    plt.figure(figsize=(2 + 3, 9.5))
    # Repeating palette of single-letter matplotlib colour codes.
    colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
    colors = np.hstack([colors] * 20)

    clusterer.fit(df_mat)

    if hasattr(clusterer, 'labels_'):
        # The ``np.int`` alias was removed from NumPy; builtin int is the
        # same dtype.
        y_pred = clusterer.labels_.astype(int)
    else:
        y_pred = clusterer.predict(df_mat)
    dataframe['y_pred'] = y_pred

    # plot
    plt.title(cluster_type, size=18)
    plt.scatter(df_mat[:, 0], df_mat[:, 1])
    if hasattr(clusterer, 'cluster_centers_'):
        centers = clusterer.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.show()
def affinity_propagation(feature_matrix):
    """Cluster documents from the product of the feature matrix with itself.

    Args:
        feature_matrix: sparse document-feature matrix.

    Returns:
        ``(ap, clusters)``: the fitted estimator and its ``labels_``.
    """
    # Pairwise products of the document rows.
    sim = feature_matrix * feature_matrix.T
    # toarray() gives a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    sim = sim.toarray()
    # NOTE(review): sim is fitted with the default affinity, i.e. its rows
    # are treated as feature vectors; confirm that is intended.
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters
def affinity_propagation(X, args=None):
    """
    AffinityPropagation clustering: a kind of graph clustering.

    Args:
        X: data to fit.
        args: optional mapping of keyword arguments for the estimator;
            ``None`` (the default) means no extra arguments.

    Returns:
        The fitted ``AffinityPropagation`` estimator.
    """
    from sklearn.cluster import AffinityPropagation

    # ``None`` instead of a shared mutable ``{}`` default.
    model = AffinityPropagation(**(args or {}))
    model.fit(X)
    return model
def ap(X):
    """Cluster one ``(matrix, estimator_params)`` pair and return the labels.

    NaN and infinite entries of the matrix are overwritten with 0.0 in
    place (the caller's array is modified). The estimator is then fitted
    on the negation of ``matrix - 1``.
    """
    matrix, params = X
    # Two in-place clean-up passes: NaNs first, then infinities.
    matrix[np.isnan(matrix)] = 0.0
    matrix[np.isinf(matrix)] = 0.0

    shifted = matrix - 1
    model = AffinityPropagation(**params)
    model.fit(-shifted)
    return model.labels_
def get_partition(matrix, preference, damping=0.75):
    """Return the affinity-propagation labels for a precomputed matrix."""
    model = AffinityPropagation(
        damping=damping,
        affinity='precomputed',
        preference=preference,
    )
    return model.fit(matrix).labels_
def getNumClusters(doc_vectors):
    '''
    Approximate the number of clusters the documents fall into.

    Given a list of document vectors as returned by makeDocumentVectors,
    runs affinity propagation on them and returns the number of exemplars
    it found.
    '''
    model = AffinityPropagation().fit(doc_vectors)
    exemplar_indices = model.cluster_centers_indices_
    return len(exemplar_indices)
def affinityClustering(series):
    """Cluster the vectors held in ``series`` and return predicted labels.

    ``series.tolist()`` provides the samples; the estimator is fitted on
    them and then asked to predict a label for each.
    """
    samples = series.tolist()
    model = AffinityPropagation()
    model.fit(samples)
    return model.predict(samples)
def affinity_propagation(feature_matrix):
    '''
    Affinity propagation clustering

    Fits the estimator on the dense form of the sparse ``feature_matrix``
    and returns ``(ap, clusters)``: the fitted estimator and its labels.
    '''
    ap = AffinityPropagation()
    # toarray() yields a plain ndarray; the np.matrix returned by
    # todense() is rejected by recent scikit-learn releases.
    ap.fit(feature_matrix.toarray())
    clusters = ap.labels_
    return ap, clusters
def cluster_prop(self, filtered_data):
    """Cluster the property words of the reviews by their word vectors.

    Args:
        filtered_data: iterable of reviews; each review provides
            ``review['index']`` and ``review['line']``, a list of dicts
            whose ``"prop"`` entry starts with the property word.

    Returns:
        ``(cluster_dict, prop_dict, Aprop)``:
        ``cluster_dict`` maps a cluster label to its words, their total
        frequency and the most frequent word (``"seed"``);
        ``prop_dict`` maps a property word to its frequency, its data
        entries and the indices of the reviews it occurs in;
        ``Aprop`` is the fitted estimator.
    """
    # Collect frequency, entries and review indices per property word.
    prop_dict = {}
    for review in filtered_data:
        for dicti in review['line']:
            prop = dicti["prop"][0]
            # ``in`` replaces dict.has_key(), which Python 3 removed.
            if prop not in prop_dict:
                prop_dict[prop] = {"freq": 0, "data": [], "idx": []}
            entry = prop_dict[prop]
            entry['idx'].append(review['index'])
            entry["freq"] += 1
            entry["data"].append(dicti)

    # Keep only the words the word model knows about.
    d_list = []
    word_list = []
    for word in prop_dict:
        try:
            vector = self.wmodel[word]
        except KeyError:
            # Word without a vector: leave it out of the clustering.
            continue
        d_list.append(vector)
        word_list.append(word)

    Aprop = AffinityPropagation(damping=0.6, convergence_iter=100, max_iter=10000)
    Aprop.fit(d_list)

    # Group the words by cluster label.
    cluster_dict = {}
    for idx, label in enumerate(Aprop.labels_):
        if label not in cluster_dict:
            cluster_dict[label] = {"word": [], "freq": 0, "seed": "", "sim": 0.0}
        cluster_dict[label]["word"].append(word_list[idx])

    # Per cluster: total frequency and the most frequent word as seed
    # (the first one wins on ties).
    for info in cluster_dict.values():
        words = info["word"]
        info["freq"] = sum(prop_dict[word]["freq"] for word in words)
        info["seed"] = max(words, key=lambda word: prop_dict[word]["freq"])

    return (cluster_dict, prop_dict, Aprop)
def clustering_affinity_propagation(data_res):
    """
    Run sklearn's affinity propagation on the given data frame.

    Returns ``(predictions, cluster_centers, estimator)``.
    """
    model = AffinityPropagation()
    model.fit(data_res)
    labels = model.predict(data_res)
    centres = model.cluster_centers_
    return labels, centres, model
def affinityprop(lngs, lats, city, cluster_diameter):
    """Cluster (longitude, latitude) points with affinity propagation.

    ``city`` must provide the keys ``"area"``, ``"lng"`` and ``"lat"``;
    their values and ``cluster_diameter`` are not used further. Returns
    ``labels_to_index`` applied to the cluster labels.
    """
    # Read the expected keys, in the same order as before, so a missing
    # key still raises KeyError.
    _unused_city_fields = (city["area"], city["lng"], city["lat"])

    lngs = np.array(lngs)
    points = np.array([lngs, lats]).transpose()

    model = AffinityPropagation(
        damping=0.75,
        max_iter=200,
        convergence_iter=15,
        copy=True,
        preference=None,
        affinity='euclidean',
        verbose=False,
    )
    model.fit(points)
    return labels_to_index(np.array(model.labels_))
def cluster_concepts(context="location"):
    """
    Cluster related concepts of a specific type to different categories

    For every row of the ``context`` table the related concepts are
    fetched, clustered with affinity propagation on their cosine
    similarities and the resulting category number is written back to the
    ``<context>_concept`` table. The concept names of each category are
    printed on one line.

    :param context: ``"location"`` or ``"action"``
    :raises ValueError: if ``context`` is anything else and the table has rows
    """
    import sys

    def _printable(text):
        # Python 2 needs explicit UTF-8 bytes for non-ASCII output;
        # on Python 3 the text is printed as it is.
        return text.encode("utf-8") if str is bytes else text

    db = Database()
    concept_category = ConceptCategory()
    # NOTE(review): the statements below are assembled by string
    # interpolation; make sure ``context`` never comes from untrusted input.
    cmd = "SELECT * FROM %s" % (context)
    context_res = db.query_db(cmd)

    for item in context_res:
        concept_list = []
        concept_matrix = []
        if context == "action":
            context_id, context_chinese, context_name = item[:3]
        elif context == "location":
            context_id, context_name, context_chinese = item
        else:
            # Previously this fell through to unbound variables.
            raise ValueError("unsupported context: %r" % (context,))

        cmd = (
            "SELECT b.name, b.id FROM %s_concept AS a, concept AS b \
            WHERE a.%s_id = %s AND a.concept_id = b.id"
            % (context, context, context_id)
        )
        concept_res = db.query_db(cmd)
        if len(concept_res) == 0:
            continue

        for concept, concept_id in concept_res:
            concept_vector = concept_category.concept_axes.row_named(concept)
            concept_list.append((concept_id, concept))
            concept_matrix.append(concept_vector)

        # Run affinity propagation
        # NOTE(review): S is a similarity matrix fitted with the default
        # affinity; confirm whether affinity='precomputed' was intended.
        S = cosine_similarity(concept_matrix, concept_matrix)
        af = AffinityPropagation()
        af.fit(S)
        cluster_centers_indices = af.cluster_centers_indices_

        # Group the concepts under the name of their exemplar.
        clusters = defaultdict(list)
        for position, label in enumerate(af.labels_):
            exemplar_name = concept_list[cluster_centers_indices[label]][1]
            clusters[exemplar_name].append(concept_list[position])

        # Number the categories from 1 and store them.
        for category_num, members in enumerate(clusters.values(), 1):
            for concept in members:
                cmd = (
                    "UPDATE %s_concept SET category = %d WHERE \
                    %s_id = %s AND concept_id = %s"
                    % (context, category_num, context, context_id, concept[0])
                )
                db.query_db(cmd)
                # Portable equivalent of ``print name + " ",``: the name
                # followed by two spaces, all on one line.
                sys.stdout.write(_printable(concept[1]) + "  ")
            sys.stdout.write("\n")
        print("----------" + _printable(context_chinese) + "----------")
def train_model( X, quantile, shift = 0, isKernel = False):
    """Fit and return an affinity-propagation model on ``X``.

    Without a kernel the preference is the ``quantile`` percentile of
    ``X`` minus ``shift`` and the model is fitted on ``X`` itself. With
    ``isKernel`` an RBF kernel of ``X`` is fitted as precomputed affinity,
    using the 0.318 percentile of the kernel as preference.
    """
    if isKernel == False:
        plain_preference = np.percentile(X, q=quantile) - shift
        plain_model = AffinityPropagation(preference=plain_preference)
        plain_model.fit(X)
        return plain_model

    kernel = pairwise_kernels(X, metric="rbf")
    # NOTE(review): this value is computed but never used; the estimator
    # below takes a fixed percentile of the kernel instead. Confirm
    # whether ``quantile``/``shift`` should apply in the kernel case.
    preference = np.percentile(X, q=quantile) - shift
    kernel_preference = np.percentile(kernel, q=0.318)
    kernel_model = AffinityPropagation(
        affinity='precomputed',
        preference=kernel_preference,
    )
    kernel_model.fit(kernel)
    return kernel_model
def do_issue(data, data_name):
    """Plot the reduced data set and its affinity-propagation clusters.

    The points of ``data`` are reduced with K-Means, projected onto three
    principal components and drawn twice: coloured by the third component
    and coloured by affinity-propagation cluster. Both figures are saved
    under ``figure_save_path``; ``dataset`` and ``affinity_damping`` are
    read from module scope.
    """
    reduced_points, _labels, _km = reduce_npoints_kmeans(
        dataframe=data, dataset_name=dataset, data_name=data_name,
        n_datapoints=1000, load_from_file=False)
    transformed_data, _pca, _components = calculate_pca(reduced_points, n_components=3)
    colormap = brewer2mpl.get_map('RdBu', 'diverging', 4, reverse=True)

    filename = figure_save_path + dataset + '_issue_29_1_%s_reduced_number_of_points.png'%data_name
    # Print calls with one formatted argument work on Python 2 and 3.
    print("Making scatter plot of %s data for dataset %s, where the number of points have been reduced by K-Means clustering"%(data_name, dataset))
    make_color_grouped_scatter_plot(data_frame=transformed_data, x_name='d1', y_name='d2',
                                    color_by='d3', filename=filename, colormap=colormap)

    ap = AffinityPropagation(damping=affinity_damping)
    ap.fit(reduced_points)

    print("Making scatter plot of Affinity Propagation clusters of %s data for dataset %s"%(data_name, dataset))
    filename = figure_save_path + dataset + '_issue_29_2_%s_affinity.png'%data_name
    make_scatter_plot_for_labelled_data(data_frame=transformed_data, x_name='d1', y_name='d2',
                                        labels=ap.labels_, filename=filename,
                                        colormap=colormap, legend=True)
def cluster(self, feat_mtx):
    """Cluster ``feat_mtx`` and return the least-misery frame sorted by cluster.

    The fitted estimator and its labels are stored on ``self.af`` and
    ``self.labels``.

    Args:
        feat_mtx: Feature matrix passed to ``AffinityPropagation.fit``; its
            rows are assigned, in order, to the rows of
            ``self.df_least_misery``.

    Returns:
        A copy of ``self.df_least_misery`` with an added ``cluster`` column,
        sorted by ``cluster`` and then by ``self.score_col``.
    """
    # clustering artists based on affinity propagation
    af = AffinityPropagation()
    af.fit(feat_mtx)
    self.labels = af.labels_
    self.af = af

    # adding cluster labels to least misery dataframe and sorting by rank and cluster
    sort_cols = ['cluster', self.score_col]
    df_least_misery_clustered = self.df_least_misery.copy()
    df_least_misery_clustered['cluster'] = self.labels
    df_least_misery_clustered[sort_cols] = df_least_misery_clustered[sort_cols].astype(float)

    # will do different sorting if not using rank
    # DataFrame.sort was removed from pandas; prefer sort_values and only
    # fall back to the legacy method on installations that predate it.
    sort_frame = getattr(df_least_misery_clustered, 'sort_values', None)
    if sort_frame is None:
        sort_frame = df_least_misery_clustered.sort
    return sort_frame(sort_cols)
def affinity_propagation(self, affinity_matrix=None, sigma=1, **kwargs):
    """
    Partition the data by affinity propagation on a precomputed affinity.

    :param affinity_matrix: affinities to cluster; when None they are built
        with ``rbf(self.dm.values, sigma)``
    :param sigma: second argument of ``rbf``, only used when no affinity
        matrix is supplied
    :param kwargs: forwarded to ``AffinityPropagation``, e.g. damping=0.5,
        max_iter=200, convergence_iter=15, copy=True, preference=None,
        verbose=False
    :return: ``Partition`` built from the fitted labels
    """
    aff = rbf(self.dm.values, sigma) if affinity_matrix is None else affinity_matrix
    estimator = AffinityPropagation(affinity='precomputed', **kwargs)
    # The estimator is given a plain ndarray view of the affinities.
    estimator.fit(aff.view(np.ndarray))
    return Partition(estimator.labels_)
def loadKmeansData(dataArrayTest, dataArrayTrain, k, m='load'):
    """Load cached clustering results, or cluster the data from scratch.

    Args:
        dataArrayTest: Test samples; normalised and passed to ``predict``
            when ``m`` is not ``'load'``.
        dataArrayTrain: Training samples; normalised and used to fit the
            estimator when ``m`` is not ``'load'``.
        k: Not used by the current AffinityPropagation implementation.
        m: ``'load'`` reads the pickled files ``centroid``, ``labelCluster``
            and ``labelPre`` from the working directory; any other value
            fits a new model.

    Returns:
        Tuple ``(centroid, labelCluster, labelPre)``.
    """
    cache_names = ('centroid', 'labelCluster', 'labelPre')
    if m == 'load':
        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file - only use this on cache files the program wrote itself.
        cached = []
        for cache_name in cache_names:
            # Binary mode is what pickle expects; `with` closes the handle.
            with open(cache_name, 'rb') as cache_file:
                cached.append(pickle.load(cache_file))
        centroid, labelCluster, labelPre = cached
    else:
        dataArrayTestNorm = preprocessing.normalize(dataArrayTest)
        dataArrayTrainNorm = preprocessing.normalize(dataArrayTrain)
        # Alternatives tried previously: MiniBatchKMeans(init='k-means++',
        # n_clusters=k, n_init=10) and DBSCAN(min_samples=30).
        clf = AffinityPropagation()
        pre = clf.fit(dataArrayTrainNorm)
        centroid = pre.cluster_centers_
        labelCluster = pre.labels_
        labelPre = clf.predict(dataArrayTestNorm)
        # NOTE(review): writing the results back is disabled (the
        # pickle.dump calls were commented out), yet the cache files were
        # still opened for writing, which empties them. That side effect is
        # kept as-is; the handles are now closed instead of leaked.
        for cache_name in cache_names:
            with open(cache_name, 'w'):
                pass
    return centroid, labelCluster, labelPre
def create_stratum(self, column_names, **kwargs):
    '''
    Use affinity propagation to find number of strata for each column.

    column_names is a list of the covariates to be split into strata
    and used for classification. This function adds a column to the data
    frame for each column as column_name_strata that gives the strata
    designation for that variable. Columns with at most two distinct values
    are not clustered; their values cast to int are used as the strata.
    The whole data frame is returned.

    kwargs is accepted but not used.

    Raises ValueError when a listed column contains NaN values.
    '''
    for colname in column_names:
        string_name = colname + '_strata'
        # np.asarray works for both pandas Series (which no longer offer
        # .reshape) and plain arrays.
        X = np.asarray(self.data[colname]).reshape(-1, 1)
        if np.isnan(X).any():
            # Adjacent literals keep the message free of the indentation a
            # backslash continuation inside the string would embed.
            raise ValueError(
                "There are NaN values in self.data[%s] that the "
                "clustering algorithm can't handle" % colname
            )
        if np.unique(self.data[colname]).shape[0] <= 2:
            self.data[string_name] = self.data[colname].astype(int)
        else:
            af_model = AP(damping=0.9)
            strata_groups = af_model.fit(X)
            self.data[string_name] = strata_groups.labels_
    return self.data
def affinity_propagation_cluster_analysis(x, y, preference):
    """Cluster the points (x, y) with affinity propagation and plot the result.

    Each cluster is drawn in its own colour, its centre is marked with a
    ``+`` and a dashed line connects the centre to every member.

    Args:
        x: 1-D array of x coordinates.
        y: 1-D array of y coordinates, same length as ``x``.
        preference: Currently not passed to the estimator.

    Returns:
        The cluster label of every point.
    """
    # Marked by the original author as not working / not fully understood.
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_affinity_propagation.html#example-cluster-plot-affinity-propagation-py
    X = np.hstack((x.reshape((x.shape[0], 1)), y.reshape((y.shape[0], 1))))
    af = AffinityPropagation()
    af = af.fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(np.unique(labels))

    # Cycle through the palette so any number of clusters gets a colour.
    colors = 'bgrcmyk'
    for i in range(n_clusters_):
        color = colors[i % len(colors)]
        my_members = labels == i
        cluster_center = X[cluster_centers_indices[i]]
        plt.scatter(X[my_members, 0], X[my_members, 1], s=90, c=color, alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1], marker='+', s=280, c=color)
        # Connect the centre to each member point (previously the function
        # argument `x` was indexed here instead of the loop variable).
        for member in X[my_members]:
            plt.plot(
                [cluster_center[0], member[0]],
                [cluster_center[1], member[1]],
                c=color,
                linestyle='--',
            )

    # Leave a 3% margin around the data on both axes.
    tolx = (X[:, 0].max() - X[:, 0].min()) * 0.03
    toly = (X[:, 1].max() - X[:, 1].min()) * 0.03
    plt.xlim(X[:, 0].min() - tolx, X[:, 0].max() + tolx)
    plt.ylim(X[:, 1].min() - toly, X[:, 1].max() + toly)
    plt.show()
    return labels
def affinity_propagation(x, damping=0.9):
    """Fit affinity propagation on ``x`` and summarise the clustering.

    Args:
        x: Samples passed to ``AffinityPropagation.fit``.
        damping: Damping factor handed to the estimator.

    Returns:
        ``(estimator, (centroids, labels, k))`` where ``k`` is the number of
        cluster centres found.
    """
    settings = dict(
        damping=damping,
        max_iter=400,
        convergence_iter=30,
        copy=True,
        preference=None,
        affinity='euclidean',
        verbose=False,
    )
    ap = AffinityPropagation(**settings)
    ap.fit(x)
    centroids = ap.cluster_centers_
    return ap, (centroids, ap.labels_, len(centroids))
def AlloyClustering(k):
    """Cluster the alloys in ``../../AlloyComps.csv`` by composition.

    Args:
        k: Not used by the current AffinityPropagation estimator; it was the
            cluster count for the KMeans / AgglomerativeClustering variants
            that were tried before.

    Returns:
        ``(labels, alloy_data)``: the cluster label of every alloy and the
        parsed data object.
    """
    alloy_data = data_parser.parse("../../AlloyComps.csv")
    elements = ["Cu", "Ni", "Mn", "P", "Si", "C"]
    composition = np.asarray(alloy_data.get_data(elements))
    estimator = AffinityPropagation()
    estimator.fit(composition)
    return (estimator.labels_, alloy_data)
def runAffinityPropagation(self):
    '''
    This function runs the affinity propagation algorithm.

    The pairwise cosine distances of self.coordinates are turned into
    similarities (2 - distance) and clustered as a precomputed affinity.
    Results are stored on the instance: self.center_id,
    self.assignments[i]['assignment'], self.silhouetteScore and
    self.adjustedScore.
    '''
    distMatrix = distance.squareform(distance.pdist(self.coordinates, 'cosine'))
    # Vectorised form of the former element-by-element "2 - d" loop.
    distMatrix = 2 - distMatrix

    model = AffinityPropagation(damping=self.damping, max_iter=self.max_iter, affinity='precomputed')
    model.fit(distMatrix)
    self.center_id = model.cluster_centers_indices_.tolist()

    belongs = model.labels_.tolist()
    for i, label in enumerate(belongs):
        # Cluster names are 1-based: label 0 becomes 'centroid_1'.
        self.assignments[i]['assignment'] = 'centroid_' + str(label + 1)

    # NOTE(review): the silhouette is computed on the rows of the similarity
    # matrix with metric='cosine', as before - confirm that this, and not
    # self.coordinates, is the intended input.
    self.silhouetteScore = metrics.silhouette_score(distMatrix, model.labels_, metric='cosine')
    trueLabel = dataProcessing.getTrueLabel(self.assignments)
    self.adjustedScore = metrics.adjusted_rand_score(belongs, trueLabel)
def dataset_fringes(X, cluster_algo, min_compression=64):
    """Reduce ``X`` to representative samples with the chosen algorithm.

    Args:
        X: Samples to compress.
        cluster_algo: ``'none'``, ``'AffinityPropagation'``, ``'DBSCAN'`` or
            ``'svm_outlier'``.
        min_compression: ``X`` is returned untouched when it holds at most
            this many samples.

    Returns:
        ``X`` itself when no compression applies, the exemplar rows for
        ``'AffinityPropagation'``, the centroids of the DBSCAN labels for
        ``'DBSCAN'``, and ``None`` for the unfinished ``'svm_outlier'``
        branch and for unknown names (which also print ``BOH``).
    """
    # Nothing to do for small inputs or when compression is switched off.
    if cluster_algo == 'none' or len(X) <= min_compression:
        return X

    if cluster_algo == 'AffinityPropagation':
        exemplar_model = AffinityPropagation()
        neg_dist = -spsp.distance.squareform(sp.spatial.distance.pdist(X))
        exemplar_model.fit(neg_dist)
        return X[exemplar_model.cluster_centers_indices_]

    if cluster_algo == 'DBSCAN':
        density_model = DBSCAN(metric='precomputed', min_samples=2)
        neg_dist = -spsp.distance.squareform(sp.spatial.distance.pdist(X))
        found_labels = density_model.fit(neg_dist).labels_
        return NearestCentroid().fit(X, found_labels).centroids_

    if cluster_algo == 'svm_outlier':
        # UNFINISHED: the estimator is only constructed, nothing is returned.
        algo = svm.OneClassSVM(nu=0.95 * 0.25 + 0.05, kernel="rbf")  # , gamma=0.1)
        return None

    print("BOH")
    return None
def affinity_umi_removal(molecular_barcodes, _):
    """
    Groups similar UMIs with affinity propagation on their negated
    pairwise Hamming distances and keeps one UMI per group.
    For every cluster found a random member is selected; lists of
    two or fewer UMIs are delegated to countUMINaive.
    :param molecular_barcodes: a list of UMIs
    :return: a list of unique UMIs
    :rtype: list
    """
    if len(molecular_barcodes) <= 2:
        # NOTE(review): allowed_mismatches is not a parameter of this
        # function (the second argument is named `_`); confirm it exists at
        # module level or should be the second argument.
        return countUMINaive(molecular_barcodes, allowed_mismatches)

    words = np.asarray(molecular_barcodes)
    distances = np.array([[hamming_distance(w1, w2) for w1 in words] for w2 in words])
    similarity = -1 * distances

    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(similarity)
    assigned = affprop.labels_

    unique_clusters = []
    for cluster_id in np.unique(assigned):
        # The exemplar is looked up but the representative is drawn at random.
        exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
        members = np.unique(words[np.nonzero(assigned == cluster_id)])
        unique_clusters.append(random.choice(members))
    return unique_clusters
def _internal(preferences, affinity_matrix, dist_matrix, idx, n_jobs, n, queue_y):
    """Evaluate every ``n_jobs``-th preference value, starting at ``idx``.

    For each handled index ``i`` an AffinityPropagation model is fitted on
    the precomputed ``affinity_matrix`` with ``preferences[i]``, its labels
    are saved through ``save_results_clusters`` (using the module-level
    ``sample_names``) and, when more than one cluster was found, the mean
    silhouette value is stored in ``queue_y[i]``.

    Args:
        preferences: Indexable collection of preference values.
        affinity_matrix: Precomputed affinities passed to ``fit``.
        dist_matrix: Precomputed distances used for the silhouette.
        idx: First index handled by this call.
        n_jobs: Stride between handled indices.
        n: Exclusive upper bound of the indices.
        queue_y: Indexable output container, written at position ``i``.
    """
    for i in range(idx, n, n_jobs):
        ap = AffinityPropagation(preference=preferences[i], affinity='precomputed', max_iter=500)
        ap.fit(affinity_matrix)
        cluster_labels = ap.labels_
        nclusts = np.unique(cluster_labels).shape[0]
        save_results_clusters("res_ap_{:03d}_clust.csv".format(nclusts), sample_names, cluster_labels)
        if nclusts <= 1:
            # A silhouette needs at least two clusters.
            continue
        try:
            silhouette_list = silhouette_samples(dist_matrix, cluster_labels, metric="precomputed")
            queue_y[i] = np.mean(silhouette_list)
        except Exception:
            # Best effort: report the shapes and move on. KeyboardInterrupt
            # and SystemExit are deliberately no longer swallowed here.
            print(dist_matrix.shape, cluster_labels.shape)
def get_label_res2(similar_matrix, n_subs):
    """Cluster a precomputed similarity matrix and return the labels.

    Args:
        similar_matrix: Precomputed affinities passed to ``fit``.
        n_subs: Not used by the affinity-propagation variant.

    Returns:
        The ``labels_`` array of the fitted estimator (also printed).

    Raises:
        AssertionError: If the number of distinct labels is not strictly
            between 1 and 10; the count is the exception argument.
    """
    cluster = AffinityPropagation(damping=0.75, affinity='precomputed')
    res = cluster.fit(similar_matrix)
    size_labels = len(set(res.labels_))
    # Raised explicitly so the check also runs under ``python -O``.
    if not 1 < size_labels < 10:
        raise AssertionError(size_labels)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(res.labels_)
    return res.labels_
def compute_threshold(affmat):
    """
    This function uses affinity propagation to cluster the sequences, and
    then computes minimum of minimum in-cluster pairwise identities to be used
    as a threshold value.

    :param affmat: square data frame of precomputed affinities whose index
        labels are also its column labels
    :return: the smallest within-cluster value, or 1 if every within-cluster
        value is at least 1
    """
    ap = AffinityPropagation(affinity='precomputed')
    ap.fit(affmat)

    # One row per sequence, labelled like affmat, holding its cluster id.
    clusters = pd.DataFrame({'Cluster': ap.labels_}, index=affmat.index)

    minval = 1
    for _, members in clusters.groupby('Cluster'):
        accessions = members.index
        subset = affmat[accessions].loc[accessions, :]
        # np.asarray replaces the deprecated np.matrix wrapper; the minimum
        # is evaluated once per cluster.
        cluster_min = np.asarray(subset).min()
        if cluster_min < minval:
            minval = cluster_min
    return minval