def __init__(self, categories, replacement, selection=None, lam=500, theta_init=None):
    """
    Parameters
    ----------
    replacement : eda.optimizer.replacement.replacement_base.ReplacementBase
        Replacement method.
    selection : eda.optimizer.selection.selection_base.SelectionBase, default None
        Selection method.
    """
    super(AffEDA, self).__init__(categories, lam=lam, theta_init=theta_init)
    self.replacement = replacement
    self.selection = selection
    self.population = None
    self.fitness = None
    self.cluster = None
    self.ap = cluster.AffinityPropagation(affinity="precomputed", random_state=0)
def do_algo(self, input):
    control_params = input.algo_control.control_params
    if not self.check_input_params(self.get_input_params_definition(), control_params):
        log.error("Input parameter type check failed.")
        return None
    mode = input.algo_control.mode
    data = input.algo_data.data
    if mode == 'training':
        try:
            model = cluster.AffinityPropagation(
                damping=control_params["damping"],
                preference=control_params["preference"],
                convergence_iter=control_params["convergence_iter"],
                max_iter=control_params["max_iter"])
            model.fit(data)
            algo_output = alc.AlgoParam(
                algo_control={'mode': 'training', 'control_params': ''},
                algo_data={'data': data, 'label': None},
                algo_model={'model_params': model.get_params(),
                            'model_instance': model})
        except Exception as e:
            log.error(str(e))
            algo_output = None
    else:
        algo_output = None
    return algo_output
def _affinity_propagation(feature, ground_truth, p_d):
    p = p_d['preference']
    d = p_d['damping_factor']
    if p_d.get('affinity') == 'precomputed':
        # Build a symmetric affinity matrix from a k-nearest-neighbours graph.
        connectivity = kneighbors_graph(feature, n_neighbors=p_d['n_neighbors'],
                                        include_self=True)
        affinity_matrix = 0.5 * (connectivity + connectivity.T)
        affinity_matrix = np.asarray(affinity_matrix.todense(), dtype=float)
        af = cluster.AffinityPropagation(
            damping=d, affinity='precomputed').fit(affinity_matrix)
    else:
        af = cluster.AffinityPropagation(preference=p, damping=d).fit(feature)
    y_pred_af = af.labels_
    ars_af = metrics.adjusted_rand_score(ground_truth, y_pred_af)
    return ars_af
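# A minimal usage sketch for _affinity_propagation, assuming the module-level
# imports it relies on (numpy as np, sklearn's cluster and metrics,
# kneighbors_graph). The feature matrix and labels are made-up illustration data.
import numpy as np
from sklearn import cluster, metrics
from sklearn.neighbors import kneighbors_graph

feature = np.random.RandomState(0).rand(30, 2)
ground_truth = np.repeat([0, 1, 2], 10)
p_d = {'preference': -50, 'damping_factor': 0.9,
       'affinity': 'precomputed', 'n_neighbors': 5}
print(_affinity_propagation(feature, ground_truth, p_d))  # adjusted Rand score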
def ClusterHouses(matches, plot_groups=False):
    groups = {}
    try:
        N = len(matches)
        X = np.zeros((N, 2))
        for m in range(N):
            loc = RFAPI.house_location(matches[m])
            # logging.debug("ClusterHouses({})".format(loc))
            X[m] = (loc[0], loc[1])
        params = {
            'quantile': .3, 'eps': .15, 'damping': .9,
            'preference': -5, 'n_neighbors': 2, 'n_clusters': 5
        }
        # A bit buggy on this data.
        spectral = cluster.SpectralClustering(
            n_clusters=params['n_clusters'], eigen_solver='arpack',
            affinity="nearest_neighbors")
        # Best results so far.
        gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                      covariance_type='full')
        # Yielded a single cluster on this data.
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        algorithm = ms
        algorithm.fit(X)
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)  # np.int is deprecated
        else:
            y_pred = algorithm.predict(X)
        for m in range(len(matches)):
            key = str(y_pred[m])
            if key not in groups:
                groups[key] = []
            groups[key].append({
                "address": RFAPI.house_address(matches[m]),
                "location": [X[m][0], X[m][1]]
            })
        logging.debug("groups = {}".format(groups))
        if plot_groups:
            HouseScore._plot_groups(X, y_pred)
    except Exception as e:
        groups["error"] = str(e)
        logging.error(groups["error"])
    return groups
def get_distances(con, cur=None, compid=0):
    import scipy.sparse as sp
    from sklearn import cluster
    owners = pd.read_sql(f'select * from component where compid={compid}', con)
    ostr = ','.join([str(o) for o in owners['ownerid']])
    omap = {o: i for i, o in enumerate(owners['ownerid'])}
    owners['oid'] = owners['ownerid'].map(omap)
    nown = len(owners)
    pairs = pd.read_sql(
        f'select * from pair where ownerid1 in ({ostr}) or ownerid2 in ({ostr})', con)
    pairs['dist'] = pairs.apply(lambda df: affin(df['name1'], df['name2']), axis=1)
    pairs['oid1'] = pairs['ownerid1'].map(omap)
    pairs['oid2'] = pairs['ownerid2'].map(omap)
    # Dense pairwise matrix over every owner pair, with missing similarities as 0.
    dist = pd.DataFrame(
        [(o1, o2) for o1, o2 in product(owners['ownerid'], owners['ownerid'])],
        columns=['ownerid1', 'ownerid2'])
    dist = dist.join(pairs.set_index(['ownerid1', 'ownerid2'])['dist'],
                     on=['ownerid1', 'ownerid2']).fillna(0.0)
    amat = dist['dist'].values.reshape([nown, nown])
    # amat = sp.coo_matrix((pairs['dist'], (pairs['oid1'], pairs['oid2'])))
    # fit = cluster.SpectralClustering(affinity='precomputed').fit(amat)
    fit = cluster.AffinityPropagation(affinity='precomputed').fit(amat)
    # fit = cluster.DBSCAN(metric='precomputed').fit(amat)
    nclust = np.max(fit.labels_) + 1
    cids = [np.nonzero(fit.labels_ == i)[0] for i in range(nclust)]
    cown = [owners[owners['oid'].isin(c)]['ownerid'] for c in cids]
    cname = [get_names(olist=c) for c in cown]
    return owners, pairs, cname
def names_clustering(stringVect):
    '''
    Create clusters of the most commonly appearing sub-strings and assign
    them to the items passed in. Clustering is done on the similarity
    matrix computed from the input.

    Requires:
        sklearn.cluster.AffinityPropagation
        fuzzywuzzy.fuzz

    Input:
        stringVect - vector of strings

    Output:
        dfCluster - a dataframe containing the original stringVect inputs
                    and their associated cluster labels
    '''
    # Generate the similarity matrix for the input
    S = generate_similarity_score_matrix(stringVect)
    # Fit Affinity Propagation on the precomputed similarity matrix S
    clusters = cluster.AffinityPropagation(affinity='precomputed',
                                           random_state=None).fit_predict(S)
    # Create the output dataframe
    dfCluster = pd.DataFrame(list(zip(stringVect, clusters)),
                             columns=['input_names', 'cluster'])
    return dfCluster
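# A minimal usage sketch for names_clustering. It assumes the module's
# generate_similarity_score_matrix helper (not shown here) builds a square
# similarity matrix, e.g. from fuzzywuzzy ratios; the names are made up.
names = ['Acme Corp', 'Acme Corporation', 'Globex', 'Globex Inc', 'Initech']
df = names_clustering(names)
print(df.groupby('cluster')['input_names'].apply(list))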
def trace_clustering(dataframe, output_path, filename):
    array = np.transpose(dataframe.values)
    clustering_method = 'KMeans'
    if clustering_method == 'AffinityPropagation':
        clustering = cluster.AffinityPropagation().fit(array)
    elif clustering_method == 'KMeans':
        clustering = cluster.KMeans(n_clusters=6).fit(array)
    labels = clustering.labels_
    print(labels)
    for cluster_group_index in np.unique(labels):
        fig = plt.figure()
        ax = fig.gca()
        trace_index_list = np.argwhere(labels == cluster_group_index)
        for count, trace_index in enumerate(trace_index_list):
            trace_length = array[trace_index, :].shape[1]
            trace = array[trace_index, :].reshape(trace_length)
            if count == 0:
                average = trace
            else:
                # Incremental mean over all traces seen so far (the original
                # pairwise mean over-weighted later traces).
                average = average + (trace - average) / (count + 1)
            ax.plot(np.arange(0, len(trace), 1), trace, 'b')
        ax.plot(np.arange(0, len(average), 1), average, 'r')
        plt.show()
        fig.savefig(output_path + filename + clustering_method + '_' +
                    str(cluster_group_index) + '_cluster.png')
def affinity_propagation(similarity_matrix):
    """Perform Affinity Propagation clustering of data.

    Note: This function is a wrapper for AffinityPropagation from scikit-learn.

    Source: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html

    Parameters
    ----------
    similarity_matrix : pandas DataFrame, shape (n_samples, n_samples)
        Matrix of similarities between points.

    Returns
    -------
    clusters : dictionary
        A dictionary of <sample: cluster label> key-value pairs.
    """
    labels = cluster.AffinityPropagation().fit_predict(similarity_matrix)
    n_labels = labels.max()
    clusters = {}
    for i in range(n_labels + 1):
        for neuron in list(similarity_matrix.columns[labels == i]):
            clusters[neuron] = i
    return clusters
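# A minimal usage sketch for the affinity_propagation wrapper, assuming a
# symmetric similarity DataFrame indexed by sample names (the toy matrix is made up).
import numpy as np
import pandas as pd
from sklearn import cluster

names = ['n1', 'n2', 'n3', 'n4']
sim = pd.DataFrame(np.array([[1.0, 0.9, 0.1, 0.0],
                             [0.9, 1.0, 0.2, 0.1],
                             [0.1, 0.2, 1.0, 0.8],
                             [0.0, 0.1, 0.8, 1.0]]),
                   index=names, columns=names)
print(affinity_propagation(sim))  # e.g. {'n1': 0, 'n2': 0, 'n3': 1, 'n4': 1}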
def investigateOptimalAlgorithms(kmerId, kmerPca):
    plot.setLibrary('bokeh')
    pca = kmerPca.loc[:, PCA_DATA_COL_NAMES]
    plots = {}
    # Distinct names for the two agglomerative variants; duplicate keys would
    # overwrite each other in the plots dict below.
    algos = (('KMeans', cluster.KMeans()),
             ('Affinity', cluster.AffinityPropagation()),
             ('MeanShift', cluster.MeanShift()),
             ('Spectral', cluster.SpectralClustering()),
             ('AgglomerativeAverage', cluster.AgglomerativeClustering(linkage='average')),
             ('AgglomerativeWard', cluster.AgglomerativeClustering(linkage='ward')),
             ('DBSCAN', cluster.DBSCAN()),
             ('Gaussian', GaussianMixture()))
    ## Visualise data and manually determine which algorithm will be good
    for i, (name, algo) in enumerate(algos, 1):
        labels = _getLabels(algo, pca)
        labels = pd.DataFrame(labels, columns=[CLABEL_COL_NAME])
        kmerDf = pd.concat([kmerId, pca, labels], axis=1)
        dataset = hv.Dataset(kmerDf, PCA_DATA_COL_NAMES)
        scatter = dataset.to(hv.Scatter, PCA_DATA_COL_NAMES,
                             groupby=CLABEL_COL_NAME).overlay()
        scatter.opts(opts.Scatter(size=10, show_legend=True))
        plots[name] = scatter
    plots = hv.HoloMap(plots, kdims='algo')
    plots = plots.collate()
    return plots
def Affinity(tfidf, cluster_list):
    affinity = cluster.AffinityPropagation(preference=10).fit(tfidf)
    labels = affinity.labels_
    # result = normalized_mutual_info_score(labels, cluster_list)
    result = v_measure_score(labels, cluster_list)
    print("the Affinity Propagation cluster algorithm result is:", result)
def groupROIClusters(clusters: List[ROICluster], factor=-1, normalize=True,
                     prefFunc=lambda mat: (np.min(mat) + np.median(mat)) / 2) -> List[List[ROICluster]]:
    def dist(A, B):
        widthA, heightA = A[0], A[1]
        widthB, heightB = B[0], B[1]
        # return (abs(widthA - widthB) + abs(heightA - heightB)) / 2
        return (abs(widthA - widthB) + abs(heightA - heightB)) / 1
    # Pairwise distances on (width, height) of each cluster's parent bbox.
    afmat = pdist(np.matrix([[cluster.parent.bbox['width'], cluster.parent.bbox['height']]
                             for cluster in clusters]), dist)
    # Negate (and optionally normalize) the distances to obtain similarities.
    if normalize:
        afmat = factor * squareform(afmat / np.max(afmat))
    else:
        afmat = factor * squareform(afmat)
    if prefFunc:
        pref = prefFunc(afmat)
    else:
        pref = np.min(afmat)
    ap = cluster.AffinityPropagation(affinity='precomputed', preference=pref)
    ap.fit(afmat)
    # Collapse identical label sets into groups of indices.
    groups = set()
    for label in ap.labels_:
        groups.add(frozenset(i for i, x in enumerate(ap.labels_) if x == label))
    groups = [list(group) for group in groups]
    return [[clusters[i] for i in indexGroup] for indexGroup in groups]
def cluster_model(newdata, data, model_name, input_param):
    ds = data
    params = input_param
    name = model_name.lower()
    if name == 'kmeans':
        cluster_obj = cluster.KMeans(n_clusters=params['n_clusters'])
    elif name == 'minibatchkmeans':
        cluster_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    elif name == 'spectralclustering':
        cluster_obj = cluster.SpectralClustering(n_clusters=params['n_clusters'])
    elif name == 'meanshift':
        cluster_obj = cluster.MeanShift(bandwidth=params['bandwidth'])
    elif name == 'dbscan':
        cluster_obj = cluster.DBSCAN(eps=params['eps'])
    elif name == 'affinitypropagation':
        cluster_obj = cluster.AffinityPropagation(damping=params['damping'],
                                                  preference=params['preference'])
        cluster_obj.fit(ds)
    elif name == 'birch':
        cluster_obj = cluster.Birch(n_clusters=input_param['n_clusters'])
    elif name == 'gaussianmixture':
        cluster_obj = mixture.GaussianMixture(n_components=params['n_clusters'],
                                              covariance_type='full')
        cluster_obj.fit(ds)
    # AffinityPropagation and GaussianMixture were fitted above and are
    # predicted here; the other estimators use fit_predict directly.
    if name in ('affinitypropagation', 'gaussianmixture'):
        model_result = cluster_obj.predict(ds)
    else:
        model_result = cluster_obj.fit_predict(ds)
    newdata[model_name] = pd.DataFrame(model_result)
    return newdata
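# A minimal usage sketch for cluster_model, assuming the module-level
# `from sklearn import cluster, mixture` and `import pandas as pd` the
# function relies on; the blob data and parameter dict are made up.
import pandas as pd
from sklearn import cluster, mixture
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
out = cluster_model({}, X, 'AffinityPropagation',
                    {'damping': 0.9, 'preference': -50})
print(out['AffinityPropagation'][0].value_counts())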
def determine_source_locations_instance(r_ref, l_ref, node_events, **kwargs):
    """
    Determines the positions in the probability grid that have the highest
    probability of being the position of the source.
    """
    max_vals = determine_source_position_list(r_ref, l_ref, node_events, **kwargs)
    positions = np.array([p.to_list() for p, _ in max_vals])
    af = clustering.AffinityPropagation().fit(positions)
    max_prob_centers = determine_peaks(max_vals, af.labels_)
    prob_list = [position_probability(p.x, p.y, r_ref, l_ref, node_events)
                 for p in max_prob_centers]
    ret_list = [Location(p, conf) for p, conf in zip(max_prob_centers, prob_list)]
    return ret_list
def _init_model(self, embedding_model=None):
    if embedding_model is None:
        self.load_embeddings_model()
    else:
        self.emb_model = embedding_model
    return cluster.AffinityPropagation(damping=0.9,
                                       max_iter=2000,
                                       convergence_iter=1000,
                                       preference=None,
                                       affinity='precomputed',
                                       verbose=True)
def getSortedRowClusters(self, objs):
    '''
    Determine row clusters and their order.

    Clusters that form rows are determined by the user-specified algorithm.
    They are then sorted by location, and lists of indices for each cluster
    are returned in order.
    '''
    if self.row_algorithm == 'affinity':
        algorithm = cluster.AffinityPropagation(**self.row_params)
    elif self.row_algorithm == 'DBSCAN':
        algorithm = cluster.DBSCAN(**self.row_params)
    elif self.row_algorithm == 'MeanShift':
        algorithm = cluster.MeanShift(**self.row_params)
    Y = np.array([[y.baseline] for y in objs], dtype=np.float64)
    rows = algorithm.fit_predict(Y)
    if self.row_algorithm == 'affinity':
        # Here, labels follow the order of the exemplar samples, so just
        # sort them directly.
        row_set = set(rows)

        def ordered_clusters():
            # ABBYY coordinates are bottom-to-top, so reverse the list.
            for i in sorted(row_set, reverse=True):
                yield np.where(rows == i)[0]
        return ordered_clusters(), len(row_set), False
    elif self.row_algorithm == 'DBSCAN':
        # Here, samples are labelled, so go back and find the original
        # locations.
        fuzzy = -1 in rows
        num_clusters = len(set(rows)) - (1 if fuzzy else 0)
        clusters = []
        cluster_centres = np.empty(num_clusters)
        for i in range(num_clusters):
            index = np.where(rows == i)
            clusters.append(index[0])
            cluster_centres[i] = np.mean(np.take(Y, index))
        ordered_clusters = (
            clust for centre, clust in sorted(zip(cluster_centres, clusters)))
        return ordered_clusters, num_clusters, fuzzy
    elif self.row_algorithm == 'MeanShift':
        # Here, samples are labelled, but cluster locations are provided.
        fuzzy = -1 in rows
        num_clusters = len(set(rows)) - (1 if fuzzy else 0)
        clusters = []
        for i in range(num_clusters):
            index = np.where(rows == i)
            clusters.append(index[0])
        ordered_clusters = (clust for centre, clust in sorted(
            zip(algorithm.cluster_centers_, clusters)))
        return ordered_clusters, num_clusters, fuzzy
def affinitypropagation(pointarrays, candforpre=None, preference=None):
    ap = cluster.AffinityPropagation()
    if candforpre is None:
        ap.fit(array(pointarrays))
        return ap.labels_, ap.cluster_centers_indices_, None
    else:
        ap.fit(array(candforpre))
        # Use predict rather than fit_predict: fit_predict would refit on
        # pointarrays and silently discard the candidate fit above.
        labels = ap.predict(array(pointarrays))
        return labels, None, None
def use_af(mat, n_cluster):
    clusters = cls.AffinityPropagation(damping=0.99282,
                                       affinity='precomputed').fit(mat)
    n_cluster = max(clusters.labels_) + 1  # note: shadows the unused parameter
    hist, bin_edges = np.histogram(clusters.labels_, bins=np.arange(n_cluster + 1))
    print('Affinity Propagation clustering:', clusters.labels_)
    print(hist)
    return clusters.labels_
def get_algorithm(algorithm_name: str, clusters: int):
    """Return an unfitted scikit-learn clustering estimator by name.

    Note: AffinityPropagation determines the number of clusters itself,
    so `clusters` is ignored for that algorithm.
    """
    if algorithm_name == "Birch":
        return cluster.Birch(n_clusters=clusters)
    elif algorithm_name == "Spectral Clustering":
        return cluster.SpectralClustering(n_clusters=clusters)
    elif algorithm_name == 'Affinity Propagation':
        return cluster.AffinityPropagation()
    else:
        raise NotImplementedError(f'algorithm: {algorithm_name} not implemented')
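# A minimal usage sketch for get_algorithm; the blob data is made up.
from sklearn import cluster
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, random_state=0)
algo = get_algorithm('Affinity Propagation', clusters=3)  # clusters ignored here
print(algo.fit_predict(X))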
def _cluster(self, acts, method='KM', param_dict=None):
    print('Starting clustering with {} for {} activations'.format(
        method, acts.shape[0]))
    if param_dict is None:
        param_dict = {}
    centers = None
    if method == 'KM':
        n_clusters = param_dict.pop('n_clusters', 25)
        km = cluster.KMeans(n_clusters)
        km.fit(acts)
        centers = km.cluster_centers_
        d = np.linalg.norm(np.expand_dims(acts, 1) - np.expand_dims(centers, 0),
                           ord=2, axis=-1)
        asg, cost = np.argmin(d, -1), np.min(d, -1)
    elif method == 'AP':
        damping = param_dict.pop('damping', 0.5)
        # keyword argument; positional in the original, but keyword-only in
        # newer scikit-learn releases
        ca = cluster.AffinityPropagation(damping=damping)
        ca.fit(acts)
        centers = ca.cluster_centers_
        d = np.linalg.norm(np.expand_dims(acts, 1) - np.expand_dims(centers, 0),
                           ord=2, axis=-1)
        asg, cost = np.argmin(d, -1), np.min(d, -1)
    elif method == 'MS':
        ms = cluster.MeanShift(n_jobs=self.num_workers)
        asg = ms.fit_predict(acts)
    elif method == 'SC':
        n_clusters = param_dict.pop('n_clusters', 25)
        sc = cluster.SpectralClustering(n_clusters=n_clusters,
                                        n_jobs=self.num_workers)
        asg = sc.fit_predict(acts)
    elif method == 'DB':
        eps = param_dict.pop('eps', 0.5)
        min_samples = param_dict.pop('min_samples', 20)
        sc = cluster.DBSCAN(eps=eps, min_samples=min_samples,
                            n_jobs=self.num_workers)
        asg = sc.fit_predict(acts)
    else:
        raise ValueError('Invalid Clustering Method!')
    if centers is None:
        ## If the algorithm did not return cluster centers, use medoids.
        centers = np.zeros((asg.max() + 1, acts.shape[1]))
        cost = np.zeros(len(acts))
        for cluster_label in range(asg.max() + 1):
            cluster_idxs = np.where(asg == cluster_label)[0]
            cluster_points = acts[cluster_idxs]
            pw_distances = metrics.euclidean_distances(cluster_points)
            centers[cluster_label] = cluster_points[np.argmin(
                np.sum(pw_distances, -1))]
            cost[cluster_idxs] = np.linalg.norm(
                acts[cluster_idxs] - np.expand_dims(centers[cluster_label], 0),
                ord=2, axis=-1)
    print('Created {} clusters'.format(len(np.unique(asg))))
    return asg, cost, centers
def cluster_query(method):
    # Cluster the query data provided by Prof. Yang's group
    load_raw_query()
    load_hidden_vector()
    # Check that the counts match
    if len(documents) != len(hidden_vectors):
        print("Number of log entries does not match number of vectors; please check and retry")
        sys.exit()
    # Normal processing flow from here on
    print("Building the hidden-vector array")
    t0 = datetime.datetime.now()
    X = np.array([[ele for ele in vector[:-1].split("\t")]
                  for vector in hidden_vectors])
    t1 = datetime.datetime.now()
    print("Elapsed:", t1 - t0)
    # print("Normalizing dataset (feature selection)")
    # # normalized dataset for easier parameter selection
    # t0 = datetime.datetime.now()
    # X = StandardScaler().fit_transform(X)
    # t1 = datetime.datetime.now()
    # print("Elapsed:", t1 - t0)
    if method == "kmeans":
        print("Starting %s clustering with %d centers" % (method, num_topic))
        algorithm = cluster.MiniBatchKMeans(n_clusters=num_topic)
    elif method == "ap":
        print("Starting %s clustering; number of centers to be determined" % method)
        algorithm = cluster.AffinityPropagation(damping=.5, preference=None)
    t0 = datetime.datetime.now()
    algorithm.fit(X)
    t1 = datetime.datetime.now()
    print("Elapsed:", t1 - t0)
    # Write the results out, grouped by cluster
    print("Writing results grouped by cluster")
    y_pred = algorithm.labels_.astype(int)
    # Find the largest cluster label
    maxY = max(y_pred)
    print("Number of clusters:", maxY + 1)
    # Collect the documents of each cluster
    topic_result = [[] for i in range(maxY + 1)]
    for i in range(len(documents)):
        topic_result[y_pred[i]].append(documents[i])
    for i in range(len(topic_result)):
        filepath = "%stopic%d.txt" % (kmeans_result_dir, i)
        # print("Writing cluster %d to %s" % (i, filepath))
        with codecs.open(filepath, "w", "utf-8") as f:
            f.write("Cluster %d contains %d records\n" % (i, len(topic_result[i])))
            for line in topic_result[i]:
                f.write(line)
    print("Clustering results written")
def clustering(X, algorithm, n_clusters=2):
    X = np.transpose(X)
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # Select the clustering model
    if algorithm == 'KMeans':
        model = cluster.KMeans(n_clusters=n_clusters, random_state=0)
    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=n_clusters)
    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=n_clusters,
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                linkage='ward',
                                                connectivity=connectivity)
    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average",
                                                affinity="cityblock",
                                                n_clusters=n_clusters,
                                                connectivity=connectivity)
    model.fit(X)
    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)  # np.int is deprecated
    else:
        y_pred = model.predict(X)
    return X, y_pred
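# A minimal usage sketch for clustering, with made-up blob data. Note the
# function transposes its input, so samples are passed here as columns;
# StandardScaler and kneighbors_graph are assumed imported at module level.
import numpy as np
from sklearn import cluster
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
from sklearn.datasets import make_blobs

samples, _ = make_blobs(n_samples=100, centers=2, random_state=0)
X_t, y_pred = clustering(samples.T, 'AffinityPropagation')
print(np.unique(y_pred))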
def affinitypropagation(words, querys=None, preference=None):
    # 0.6 was passed positionally in the original; the first parameter is damping
    ap = cluster.AffinityPropagation(damping=0.6)
    if querys is None:
        ap.fit(array(words))
        return ap.labels_, ap.cluster_centers_indices_, None
    else:
        ap.fit(array(words))
        w_labels = ap.labels_
        labels = ap.fit_predict(array(querys))
        return w_labels, None, labels
def compute_clusters(vectors, clusters, algorithm='kmeans'):
    # select clustering algorithm
    if algorithm == 'kmeans':
        algorithm = cluster.MiniBatchKMeans(n_clusters=len(set(clusters)))
    elif algorithm == 'dbscan':
        algorithm = cluster.DBSCAN(eps=1.25, n_jobs=-1)
    elif algorithm == 'optics':
        algorithm = cluster.OPTICS(min_samples=10, eps=10,
                                   cluster_method='dbscan', n_jobs=-1)
    elif algorithm == 'birch':
        algorithm = cluster.Birch(n_clusters=len(set(clusters)))
    elif algorithm == 'spectral':
        algorithm = cluster.SpectralClustering(n_clusters=len(set(clusters)),
                                               eigen_solver='arpack',
                                               affinity="nearest_neighbors",
                                               n_jobs=-1)
    elif algorithm == 'affinity':
        algorithm = cluster.AffinityPropagation(damping=.9, preference=-200)
    else:
        raise NotImplementedError(f"Not implemented for algorithm {algorithm}")

    # predict cluster memberships
    algorithm.fit(vectors)
    if hasattr(algorithm, 'labels_'):
        labels = algorithm.labels_.astype(int)  # np.int is deprecated
    else:
        labels = algorithm.predict(vectors)

    # transform categorical labels to digits
    if isinstance(clusters[0], str):
        labels_true = LabelEncoder().fit_transform(clusters)
    elif isinstance(clusters[0], (int, np.integer)):
        labels_true = clusters

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(vectors, labels))
    return labels, algorithm
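# A minimal usage sketch for compute_clusters, with made-up blob vectors and
# string ground-truth labels; module-level imports assumed as in the source.
import numpy as np
from sklearn import cluster, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_blobs

vectors, y = make_blobs(n_samples=150, centers=3, random_state=0)
truth = ['group%d' % label for label in y]
labels, model = compute_clusters(vectors, truth, algorithm='affinity')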
def select_n_clusters(data, data_pca, preference_range):
    scores = []
    for preference in preference_range:
        ap = cluster.AffinityPropagation(preference=preference).fit(data_pca)
        score = get_score(data, ap)
        scores.append(score)
    for i, score_function in enumerate(
            ['n_clusters', 'silhouette_score', 'calinski_harabaz_score']):
        plt.subplot(1, 3, i + 1)
        plt.title(score_function)
        plt.plot(preference_range, [item[score_function] for item in scores])
    plt.show()
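# A minimal usage sketch for select_n_clusters, assuming the module's
# get_score helper (not shown) returns a dict with 'n_clusters',
# 'silhouette_score' and 'calinski_harabaz_score' keys; the data is made up.
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

data, _ = make_blobs(n_samples=200, centers=4, n_features=5, random_state=0)
data_pca = PCA(n_components=2).fit_transform(data)
select_n_clusters(data, data_pca, preference_range=range(-200, -19, 20))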
def __init__(self, conn, args, data, split_type, num_clusters):
    """Constructor for Cluster object.

    :param conn: database connection object.
    :param args: dict of arguments read from the arguments file.
    :param data: data to cluster.
    :param split_type: Split train/test data randomly or by date to allow
        testing by specific date ranges.
    :param num_clusters: Number of clusters to create.
    :return: Cluster instance.
    """
    self.conn = conn
    self.args = args
    self.data = data
    self.split_type = split_type
    self.pca_model = None
    self.cluster_model = None
    self.algorithm = args['cluster_algorithm']
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
    hdbsc = hdbscan.HDBSCAN(min_cluster_size=10)
    affinity_propagation = cluster.AffinityPropagation()
    ms = cluster.MeanShift(bin_seeding=True)
    spectral = cluster.SpectralClustering(n_clusters=num_clusters,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors",
                                          random_state=self.args['seed'])
    ward = cluster.AgglomerativeClustering(n_clusters=num_clusters,
                                           linkage='ward')
    birch = cluster.Birch(n_clusters=num_clusters)
    two_means = cluster.MiniBatchKMeans(n_clusters=num_clusters,
                                        random_state=self.args['seed'])
    average_linkage = cluster.AgglomerativeClustering(linkage="average",
                                                      n_clusters=num_clusters)
    kmeans = cluster.KMeans(n_clusters=num_clusters,
                            random_state=self.args['seed'])
    dbscan = cluster.DBSCAN()
    self.clustering_algorithms = {
        'MiniBatchKMeans': two_means,
        'AffinityPropagation': affinity_propagation,
        'MeanShift': ms,
        'SpectralClustering': spectral,
        'Ward': ward,
        'AgglomerativeClustering': average_linkage,
        'DBSCAN': dbscan,
        'Birch': birch,
        'HDBSCAN': hdbsc,
        'KMeans': kmeans
    }
def affinity(fig):
    global X_iris, geo
    ax = fig.add_subplot(geo + 3, projection='3d', title='affinity')
    affinity = cluster.AffinityPropagation(preference=-50)
    affinity.fit(X_iris)
    res = affinity.labels_
    for n, i in enumerate(X_iris):
        ax.scatter(*i[:3], c='bgrcmyk'[res[n] % 7], marker='o')
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
def cluster_recogniser(self, corpus):
    corpus_res = {}
    # Character n-gram counts (2- to 4-grams) as features for each string
    ngram_vectorizer = skfe.text.CountVectorizer(analyzer='char',
                                                 ngram_range=(2, 4))
    counts = ngram_vectorizer.fit_transform(corpus)
    machine = sc.AffinityPropagation()
    list_num = list(machine.fit_predict(counts))
    # Group the corpus strings by their cluster label
    groups = [[] for i in range(max(list_num) + 1)]
    for i in range(len(corpus)):
        groups[list_num[i]].append(corpus[i])
    # Key each group by its first member
    for i in groups:
        corpus_res[i[0]] = i
    return corpus_res
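# A minimal usage sketch for cluster_recogniser, assuming `recogniser` is an
# instance of the (unnamed) class this method belongs to, and that the module
# aliases sklearn.feature_extraction as skfe and sklearn.cluster as sc.
corpus = ['error_timeout', 'error_timout', 'login_failed', 'login_failure']
groups = recogniser.cluster_recogniser(corpus)
# e.g. {'error_timeout': ['error_timeout', 'error_timout'],
#       'login_failed': ['login_failed', 'login_failure']}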
def AffinityProp(D, pref, damp):
    """
    Perform scikit-learn affinity propagation (clustering) with the specified
    data and parameters, returning labels.

    :param D: precomputed affinity (similarity) matrix
    :param pref: preference parameter for the affinity propagation
    :param damp: damping parameter for the affinity propagation
    :return: labels
    """
    aff = cluster.AffinityPropagation(affinity='precomputed',
                                      preference=pref, damping=damp,
                                      verbose=True)
    labels = aff.fit_predict(D)
    return labels
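# A minimal usage sketch for AffinityProp on a made-up precomputed similarity
# matrix (negated squared distances, since affinity propagation expects
# similarities rather than distances).
import numpy as np
from sklearn import cluster
from sklearn.metrics import euclidean_distances
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=40, centers=2, random_state=0)
D = -euclidean_distances(X, squared=True)
labels = AffinityProp(D, pref=np.median(D), damp=0.9)
print(np.unique(labels))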
def affinity_propagation(threshold, matrix, taxa, revert=False):
    """
    Compute affinity propagation from the matrix.
    """
    if not taxa:
        taxa = list(range(1, len(matrix) + 1))
    # turn distances into similarities
    matrix = np.array(matrix)
    for i, line in enumerate(matrix):
        matrix[i][i] = 10
        for j in range(i + 1, len(matrix)):
            score = matrix[i][j]
            if score < threshold:
                matrix[i][j] = -np.log2(1 - score**2)
                matrix[j][i] = matrix[i][j]
            else:
                matrix[i][j] = -score**5
                matrix[j][i] = -score**5
    ap = cluster.AffinityPropagation(affinity='precomputed')
    labels = ap.fit_predict(matrix)
    # change to our internal cluster style: give each unclustered item (-1)
    # its own cluster index
    idx = max(labels) + 1
    if idx == 0:
        idx += 1
    for i, c in enumerate(labels):
        if c == -1:
            labels[i] = idx
            idx += 1
    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))
    # collect the taxa of each cluster label
    clr = {}
    for i, t in enumerate(taxa):
        try:
            clr[labels[i]] += [t]
        except KeyError:
            clr[labels[i]] = [t]  # was clr[clusters[i]], a NameError
    return clr
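# A minimal usage sketch for affinity_propagation on a made-up symmetric
# distance matrix with values in [0, 1]; the taxa names are illustrative.
import numpy as np
from sklearn import cluster

dist = [[0.0, 0.1, 0.9, 0.8],
        [0.1, 0.0, 0.8, 0.9],
        [0.9, 0.8, 0.0, 0.2],
        [0.8, 0.9, 0.2, 0.0]]
taxa = ['A', 'B', 'C', 'D']
print(affinity_propagation(0.5, dist, taxa))
# e.g. {0: ['A', 'B'], 1: ['C', 'D']}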