def clustering(self):
    # Calculate similarity matrix
    X = self.create_tfidf_vector()
    X = X.toarray()
    pca = PCA(n_components=300, copy=False)
    X = pca.fit(X).transform(X)
    S = cosine_similarity(X, X)
    # Run affinity propagation
    af = AffinityPropagation()
    af.fit(S)
    # Formulate result
    tmp_clusters = defaultdict(list)
    goal_clusters = defaultdict(list)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    for count, label in enumerate(labels):
        tmp_clusters[self.goal_list[cluster_centers_indices[label]]].append(
            self.goal_list[count])
    # 2nd-layer clustering of each cluster
    for goal, item_list in tmp_clusters.items():
        subclusters = self.subcluster_by_editdistance(goal, item_list)
        for subgoal, items in subclusters.items():
            goal_clusters[subgoal] = items
    return goal_clusters
def main(): ''' >>> main() # stuff happens ''' args = parse_args() setup_logging(args.log, verbose=args.verbose) chunks = sequence_chunk_generator(args.fasta_file, chunk_size=args.chunk_size) hasher = HashingVectorizer(analyzer='char', n_features = 2 ** 18, ngram_range=(args.ngram_min, args.ngram_max), ) estimator = AffinityPropagation() for chunk in chunks: logging.info('hashing chunk') chunk_vector = hasher.transform([ str(i.seq) for i in chunk ]) logging.info('clustering') estimator.fit(chunk_vector) logging.info('got %s clusters' % len(set(estimator.labels_)))
def clusterAffinityPropagation(self): """ Cluster the embeddings with affinity propagation :return: """ affin = AffinityPropagation() affin.fit(self.emb1.m) aflabels1 = affin.labels_ afclusters1 = dict() word2cluster1 = dict() for i,l in enumerate(aflabels1): points = afclusters1.setdefault(l,list()) points.append(self.emb1.rd[i]) for l,c in afclusters1.items(): for w in c: word2cluster1[w] = l self.cluster1 = afclusters1 self.word2cluster1 = word2cluster1 affin.fit(self.emb2.m) aflabels2 = affin.labels_ afclusters2 = dict() word2cluster2 = dict() for i,l in enumerate(aflabels2): points = afclusters2.setdefault(l,list()) points.append(self.emb2.rd[i]) for l,c in afclusters2.items(): for w in c: word2cluster2[w] = l self.cluster2 = afclusters2 self.word2cluster2 = word2cluster2
def affinity_propagation(crime_rows, column_names):
    """
    damping : float, optional, default: 0.5
        Damping factor between 0.5 and 1.
    convergence_iter : int, optional, default: 15
        Number of iterations with no change in the number of estimated
        clusters that stops the convergence.
    max_iter : int, optional, default: 200
        Maximum number of iterations.
    preference : array-like, shape (n_samples,) or float, optional
        Preferences for each point - points with larger values of preferences
        are more likely to be chosen as exemplars. The number of exemplars,
        i.e. of clusters, is influenced by the input preferences value. If
        the preferences are not passed as arguments, they will be set to the
        median of the input similarities.
    affinity : string, optional, default=``euclidean``
        Which affinity to use. At the moment precomputed and euclidean are
        supported. euclidean uses the negative squared euclidean distance
        between points.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    print("Running Affinity Propagation")
    # TODO: Parameterize this
    affinity_prop = AffinityPropagation()
    #affinity_propagation_labels = affinity_prop.fit_predict(crime_xy)
    affinity_prop.fit(random_sampling(crime_xy, num_samples=5000))
    affinity_propagation_labels = affinity_prop.predict(crime_xy)
    print("formatting....")
    return _format_clustering(affinity_propagation_labels, crime_xy,
                              crime_info, column_names)
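# A minimal, self-contained sketch (not part of the snippet above) showing how the
# parameters documented in that docstring -- damping, max_iter, convergence_iter,
# preference, and affinity -- are passed to sklearn's AffinityPropagation. The 2-D
# points here are made up for illustration.
import numpy as np
from sklearn.cluster import AffinityPropagation

points = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9], [9.8, 0.1]])
ap = AffinityPropagation(damping=0.5,         # between 0.5 and 1
                         max_iter=200,
                         convergence_iter=15,
                         preference=-10,      # lower preference -> fewer exemplars
                         affinity='euclidean')
labels = ap.fit_predict(points)
print(labels, ap.cluster_centers_indices_)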
def execute(args):
    ##############################################################################
    if len(args) < 1:
        usage()
        sys.exit()
    names, labels_true, X = parse(args[0])
    indices = [int(i) for i in args[1:]]
    relevant_names = names[1:]
    if len(indices) > 0:
        X = np.asarray([[sample[i] for i in indices] for sample in X])
        relevant_names = [relevant_names[i] for i in indices]
    print "Clustering on", str(relevant_names) + "..."
    ##############################################################################
    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50)
    # cluster_centers_indices = af.cluster_centers_indices_
    # labels = af.labels_
    #
    # n_clusters_ = len(cluster_centers_indices)
    y_pred = af.fit_predict(X)
    if y_pred is None or len(y_pred) == 0 or type(y_pred[0]) is np.ndarray:
        return 0
    counts = get_cluster_counts(labels_true, y_pred)
    print counts
def clusterSimilarityWithSklearnAPC(data_file, damping=0.9, max_iter=200, convergence_iter=15, preference='min'):
    """
    Compare Sparse Affinity Propagation (SAP) result with SKlearn Affinity
    Propagation (AP) Clustering result.
    Please note that the convergence condition for Sklearn AP is "no change in
    the number of estimated clusters", while for SAP the condition is "no change
    in the cluster assignment". So SAP may take more iterations and there will be
    slight differences in the final cluster assignment (exemplars for each sample).
    """
    # loading data
    simi_mat = loadMatrix(data_file)
    simi_mat_dense = simi_mat.todense()
    # get preference
    if preference == 'min':
        preference = np.min(simi_mat_dense)
    elif preference == 'median':
        preference = np.median(simi_mat_dense)
    print('{0}, start SKlearn Affinity Propagation'.format(datetime.now()))
    af = AffinityPropagation(damping=damping, preference=preference,
                             affinity='precomputed', verbose=True)
    af.fit(simi_mat_dense)
    cluster_centers_indices, labels = af.cluster_centers_indices_, af.labels_
    sk_exemplars = np.asarray([cluster_centers_indices[i] for i in labels])
    print('{0}, start Fast Sparse Affinity Propagation Cluster'.format(datetime.now()))
    sap = SAP(preference=preference, convergence_iter=convergence_iter,
              max_iter=max_iter, damping=damping, verboseIter=100)
    sap_exemplars = sap.fit_predict(simi_mat_dense)
    # Calculate similarity between sk_exemplars and sap_exemplars
    exemplars_similarity = sparseAP_cy.arrSamePercent(np.array(sk_exemplars), np.array(sap_exemplars))
    return exemplars_similarity
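# A small self-contained sketch of the precomputed-affinity pattern used above:
# build a similarity matrix yourself (here negative squared euclidean distance),
# pick the preference from its minimum or median, and pass affinity='precomputed'.
# The toy data below is an illustrative assumption, not part of the snippet above.
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.RandomState(0).rand(30, 4)
S = -euclidean_distances(X, squared=True)        # similarity = -squared distance
af = AffinityPropagation(affinity='precomputed',
                         damping=0.9,
                         preference=np.median(S))  # np.min(S) would give fewer clusters
labels = af.fit(S).labels_
exemplars = af.cluster_centers_indices_[labels]    # exemplar index for each sample
print(labels, exemplars)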
def cluster(mat, doc_indices):
    X = mat[:, doc_indices].T
    # Other clustering algorithms can easily be swapped in:
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
    clust = AffinityPropagation()
    clust.fit(X)
    return zip(doc_indices, clust.labels_)
def make_cluster_map(damping=0.992): test_labels, prediction = pickle.load(open(f_path_pred, 'rb')) prob_conf = np.zeros((121, 121)) for l in range(121): inds = np.squeeze(np.array(np.where(test_labels == l))) class_conf = prediction[inds, :].mean(axis=0) prob_conf[l, :] = class_conf F = prob_conf D = (1-F) np.fill_diagonal(D, 0) D_p = 0.5*(D+D.T) clst = AP(damping=damping, # damping determines # of clusters max_iter=500, convergence_iter=15, affinity='euclidean', verbose=False) clst.fit(D_p) print 'Number of cluster:', len(clst.cluster_centers_) membership = np.c_[range(121), clst.labels_] fine_to_coarse = dict(membership) coarse_to_fine = {l: [] for l in clst.labels_} for k, v in fine_to_coarse.items(): coarse_to_fine[v].append(k) pickle.dump(coarse_to_fine, open(os.path.join(curdir, 'coarse_to_fine.p'), 'wb')) pickle.dump(fine_to_coarse, open(os.path.join(curdir, 'fine_to_coarse.p'), 'wb'))
def create_stratum(self, column_names, **kwargs):
    '''
    Use affinity propagation to find number of strata for each column.
    column_names is a list of the covariates to be split into strata and
    used for classification. This function adds a column to the data frame
    for each column as column_name_strata that gives the strata designation
    for that variable. The whole data frame is returned.
    '''
    for colname in column_names:
        X = self.data[colname].reshape(-1, 1)
        if np.isnan(X).any():
            raise ValueError("There are NaN values in self.data[%s] that the "
                             "clustering algorithm can't handle" % colname)
        elif np.unique(self.data[colname]).shape[0] <= 2:
            string_name = colname + '_strata'
            self.data[string_name] = self.data[colname].astype(int)
        else:
            af_model = AP(damping=0.9)
            strata_groups = af_model.fit(X)
            #cluster_centers_indices = af.cluster_centers_indices_
            #n_clusters_ = len(cluster_centers_indices)
            string_name = colname + '_strata'
            self.data[string_name] = strata_groups.labels_
    return self.data
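# A tiny sketch of the idea in the docstring above: fit affinity propagation on a
# single covariate reshaped to a column, and use the resulting labels as strata.
# The column values here are a made-up example.
import numpy as np
from sklearn.cluster import AffinityPropagation

ages = np.array([18, 19, 21, 35, 36, 37, 62, 64, 65], dtype=float)
strata = AffinityPropagation(damping=0.9).fit(ages.reshape(-1, 1)).labels_
print(strata)   # e.g. one stratum label per age group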
def cluster(scope): # Setup data df = pd.read_sql('playtype_data', db_engine) # Manipulate data into scope if scope == 'Team': df = df.drop('Player', 1).groupby('Team', as_index=False).mean() elif scope == 'Player': df = df.drop('Team', 1) else: raise Exception('This is never supposed to happen') # Normalize the data df[FEATURES] = (df[FEATURES] - df[FEATURES].mean()) / (df[FEATURES].max() - df[FEATURES].min()) # Run clustering clstr = AffinityPropagation() clstr.fit(df[FEATURES]) # Clump results df['cluster'] = clstr.labels_ df = df.sort('cluster') # Convert results to JSON for frontend return clusters_to_json(df, scope)
def affinity_propagation_cluster_analysis(x, y, preference):
    # NOT WORKING BECAUSE I DONT REALLY UNDERSTAND WHAT IT DOES...
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_affinity_propagation.html#example-cluster-plot-affinity-propagation-py
    X = np.hstack((x.reshape((x.shape[0], 1)), y.reshape((y.shape[0], 1))))
    af = AffinityPropagation(preference=preference)
    af = af.fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk'  #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in xrange(len(np.unique(labels))):
        my_members = labels == i
        cluster_center = X[cluster_centers_indices[i]]
        plt.scatter(X[my_members, 0], X[my_members, 1], s=90, c=colors[i], alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1], marker='+', s=280, c=colors[i])
        # draw a dashed line from each cluster member to its exemplar
        for j in X[my_members]:
            plt.plot([cluster_center[0], j[0]], [cluster_center[1], j[1]], c=colors[i], linestyle='--')
    tolx = (X[:, 0].max() - X[:, 0].min()) * 0.03
    toly = (X[:, 1].max() - X[:, 1].min()) * 0.03
    plt.xlim(X[:, 0].min() - tolx, X[:, 0].max() + tolx)
    plt.ylim(X[:, 1].min() - toly, X[:, 1].max() + toly)
    plt.show()
    return labels
def cluster(self, feat_mtx, df_lm_allusers):
    # clustering artists based on AffinityPropagation
    start = time.time()
    af = AffinityPropagation()
    af.fit(feat_mtx)
    self.labels = af.labels_
    self.af = af
    # adding cluster labels to least misery dataframe and sorting by rank and cluster
    #df_least_misery_clustered = self.df_least_misery.copy() --> changing to df_lm_allusers
    print 'number of labels: ', len(self.labels)
    print 'labels', self.labels
    # print 'least misery clustered length', len(df_least_misery_clustered)
    df_least_misery_clustered = df_lm_allusers.copy()
    print 'len df least misery: ', len(df_least_misery_clustered)
    df_least_misery_clustered['cluster'] = self.labels
    df_least_misery_clustered[['cluster', self.score_col]] = \
        df_least_misery_clustered[['cluster', self.score_col]].astype(float)
    ''' will do different sorting if not using rank '''
    # now set to false as looking for highest score
    df_least_misery_clustered = df_least_misery_clustered.sort(['cluster', self.score_col], ascending=False)
    self.df_least_misery_clustered = df_least_misery_clustered
    end = time.time()
    print 'clustering completed in: ', end - start
    return df_least_misery_clustered
def cluster_trajectories( curves ): """Given a list of curves, cluster_trajectories will cluster them.""" n_curves = len(curves) X_2B_clstrd = np.zeros( (n_curves, 4) ) X_2B_clstrd[:,0] = np.array( [ curves[k][0, 0] for k in range(n_curves) ] ) X_2B_clstrd[:,1] = np.array( [ curves[k][1, 0] for k in range(n_curves) ] ) X_2B_clstrd[:,2] = np.array( [ curves[k][0,-1] for k in range(n_curves) ] ) X_2B_clstrd[:,3] = np.array( [ curves[k][1,-1] for k in range(n_curves) ] ) for col in range( 4 ): X_2B_clstrd[:,col] /= X_2B_clstrd[:,col].std() def distance_metric(a,b): #A distance metric on R^4 modulo the involution #(x0,x2,x3,x4) -> (x3,x4,x1,x2) d = lambda a,b : np.sqrt( np.sum( (a-b)**2 ) ) T = lambda x: np.array([x[2],x[3],x[0],x[1]]) return min( d(a,b) , d(T(a),b) ) from sklearn.cluster import AffinityPropagation clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100) aff = np.zeros((n_curves, n_curves)) for i in range(n_curves): for j in range(i+1,n_curves): aff[i,j] = np.exp(-distance_metric( X_2B_clstrd[i], X_2B_clstrd[j])**2) aff[j,i] = aff[i,j] #clusterer.Affinity = aff cluster_labels = clusterer.fit_predict(aff) out = [] for label in set( cluster_labels): cluster = map( lambda k: curves[k] , filter( lambda k: cluster_labels[k] == label , range( n_curves) ) ) out.append( cluster ) return map( align_cluster, out)
def loadKmeansData(dataArrayTest,dataArrayTrain,k,m='load'): if m=='load': centroidRead=open('centroid','r') labelClusterRead=open('labelCluster','r') labelPreRead=open('labelPre','r') centroid=pickle.load(centroidRead) labelCluster=pickle.load(labelClusterRead) labelPre=pickle.load(labelPreRead) else: dataArrayTestNorm = preprocessing.normalize(dataArrayTest) dataArrayTrainNorm = preprocessing.normalize(dataArrayTrain) #clf=MiniBatchKMeans(init='k-means++', n_clusters=k, n_init=10) clf=AffinityPropagation() #clf=DBSCAN(min_samples=30) pre=clf.fit(dataArrayTrainNorm) centroid=pre.cluster_centers_ centroidWrite=open('centroid','w') #pickle.dump(centroid,centroidWrite) labelCluster=pre.labels_ labelClusterWrite=open('labelCluster','w') #pickle.dump(labelCluster,labelClusterWrite) labelPre=clf.predict(dataArrayTestNorm) labelPreWrite=open('labelPre','w') #pickle.dump(labelPre,labelPreWrite) return centroid,labelCluster,labelPre
def clusterise_data(data_obj): """ Assigns a cluster label to each days present in the data received using three different algorithms: MeanShift, Affinity Propagation, or KMeans. @param data_obj: List of dictionaries """ L = len(data_obj) #Simply converts data_obj to a 2D list for computation List2D = [[None for _ in range(4)] for _ in range(L-1)] for i in range(L-1): #don't include current day #wake_up and sleep_duration are the most important factors List2D[i][0] = 5 * data_obj[i]["wake_up"] List2D[i][1] = 1 * data_obj[i]["sleep"] List2D[i][2] = 5 * data_obj[i]["sleep_duration"] List2D[i][3] = 0.5 * data_obj[i]["activity"] points = NumpyArray(List2D) #converts 2D list to numpyarray if ALGO == "Affinity Propagation": labels = AffinityPropagation().fit_predict(points) elif ALGO == "KMeans": labels= KMeans(init='k-means++', n_clusters=5, n_init=10) .fit_predict(points) elif ALGO == "MeanShift": bandwidth = estimate_bandwidth(points, quantile=0.2, n_samples=20) labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(points) else: raise Exception("Algorithm not defined: "+str(ALGO)) for i in range(L-1): data_obj[i]["cluster"] = labels[i] for unique_label in remove_duplicates(labels): debug_print(ALGO+": Cluster "+str(unique_label)+" contains "+str(labels.tolist().count(unique_label))+" data points") debug_print(ALGO+": Silhouette coefficient"+ str(metrics.silhouette_score(points, labels, metric='euclidean')*100)+"%")
def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):
    BEGIN_TIME_FRAME = begin_time * self.griddy.TIME_GRID_SPACING
    END_TIME_FRAME = end_time * self.griddy.TIME_GRID_SPACING
    data = data_for_trial_type[:, BEGIN_TIME_FRAME:END_TIME_FRAME, self.griddy.VEL_X]
    labels = None
    if clusterType == 'kmeans':
        kmeans = KMeans(n_clusters=N_CLUSTERS)
        kmeans.fit(data)
        labels = kmeans.labels_
    elif clusterType == 'affinity_propagation':
        ap = AffinityPropagation(damping=0.75)
        ap.fit(data)
        labels = ap.labels_
        N_CLUSTERS = np.max(labels) + 1
    elif clusterType == 'DBSCAN':
        dbscan = DBSCAN()
        dbscan.fit(data)
        labels = dbscan.labels_
        N_CLUSTERS = np.max(labels) + 1
        print 'N_CLUSTERS=' + str(N_CLUSTERS)
    elif clusterType == 'AgglomerativeClustering':
        ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
        ac.fit(data)
        labels = ac.labels_
    else:
        print 'ERROR: clusterType: ' + clusterType + ' is not recognized'
    return (labels, N_CLUSTERS)
def affinity_propagation(feature_matrix):
    sim = feature_matrix * feature_matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters
def clustering_affinity_propagation(data_res):
    """ Executes sklearn's affinity propagation function with the given data frame """
    af = AffinityPropagation()
    af.fit(data_res)
    predictions = af.predict(data_res)
    cluster_centers = af.cluster_centers_
    return predictions, cluster_centers, af
def cluster_prop(self, filtered_data): prop_dict={} for review in filtered_data: for dicti in review['line']: if not prop_dict.has_key(dicti["prop"][0]): prop_dict[dicti["prop"][0]]={"freq":0,"data":[],"idx":[]} prop_dict[dicti["prop"][0]]['idx'].append(review['index']) prop_dict[dicti["prop"][0]]["freq"] += 1 prop_dict[dicti["prop"][0]]["data"].append(dicti) d_list=[] word_list=[] for word in prop_dict: try: d_list.append(self.wmodel[word]) word_list.append(word) except: pass Aprop = AffinityPropagation(damping=0.6, convergence_iter=100, max_iter=10000) Aprop.fit(d_list) cluster_dict = {} for idx, each in enumerate(Aprop.labels_): vec = d_list[idx] if not cluster_dict.has_key(each): cluster_dict[each] = {"word":[],"freq":0,"seed":"","sim":0.0} cluster_dict[each]["word"].append(word_list[idx]) total_freq=0 for each in cluster_dict.keys(): target_group_id = each group_id = each last_group_id = target_group_id cluster_freq=0 max_seed="" max_freq=0 for idx,data in enumerate(cluster_dict[each]["word"]): cluster_freq+=prop_dict[data]["freq"] if prop_dict[data]["freq"] > max_freq: max_freq=prop_dict[data]["freq"] max_seed=data cluster_dict[each]["freq"]=cluster_freq cluster_dict[each]["seed"]=max_seed return (cluster_dict, prop_dict, Aprop)
def cluster_concepts(context="location"): """ Cluster related concepts of a specific type to different categories """ db = Database() concept_category = ConceptCategory() cmd = "SELECT * FROM %s" % (context) context_res = db.query_db(cmd) concept_list = [] concept_matrix = [] for item in context_res: concept_list = [] concept_matrix = [] if context == "action": context_id, context_chinese, context_name = item[:3] elif context == "location": context_id, context_name, context_chinese = item cmd = ( "SELECT b.name, b.id FROM %s_concept AS a, concept AS b \ WHERE a.%s_id = %s AND a.concept_id = b.id" % (context, context, context_id) ) concept_res = db.query_db(cmd) if len(concept_res) == 0: continue for item in concept_res: concept, concept_id = item concept_vector = concept_category.concept_axes.row_named(concept) concept_list.append((concept_id, concept)) concept_matrix.append(concept_vector) # Run affinity propogation S = cosine_similarity(concept_matrix, concept_matrix) af = AffinityPropagation() af.fit(S) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ count = 0 clusters = defaultdict(list) for label in labels: clusters[concept_list[cluster_centers_indices[label]][1]].append(concept_list[count]) count += 1 category_num = 0 for key, value in clusters.items(): category_num += 1 for concept in value: cmd = ( "UPDATE %s_concept SET category = %d WHERE \ %s_id = %s AND concept_id = %s" % (context, category_num, context, context_id, concept[0]) ) db.query_db(cmd) print concept[1].encode("utf-8") + " ", print "" print "----------" + context_chinese.encode("utf-8") + "----------"
def affinityprop(lngs, lats, city, cluster_diameter): city_area = city["area"] city_lng = city["lng"] city_lat = city["lat"] lngs = np.array(lngs)#*(math.cos(city["lat"])**2) affinity = AffinityPropagation(damping=0.75, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False) affinity.fit(np.array([lngs, lats]).transpose()) cluster_labels = np.array(affinity.labels_) return labels_to_index(cluster_labels)
def cluster_articles(): ms = MongoStore() articles = [a for a in ms.get_pending_articles()] if len(articles) > 0: tfidf = TfidfVectorizer(tokenizer=preprocess) good_articles = [article for article in articles if article["text_content"].strip() != ""] texts = [article["text_content"] for article in good_articles] X_tfidf = tfidf.fit_transform(texts) print X_tfidf ap = AffinityPropagation(damping=0.95, max_iter=4000, convergence_iter=400, copy=True, preference=-4, affinity='euclidean', verbose=True) C = ap.fit_predict(X_tfidf) print X_tfidf.shape, C.shape print C centers = ap.cluster_centers_indices_ clusters = [] for c, center in enumerate(centers): members = np.where(C == c)[0] K = cosine_similarity(X_tfidf[members], X_tfidf[center]) member_sims = [(m, float(k)) for m, k in zip(members, K)] member_sims.sort(key=lambda x: x[1], reverse=True) cluster = {"articles": [], "date": datetime.now(), "summarized": False} if len([member for member, sim in member_sims if sim > .55]) >= 3: print texts[center][:75].replace("\n", " ") for member, sim in member_sims: print "\t{:3.3f} ".format(sim), print good_articles[member]["title"][:60].replace("\n", " ") cluster["articles"].append((good_articles[member]["_id"], sim)) else: continue clusters.append(cluster) if len(clusters) > 0: ms.insert_clusters(clusters) ms.set_clustered_flag(articles)
def affinity_propagation():
    """
    AffinityPropagation creates clusters by sending messages between pairs of
    samples until convergence. The messages sent between pairs represent the
    suitability for one sample to be the exemplar of the other, which is updated
    in response to the values from other pairs. These updates occur iteratively
    until convergence, at which point the final exemplars are chosen, and hence
    the final clustering is given.

    Algorithm:
    The messages sent between pairs belong to one of two categories. The first is
    the responsibility, r(i,k), which is the accumulated evidence that sample k
    should be the exemplar for sample i. The second is the availability, a(i,k),
    which is the accumulated evidence that sample i should choose sample k to be
    its exemplar, and considers the values for all other samples that k should
    be an exemplar. In this case exemplars are chosen by samples if they are:
      - similar enough to many samples, and
      - chosen by many samples to be representative of themselves.
    """
    # Generate a generic data sample.
    n_samples = 300
    std = 0.3
    seed = 0
    centers = [[-1., 0.], [0., 1.5], [1., 0.]]
    data, target = make_blobs(n_samples=n_samples, centers=centers,
                              cluster_std=std, random_state=seed)

    # Set the preference for each point: samples with large preference values
    # are more likely to be chosen as exemplars. The number of exemplars, i.e.,
    # clusters, is influenced by the input preference values. If preferences are
    # not passed as arguments, they will be set to the median of the input
    # similarities.
    # pref = [np.random.randint(low=-50, high=0) for x in range(n_samples)]
    pref = -50

    # Compute affinity propagation.
    clf = AffinityPropagation(preference=pref)
    aff_y = clf.fit_predict(data)

    # Find mismatches between predicted and true values.
    cnt = int(0)
    for idx in range(n_samples):
        if target[idx] != aff_y[idx]:
            cnt += 1

    # Print results.
    print('Approximated number of clusters ', len(clf.cluster_centers_indices_))
    print('Accuracy ', float(n_samples - cnt) / float(n_samples))
    print('Homogeneity ', metrics.homogeneity_score(target, clf.labels_))
    print('Completeness ', metrics.completeness_score(target, clf.labels_))

    # Plot resulting clusters.
    plt.figure(figsize=(8, 8))
    plt.scatter(data[:, 0], data[:, 1], c=aff_y, s=50)
    plt.title('Affinity clustering')
    plt.show()
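# A rough numpy sketch (not sklearn's implementation) of one damped round of the
# message passing described in the docstring above, to make the responsibility
# r(i,k) and availability a(i,k) updates concrete. S is a precomputed similarity
# matrix with the preference on its diagonal; the toy inputs are assumptions.
import numpy as np

def ap_message_round(S, R, A, damping=0.5):
    n = S.shape[0]
    idx = np.arange(n)
    # Responsibility: r(i,k) = s(i,k) - max_{k' != k} [a(i,k') + s(i,k')]
    AS = A + S
    first_arg = AS.argmax(axis=1)
    first_max = AS[idx, first_arg]
    AS[idx, first_arg] = -np.inf          # mask each row's maximum
    second_max = AS.max(axis=1)           # used where k is the row argmax
    R_new = S - first_max[:, None]
    R_new[idx, first_arg] = S[idx, first_arg] - second_max
    R = damping * R + (1 - damping) * R_new
    # Availability: a(i,k) = min(0, r(k,k) + sum_{i' not in {i,k}} max(0, r(i',k)))
    # and self-availability a(k,k) = sum_{i' != k} max(0, r(i',k))
    Rp = np.maximum(R, 0)
    Rp[idx, idx] = R[idx, idx]
    A_new = Rp.sum(axis=0)[None, :] - Rp
    diag = A_new[idx, idx].copy()
    A_new = np.minimum(A_new, 0)
    A_new[idx, idx] = diag
    A = damping * A + (1 - damping) * A_new
    return R, A

# Tiny usage example: iterate a few rounds and read off exemplars from R + A.
S = -np.square(np.arange(5)[:, None] - np.arange(5)[None, :]).astype(float)
np.fill_diagonal(S, np.median(S))         # preference placed on the diagonal
R = np.zeros_like(S)
A = np.zeros_like(S)
for _ in range(50):
    R, A = ap_message_round(S, R, A)
exemplars = (R + A).argmax(axis=1)        # each point's chosen exemplar
print(exemplars)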
def get_label_res2(similar_matrix, n_subs):
    cluster = AffinityPropagation(damping=0.75, affinity='precomputed')
    # preference = -1000)  # n_clusters = n_subs, affinity = 'precomputed')
    res = cluster.fit(similar_matrix)
    size_labels = len(set(res.labels_))
    assert size_labels < 10, size_labels
    assert size_labels > 1, size_labels
    print res.labels_
    return res.labels_
def do_issue(data, data_name): reduced_points, labels, km = reduce_npoints_kmeans(dataframe = data, dataset_name = dataset, data_name=data_name, n_datapoints = 1000, load_from_file = False) transformed_data, pca, components = calculate_pca(reduced_points, n_components=3) colormap = brewer2mpl.get_map('RdBu', 'diverging', 4, reverse=True) filename = figure_save_path + dataset + '_issue_29_1_%s_reduced_number_of_points.png'%data_name print "Making scatter plot of %s data for dataset %s, where the number of points have been reduced by K-Means clustering"%(data_name, dataset) make_color_grouped_scatter_plot(data_frame=transformed_data, x_name='d1', y_name='d2', color_by='d3', filename=filename, colormap=colormap) ap = AffinityPropagation(damping=affinity_damping) ap.fit(reduced_points) print "Making scatter plot of Affinity Propagation clusters of %s data for dataset %s"%(data_name, dataset) filename = figure_save_path + dataset + '_issue_29_2_%s_affinity.png'%data_name make_scatter_plot_for_labelled_data(data_frame=transformed_data, x_name='d1', y_name='d2', labels=ap.labels_, filename=filename, colormap = colormap, legend=True)
def cluster(self, feat_mtx):
    # clustering artists based on AffinityPropagation
    af = AffinityPropagation()
    af.fit(feat_mtx)
    self.labels = af.labels_
    self.af = af
    # adding cluster labels to least misery dataframe and sorting by rank and cluster
    df_least_misery_clustered = self.df_least_misery.copy()
    df_least_misery_clustered['cluster'] = self.labels
    df_least_misery_clustered[['cluster', self.score_col]] = \
        df_least_misery_clustered[['cluster', self.score_col]].astype(float)
    ''' will do different sorting if not using rank '''
    df_least_misery_clustered = df_least_misery_clustered.sort(['cluster', self.score_col])
    return df_least_misery_clustered
def affinity(DF): ''' calculate and plot affinity propagation ts clustering algoritm, return partition ''' X = normaliseTimeseries(DF) A = AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15) af = A.fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) M = metrics.silhouette_score(X, labels, metric='sqeuclidean') print('Estimated number of clusters: %d' % n_clusters_) print("Silhouette Coefficient: %0.3f" % M) fig, axes = plt.subplots(nrows=n_clusters_, figsize=(24, 18), sharex='all') colors = nColors(k=n_clusters_, cmap='spectral') ticks = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] dates = [datetime.datetime(year=2015, month=1, day=i, hour=0, minute=0) for i in range(1, 8)] for k, col in zip(range(n_clusters_), colors): X.iloc[cluster_centers_indices[k], :].plot( lw=1, c=col, label=k, alpha=.5, ax=axes[k]) X[labels == k].T.plot(lw=.5, c=col, alpha=0.2, ax=axes[k], legend=0) axes[k].set_title('cluster %d, %d zipcodes' % (k, len(X[labels == k])), fontsize=16) axes[k].set_xticklabels([], minor=False) # the default axes[k].set_xticklabels(ticks, minor=True) axes[k].set_yticklabels([], minor=False) for d in dates: axes[k].axvline(x=d, ymin=0, ymax=1, alpha=.5, linewidth=2) plt.tight_layout() result = DF.T result['label'] = labels result.reset_index(inplace=1) result.rename(columns={'index': 'postalCode'}, inplace=1) return result[['postalCode', 'label']]
def evaluate_clustering():
    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')

    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('dbscan')

    distance_matrix = np.ndarray((matrix_size, matrix_size))
    for i in range(matrix_size):
        for j in range(matrix_size):
            distance_matrix[i, j] = 1 - similarity_matrix[i, j]
    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])
    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)
    print(cluster_map1)
    print(cluster_map2)

    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    print('Number of clusters Affinity Propagation: %f' % len(cluster_map1))
    print('Number of clusters DBSCAN: %f' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silhouette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silhouette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
def affinity_propagation(self, affinity_matrix=None, sigma=1, **kwargs):
    """
    :param kwargs: damping=0.5, max_iter=200, convergence_iter=15,
                   copy=True, preference=None, verbose=False
    :return:
    """
    if affinity_matrix is None:
        aff = rbf(self.dm.values, sigma)
    else:
        aff = affinity_matrix
    est = AffinityPropagation(affinity='precomputed', **kwargs)
    est.fit(aff.view(np.ndarray))
    return Partition(est.labels_)
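# A self-contained sketch of the pattern above: turn a pairwise distance matrix
# into an RBF affinity and hand it to AffinityPropagation(affinity='precomputed').
# The rbf() definition and the random points here are illustrative assumptions;
# the snippet above relies on its own helper of the same name.
import numpy as np
from sklearn.cluster import AffinityPropagation

def rbf(distance_matrix, sigma=1.0):
    # Gaussian kernel applied to precomputed distances
    return np.exp(-distance_matrix ** 2 / (2.0 * sigma ** 2))

rng = np.random.RandomState(0)
points = rng.rand(20, 3)
dm = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
labels = AffinityPropagation(affinity='precomputed').fit(rbf(dm)).labels_
print(labels)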
def affinity_cluster(all_features):
    # cluster all features
    X = np.array(all_features)
    af = AffinityPropagation(verbose=True, preference=-50).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    print ("Estimated number of clusters: %d" % n_clusters_)
    # return the fitted model along with the cluster count
    X_fit = af.fit(X)
    return X_fit, n_clusters_
def test_affinity_propagation_random_state_warning():
    # test that a warning is raised when random_state is not defined.
    X = np.array([[0, 0], [1, 1], [-2, -2]])
    match = "'random_state' has been introduced in 0.23."
    with pytest.warns(FutureWarning, match=match):
        AffinityPropagation().fit(X)
def affinityPropagation(Data, DataLabels):
    print("=======AffinityPropagation========")
    clustering = AffinityPropagation().fit(Data)
    printResult(clustering.labels_, DataLabels)
def handleEndGame(self, event, replay): try: pdict = {} for player in replay.players: player.bases = {} player.base_cluster = {} pdict[player.team_id] = player old_frames = {p.pid: 0 for p in replay.players} for frame in self.keyframes: for player in replay.players: player.bases[frame] = {} player.base_cluster[frame] = {} if frame > 0: for f in range(old_frames[player.pid] + 1, frame): player.bases[f] = player.bases[old_frames[ player.pid]] player.base_cluster[f] = player.base_cluster[ old_frames[player.pid]] old_frames[player.pid] = frame for f, ls in self.lookup.items(): if f <= frame: locs, teamids, finishes, prefs = zip( *list(ls.values())) unit_ids = list(ls.keys()) else: break locs = np.array(locs) prefs = np.array(prefs) finishes = np.array(finishes) self.logger.debug( f"(frame {frame}): locs = {locs.tolist()} prefs = {prefs.tolist()}" ) af = AffinityPropagation( preference=[0 if p else -5000 for p in prefs], random_state=None).fit(locs) count = 1 while -1 in af.labels_ and count < 100: # indicates clustering did not converge, so we retry until it does (giving up after 100 tries) af = AffinityPropagation( preference=[0 if p else -5000 for p in prefs], random_state=None, max_iter=5000).fit(locs) count += 1 if count > 1: self.logger.warning( f"(frame {frame}): Tried {count} times to achieve convergence --- {'FAILED' if count == 100 else 'SUCCEEDED'}" ) cluster_centers_indices = af.cluster_centers_indices_ centers = af.cluster_centers_.tolist() labels = af.labels_ self.logger.debug(f"(frame {frame}): labels = {labels}") n_clusters = len(cluster_centers_indices) # mining location? must be separate cluster new_centers = [] for k in range(n_clusters): # mining bases in this cluster mining_locs = [(loc, finish) for loc, finish, pref in zip( locs[labels == k], finishes[labels == k], prefs[ labels == k]) if pref] if len(mining_locs) > 1: # split up clusters with more than one mining base self.logger.debug( f"(frame {frame}): mining_locs = {mining_locs}") original = min(filter(lambda x: x is not None, mining_locs), key=lambda x: x[1]) to_split = [ x for x in mining_locs if x[0].tolist() != original[0].tolist() ] self.logger.debug( f"(frame {frame}): original = {original}, to_split = {to_split}" ) for i, (loc, finish) in enumerate(to_split): new_label = n_clusters + i self.logger.debug( f"(frame {frame}): changing {labels[(locs == loc).all(axis=1).nonzero()]} to {new_label}" ) labels[(locs == loc).all( axis=1).nonzero()] = new_label members = [(loc, finish) for loc, finish, pref in zip(locs[labels == k], finishes[ labels == k], prefs[labels == k]) if not pref] for ml, mf in members: if dist(ml, loc) == min( dist(ml, x[0]) for x in [original] + to_split[:i] + to_split[i + 1:]) and mf >= finish: self.logger.debug( f"(frame {frame}): changing {labels[(locs == ml).all(axis=1).nonzero()]} to {new_label}" ) labels[(locs == ml).all( axis=1).nonzero()] = new_label new_centers.append(loc) for c in new_centers: cluster_centers_indices = np.append( cluster_centers_indices, (locs == c).all(axis=1).nonzero()) n_clusters += 1 # maximum distance new_centers = [] for loc in locs: if all( dist(loc, c) / self.map_dim > 0.1 for c in centers ): # too far away from any cluster center, should be split if any( dist(loc, select_center(cs)) / self.map_dim <= 0.1 for cs in new_centers ): # close to an already split building, merge _, i = min((dist(loc, select_center(cs)), i) for i, cs in enumerate(new_centers)) labels[(locs == loc).all( axis=1).nonzero()] = n_clusters + i new_centers[i].append(tuple(loc)) else: # start 
a new cluster labels[(locs == loc).all(axis=1).nonzero( )] = n_clusters + len(new_centers) new_centers.append([tuple(loc)]) for cs in new_centers: central = select_center(cs) cluster_centers_indices = np.append( cluster_centers_indices, (locs == central).all(axis=1).nonzero()) n_clusters += 1 self.logger.debug( f"(frame {frame}): set(labels) = {set(labels)} center indices = {cluster_centers_indices}" ) base_types = {} for loc, label in zip(locs, labels): if any(np.array_equal(loc, m) for m in self.mains): base_types[label] = BaseType.MAIN elif label not in base_types and (is_mining_loc( self.resource_clusters, loc)): base_types[label] = BaseType.EXPANSION for unit_id, loc, team_id, label in zip( unit_ids, locs, teamids, labels): pdict[team_id].bases[frame][unit_id] = loc pdict[team_id].base_cluster[frame][unit_id] = BaseCluster( label, locs[cluster_centers_indices[label]], base_types.get(label, BaseType.PROXY)) except: print(locs) print(replay.filename) traceback.print_exc() for player in replay.players: if frame < replay.frames: for f in range(frame + 1, replay.frames + 1): player.bases[f] = player.bases[frame] player.base_cluster[f] = player.base_cluster[frame] assert len( player.bases ) == replay.frames + 1, f"{len(player.bases)} base entries, {replay.frames} frames {sorted(player.bases.keys())}"
class AP(object):
    def __init__(self, damping=.5, max_iter=200, convergence_iter=15,
                 copy=True, preference=None, affinity='euclidean',
                 verbose=False, random_state='warn'):
        """
        Parameters
        ----------
        damping : TYPE, optional
            Damping factor, between 0.5 and 1.
            DESCRIPTION. The default is .5.
        max_iter : TYPE, optional
            Maximum number of iterations.
            DESCRIPTION. The default is 200.
        convergence_iter : TYPE, optional
            Number of iterations with no change in the number of estimated
            clusters that stops the convergence.
            DESCRIPTION. The default is 15.
        copy : TYPE, optional
            Copy the input data (True).
            DESCRIPTION. The default is True.
        preference : TYPE, optional
            DESCRIPTION. The default is None.
        affinity : TYPE, optional
            {"euclidean", "precomputed"}: euclidean distance or a precomputed
            affinity matrix.
            DESCRIPTION. The default is 'euclidean'.
        verbose : TYPE, optional
            DESCRIPTION. The default is False.
        random_state : TYPE, optional
            DESCRIPTION. The default is 'warn'.

        Returns
        -------
        None.
        """
        self.ap_cluster = AffinityPropagation(
            damping=damping, max_iter=max_iter,
            convergence_iter=convergence_iter, copy=copy,
            preference=preference, affinity=affinity,
            verbose=verbose, random_state=random_state)

    def fit(self, x, y=None):
        self.ap_cluster.fit(X=x, y=y)

    def fit_predict(self, x, y=None):
        return self.ap_cluster.fit_predict(X=x, y=y)

    def get_params(self, deep=True):
        return self.ap_cluster.get_params(deep=deep)

    def set_params(self, params):
        self.ap_cluster.set_params(**params)

    def predict(self, x):
        return self.ap_cluster.predict(X=x)

    def get_cluster_centers_indices(self):
        return self.ap_cluster.cluster_centers_indices_

    def get_cluster_centers(self):
        return self.ap_cluster.cluster_centers_

    def get_labels(self):
        return self.ap_cluster.labels_

    def get_affinity_matrix(self):
        return self.ap_cluster.affinity_matrix_

    def get_n_iter(self):
        return self.ap_cluster.n_iter_
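# A brief usage sketch for the AP wrapper defined above, on made-up blob data
# (the data and parameter choices are illustrative assumptions).
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.4, random_state=0)
ap = AP(damping=0.9, preference=-50, random_state=0)
ap.fit(X)
print(ap.get_labels())
print(ap.get_cluster_centers_indices())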
def ReadFilesClusters(file2): print(file2) #Leyendo file file = open(file2, "r") lineName = file2.split('.') lineName = lineName[0].split('/') print("lineName[1]=" + lineName[1]) ## Crear carpetas newpath = r'C:/Users/gquis/Documents/Visual Studio 2015/Projects/Kmeans/Kmeans/clusters/' + lineName[ 1] if not os.path.exists(newpath): os.makedirs(newpath) # Read data from files file2 = open("clusters/size" + lineName[1] + ".txt", "r") line = file2.readlines() [n, dim] = [int(val) for val in line[0].split()] dataSubCluster = np.zeros( (n, dim)) # Aqui va la subdata que pertenece a un cluster # Se tiene que mapear los Id de los clusters que entran IDs = np.zeros((n, ), dtype=int) #IDs segun la lista grande idx = 0 for lineG in file: #print(lineG) # cada line es un numero line2 = lineG.split() dataSubCluster[idx] = X[int(lineG)] # Features obtenidos IDs[idx] = int(lineG) idx = idx + 1 #print('pintandodataSubCluster') #print(dataSubCluster) sizeCluster = len(dataSubCluster) ## Aplicar de nuevo clustering ############################################################################## # Se hace validacion para clusters con un elemento if (sizeCluster == 1): cluster_centers_indices = [0] #print(cluster_centers_indices) labels = [0] n_clusters_ = 1 else: af = AffinityPropagation(preference=None).fit(dataSubCluster) cluster_centers_indices = af.cluster_centers_indices_ #print(cluster_centers_indices) labels = af.labels_ n_clusters_ = len(cluster_centers_indices) instring = r'clusters/' + lineName[1] + '/ClustersOutPut.txt' print('instring=' + instring) file = open(instring, "w") for i in range(0, len(labels)): print( "labels[" + str(IDs[i]) + "]=" + str(IDs[int(labels[i])]) ) #Ojo en esta parte se esta asumiendo que label toma encuenta el Id como el orden en el que entra al proceso de clustering. file.write(str(IDs[int(labels[i])]) + " " + str(IDs[i]) + '\n') file.close() ########## Despues de este modulo se va tener que escribir alguna validacion ## para tomar solo las carpetas que salieron con elementos en los clusters. ## ya que los archivos para el proceso de jutar indices esta en c++ y no es posible # crearlos automaticamente (si es posible pero va demorar xd). ####################### Parte donde recogemos los clusters ##################### file = open('clusters/' + lineName[1] + '/dim.dim', "w") file.write(str(dim)) file.close() ################Borrar los archvios .cluster .tex y .pdf #################### os.system( 'cd clusters/' + lineName[1] + '/ && del *.cluster && del *.pdf && del *.log && del *.aux && del sizecluster*' ) os.system('cd clusters/' + lineName[1] + '/ && g++ recogerElementosClusters.cpp && a.exe') ## Presentar Clusters creando .tex ################################################################################ os.system('cd clusters/' + lineName[1] + '/ && python PresentClusters.py') # construye el archivo .tex ## Presentar generar PDFS ############################################################################### #Aqui si es posible ejecutamos los .tex comandos os.system('cd clusters/' + lineName[1] + '/ && python generarPDF.py') # Sólo genera pdf's
# train_x=X[:train_len] # train_y=labels[:train_len] # # test_x=X[train_len:] # test_y=labels[train_len:] # KMeans km = KMeans(n_clusters=class_num) km.fit(X) pred_y = km.labels_ nmi = normalized_mutual_info_score(labels, pred_y) print('KMeans NMI:{:.4f}'.format(nmi)) # AffinityPropagation affinity_propagation = AffinityPropagation(damping=0.9, preference=-1) affinity_propagation.fit(X) pred_y = affinity_propagation.labels_ nmi = normalized_mutual_info_score(labels, pred_y) print('AffinityPropagation NMI:{:.4f}'.format(nmi)) # Mean-shift bandwidth = estimate_bandwidth(X, quantile=0.2) mean_shift = MeanShift(bandwidth=0.8, bin_seeding=True) mean_shift.fit(X) pred_y = mean_shift.labels_ nmi = normalized_mutual_info_score(labels, pred_y) print('Mean-shift NMI:{:.4f}'.format(nmi))
def test_affinity_propagation_predict():
    # Test AffinityPropagation.predict
    af = AffinityPropagation(affinity="euclidean", random_state=63)
    labels = af.fit_predict(X)
    labels2 = af.predict(X)
    assert_array_equal(labels, labels2)
str(''.join(letter)) for letter_array in mat['field'] for letter in letter_array ] ## preprocessing Y = UCData attrind = np.array(range(1, 51) + range(62, 78, 3)) Field = [Field[i] for i in range(1, 51) + range(62, 78, 3)] X = AttrData[:, attrind] X[np.isnan(X)] = 0 scaler = preprocessing.StandardScaler().fit(X) Xn = scaler.fit_transform(X) ### cluster model = KMeans(init='k-means++', n_clusters=6, n_init=10, max_iter=1000) model = AffinityPropagation(preference=-150, verbose=True) #model = Birch(branching_factor=10, n_clusters=4, threshold=0.3, compute_labels=True) model = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.1, n_samples=100), bin_seeding=True) label = SSRS.Cluster(X, model) ### classification model = tree.DecisionTreeClassifier() model = GaussianNB() model = svm.SVC() model = SGDClassifier() Tp = SSRS.Classification_cross(XXn, T=label, nfold=10, model=model) SSRS.plotErrorMap(label, Tp)
#df = pd.read_csv('C:/Users/neshragh/ecounter/Affinity_Sample_SPY/1-Dataset_ecounter/data analysis/HourlyAP/29apriloneweek.csv')#one day df = pd.read_csv( 'C:/Users/neshragh/ecounter/Affinity_Sample_SPY/1-Dataset_ecounter/data analysis/monthIntervention.csv' ) #one day # ############################################################################# #Choose data for algorithm #df = df.loc[(df.Date >= '5/30/2019') & (df.Date <= '5/5/2019')] df = df.loc[(df['Date'] >= '4/29/2019') & (df.Date <= '5/4/2019')] #df = df[df] df = df.loc[(df.Time >= '07') & (df.Time <= '19')] X = df.loc[df.index, ['Count', 'Position']].to_numpy() # Compute Affinity Propagation af = AffinityPropagation(preference=-4, damping=.95, max_iter=500).fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) #print('Estimated number of clusters: %d' % n_clusters_) #print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) #print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) #print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) #print("Adjusted Rand Index: %0.3f" # % metrics.adjusted_rand_score(labels_true, labels)) #print("Adjusted Mutual Information: %0.3f" # % metrics.adjusted_mutual_info_score(labels_true, labels)) '''print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))'''
class LexRank(object): def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan', tagger='twitter', useful_tags=[ 'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number' ], delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko, no_below_word_count=2, no_above_word_portion=0.85, max_dictionary_size=None, min_cluster_size=2, similarity_threshold=0.85, matrix_smoothing=False, n_clusters=None, compactify=True, **kwargs): self.decay_window = decay_window self.decay_alpha = decay_alpha if similarity == 'cosine': # very, very slow :( self.vectorizer = DictVectorizer() self.uniform_sim = self._sim_cosine elif similarity == 'jaccard': self.uniform_sim = self._sim_jaccard elif similarity == 'normalized_cooccurrence': self.uniform_sim = self._sim_normalized_cooccurrence else: raise LexRankError( "available similarity functions are: cosine, jaccard, normalized_cooccurrence" ) self.sim = lambda sentence1, sentence2: self.decay( sentence1, sentence2) * self.uniform_sim(sentence1, sentence2) self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_token_length=min_token_length, stopwords=stopwords, **kwargs) if clustering == 'birch': self._birch = Birch(threshold=0.99, n_clusters=n_clusters) self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix ) elif clustering == 'dbscan': self._dbscan = DBSCAN() self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix) elif clustering == 'affinity': self._affinity = AffinityPropagation() self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix) elif clustering is None: self._clusterer = lambda matrix: [ 0 for index in range(matrix.shape[0]) ] else: raise LexRankError( "available clustering algorithms are: birch, markov, no-clustering(use `None`)" ) self.no_below_word_count = no_below_word_count self.no_above_word_portion = no_above_word_portion self.max_dictionary_size = max_dictionary_size self.similarity_threshold = similarity_threshold self.min_cluster_size = min_cluster_size self.matrix_smoothing = matrix_smoothing self.compactify = compactify def summarize(self, text): self.sentences = self.factory.text2sentences(text) self.num_sentences = len(self.sentences) self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count, self.no_above_word_portion, self.max_dictionary_size) self.model = TfidfModel(self.corpus.bows, id2word=self.corpus.dictionary, normalize=True) self.tfidfs = self.model[self.corpus.bows] self._inject_tfidfs() self._build_matrix() self._clustering() if self.compactify: self._compactify() self.graphs = [] for i in range(self.num_clusters): graph = self.sentences2graph(self.clusters[i]) pagerank = networkx.pagerank(graph, weight='weight') self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True) self.graphs.append(graph) def _sim_jaccard(self, sentence1, sentence2): if sentence1 == sentence2: return 1 p = sum((sentence1.counter & sentence2.counter).values()) q = sum((sentence1.counter | sentence2.counter).values()) return p / q if q else 0 def _sim_cosine(self, sentence1, sentence2): if sentence1 == sentence2: return 1 sentence1_tfidf = { word_id: tfidf for word_id, tfidf in sentence1.tfidf } sentence2_tfidf = { word_id: tfidf for word_id, tfidf in sentence2.tfidf } vector1, vector2 = self.vectorizer.fit_transform( [sentence1_tfidf, sentence2_tfidf]).toarray() return vector1.dot(vector2) def _sim_normalized_cooccurrence(self, 
sentence1, sentence2): if sentence1 == sentence2: return 1 return len(set(sentence1.tokens) & set(sentence2.tokens)) / ( math.log(len(sentence1.tokens)) + math.log(len(sentence2.tokens))) def decay(self, sentence1, sentence2): distance = abs(sentence1.index - sentence2.index) closeness = max(self.decay_window - distance, 0) / self.decay_window return math.pow(closeness, self.decay_alpha) def _inject_tfidfs(self): for index in range(self.num_sentences): bow = self.corpus.bows[index] self.sentences[index].bow = bow self.sentences[index].tfidf = self.model[bow] def _build_matrix(self): self.matrix = np.zeros((self.num_sentences, self.num_sentences)) for sentence1 in self.sentences: for sentence2 in self.sentences: self.matrix[sentence1.index, sentence2.index] = self.sim(sentence1, sentence2) if self.matrix_smoothing: for index in range(self.num_sentences): self.matrix[index, index] = 0 self.matrix[index, index] = max(self.matrix[index]) def sentences2graph(self, sentences): graph = networkx.Graph() graph.add_nodes_from(sentences) for sentence1 in sentences: for sentence2 in sentences: weight = self.matrix[sentence1.index, sentence2.index] if weight: graph.add_edge(sentence1, sentence2, weight=weight) return graph def _clustered(self): self.clusters = [ cluster for cluster in self.clusters if len(cluster) >= self.min_cluster_size ] self.num_clusters = len(self.clusters) self.clusters = sorted(self.clusters, key=lambda cluster: len(cluster), reverse=True) def _clustering(self): cls = self._clusterer(self.matrix) bucket = {} for index in range(len(cls)): key = str(cls[index]) if key not in bucket: bucket[key] = [] bucket[key].append(self.sentences[index]) self.clusters = bucket.values() self._clustered() def _compactify(self): clusters = [] for cluster in self.clusters: compact_cluster = [] cluster_size = len(cluster) for i in range(cluster_size): cluster[i].duplicated = False for i in range(cluster_size): if cluster[i].duplicated: continue for j in range(i + 1, cluster_size): if cluster[j].duplicated: continue if self.uniform_sim( cluster[i], cluster[j]) > self.similarity_threshold: cluster[j].duplicated = True compact_cluster.append(cluster[i]) clusters.append(compact_cluster) self.clusters = clusters self._clustered() def _verbose(self): summaries = sorted(self.summaries, key=lambda sentence: sentence.index) return [sentence.text for sentence in summaries] def probe(self, k=None): if not hasattr(self, 'clusters'): raise LexRankError("summarize it first") if not k: k = max(2, self.num_clusters) if k < 0: raise LexRankError( "appropriate value for `k`: float(0 ~ 1) for compress rate, or natural number for exact number of sentences" ) if k > self.num_sentences: raise LexRankError("this will not give a summarization") if k < 1: k = int(self.num_sentences * k) self.summaries = [] ends = np.array([len(cluster) for cluster in self.clusters]) drones = np.zeros(ends.shape) for i in range(self.num_clusters): self.summaries.append(self.clusters[i][0]) drones[i] += 1 if len(self.summaries) == k: return self._verbose() while True: branch = np.array([drones + 1, ends]).min(axis=0) / ends leach = int(branch.argmin()) drone = int(drones[leach]) self.summaries.append(self.clusters[leach][drone]) drones[leach] += 1 if len(self.summaries) == k: return self._verbose()
from sklearn.datasets import load_digits from sklearn.preprocessing import scale import numpy as np from sklearn.decomposition import PCA from time import time # ############################################################################# #sample data digits = load_digits() data = scale(digits.data) n_samples, n_features = data.shape n_digits = len(np.unique(digits.target)) labels_true = digits.target # Compute Affinity Propagation t0 = time() af = AffinityPropagation(preference=-5000).fit(data) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('name\t\t\t\ttime\thomo\tcompl\tNMI') print('%-15s\t%.2fs\t%.3f\t%.3f\t%.3f' % ( 'AffinityPropagation', (time() - t0), metrics.homogeneity_score(labels_true, labels), metrics.completeness_score(labels_true, labels), metrics.normalized_mutual_info_score( labels_true, labels, average_method='arithmetic'), )) # Plot result import matplotlib.pyplot as plt from itertools import cycle
def test_affinity_propagation_pairwise_is_deprecated():
    afp = AffinityPropagation(affinity='precomputed')
    msg = r"Attribute _pairwise was deprecated in version 0\.24"
    with pytest.warns(FutureWarning, match=msg):
        afp._pairwise
def test_affinity_propagation(): # Affinity Propagation algorithm # Compute similarities S = -euclidean_distances(X, squared=True) preference = np.median(S) * 10 # Compute Affinity Propagation cluster_centers_indices, labels = affinity_propagation( S, preference=preference, random_state=39) n_clusters_ = len(cluster_centers_indices) assert n_clusters == n_clusters_ af = AffinityPropagation(preference=preference, affinity="precomputed", random_state=28) labels_precomputed = af.fit(S).labels_ af = AffinityPropagation(preference=preference, verbose=True, random_state=37) labels = af.fit(X).labels_ assert_array_equal(labels, labels_precomputed) cluster_centers_indices = af.cluster_centers_indices_ n_clusters_ = len(cluster_centers_indices) assert np.unique(labels).size == n_clusters_ assert n_clusters == n_clusters_ # Test also with no copy _, labels_no_copy = affinity_propagation(S, preference=preference, copy=False, random_state=74) assert_array_equal(labels, labels_no_copy) # Test input validation with pytest.raises(ValueError): affinity_propagation(S[:, :-1]) with pytest.raises(ValueError): affinity_propagation(S, damping=0) af = AffinityPropagation(affinity="unknown", random_state=78) with pytest.raises(ValueError): af.fit(X) af_2 = AffinityPropagation(affinity='precomputed', random_state=21) with pytest.raises(TypeError): af_2.fit(csr_matrix((3, 3)))
def worker(X, damping):
    method = AffinityPropagation(damping=damping)
    method.fit(X)
    key = (methodName + "/length_" + length + "/" + deg +
           "/individuals/affinity_propagation_damping_" + str(damping))
    np.savetxt(key + "_labels.csv", method.labels_, fmt="%d")
print(type(X_test.iloc[1, 1])) #X_train = X_train.fillna(0) X_ktrain = X_train.values y_ktrain = y_train.values #print(X_train.head()) N = X_ktrain.shape[0] affinity = np.zeros((N, N)) for i in range(N): affinity[i, :] = bdist(X_ktrain, X_ktrain[i], 5, 1e-3, 1e-25) #Time tsum = 0 t = time.process_time() cluster = AffinityPropagation(damping=0.5, affinity='precomputed') labels = cluster.fit_predict(affinity) C = np.unique(labels).size clusters = X_ktrain[cluster.cluster_centers_indices_] # estimate positions for test data pred, error3D, error2D, fdetect, cused, true_labels, acc_pred = position_route( method, X_ktrain, y_ktrain, x_test, y_test, clusters, labels, N=5, eps=1e-3)
def Affinity_Propagation(X, Y):
    print("Affinity_Propagation")
    label_pred = AffinityPropagation().fit_predict(X)
    score(Y, label_pred)
'n_neighbors': 10, 'n_clusters': 5 } bandwidth = estimate_bandwidth(embedding, quantile=params['quantile']) connectivity = kneighbors_graph(embedding, n_neighbors=params['n_neighbors'], include_self=False) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ward = AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) spectral = SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = DBSCAN(eps=params['eps']) affinity_propagation = AffinityPropagation(damping=params['damping'], preference=params['preference']) average_linkage = AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) birch = Birch(n_clusters=params['n_clusters']) gmm = GaussianMixture(n_components=params['n_clusters'], covariance_type='full') clustering_algorithms = (('AffinityPropagation', affinity_propagation), ('MeanShift', ms), ('SpectralClustering', spectral), ('Ward', ward), ('AgglomerativeClustering', average_linkage), ('DBSCAN', dbscan), ('Birch', birch), ('GaussianMixture', gmm)) #now plot everything f, ax = plt.subplots(2, 4, figsize=(20, 15)) for idx, (name, algorithm) in enumerate(clustering_algorithms):
from sklearn.datasets.samples_generator import make_blobs # ############################################################################# #import data f = open("./pc1-pc2-completetn-aromagroups.txt") x = np.loadtxt(f, delimiter='\t', skiprows=1) # create np array for data points data = np.array(x).astype("float") #data[i][j], i varies the row (chooses the coordinates [pc1, pc2] at row i) #data[i][j], j varies the column (chooses between pc1 and pc2 respectively 0 or 1) X = data # ############################################################################# # Compute Affinity Propagation af = AffinityPropagation(preference=-50).fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) # ############################################################################# # Plot result import matplotlib.pyplot as plt from itertools import cycle plt.close('all')
plt.show() #k-means clustring kmeans = KMeans(n_clusters=3).fit(X_trans) #n_clusters=3 # Fitting the input data and getting the cluster labels labels_k = kmeans.labels_ print labels_k #AgglomerativeClustering agglomerative = AgglomerativeClustering(n_clusters=3).fit(X_trans) # Fitting the input data and getting the cluster labels labels_agg = agglomerative.labels_ print labels_agg #AffinityPropagation affinity = AffinityPropagation().fit(X_trans) # Fitting the input data and getting the cluster labels labels_aff = affinity.labels_ print labels_aff ''' --------------------- Question 3 --------------------- ''' #set parameter for Gridsearch cv = 10 n_features = X_train.shape[1] #KNeighborsClassifier n_neighbors_range = np.arange(1, 20, 1) param_grid_n = dict(n_neighbors=n_neighbors_range) #set tuning parameter range
def affinity(df_std): AffPro = AffinityPropagation(max_iter=300, preference=-50, verbose=True) aa = AffPro.fit(df_std) return aa
from sklearn.preprocessing import scale
from time import time

digits = load_digits()
# Get the raw data
origin_data = digits.data
# Get the labels of the raw data, i.e. which class each sample belongs to
labels = digits.target
# Standardize the raw data
data = scale(origin_data)
# Number of distinct classes among the labels
n_classes = len(np.unique(labels))

km = KMeans(init='random', n_clusters=10)
ap = AffinityPropagation()
ms = MeanShift()
sc = SpectralClustering(n_clusters=10, gamma=0.1)
ac = AgglomerativeClustering(n_clusters=10, linkage='average')
whc = AgglomerativeClustering(n_clusters=10, linkage='ward')
db = DBSCAN()
gm = GaussianMixture(n_components=10)

print(82 * '_')
print('name\t\ttime\t\th_score\t\tc_score\t\tnmi')

def bench(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('%s\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f' %
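# The bench() body above is cut off in this excerpt; the following is only a
# hedged guess at how the remaining columns of the printed header (time,
# h_score, c_score, nmi) could be filled in. The metric choices and the use of
# fit_predict are assumptions, not the original code.
from time import time
from sklearn import metrics

def bench_sketch(estimator, name, data, labels_true):
    t0 = time()
    labels_pred = estimator.fit_predict(data)  # the estimators listed above all support fit_predict
    print('%s\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f' % (
        name,
        time() - t0,
        metrics.homogeneity_score(labels_true, labels_pred),
        metrics.completeness_score(labels_true, labels_pred),
        metrics.normalized_mutual_info_score(labels_true, labels_pred)))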
# Centers of the generated test data
centers = [[1, 1], [-1, -1], [1, -1]]
# Generate the data
Xn, labels_true = make_blobs(n_samples=150, centers=centers, cluster_std=0.5,
                             random_state=0)

simi = []
for m in Xn:
    # Similarities of this point to every point, i.e. one row of the matrix
    temp = []
    for n in Xn:
        # Use the negative Euclidean distance as the similarity
        s = -np.sqrt((m[0] - n[0])**2 + (m[1] - n[1])**2)
        temp.append(s)
    simi.append(temp)

p = -50               # 3 centers
#p = np.min(simi)     # 9 centers
#p = np.median(simi)  # 13 centers

ap = AffinityPropagation(damping=0.5, max_iter=500, convergence_iter=30,
                         preference=p).fit(Xn)
cluster_centers_indices = ap.cluster_centers_indices_
for idx in cluster_centers_indices:
    print(Xn[idx])
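# Hedged follow-on sketch (assumes Xn from the snippet above): the hand-built
# similarity matrix can also be passed to AffinityPropagation directly by
# declaring it as a precomputed affinity; -euclidean_distances(Xn) is the
# vectorized equivalent of the nested loops. Illustration only.
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import euclidean_distances

simi_mat = -euclidean_distances(Xn)  # negative (non-squared) Euclidean distances
ap_pre = AffinityPropagation(damping=0.5, max_iter=500, convergence_iter=30,
                             preference=np.median(simi_mat),
                             affinity='precomputed').fit(simi_mat)
print(len(ap_pre.cluster_centers_indices_), 'clusters from the precomputed matrix')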
train_data = np.loadtxt('Train_Data.csv', dtype=np.float32, delimiter=',') train_labels = np.loadtxt('Train_Labels.csv', dtype=np.int32, delimiter=',') test_data = np.loadtxt('Test_Data.csv', dtype=np.float32, delimiter=',') test_labels = np.loadtxt('Test_Labels.csv', dtype=np.int32, delimiter=',') class_names = ['1', '2', '3'] # Feature Selection all_data = np.vstack((train_data,test_data)) all_data_labels=np.hstack((train_labels,test_labels)) sel = VarianceThreshold(threshold=0.90*(1-0.90)) all_data = sel.fit_transform(all_data) all_data_size, _ = all_data.shape _, feature_size = all_data.shape clustering = AffinityPropagation(preference= -1200,damping=0.92).fit(all_data) tmp = clustering.labels_ replace_all(tmp,0,10) replace_all(tmp,1,20) replace_all(tmp,2,30) replace_all(tmp,10,1) replace_all(tmp,20,3) replace_all(tmp,30,2)
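# Hedged, vectorized alternative to the replace_all() chain above (assumes
# clustering.labels_ contains exactly the three raw values 0, 1, 2): remap the
# raw AffinityPropagation labels onto the class ids in a single lookup.
import numpy as np

label_map = np.array([1, 3, 2])            # raw label 0 -> 1, 1 -> 3, 2 -> 2
remapped = label_map[clustering.labels_]   # same values as the replace_all chain produces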
chem_map = pd.read_csv("data/chem_all.csv").to_numpy() topn_smile = [chem_map[int(idx.split('_')[1]), 0] for idx in topn_idx] sm = np.zeros((n, n)) for i in range(n): for j in range(i, n): m1, m2 = Chem.MolFromSmiles(topn_smile[i]), Chem.MolFromSmiles(topn_smile[j]) sm[i, j] = FingerprintSimilarity(Chem.RDKFingerprint(m1), Chem.RDKFingerprint(m2)) sm = sm + sm.T - np.eye(n) from sklearn.cluster import AffinityPropagation af = AffinityPropagation().fit(sm) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print("{} clusters: ".format(n_clusters_)) print(" Center index: {}".format(cluster_centers_indices.tolist())) print(" Labels: {}".format(labels.tolist())) send = {'1EVZ': ['CHEM_833', 'CHEM_84524', 'CHEM_6372', 'CHEM_28096', 'CHEM_16023'], '1P33': ['CHEM_6372', 'CHEM_40322', 'CHEM_4109', 'CHEM_8472'], '3HQQ': ['CHEM_6372', 'CHEM_40322', 'CHEM_4109', 'CHEM_16498'], '1T10': ['CHEM_6372', 'CHEM_40322', 'CHEM_3777', 'CHEM_38064', 'CHEM_74497']}
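# Note on the fit(sm) call above: without affinity='precomputed', scikit-learn
# treats each row of the Tanimoto matrix as a feature vector and clusters on
# negative squared Euclidean distances between those rows. If the intent is to
# use the Tanimoto similarities themselves as the affinity, a hedged
# alternative (assumes sm from the snippet above) is:
af_pre = AffinityPropagation(affinity='precomputed').fit(sm)
print('{} clusters using Tanimoto similarity as the affinity'.format(
    len(af_pre.cluster_centers_indices_)))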
shell=True) if opts.match: subprocess.call([ TOMTOM + ' -oc ' + opts.outfile + '-aggthresh=' + str(thresh) + ' ' + opts.outfile + '-aggthresh=' + str(thresh) + '.meme ' + MOUSEMEME ], shell=True) else: #opts.clustering == 'representative' #shrink motif entropy to end up with fewer clusters? preference = np.array( [1.0 / motif_entropy(motif_dict[motif]) for motif in motifs]) * 0.25 agg_clust = AffinityPropagation(affinity='precomputed', preference=preference).fit( np.clip(similarity_cor, 0, np.max(similarity_cor))) seq_motifs = {} print('# clusters:', len(set(agg_clust.labels_))) for label_ind, cluster_center in enumerate( agg_clust.cluster_centers_indices_): seq_motifs[motifs[cluster_center]] = motif_dict[motifs[cluster_center]] with open(opts.outfile + '-' + motifs[cluster_center] + '.cluster', 'w') as f: for mind in np.where(agg_clust.labels_ == label_ind)[0]: f.write(motifs[mind] + '\n') with open(opts.outfile + '-affinityprop.motifs', 'w') as f: for key in seq_motifs.keys(): f.write('>' + key + '\n') for i in range(seq_motifs[key].shape[0]): f.write('\t'.join([str(v)
class FeatureClusterer(ClusterMixin, BaseEstimator): #clusters features together #like sklearn feature agglomeration, but can work on dataframes and tracks names of the features def __init__(self, base_model = 'default', scale = False): if base_model is None or base_model == 'default': self.base_model = AffinityPropagation() else: self.base_model = base_model self.scale = scale assert(hasattr(self.base_model, 'fit_predict')) def fit(self, x, y = None): if self.scale: x = StandardScaler().fit_transform(x) x = x.transpose() self.labels = self.base_model.fit_predict(x) self.labels = self.map_to_zero(self.labels) def map_to_zero(self, labels): labels -= labels.min() unique_labels = set(labels) n_labels = len(unique_labels) if n_labels == labels.max(): return labels for i in range(n_labels): if i not in set(labels): args = np.argwhere(labels > i) labels[args] -= 1 return labels def predict(self, x, y = None): index = list(x.index) x = x.transpose() # if hasattr(self.base_model, 'predict'): # labels = self.base_model.predict(x) # else: # labels = self.base_model.fit_predict(x) # labels = self.map_to_zero(labels) # print(sorted(set(labels))) is_df = isinstance(x, pd.DataFrame) groups = [[] for x in range(len(set(self.labels)))] group_names = [[] for x in range(len(set(self.labels)))] for pos, groupnum in enumerate(self.labels): if is_df: feature = x.iloc[pos] groups[groupnum].append(feature.values) group_names[groupnum].append(feature.name) else: groups[groupnum].append(x[pos]) f_out = np.zeros((len(set(self.labels)), x.shape[1])) for row, vals in enumerate(groups): f_out[row] = np.mean(vals, axis = 0) x_out = f_out.transpose() group_names = [','.join(gn) for gn in group_names] if is_df: x_out = pd.DataFrame(x_out, index=index, columns = group_names) return x_out def fit_predict(self, x, y = None): self.fit(x) return self.predict(x)
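# Hedged usage sketch for FeatureClusterer on a small made-up DataFrame: noisy
# copies of three base signals should be grouped together and each group
# replaced by its column-wise mean. Data and column names are invented here.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
bases = rng.normal(size=(3, 200))
cols = {'f%d_%d' % (b, j): bases[b] + rng.normal(scale=0.05, size=200)
        for b in range(3) for j in range(4)}
df_demo = pd.DataFrame(cols)

fc = FeatureClusterer(scale=True)
reduced = fc.fit_predict(df_demo)   # one averaged column per feature cluster
print(reduced.columns.tolist())     # each column name lists the features merged into that group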
class ComplexBuilder(object): def __init__(self, method="HDBSCAN"): "" if method == "OPTICS": self.clustering = OPTICS(min_samples=2, metric="precomputed", n_jobs=4) elif method == "AGGLOMERATIVE_CLUSTERING": self.clustering = AgglomerativeClustering(affinity="precomputed") elif method == "AFFINITY_PROPAGATION": self.clustering = AffinityPropagation(affinity="precomputed") elif method == "HDBSCAN": self.clustering = hdbscan.HDBSCAN(min_cluster_size=2) self.method = method def set_params(self, params): self.clustering.set_params(**params) def fit(self, X, metricColumns, scaler=None, inv=False, poolMethod="min", umapKwargs={ "min_dist": 1e-7, "n_neighbors": 4, "random_state": 350 }, generateSquareMatrix=True, preCompEmbedding=None, useSquareMatrixForCluster=False, entryColumns=["E1", "E2"]): """ Fits predicted interactions to potential macromolecular complexes. """ pooledDistances = None if X is not None and generateSquareMatrix and preCompEmbedding is None: # print("Generate Square Matrix ..") # print(scaler) X, labels, pooledDistances = self._makeSquareMatrix( X, metricColumns, scaler, inv, poolMethod, entryColumns) # print(X) print("Info :: Umap calculations started.") umapKwargs["metric"] = "precomputed" embed = umap.UMAP(**umapKwargs).fit_transform(X) elif preCompEmbedding is not None: embed = preCompEmbedding.values labels = preCompEmbedding.index.values pooledDistances = None print("Info :: Aligned UMAP was precomputed. ") elif not generateSquareMatrix: labels = X.index.values umapKwargs["metric"] = "correlation" embed = umap.UMAP(**umapKwargs).fit_transform(X) else: raise ValueError( "X and preCompEmbedding are both None. No data for UMAP.") # print("done .. - starting clustering") if self.method == "OPTICS": clusterLabels = self.clustering.fit_predict(X) return clusterLabels, labels, X, self.clustering.reachability_[ self.clustering.ordering_], self.clustering.core_distances_[ self.clustering.ordering_] elif self.method in [ "AGGLOMERATIVE_CLUSTERING", "AFFINITY_PROPAGATION" ]: clusterResult = self.clustering.fit_predict(X) return clusterResult, labels, X, ["None"] * labels.size, [ "None" ] * labels.size elif self.method == "HDBSCAN": if useSquareMatrixForCluster: self.set_params({"metric": "precomputed"}) clusterResult = self.clustering.fit(X) else: clusterResult = self.clustering.fit(embed) # self.clustering.condensed_tree_.to_pandas() return clusterResult.labels_, labels, X, clusterResult.probabilities_, [ "None" ] * labels.size, embed, pooledDistances def _makeSquareMatrix(self, X, metricColumns, scaler, inv, poolMethod, entryColumns): if scaler is None: if poolMethod == "mean": X["meanDistance"] = X[metricColumns].mean(axis=1) elif poolMethod == "max": X["meanDistance"] = X[metricColumns].max(axis=1) elif poolMethod == "min": X["meanDistance"] = X[metricColumns].min(axis=1) else: if poolMethod == "mean": X["meanDistance"] = scaler(X[metricColumns]).mean(axis=1) elif poolMethod == "max": X["meanDistance"] = scaler(X[metricColumns]).max(axis=1) elif poolMethod == "min": X["meanDistance"] = scaler(X[metricColumns]).min(axis=1) if inv: X['meanDistance'] = 1 - X['meanDistance'] X = X.dropna(subset=["meanDistance"]) uniqueValues = np.unique(X[entryColumns]) uniqueVDict = dict([(value, n) for n, value in enumerate(uniqueValues)]) nCols = nRows = uniqueValues.size print("Info :: Creating {} x {} distance matrix".format(nCols, nCols)) matrix = np.full(shape=(nRows, nCols), fill_value=2.0 if scaler is not None else 1.0) columnNames = entryColumns + ["meanDistance"] for row in 
X[columnNames].values: nRow = uniqueVDict[row[0]] nCol = uniqueVDict[row[1]] matrix[[nRow, nCol], [nCol, nRow]] = row[2] if scaler is not None: matrix = (matrix - np.min(matrix)) / (np.max(matrix) - np.min(matrix)) np.fill_diagonal(matrix, 0) return matrix, uniqueValues, X
def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan', tagger='twitter', useful_tags=[ 'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number' ], delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko, no_below_word_count=2, no_above_word_portion=0.85, max_dictionary_size=None, min_cluster_size=2, similarity_threshold=0.85, matrix_smoothing=False, n_clusters=None, compactify=True, **kwargs): self.decay_window = decay_window self.decay_alpha = decay_alpha if similarity == 'cosine': # very, very slow :( self.vectorizer = DictVectorizer() self.uniform_sim = self._sim_cosine elif similarity == 'jaccard': self.uniform_sim = self._sim_jaccard elif similarity == 'normalized_cooccurrence': self.uniform_sim = self._sim_normalized_cooccurrence else: raise LexRankError( "available similarity functions are: cosine, jaccard, normalized_cooccurrence" ) self.sim = lambda sentence1, sentence2: self.decay( sentence1, sentence2) * self.uniform_sim(sentence1, sentence2) self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_token_length=min_token_length, stopwords=stopwords, **kwargs) if clustering == 'birch': self._birch = Birch(threshold=0.99, n_clusters=n_clusters) self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix ) elif clustering == 'dbscan': self._dbscan = DBSCAN() self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix) elif clustering == 'affinity': self._affinity = AffinityPropagation() self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix) elif clustering is None: self._clusterer = lambda matrix: [ 0 for index in range(matrix.shape[0]) ] else: raise LexRankError( "available clustering algorithms are: birch, markov, no-clustering(use `None`)" ) self.no_below_word_count = no_below_word_count self.no_above_word_portion = no_above_word_portion self.max_dictionary_size = max_dictionary_size self.similarity_threshold = similarity_threshold self.min_cluster_size = min_cluster_size self.matrix_smoothing = matrix_smoothing self.compactify = compactify
def handleEndGame(self, event, replay): try: pdict = {} for player in replay.players: player.bases = {} pdict[player.team_id] = player step_size = int(20 * 22.4) for frame in range(0, replay.frames + 1, step_size): for player in replay.players: player.bases[frame] = {} if frame > 0: for f in range(frame - step_size + 1, frame): player.bases[f] = player.bases[frame - step_size] for f, ls in self.lookup.items(): if f <= frame: locs, teamids, finishes, prefs = zip(*list(ls.values())) unit_ids = list(ls.keys()) else: break locs = np.array(locs) prefs = np.array(prefs) finishes = np.array(finishes) af = AffinityPropagation(preference=[0 if p else -5000 for p in prefs], random_state=None).fit(locs) cluster_centers_indices = af.cluster_centers_indices_ centers = af.cluster_centers_.tolist() labels = af.labels_ n_clusters = len(cluster_centers_indices) # mining location? must be separate cluster new_centers = [] for k in range(n_clusters): # mining bases in this cluster mining_locs = [(loc, finish) for loc, finish, pref in zip(locs[labels == k], finishes[labels == k], prefs[labels == k]) if pref] if len(mining_locs) > 1: # split up clusters with more than one mining base original = min(mining_locs, key=lambda x: x[1]) to_split = [x for x in mining_locs if x[0].tolist() != original[0].tolist()] for i, (loc, finish) in enumerate(to_split): new_label = n_clusters + i labels[(locs == loc).all(axis=1).nonzero()] = new_label members = [(loc, finish) for loc, finish, pref in zip(locs[labels == k], finishes[labels == k], prefs[labels == k]) if not pref] for ml, mf in members: if dist(ml, loc) == min(dist(ml, x[0]) for x in [original] + to_split[:i] + to_split[i + 1:]) and mf >= finish: labels[(locs == ml).all(axis=1).nonzero()] = new_label new_centers.append(loc) for c in new_centers: cluster_centers_indices = np.append(cluster_centers_indices, (locs == c).all(axis=1).nonzero()) n_clusters += 1 # maximum distance new_centers = [] for loc in locs: if all(dist(loc, c) / self.map_dim > 0.1 for c in centers): # too far away from any cluster center, should be split if any(dist(loc, select_center(cs)) / self.map_dim <= 0.1 for cs in new_centers): # close to an already split building, merge _, i = min((dist(loc, select_center(cs)), i) for i, cs in enumerate(new_centers)) labels[(locs == loc).all(axis=1).nonzero()] = n_clusters + i new_centers[i].append(tuple(loc)) else: # start a new cluster labels[(locs == loc).all(axis=1).nonzero()] = n_clusters + len(new_centers) new_centers.append([tuple(loc)]) for cs in new_centers: central = select_center(cs) cluster_centers_indices = np.append(cluster_centers_indices, (locs == central).all(axis=1).nonzero()) n_clusters += 1 for unit_id, loc, team_id in zip(unit_ids, locs, teamids): pdict[team_id].bases[frame][unit_id] = loc except: print(locs) print(replay.filename) traceback.print_exc() for player in replay.players: if frame < replay.frames: for f in range(frame + 1, replay.frames + 1): player.bases[f] = player.bases[frame] assert len(player.bases) == replay.frames + 1, f"{len(player.bases)} base entries, {replay.frames} frames {sorted(player.bases.keys())}"
def malek_gentleman(): os.system("sudo modprobe bcm2835-v4l2") timeout=0 i_file = "1" N = "1" sumq=[] buf_back=[] colq=[] count=[] initialisation=0 initialisation+=1 i=1 VideoNumberInt=i VideoNumberString=i_file VideoName=VideoNumberString+'.mp4' cap = cv2.VideoCapture(0) # importer le video if( initialisation <= int(N)): count.append(0) #count[i-1]s the no of frames read till now nframe = 300 #no of frames needed to initialize the background cols = 160 rows = 160 flag = 0 move = 0 avg = np.zeros([160,160],dtype=np.uint8) # definir une image 'avg' dans tous ces pixels sont null , uint Unsigned integer (0 to 255) avg_temp = np.zeros([160,160],np.uint) # bon j'ai pas trouver la deffirence entre le uint8 et le uint , mais finalement sa sere à definir une plage d'entier if( initialisation <= int(N)): sumq.append(np.zeros([160,160],np.uint)) # the same thing cur_back = np.zeros([160,160],dtype=np.uint8) # definir une image cur_back , dans tous ces pixels sont null if( initialisation <= int(N)): buf_back.append(np.zeros([160,160],dtype=np.uint8)) # definir une image buf_back , dans tous ces pixels sont null if( initialisation <= int(N)): colq.append(Queue()) # definir une pile des threads qui suit le loi de fifo #to form clusters count_5=0 arr=np.zeros(shape=(0, 2), dtype=np.uint8) # definir un tableau 'arr' de deux colones cur_cent=last_cent=[0,0] def dist(cur_cent, last_cent): dis=(cur_cent[0]-last_cent[0])**2 + (cur_cent[1] - last_cent[1])**2 # calculer la somme de la variation au carré des deux pixel return dis cluster_centres_q = np.zeros(shape=(0,2),dtype=np.int64) # on vas travaillé sur un cluster de deux pixels ret, pure_img = cap.read() # on extraire le 'cout'éme frame du video qui sera enregistrer dans la variable pure_img , ret , est un variable booleanretourner par la fonction read #print("ker") #print(cap.isOpened()) #print(ret) while(cap.isOpened() and ret==True): # tanque la video qu'on souhaite traiter est en boucle #print("im") #time.sleep(0.1) count[i-1] = count[i-1] + 1 # on va commencer à traiter les frame , du coup il faux incrémenter le count[i-1] à chaque fois qu'on est entrain de traiter une frame ( count[i-1] represente le nbr des frame traiter ) img = cv2.resize(pure_img,(160,160)) # grase à resize on arrive à resizer notre image sans perdre la forme generale de l'image , c'est dans le sense ou on est pas entrais deretrancher les pixels d'une facon stupide bon maby il vas engondrer une certaine distortion l'orsqu'on zoom , euuh mais bon :p gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # convertir l'image en une image noire et blanch colq[i-1].enqueue(gray) # aywah ahiya :D , tana na7chiw il image ' des années 60 :p ' fil pila mte3na if count[i-1] == nframe: print("Background fixed !") if(count[i-1] < nframe): #avg[:]=0 #no need as avg is initialized to be zero matrix sumq[i-1] = sumq[i-1] + gray # il est entrain de superposer les differentes frame print(str(count[i-1] //3)+" %") else: # une fois nodkhol lil else , ca veux dire que j'ai construit mon propre background temp = colq[i-1].dequeue() # yibda yijbid fil les frames ta3 el background sumq[i-1] = sumq[i-1] + gray - temp # yzid ce qui a changé entre le frame du background wil frame ali 9e3id yikhdim fih avg_temp = sumq[i-1]/nframe # bon mafhimtich 3leh ya9sim el 7a9 :D , ama mahiyech importante , puisque nnframe deja cst , nitsawir ,il est entrai de normaliser haja kima haka high_value_indices = avg_temp>255 # 7asilou puisque ahna mil se3a on ajout de la deffirence , fama des pixel , ynajmo yfoutou 255 ce qui n'est 
pas logique avg_temp[high_value_indices] = 255 # kif kif avg=avg_temp avg=avg.astype(np.uint8) #7asilou lahna badal el type bech twali image cur_back = avg if(flag == 0): #buf_back[:] = 0 #no need as buf_back is initialized to be zero matrix flag = 10 # pour eliminer le cas ou nframe=1 if(flag == 10 and count[i-1] >= nframe): buf_back[i-1] = cur_back # doub maya3mil hekom el nframe , yimchi ya3ti lil buf_back el deffirence mabin el back w awil frame 5dheha ba3d el back flag = 20 sub = cv2.absdiff(cur_back,buf_back[i-1]) # voila lahna nhoto la difference entre le background wil frame ali 9e3din nitraitiw fiha img_show = cv2.resize(img,(400,400)) #img_show = img_show.astype(int) #print(img_show.shape) #print(img_show) #time.sleep(1) cv2.imshow("img",img_show) # affichage de l'image originale gray_show = cv2.resize(gray,(400,400)) #gray_show = gray_show.astype(int) cv2.imshow("gray",gray_show) #print(cur_back) cur_back_show = cv2.resize(cur_back,(400,400)) #cur_back_show = cur_back_show.astype(int) #print(cur_back_show) cv2.imshow("cur_back_show",cur_back_show) buf_back_show = cv2.resize(buf_back[i-1],(400,400)) #buf_back_show = buf_back_show.astype(int) cv2.imshow("buf_back_show",buf_back_show) sub = cv2.resize(sub,(400,400)) #sub=sub.astype(int) cv2.imshow("Abandoned Objects",sub) # affichage de l'ivolution de la distortion du back ret_s,sub_t = cv2.threshold(sub,50,255,0) # codage : pour tous pixels entre 50 et 255 en luis attribut un 0 sinon 1 mask = np.zeros(gray.shape,np.uint8) louka, contours, hier = cv2.findContours(sub_t,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE) #im2, contours, hier = cv2.findContours(sub_t,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE) count_5 += 1 malek=0; for cnt in contours: # count[i-1]ouRS IS A LISTE OF object and in each element we find a liste of element which represente the different value of pixel who forme the contours of this object print(cv2.contourArea(cnt)) if 300<cv2.contourArea(cnt)<5000: # si le piremetre entre 200 et 5000 #cv2.drawContours(sub,[cnt],0,(0,255,0),2) cv2.drawContours(mask,[cnt],0,255,-1) M = cv2.moments(cnt) cx,cy = int(M['m10']/M['m00']), int(M['m01']/M['m00']) # localiser le centroid de l'objet ( centre de masse ) file_output = open('output.txt', 'w') file_output.write('video num : ') file_output.write(i_file)#<------------------------ file_output.write(' ====> une anomalie est detecté dans la position suivante : x=') file_output.write(str(cx)) file_output.write(', y=') file_output.write(str(cy)) file_output.close() cv2.circle(sub,(cx,cy),10,255,-1) # if(count_5<=5): arr = np.append(arr, [[cx,cy]], axis=0) # localiser la position de l'objet dans l'image #print(arr) #print(count_5) if(count_5==5): count_5 = 0 #print (len(arr)) if (len(arr) == 0): # il n y a pas d'objet pass else: affin = AffinityPropagation() affin.fit_predict(arr) centroids = affin.cluster_centers_ labels = affin.labels_ max_label_index = labels.argmax() biggest_clust_ind = labels[max_label_index] print(labels) print(centroids) # print("len_labels",len(labels),"biggest cluster's index", biggest_clust_ind, "len_centroids", len(centroids)) print("+++++++++++++++++++++++++++++++++++++",type(biggest_clust_ind),"++++++++++++++++++++++++++++++++++++++++++") if( type(biggest_clust_ind)!=np.ndarray): biggest_clust_cent = centroids[biggest_clust_ind] cx = np.uint8(biggest_clust_cent[0]) cy = np.uint8(biggest_clust_cent[1]) cv2.rectangle(sub,(cx-15,cy-15),(cx+15,cy+15),(255,255,255),1) cv2.rectangle(img,(cx-7,cy-7),(cx+7,cy+7),(0,255,0),2) cv2.drawContours(sub, contours, -1, (0,255,0), 
3) #finallly reinitializing the np_array arr last_cent = cur_cent cur_cent = [cx,cy] cluster_centres_q = np.append(cluster_centres_q, [cur_cent], axis=0) print("++++++++++++++++++++++++++++++++", len(cluster_centres_q),"++++++++++++++++++++++++++++++++++++") else: pass arr=np.zeros(shape=(0, 2), dtype=np.uint8) dista=dist(cur_cent, last_cent) # print("distance b/w centroid of last & current frame",dista) #if(0 < dista < 25 ): if(cur_cent==last_cent and last_cent!=[0,0]): cv2.rectangle(sub,(cx-15,cy-15),(cx+15,cy+15),(255,255,255),1) cv2.rectangle(img,(cx-7,cy-7),(cx+7,cy+7),(0,255,0),2) print("---------------------------------------------------------------------------") if(len(cluster_centres_q)>=1): temp_a = deepcopy(cluster_centres_q[0]) temp_b = cluster_centres_q[-1] print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") if(1==1): print ("warning, abandoned object detected") print("maleek yraheb b karim") if timeout==0: timeout = time.time() + 10 else: if time.time() > timeout: break #font = cv2.InitFont(cv2.CV_FONT_HERSHEY_SIMPLEX, 1, 1, 0, 3, 8) #Creates a font font = cv2.FONT_HERSHEY_SIMPLEX text_x = cx-10 #position of text text_y = cy-20 #position of text cv2.putText(sub,"Warning", (text_x,text_y),font,1, (255,255,255)) #Draw the text cluster_centres_q = cluster_centres_q[1:] avg_show = cv2.resize(avg,(400,400)) # cv2.imshow("avg",avg_show) cv2.imshow("Abandoned Objects",sub) if(move==0): move=1 cv2.moveWindow("gray", 400,20) cv2.moveWindow("img", 0,20) cv2.moveWindow("cur_back_show", 800,20) cv2.moveWindow("buf_back_show", 400,420) cv2.moveWindow("Abandoned Objects", 20,220) cv2.moveWindow("avg", 800,420) if cv2.waitKey(1) & 0xFF == ord('q'): break ret, pure_img = cap.read() # on extraire le 'cout'éme frame du video qui sera enregistrer dans la variable pure_img , ret , est un variable booleanretourner par la fonction read cap.release() cv2.destroyAllWindows() return