def hierarchical(similarity, concepts=2, euclid=False):
    if euclid:
        model = AgglomerativeClustering(n_clusters=concepts)
        return model.fit_predict(similarity)
    else:
        model = AgglomerativeClustering(n_clusters=concepts,
                                        affinity='precomputed',
                                        linkage='complete')
        return model.fit_predict(1 - similarity)
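# Hedged usage sketch (illustrative, not from the source): assumes the
# sklearn import is in scope and that `similarity` is a symmetric matrix
# with values in [0, 1], so 1 - similarity is a valid precomputed distance.
# Note: newer scikit-learn renamed the `affinity` parameter to `metric`.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

sim = np.array([[1.0, 0.9, 0.1],
                [0.9, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
print(hierarchical(sim, concepts=2))  # e.g. [0 0 1] (label order is arbitrary)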
class HierarchicalTopics(object):

    def __init__(self, corpus):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object
        """
        self.model = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
        )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part-of-speech
        tagged tokens using the segmentation and tokenization methods;
        one-hot encodes the document against the corpus vocabulary.
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the AgglomerativeClustering model to the given data.
        """
        self.model = AgglomerativeClustering()
        self.model.fit_predict([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])
        self.labels = self.model.labels_
        self.children = self.model.children_

    def plot_dendrogram(self, **kwargs):
        # Distances between each pair of children
        distance = np.arange(self.children.shape[0])
        position = np.arange(self.children.shape[0])

        # Create linkage matrix and then plot the dendrogram
        linkage_matrix = np.column_stack([
            self.children, distance, position]
        ).astype(float)

        # Plot the corresponding dendrogram
        fig, ax = plt.subplots(figsize=(15, 7))  # set size
        ax = dendrogram(linkage_matrix, **kwargs)
        plt.tick_params(axis='x', bottom=False, top=False,
                        labelbottom=False)
        plt.tight_layout()
        plt.show()
def agglom(data, n_clusters):
    # use ward / average / complete linkage for different results
    knn_graph = kneighbors_graph(data, 30, include_self=False)
    cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                      connectivity=knn_graph,
                                      linkage='ward')
    return cluster.fit_predict(data)
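# Hedged usage sketch (illustrative): needs at least 31 samples because the
# kneighbors_graph above asks for 30 neighbors; imports assumed by the
# function are made explicit here.
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import AgglomerativeClustering

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
labels = agglom(X, n_clusters=3)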
def buckshot(k, mat):
    size = int((k * mat.shape[0]) ** .5)
    print(size)
    samp = np.zeros((size, mat.shape[1]))
    inds = np.random.randint(0, mat.shape[0], size)
    print(inds)
    for i in range(size):
        samp[i] = mat[inds[i]]
    # agglomerative clustering on the sample
    hier = AgglomerativeClustering(n_clusters=k, linkage='average',
                                   affinity='euclidean',
                                   compute_full_tree=True)
    flat = hier.fit_predict(samp)
    centroids = []
    # find centroids
    for j in range(k):
        i_s = [i for i, l in enumerate(flat) if l == j]
        print(len(i_s))
        points = np.array([samp[m] for m in i_s])
        cent = np.mean(points, axis=0)
        centroids.append(cent)
    return centroids
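# Hedged usage sketch (illustrative): buckshot samples sqrt(k*n) rows,
# clusters the sample, and returns k centroids; n should be large enough
# that the sample size exceeds k.
import numpy as np

np.random.seed(0)
mat = np.random.rand(400, 8)
centroids = buckshot(3, mat)  # list of three 8-dimensional cluster means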
def calculateNumberOfIdealClusters(maxAmount, corpus):
    print("Initializing silhouette analysis")
    range_n_clusters = range(2, maxAmount)  # max amount of clusters equal to amount of jobs
    silhouette_high = 0
    silhouette_high_n_clusters = 2
    for n_clusters in range_n_clusters:
        # Initialize the clusterer with the n_clusters value
        cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                          linkage="ward", affinity="euclidean")
        cluster_labels = cluster.fit_predict(corpus)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed clusters.
        silhouette_avg = silhouette_score(corpus, cluster_labels)
        print("For n_clusters = %d, the average silhouette_score is: %.5f"
              % (n_clusters, silhouette_avg))

        if silhouette_avg > silhouette_high:
            silhouette_high = silhouette_avg
            silhouette_high_n_clusters = n_clusters

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(corpus, cluster_labels)

    print("Highest score = %f for n_clusters = %d"
          % (silhouette_high, silhouette_high_n_clusters))
    return silhouette_high_n_clusters
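# Hedged usage sketch (illustrative): silhouette analysis on toy blobs;
# with well-separated blobs the helper should report its highest average
# silhouette at k=4 here. Imports used inside the helper are assumed.
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples

X, _ = make_blobs(n_samples=300, centers=4, random_state=1)
best_k = calculateNumberOfIdealClusters(10, X)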
def sp_connectivity(self, X, connectivity, n_clusters, n):
    # plt.figure(figsize=(10, 4))
    # plt.subplot(1, 3, index + 1)
    model = AgglomerativeClustering(linkage="ward",
                                    connectivity=connectivity,
                                    n_clusters=n_clusters)
    # t0 = time.time()
    y = model.fit_predict(X, None)
    # elapsed_time = time.time() - t0
    return y
    # plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
    #             cmap=plt.cm.spectral)
    # plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
    #           fontdict=dict(verticalalignment='top'))
    # plt.axis('equal')
    # plt.axis('off')
    # plt.subplots_adjust(bottom=0, top=.89, wspace=0, left=0, right=1)
    # plt.suptitle('n_cluster=%i, connectivity=%r' %
    #              (n_clusters, connectivity is not None), size=17)
    # plt.show()
def clustering_approach(self):
    '''
    Cluster user data using various clustering algorithms
    IN: self.df_full and self.labels
    OUT: results to stdout
    '''
    print('Fitting clustering model')
    X = self.df_full.values
    y = self.labels

    # scale data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # KMeans
    km_clf = KMeans(n_clusters=2, n_jobs=6)
    km_clf.fit(X)

    # swap labels as super-users are in cluster 0 (messy!!)
    temp = y.apply(lambda x: 0 if x == 1 else 1)
    print('\nKMeans clustering: ')
    self.analyse_preds(temp, km_clf.labels_)

    # Agglomerative clustering
    print('\nAgglomerative clustering approach: ')
    ac_clf = AgglomerativeClustering()
    ac_labels = ac_clf.fit_predict(X)
    self.analyse_preds(y, ac_labels)

    return None
def Create_Ext_Agg_cluster(self, stem, stop, processing, remS):
    Allrow_dicts = data_pkg.FileHandling.read_csv(self.ExtStringCSv)
    Allstrings = list()
    # Allstrings = [rowdict_str["Text_original"] for rowdict_str in Allrow_dicts]
    for row_dict in Allrow_dicts:
        if self.POS == "ALL_EXT":
            Stringrow = (row_dict["Text_original"] + row_dict["Adj_Extended"]
                         + row_dict["Noun_Extended"] + row_dict["Verb_Extended"])
            Allstrings.append(Stringrow)
        else:
            Stringrow = (row_dict["Adj"] + row_dict["Adj_Extended"]
                         + row_dict["Noun"] + row_dict["Noun_Extended"])  # + row_dict["Verb"] + row_dict["Verb_Extended"]
            Allstrings.append(Stringrow)

    Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop)
                          for string_text in Allstrings]
    if remS:
        Allstrings_process = [preprocess_text.removeS(text)
                              for text in Allstrings_process]

    vectorizer = CountVectorizer()
    term_doc = vectorizer.fit_transform(Allstrings_process)
    feature_names = vectorizer.get_feature_names()

    if self.affinity == 'euclidean':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              affinity='euclidean')
    if self.affinity == 'cosine':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              linkage='average',
                                              affinity='cosine')
    Res_Labels = Agg_cluster.fit_predict(term_doc.toarray())
    self.cluster_tup_list = self.tuple_Ext_cluster_doc(Res_Labels, Allstrings, Allrow_dicts)
    # term_doc_lsa = lsa.fit_transform(term_doc)
    print(type(term_doc))
    self.metric = metrics.silhouette_score(term_doc.toarray(), Res_Labels,
                                           metric=self.affinity)
    print(Res_Labels)
    print("n_samples: %d, n_features: %d" % term_doc.shape)
def CreateCluster(self):
    SimArray = np.load(self.DistanceFile)
    print(SimArray)
    AggClusterDistObj = AgglomerativeClustering(n_clusters=self.num_cluster,
                                                linkage='average',
                                                affinity=self.affinity)
    Res_Labels = AggClusterDistObj.fit_predict(SimArray)
    print(Res_Labels)
def hierarchicalCluster(corr_matrix_df, n_clusters):
    """Calculate clustering from the correlation matrix using the
    hierarchical Ward method."""
    # set method
    ward = AgglomerativeClustering(n_clusters=n_clusters,
                                   linkage='ward', affinity='euclidean')
    result = ward.fit_predict(corr_matrix_df)
    cluster_df = pd.DataFrame(result, index=corr_matrix_df.index,
                              columns=['Cluster'])
    return cluster_df
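# Hedged usage sketch (illustrative): Ward clustering of the rows of a
# correlation matrix; the DataFrame index carries the variable names into
# the returned cluster assignment.
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 6)), columns=list('ABCDEF'))
print(hierarchicalCluster(df.corr(), n_clusters=2))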
def get_topics(X_lsi, text_names, nk=1):
    ag = AgglomerativeClustering(n_clusters=nk, affinity='cosine',
                                 linkage='average')
    topics = ag.fit_predict(X_lsi)
    paper_to_topic = defaultdict(int)
    topic_to_papers = defaultdict(list)
    for paper, topic in zip(text_names, topics):
        paper_to_topic[paper] = topic
        topic_to_papers[topic].append(paper)
    return (paper_to_topic, topic_to_papers)
def openfaceExp(lfwAligned, net, cls):
    df = pd.DataFrame(columns=('nPpl', 'nImgs',
                               'trainTimeSecMean', 'trainTimeSecStd',
                               'predictTimeSecMean', 'predictTimeSecStd',
                               'accsMean', 'accsStd'))
    repCache = {}
    df_i = 0
    for nPpl in nPplVals:
        print(" + nPpl: {}".format(nPpl))
        cls = AgglomerativeClustering(n_clusters=nPpl)
        (X, y) = getData(lfwAligned, nPpl, nImgs, size=96, mode='rgb')
        nSampled = X.shape[0]
        # old-style ShuffleSplit API: iterable of (train, test) index arrays
        ss = ShuffleSplit(nSampled, n_iter=10, test_size=0.1, random_state=0)
        allTrainTimeSec = []
        allPredictTimeSec = []
        accs = []
        for train, test in ss:
            X_train = []
            for img in X[train]:
                h = hash(str(img.data))
                if h in repCache:
                    rep = repCache[h]
                else:
                    rep = net.forward(img)
                    repCache[h] = rep
                X_train.append(rep)
            start = time.time()
            X_train = np.array(X_train)
            cls.fit(X_train, y[train])
            trainTimeSec = time.time() - start
            allTrainTimeSec.append(trainTimeSec)

            start = time.time()
            X_test = []
            for img in X[test]:
                X_test.append(net.forward(img))
            y_predict = cls.fit_predict(X_test)
            predictTimeSec = time.time() - start
            allPredictTimeSec.append(predictTimeSec / len(test))
            y_predict = np.array(y_predict)
            print(y[test], y_predict)
            acc = accuracy_score(y[test], y_predict)
            print(acc)
            accs.append(acc)
        df.loc[df_i] = [nPpl, nImgs,
                        np.mean(allTrainTimeSec), np.std(allTrainTimeSec),
                        np.mean(allPredictTimeSec), np.std(allPredictTimeSec),
                        np.mean(accs), np.std(accs)]
        df_i += 1
    return df
def test_agglomerative_clustering_with_distance_threshold_edge_case(
        linkage, threshold, y_true):
    # test boundary case of distance_threshold matching the distance
    X = [[0], [1]]
    clusterer = AgglomerativeClustering(
        n_clusters=None, distance_threshold=threshold, linkage=linkage)
    y_pred = clusterer.fit_predict(X)
    assert adjusted_rand_score(y_true, y_pred) == 1
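# Hedged invocation sketch (illustrative, my reading of the edge case):
# the two points sit at distance 1.0, and scikit-learn only merges clusters
# whose linkage distance is strictly below distance_threshold, so a
# threshold exactly equal to the distance leaves two clusters.
test_agglomerative_clustering_with_distance_threshold_edge_case(
    'single', 1.0, [0, 1])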
def clusterize(matrices):
    # dbscan = DBSCAN(metric="precomputed", eps=25, min_samples=50)
    cluster = AgglomerativeClustering(n_clusters=2, affinity="precomputed",
                                      linkage="complete")
    distances = distance_matrix(matrices)
    print("mean of distances is {} and std of norms is {}".format(
        numpy.mean(distances),
        numpy.std([numpy.linalg.norm(m, numpy.inf) for m in matrices])))
    # pyplot.plot([numpy.linalg.norm(m, numpy.inf) for m in matrices], 'ro')
    # pyplot.show()
    # pyplot.hist(distances.flatten(), bins=20)
    # pyplot.show()
    return cluster.fit_predict(distances)
def agglomerative_clustering(crime_rows, column_names, num_clusters):
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    print("Running Agglomerative Clustering")
    agglo_clustering = AgglomerativeClustering(
        n_clusters=num_clusters,
        connectivity=neighbors.kneighbors_graph(crime_xy, n_neighbors=2))
    agglomerative_clustering_labels = agglo_clustering.fit_predict(crime_xy)
    print("formatting....")
    return _format_clustering(agglomerative_clustering_labels,
                              crime_xy, crime_info, column_names)
def agglomorative_clustering(df_in):
    # Set model input args
    n_clusters = 8
    linkage = 'ward'
    model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
    # attach cluster-label to dataframe
    df_in['cluster'] = model.fit_predict(df_in)
def find_steady_coalition():
    working_directory = r"C:\Users\ORI\Documents\IDC-non-sync\ML_Course\Election\Data\\"
    file_name = os.path.join(working_directory, r'ElectionsData.csv')
    train, validation, test, feature_categorical_dictionary, train_idx, test_idx, number_to_party_dictionary = \
        prepare_the_data(file_name, working_directory)

    good_coalition_found = False
    for n_clusters in [5, 4, 3]:
        print("---------------")
        linkage = 'ward'
        X = train.data
        clusters = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
        clusters.fit_predict(X)

        bin_count_of_kmeans_clusters = np.bincount(clusters.labels_)
        normalized_bin_count_of_kmeans_clusters = \
            bin_count_of_kmeans_clusters / np.sum(bin_count_of_kmeans_clusters).astype('float32')

        # is there any cluster with more than 50% of the votes?
        coalition_exists = np.any(normalized_bin_count_of_kmeans_clusters > 0.5)
        print("number_of_clusters {0}".format(n_clusters))
        print("coalition_exists: {0} ".format(coalition_exists))

        # find all the parties that belong to the cluster
        biggest_cluster = np.argmax(normalized_bin_count_of_kmeans_clusters)
        biggest_cluster_voters = np.bincount(
            train.labels[clusters.labels_ == biggest_cluster].astype('int64'))

        # normalize the votes by the size of their parties:
        votes_out_of_party = biggest_cluster_voters / np.bincount(
            train.labels.astype('int32')).astype('float32')

        # commited_to_coalition_parties = parties with a majority of their votes in the cluster
        commited_to_coalition_parties = votes_out_of_party > 0.5
        percentage_of_voters_in_commited_coalition = \
            np.sum(biggest_cluster_voters[votes_out_of_party > 0.5]) * 1.0 / len(train.labels) * 1.0
        print(percentage_of_voters_in_commited_coalition)
        if percentage_of_voters_in_commited_coalition > 0.5:
            print("coalition found")
            parties_in_coalition = number_to_party_dictionary.keys()
            print("parties in coalition:{0}".format(
                [number_to_party_dictionary[k]
                 for k in np.array(list(number_to_party_dictionary.keys()))[votes_out_of_party > 0.5]]))
            break
        print("---------------")
def outlier_clusters_ward(x, y, skill=None, memory=None):
    # TODO: incorporate skill
    data = np.vstack((x, y)).T

    if len(data) == 0:
        # uh.
        print('clustering: NO cluster members!')
        cluster_centers = np.array([[-1, -1]])
        cluster_labels = []
        labels = []
        n_clusters = 0
        dist_within = np.array([])
    elif len(data) == 1:
        print('clustering: only 1 data point!')
        cluster_centers = data
        cluster_labels = [0]
        labels = np.array([0])
        n_clusters = 1
        dist_within = np.array([0])
    else:
        dist_within = 1000
        dist_max = 75
        n_clusters = 0
        n_clusters_max = 10

        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            memory=memory)

        # while dist_within > dist_max, keep adding clusters
        while (dist_within > dist_max) * (n_clusters < n_clusters_max):
            # iterate n_clusters
            n_clusters += 1
            clusterer.set_params(n_clusters=n_clusters)

            # cluster
            labels = clusterer.fit_predict(data)

            # get cluster_centers
            cluster_labels = range(n_clusters)
            cluster_centers = np.array([np.mean(data[labels == i], axis=0)
                                        for i in cluster_labels])

            # find dist_within: the maximum pairwise distance inside a cluster
            dist_within = np.max([np.max(pairwise_distances(
                data[labels == i])) for i in cluster_labels])

    dist_within_final = np.array([np.max(pairwise_distances(
        data[labels == i])) for i in cluster_labels])

    return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
def hierarchical_clustering(g, n_clusters=3):
    """Performs hierarchical clustering to the specified number of clusters."""
    from sklearn.cluster import AgglomerativeClustering
    distances = get_shortest_path_distance_matrix(g)
    ac = AgglomerativeClustering(n_clusters, linkage='average')
    labels = ac.fit_predict(distances)
    clusters = defaultdict(list)
    for node, label in zip(g.nodes(), labels):
        clusters[label].append(node)
    return clusters
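# Hedged usage sketch (illustrative): assumes g is a networkx graph and
# that get_shortest_path_distance_matrix() returns a dense node-by-node
# matrix of shortest-path lengths; a minimal stand-in for that assumed
# helper is given below for a connected graph.
import networkx as nx
import numpy as np
from collections import defaultdict

def get_shortest_path_distance_matrix(g):
    # all-pairs shortest-path lengths, arranged in node order
    nodes = list(g.nodes())
    lengths = dict(nx.all_pairs_shortest_path_length(g))
    return np.array([[lengths[u][v] for v in nodes] for u in nodes])

g = nx.karate_club_graph()
print(hierarchical_clustering(g, n_clusters=3))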
def ward_cluster(file_topic_matrix="PCA_matrix.pkl",
                 file_term_matrix="Tfidf_Matrix.pkl",
                 file_term_list="features.pkl",
                 n_clusters=500, truncate=25000, output="", tolerance=-0.2):
    # Only keep the first 10k articles by default. I couldn't get it to run
    # with 50k. However, I got it with 25k, and it took 45 minutes.
    start_time = time.time()

    # Load pickles
    topic_matrix = pd.read_pickle(file_topic_matrix)[:truncate, :]
    term_matrix = pd.read_pickle(file_term_matrix)[:truncate, :]
    term_list = pd.read_pickle(file_term_list)
    processing_time = (time.time() - start_time) / 60
    print("Current time: %.2f minutes. Files loaded." % processing_time)

    # Apply clustering
    clustering = AgglomerativeClustering(linkage="ward", n_clusters=n_clusters)
    classification = clustering.fit_predict(topic_matrix)
    processing_time = (time.time() - start_time) / 60
    print("Current time: %.2f minutes. Clustering done." % processing_time)

    # Translate to tree_node, generate label_tree and collapsed_tree
    full_tree = tree_to_nodes(clustering.children_, topic_matrix.shape[0])
    label_tree = get_label_tree(classify_tree(full_tree, classification))
    (topic_means, docs_in_cluster) = get_means(classification, topic_matrix)
    collapsed_tree = collapse_label_tree(label_tree, topic_means,
                                         docs_in_cluster, tolerance)
    processing_time = (time.time() - start_time) / 60
    print("Current time: %.2f minutes. Trees collapsed." % processing_time)

    # Assign names to each node of the tree based on most common links
    (term_means, docs_in_cluster) = get_means(classification, term_matrix)
    descriptive_tree = get_name_tree(collapsed_tree, term_means,
                                     docs_in_cluster, term_list)
    processing_time = (time.time() - start_time) / 60
    print("Current time: %.2f minutes. Node descriptions generated." % processing_time)

    # Write pickles
    # c_labels tells you which cluster each document is in
    pd.to_pickle(classification, output + 'c_labels.pkl')
    # save the uncollapsed tree in case I want to tweak that process later.
    pd.to_pickle(label_tree, output + 'uncollapsed_tree.pkl')
    # ward_tree shows how the clusters fit together in a tree structure
    pd.to_pickle(collapsed_tree, output + 'ward_tree.pkl')
    # descriptive_tree is the same as ward_tree, except that nodes are named
    # after most common links
    pd.to_pickle(descriptive_tree, output + 'descriptive_tree.pkl')
    # cluster_means are the vectors (in the PCA space) of each cluster
    pd.to_pickle(topic_means, output + 'topic_means.pkl')
    # link_means are the vectors (in link space) of each cluster
    pd.to_pickle(term_means, output + 'term_means.pkl')
    # docs_in_cluster tells you the size of each cluster
    pd.to_pickle(docs_in_cluster, output + 'docs_in_cluster.pkl')
    processing_time = (time.time() - start_time) / 60
    print("Current time: %.2f minutes. Pickles written." % processing_time)
def Create_Agg_cluster(self, stem, stop, processing, remS):
    Allrow_dicts = data_pkg.FileHandling.read_csv(self.StringsFile)
    Allstrings = [rowdict_str[self.clusterdfield] for rowdict_str in Allrow_dicts]
    if self.POS == "ALL":
        Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop)
                              for string_text in Allstrings]
    else:
        POS_Strings = list()
        if self.POS == "Noun_Verb_AdJ":
            POS_List = ["Noun", "Adj", "Verb"]
        elif self.POS == "Noun_AdJ":
            POS_List = ["Noun", "Adj"]
        else:
            print("Error in part of speech in function Create_Agg_cluster")
            sys.exit(0)
        for string in Allstrings:
            POS_String = Add_POS.ADDPOS_string(string, POS_List)["AllPOSstring"]
            POS_Strings.append(POS_String)
        Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop)
                              for string_text in POS_Strings]
    if remS:
        Allstrings_process = [preprocess_text.removeS(text)
                              for text in Allstrings_process]

    if self.vec == "CountVectorizer":
        vectorizer = CountVectorizer()
    elif self.vec == "TFIdfCountVectorizer":
        vectorizer = TfidfVectorizer()
    term_doc = vectorizer.fit_transform(Allstrings_process)
    # =======================================================================
    # svd = TruncatedSVD(n_components=5, random_state=42)
    # lsa = make_pipeline(svd, Normalizer(copy=False))
    # term_doc = lsa.fit_transform(term_doc)
    # term_doc = svd.fit_transform(term_doc)
    # =======================================================================
    feature_names = vectorizer.get_feature_names()

    if self.affinity == 'euclidean':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              affinity='euclidean')
    if self.affinity == 'cosine':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              linkage='average',
                                              affinity=self.affinity)
    if self.affinity == 'l1':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              linkage='average',
                                              affinity=self.affinity)
    Res_Labels = Agg_cluster.fit_predict(term_doc.toarray())
    self.cluster_tup_list = self.tuple_cluster_doc(Res_Labels, Allstrings, Allrow_dicts)
    # print(type(term_doc))
    self.metric = metrics.silhouette_score(term_doc.toarray(), Res_Labels,
                                           metric=self.affinity)
def hierarchical_cluster(champ_data, num_clusters):
    clusterer = AgglomerativeClustering(num_clusters)
    ids = sorted(champ_data.keys())
    X = []
    ordered_atts = sorted(champ_data[ids[0]].keys())
    for ID in ids:
        champ = champ_data[ID]
        X.append([champ[att] for att in ordered_atts])
    X = array(X)
    X_scaled = preprocessing.scale(X)
    clusters = clusterer.fit_predict(X_scaled)
    id_to_cluster = {ids[i]: clusters[i] for i in range(len(ids))}
    return id_to_cluster
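# Hedged usage sketch (illustrative): champ_data maps an id to a dict of
# numeric attributes, and every champion must share the same attribute keys.
# The snippet's imports (numpy's array, sklearn's preprocessing and
# AgglomerativeClustering) are assumed in scope.
champ_data = {
    1: {'attack': 8, 'defense': 4, 'magic': 2},
    2: {'attack': 2, 'defense': 3, 'magic': 9},
    3: {'attack': 7, 'defense': 5, 'magic': 1},
    4: {'attack': 1, 'defense': 2, 'magic': 8},
}
print(hierarchical_cluster(champ_data, num_clusters=2))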
def obtainClusters(self, hist):
    print('Obtaining clusters using Agglomerative Clustering from scikit-learn...')
    hist = np.array(hist)
    hist = hist.astype(float)
    scaled_vec = StandardScaler().fit_transform(hist)
    hc = AgglomerativeClustering(n_clusters=self.nclusters,
                                 linkage=self.linkage,
                                 affinity=self.dist)
    # obtain the clusters
    clusters = hc.fit_predict(scaled_vec, None)
    print('Clusters obtained.')
    return clusters
def cv_iteration(n_clusters=1, affinity='euclidean', linkage='ward'):
    X, y_train, _ = load_data()
    scores = []
    cms = []  # confusion matrices
    cluster_sizes = []
    random_states = (666, 69, 7, 13, 1337)
    for i in random_states:
        model = AgglomerativeClustering(n_clusters=n_clusters,
                                        affinity=affinity, linkage=linkage)
        predictions = model.fit_predict(X)
        score, confusion_matrix = scoring_function(y_train, predictions)
        scores.append(score)
        cms.append(serialise_confusion_matrix(confusion_matrix))
        cluster_sizes.append(serialise_confusion_matrix(
            np.unique(predictions, return_counts=True)))
    return {'result': scores,
            'confusion_matrices': eval(str(cms)),
            'score_name': string_enhancer(str(scoring_function)),
            'cluster_sizes': eval(str(cluster_sizes))}
def obtainCodebook(self, hist):
    print('Obtaining clusters using Agglomerative Clustering from scikit-learn...')
    scaled_vec = StandardScaler().fit_transform(hist)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(scaled_vec, n_neighbors=3, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    hc = AgglomerativeClustering(n_clusters=self.nclusters,
                                 linkage=self.linkage,
                                 connectivity=connectivity,
                                 compute_full_tree=False,
                                 affinity=self.dist)
    # obtain the codebook and the projections of the images on the
    # codebook (clusters of words)
    clusters = hc.fit_predict(scaled_vec, None)
    print('Clusters obtained.')
    return None, clusters
def _sort_clustering(data):
    try:
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.metrics.pairwise import euclidean_distances
    except ImportError:
        logging.error('Cannot use _sort_clustering without scikit-learn '
                      'installed.')
        raise

    # agglomerative clustering (the original variable was misleadingly
    # named "dbscan"); integer division keeps n_clusters an int on Python 3
    clusterer = AgglomerativeClustering(n_clusters=len(data) // 10,
                                        compute_full_tree=False)
    labels = clusterer.fit_predict(data)
    logging.debug('Clustering sort, assigned labels: {0}'
                  ''.format({l: len([x for x in labels if x == l])
                             for l in set(labels)}))

    data_by_labels = collections.defaultdict(list)
    for i, l in enumerate(labels):
        data_by_labels[l].append(data[i])
    numpy_by_labels = {l: numpy.array(data_by_labels[l])
                       for l in data_by_labels}
    centroids = {l: numpy.average(numpy_by_labels[l], axis=0)
                 for l in numpy_by_labels}
    pivot_centroid = centroids[0]

    # Get distances to pivot centroid
    distances_to_pivot = {
        l: euclidean_distances(numpy.array([centroids[l], pivot_centroid]))[0][1]
        for l in centroids}
    # sort Labels by Distances To Pivot (ldtp)
    ldtp = map(operator.itemgetter(0),
               sorted(distances_to_pivot.items(),
                      key=operator.itemgetter(1)))

    sorted_data_by_cluster = []
    for label in ldtp:
        for item in numpy_by_labels[label]:
            sorted_data_by_cluster.append(item)

    return numpy.array(sorted_data_by_cluster)
def Aglomerative_cl(feat_name):
    features = opening_data_target(feat_name)
    features = scale(features)
    ac = AgglomerativeClustering(n_clusters=4, linkage='average',
                                 affinity='cosine')  # cosine
    ac_preds = ac.fit_predict(features)
    # getting agglomerative tags - needed only for evaluation
    # ac_tags = validate_with_mappings(ac_preds, target, features)
    # Agglomerative results
    '''
    print 'Accuracy ', accuracy_score(target, ac_tags)
    print 'Precision ', precision_score(target, ac_tags)
    print 'Recall ', recall_score(target, ac_tags)
    print 'F1 ', f1_score(target, ac_tags)
    '''
    return ac_preds
def main():
    ratings, users, movies = readFromFile()
    global NO_OF_CLUSTERS, NO_OF_USERS
    userDataMatrix = getUserDataMatrix(ratings, users, movies)
    kmeansModel = KMeans(n_clusters=NO_OF_CLUSTERS, init="k-means++")
    agglomerativeClusteringModel = AgglomerativeClustering(
        n_clusters=NO_OF_CLUSTERS, affinity="euclidean")
    kmeansPredictedClusters = kmeansModel.fit_predict(userDataMatrix)
    aggClusteringPredictedClusters = agglomerativeClusteringModel.fit_predict(userDataMatrix)
    with open("../data/UserClustersKMeans.txt", "w") as KMeansOutFile:
        for i in range(1, NO_OF_USERS):
            KMeansOutFile.write("\t".join([str(i), str(kmeansPredictedClusters[i - 1])]) + "\n")
    with open("../data/UserClustersAgglomerativeClustering.txt", "w") as aggClusOutFile:
        for i in range(1, NO_OF_USERS):
            aggClusOutFile.write("\t".join([str(i), str(aggClusteringPredictedClusters[i - 1])]) + "\n")
def agglomerativeClustering(sourceFiles, fileExtension):
    """ Performs agglomerative hierarchical clustering using files with
    <fileExtension> in the <sourceFiles> directory and returns an accuracy
    measure """
    try:
        accuracy = 0
        # Step 1 - Check the required algorithm to specify the data type to load
        dataFiles = glob.glob("%s/*.%s" % (arguments.sourcedir, arguments.datatype))  # Get the paths of files to load
        dataSamples, dataLabels, loadedClusters = [], [], []
        for dataPoint in dataFiles:
            dataSamples.append([float(x) for x in open(dataPoint).read()[1:-1].split(",")])
            # Also load its cluster
            clusterName, paramNames = loadLabelFromFile(
                dataPoint.replace(".%s" % arguments.datatype, ".metadata"))
            if clusterName not in loadedClusters:
                loadedClusters.append(clusterName)
            dataLabels.append(loadedClusters.index(clusterName))
        prettyPrint("Successfully retrieved %s instances for clustering" % len(dataSamples))
        # Step 2 - Perform clustering
        clusterer = AgglomerativeClustering(n_clusters=len(loadedClusters))
        predicted = clusterer.fit_predict(numpy.array(dataSamples), dataLabels)
        accuracy = round(metrics.accuracy_score(dataLabels, predicted), 2)
    except Exception as e:
        prettyPrint("Error encountered: %s" % e, "error")
    return accuracy
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import ConvexHull
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Import Data
df = pd.read_csv(
    'https://raw.githubusercontent.com/selva86/datasets/master/USArrests.csv')

# Agglomerative Clustering
cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean',
                                  linkage='ward')
cluster.fit_predict(df[['Murder', 'Assault', 'UrbanPop', 'Rape']])

# Plot
plt.figure(figsize=(14, 10), dpi=80)
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=cluster.labels_, cmap='tab10')

# Encircle
def encircle(x, y, ax=None, **kw):
    if not ax:
        ax = plt.gca()
    p = np.c_[x, y]
    hull = ConvexHull(p)
    poly = plt.Polygon(p[hull.vertices, :], **kw)
    ax.add_patch(poly)

# Draw polygon surrounding vertices
    coo_Y = []  # list of y coordinates
    for j in range(len(C[i])):
        coo_X.append(C[i][j][0])
        coo_Y.append(C[i][j][1])
    pl.scatter(coo_X, coo_Y, marker='x',
               color=colValue[i % len(colValue)], label=i)
    pl.legend(loc='upper right')
    pl.show()

if __name__ == '__main__':
    data = DataS()
    data = np.array(data)
    labels_true = label_init()
    labels_super = label_init()
    # C = AGNES(data, dist_avg, 6)
    c = ['standard', 'periodic', 'increasing', 'decreasing',
         'increasing (upward shift)', 'decreasing (downward shift)']
    color = ['dodgerblue', 'orange', 'green', 'tomato', 'yellow', 'brown']
    pca = PCA(n_components=2)  # reduce to two dimensions with PCA
    newdata = pca.fit_transform(data)
    patches = [mpatches.Patch(color=color[i], label="{:s}".format(c[i]))
               for i in range(len(color))]
    for i in range(6):
        for j in range(100):
            plt.scatter(newdata[i * 100 + j][0], newdata[i * 100 + j][1], c=color[i])
    plt.legend(handles=patches, loc='upper right')
    plt.show()
    ac = AgglomerativeClustering(n_clusters=6, affinity='euclidean', linkage='complete')
    labels_super = ac.fit_predict(data)
    plt.scatter(newdata[:, 0], newdata[:, 1], c=labels_super)
    plt.show()
    # ARI evaluation: values range over [-1, 1]; the closer to 1, the better
    print(metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_super))
    # draw(C)
# ****************************
# Hierarchical clustering
# ****************************
hierarch_res = pd.DataFrame(
    columns=['it_id', 'linkage', 'micro', 'macro', 'silhouette'])

linkages = ['complete', 'average', 'single']
for linky in linkages:
    for i in range(nb_trials):
        aglo = AgglomerativeClustering(n_clusters=n_clusters,
                                       affinity='precomputed', linkage=linky)
        aglo_preds = aglo.fit_predict(dm)
        m, pred = misc(labels_oh, aglo_preds, True)
        sil = silhouette_score(dm, pred, metric='precomputed')
        micro = precision_score(labels_oh, pred, average='micro')
        macro = precision_score(labels_oh, pred, average='macro')
        hierarch_res = hierarch_res.append({'it_id': i + 1, 'linkage': linky,
                                            'micro': micro, 'macro': macro,
                                            'silhouette': sil},
                                           ignore_index=True)

hierarch_res.groupby('linkage').mean()
hierarch_res.groupby('linkage').std()
hierarch_res.to_csv(res_folder + '/hierarch_res_categ_encoded.csv')
arr = np.array(images)
arr = arr.reshape(2006, 1024)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

scaler = StandardScaler()
arr = scaler.fit_transform(arr)
arr = normalize(arr)

import pandas as pd
arr = pd.DataFrame(arr * 2550)

from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering().fit(arr)
opt = clustering.fit_predict(arr)

"""
sources = {}
for i in range(2006):
    if opt[i] not in sources:
        sources[opt[i]] = []
        sources[opt[i]].append(files[i])
    else:
        sources[opt[i]].append(files[i])
"""

import shutil
for i in range(0, 2006):
    if opt[i] == 0:
        shutil.copy("C:/Users/ManavChordia/Work/IUCAA/DI/cutouts/" + files[i],
                    "C:/Users/ManavChordia/Work/IUCAA/DI/cutout_0")
data = sc_X.fit_transform(data)

# Using the dendrogram to find the optimal number of clusters
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(data, method='ward'))
plt.title('Dendrogram')
plt.xlabel('TimeStamp')
plt.ylabel('Total Expenditure')
plt.show()

# Fitting Hierarchical Clustering to the dataset
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(data)

# Visualising the clusters
plt.scatter(data[y_hc == 0, 0], data[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(data[y_hc == 1, 0], data[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(data[y_hc == 2, 0], data[y_hc == 2, 1], s=100,
plt.axhline(y=1, color='r', linestyle='--')

# In[94]:

# we have 2 clusters as the line cuts the dendrogram at two points

# In[95]:

from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
cluster.fit_predict(cluster_data)

# In[97]:

plt.figure(figsize=(10, 7))
plt.scatter(cluster_data['SepalWidthCm'], cluster_data['PetalLengthCm'], c=cluster.labels_)

# ## USING K MEANS ON THE DATASET

# In[65]:

import numpy as np
X = data_drop.values

from sklearn.manifold import TSNE
tsne = TSNE(verbose=1, perplexity=40, n_iter=4000)
Y = tsne.fit_transform(X)

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300,
                tol=0.0001, precompute_distances='auto', verbose=0,
                random_state=None, copy_x=True, n_jobs=1, algorithm='auto')
kY = kmeans.fit_predict(X)

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.scatter(Y[:, 0], Y[:, 1], c=kY, cmap="jet", edgecolor="None", alpha=0.35)
ax1.set_title('k-means clustering')
ax2.scatter(Y[:, 0], Y[:, 1], c=datas['diagnosis'], cmap="jet", edgecolor="None", alpha=0.35)
ax2.set_title('Actual clusters')

from sklearn.cluster import AgglomerativeClustering
aggC = AgglomerativeClustering(n_clusters=2, linkage='ward')
kY = aggC.fit_predict(X)

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.scatter(Y[:, 0], Y[:, 1], c=kY, cmap="jet", edgecolor="None", alpha=0.35)
ax1.set_title('Hierarchical clustering')
ax2.scatter(Y[:, 0], Y[:, 1], c=datas['diagnosis'], cmap="jet", edgecolor="None", alpha=0.35)
ax2.set_title('Actual clusters')
tot_word = pd.read_csv('C:\\Users\Salvo\Desktop\IFISC\data\\no_words_per_county.csv',
                       sep=",", low_memory=False, index_col=[0])
print(tot_word)
word = np.asarray(tot_word.iloc[1:-1, 1].to_numpy(), dtype=float)
print(word)

# Plotting a scatter plot for each number of clusters
which_clusters = [3, 5, 7, 9]
# """
# KS
for i in which_clusters:
    # calling the object
    ks_cluster = AgglomerativeClustering(n_clusters=i, affinity='euclidean', linkage='ward')
    a = ks_cluster.fit_predict(myD)
    print(a)
    # a holds, at position i, the number of the cluster that the i-th county
    # belongs to (example, 3 clusters: a = [1,1,1,1,0,0,0,2,1,1])
    # Here we count how many counties are in each cluster
    # (example, 3 clusters: bin_count = [0, 3, 6, 1])
    unique, bin_count_temp = np.unique(a, return_counts=True)
    # with concatenate I put a 0 as the first element of bin_count
    bin_count = np.concatenate(([0], bin_count_temp))
    print(bin_count, len(bin_count))
    # b holds the county indexes
    b = np.arange(len(a))
    # Here I create a pandas dataframe to easily sort the county, latitude and
    # longitude arrays, following the cluster sorting.
df["Connectivity"] = df["Cellular"] | df["WiFi"] df = df.drop(["WiFi", "Cellular", "isInteractive"], axis=1) # Cause of correlation between columns # After the first run, you don't have to compute the distance matrix again. You can read it from the pickle file distance = gower.gower_matrix(df) with open("distance.pickle", "wb") as f: pickle.dump(distance, f) #distance = pickle.load( open( "distance.pickle", "rb" ) ) print("Done with distance!") del df modelAverage = AgglomerativeClustering(n_clusters=4, affinity="precomputed", linkage='average').fit(distance) labels = modelAverage.fit_predict(distance) silhouette_score = metrics.silhouette_score(distance, labels, metric="precomputed") print(f"silhouette = {silhouette_score} for average and 4 clusters.") modelComplete = AgglomerativeClustering(n_clusters=5, affinity="precomputed", linkage='complete').fit(distance) labels = modelComplete.fit_predict(distance) silhouette_score = metrics.silhouette_score(distance, labels, metric="precomputed") print(f"silhouette = {silhouette_score} for complete and 5 clusters.") # Save models to keep the same numbering on clusters ...
def get_tasks():
    pca = PCA()
    data = request.json
    dataset = pd.read_csv(data['filename'])
    # dataset = pd.read_csv('D:/sem1/VDS/symptom_project/cs529-project/data/output/multi-dim-5-timepoints-0_t.csv')
    symp = data['symptoms']
    symp1 = deepcopy(symp)
    # for el in symp1:
    #     symp.append(el + '1')
    # for el in symp1:
    #     symp.append(el + '2')
    # for el in symp1:
    #     symp.append(el + '3')
    # for el in symp1:
    #     symp.append(el + '4')
    z = dataset[symp]
    patientId = data['patientId']
    x = z.values
    patients = dataset.iloc[:, list(range(28))]
    # patients = dataset.iloc[:, list(range(59))]
    # patients = dataset.iloc[:, list(range(87))]
    # patients = dataset.iloc[:, list(range(115))]
    # patients = dataset.iloc[:, list(range(142))]
    print(patients)
    p = patients[symp]
    x_pca = pca.fit_transform(p)
    x_pca = pd.DataFrame(x_pca)
    if len(symp) == 1:
        x_pca['PC2'] = [[0]] * len(patients)
    x_pca = x_pca.iloc[:, list(range(2))]
    x_pca.columns = ['PC1', 'PC2']
    dataset['PC1'] = x_pca['PC1']
    dataset['PC2'] = x_pca['PC2']
    x_pca_pc1 = x_pca['PC1'].to_numpy()
    x_pca_pc1 = (x_pca_pc1 - x_pca_pc1.mean()) / np.std(x_pca_pc1)
    if len(symp) != 1:
        x_pca_pc2 = x_pca['PC2'].to_numpy()
        x_pca_pc2 = (x_pca_pc2 - x_pca_pc2.mean()) / np.std(x_pca_pc2)
        dataset['PC2'] = x_pca_pc2
    dataset['PC1'] = x_pca_pc1
    print(x_pca_pc1)
    if (len(patientId) > 0):
        l = []
        d = {}
        dp = {}
        for i, row in dataset.iterrows():
            l.append([row['PC1'], row['PC2']])
            d[int(row['patientId'])] = i
            dp[i] = int(row['patientId'])
        crt_patient = d[int(patientId)]
        distance_list = []
        for i, elem in enumerate(l):
            distance_list.append([
                i,
                math.sqrt((elem[0] - l[crt_patient][0])**2 +
                          (elem[1] - l[crt_patient][1])**2)
            ])
        distance_list.sort(key=lambda x: x[1])
        return jsonify([
            dp[distance_list[1][0]],
            dp[distance_list[2][0]],
            dp[distance_list[3][0]]
        ])
    h = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
    y = h.fit_predict(x)
    dataset['cluster'] = y
    dataset = dataset.transpose()
    dataset = dataset.to_dict()
    sum0 = -1
    sum1 = -1
    for index, row in dataset.items():
        if row['cluster'] == 0:
            sum0 = row['sum']
        if row['cluster'] == 1 and sum0 != -1:
            sum1 = row['sum']
            break
    while sum1 == sum0:
        sum0 = -1
        for index, row in dataset.items():
            if index > 0:
                if row['cluster'] == 0:
                    sum0 = row['sum']
                    break
    if sum0 > sum1:
        for index, row in dataset.items():
            if int(row['cluster']) == 0:
                row['cluster'] = 1
            else:
                row['cluster'] = 0
    return jsonify(dataset)
index=False)

# K-Means elbow method: compute the sum of squared errors for different K
import matplotlib.pyplot as plt
sse = []
for k in range(1, 204):
    # k-means algorithm
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(train_x)
    # inertia_ is the within-cluster sum of squared errors
    sse.append(kmeans.inertia_)
x = range(1, 204)
plt.xlabel('K')
plt.ylabel('SSE')
plt.plot(x, sse, 'o-')
plt.show()

# Hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt
model = AgglomerativeClustering(linkage='ward', n_clusters=3)
y = model.fit_predict(train_x)
print(y)
linkage_matrix = ward(train_x)
dendrogram(linkage_matrix)
plt.show()
x = [station['loc']['coordinates'][0] for station in all_stations]
y = [station['loc']['coordinates'][1] for station in all_stations]
X = np.array((x, y))
X = X.T
try:
    mongo_bulk = mongo_db.stations.initialize_ordered_bulk_op()
    mongo_bulk.find({}).update({'$set': {'clusters': []}})
    for n_clusters in reversed(range_clusters):
        model = AgglomerativeClustering(linkage='ward', connectivity=None,
                                        n_clusters=n_clusters)
        labels = model.fit_predict(X)
        for label in range(len(np.unique(labels))):
            cluster_assign = labels == label
            cluster = X[cluster_assign]
            average = np.average(cluster, 0)
            middle = cluster[KDTree(cluster).query(average)[1]]
            indexes = np.where((X == middle).all(axis=1))[0]
            if len(indexes) > 1:
                stations = list(
                    mongo_db.stations.find(
                        {
                            '_id': {
                                '$in':
data = customer_data.iloc[:, 3:5].values

# In[10]:

# create the dendrograms for our dataset.
import scipy.cluster.hierarchy as shc  # import scipy for dendrograms

plt.figure(figsize=(10, 7))
plt.title("Customer Dendrograms")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

# In[11]:

# group the data points into these k(5) clusters

# In[12]:

from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
cluster.fit_predict(data)

# In[13]:

plt.figure(figsize=(10, 7))
plt.scatter(data[:, 0], data[:, 1], c=cluster.labels_, cmap='rainbow')

# In[ ]:
# dendrogram method - finding the optimal number of clusters
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(x, method='ward'))
plt.title('dendrogram')
plt.xlabel('customers')
plt.ylabel('distance')
plt.show()

# fitting hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
yhc = hc.fit_predict(x)

# visualising clusters
plt.scatter(x[yhc == 0, 0], x[yhc == 0, 1], s=100, c='red', label='cluster 1')
plt.scatter(x[yhc == 1, 0], x[yhc == 1, 1], s=100, c='blue', label='cluster 2')
plt.scatter(x[yhc == 2, 0], x[yhc == 2, 1], s=100, c='green', label='cluster 3')
plt.scatter(x[yhc == 3, 0], x[yhc == 3, 1], s=100, c='yellow', label='cluster 4')
plt.scatter(x[yhc == 4, 0],
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('/content/gdrive/My Drive/Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values
y = dataset.iloc[:, 3].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)

plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='magenta', label='Cluster 5')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
def silhouette(X, alg="kmeans", max_dec=5):
    assert alg in ['agglomerative', 'kmeans'], "alg must be kmeans or agglomerative"
    x_plot = []
    silhouette_plot = []
    sse_plot = []
    max_silhouette = -1
    # x_plot.append(1)
    # silhouette_plot.append(max_silhouette)
    num_dec = 0
    n_clusters = 1
    fig, ax1 = plt.subplots(1)
    ax1.set_title("Silhouette score for each cluster number",
                  fontsize=16, fontweight='bold')
    fig.set_size_inches(18, 7)
    plt.grid(b=True, which='major', color='#666666', linestyle='-')
    plt.minorticks_on()
    plt.grid(b=True, which='minor', color='#999999', linestyle='-', alpha=0.2)
    hl, = plt.plot([], [])
    best_labels = []
    while num_dec <= max_dec:
        n_clusters += 1
        if alg == 'agglomerative':
            clusterer = AgglomerativeClustering(n_clusters=n_clusters)
        elif alg == 'kmeans':
            clusterer = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        x_plot.append(n_clusters)
        silhouette_plot.append(silhouette_avg)
        if silhouette_avg < max_silhouette:
            num_dec += 1
        else:
            best_labels = cluster_labels
            max_silhouette = silhouette_avg
            num_dec = 0
    if max_silhouette < 0.2:
        best_labels = [0] * len(X)
    # silhouette_plot = silhouette_plot / max(silhouette_plot)
    ax1.plot(x_plot, silhouette_plot, label='silhouette')
    # sse_plot = sse_plot / max(sse_plot)
    # ax1.plot(x_plot, sse_plot, label='inertia')
    ax1.set_xticks([i + 1 for i in range(n_clusters)])
    ax1.legend()
    ax1.set_xlabel("Number of clusters")
    ax1.set_ylabel("Silhouette score")
    plt.show()
    print(f"Best cluster number is {len(set(best_labels))}")
    return best_labels
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from matplotlib import pyplot as plt

X, Y = make_blobs(n_samples=2000, n_features=2, centers=8, cluster_std=2.0)
plt.scatter(X[:, 0], X[:, 1], s=4)
plt.title('Generated Data')
plt.show()

Z = AgglomerativeClustering(n_clusters=8, linkage='complete')
P = Z.fit_predict(X)

colormap = np.array(['r', 'g', 'b', 'k', 'y', 'c', 'm', 'orange'])
plt.scatter(X[:, 0], X[:, 1], s=4, c=colormap[P])
plt.title('Clustering results')
plt.show()
def HC_predict(data, kclusters):
    """Fit hierarchical clustering on data and return the cluster labels."""
    hc = AgglomerativeClustering(n_clusters=kclusters,
                                 affinity='euclidean', linkage='ward')
    return hc.fit_predict(data)
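# Hedged usage sketch (illustrative): Ward linkage on a small random
# feature matrix; the sklearn import assumed by the helper is shown here.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

labels = HC_predict(np.random.rand(30, 4), kclusters=3)
print(labels)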
data = pd.DataFrame(dictionary)

plt.scatter(x1, y1)
plt.scatter(x2, y2)
plt.scatter(x3, y3)
plt.show()

# %% dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram

merg = linkage(data, method="ward")
dendrogram(merg, leaf_rotation=90)
plt.xlabel("data points")
plt.ylabel("euclidean distance")
plt.show()

# %% HC
from sklearn.cluster import AgglomerativeClustering

hierarchical_cluster = AgglomerativeClustering(n_clusters=3, affinity="euclidean",
                                               linkage="ward")
cluster = hierarchical_cluster.fit_predict(data)

data["label"] = cluster

plt.scatter(data.x[data.label == 0], data.y[data.label == 0], color="red")
plt.scatter(data.x[data.label == 1], data.y[data.label == 1], color="green")
plt.scatter(data.x[data.label == 2], data.y[data.label == 2], color="blue")
plt.show()
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as shc
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering

customer_data = pd.read_csv('shopping_data.csv')
print(customer_data.shape)
dt = np.array(customer_data)
# print(dt[0:10, :])
print(customer_data.head())
print(customer_data.iloc[0:10, 0:5].values)

data = customer_data.iloc[0:15, 3:5].values
print('--Linkage Matrix-only 10 rows------')
link = shc.linkage(data, method='ward')
print(link[0:10, ])

plt.figure(figsize=(6, 4))
plt.title("Customer Dendrograms")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
clt = cluster.fit_predict(data)
print('--Clusters formed by program-----')
print(clt)

plt.figure(figsize=(6, 4))
plt.scatter(data[:, 0], data[:, 1], c=cluster.labels_, cmap='rainbow')
plt.show()
with open(file, "rb") as f:
    pdf = pdftotext.PDF(f)
    score_sheet = pdf[2]
    SS.append(" ".join(
        jieba.cut(re.sub('\W|\d|[a-zA-Z]', '', score_sheet), cut_all=False)))

vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(SS)
SS_sim_mat = (tfidf * tfidf.T).A
linkage_matrix = ward(SS_sim_mat)

cluster = AgglomerativeClustering(n_clusters=N_Group, affinity='euclidean',
                                  linkage='ward')
cluster.fit_predict(SS_sim_mat)

os.mkdir(f'{ODIR}/manual')
for i in range(N_Group):
    os.mkdir(f'{ODIR}/manual/G{i}')
for i, file in enumerate(manual_files):
    copyfile(
        file,
        f'{ODIR}/manual/G' + str(cluster.labels_[i]) + '/' + file.split('/')[-1])

# create template excel files to start manual work
for i in range(N_Group):
    temp = [
        f for j, f in enumerate(manual_files) if cluster.labels_[j] == i
    ]
plt.axhline(y=8, c='black', lw=2, linestyle='dashed')
# from scipy.cluster.hierarchy import fcluster
# d = shc.linkage(X_principal, method='ward')

ac2 = AgglomerativeClustering(n_clusters=2, compute_full_tree=True)
labels = ac2.fit_predict(X_new)  # fit once instead of refitting per point

# Visualizing the clustering
plt.figure(figsize=(6, 6))
color = ['b', 'r']
for i in range(X_new.shape[0]):
    plt.scatter(X_principal[i, 0], X_principal[i, 1],
                c=color[labels[i]], cmap='rainbow')
'''
for i in range(35):
    plt.annotate(z[i], (X_principal[i, 0], X_principal[i, 1]))
'''
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.axis('off')

DX = pd.DataFrame(labels, columns=["label"])
df_concat = pd.concat([dfnew, DX], axis=1)
print(("total samples is: ") + str(df_concat.shape[0]))
# for i in clusted_list_title:
#     print("Cluster: ", i, "Quality: ", len(clusted_list_title.get(i)))
#     for title in clusted_list_title.get(i):
#         # print(title)
#         dem += 1
# print("Title Quality: ", dem)

print("Agglomerative")
# https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/
# cluster5 = AgglomerativeClustering(n_clusters=12, affinity='precomputed', linkage='complete')
cluster5 = AgglomerativeClustering(n_clusters=None, affinity='precomputed',
                                   linkage='complete', distance_threshold=0.88)
# clusted = model.fit_predict(linkage_matrix)
cluster5.fit_predict(dist2)

clus_list = []
for i in range(0, len(cluster5.labels_)):
    clus_dict = {
        'id': list_id[i],
        'old_cluster': list_old_cluster[i],
        'new_cluster': cluster5.labels_.item(i),
        'title': list_title[i]
    }
    clus_list.append(clus_dict)
clus_list.sort(key=lambda item: item.get("new_cluster"))
for cl in clus_list:
    print(cl)
# for i in range(0, 209):
#     print(i, ": ", linkage_matrix[i])
import scipy.cluster.hierarchy as shc

plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(shc.linkage(data_scaled, method='ward'))

# %%%%
# The x-axis contains the samples and the y-axis represents the distance
# between these samples. The vertical line with maximum distance is the blue
# line, hence we can decide on a threshold of 6 and cut the dendrogram:
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(shc.linkage(data_scaled, method='ward'))
plt.axhline(y=6, color='r', linestyle='--')

# We have two clusters as this line cuts the dendrogram at two points.
# Let's now apply hierarchical clustering for 2 clusters:
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
cluster.fit_predict(data_scaled)

# %%%
# We can see the values of 0s and 1s in the output since we defined 2
# clusters. 0 represents points that belong to the first cluster and 1
# represents points in the second cluster. Let's now visualize the two clusters:
plt.figure(figsize=(10, 7))
plt.scatter(data_scaled['Milk'], data_scaled['Grocery'], c=cluster.labels_)

# %%%%
# selecting the number of clusters
# https://www.analyticsvidhya.com/wp-content/uploads/2016/11/clustering-7.png
# https://www.analyticsvidhya.com/blog/2016/11/an-introduction-to-clustering-and-different-methods-of-clustering/
plt.show()

fig = plt.figure(figsize=(8, 8), facecolor="white")
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
row_dendr = dendrogram(row_clusters, orientation='left')
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
axd.set_xticks([])
axd.set_yticks([])
for i in axd.spines.values():
    i.set_visible(False)
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))
plt.show()

ac = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete')
labels = ac.fit_predict(X)
print('Cluster Label: %s' % labels)

ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete')
labels = ac.fit_predict(X)
print('Cluster Label: %s' % labels)
# -*- coding: utf-8 -*-
from numpy import *
from matplotlib.pyplot import *
from sklearn.cluster import AgglomerativeClustering

# Clustering based on cosine similarity
random.seed(0)
N = 300
x = random.uniform(-1, 1, N)
y = random.uniform(-1, 1, N)

xlim(-1, 1)
ylim(-1, 1)
scatter(x, y, s=50, cmap=cm.rainbow)
show()

cls = AgglomerativeClustering(n_clusters=10, affinity="cosine", linkage="average")
labels = cls.fit_predict(c_[x, y])

xlim(-1, 1)
ylim(-1, 1)
scatter(x, y, s=50, c=labels, cmap=cm.rainbow)
show()
def clustering_tfidf(indir, level=None):
    datadir = indir + '/level-' + level
    lab_to_idx, idx_to_lab = _load_vocab(datadir, ut.file_names['vocab'])
    _, behrs = _load_data(datadir, ut.file_names['behr'])

    terms = []
    for vec in behrs.values():
        terms.extend(vec)
    count = 0
    list_count = {}
    for idx, lab in idx_to_lab.items():
        co = terms.count(str(idx))
        list_count[lab] = co
        if co > 1:
            count += 1
    print("Number of repeated terms: {0} -- Terms with one occurrence: {1}\n".format(
        count, len(lab_to_idx) - count))

    print('Most frequent terms (TF>20)')
    x = []
    y = []
    for lab, co in list_count.items():
        if co > 20:
            x.append(lab)
            y.append(co)
            print('%s, %d' % (lab, co))
        else:
            x.append('TF<20')
            y.append(co)
    plt.figure(figsize=(30, 20))
    plt.bar(x, y)
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(datadir, 'term20-distribution.png'))

    plt.figure(figsize=(20, 10))
    plt.bar(range(len(list_count.values())), list(list_count.values()))
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(datadir, 'term-distribution.png'))
    print('\n')

    # TF-IDF
    print('Computing TF-IDF matrix...')
    doc_list = list(map(lambda x: ' '.join(x), list(behrs.values())))
    id_subj = [id_lab for id_lab in behrs]
    vectorizer = TfidfVectorizer(norm='l2')
    tfidf_mtx = vectorizer.fit_transform(doc_list)

    print('Performing SVD on the TF-IDF matrix...')
    reducer = TruncatedSVD(n_components=ut.n_dim, random_state=123)
    encoded_dt = reducer.fit_transform(tfidf_mtx)

    # Internal clustering validation
    rf = RandomForestClassifier(criterion='entropy', random_state=42)
    best = 0
    for n_clu in range(ut.min_cl, ut.max_cl):
        hclu = AgglomerativeClustering(n_clusters=n_clu)
        lab_cl = hclu.fit_predict(encoded_dt)
        tmp_silh = silhouette_score(encoded_dt, lab_cl)
        print('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, tmp_silh))
        enc_tr, enc_ts, lab_tr, lab_ts = train_test_split(encoded_dt, lab_cl,
                                                          stratify=lab_cl,
                                                          test_size=0.25,
                                                          random_state=42)
        rf.fit(enc_tr, lab_tr)
        rf_predict = rf.predict(enc_ts)
        tmp_mcc = matthews_corrcoef(lab_ts, rf_predict)
        print('    MCC RF classifier: %.2f' % tmp_mcc)
        mu = np.mean([tmp_mcc, tmp_silh])
        if mu > best:
            best_mcc = tmp_mcc
            best_silh = tmp_silh
            best_lab_cl = lab_cl
            best_n_clu = n_clu
            best = mu
    print('\n')
    print("MCC: %.4f -- silhouette score: %.4f -- Number of clusters: %d\n" % (
        best_mcc, best_silh, best_n_clu))

    num_count = np.unique(best_lab_cl, return_counts=True)[1]
    for idx, nc in enumerate(num_count):
        print("Cluster {0} -- Numerosity {1}".format(idx, nc))
    print('\n')

    colormap = [c for c in ut.col_dict if c not in ut.c_out]
    colormap_rid = [colormap[cl] for cl in sorted(list(set(best_lab_cl)))]
    colors_en = [colormap_rid[v] for v in best_lab_cl]
    umap_mtx = umap.UMAP(random_state=42).fit_transform(encoded_dt)
    single_plot(datadir, umap_mtx, best_lab_cl, colors_en)

    linked = linkage(encoded_dt, 'ward')
    # Color mapping
    dflt_col = "#808080"  # Unclustered gray
    # * rows in Z correspond to "inverted U" links that connect clusters
    # * rows are ordered by increasing distance
    # * if the colors of the connected clusters match, use that color for link
    link_cols = {}
    for i, i12 in enumerate(linked[:, :2].astype(int)):
        c1, c2 = (link_cols[x] if x > len(linked)
                  else colormap_rid[best_lab_cl[x]] for x in i12)
        link_cols[i + 1 + len(linked)] = c1 if c1 == c2 else dflt_col

    plt.figure(figsize=(20, 10))
    # Dendrogram
    dendrogram(Z=linked, labels=best_lab_cl, color_threshold=None,
               leaf_font_size=5, leaf_rotation=0,
               link_color_func=lambda x: link_cols[x])
    plt.savefig(os.path.join(datadir, 'dendrogram-tfidf.png'))

    with open(os.path.join(indir, 'person-demographics.csv')) as f:
        rd = csv.reader(f)
        next(rd)
        dem = {r[0]: r[1::] for r in rd}

    df_ar = []
    for id_name, coord, cl_lab in zip(id_subj, umap_mtx, best_lab_cl):
        df_ar.append([id_name, coord[0], coord[1], cl_lab,
                      age(dem[id_name][0]), dem[id_name][2], dem[id_name][3]])
    df_ar = np.array(df_ar)
    df = pd.DataFrame(df_ar, columns=['id_subj', 'x', 'y', 'cluster',
                                      'age', 'sex', 'n_enc'])
    df['x'] = df['x'].astype('float64')
    df['y'] = df['y'].astype('float64')
    df['age'] = df['age'].astype('float64')
    df['n_enc'] = df['n_enc'].astype('int')

    p_clu = {}
    with open(os.path.join(datadir, 'person-cluster.txt'), 'w') as f:
        wr = csv.writer(f)
        wr.writerow(['ID_LAB', 'CLUSTER'])
        for el in df_ar:
            wr.writerow([el[0], el[3]])
            p_clu[el[0]] = el[3]

    source = ColumnDataSource(dict(
        x=df['x'].tolist(),
        y=df['y'].tolist(),
        id_subj=df['id_subj'].tolist(),
        cluster=[str(i) for i in df['cluster'].tolist()],
        age=df['age'].tolist(),
        sex=df['sex'].tolist(),
        n_enc=df['n_enc'].tolist()))
    labels = [str(i) for i in df['cluster'].tolist()]
    cmap = CategoricalColorMapper(factors=sorted(pd.unique(labels)),
                                  palette=colormap_rid)
    TOOLTIPS = [('id_subj', '@id_subj'), ('cluster', '@cluster'),
                ('sex', '@sex'), ('age', '@age'), ('n_enc', '@n_enc')]
    plotTools = 'box_zoom, wheel_zoom, pan, crosshair, reset, save'
    output_file(filename=os.path.join(datadir, 'tfidf-plot-interactive.html'),
                mode='inline')
    p = figure(plot_width=800, plot_height=800, tools=plotTools)
    p.add_tools(HoverTool(tooltips=TOOLTIPS))
    p.circle('x', 'y', legend='cluster', source=source,
             color={"field": 'cluster', "transform": cmap})
    save(p)

    freq_term(best_lab_cl, idx_to_lab, behrs, p_clu)