def spectral_clustering(A, nb_clusters, laplacian_normalization = None, algo = None):
    """
    Assign cluster labels with a spectral clustering pipeline.

    Steps:
      * build the Laplacian of ``A`` with ``get_laplacian``
      * compute ``nb_clusters`` eigenpairs with ``eigsh(..., which='SM')``
      * fit a k-means model on the transposed eigenvector matrix
      * label every row of the Laplacian with that model

    ``algo`` picks the k-means flavour: ``None`` uses ``KMeans`` and
    ``'sph'`` uses NLTK's ``KMeansClusterer`` with cosine distance and
    25 repeats. Any other value raises ``Exception``.
    """
    if algo not in ('sph', None):
        raise Exception('Algorithm {} unknown'.format(algo))

    laplacian = scipy.sparse.csr_matrix(
        get_laplacian(A, laplacian_normalization), dtype=np.float64)
    # Only the eigenvectors are used; the eigenvalues are discarded.
    _, eigenvectors = eigsh(laplacian, nb_clusters, which='SM')
    training_points = np.transpose(eigenvectors)

    if algo == 'sph':
        spherical = KMeansClusterer(
            nb_clusters,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        spherical.cluster(training_points, True)
        rows = [
            np.transpose(laplacian[row_idx, :].toarray()[0])
            for row_idx in range(laplacian.shape[1])
        ]
        return [spherical.classify(row) for row in rows]

    model = KMeans(n_clusters=nb_clusters)
    model.fit(training_points)
    return model.predict(laplacian)
def nltk_clustering(n, filename):
    """Cluster the module-level ``vectors`` into ``n`` groups and save the names per cluster.

    Reads the module-level ``vectors``, ``names`` and ``repeats``. The result
    is written to ``<filename minus its last 4 characters>_<n>_clustered.txt``:
    one ``Cluster <id> (description: )`` header per cluster followed by the
    names assigned to it.

    Unlike the previous version, the module-level ``names`` is left untouched.
    Rebinding it to a reordered list broke its alignment with ``vectors`` and
    made a second call fail.

    :param n: number of clusters
    :param filename: input file name; its last four characters are stripped
        to build the output name (assumes a 3-letter extension such as ".txt")
    :return: None
    """
    # Clustering
    print("Begin clustering, n = {:d}...".format(n))
    clusterer = KMeansClusterer(n, cosine_distance, repeats=repeats)
    labels = np.array(
        clusterer.cluster(vectors, assign_clusters=True, trace=False))

    # A stable sort keeps names in their input order inside each cluster.
    order = labels.argsort(kind="mergesort")
    ordered_labels = labels[order]
    ordered_names = np.asarray(names)[order]

    # write result to file
    print("Saving result to file...")
    output = filename[:-4] + "_" + str(n) + "_clustered.txt"
    with open(output, "w") as f:
        current_idx = None
        for name, idx in zip(ordered_names, ordered_labels):
            if current_idx != idx:
                # First name of a new cluster: emit the cluster header.
                current_idx = idx
                f.write("\nCluster {:d} (description: )\n".format(int(current_idx)))
            f.write(name + "\n")
def clusterer_nltk_kmeans(X, n_clusters):
    """Cluster the rows of ``X`` with NLTK k-means under cosine distance.

    Clustering only runs when ``X`` is a non-empty ``numpy.ndarray`` whose
    dtype is ``float32`` or ``float64``; otherwise no labels are produced.

    :param X: vectors to cluster
    :param n_clusters: number of clusters to form (previously ignored in
        favour of an undefined ``params['n_clusters']``)
    :return: tuple ``(X, labels)`` where ``labels`` is the list returned by
        ``KMeansClusterer.cluster(..., assign_clusters=True)``, or ``None``
        when ``X`` did not qualify for clustering
    """
    import numpy as np

    print('clusterer_nltk_kmeans')
    labels = None
    is_float_array = (
        type(X) is np.ndarray
        and len(X) > 0
        and X.dtype.type in (np.float32, np.float64)
    )
    if is_float_array:
        # NLTK is imported only when clustering actually runs.
        import nltk
        from nltk.cluster.kmeans import KMeansClusterer

        clusterer = KMeansClusterer(
            n_clusters,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25,
            avoid_empty_clusters=True)
        labels = clusterer.cluster(X, assign_clusters=True)
    return (X, labels)
def nltk_manhattan_kmeans(encoded_img):
    """Split ``encoded_img`` into two clusters with NLTK k-means.

    Uses SciPy's ``cityblock`` (Manhattan) distance with 10 repeats and
    passes the resulting assignments to ``print_labels``.
    """
    from scipy.spatial.distance import cityblock
    from nltk.cluster.kmeans import KMeansClusterer

    manhattan_kmeans = KMeansClusterer(2, distance=cityblock, repeats=10)
    print_labels(manhattan_kmeans.cluster(encoded_img, assign_clusters=True))
def nltk_euclidean_kmeans(encoded_img):
    """Split ``encoded_img`` into two clusters with NLTK k-means.

    Uses NLTK's ``euclidean_distance`` with 10 repeats and passes the
    resulting assignments to ``print_labels``.
    """
    from nltk.cluster.util import euclidean_distance
    from nltk.cluster.kmeans import KMeansClusterer

    euclidean_kmeans = KMeansClusterer(
        2, distance=euclidean_distance, repeats=10)
    print_labels(euclidean_kmeans.cluster(encoded_img, assign_clusters=True))
def spherical_clustering_from_adjency(A, nb_clusters):
    """
    Spectral clustering with spherical k-means, applied to the adjacency matrix.

    Computes ``nb_clusters`` eigenpairs of ``A`` with
    ``eigsh(..., which='LM')``, fits NLTK's cosine-distance k-means
    (25 repeats) on the transposed eigenvector matrix, and classifies
    every row of ``A`` with the fitted model.

    Returns the list of cluster labels, one per row.
    """
    adjacency = scipy.sparse.csr_matrix(A, dtype=np.float64)
    # Only the eigenvectors are used; the eigenvalues are discarded.
    _, eigenvectors = eigsh(adjacency, nb_clusters, which='LM')

    spherical = KMeansClusterer(
        nb_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25)
    spherical.cluster(np.transpose(eigenvectors), True)

    rows = [
        np.transpose(adjacency[row_idx, :].toarray()[0])
        for row_idx in range(adjacency.shape[1])
    ]
    return [spherical.classify(row) for row in rows]
def get_cluster(tfidf_arr, k):
    """
    K-means clustering of TF-IDF vectors; the labels are appended to a text file.

    :param tfidf_arr: iterable of document vectors to cluster
    :param k: number of clusters
    :return: None; one ``<index>\\t<cluster>`` line per vector is appended to
        ``/you_filed_algos/prod_kudu_data/ClusterText.txt``
    """
    # Split into k clusters, using cosine distance as the similarity measure.
    kmeans = KMeansClusterer(num_means=k, distance=cosine_distance, avoid_empty_clusters=True)
    kmeans.cluster(tfidf_arr)

    # Look up the cluster of every input vector.
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])

    # The context manager closes the file even if a write fails.
    with open('/you_filed_algos/prod_kudu_data/ClusterText.txt', 'a+', encoding='utf-8') as fw:
        for i, v in kinds.items():
            fw.write(str(i) + '\t' + str(v) + '\n')
def Kmeans(self, volcabulary, vectors, n_cluster):
    """Group vocabulary words by k-means over their vectors (cosine distance).

    Runs NLTK's ``KMeansClusterer`` with ``n_cluster`` clusters, 10 repeats
    and ``avoid_empty_clusters=True``.

    Returns ``(labels, groups)``: the per-vector cluster labels and a
    ``defaultdict(list)`` mapping each label to the words paired with it,
    where ``volcabulary`` and the labels are paired positionally.
    """
    cosine = nltk.cluster.util.cosine_distance
    clusterer = KMeansClusterer(
        n_cluster, distance=cosine, repeats=10, avoid_empty_clusters=True)
    labels = clusterer.cluster(vectors, assign_clusters=True)

    words_by_cluster = defaultdict(list)
    for label, word in zip(labels, volcabulary):
        words_by_cluster[label].append(word)
    return labels, words_by_cluster
def new_cluster(filepath):
    """Cluster the rows loaded from ``filepath`` into 4 groups and print them.

    Data is loaded with ``get_data`` and clustered by NLTK k-means
    (1000 repeats) under the Chebyshev distance ``max(|a - b|)``. The
    earlier ``max(a - b)`` was signed and asymmetric, so it could be
    negative and was not a usable distance for nearest-mean assignment.

    Prints one line per row, grouped by cluster id; joints are numbered
    from 1.

    :param filepath: path handed to ``get_data``
    :return: None
    """
    NUM_CLUSTERS = 4
    data = get_data(filepath)
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=lambda a, b: np.max(np.abs(a - b)),
        repeats=1000)
    labels = kclusterer.cluster(data, assign_clusters=True)

    print("Showing the cluster results")
    for cluster_id in range(NUM_CLUSTERS):
        for i in range(len(data)):
            if labels[i] == cluster_id:
                print("Joint : ", i + 1, " Joint Values: ", data[i], " Cluster Id: ", cluster_id)
def ClusterItems(data_file, items_bias_file, index_file, clusters_file, centroids_file):
    """Cluster the low-bias popular items and save labels and centroids.

    Loads item vectors from ``data_file`` and keeps the rows listed in
    ``index_file``. Of those, only the rows whose bias (from
    ``items_bias_file``) has absolute value below 0.2 are clustered, using
    NLTK k-means with cosine distance and the module-level
    ``NUM_CLUSTERS``. The centroids go to ``centroids_file`` and the
    per-row labels to ``clusters_file`` (both via ``np.savetxt``).
    """
    all_vectors = np.genfromtxt(data_file)
    popular_items = np.genfromtxt(index_file).astype('int')
    popular_vectors = all_vectors[popular_items]

    popular_bias = np.genfromtxt(items_bias_file)[popular_items]
    low_bias_rows = np.nonzero(np.abs(popular_bias) < 0.2)[0]

    clusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance)
    print(NUM_CLUSTERS, low_bias_rows.shape)
    print("end", popular_vectors.shape)

    labels = clusterer.cluster(
        popular_vectors[low_bias_rows], assign_clusters=True)
    np.savetxt(centroids_file, clusterer.means())
    np.savetxt(clusters_file, labels)
def main():
    """Cluster the documents in ``wordvec`` and print the files per cluster.

    Calls ``getFiles()`` and ``tf_idf()`` first, then reads the cluster
    count from ``sys.argv[2]``. Clustering uses NLTK k-means with cosine
    distance and 25 repeats. Every cluster id in ``range(num_clusters)``
    gets an entry (possibly empty) in the dictionary passed to
    ``printClustersInFormat``.
    """
    getFiles()
    tf_idf()

    num_clusters = int(sys.argv[2])
    clusterer = KMeansClusterer(
        num_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25)
    labels = clusterer.cluster(wordvec, assign_clusters=True)

    files_by_cluster = {cluster_id: [] for cluster_id in range(num_clusters)}
    for position, label in enumerate(labels):
        files_by_cluster[label].append(fileList[position])

    printClustersInFormat(files_by_cluster)
class KMeansClusters(BaseEstimator, TransformerMixin):
    """Transformer wrapper around NLTK's cosine-distance ``KMeansClusterer``.

    The clusterer is created once in ``__init__`` with ``k`` clusters and
    ``avoid_empty_clusters=True``; ``transform`` runs the clustering.
    """

    def __init__(self, k=7) -> None:
        self.k = k
        self.distance = nltk.cluster.cosine_distance
        self.model = KMeansClusterer(
            self.k,
            self.distance,
            avoid_empty_clusters=True,
        )

    def fit(self, data, labels=None):
        """Do nothing and return ``self``; clustering happens in ``transform``."""
        return self

    def transform(self, data):
        """Cluster ``data`` and return the per-item cluster assignments."""
        assignments = self.model.cluster(data, assign_clusters=True)
        return assignments
def cluster(folderName, vectorsize, clusterType):
    """Cluster the traces of an event log by their Doc2Vec vectors and save the labels.

    Loads the tagged traces from ``<folderName>.xes`` and the Doc2Vec model
    ``output/<folderName>T2VVS<vectorsize>.model``, infers one vector per
    trace, clusters the vectors and writes ``<trace name>,<label>`` lines to
    ``output/<folderName>T2VVS<vectorsize><clusterType>.csv``.

    :param folderName: base name of the event log and of the model/result files
    :param vectorsize: vector size, used only to build the file names
    :param clusterType: ``"KMeans"`` (NLTK k-means, cosine distance,
        25 repeats), ``"HierWard"`` (agglomerative clustering with Ward
        linkage) or ``"OCSVM"`` (``OneClassSVM.fit_predict``)
    :return: None. For an unknown ``clusterType`` a message is printed and
        nothing is written.
    """
    corpus = loadXES.get_doc_XES_tagged(folderName + '.xes')
    print('Data Loading finished, ', str(len(corpus)), ' traces found.')

    model = gensim.models.Doc2Vec.load(
        'output/' + folderName + 'T2VVS' + str(vectorsize) + '.model')

    NUM_CLUSTERS = 5
    print("inferring vectors")
    vectors = [
        model.infer_vector(corpus[doc_id].words)
        for doc_id in range(len(corpus))
    ]
    print("done")

    if clusterType == "KMeans":
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif clusterType == "HierWard":
        ward = AgglomerativeClustering(
            n_clusters=NUM_CLUSTERS, linkage='ward').fit(vectors)
        assigned_clusters = ward.labels_
    elif clusterType == "OCSVM":
        ocsvm = OneClassSVM()
        assigned_clusters = ocsvm.fit_predict(vectors)
    else:
        print(
            clusterType,
            " is not a predefined cluster type. Please use 'KMeans', 'HierWard' or 'OCSVM', or create a definition for ",
            clusterType)
        return

    trace_list = loadXES.get_trace_names(folderName + ".xes")
    # Build every output row before opening the file, so a length mismatch
    # between the trace names and the corpus fails without leaving a
    # partially written result behind.
    rows = [
        trace_list[doc_id] + ',' + str(assigned_clusters[doc_id]) + "\n"
        for doc_id in range(len(corpus))
    ]

    result_path = ('output/' + folderName + 'T2VVS' + str(vectorsize)
                   + clusterType + '.csv')
    # The context manager closes the file even if a write fails.
    with open(result_path, 'w') as resultFile:
        resultFile.writelines(rows)

    print("done with ", clusterType, " on event log ", folderName)
def cluster(clusterType, vectors, y):
    """Label ``vectors`` with the method named by ``clusterType``.

    Unsupervised types: ``"KMeans"`` (NLTK k-means, cosine distance,
    25 repeats), ``"GMM"`` (Gaussian mixture) and ``"T2VH"`` (Ward tree cut
    into ``NUM_CLUSTERS`` clusters).
    Supervised types, evaluated through ``cross_validation`` against ``y``:
    ``"SVM"``, ``"RandomForest"``, ``"DecisionTree"`` and
    ``"LogisticRegression"``.

    Returns the assigned labels, or ``None`` (after printing a message)
    when ``clusterType`` is not recognised.
    """
    if clusterType == "KMeans":
        kmeans = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        return kmeans.cluster(vectors, assign_clusters=True)

    if clusterType == "GMM":
        mixture = GaussianMixture(n_components=NUM_CLUSTERS)
        return mixture.fit_predict(vectors)

    if clusterType == "T2VH":
        tree = hierarchical.ward_tree(vectors, n_clusters=NUM_CLUSTERS)
        merge_children, leaf_count = tree[0], tree[2]
        return hierarchical._hc_cut(NUM_CLUSTERS, merge_children, leaf_count)

    # Factories are lambdas so a classifier class is only looked up when its
    # type is actually requested.
    supervised = (
        ("SVM", lambda: SVC(kernel='rbf', gamma='auto', random_state=0)),
        ("RandomForest", lambda: RandomForestClassifier()),
        ("DecisionTree", lambda: DecisionTreeClassifier()),
        ("LogisticRegression",
         lambda: sklearn.linear_model.LogisticRegression()),
    )
    for type_name, build_classifier in supervised:
        if clusterType == type_name:
            # cross-validation
            return cross_validation(build_classifier(), vectors, y)

    print(clusterType, " is not a predefined cluster type.")
    return
def clustering(dataframe, repeats, myStopwords):
    """Cluster article contents with cosine k-means and report category frequencies.

    The ``"Content"`` column is TF-IDF vectorised and clustered into 5
    clusters with NLTK k-means. For every cluster, the share of its
    articles belonging to each entry of the module-level ``categories``
    list is written to ``output/clustering_KMeans.csv`` (tab separated,
    two decimals).

    Fixes over the previous version:
      * ``categories`` is no longer mutated (the header used to be inserted
        into the shared list through an alias);
      * ``DataFrame.ix`` (removed from pandas) is replaced by ``iloc``;
      * an empty cluster yields ``0.00`` instead of a division by zero;
      * the output file is closed through a context manager.

    :param dataframe: articles; ``"Content"`` holds the text and the column
        at position 4 holds the category name looked up in ``categories``
    :param repeats: number of k-means repeats
    :param myStopwords: stop words passed to ``TfidfVectorizer``
    :return: None
    """
    num_clusters = 5

    # Only process the content, not the title.
    tfidf_vectorizer = TfidfVectorizer(stop_words=myStopwords)
    tfidf_matrix_array = tfidf_vectorizer.fit_transform(
        dataframe["Content"]).toarray()

    # Run K-means with cosine distance as the metric.
    kclusterer = KMeansClusterer(
        num_clusters, distance=cosine_distance, repeats=repeats)
    assigned_clusters = kclusterer.cluster(
        tfidf_matrix_array, assign_clusters=True)

    # counts[cluster][category] = number of articles of that category that
    # landed in that cluster; cluster_size[cluster] = articles per cluster.
    num_categories = len(categories)
    counts = [[0] * num_categories for _ in range(num_clusters)]
    cluster_size = [0] * num_clusters
    for row, cluster_id in enumerate(assigned_clusters):
        category_idx = categories.index(dataframe.iloc[row, 4])
        counts[cluster_id][category_idx] += 1
        cluster_size[cluster_id] += 1

    # Output to a .csv file.
    with open("output/clustering_KMeans.csv", 'w') as out_file:
        wr = csv.writer(out_file, delimiter="\t")
        wr.writerow(["\t"] + list(categories))
        for cluster_id in range(num_clusters):
            size = cluster_size[cluster_id]
            frequencies = [
                format(count / float(size) if size else 0.0, '.2f')
                for count in counts[cluster_id]
            ]
            wr.writerow(["Cluster " + str(cluster_id + 1)] + frequencies)
def recluster(df, cl, clusters, n_clusters):
    """Re-cluster the rows of ``df`` that belong to the given clusters.

    Rows whose label in ``cl.labels_`` equals any entry of ``clusters`` are
    run through ``data_pipeline`` and clustered again with NLTK k-means
    (cosine distance, 50 repeats). A cluster object obtained from
    ``cluster(results, n_clusters)`` is used as a carrier: its ``labels_``
    and ``cluster_centers_`` are overwritten with the NLTK result.

    Returns ``(subpipe, subcl, results, selected_rows)``.
    """
    lbls = cl.labels_
    selected = np.array([False] * len(lbls))
    for wanted in clusters:
        selected |= lbls == wanted

    subpipe, results = data_pipeline(df[selected])

    # KMeans cluster object as carrier for consistency.
    subcl = cluster(results, n_clusters)

    # Use cosine similarity: NLTK clustering implementation.
    cosine_kmeans = KMeansClusterer(
        n_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=50)
    new_labels = cosine_kmeans.cluster(results, assign_clusters=True)

    # Assign new cluster labels and cluster centroids.
    subcl.labels_ = np.array(new_labels)
    subcl.cluster_centers_ = np.array(cosine_kmeans.means())
    return subpipe, subcl, results, df[selected]
def cluster_docs(self):
    """Infer vectors for the documents from ``self.es_docs()`` and cluster them.

    A document is used only when its cleaned tokens are not the marker
    ``'NC'`` and there are more than 200 of them. The vectors are clustered
    with NLTK k-means (cosine distance, 25 repeats, module-level
    ``NUM_CLUSTERS``). Nothing is returned; ``"done"`` is printed at the end.
    """
    used_lines = []
    vectors = []
    for doc, _doc_id in self.es_docs():
        tokens = text_cleaner.clean_tokens(doc)
        if tokens == 'NC' or len(tokens) <= 200:
            continue
        used_lines.append(tokens)
        vectors.append(self.model.infer_vector(tokens))

    clusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25)
    assigned_clusters = clusterer.cluster(vectors, assign_clusters=True)
    print("done")
class kmeans_cosine(object):
    """NLTK k-means with cosine distance (25 repeats) wrapped in a small model class."""

    def __init__(self, k):
        """Create the clusterer for ``k`` clusters."""
        self.k = k
        self.model = KMeansClusterer(k, distance=nltk.cluster.util.cosine_distance, repeats=25)

    def build(self, X, p):
        """Cluster ``X`` and find the training rows that share a cluster with ``p``.

        :param X: training data; converted to a dense array through
            ``scipy.sparse.csr_matrix(X).toarray()``
        :param p: vector to classify with the fitted model
        :return: ``(cluster_id, prediction)`` where ``prediction`` is the
            cluster assigned to ``p`` and ``cluster_id`` is a boolean array
            marking the rows of ``X`` assigned to that same cluster
        """
        data = scipy.sparse.csr_matrix(X).toarray()
        kclusters = np.array(self.model.cluster(data, assign_clusters=True))
        prediction = self.model.classify(p)
        cluster_id = kclusters == prediction
        return cluster_id, prediction

    def save(self, filename = "model2.pkl"):
        """Pickle the underlying clusterer to ``filename``.

        The file is opened in binary mode: ``pickle.dump`` writes bytes, so
        the former text mode ``'w'`` raised ``TypeError`` on Python 3.
        """
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)
# create counter and idf vectors count_vect = TfidfVectorizer (stop_words=stop_words) count_vect.fit(df['Content']) #12266 X_train_counts = count_vect.transform(df['Content'] # reduce size of vector with LSI svd = TruncatedSVD(n_components=5) X_train_counts = svd.fit_transform(X_train_counts) # Clustering kclusterer = KMeansClusterer(num_means = 5, distance=cosine_distance, repeats=25, avoid_empty_clusters= True) clusters = kclusterer.cluster(X_train_counts, assign_clusters=True) # print "Clusters:\n " , clusters # print "Means" , kclusterer.means() # Prepare results Matrix categories_map={ 'Politics': 0, 'Business': 1, 'Film': 2, 'Technology': 3, 'Football': 4 } labels_map={
# Part 1 - Compute Kmeans using Cosine distance with 5 clusters # Use KMeans Clusterer from Natural Language Toolkit Package as this allows Cosine distance to be used as distance measure # Import packages from NLTK from nltk.cluster.kmeans import KMeansClusterer from nltk.cluster.util import cosine_distance # In[65]: # Perform clustering using Cosine distance km_cos = KMeansClusterer(5, distance=cosine_distance, avoid_empty_clusters=True) km_cos_cl = km_cos.cluster(X, assign_clusters=True) # In[103]: # Part 2 - Calculate Adjusted Rand Score and MI Score print("Adjusted Rand Score (Cosine Distance): " + str(met.adjusted_rand_score(true_Labels, km_cos_cl))) print("\nAdjusted Mutual Information Score (Cosine Distance): " + str(met.adjusted_mutual_info_score(true_Labels, km_cos_cl))) # Clustering using Cosine distance has resulted in far better results on both the Adjusted Rand and Adjusted MI scores. As with Euclidean distance, attempting clustering after performing PCA resulted in reduced scores on both indices (see Appendix). # In[222]: # Part 3 - Run 50 random initializations
# Read the data config and locate the image list named by its 'train' entry.
# Files are opened through context managers so every handle is closed, which
# the former bare open(...).readlines() calls never did.
with open(datacfg) as cfg_file:
    lines = cfg_file.readlines()

images = []
for line in lines:
    if (line.split(' ')[0] == 'train'):
        valid_path = line.strip().split(' ')[-1]
        # Paths that do not start with '/' are taken relative to `workspace`.
        if (valid_path[0] != '/'):
            valid_path = workspace + valid_path
        with open(valid_path) as list_file:
            lists = list_file.readlines()
        images = [x.strip() for x in lists]

# Collect the last two fields of every label line. Each image's label file is
# the image path with '.jpg' replaced by '.txt'.
bboxes = []
for image in images:
    label = image.replace('.jpg', '.txt')
    with open(label) as label_file:
        lines = label_file.readlines()
    for line in lines:
        splitline = line.split(' ')
        # bboxes.append([float(x)*13. for x in splitline[-2:]])
        bboxes.append([float(splitline[-2])*1., float(splitline[-1])*1.])
print(len(bboxes))

# samples = random.sample(bboxes, 15000)
# print(len(samples))
bboxes = np.array(bboxes)
# samples = np.array(samples)
# print(samples.shape)

# Cluster the collected pairs into 5 groups using the negIoU distance.
KMeans = KMeansClusterer(5, negIoU, repeats=1)
# clusters = KMeans.cluster(samples, True)
clusters = KMeans.cluster(bboxes, True)
centroids = KMeans.means()
print(np.array(centroids) / np.array((1., 1.)))
if score < 0.7: break try: arr = numpy.append(arr, numpy.reshape(model.wv.word_vec(phrase), (1, 100)), axis=0) except KeyError: pass else: embedded_phrases.append(phrase) print('number of sample points:', len(embedded_phrases)) kmeans = KMeansClusterer(6, nltk.cluster.util.cosine_distance) clusters = kmeans.cluster(arr, assign_clusters=True) centers = kmeans.means() result = {0: [], 1: [], 2: [], 3: [], 4: [], 5: []} for i in range(len(clusters)): result[clusters[i]].append([ nltk.cluster.util.cosine_distance(centers[clusters[i]], arr[i]), embedded_phrases[i] ]) for k in result: sorted_result = sorted(result[k], reverse=True) final_result = '\n'.join( ['%.10f' % x[0] + '\t' + x[1] for x in sorted_result]) f = open('cluster' + str(k) + '.txt', 'w+') f.write(final_result)
from nltk.cluster.util import cosine_distance
from sklearn import manifold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from assignment_2.data.data_reader import get_speeches
from assignment_2.data.president import Trump

# Sum the 'script' column per president, giving one row per president.
speeches = get_speeches()
speeches_sum = speeches.groupby(['president'])[['script']].sum()

# TF-IDF over the per-president scripts with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(speeches_sum.script).toarray()
# NOTE(review): this repeats the fit_transform call above on the same input;
# confirm that both X and tfidf_matrix are needed.
tfidf_matrix = tfidf.fit_transform(speeches_sum.script).toarray()

# NLTK k-means with cosine distance: 27 clusters, repeats=25.
kmeans = KMeansClusterer(27, distance=cosine_distance, repeats=25)
clusters = kmeans.cluster(X, assign_clusters=True)

# The cluster labels are passed as the DataFrame index and then moved into
# the 'president_index' column by reset_index().
speakers = speeches_sum.index.tolist()
cluster_df = pd.DataFrame(speakers, clusters)
cluster_df = cluster_df.reset_index()
cluster_df.columns = ['president_index', 'president']
print(clusters)

# Cluster label of the first row whose 'president' equals Trump.
trump_index = cluster_df[cluster_df['president'] == Trump]['president_index'].iloc[0]
# Speakers that share that cluster label. Position 8 is excluded,
# presumably because it is Trump's own position in `speakers` (TODO confirm).
similar_speakers = [
    speakers[i] for i, c in enumerate(clusters)
    if (c == trump_index) and (i != 8)
]
# #Plot the clusters obtained using k means # fig = plt.figure() # ax = fig.add_subplot(111) # scatter = ax.scatter(big_data_copy['Accounting'],big_data_copy['3D Printing'], # c=kmeans[0],s=50) # plt.colorbar(scatter) # this one is not working out...dataframe might not be correct format NUM_CLUSTERS = 10 kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25) assigned_clusters = kclusterer.cluster(big_data_copy, assign_clusters=True) '''NEW PLAN, thanks Evan ONE HOT ENCODIGN BUT WITH ADDED UP VECTROS ex: Math 1 Art 2 Math 3 CS 50 Joe 1 0 0 0 Bob 0 0 1 0 Smith 1 0 0 0 Bob 0 1 0 0 Smith 0 0 0 1 groupByIndividual alphabetical is fine probably, jsut want them to be same name next to each other Math 1 Art 2 Math 3 CS 50 Joe 1 0 0 0
def cluster(self, docs_repr):
    """Cluster ``docs_repr`` into ``self.n_clusters`` groups with cosine k-means.

    Uses ``KM`` with 25 repeats and ``avoid_empty_clusters=True`` and
    returns the per-item cluster assignments.
    """
    clusterer = KM(
        self.n_clusters,
        distance=cosine_distance,
        repeats=25,
        avoid_empty_clusters=True,
    )
    return clusterer.cluster(docs_repr, assign_clusters=True)
# Join the tokens of every chapter into one space-separated string.
articals = []
for cutword in Red_df.cutword:
    articals.append(" ".join(cutword))

## Build the corpus and compute the document-term TF-IDF matrix
vectorizer = CountVectorizer()
transformer = TfidfVectorizer()
tfidf = transformer.fit_transform(articals)

## tfidf is stored as a sparse matrix; convert it to a dense array
## (document-term matrix)
dtm = tfidf.toarray()

## K-means clustering with cosine distance
kmeans = KMeansClusterer(num_means=2,  # number of clusters
                         distance=nltk.cluster.util.cosine_distance,  # cosine distance
                         )
kmeans.cluster(dtm)

## Cluster label predicted for every document
labpre = [kmeans.classify(i) for i in dtm]

# Take an explicit copy: the new column is added to an independent frame
# rather than to a selection of Red_df, which avoids pandas'
# SettingWithCopyWarning.
kmeanlab = Red_df[["ChapName","Chapter"]].copy()
kmeanlab["cosd_pre"] = labpre
kmeanlab

## Count how many rows fall into each cluster
count = kmeanlab.groupby("cosd_pre").count()

## Visualise the cluster sizes as a horizontal bar chart, annotated with counts
count.plot(kind="barh",figsize=(6,5))
for xx,yy,s in zip(count.index,count.ChapName,count.ChapName):
    plt.text(y =xx-0.1, x = yy+0.5,s=s)
x_a = torch.max(x1 - w1 / 2.0, x2 - w2 / 2.0) y_a = torch.max(y1 - h1 / 2.0, y2 - h2 / 2.0) x_b = torch.min(x1 + h1 / 2.0, x2 + w2 / 2.0) y_b = torch.max(y1 + h1 / 2.0, y2 + h2 / 2.0) intersection = torch.clamp(x_b - x_a, min=0) * torch.clamp(y_b - y_a, min=0) union = w1 * h1 + w2 * h2 - intersection return intersection / (union + 1e-6) kclusterer = KMeansClusterer(args.num_bbox, distance=nltk.cluster.util.cosine_distance, repeats=25) assigned_clusters = kclusterer.cluster(data, assign_clusters=True) kmeans_wh = KMeans(n_clusters=args.num_bbox) kmeans_wh.fit(train_wh) bbox_priors = kmeans_wh.cluster_centers_ np.save('priors.npy', bbox_priors) bbox_priors = torch.from_numpy(bbox_priors).cuda() # Set up the network features = DenseNet(growth_rate=8, block_config=(4, 8, 16, 32), activation=nn.LeakyReLU(inplace=True), input_channels=3) classifier = YOLOClassifier(features.output_channels, bbox_priors,
print("inferring vectors")
# duplicate_dict records the texts already seen, so each distinct text
# contributes one vector.
duplicate_dict = {}
used_lines = []
for i, t in enumerate(lines):
    # Only even-indexed entries of `lines` that were not seen before are used.
    if i % 2 == 0 and t not in duplicate_dict:
        duplicate_dict[t] = True
        used_lines.append(t)
        vectors.append(model.infer_vector(preprocess_document(t)))
print("done")

# Cluster the inferred vectors with NLTK k-means: cosine distance, repeats=25.
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

# Disabled draft that computed per-cluster sizes and distances to the centroid:
# clustersizes = []
#
# def distanceToCentroid():
#     for i in range(0,NUM_CLUSTERS):
#         clustersize = 0
#         for j in range(0,len(assigned_clusters)):
#             if (assigned_clusters[j] == i):
#                 clustersize+=1
#         clustersizes.append(clustersize)
#         dist = 0.0
#         centr = means[i]
#         for j in range(0,len(assigned_clusters)):
#             if (assigned_clusters[j] == i):
#                 dist += pow(nltk.cluster.util.cosine_distance(vectors[j], centr),2)/clustersize
def _save_line_plot(ks, values, ylabel, title, out_file):
    """Plot ``values`` against ``ks`` as a 'bx-' line and save the figure to ``out_file``."""
    fig = plt.figure()
    plt.plot(ks, values, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel(ylabel)
    plt.title(title)
    fig.savefig(out_file)


def main(argv):
    """Sweep k for cosine k-means and save distortion and silhouette plots.

    Command-line options (read by ``argparse`` from ``sys.argv``; the
    ``argv`` parameter is accepted but not used):
      -i/--input_file            dataset passed to ``load_dataset``
      -ik/--init_k, -fk/--final_k, -s/--step
                                 k takes the values ``range(init_k, final_k, step)``,
                                 so ``final_k`` itself is excluded
      -od/--distortion_out_file  output file of the distortion plot
      -os/--silhouette_out_file  output file of the silhouette plot
      -pca/--pca, -k_pca/--k_pca reduce the descriptors with PCA to
                                 ``k_pca`` components first

    For every k the descriptors are clustered with NLTK k-means (cosine
    distance). The distortion is the mean cosine distance of each descriptor
    to its closest centroid; the silhouette score also uses the cosine metric.

    ``--pca`` without ``--k_pca`` is now reported as a usage error instead
    of failing later with ``int(None)``.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-s', '--step', help='step', required=True)
    parser.add_argument('-ik', '--init_k', help='K initial', required=True)
    parser.add_argument('-fk', '--final_k', help='K final', required=True)
    parser.add_argument('-od', '--distortion_out_file', help='elbow distortion graph file', required=True)
    parser.add_argument('-os', '--silhouette_out_file', help='elbow silhoutte graph', required=True)
    parser.add_argument('-pca', '--pca', help='with pca', action='store_true')
    parser.add_argument('-k_pca', '--k_pca', help='k pca')
    ARGS = parser.parse_args()

    if ARGS.pca and ARGS.k_pca is None:
        parser.error('--k_pca is required when --pca is given')

    descriptors = load_dataset(ARGS.input_file)
    if ARGS.pca:
        print("With pca")
        pca = PCA(n_components=int(ARGS.k_pca))
        descriptors = pca.fit_transform(descriptors)

    ks = []
    distortions = []
    silhouettes = []
    for k in range(int(ARGS.init_k), int(ARGS.final_k), int(ARGS.step)):
        kclusterer = KMeansClusterer(
            k, distance=nltk.cluster.util.cosine_distance)
        predictions = np.array(
            kclusterer.cluster(descriptors, assign_clusters=True))
        cluster_centers_ = np.array(kclusterer.means())

        # Mean cosine distance from every descriptor to its nearest centroid.
        distortion = sum(
            np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
                   axis=1)) / descriptors.shape[0]
        silhouette_score = metrics.silhouette_score(descriptors,
                                                    predictions,
                                                    metric='cosine')

        distortions.append(distortion)
        silhouettes.append(silhouette_score)
        ks.append(k)
        print("k:", k, "distortion:", distortion, "Silhouette Coefficient", silhouette_score)

    # Plot the elbow with distortion
    _save_line_plot(ks, distortions, 'Distortion', 'The Elbow Method',
                    ARGS.distortion_out_file)
    # Plot the silhouette scores
    _save_line_plot(ks, silhouettes, 'Silhouette Score',
                    'Silhouette Score analysis', ARGS.silhouette_out_file)
def k_means_cluster():
    """Cluster the sampled data into 5 groups and split the subtrees into two files.

    The lines of ``Test/test_subtree.txt`` are grouped into blocks, each
    ending at a blank line; blocks that consist of the blank line only are
    dropped, and lines after the last blank line are not part of any block.
    The matrix from ``random_sample.data_mat()`` is clustered with NLTK
    k-means (cosine distance, 100 repeats, empty clusters avoided) and
    block ``i`` is paired with the ``i``-th cluster assignment.

    Blocks assigned to the two largest clusters are printed to
    ``Test/test_sub_pos.txt``; all other blocks go to
    ``Test/test_sub_neg.txt``.

    Changes over the previous copy-pasted branch table:
      * all three files are closed through context managers;
      * every block is written exactly once; ties in cluster size are
        broken by the lower cluster id, where blocks used to be written
        several times;
      * the return value is the same on every path.

    :return: four empty lists ``(pos, neg, positive_pos, negative_pos)``.
        They are legacy placeholders that only some branches used to return.
    """
    print("K means")
    with open("Test/test_subtree.txt", "r") as subtree_file:
        raw_lines = subtree_file.readlines()

    # Split the input into blank-line-terminated blocks.
    blocks = []
    current = []
    for line in raw_lines:
        current.append(line)
        if line == "\n":
            blocks.append(current)
            current = []
    # Drop blocks that contain nothing but the separator line.
    new = [block for block in blocks if len(block) != 1]

    data_matrix = random_sample.data_mat()
    NUM_CLUSTERS = 5
    print("Start assigning clusters")
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=100,
                                 avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(data_matrix, assign_clusters=True)

    # Count the members of every cluster and pick the two largest ones.
    sizes = [0] * NUM_CLUSTERS
    for label in assigned_clusters:
        sizes[label] += 1
    ranked = sorted(range(NUM_CLUSTERS), key=lambda c: (-sizes[c], c))
    positive_clusters = set(ranked[:2])

    print("Cluster grouping")
    with open("Test/test_sub_pos.txt", "w") as op_pos, \
            open("Test/test_sub_neg.txt", "w") as op_neg:
        for item, label in enumerate(assigned_clusters):
            target = op_pos if label in positive_clusters else op_neg
            print(new[item], file=target)

    pos = []
    neg = []
    positive_pos = []
    negative_pos = []
    return pos, neg, positive_pos, negative_pos