import numpy as np
from nltk.cluster import KMeansClusterer, cosine_distance

def ClusterItems(data_file, items_bias_file, index_file, clusters_file, centroids_file):
    data = np.genfromtxt(data_file)
    popular_items = np.genfromtxt(index_file).astype('int')
    data = data[popular_items]
    items_bias = np.genfromtxt(items_bias_file)
    # keep only popular items whose learned bias is small in magnitude
    important_items = np.where(np.abs(items_bias[popular_items]) < 0.2)[0]
    # NUM_CLUSTERS is assumed to be defined at module level
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance)
    print(NUM_CLUSTERS, important_items.shape)
    print("end", data.shape)
    clusters = kclusterer.cluster(data[important_items], assign_clusters=True)
    np.savetxt(centroids_file, kclusterer.means())
    np.savetxt(clusters_file, clusters)
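# A minimal usage sketch; NUM_CLUSTERS and the file names below are
# hypothetical stand-ins for values the surrounding pipeline provides.
NUM_CLUSTERS = 20
ClusterItems('item_vectors.txt', 'item_bias.txt', 'popular_idx.txt',
             'clusters_out.txt', 'centroids_out.txt')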
import numpy as np
import nltk
from nltk.cluster import KMeansClusterer

def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    # select the rows whose current label falls in the given cluster list
    mask = np.zeros(len(lbls), dtype=bool)
    for c in clusters:
        mask |= lbls == c
    # data_pipeline() is defined elsewhere in this project
    subpipe, results = data_pipeline(df[mask])
    # use cosine similarity via the NLTK clustering implementation;
    # the KMeans cluster object is kept only as a carrier for consistency
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    # assign new cluster labels and cluster centroids
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    return subpipe, subcl, results, df[mask]
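# recluster() leans on a cluster() helper defined elsewhere. A minimal sketch
# under the assumption that it simply fits a scikit-learn KMeans object, which
# recluster() then uses purely as a label/centroid container:
from sklearn.cluster import KMeans

def cluster(results, n_clusters):
    # hypothetical helper: the fitted attributes are overwritten by recluster()
    return KMeans(n_clusters=n_clusters).fit(results)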
import numpy as np
from nltk.cluster import KMeansClusterer

# find the 'train' entry in the data config and read its image list
lines = open(datacfg).readlines()
images = []
for line in lines:
    if line.split(' ')[0] == 'train':
        valid_path = line.strip().split(' ')[-1]
        if valid_path[0] != '/':
            valid_path = workspace + valid_path
        lists = open(valid_path).readlines()
        images = [x.strip() for x in lists]

# collect the normalized (width, height) of every ground-truth box
bboxes = []
for image in images:
    label = image.replace('.jpg', '.txt')
    lines = open(label).readlines()
    for line in lines:
        splitline = line.split(' ')
        # bboxes.append([float(x)*13. for x in splitline[-2:]])
        bboxes.append([float(splitline[-2]), float(splitline[-1])])
print(len(bboxes))

# samples = random.sample(bboxes, 15000)
# print(len(samples))
bboxes = np.array(bboxes)
# samples = np.array(samples)
# print(samples.shape)

# cluster box shapes into 5 anchors using negative IoU as the distance
KMeans = KMeansClusterer(5, negIoU, repeats=1)
# clusters = KMeans.cluster(samples, True)
clusters = KMeans.cluster(bboxes, True)
centroids = KMeans.means()
print(np.array(centroids))
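# negIoU is the custom distance used above but not shown. A minimal sketch,
# assuming each point is a normalized (width, height) pair and boxes are
# compared as if they shared a common center, as in YOLO anchor clustering:
def negIoU(box_a, box_b):
    # overlap of two center-aligned boxes
    inter = min(box_a[0], box_b[0]) * min(box_a[1], box_b[1])
    union = box_a[0] * box_a[1] + box_b[0] * box_b[1] - inter
    # 1 - IoU: higher overlap means smaller distance
    return 1.0 - inter / union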
import argparse

import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance
from sklearn import metrics
from sklearn.decomposition import PCA

def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-s', '--step', help='step', required=True)
    parser.add_argument('-ik', '--init_k', help='K initial', required=True)
    parser.add_argument('-fk', '--final_k', help='K final', required=True)
    parser.add_argument('-od', '--distortion_out_file',
                        help='elbow distortion graph file', required=True)
    parser.add_argument('-os', '--silhouette_out_file',
                        help='elbow silhouette graph', required=True)
    parser.add_argument('-pca', '--pca', help='with pca', action='store_true')
    parser.add_argument('-k_pca', '--k_pca', help='k pca')
    ARGS = parser.parse_args()

    # load_dataset() is defined elsewhere in this project
    descriptors = load_dataset(ARGS.input_file)

    if ARGS.pca:
        print("With pca")
        pca = PCA(n_components=int(ARGS.k_pca))
        descriptors = pca.fit_transform(descriptors)

    ks = []
    distortions = []
    silhouettes = []
    for k in range(int(ARGS.init_k), int(ARGS.final_k), int(ARGS.step)):
        # kmeanModel = KMeans(n_clusters=k, init='k-means++')
        # kmeanModel.fit(descriptors)
        # predictions = kmeanModel.predict(descriptors)
        # cluster_centers_ = kmeanModel.cluster_centers_
        kclusterer = KMeansClusterer(
            k, distance=nltk.cluster.util.cosine_distance)
        predictions = kclusterer.cluster(descriptors, assign_clusters=True)
        predictions = np.array(predictions)
        cluster_centers_ = np.array(kclusterer.means())

        # mean cosine distance from each point to its nearest centroid
        distortion = sum(
            np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
                   axis=1)) / descriptors.shape[0]
        silhouette_score = metrics.silhouette_score(descriptors, predictions,
                                                    metric='cosine')
        distortions.append(distortion)
        silhouettes.append(silhouette_score)
        ks.append(k)
        print("k:", k, "distortion:", distortion,
              "Silhouette Coefficient:", silhouette_score)

    # Plot the elbow with distortion
    fig = plt.figure()
    plt.plot(ks, distortions, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method')
    fig.savefig(ARGS.distortion_out_file)

    # Plot the silhouette scores
    fig = plt.figure()
    plt.plot(ks, silhouettes, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score analysis')
    fig.savefig(ARGS.silhouette_out_file)
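# Example invocation (script and file names are hypothetical):
#   python elbow.py -i descriptors.npy -s 2 -ik 2 -fk 20 \
#       -od distortion.png -os silhouette.png -pca -k_pca 100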
import nltk
import numpy
from nltk.cluster import KMeansClusterer

# the original fragment begins mid-loop; the loop header over `candidates`
# and the `arr`/`embedded_phrases` initializations are assumed for completeness
arr = numpy.empty((0, 100))
embedded_phrases = []
for phrase, score in candidates:
    if score < 0.7:
        break
    try:
        arr = numpy.append(arr,
                           numpy.reshape(model.wv.word_vec(phrase), (1, 100)),
                           axis=0)
    except KeyError:
        pass
    else:
        embedded_phrases.append(phrase)
print('number of sample points:', len(embedded_phrases))

kmeans = KMeansClusterer(6, nltk.cluster.util.cosine_distance)
clusters = kmeans.cluster(arr, assign_clusters=True)
centers = kmeans.means()

# group phrases by cluster, recording each phrase's distance to its centroid
result = {0: [], 1: [], 2: [], 3: [], 4: [], 5: []}
for i in range(len(clusters)):
    result[clusters[i]].append([
        nltk.cluster.util.cosine_distance(centers[clusters[i]], arr[i]),
        embedded_phrases[i]
    ])

# write each cluster's phrases, farthest from the centroid first
for k in result:
    sorted_result = sorted(result[k], reverse=True)
    final_result = '\n'.join(
        ['%.10f' % x[0] + '\t' + x[1] for x in sorted_result])
    with open('cluster' + str(k) + '.txt', 'w+') as f:
        f.write(final_result)
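# One plausible source for the (phrase, score) pairs consumed above, assuming
# a gensim Word2Vec model and a seed term (both hypothetical here); note that
# most_similar() returns pairs sorted by descending similarity, which is what
# the `score < 0.7` early break relies on:
# candidates = model.wv.most_similar('seed_term', topn=1000)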
# In[222]:

# Part 3 - Run 50 random initializations
cos_rand_scores = np.zeros(50)
cos_mi_scores = np.zeros(50)
centroids_cos = {}
for i in range(0, 50):
    km_cos = KMeansClusterer(5, distance=cosine_distance,
                             avoid_empty_clusters=True)
    km_cos_cl = km_cos.cluster(X, assign_clusters=True)
    cos_rand_scores[i] = met.adjusted_rand_score(true_Labels, km_cos_cl)
    cos_mi_scores[i] = met.adjusted_mutual_info_score(true_Labels, km_cos_cl)
    centroids_cos[i] = km_cos.means()

# In[199]:

# Report average Adjusted Rand and Mutual Information scores
print(
    "Adjusted Rand Score Averaged over 50 initializations (Cosine Distance): "
    + str(cos_rand_scores.mean()))
print(
    "Adjusted Rand Score St Dev over 50 initializations (Cosine Distance): "
    + str(cos_rand_scores.std()))
print(
    "\nAdjusted Mutual Information Score Averaged over 50 initializations (Cosine Distance): "
    + str(cos_mi_scores.mean()))
print(
    "Adjusted Mutual Information Score St Dev over 50 initializations (Cosine Distance): "
    + str(cos_mi_scores.std()))
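# A small follow-up sketch: keep the centroids from the initialization that
# scored best on Adjusted Rand, using only the arrays built above:
best_run = int(cos_rand_scores.argmax())
best_centroids = centroids_cos[best_run]
print("Best run:", best_run, "ARI:", cos_rand_scores[best_run])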
import argparse

import nltk
import numpy as np
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min

def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-ids', '--ids_file', help='ids file', required=True)
    parser.add_argument('-n_components', '--n_components',
                        help='number of components in pca', required=True)
    parser.add_argument('-k', '--k', help='k of kmeans', required=True)
    ARGS = parser.parse_args()

    # load_dataset() and get_hash_ids() are defined elsewhere in this project
    descriptors = load_dataset(ARGS.input_file)
    ids_list, news_groups = get_hash_ids(ARGS.ids_file)

    print("PCA")
    pca = PCA(n_components=int(ARGS.n_components))
    descriptors = pca.fit_transform(descriptors)

    # kmeanModel = KMeans(n_clusters=int(ARGS.k), init='k-means++')
    # kmeanModel.fit(descriptors)
    # predictions = kmeanModel.predict(descriptors)
    # cluster_centers_ = kmeanModel.cluster_centers_
    # print(predictions)
    print("Kmeans")
    kclusterer = KMeansClusterer(int(ARGS.k),
                                 distance=nltk.cluster.util.cosine_distance)
    predictions = np.array(
        kclusterer.cluster(descriptors, assign_clusters=True))
    cluster_centers_ = np.array(kclusterer.means())

    print("Distortions")
    # distortion_eu = sum(np.min(distance.cdist(descriptors, cluster_centers_, 'euclidean'), axis=1)) / descriptors.shape[0]
    distortion_cos = sum(
        np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
               axis=1)) / descriptors.shape[0]

    print("Silhouettes")
    # silhouette_score_eu = metrics.silhouette_score(descriptors, predictions, metric='euclidean')
    silhouette_score_cos = metrics.silhouette_score(descriptors, predictions,
                                                    metric='cosine')
    # print("EUCLIDEAN K:", ARGS.k, "distortion:", distortion_eu, "silhouette score:", silhouette_score_eu)
    print("COS K:", ARGS.k, "distortion:", distortion_cos,
          "silhouette score:", silhouette_score_cos)

    # pick the descriptor closest to each centroid as the cluster medoid
    closest, _ = pairwise_distances_argmin_min(cluster_centers_, descriptors)
    medoids_ids = ids_list[closest]
    medoids = descriptors[closest]
    dist = distance.cdist(medoids, medoids, metric='cosine')

    # five nearest medoids to each medoid (skip the medoid itself)
    knns = dist.argsort(axis=1)[:, :6][:, 1:]
    for id_, knn in zip(medoids_ids, knns):
        print("\nMedoid id:", id_, "label:", news_groups[id_])
        print("Nearest neighbors:")
        for nn in knn:
            print("\t id:", medoids_ids[nn],
                  "labels:", news_groups[medoids_ids[nn]])

    # per-cluster purity: share of labels matching the dominant label,
    # counting a label as a match if it shares a dotted name component
    metric = []
    for i in range(225):  # hardcoded; presumably matches the chosen k
        ids_l = ids_list[np.where(predictions == i)]
        # if len(ids_l) == 0:
        #     counter_0 += 1
        #     continue
        clusters_labels = []
        for id_l in ids_l:
            label_list = news_groups[id_l]
            for ll in label_list:
                clusters_labels.append(ll)
        clnp = np.array(clusters_labels)
        uni, con = np.unique(clnp, return_counts=True)
        # letter_counts = Counter(clusters_labels)
        # df = pandas.DataFrame.from_dict(letter_counts, orient='index')
        ind = np.argsort(con)[::-1]
        uni = uni[ind]
        con = con[ind]
        maxim = con.sum()
        cont = con[0]
        label = uni[0]
        uni = uni[1:]
        con = con[1:]
        marker = np.zeros(uni.shape)
        for s in label.split('.'):
            for j in range(uni.shape[0]):
                if marker[j] == 0 and s in uni[j]:
                    cont += con[j]
                    marker[j] = 1
        # print("cluster:", i, "metric:", cont / maxim)
        metric.append(cont / maxim)

    metric = np.array(metric, dtype=float)
    print("mean:", metric.mean())
    print("std:", metric.std())
    print("median:", np.median(metric))
    print("Min:", np.min(metric))
    print("Max:", np.max(metric))
    return 0
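# Example invocation (script and file names are hypothetical); the hardcoded
# range(225) in the purity loop is presumably meant to match -k:
#   python cluster_medoids.py -i descriptors.npy -ids ids.txt \
#       -n_components 100 -k 225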