def jacards_Spectral_Clustering(jid, confDict, arff):
    data = la(arff)
    # pos_seq = confDict['input']['pos_seq']
    # neg_seq = confDict['input']['neg_seq']
    # pos_name = findName(pos_seq)  # fasta file
    # neg_name = findName(neg_seq)
    ### get all features
    features = list(data[1])[:-1]
    # top = int(confDict['RF_gini_filter']['top'])
    # total_ranking_file = jid + "_gini_total_ranking.tsv"
    # total_ranking = open(total_ranking_file, "wb")
    # Y = np.array(data[0]["Class"])
    # print Y
    # sum_pos = np.sum(Y)
    sum_pos = 334
    X = np.array(map(lambda x: list(x), data[0][features].tolist()))
    X = X[1:sum_pos, :]
    n, m = X.shape
    print n, m
    # pairwise Jaccard similarity matrix (symmetric, ones on the diagonal)
    Sim = np.ones((n, n))
    print Sim
    for i in range(n):
        # print i
        for j in range(i + 1, n):
            Sim[i, j] = jacards_score(X[i], X[j], m)
            Sim[j, i] = Sim[i, j]
    print Sim
    # spectral clustering on the precomputed similarity for k = 2..5
    Cluster = []
    a = SC(n_clusters=2, affinity="precomputed", assign_labels="discretize")
    Clusters2 = a.fit_predict(Sim)
    a = SC(n_clusters=3, affinity="precomputed", assign_labels="discretize")
    Clusters3 = a.fit_predict(Sim)
    a = SC(n_clusters=4, affinity="precomputed", assign_labels="discretize")
    Clusters4 = a.fit_predict(Sim)
    a = SC(n_clusters=5, affinity="precomputed", assign_labels="discretize")
    Clusters5 = a.fit_predict(Sim)
    Cluster.append(Clusters2)
    Cluster.append(Clusters3)
    Cluster.append(Clusters4)
    Cluster.append(Clusters5)
    # keep the clustering whose silhouette score is highest
    sil_score = []
    silhouette_avg2 = silhouette_score(X, Clusters2)
    silhouette_avg3 = silhouette_score(X, Clusters3)
    silhouette_avg4 = silhouette_score(X, Clusters4)
    silhouette_avg5 = silhouette_score(X, Clusters5)
    sil_score.append(silhouette_avg2)
    sil_score.append(silhouette_avg3)
    sil_score.append(silhouette_avg4)
    sil_score.append(silhouette_avg5)
    ind = np.argmax(sil_score)
    print sil_score
    return Cluster[ind]
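# The function above relies on a jacards_score helper that is not shown in
# this collection. A minimal sketch of what such a helper could look like,
# assuming the rows of X are binary feature vectors of length m (the body
# below is an assumption, not the original implementation):
import numpy as np

def jacards_score(a, b, m):
    # Jaccard similarity: |intersection| / |union| of the nonzero positions.
    a = np.asarray(a, dtype=bool)[:m]
    b = np.asarray(b, dtype=bool)[:m]
    union = np.logical_or(a, b).sum()
    if union == 0:
        return 1.0  # treat two all-zero vectors as identical
    return np.logical_and(a, b).sum() / float(union)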
def Run(data, iter_count=1, limit=3):
    Run.counter += 1
    graph = defaultdict(list)
    build_graph(data, graph, iter_count)
    # random walks on the graph, embedded with word2vec, then split in two
    data_matrix = build_data_matrix(graph, number_of_walks_per_node,
                                    rand=rand, restart=0)
    model = build_word2vec_model(data_matrix, embedding_size=embedding_size)
    train = model.syn0.astype(np.float)
    index = np.array(model.index2word)
    clf = SC(n_clusters=2)
    output = clf.fit_predict(train)
    index_of_data_0 = index[output == 0]
    index_of_data_1 = index[output == 1]
    data_0 = dict((key, data[key]) for key in index_of_data_0)
    data_1 = dict((key, data[key]) for key in index_of_data_1)
    print len(data_0), len(data_1), iter_count, limit
    if iter_count == limit:
        f = open("file_" + str(Run.counter) + '_0', "w")
        pickle.dump(data_0, f)
        f = open("file_" + str(Run.counter) + '_1', "w")
        pickle.dump(data_1, f)
        return
    elif iter_count > limit:
        return
    else:
        # recurse on each half until the depth limit is reached
        Run(data_0, iter_count + 1, limit)
        Run(data_1, iter_count + 1, limit)
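# Minimal usage sketch for the recursive split above (an assumption about the
# call site, not part of the original code). Run increments Run.counter
# unconditionally, so the attribute must exist before the first call;
# `graph_data` stands in for whatever adjacency-style dict build_graph expects.
Run.counter = 0
Run(graph_data, iter_count=1, limit=3)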
def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed)
    self.sc = SC(
        n_clusters=self.hyperparams["n_clusters"],
        n_init=self.hyperparams["n_init"],
        n_neighbors=self.hyperparams["n_neighbors"],
        affinity=self.hyperparams["affinity"],
        random_state=self.random_seed,
    )
def build_model(file_name):
    label, data = get_embeddings(file_name)
    clf = SC(n_clusters=2)
    # temp = np.transpose(data)
    output = clf.fit_predict(data)
    # s = open("color", 'w')
    # pickle.dump(output, s)
    plt.scatter(data[:, 0], data[:, 1], c=output)
    plt.show()
    # print "here"
    return output, np.array(label), data
def build_model(file_name):
    label, data = get_embeddings(file_name)
    print data.shape
    clf = SC(n_clusters=2)
    # temp = np.transpose(data)
    output = clf.fit_predict(data)
    s = open("color", 'w')
    pickle.dump(output, s)
    s = output
    plt.scatter(data[:, 0][s == 0], data[:, 1][s == 0], marker='+', s=45,
                label='Class 1', c='r')
    plt.scatter(data[:, 0][s == 1], data[:, 1][s == 1], marker='o', s=45,
                label='Class 2', c='b')
    plt.legend(loc='upper left')
    plt.show()
    return output, np.array(label), data
def clusterResult(k, file_name, former):
    # load the document-topic matrix from the previously built model
    print("Loading document-topic matrix")
    result = btm.loadModel(file_name)
    data = np.array(result)
    print("Starting clustering")
    # average the six evaluation metrics over 30 runs
    result = np.zeros(6)
    for i in range(30):
        # estimator = kmn.kMeansByFeature(k, data)
        # labels = estimator.labels_
        labels = SC(assign_labels="discretize", gamma=1e-7,
                    n_clusters=k).fit_predict(data)
        result += cr.printResult(k, labels, former)
    print("Clustering finished")
    return result / 30
def cluster(X, k):
    # average the six evaluation metrics over 30 runs
    result = np.zeros(6)
    for i in range(30):
        # k-means algorithm
        # res = km.kMeansByFeature(k, X)
        # labels = res.labels_
        # spectral clustering; assign_labels="discretize" discretizes the
        # embedding instead of running k-means on it
        # labels = SC(gamma=1e-7, n_clusters=k).fit_predict(X)
        labels = SC(assign_labels="discretize", gamma=1e-7,
                    n_clusters=k).fit_predict(X)
        # labels = SC(affinity="nearest_neighbors", n_neighbors=10,
        #             n_clusters=k).fit_predict(X)
        result += np.array(printResult(k, labels, label))
    result = result / 30
    print("Purity: {}, RI: {}, F1_measure: {}, Entropy: {}, Accuracy: {}, "
          "Recall: {}".format(result[0], result[1], result[2], result[3],
                              result[4], result[5]))
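# The commented-out alternatives above differ mainly in how labels are
# assigned from the spectral embedding. A small self-contained comparison of
# assign_labels="kmeans" vs "discretize" on toy blobs (the toy data and the
# gamma value are assumptions chosen only to illustrate the parameter):
import numpy as np
from sklearn.cluster import SpectralClustering as SC
from sklearn.datasets import make_blobs

X_toy, _ = make_blobs(n_samples=60, centers=3, random_state=0)
for assign in ("kmeans", "discretize"):
    toy_labels = SC(n_clusters=3, assign_labels=assign, gamma=0.01,
                    random_state=0).fit_predict(X_toy)
    print(assign, np.bincount(toy_labels))  # cluster sizes per strategy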
def sc_cluster_and_plot(row_number, row, csv_out=None):
    filename = str(row_number) + "-" + str(row[0]) + "-color" + ".png"
    # matrix = np.matrix(np.array(row[1:])).reshape(28, 28)
    # plot(filename, matrix)
    data_entry = row_to_data(row[1:])
    # sc = SC(assign_labels='discretize', n_clusters=3).fit(data_entry)
    sc = SC(assign_labels='discretize', affinity='rbf', n_clusters=3).fit(data_entry)
    # sc = SC(n_clusters=1).fit(data_entry)
    al = sc.labels_
    metrics = get_clusters_metrcs(data_entry, al)
    print filename, " ", metrics
    dump(csv_out, row_number, row[0], metrics)
    # plot_color(filename, data_entry, al)
    major_points = get_major_points(data_entry, al)
    mapping = order_clusters(split_to_clusters(data_entry, al))
    plot_color(filename, data_entry, al, major_points, mapping)
    print "Row:", row_number, " Digit:", row[0], "Mapping: ", mapping, "Maj:", major_points
def __init__(self, n_clusters=8, eigen_solver='None', n_components='n_clusters',
             random_state=None, n_init=10, gamma=1.0, affinity='rbf',
             n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3,
             coef0=1, kernel_params=None, n_jobs=None):
    self.assign_labels = assign_labels
    self.random_state = random_state
    self.n_init = n_init
    self.n_clusters = n_clusters
    self.coef0 = coef0
    self.n_jobs = n_jobs
    self.eigen_solver = eigen_solver
    self.affinity = affinity
    self.degree = degree
    self.n_neighbors = n_neighbors
    self.eigen_tol = eigen_tol
    self.gamma = gamma
    self.kernel_params = kernel_params
    self.n_components = n_components
    self.model = SC(coef0=self.coef0,
                    eigen_solver=self.eigen_solver,
                    n_components=self.n_components,
                    gamma=self.gamma,
                    eigen_tol=self.eigen_tol,
                    affinity=self.affinity,
                    assign_labels=self.assign_labels,
                    n_init=self.n_init,
                    n_jobs=self.n_jobs,
                    degree=self.degree,
                    kernel_params=self.kernel_params,
                    n_clusters=self.n_clusters,
                    n_neighbors=self.n_neighbors,
                    random_state=self.random_state)
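# Note: the string defaults 'None' (for eigen_solver) and 'n_clusters' (for
# n_components) read like serialized config placeholders; sklearn's
# SpectralClustering expects None and an integer respectively, so a caller
# would presumably translate them before the wrapper is built. A minimal,
# hypothetical sketch of that translation (the helper name is an assumption):
def resolve_sc_defaults(params):
    resolved = dict(params)
    if resolved.get('eigen_solver') == 'None':
        resolved['eigen_solver'] = None  # string placeholder -> real None
    if resolved.get('n_components') == 'n_clusters':
        resolved['n_components'] = resolved['n_clusters']  # mirror n_clusters
    return resolved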
def build_model(file_name):
    label, data = get_embeddings(file_name)
    print data.shape
    n_c = 2
    clf = SC(n_clusters=n_c)
    # temp = np.transpose(data)
    output = clf.fit_predict(data)
    s = open("color", 'w')
    m = {0: '+', 1: 'o', 2: '^', 3: 'x'}
    pickle.dump(output, s)
    for x in xrange(n_c):
        plt.scatter(data[:, 0][output == x], data[:, 1][output == x],
                    marker=m[x], s=45, label='Class %s' % x)
    plt.legend(loc='upper left')
    plt.show()
    print "here"
    return output, np.array(label), data
def clustering(idTfidf, num_clu, term_num):
    docFeature = idTfidf
    vecTfidf = {}
    for file in idTfidf:
        # build a dense tf-idf vector of length term_num for each document
        row = np.zeros(len(idTfidf[file]))
        col = idTfidf[file].keys()
        val = idTfidf[file].values()
        vec = csc_matrix((np.array(val), (np.array(row), np.array(col))),
                         shape=(1, term_num))
        vecTfidf[file] = vec.todense().tolist()[0]
    # print vecTfidf
    features = vecTfidf.values()
    # print features

    selection = 'GM'  # selecting model here!!! Options: AgglomerativeClustering as AC, SpectralClustering as SC, GMM
    if selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)

    resDic = {}
    for i in range(len(res)):
        if not resDic.has_key(res[i]):
            resDic[res[i]] = []
            resDic[res[i]].append(int(docFeature.keys()[i]))
        else:
            resDic[res[i]].append(int(docFeature.keys()[i]))
    result = resDic.values()
    # print result
    with open('gt_GMRes.json', 'w') as f:
        f.write(json.dumps(result))
    return result
def lda_kmn_result(k, topic, doc, former, iterator=1000):
    # returns the averaged clustering result
    # build the LDA model and vocabulary
    print("Building topic model")
    word_list, r_model = ldaa.lda_model(doc, topic, iterator)
    # document-topic distribution
    doc_topic = r_model.doc_topic_
    # convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    result = np.zeros(6)
    for i in range(30):
        # estimator = kmn.kMeansByFeature(topic, doc_topic_list)
        # labels = estimator.labels_
        labels = SC(assign_labels="discretize", gamma=1e-7,
                    n_clusters=k).fit_predict(doc_topic)
        result += cr.printResult(k, labels, former)
    return result / 30
def build_model(file_name):
    label, data = get_embeddings(file_name)
    print data.shape
    n_c = 8
    clf = SC(n_clusters=n_c)
    # temp = np.transpose(data)
    output = clf.fit_predict(data)
    s = open("color", 'w')
    m = {0: '+', 1: 'o', 2: '^', 3: 'x', 4: 'D', 5: '*', 6: '>', 7: 'v'}
    c = ['r', 'b', 'g', 'c', 'm', 'y', 'k', '#eeefff']
    pickle.dump(output, s)
    for x in xrange(n_c):
        plt.scatter(data[:, 0][output == x], data[:, 1][output == x],
                    marker=m[x], s=45, label='Class %s' % x, c=c[x])
    plt.legend(loc='upper left')
    plt.show()
    print "here"
    return output, np.array(label), data
def sc_cluster(data):
    sc = SC(assign_labels='discretize', affinity='rbf', n_clusters=3).fit(data)
    al = sc.labels_
    metrics = get_clusters_metrcs(data, al)
    return metrics
def supervised(numClu, affinity):
    print 'Building supervised model...'
    model = SC(n_clusters=numClu, affinity='precomputed')
    res = model.fit_predict(affinity)
    return res
def unsupervised_clu(feature, part, model_selection):
    if part:
        if feature == 'graph':
            docFeature = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
        if feature == 'doc2vec':
            docFeature = json.loads(open('rmMultiPart1Doc2vec.json').read())
        if feature == 'comb':
            walk = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
            dv = json.loads(open('rmMultiPart1Doc2vec.json').read())
            docFeature = {}
            for doc in walk:
                val = walk[doc] + dv[doc]
                docFeature[doc] = val
        groundTruth = json.loads(open('rmMultiPart1CluInd.json').read())
        num_clu = len(groundTruth)  # number of clusters in each part
    else:
        rmMulti = True  # False #
        if rmMulti:
            if feature == 'graph':
                docFeature = json.loads(open('rmMultiCluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('rmMultiCluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('rmMultiGroundTruth.json').read())
            num_clu = len(groundTruth)  # number of clusters after removing documents appearing multi-cluster, #doc = 1274 (3 all 0s for walk)
        else:
            if feature == 'graph':
                docFeature = json.loads(open('cluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('cluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('cluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('cluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('groundTruth.json').read())
            num_clu = len(groundTruth)  # number of clusters before removing documents appearing multi-cluster, #doc = 1393 (3 all 0s for walk)

    features = docFeature.values()
    if model_selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if model_selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if model_selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if model_selection == 'KMeans':
        model = KMeans(n_clusters=num_clu)
    if model_selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)

    resDic = {}
    for i in range(len(res)):
        if not resDic.has_key(res[i]):
            resDic[res[i]] = []
            resDic[res[i]].append(int(docFeature.keys()[i]))
        else:
            resDic[res[i]].append(int(docFeature.keys()[i]))
    result = resDic.values()
    return (result, groundTruth)
for i in xrange(100):
    out = sess.run([opt, loss_, h_1], feed_dict=feed_dict)
    acc = sess.run([acc_, h_2], feed_dict=feed_dict)
    print out[1], acc[0]
    # raw_input()

q = np.argmax(sess.run(predict, feed_dict=feed_dict), 1)
a = sess.run(acc_, feed_dict=feed_dict)
print "\n", q, a, np.argmax(y, 1), "\n"

viz = sess.run(h_1, feed_dict=feed_dict)
# plt.scatter(viz[:,0],viz[:,1],s=100)

# cluster the learned hidden representation into two groups and plot it
from sklearn.cluster import SpectralClustering as SC
clf = SC(n_clusters=2)
output = clf.fit_predict(viz)
plt.scatter(viz[:, 0], viz[:, 1], c=output, s=75)
plt.show()

d1 = data[output == 0]
d2 = data[output == 1]

import matplotlib.image as mpimg
im = mpimg.imread('scene.jpg')
plt.imshow(im)
for row in d1:
    row = np.reshape(row, (15, 2))
    plt.scatter(row[:, 0], row[:, 1], c='r')
plt.show()
quats_arr = np.array(quats_arr)

# compute similarity matrix
X = np.zeros((n_samples, n_samples))
for x in range(n_samples):
    for y in range(n_samples):
        if x == y:
            X[x, y] = 1
        else:
            a = quats[x]
            b = quats[y]
            X[x, y] = (sqrt_2 - pyq.Quaternion.absolute_distance(a, b)) / sqrt_2

clustering = SC(n_clusters=n_clust, affinity='precomputed')
clustering.fit(X)
samples_labels = clustering.labels_
print(np.bincount(samples_labels))

neigh = Nearest(n_neighbors=K, metric=utils.quatmetric)
neigh.fit(quats_arr)

pos = []
labels = []
labels_cvt = []
for _ in range(n_test):
    quat = pyq.Quaternion.random()
    test = np.array(quat.elements)
    pos.append(quat)
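# A small standalone check of the similarity used above (assumes `pyq` is the
# pyquaternion package). absolute_distance between unit quaternions is at most
# sqrt(2), so (sqrt_2 - d) / sqrt_2 maps identical orientations to 1 and
# maximally distant ones to 0, keeping the precomputed affinity in [0, 1].
import numpy as np
from pyquaternion import Quaternion

sqrt_2 = np.sqrt(2.0)
a, b = Quaternion.random(), Quaternion.random()
sim = (sqrt_2 - Quaternion.absolute_distance(a, b)) / sqrt_2
print(sim)  # similarity in [0, 1]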
def get_trending_topics(FILE_LOAD, dt, load_file, load_from_file=0,
                        no_of_topics=10, t=50000):
    DataIndex = IndexBox()
    DataIndex.load(FILE_LOAD)
    if load_from_file == 0:
        trendTime = DataIndex.getIndexTime(dt)
        df_idft_scores = {}
        for i in DataIndex.data:
            df_idft_scores[i] = get_df_idft(DataIndex.data[i], t, trendTime)
        sorted_by_score = sorted(df_idft_scores.items(),
                                 key=operator.itemgetter(1), reverse=True)
        trending_topics = []
        count = 0
        for i in sorted_by_score:
            trending_topics += [i[0].encode('utf-8')]
            count += 1
            if count == 1000:  # 1000 top df-idf ngrams for clustering
                break
        save_to_file = open(load_file, 'w')
        save_to_file.write('$$'.join(trending_topics))
        save_to_file.close()
        return 'saved to file'
    else:
        load_from_file = open(load_file, 'r')
        trending_topics = load_from_file.read().split('$$')
        # co-occurrence graph: edge weight is log2(1 + #tweets shared by the two ngrams)
        GraphMatrix = []
        for ng1 in trending_topics:
            row = []
            for ng2 in trending_topics:
                score = 0
                for tw1 in DataIndex.data[ng1.decode('utf-8')]:
                    for tw2 in DataIndex.data[ng2.decode('utf-8')]:
                        if tw1 == tw2:
                            score += 1
                row += [math.log(score + 1, 2)]
            GraphMatrix += [np.array(row)]
        GraphMatrix = np.array(GraphMatrix)
        No_of_clusters = 5
        clusters = SC(GraphMatrix, n_clusters=No_of_clusters, eigen_solver='arpack')
        f_stop = open('stopwords.txt', 'r')
        stopwords = f_stop.read().split('\n')
        # keep the first acceptable ngram from each cluster
        Mark = [0] * No_of_clusters
        count = 0
        topics_trending = []
        for i in clusters:
            current_gram = trending_topics[count].decode('utf-8')
            if Mark[i] == 0:
                if '~~' not in current_gram:
                    if current_gram not in stopwords and (not current_gram.isdigit()):
                        topics_trending += [current_gram]
                        Mark[i] = 1
                else:
                    topics_trending += [current_gram]
                    Mark[i] = 1
            count += 1
        return topics_trending
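# In the snippet above, SC is called with the affinity matrix as the first
# positional argument and the result is iterated as a label array, which
# matches sklearn.cluster.spectral_clustering (the function) rather than the
# SpectralClustering class used elsewhere in this collection. A minimal toy
# sketch of that call form (the block-diagonal similarity matrix is made up):
import numpy as np
from sklearn.cluster import spectral_clustering as SC

A = np.array([[1.0, 0.9, 0.8, 0.1, 0.0, 0.1],
              [0.9, 1.0, 0.9, 0.0, 0.1, 0.0],
              [0.8, 0.9, 1.0, 0.1, 0.0, 0.1],
              [0.1, 0.0, 0.1, 1.0, 0.9, 0.8],
              [0.0, 0.1, 0.0, 0.9, 1.0, 0.9],
              [0.1, 0.1, 0.1, 0.8, 0.9, 1.0]])
labels = SC(A, n_clusters=2, eigen_solver='arpack')
print(labels)  # one cluster label per node, e.g. [0 0 0 1 1 1]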
def supervised_clu(feature, rmMulti, trial):
    (part1Pos, part1Neg, part2Pos, part2Neg, part3Pos, part3Neg, part4Pos,
     part4Neg, part5Pos, part5Neg, globalPos, globalNeg) = data_selection(feature, rmMulti)
    sumpurity = 0
    sumfone = 0
    for i in range(0, trial):
        print '#', i + 1, 'trial!!!'
        pos_dataset = dic2List(globalPos)  # dic2List(part1Pos) + dic2List(part2Pos) + dic2List(part3Pos) + dic2List(part4Pos) + dic2List(part5Pos) #
        neg_dataset = dic2List(globalNeg)  # dic2List(part1Neg) + dic2List(part2Neg) + dic2List(part3Neg) + dic2List(part4Neg) + dic2List(part5Neg) #
        # print len(pos_dataset)
        num_pos_sample = int(0.3 * len(pos_dataset))
        num_neg_sample = num_pos_sample
        (posPicked, posNotPicked) = takingSamples(pos_dataset, num=num_pos_sample)
        (negPicked, negNotPicked) = takingSamples(neg_dataset, num=num_neg_sample)
        # print len(posPicked), len(negPicked)
        # print posPicked, posNotPicked
        # train_X = pd.DataFrame(mat2arr(list2Dic(posPicked).values() + list2Dic(negPicked).values()))
        train_X = pd.DataFrame(list2Dic(posPicked).values() + list2Dic(negPicked).values())
        train_y = np.array([1 for i in range(len(list2Dic(posPicked).values()))] +
                           [0 for i in range(len(list2Dic(negPicked).values()))])
        print len(train_X), len(train_y)
        # train a random forest on the labelled document pairs
        reg = RFC(n_estimators=200, max_features='log2')
        model = reg.fit(train_X, train_y)
        # print 'model ready!'
        # print 'get affinity matrix...'
        matrixVal = {}
        for item in posPicked:
            matrixVal[str(item.keys()[0])] = 1
        for item in negPicked:
            matrixVal[str(item.keys()[0])] = 0
        test_X = posNotPicked + negNotPicked
        modelIn = list2Dic(test_X)
        # predicted probability of being a positive pair fills the remaining entries
        test_Y = model.predict_proba(modelIn.values())[:, 1]
        for i in range(0, len(modelIn)):
            matrixVal[modelIn.keys()[i]] = test_Y[i]
        # print matrixVal.keys()
        # print map(eval, matrixVal.keys())
        # print matrixVal.values()
        # print size
        row = []
        col = []
        docMap = {}
        mapDoc = {}
        size = 0
        for pair in map(eval, matrixVal.keys()):
            for doc in pair:
                if not docMap.has_key(doc):
                    docMap[doc] = size
                    mapDoc[size] = doc
                    size += 1
        # print mapDoc
        # print docMap
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[0]])
            col.append(docMap[pair[1]])
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[1]])
            col.append(docMap[pair[0]])
        data = matrixVal.values() + matrixVal.values()
        # print size
        affinity = csc_matrix((data, (row, col)), shape=(size, size)).toarray()
        # print 'affinity matrix get!'
        # print 'run clustering...'
        # groundTruth = json.loads(open('groundTruth.json').read())
        # groundTruth = json.loads(open('rmMultiGroundTruth.json').read())  # some documents appear in one part only once, but multiple times in global
        groundTruth = json.loads(open('rmMultiGroundTruthNew.json').read())  # rmMultiGroundTruthNew.json is for simply combining all parts only
        # groundTruth = json.loads(open('part1CluInd.json').read())
        # groundTruth = json.loads(open('rmMultiPart5CluInd.json').read())
        num_clu = len(groundTruth)
        # print num_clu
        model = SC(n_clusters=num_clu, affinity='precomputed')
        res = model.fit_predict(affinity)
        # print res
        # print len(res), len(set(res))
        resDic = {}
        for i in range(len(res)):
            if not resDic.has_key(res[i]):
                resDic[res[i]] = []
                resDic[res[i]].append(mapDoc[i])
            else:
                resDic[res[i]].append(mapDoc[i])
        result = resDic.values()
        purVal = purity(result, groundTruth)
        (pre, rec, fone) = fmeasure(result, groundTruth)
        sumpurity += purVal
        sumfone += fone
        print 'purity %.4f' % purVal, 'precision: %.4f' % pre, 'recall: %.4f' % rec, 'f1: %.4f' % fone
    return (sumpurity, sumfone)
sigma2 = 0
for i in range(shape_img0):
    for j in range(i + 1, shape_img0):
        sigma1 += (W[i, j] - mean1) ** 2
for i in range(shape_img0, len(img_list)):
    for j in range(i + 1, len(img_list)):
        sigma2 += (W[i, j] - mean2) ** 2
sigma1 = np.sqrt(sigma1 / (shape_img0 * (shape_img0 - 1) / 2))
sigma2 = np.sqrt(sigma2 / (shape_img1 * (shape_img1 - 1) / 2))
sigma = (sigma1 + sigma2) / 2
# turn the distance matrix into a Gaussian similarity kernel
W = np.exp(-1 * W / sigma)

# NCut cluster
img_list = np.array(img_list)
cluster = SC(n_clusters=2, affinity='precomputed')
cluster.fit(W)
result = cluster.fit_predict(W)
print(result)

# accuracy compute
accuracy = 0
for i in range(len(result)):
    if i < shape_img0 and result[i] == 0:
        accuracy += 1
    if i >= shape_img0 and result[i] == 1:
        accuracy += 1
accuracy = float(accuracy) / len(result)
print(accuracy)