def test_cluster_kmean(self):
    self.assertTrue(True)
    model = w2c.get()
    features, X = get_features_X(model)
    n_clusters = 200
    kmean = KMeans(n_clusters=n_clusters)
    labels = kmean.fit_predict(X)
    centers = kmean.cluster_centers_
    # Group the feature words and their vectors by cluster label.
    cluster_features = defaultdict(list)
    cluster_X = defaultdict(list)
    cluster_centers = dict()
    for f, l, x in zip(features, labels, X):
        cluster_features[l].append(f)
        cluster_X[l].append(x)
    # The representative word of each cluster is the member closest to the centroid.
    # cosine_similarity expects 2-D inputs, so the centroid is wrapped in a list.
    for l in cluster_X:
        i = np.argmax(cosine_similarity([centers[l]], cluster_X[l]))
        cluster_centers[l] = cluster_features[l][i]
    with open(os.path.join(RESOURCE_DIR, 'cluster', 'cluster.txt'), 'w', encoding='utf-8') as f:
        for label in cluster_features:
            f.write('%s --- %s\n' % (cluster_centers[label], ' '.join(cluster_features[label])))
def create2(features, n_clusters=200):
    logger.info('cluster features...')
    model = w2c.get()
    features = filter_features(features, model)
    X = get_X(features, model)
    kmean = KMeans(n_clusters=n_clusters)
    labels = kmean.fit_predict(X)
    centers = kmean.cluster_centers_
    # Group the feature words and their vectors by cluster label.
    cluster_features = defaultdict(list)
    cluster_X = defaultdict(list)
    cluster_centers = dict()
    for f, l, x in zip(features, labels, X):
        cluster_features[l].append(f)
        cluster_X[l].append(x)
    # The representative word of each cluster is the member closest to the centroid.
    # cosine_similarity expects 2-D inputs, so the centroid is wrapped in a list.
    for l in cluster_X:
        i = np.argmax(cosine_similarity([centers[l]], cluster_X[l]))
        cluster_centers[l] = cluster_features[l][i]
    with open(os.path.join(RESOURCE_DIR, 'cluster', 'cluster.txt'), 'w', encoding='utf-8') as f:
        for label in cluster_features:
            f.write('%s --- %s\n' % (cluster_centers[label], ' '.join(cluster_features[label])))
    return cluster_features
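# A minimal usage sketch for create2 (not part of the original module). The feature list is
# hypothetical illustration data; it assumes the word2vec model returned by w2c.get() covers
# these words and that n_clusters does not exceed the number of features left after filtering.
def example_create2_usage():
    features = ['屏幕', '电池', '摄像头', '价格', '外观', '音质']  # hypothetical candidate feature words
    clusters = create2(features, n_clusters=2)
    for label, members in clusters.items():
        print(label, ' '.join(members))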
def test_cluster_h(self):
    self.assertTrue(True)
    model = w2c.get()
    features, X = get_features_X(model)
    for f in features:
        print(f)
def test_ap_features2(self):
    self.assertTrue(True)
    model = w2c.get()
    features, X = get_features_X(model)
    ap = AffinityPropagation(preference=-50)
    labels = ap.fit_predict(X)
    n_cluster = len(ap.cluster_centers_indices_)
    print('n_cluster: ', n_cluster)
    # for c in ap.cluster_centers_indices_:
    #     print(c, features[c])
    # Plot the clusters for visual inspection.
    import matplotlib.pyplot as plt
    from itertools import cycle
    from sklearn.manifold import TSNE
    plt.close('all')  # close all open figures
    plt.figure(1)     # create a new figure
    plt.clf()         # clear the current figure
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    colors = [c for _, c in zip(list(range(n_cluster)), colors)]
    print(len(colors))
    print(colors)
    # Project the word vectors to 2-D with t-SNE for plotting.
    tsne = TSNE(n_components=2)
    X_2d = tsne.fit_transform(X)
    for x, l in zip(X_2d, labels):
        plt.scatter(x[0], x[1], c=colors[l])
    '''
    for k, col in zip(range(n_cluster), colors):
        plt.scatter()
    for k, col in zip(range(n_cluster), colors):
        # labels == k compares k against every value in the labels array;
        # e.g. labels = [1, 0] and k = 0 gives [False, True].
        class_members = labels == k
        cluster_center = X_2d[ap.cluster_centers_indices_[k]]  # coordinates of the cluster center
        plt.plot(X_2d[class_members, 0], X_2d[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        for x in X_2d[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    '''
    plt.title('Estimated number of cluster centers: %d' % n_cluster)
    plt.show()
def run(self, pinglun_file, O_seeds):
    """
    Extract feature words / opinion words.
    :param pinglun_file: review text file
    :param O_seeds: seed opinion words
    :return:
    """
    logger.info('pipeline run...')
    if not os.path.exists(self._clean_file):
        logger.info('cleaning text')
        clean.clean_file(pinglun_file, self._clean_file)
    if not os.path.exists(self._relation_file):
        logger.info('dependency parsing')
        relation_parse.parse(self._clean_file, self._relation_file)
    logger.info('extracting feature/opinion words with the double propagation algorithm')
    S = self._iter_sentences_relations(self._relation_file)
    F, O, fcounter, ocounter, rcount = double_propagation.extract(O_seeds, S)
    utils.write_file(self._dp_f_file, F)
    utils.write_file(self._dp_o_file, O)
    utils.save_obj(fcounter, self._dp_f_counter)
    utils.save_obj(ocounter, self._dp_o_counter)
    logger.info('pruning feature/opinion words')
    F, O = prune.prune(F, O, fcounter, ocounter, rcount, self._threshold)
    utils.write_file(self._prune_f_file, F)
    utils.write_file(self._prune_o_file, O)
    if not os.path.exists(self._word2vec_file):
        logger.info('training word2vec model')
        T = self._iter_sentences_tokens(self._relation_file)
        w2c.train(T, self._word2vec_file)
    model = w2c.get(self._word2vec_file)
    logger.info('clustering feature words')
    cf = cluster.create(F, model, preference=-30)
    features = ['%s %s' % (cls, ' '.join(cf[cls])) for cls in cf]
    utils.write_file(self._feature_file, features)
    logger.info('clustering opinion words')
    O = utils.read_file(self._prune_o_file)
    of = cluster.create(O, model, preference=None)
    opinions = ['%s %s' % (cls, ' '.join(of[cls])) for cls in of]
    utils.write_file(self._opinion_file, opinions)
    logger.info('pipeline over.')
    return cf, of, F, O
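# A minimal sketch of how the pipeline might be driven (not part of the original source).
# The Pipeline class name, its no-argument constructor, the review file path and the seed
# opinion words are all assumptions for illustration; run() expects a review text file and
# a collection of seed opinion words.
def example_pipeline_usage():
    pipeline = Pipeline()  # assumed wrapper object holding the intermediate file paths used in run()
    O_seeds = {'好', '不错', '差'}  # hypothetical seed opinion words ("good", "nice", "bad")
    cf, of, F, O = pipeline.run('data/pinglun.txt', O_seeds)  # hypothetical review file path
    print('feature clusters: %d, opinion clusters: %d' % (len(cf), len(of)))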
def test_dbscan(self):
    self.assertTrue(True)
    features, X = get_features_X(w2c.get())
    # DBSCAN with metric='precomputed' expects a distance matrix, so convert
    # cosine similarities into cosine distances.
    X = 1 - cosine_similarity(X)
    eps = 0.5
    dbscan = DBSCAN(metric='precomputed', eps=eps)
    labels = dbscan.fit_predict(X)
    print(labels)
    # Label -1 marks noise points, so it is excluded from the cluster count.
    n_cluster = len(set(labels)) - (1 if -1 in labels else 0)
    print('eps: %f, n_cluster: %d' % (eps, n_cluster))
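# Hedged sketch (not in the original tests): with a precomputed cosine-distance matrix,
# eps is a distance threshold, so sweeping a few values shows how the cluster count and the
# number of noise points (label -1) change. It reuses the module-level w2c and
# get_features_X helpers assumed above.
def example_dbscan_eps_sweep():
    from sklearn.cluster import DBSCAN
    from sklearn.metrics.pairwise import cosine_distances
    features, X = get_features_X(w2c.get())
    D = cosine_distances(X)
    for eps in (0.2, 0.3, 0.4, 0.5):
        labels = DBSCAN(metric='precomputed', eps=eps).fit_predict(D)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)
        print('eps=%.1f  clusters=%d  noise=%d' % (eps, n_clusters, n_noise))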
def test_ap_features(self):
    self.assertTrue(True)
    from collections import defaultdict
    model = w2c.get()
    features, X = get_features_X(model)
    # preference is a value below 0; the larger it is (closer to 0), the more clusters are produced.
    ap = AffinityPropagation(preference=-30)
    labels = ap.fit_predict(X)
    centers = dict()
    for label, index in enumerate(ap.cluster_centers_indices_):
        centers[label] = features[index]
    clusters = defaultdict(set)
    for label, feature in zip(labels, features):
        clusters[label].add(feature)
    for label in centers:
        print('%s --- %s' % (centers[label], ' '.join(clusters[label])))
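# Hedged sketch (not in the original tests): sweeping a few preference values illustrates the
# effect noted in the comment above, i.e. how AffinityPropagation's preference controls the
# number of exemplars. It relies on the module-level w2c and get_features_X helpers.
def example_ap_preference_sweep():
    from sklearn.cluster import AffinityPropagation
    features, X = get_features_X(w2c.get())
    for preference in (-10, -30, -50):
        ap = AffinityPropagation(preference=preference)
        ap.fit(X)
        print('preference=%d  n_clusters=%d' % (preference, len(ap.cluster_centers_indices_)))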
def test_word2vec_vec(self):
    self.assertTrue(True)
    model = w2v.get()
    # Look up the embedding vector for '屏幕' ("screen").
    print(model['屏幕'])
def test_word2vec_model(self):
    self.assertTrue(True)
    model = w2v.get()
    # Print the words most similar to '屏幕' ("screen") together with their similarity scores.
    for token, similarity in model.most_similar(positive=['屏幕']):
        print(token, similarity)