Example #1
    def test_cluster_kmean(self):
        self.assertTrue(True)

        model = w2c.get()

        features, X = get_features_X(model)

        n_clusters = 200
        kmean = KMeans(n_clusters=n_clusters)
        labels = kmean.fit_predict(X)

        centers = kmean.cluster_centers_

        cluster_features = defaultdict(list)
        cluster_X = defaultdict(list)
        cluster_centers = dict()

        for f, l, x in zip(features, labels, X):
            cluster_features[l].append(f)
            cluster_X[l].append(x)

        for l in cluster_X:
            # pick the feature word whose vector is closest (by cosine similarity) to the KMeans centroid
            i = np.argmax(cosine_similarity(centers[l].reshape(1, -1), cluster_X[l]))
            cluster_centers[l] = cluster_features[l][i]

        with open(
                os.path.join(RESOURCE_DIR, 'cluster', 'cluster.txt'),
                'w', encoding='utf-8') as f:
            for label in cluster_features:
                f.write('%s --- %s\n' % (cluster_centers[label], ' '.join(
                    cluster_features[label])))
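These tests lean on a get_features_X helper that is not shown on this page. A minimal sketch of what it presumably does, assuming a gensim-style word2vec model and a plain-text file of feature words (the file name, its location and the helper signature are assumptions):

import os
import numpy as np

def get_features_X(model, feature_file=os.path.join('resource', 'feature.txt')):
    """Hypothetical helper: read feature words and stack their word2vec vectors.

    Only words found in the model vocabulary are kept so that `features`
    and `X` stay index-aligned. The default path is an assumption.
    """
    with open(feature_file, encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    features = [w for w in words if w in model]
    X = np.array([model[w] for w in features])
    return features, X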
Example #2
def create2(features, n_clusters=200):
    """Cluster feature words with KMeans, write one representative word per cluster, and return the clusters."""
    logger.info('cluster features...')

    model = w2c.get()

    features = filter_features(features, model)
    X = get_X(features, model)

    kmean = KMeans(n_clusters=n_clusters)
    labels = kmean.fit_predict(X)

    centers = kmean.cluster_centers_

    cluster_features = defaultdict(list)
    cluster_X = defaultdict(list)
    cluster_centers = dict()

    for f, l, x in zip(features, labels, X):
        cluster_features[l].append(f)
        cluster_X[l].append(x)

    for l in cluster_X:
        # pick the feature word whose vector is closest (by cosine similarity) to the KMeans centroid
        i = np.argmax(cosine_similarity(centers[l].reshape(1, -1), cluster_X[l]))
        cluster_centers[l] = cluster_features[l][i]

    with open(os.path.join(RESOURCE_DIR, 'cluster', 'cluster.txt'), 'w', encoding='utf-8') as f:
        for label in cluster_features:
            f.write('%s --- %s\n' % (cluster_centers[label], ' '.join(cluster_features[label])))

    return cluster_features
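A minimal usage sketch for create2, assuming the feature words have already been extracted (the word list below is a placeholder, not data from the project):

# Hypothetical invocation with a handful of placeholder feature words.
features = ['屏幕', '电池', '续航', '外观', '手感', '价格']
cluster_features = create2(features, n_clusters=3)
for label, words in cluster_features.items():
    print('%s: %s' % (label, ' '.join(words)))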
Example #3
    def test_cluster_h(self):
        self.assertTrue(True)

        model = w2c.get()

        features, X = get_features_X(model)
        for f in features:
            print(f)
Example #4
    def test_ap_features2(self):
        self.assertTrue(True)

        model = w2c.get()

        features, X = get_features_X(model)

        ap = AffinityPropagation(preference=-50)
        labels = ap.fit_predict(X)

        n_cluster = len(ap.cluster_centers_indices_)
        print('n_cluster: ', n_cluster)

        # for c in ap.cluster_centers_indices_:
        #     print(c, features[c])

        # visualize the clustering result
        import matplotlib.pyplot as plt
        from itertools import cycle
        from sklearn.manifold import TSNE

        plt.close('all')  # close all open figures
        plt.figure(1)  # create a new figure
        plt.clf()  # clear the current figure

        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        colors = [c for _, c in zip(list(range(n_cluster)), colors)]
        print(len(colors))
        print(colors)

        tsne = TSNE(n_components=2)
        X_2d = tsne.fit_transform(X)

        for x, l in zip(X_2d, labels):
            plt.scatter(x[0], x[1], c=colors[l])
        '''
        for k, col in zip(range(n_cluster), colors):
            # labels == k compares k against every value in the labels array;
            # e.g. labels = [1, 0] and k = 0 gives [False, True]

            class_members = labels == k
            cluster_center = X_2d[ap.cluster_centers_indices_[k]]  # coordinates of the cluster exemplar

            plt.plot(X_2d[class_members, 0], X_2d[class_members, 1], col + '.')
            plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                     markeredgecolor='k', markersize=14)
            for x in X_2d[class_members]:
                plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
        '''

        plt.title('Predicted number of cluster centers: %d' % n_cluster)
        plt.show()
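The disabled block above follows the classic AffinityPropagation plotting recipe; for reference, a self-contained, working version of that pattern on synthetic data (purely illustrative, not tied to the word2vec features used in the tests):

import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

X_2d, _ = make_blobs(n_samples=100, centers=3, random_state=0)
ap = AffinityPropagation(preference=-50).fit(X_2d)
labels = ap.labels_
n_cluster = len(ap.cluster_centers_indices_)

plt.figure(1)
plt.clf()
for k, col in zip(range(n_cluster), cycle('bgrcmyk')):
    class_members = labels == k
    cluster_center = X_2d[ap.cluster_centers_indices_[k]]
    # cluster members as dots, the exemplar as a large circle, plus spokes to the exemplar
    plt.plot(X_2d[class_members, 0], X_2d[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in X_2d[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Predicted number of cluster centers: %d' % n_cluster)
plt.show()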
Example #5
    def run(self, pinglun_file, O_seeds):
        """
        提取特征词/评价词
        :param pinglun_file: 评论文本
        :param O_seeds: 种子评价词
        :return:
        """
        logger.info('pipeline run...')

        if not os.path.exists(self._clean_file):
            logger.info('cleaning text')
            clean.clean_file(pinglun_file, self._clean_file)

        if not os.path.exists(self._relation_file):
            logger.info('syntactic parsing')
            relation_parse.parse(self._clean_file, self._relation_file)

        logger.info('extracting feature/opinion words with the double propagation algorithm')
        S = self._iter_sentences_relations(self._relation_file)
        F, O, fcounter, ocounter, rcount = double_propagation.extract(
            O_seeds, S)

        utils.write_file(self._dp_f_file, F)
        utils.write_file(self._dp_o_file, O)
        utils.save_obj(fcounter, self._dp_f_counter)
        utils.save_obj(ocounter, self._dp_o_counter)

        logger.info('pruning feature/opinion words')
        F, O = prune.prune(F, O, fcounter, ocounter, rcount, self._threshold)

        utils.write_file(self._prune_f_file, F)
        utils.write_file(self._prune_o_file, O)

        if not os.path.exists(self._word2vec_file):
            logger.info('training word2vec model')
            T = self._iter_sentences_tokens(self._relation_file)
            w2c.train(T, self._word2vec_file)

        model = w2c.get(self._word2vec_file)

        logger.info('clustering feature words')
        cf = cluster.create(F, model, preference=-30)
        features = ['%s %s' % (cls, ' '.join(cf[cls])) for cls in cf]
        utils.write_file(self._feature_file, features)

        logger.info('clustering opinion words')
        O = utils.read_file(self._prune_o_file)
        of = cluster.create(O, model, preference=None)
        opinions = ['%s %s' % (cls, ' '.join(of[cls])) for cls in of]
        utils.write_file(self._opinion_file, opinions)

        logger.info('pipeline over.')

        return cf, of, F, O
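A minimal sketch of how this pipeline method might be driven; the class name, constructor, input path and seed opinion words below are assumptions and do not appear in the snippet:

# Hypothetical driver: class name, path and seed words are placeholders.
pipeline = Pipeline()                          # assumed constructor
O_seeds = {'好', '不错', '差'}                  # seed opinion words
cf, of, F, O = pipeline.run('data/pinglun.txt', O_seeds)
print('feature clusters: %d, opinion clusters: %d' % (len(cf), len(of)))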
Example #6
    def test_dbscan(self):
        self.assertTrue(True)

        features, X = get_features_X(w2c.get())

        # the 'precomputed' metric expects a distance matrix, so convert cosine similarity to distance
        X = np.clip(1 - cosine_similarity(X), 0, None)

        eps = 0.5
        dbscan = DBSCAN(metric='precomputed', eps=eps)

        labels = dbscan.fit_predict(X)
        print(labels)
        # exclude the DBSCAN noise label (-1) from the cluster count
        n_cluster = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps: %f, n_cluster: %d' % (eps, n_cluster))
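Since the 'precomputed' metric treats X as distances, the usual pattern is to feed DBSCAN a cosine-distance matrix and sweep eps; a self-contained sketch on synthetic vectors (the data below is random, purely illustrative):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(0)
vectors = rng.rand(50, 16)
# cosine distance, clipped at 0 to guard against tiny negative floating-point values
distances = np.clip(1 - cosine_similarity(vectors), 0, None)

for eps in (0.1, 0.3, 0.5):
    labels = DBSCAN(metric='precomputed', eps=eps).fit_predict(distances)
    n_cluster = len(set(labels)) - (1 if -1 in labels else 0)   # -1 marks noise
    print('eps: %.1f, n_cluster: %d' % (eps, n_cluster))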
Example #7
    def test_ap_features(self):
        self.assertTrue(True)

        from collections import defaultdict

        model = w2c.get()

        features, X = get_features_X(model)

        # preference must be negative; the larger (less negative) it is, the more clusters are produced
        ap = AffinityPropagation(preference=-30)
        labels = ap.fit_predict(X)

        centers = dict()
        for label, index in enumerate(ap.cluster_centers_indices_):
            centers[label] = features[index]

        clusters = defaultdict(set)
        for label, feature in zip(labels, features):
            clusters[label].add(feature)

        for label in centers:
            print('%s --- %s' % (centers[label], ' '.join(clusters[label])))
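To see how preference controls the number of clusters, a small sweep on synthetic data (purely illustrative, not part of the original tests):

from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=80, centers=4, random_state=1)
for preference in (-200, -50, -10):
    ap = AffinityPropagation(preference=preference).fit(X_demo)
    # a larger (less negative) preference generally yields more clusters
    print('preference: %d, n_cluster: %d' % (preference, len(ap.cluster_centers_indices_)))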
Example #8
    def test_word2vec_vec(self):
        self.assertTrue(True)

        model = w2v.get()
        print(model['屏幕'])
Example #9
    def test_word2vec_model(self):
        self.assertTrue(True)

        model = w2v.get()
        for token, similarity in model.most_similar(positive=['屏幕']):
            print(token, similarity)
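Examples #8 and #9 index the model object directly, which matches the older gensim API; with gensim 4.x the same lookups typically go through the model's KeyedVectors. A minimal sketch, assuming a saved model file (the path is an assumption):

from gensim.models import Word2Vec

model = Word2Vec.load('resource/word2vec.model')   # hypothetical path
print(model.wv['屏幕'])                             # raw word vector
for token, similarity in model.wv.most_similar(positive=['屏幕']):
    print(token, similarity)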