Esempio n. 1
0
def _eval_cluster(texts, tokenizer, n_clusters):
    """Cluster *texts* into *n_clusters* groups and return a silhouette score.

    The documents are converted to a tf-idf weighted term matrix, partitioned
    with k-means, and the partition quality is estimated on a sample of up to
    1000 documents.
    """
    # Raw term counts -> tf-idf weights.
    counts = CountVectorizer(tokenizer=tokenizer,
                             stop_words=get_stopwords()).fit_transform(texts)
    weights = TfidfTransformer().fit_transform(counts)
    # Fit k-means and score the labelling it produced.
    model = KMeans(n_clusters=n_clusters)
    model.fit(weights)
    return silhouette_score(weights, model.labels_.tolist(), sample_size=1000)
Esempio n. 2
0
def get_cluster_labels(texts, tokenizer, n_clusters):
    """Assign each document in *texts* to one of *n_clusters* k-means clusters.

    Returns a list of integer cluster labels, one per input text.
    """
    print("Clustering %d texts into %d groups ..." % (len(texts), n_clusters))
    # Build the tf-idf document-term matrix from raw term counts.
    counts = CountVectorizer(tokenizer=tokenizer,
                             stop_words=get_stopwords()).fit_transform(texts)
    weights = TfidfTransformer().fit_transform(counts)
    # Cluster the weighted matrix and hand back one label per document.
    model = KMeans(n_clusters=n_clusters)
    model.fit(weights)
    return model.labels_.tolist()
Esempio n. 3
0
def get_cluster_labels(texts, tokenizer, n_clusters):
    """Cluster documents with k-means on tf-idf features.

    A typical document-clustering pipeline: texts -> term-frequency matrix
    -> tf-idf weights -> k-means labels.

    Args:
        texts: iterable of raw document strings.
        tokenizer: callable used by CountVectorizer to split a text into tokens.
        n_clusters: number of clusters to form.

    Returns:
        List of integer cluster labels, one per input text.
    """
    # Fixed: original used the Python 2 print statement, which is a
    # SyntaxError under Python 3 and inconsistent with the rest of the file.
    print("Clustering %d texts into %d groups ..." % (len(texts), n_clusters))
    vectorizer = CountVectorizer(tokenizer=tokenizer,
                                 stop_words=get_stopwords())
    transformer = TfidfTransformer()
    km = KMeans(n_clusters=n_clusters)  # build the clusterer
    # vectorizer.fit_transform(texts) converts the words to a term-frequency
    # matrix: element a[i][j] is the frequency of word j in document i.
    # TfidfTransformer then turns those raw counts into tf-idf weights.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(texts))
    km.fit(tfidf)  # cluster the documents by their tf-idf vectors
    return km.labels_.tolist()  # one cluster label per document