# scikit-learn imports for the clustering helpers; get_stopwords() is assumed to be defined elsewhere in this module.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import silhouette_score


def _eval_cluster(texts, tokenizer, n_clusters):
    """Cluster texts into n_clusters groups and return the silhouette score."""
    vectorizer = CountVectorizer(tokenizer=tokenizer, stop_words=get_stopwords())
    transformer = TfidfTransformer()
    km = KMeans(n_clusters=n_clusters)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(texts))
    km.fit(tfidf)
    return silhouette_score(tfidf, km.labels_.tolist(), sample_size=1000)
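# Illustrative helper (an assumption, not part of the original source): one way to
# use _eval_cluster is to sweep a range of cluster counts and keep the one with the
# highest silhouette score. The candidate range 2..10 is arbitrary.
def _pick_n_clusters(texts, tokenizer, candidates=range(2, 11)):
    scores = {k: _eval_cluster(texts, tokenizer, k) for k in candidates}
    return max(scores, key=scores.get)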
def get_cluster_labels(texts, tokenizer, n_clusters):
    """Typical document-clustering pipeline: return one cluster label per text."""
    print("Clustering %d texts into %d groups ..." % (len(texts), n_clusters))
    vectorizer = CountVectorizer(tokenizer=tokenizer, stop_words=get_stopwords())
    transformer = TfidfTransformer()
    # Build the clusterer.
    km = KMeans(n_clusters=n_clusters)
    # vectorizer.fit_transform(texts) converts the texts into a term-count matrix
    # whose element a[i][j] is the count of word j in text i; TfidfTransformer
    # then turns those counts into a tf-idf weight matrix.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(texts))
    # Cluster the documents on their tf-idf weight matrix.
    km.fit(tfidf)
    # Return the cluster label assigned to each document.
    return km.labels_.tolist()
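# Usage sketch (illustrative; the sample texts and the whitespace tokenizer are
# assumptions, not from the original source, and get_stopwords() is assumed to
# return stop words appropriate for the corpus): cluster a handful of documents
# and print the label assigned to each one.
if __name__ == "__main__":
    docs = [
        "machine learning with python",
        "deep learning and neural networks",
        "cooking pasta with tomato sauce",
        "baking bread at home",
    ]
    labels = get_cluster_labels(docs, str.split, n_clusters=2)
    print(labels)  # e.g. [0, 0, 1, 1]; cluster ids are arbitrary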