def cluster_paragraphs(paragraphs, num_clusters=2):
    """Group *paragraphs* into *num_clusters* clusters using k-means.

    Each paragraph is tokenized, turned into a fixed-length word vector
    over the corpus vocabulary, and clustered; clusters of vectors are
    then mapped back to the original paragraph texts.

    Args:
        paragraphs: iterable of paragraph strings to cluster.
        num_clusters: number of clusters to produce (default 2).

    Returns:
        Whatever ``translator`` produces from the k-means clusters and
        the vector-string -> paragraph mapping (presumably clusters of
        paragraph texts — confirm against ``translator``'s definition).
    """
    tokenized = make_word_lists(paragraphs)
    vocabulary = make_word_set(tokenized)
    vectors = make_word_vectors(vocabulary, tokenized)
    # Key each paragraph by the string form of its vector so clustered
    # vectors can be translated back into paragraph texts afterwards.
    vector_to_paragraph = dict(zip(map(str, vectors), paragraphs))
    clusterer = KMeans(num_clusters, vectors)
    clusterer.main_loop()
    return translator(clusterer.clusters, vector_to_paragraph)
def cluster_paragraphs(paragraphs):
    """Cluster *paragraphs*, choosing the number of clusters automatically.

    Like the fixed-k variant, but the cluster count is derived from a
    TF-IDF representation of the corpus via ``find_optimum_k``.

    NOTE(review): this redefines ``cluster_paragraphs`` from earlier in
    the file and shadows the two-argument version — rename one of them
    if both are meant to be callable.

    Args:
        paragraphs: iterable of paragraph strings to cluster.

    Returns:
        Whatever ``translator`` produces from the k-means clusters and
        the vector-string -> paragraph mapping (presumably clusters of
        paragraph texts — confirm against ``translator``'s definition).
    """
    # Tokenize each paragraph into a list of words (a 2-D list).
    word_lists = make_word_lists(paragraphs)
    # Re-join the tokens into whitespace-separated documents for TF-IDF.
    # (Replaces the original index loop + append with a comprehension.)
    documents = [" ".join(words) for words in word_lists]
    # Vocabulary: the set of all distinct words across paragraphs.
    word_set = make_word_set(word_lists)
    vec_df = tfidf(documents)
    # One fixed-length vector per paragraph over the vocabulary.
    word_vectors = make_word_vectors(word_set, word_lists)
    # Key each paragraph by the string form of its vector so clustered
    # vectors can be translated back into paragraph texts afterwards.
    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))
    optimum_k = find_optimum_k(vec_df)
    k_means = KMeans(optimum_k, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)