Example #1
def cluster_paragraphs(paragraphs, num_clusters=2):
    # Tokenize each paragraph into a list of words.
    word_lists = make_word_lists(paragraphs)
    # Collect the vocabulary shared by all paragraphs.
    word_set = make_word_set(word_lists)
    # Encode each paragraph as a fixed-length vector over that vocabulary.
    word_vectors = make_word_vectors(word_set, word_lists)

    # Key each original paragraph by the string form of its vector,
    # so clustered vectors can be mapped back to text afterwards.
    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))

    k_means = KMeans(num_clusters, word_vectors)
    k_means.main_loop()
    # Swap the vectors in each cluster for the original paragraph text.
    return translator(k_means.clusters, paragraph_map)
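A minimal usage sketch, assuming the helpers above (make_word_lists, make_word_set, make_word_vectors, KMeans, translator) are available from the same project and that translator returns a list of clusters, each holding the original paragraph strings; the sample paragraphs are made up for illustration.

if __name__ == "__main__":
    # Made-up input: a few short paragraphs to group by topic.
    paragraphs = [
        "Cats purr and nap in the sun most of the afternoon.",
        "Kittens chase string and sleep for hours.",
        "The stock market rallied after the earnings report.",
        "Investors bought shares ahead of the dividend announcement.",
    ]

    clusters = cluster_paragraphs(paragraphs, num_clusters=2)
    for i, cluster in enumerate(clusters):
        print("Cluster %d:" % i)
        for paragraph in cluster:
            print("  " + paragraph)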
Example #2
def cluster_paragraphs(paragraphs):
    word_lists = make_word_lists(paragraphs)  # 2-D list: one word list per paragraph
    # Rejoin each word list into a single string for the TF-IDF step.
    word_lists1 = [" ".join(words) for words in word_lists]
    word_set = make_word_set(word_lists)  # set of all words across the paragraphs
    vec_df = tfidf(word_lists1)
    word_vectors = make_word_vectors(word_set, word_lists)  # encode each paragraph as a fixed-length vector

    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))

    # Pick the number of clusters from the TF-IDF matrix instead of hard-coding it.
    optimum_k = find_optimum_k(vec_df)
    k_means = KMeans(optimum_k, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)
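find_optimum_k and tfidf are project helpers whose implementations are not shown here. As a rough illustration only, the sketch below picks k by silhouette score over a small candidate range using scikit-learn rather than the project's own KMeans; the function name, range, and scoring choice are assumptions, not the original code.

from sklearn.cluster import KMeans as SKKMeans
from sklearn.metrics import silhouette_score

def find_optimum_k_sketch(vec_df, k_range=range(2, 10)):
    # Hypothetical stand-in for find_optimum_k: try each candidate k and keep
    # the one with the best silhouette score. Assumes vec_df has at least
    # k + 1 rows (paragraphs) for every k that gets tried.
    best_k, best_score = 2, -1.0
    for k in k_range:
        labels = SKKMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(vec_df)
        score = silhouette_score(vec_df, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k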