Ejemplo n.º 1
0
# Accumulators: sentences that end up with no in-vocabulary token, and the
# labels that belong to the kept / dropped sentences respectively.
labels_left_out = []
labels_samples = []
left_out = []

# Build one lemmatized, stopword-free string per sentence; these strings are
# only used to fit the vectorizer's vocabulary.
lemmatized = []
for sentence in sentences:
	kept = (lem.lemmatize(word) for word in sentence.split() if word not in stopwords)
	lemmatized.append(' '.join(kept))

# Only the fitted vocabulary is used below; the returned matrix is discarded.
vect.fit_transform(lemmatized)

# NOTE(review): the vocabulary was fit on lemmatized text, but the filter
# below tests the *raw* tokens against it — confirm this is intended.
# Index-based access is kept on purpose: the container types of `sentences`
# and `labels` are not visible here.
for idx in range(len(sentences)):
	raw = sentences[idx]
	tokens = [word for word in raw.split() if word in vect.vocabulary_]
	if not tokens:
		# Nothing left to cluster on: set the sentence aside with its label.
		left_out.append(raw)
		labels_left_out.append(labels[idx])
		continue
	samples.append(tokens)
	samples_raw.append(raw)
	labels_samples.append(labels[idx])
print(len(vect.vocabulary_))

# Fit GSDMM on the token lists, passing the vocabulary size as second argument.
mgp = MovieGroupProcess(K=40, alpha=0.1, beta=0.1, n_iters=30)
mgp.fit(samples,len(vect.vocabulary_))

# One prediction per clustered sample; pad with a "Left out" marker so the
# column length matches the number of sentences (the dropped sentences are
# appended after the kept ones in the frame below).
results = list(map(mgp.choose_best_label, samples))
missing = len(sentences) - len(results)
if missing > 0:
	results += ["Left out"] * missing

frame = pd.DataFrame({
	'Sentence': samples_raw + left_out,
	'Label': labels_samples + labels_left_out,
	'Cluster': results,
})
frame.to_csv('clustering_results.csv')
Ejemplo n.º 2
0
    print("data_list: ")
    print(data_list)
    V = compute_V(data_list)
    mgp = MovieGroupProcess(K=50, n_iters=1000, alpha=0.02, beta=0.01)
    print("GSDMM算法开始!")
    y = mgp.fit(data_list, V)
    print("GSDMM算法结束!")

    # Build the clustering `result` dict and write it to the cluster_result
    # file. Line N of the aid_bid file is used as the identifier of
    # data_list[N].
    with codecs.open(mydir + 'result_file/aid_bid') as id_file:
        aid_bid = [line_content.strip() for line_content in id_file]

    result = {}
    # `with` guarantees the output is flushed and closed even if labelling
    # raises part-way through (the handle was previously never closed).
    with open(mydir + 'result_file/cluster_result', 'w', encoding='UTF-8') as f5:
        for index, document in enumerate(data_list):
            z = mgp.choose_best_label(document)
            doc_id = "%s" % aid_bid[index]
            # Only the first element of the returned value is stored/written.
            f5.write(doc_id + " " + "%s" % z[0] + "\n")
            result[doc_id] = z[0]
    print(result)

    # Post-process result: write each document's text into a directory named
    # after its cluster, skipping entries whose cluster value is -1.
    # NOTE(review): `dir` shadows the builtin; the name is kept because code
    # past the visible part of this function may still read it.
    dir = ''
    for key, cluster in result.items():
        if cluster == -1:
            continue
        dir = mydir + 'cluster_result_text/' + "%s" % cluster
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(dir, exist_ok=True)
        with open(dir + '/' + "%s" % key, 'w', encoding='UTF-8') as f6:
            f6.write(texts["%s" % key])