Example #1
	import csv
	import plsa
	import config  # module that exposes list_alltf_result

	n = config.list_alltf_result # list of the tf values of every word in each document

	"Change the saved file name after running this"
	fw = open("/home/matsui-pc/matsui/experiment/plsa_n.csv", "a") # created on the first run, appended to afterwards
	csvWriter = csv.writer(fw) # prepare a writer for the csv file
	for i in range(len(n)):
		csvWriter.writerow(n[i]) # write the file name and features to the csv file
	fw.close()

	# print len(n[0])
	## for checking the output
	# for i in range(len(n)):
	# 	print n[i]
	# print len(n[0])

	p = plsa.plsa(n) # hand the matrix to the plsa function in plsa.py

	print "PLSA計算中"
	p.train() # EMステップを繰り返す

	print "\n"
	# print "*********** 最終的な出力 ************"

	print "P(z) = ",
	print p.pz # P(z)
	print "P(d|z) = ",
	print p.pz_d # P(d|z)
	print "P(w|z) = ",
	print p.pz_w # P(w|z)
	print "P(z|d,w)",
	print p.pdw_z # P(z|d,w)
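
For reference, p.train() above iterates PLSA's EM updates. Since plsa.py itself is not part of this snippet, the following is only a minimal numpy sketch of one EM iteration for the aspect model P(d,w) = sum_z P(z) P(d|z) P(w|z); the function name and array shapes are assumptions, not the actual plsa.py API:

import numpy as np

def plsa_em_step(n, pz, pd_z, pw_z):
    # n    : (D, W) document-word count matrix
    # pz   : (Z,)   P(z)
    # pd_z : (Z, D) P(d|z)
    # pw_z : (Z, W) P(w|z)

    # E-step: P(z|d,w) is proportional to P(z) P(d|z) P(w|z)
    pz_dw = pz[:, None, None] * pd_z[:, :, None] * pw_z[:, None, :]
    pz_dw /= pz_dw.sum(axis=0, keepdims=True) + 1e-12

    # M-step: re-estimate each distribution from the counts weighted by P(z|d,w)
    weighted = n[None, :, :] * pz_dw
    pd_z = weighted.sum(axis=2)
    pd_z /= pd_z.sum(axis=1, keepdims=True) + 1e-12
    pw_z = weighted.sum(axis=1)
    pw_z /= pw_z.sum(axis=1, keepdims=True) + 1e-12
    pz = weighted.sum(axis=(1, 2))
    pz /= pz.sum()
    return pz, pd_z, pw_z, pz_dw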
Example #2
import glob
import os

import pandas as pd

import common_utils
import plsa

# iterate over the files in the directory
# (stop_words_set is defined earlier in the original script)
document_paths = ['./Files/']
documents = []
for document_path in document_paths:
    for document_file in glob.glob(os.path.join(document_path, '*.txt')):
        words, lines = common_utils.split(stop_words_set,
                                          document_file)  # tokenize
        documents.append(words)

vocabulary = common_utils.build_vocabulary(documents)

number_of_topics = 3
max_iterations = 1

topic_word_prob, document_topic_prob = plsa.plsa(number_of_topics,
                                                 max_iterations, documents)

common_utils.print_topic_word_distribution(topic_word_prob, vocabulary,
                                           number_of_topics, 3,
                                           "./topic-word.txt")
common_utils.print_document_topic_distribution(document_topic_prob, documents,
                                               number_of_topics, 3,
                                               "./document-topic.txt")

path_wordsim = './wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'
data_cos = []
data_scalar = []

plsa_matrix = pd.DataFrame(data=topic_word_prob, columns=vocabulary)

consistent_wordsim = common_utils.read_consistent_wordsim(path_wordsim)
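
Judging from data_cos, data_scalar, and the WordSim-353 gold-standard path, what follows the call above is a word-similarity evaluation in which each word is represented by its column of plsa_matrix, i.e. the topic vector (P(w|z_1), ..., P(w|z_K)). Below is a hedged sketch of that loop; the (word1, word2, score) tuple format of consistent_wordsim is an assumption, since common_utils is not shown:

import numpy as np

def cosine(u, v):
    # cosine similarity between two topic-space word vectors
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom > 0 else 0.0

for w1, w2, gold in consistent_wordsim:
    if w1 not in plsa_matrix.columns or w2 not in plsa_matrix.columns:
        continue  # skip pairs outside the PLSA vocabulary
    u = plsa_matrix[w1].values  # column of P(w1|z) over all topics
    v = plsa_matrix[w2].values
    data_cos.append((cosine(u, v), gold))
    data_scalar.append((float(np.dot(u, v)), gold))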
Example #3
import numpy as np
import plsa

# fill the document-word count matrix from the per-document frequencies
# (nD, nW, Ndw, word_freq, and wordID are built earlier in the original script)
for i in range(nD):
    for word in word_freq[i]:
        Ndw[i][wordID[word]] = word_freq[i][word]

print Ndw  # sanity check: the raw document-word counts

#pprint(total_freq)

nZ = 3  # number of latent topics

noise = np.random.rand(nD, nW)  # small random noise so no cell is exactly zero
Pd_z, Pw_z, Pz_d, Pz_w = plsa.plsa(Ndw + noise, nZ, 100)

Y = np.concatenate((Pz_d.T, Pz_w.T))  # stack per-document and per-word topic mixtures as rows


# each row of Y sums to 1 over the nZ = 3 topics, so dropping the last
# column leaves 2-D coordinates suitable for plotting
Y = Y[:, :-1]
#for i in range(len(Y)):
#    Y[i] = Y[i][:2]

#Y = Y[:2]
#print np.shape(Y)

#Y = tsne.tsne(Ndw.T,2,nD)
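
Since each row of Y is a distribution over the three topics, the Y[:, :-1] slice already yields 2-D coordinates, and the commented-out tsne.tsne line shows t-SNE was being considered as an alternative embedding. A minimal plotting sketch under those assumptions (matplotlib is assumed; the original's plotting code is not included in the snippet):

import matplotlib.pyplot as plt

# the first nD rows of Y came from Pz_d.T (one point per document),
# the remaining rows from Pz_w.T (one point per word)
plt.scatter(Y[:nD, 0], Y[:nD, 1], c='red', label='documents P(z|d)')
plt.scatter(Y[nD:, 0], Y[nD:, 1], c='blue', label='words P(z|w)')
plt.legend()
plt.show()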