/
word2vec_for_breakdown.py
29 lines (29 loc) · 1.22 KB
/
word2vec_for_breakdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""Train a Word2Vec model on TickTock conversation rating logs.

Reads every conversation log version (v1, v2, v3, v5), tokenizes each
user and bot turn, trains a 100-dimensional Word2Vec model, and pickles
both the deduplicated vocabulary and the raw user utterances for later
breakdown-detection use.
"""
import gensim, logging
import pickle
import readall
import nltk

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Merge all log versions into one dict keyed by conversation id.
# dict.update replaces the Python-2-only `d1.items() + d2.items()` idiom
# (items() returns a view on Python 3, which cannot be concatenated with +).
LOG_ROOT = '/home/ubuntu/zhou/Backend/rating_log'
all_logs = {}
for version in ('v1', 'v2', 'v3', 'v5'):
    all_logs.update(readall.readall('%s/%s' % (LOG_ROOT, version)))

sentences = []   # tokenized turns (both speakers) used to train Word2Vec
user_input = []  # raw, untokenized user utterances

for conv_id in all_logs:
    conv = all_logs[conv_id]["Turns"]
    for turn in conv:
        # Lower-case before tokenizing so the vocabulary is case-insensitive.
        sentences.append(nltk.word_tokenize(conv[turn]["You"].lower()))
        sentences.append(nltk.word_tokenize(conv[turn]["TickTock"].lower()))
        user_input.append(conv[turn]["You"])

# min_count=1 keeps every token (even one-off words) — the logs are small.
# NOTE(review): `size=` is the pre-gensim-4 parameter name; newer gensim
# renamed it to `vector_size` — confirm the installed gensim version.
model = gensim.models.Word2Vec(sentences, size=100, min_count=1)

# Flatten the token lists into a deduplicated vocabulary list.
dictionary = list(set(token for sent in sentences for token in sent))

model.save('/tmp/word2vec_100_break')

# Pickle files must be opened in binary mode ('wb'): text mode breaks on
# Python 3 and risks newline corruption; context managers also ensure the
# handles are closed (the original leaked one via an inline open()).
with open('dictionary_conv.pkl', 'wb') as f:
    pickle.dump(dictionary, f)
with open('user_input_all.pkl', 'wb') as f:
    pickle.dump(user_input, f)