def vec_similarity_sentences(sentence1, sentence2, model_file='test.txt.model'):
    """Return the cosine similarity between two sentences' Sent2Vec embeddings.

    Each sentence is written to a scratch file ('sent.txt'), embedded with a
    pre-trained Sent2Vec model, and the two resulting vectors are compared.

    Parameters:
        sentence1, sentence2: raw sentence strings to compare.
        model_file: path to the trained word2vec model file. The default
            preserves the original hard-coded 'test.txt' + '.model'.

    Returns:
        Cosine similarity (1 - cosine distance), or 0 when either sentence
        is empty.
    """
    if len(sentence1) == 0 or len(sentence2) == 0:
        return 0
    sent_file = 'sent.txt'

    def _embed(sentence):
        # Embed one sentence. `with` guarantees the scratch file is flushed
        # and closed before Sent2Vec reads it (the original left the .vec
        # reader handle open and duplicated this whole sequence twice).
        with open(sent_file, 'w') as f:
            f.write(sentence)
        model = Sent2Vec(LineSentence(sent_file), model_file=model_file)
        model.save_sent2vec_format(sent_file + '.vec')
        # The .vec file has a header line, then "<id> v1 v2 ..." per sentence;
        # skip the header and the leading id token.
        with open(sent_file + '.vec') as f:
            vec_lines = [line.rstrip('\n') for line in f][1:]
        return [float(v) for v in vec_lines[0].split()[1:]]

    return 1 - spatial.distance.cosine(_embed(sentence1), _embed(sentence2))
def ExtractSent2Vec(filename):
    """Train word and sentence embeddings for the corpus in *filename*.

    Side effects (all written next to the input file):
      - '<filename>.model'  : trained Word2Vec model (512-dim, CBOW)
      - '<filename>-01.vec' : word vectors in text format
      - '<filename>-02.vec' : sentence vectors in text format
    """
    model_path = filename + '.model'
    # Word-level pass: 512-dimensional CBOW vectors over the corpus lines.
    word_model = Word2Vec(LineSentence(filename),
                          size=512, window=5, sg=0, min_count=5, workers=8)
    word_model.save(model_path)
    word_model.save_word2vec_format(filename + '-01.vec')
    # Sentence-level pass: infer per-line vectors from the model just trained.
    sent_model = Sent2Vec(LineSentence(filename), model_file=model_path)
    sent_model.save_sent2vec_format(filename + '-02.vec')
def sort_vectorized_sentences(self, word2vec_model, sentences, question):
    """Rank *sentences* by Sent2Vec cosine similarity to *question*.

    Writes the question followed by all sentences to a temp file, builds
    sentence vectors with the given word2vec model, and returns the
    sentences sorted most-similar-first.

    Parameters:
        word2vec_model: path to a trained word2vec model file.
        sentences: list of sentence strings to rank.
        question: the reference sentence.

    Returns:
        The input sentences, reordered by descending similarity.
    """
    sent_file = "my_sent.txt"
    with io.open(sent_file, 'w', encoding='utf8') as f:
        f.write(question + u"\n")
        for sentence in sentences:
            f.write(sentence + u"\n")
    try:
        model = Sent2Vec(LineSentence(sent_file), model_file=word2vec_model)
        # Index 0 is the question; sentences occupy indices 1..len(sentences).
        sim = [(model.similarity(0, i + 1), sentence)
               for i, sentence in enumerate(sentences)]
    finally:
        # Always delete the temp file, even when model construction or the
        # similarity lookup raises (the original leaked it on error).
        os.remove(sent_file)
    # Ascending sort then reverse, matching the original's exact ordering
    # (including how equal-similarity ties fall).
    sim = sorted(sim)[::-1]
    return [tup[1] for tup in sim]
def getTextualFeature(text_reading_path):
    # Train word vectors, then sentence vectors, for the file at
    # `text_reading_path`; results are written next to the input as
    # '<path>.model' and '<path>.vec'.
    # NOTE(review): despite the "get" name nothing is returned — the features
    # only land on disk. Presumably callers read the '.vec' file; confirm.
    # Train and save the Word2Vec model for the text file.
    # The dimension of the resulting feature vector is set by 'size' (500 here).
    model = Word2Vec(LineSentence(text_reading_path), size=500, window=5, sg=0, min_count=5, workers=8)
    model.save(text_reading_path + '.model')
    # Train and save the Sentence2Vec model for the sentence file, reusing the
    # word model trained above.
    model = Sent2Vec(LineSentence(text_reading_path), model_file=text_reading_path + '.model')
    model.save_sent2vec_format(text_reading_path + '.vec')


# NOTE(review): the original formatting makes this statement's scope ambiguous;
# sibling scripts in this source place it at module level after the training
# code, so it is kept at module level here — confirm against the real file.
program = os.path.basename(sys.argv[0])
# Set up INFO-level logging and record the exact command line for this run.
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# Corpus for the current `category` (defined elsewhere in this module).
input_file = '../inputFile/' + category + '.corpus.txt'

# Word-level pass: 50-dimensional CBOW vectors, window 7, min count 3.
# Both the binary model and text-format vectors are saved beside the corpus.
model = Word2Vec(LineSentence(input_file), size=50, window=7, sg=0, min_count=3, workers=8)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

# Sentence-level pass over the same corpus, using the model just trained.
sent_file = input_file
model = Sent2Vec(LineSentence(sent_file), model_file=input_file + '.model')
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
# For each corpus file, prepend the query `course` as sentence 0, build
# sentence vectors, and log every sentence whose similarity to the query
# falls in (0.8, 1).  `path` and `course` are defined elsewhere.
input_file = path + '/course.txt'
sent_file = path + '/course.txt'
data = {'course.txt'}  # , 'paper.txt'}
for txt in data:
    sent_file = path + "/" + txt
    tmp_file = path + "/" + txt + ".tmp"
    mod_file = path + "/" + txt + ".model"
    if course != "":
        # Write the query followed by the whole corpus into a temp copy so
        # the query becomes sentence index 0.  Context managers guarantee
        # the data is flushed to disk before Sent2Vec reads tmp_file — the
        # original left both handles open, so buffered writes could be
        # missing from the file when LineSentence read it.
        with open(tmp_file, "a") as f, open(sent_file, 'rU') as src:
            f.write(course + "\n")
            f.write("".join(src.readlines()))
    model = Sent2Vec(LineSentence(tmp_file), model_file=mod_file)
    if course != "":
        os.remove(tmp_file)
    # NOTE(review): keys are stringified similarity scores, so two sentences
    # with identical scores collide and the earlier one is silently dropped;
    # the sort below is also lexicographic on the string form. Preserved
    # as-is to keep behavior.
    result = {}
    for i in range(0, len(model.sents)):
        result[str(model.similarity(0, i))] = model.sentences[str(i)]
    logging.info("similarity data:")
    for k in sorted(result.keys(), reverse=True):
        v = result[k]
        if 0.8 < float(k) < 1:
            logging.info(" " + v)
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Infer Sent2Vec sentence vectors for the RANK_TEST source file.

Reads the sentence file, embeds every line with a pre-trained S2V model,
and writes the vectors to '<sent_file>.vec'.
"""
import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

# Log at INFO level and record the exact command line for this run.
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# The word2vec model is assumed to be pre-trained (S2V.model); only the
# sentence-vector inference step runs here.
sent_file = '/Users/gaozhipeng/ML/RANK_TEST/TEST/S2V/TEST_S2V_SOURCE.txt'
model = Sent2Vec(
    LineSentence(sent_file),
    model_file='/Users/gaozhipeng/ML/RANK_TEST/TEST/S2V/S2V.model')
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
        # NOTE(review): this fragment is the body of a per-line loop whose
        # header lies outside this chunk; `line`, `y`, `X_text`,
        # `global_labels`, `sent_file` and `w2v_model_file` are presumably
        # defined there — confirm against the full file.
        # Skip blank lines.
        if not line.strip():
            continue
        label_string, text = line.rstrip().split(
            "\t"
        )  # rstrip instead of strip, because preceding tabs are important here
        # Each line is "<comma-separated labels>\t<text>".
        labels = label_string.split(',')
        y.append(labels)
        # Tokenize the text and keep it as a single space-joined string.
        X_text.append(' '.join(nltk.word_tokenize(text)))
        if labels:
            for label in labels:
                global_labels.add(label)

# generate sentence vectors
print('building sentence vectors')
X = []
sentence_model = Sent2Vec(X_text, model_file=w2v_model_file)
for i in range(len(X_text)):
    # convert from numpy array to a plain list of floats
    representation = list(val for val in sentence_model.sents[i])
    X.append(representation)

# generate an svmlight file for each label (one-vs-rest feature dump)
print('writing feature representations')
for label in global_labels:
    if not label:
        continue
    outfile_name = '.'.join([sent_file, label, 'svmlight.features'])
    outfile = open(outfile_name, 'w')
    print('-> ', outfile_name)
    positives = 0
    # NOTE(review): the body of this loop is truncated in this chunk.
    for i in range(len(y)):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Infer Sent2Vec sentence vectors for the Zhihu TEST feature source file.

Embeds every line of the sentence file with a pre-trained S2V model and
writes the vectors to '<sent_file>.vec'.
"""
import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

# Log at INFO level and record the exact command line for this run.
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# The word2vec model (S2V.model) is assumed to be pre-trained; only the
# sentence-vector inference step runs here.
sent_file = '/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/S2V/TEST_S2V_SOURCE.txt'
model = Sent2Vec(LineSentence(sent_file), model_file='/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/S2V/S2V.model')
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Infer Sent2Vec sentence vectors for the TRAIN feature source file.

Embeds every line of TRAIN_S2V_SOURCE.txt with a pre-trained S2V model and
writes the vectors to '<sent_file>.vec'.
"""
import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

# Log at INFO level and record the exact command line for this run.
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# The word2vec model (./S2V.model) is assumed to be pre-trained; only the
# sentence-vector inference step runs here.
sent_file = 'TRAIN_S2V_SOURCE.txt'
model = Sent2Vec(LineSentence(sent_file), model_file='./S2V.model')
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)