def _sentence_vector(sentence, sent_file, model_file):
    """Embed ``sentence`` with Sent2Vec and return its vector as floats.

    The sentence is written to ``sent_file``, a ``Sent2Vec`` model is built
    from that file using the word model stored at ``model_file``, and the
    result is exported to ``sent_file + '.vec'``. The vector is then parsed
    back from that export.

    Args:
        sentence: Text to embed; written to ``sent_file`` verbatim.
        sent_file: Path of the scratch file that holds the sentence.
        model_file: Path of the saved word model handed to ``Sent2Vec``.

    Returns:
        list[float]: The numeric fields of the first data line of the export.

    Raises:
        IndexError: If the export has no line after its first one.
    """
    with open(sent_file, 'w') as handle:
        handle.write(sentence)

    model = Sent2Vec(LineSentence(sent_file), model_file=model_file)
    vec_file = sent_file + '.vec'
    model.save_sent2vec_format(vec_file)

    # The first line of the export is skipped (presumably a header -- TODO
    # confirm), and the first field of the data line is dropped (presumably a
    # sentence label) so only the numeric components remain.
    with open(vec_file) as handle:
        rows = [line.rstrip('\n') for line in handle][1:]
    return [float(value) for value in rows[0].split()[1:]]


def vec_similarity_sentences(sentence1, sentence2):
    """Return the cosine similarity between two sentences' Sent2Vec vectors.

    Both sentences are embedded through the scratch file ``sent.txt`` using
    the word model saved at ``test.txt.model``. ``sent.txt`` and
    ``sent.txt.vec`` are overwritten on every call and left on disk.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        ``0`` if either sentence is empty (or ``None``); otherwise
        ``1 - cosine_distance`` of the two sentence vectors.
    """
    if not sentence1 or not sentence2:
        return 0

    input_file = 'test.txt'
    sent_file = 'sent.txt'
    model_file = input_file + '.model'

    sentence1_rep = _sentence_vector(sentence1, sent_file, model_file)
    sentence2_rep = _sentence_vector(sentence2, sent_file, model_file)
    return 1 - spatial.distance.cosine(sentence1_rep, sentence2_rep)
def ExtractSent2Vec(filename):
    """Train word and sentence models on ``filename`` and save them to disk.

    Files written (all derived from ``filename``):
        ``<filename>.model``   -- the saved Word2Vec model
        ``<filename>-01.vec``  -- Word2Vec export (``save_word2vec_format``)
        ``<filename>-02.vec``  -- Sent2Vec export (``save_sent2vec_format``)

    Args:
        filename: Path of the text file used as the corpus for both models.
    """
    model_path = filename + '.model'

    # Word-level model first; the sentence model loads it from model_path.
    word_model = Word2Vec(
        LineSentence(filename),
        size=512,
        window=5,
        sg=0,
        min_count=5,
        workers=8,
    )
    word_model.save(model_path)
    word_model.save_word2vec_format(filename + '-01.vec')

    sentence_model = Sent2Vec(LineSentence(filename), model_file=model_path)
    sentence_model.save_sent2vec_format(filename + '-02.vec')
def getTextualFeature(text_reading_path):
    """Train Word2Vec and Sent2Vec models on a text file and save both.

    Writes ``<text_reading_path>.model`` (the Word2Vec model) and
    ``<text_reading_path>.vec`` (the Sent2Vec export).

    Note: the dimension of the resulting feature vector can be changed by
    modifying the value of ``size`` below.

    Args:
        text_reading_path: Path of the text file used as the corpus.
    """
    word_model_path = text_reading_path + '.model'

    # Train and save the Word2Vec model for the text file.
    word_model = Word2Vec(
        LineSentence(text_reading_path),
        size=500,
        window=5,
        sg=0,
        min_count=5,
        workers=8,
    )
    word_model.save(word_model_path)

    # Train and save the Sentence2Vec model for the sentence file.
    sentence_model = Sent2Vec(
        LineSentence(text_reading_path), model_file=word_model_path)
    sentence_model.save_sent2vec_format(text_reading_path + '.vec')

    # Script name taken from argv; not used further in the visible code.
    program = os.path.basename(sys.argv[0])
def initialise_model(data):
    """Train a Word2Vec model on the text extracted from ``data`` and save it.

    The text returned by ``get_all_text(data)`` is written to ``test.txt``,
    which is then used as the training corpus. The trained model is saved to
    ``test.txt.model`` and exported to ``test.txt.vec``.

    Args:
        data: Passed unchanged to ``get_all_text``.
    """
    input_file = 'test.txt'

    # Build the text before opening the file, so a failure in get_all_text()
    # does not leave a truncated corpus file behind.
    input_txt = get_all_text(data)
    with open(input_file, 'w') as f:
        f.write(input_txt)

    model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0,
                     min_count=1, workers=8)
    model.save(input_file + '.model')
    model.save_word2vec_format(input_file + '.vec')
def sort_vectorized_sentences(self, word2vec_model, sentences, question):
    """Return ``sentences`` ordered from most to least similar to ``question``.

    The question and the sentences are written, one per line, to the scratch
    file ``my_sent.txt`` (question first, so it is sentence 0). A ``Sent2Vec``
    model is built from that file and ``model.similarity(0, i)`` is used to
    score each sentence against the question.

    Args:
        word2vec_model: Passed to ``Sent2Vec`` as ``model_file``.
        sentences: Sequence of text sentences to rank.
        question: Text the sentences are compared against.

    Returns:
        list: The sentences sorted by descending similarity; ties are broken
        by descending sentence text, as with the tuple sort.
    """
    sent_file = "my_sent.txt"
    try:
        with io.open(sent_file, 'w', encoding='utf8') as f:
            f.write(question + u"\n")
            for sentence in sentences:
                f.write(sentence + u"\n")

        model = Sent2Vec(LineSentence(sent_file), model_file=word2vec_model)
        # Index 0 is the question; sentence k sits at index k + 1.
        sim = [(model.similarity(0, i), sentence)
               for i, sentence in enumerate(sentences, 1)]
    finally:
        # Always remove the scratch file, even if the model step failed.
        if os.path.exists(sent_file):
            os.remove(sent_file)

    sim = sorted(sim, reverse=True)
    return [tup[1] for tup in sim]
# Train a Word2Vec model on the corpus of one category and save it next to
# the corpus file as '<corpus>.model' and '<corpus>.vec'.

# Configure logging once. The original repeated this call and the start-up
# message, which logged the "running ..." line twice while the second
# basicConfig() had no effect.
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

category = 'Diseases_and_disorders'

#input_file = 'test2.txt'
input_file = '../inputFile/' + category + '.corpus.txt'

model = Word2Vec(LineSentence(input_file), size=50, window=7, sg=0,
                 min_count=3, workers=8)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

# Disabled step: join vocab.txt and wordVectors.txt line by line into
# word-vec.txt (tab-separated).
# f_wv=codecs.open('../inputFile/word-vec.txt','w','utf-8')
# with open('../inputFile/vocab.txt') as textfile1, open('../inputFile/wordVectors.txt') as textfile2:
#     for x, y in izip(textfile1, textfile2):
#         x = x.strip()
#         y = y.strip()
#         f_wv.write(x+'\t'+y+'\n')
# f_wv.close()
# For each data file: prepend `course` to a temporary copy of the file, build
# a Sent2Vec model from it, and log every sentence whose similarity to
# sentence 0 lies strictly between 0.8 and 1.
# NOTE(review): `path` and `course` are not defined in this fragment; they are
# presumably set earlier in the file -- confirm.
input_file = path + '/course.txt'
sent_file = path + '/course.txt'
data = {'course.txt'}  #, 'paper.txt'}

for txt in data:
    sent_file = path + "/" + txt
    tmp_file = path + "/" + txt + ".tmp"
    mod_file = path + "/" + txt + ".model"

    if course != "":
        # Close (and therefore flush) the temp file before the model reads
        # it; the original left the handle open, so buffered text might not
        # have reached disk when LineSentence opened the file.
        # NOTE(review): the 'U' mode flag was removed in Python 3.11 -- switch
        # to 'r' if this must run on newer interpreters.
        with open(tmp_file, "a") as f:
            f.write(course + "\n")
            with open(sent_file, 'rU') as f1:
                f.write(f1.read())

    # NOTE(review): tmp_file is read even when `course` is empty, in which
    # case this fragment never creates it -- confirm intended behaviour.
    model = Sent2Vec(LineSentence(tmp_file), model_file=mod_file)

    if course != "":
        os.remove(tmp_file)

    # Keyed by the similarity rendered as a string, so sentences with the
    # same similarity overwrite one another (kept from the original).
    result = {}
    for i in range(len(model.sents)):
        result[str(model.similarity(0, i))] = model.sentences[str(i)]

    logging.info("similarity data:")
    for k in sorted(result, reverse=True):
        v = result[k]
        if 0.8 < float(k) < 1:
            logging.info(" " + v)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Train a Word2Vec model on 'ieee-deepwalk-2014.txt' and save it.

Writes '<input>.model' (the saved model) and '<input>.vec' (the
word2vec-format export) next to the input file.
"""

import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

_LOG_FORMAT = '%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'

logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

input_file = 'ieee-deepwalk-2014.txt'

model = Word2Vec(
    LineSentence(input_file),
    size=100,
    window=5,
    sg=0,
    min_count=5,
    workers=8,
)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Train a Word2Vec model on 'zh_text_source.csv'.

The model is saved as 'S2V.model'; the word2vec-format export is written to
'<input>.vec'. The Sent2Vec step is present but disabled.
"""

import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

_LOG_FORMAT = '%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'

logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

input_file = 'zh_text_source.csv'

model = Word2Vec(
    LineSentence(input_file),
    size=50,
    window=5,
    sg=0,
    min_count=5,
    workers=8,
)
# The model goes to a fixed name rather than '<input>.model'.
model.save('S2V.model')
model.save_word2vec_format(input_file + '.vec')

# Disabled Sent2Vec step:
# sent_file = 'TOTAL_LDA_SOURCE.txt'
# model = Sent2Vec(LineSentence(sent_file), model_file=input_file + '.model')
# model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Build Sent2Vec vectors for TEST_S2V_SOURCE.txt from a saved S2V.model.

The export is written to '<sent_file>.vec'. The Word2Vec training step is
present but disabled.
"""

import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

_LOG_FORMAT = '%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'

logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# Disabled Word2Vec training step:
# input_file = 'TOTAL_LDA_SOURCE.txt'
# model = Word2Vec(LineSentence(input_file), size=50, window=5, sg=0, min_count=5, workers=8)
# model.save(input_file + '.model')
# model.save_word2vec_format(input_file + '.vec')

sent_file = '/Users/gaozhipeng/ML/RANK_TEST/TEST/S2V/TEST_S2V_SOURCE.txt'

model = Sent2Vec(
    LineSentence(sent_file),
    model_file='/Users/gaozhipeng/ML/RANK_TEST/TEST/S2V/S2V.model',
)
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Build Sent2Vec vectors for the TEST_S2V_SOURCE.txt feature file.

Uses the S2V.model stored in the same directory and writes the export to
'<sent_file>.vec'. The Word2Vec training step is present but disabled.
"""

import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

_LOG_FORMAT = '%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'

logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# Disabled Word2Vec training step:
# input_file = 'zh_text_source.csv'
# model = Word2Vec(LineSentence(input_file), size=50, window=5, sg=0, min_count=5, workers=8)
# model.save('S2V.model')
# model.save_word2vec_format(input_file + '.vec')

sent_file = '/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/S2V/TEST_S2V_SOURCE.txt'

model = Sent2Vec(
    LineSentence(sent_file),
    model_file='/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/S2V/S2V.model',
)
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Build Sent2Vec vectors for 'TRAIN_S2V_SOURCE.txt' from './S2V.model'.

The export is written to '<sent_file>.vec'. The Word2Vec training step is
present but disabled.
"""

import logging
import sys
import os

from word2vec import Word2Vec, Sent2Vec, LineSentence

_LOG_FORMAT = '%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'

logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# Disabled Word2Vec training step:
# input_file = 'zh_text_source.csv'
# model = Word2Vec(LineSentence(input_file), size=50, window=5, sg=0, min_count=5, workers=8)
# model.save('S2V.model')
# model.save_word2vec_format(input_file + '.vec')

sent_file = 'TRAIN_S2V_SOURCE.txt'

model = Sent2Vec(LineSentence(sent_file), model_file='./S2V.model')
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)