Exemple #1
0
def _sentence_vector(sentence, sent_file, model_file):
    """Return the Sent2Vec vector of ``sentence`` as a list of floats.

    The sentence is written to ``sent_file`` (overwriting it), a ``Sent2Vec``
    model is built over that file with ``model_file`` passed as the word
    model, and the result is exported to ``sent_file + '.vec'``. The vector
    is then parsed back out of that export.

    Raises:
        IndexError: if the exported file has fewer than two lines.
    """
    with open(sent_file, 'w') as handle:
        handle.write(sentence)

    model = Sent2Vec(LineSentence(sent_file), model_file=model_file)
    vec_file = sent_file + '.vec'
    model.save_sent2vec_format(vec_file)

    # Read under a context manager so the handle is closed deterministically
    # before the next call overwrites the same file.
    with open(vec_file) as handle:
        rows = [line.rstrip('\n') for line in handle]

    # The first line of the export is skipped and the first token of the
    # following line is dropped -- presumably a header row and a sentence
    # label; confirm against save_sent2vec_format.
    return [float(value) for value in rows[1].split()[1:]]


def vec_similarity_sentences(sentence1, sentence2):
    """Return the cosine similarity between two sentences' Sent2Vec vectors.

    Args:
        sentence1: Text of the first sentence.
        sentence2: Text of the second sentence.

    Returns:
        ``0`` when either sentence is empty; otherwise ``1`` minus the
        cosine distance between the two sentence vectors.

    Side effects:
        Overwrites ``sent.txt`` and ``sent.txt.vec`` in the current working
        directory, and passes ``test.txt.model`` to ``Sent2Vec`` as the word
        model file.
    """
    if not sentence1 or not sentence2:
        return 0

    input_file = 'test.txt'
    sent_file = 'sent.txt'
    model_file = input_file + '.model'

    sentence1_rep = _sentence_vector(sentence1, sent_file, model_file)
    sentence2_rep = _sentence_vector(sentence2, sent_file, model_file)
    return 1 - spatial.distance.cosine(sentence1_rep, sentence2_rep)
Exemple #2
0
def ExtractSent2Vec(filename):
    """Train word vectors on ``filename`` and derive sentence vectors from them.

    Files written next to ``filename``:
        ``<filename>.model``   -- the saved Word2Vec model
        ``<filename>-01.vec``  -- word vectors (word2vec format)
        ``<filename>-02.vec``  -- sentence vectors (sent2vec format)

    Args:
        filename: Path of the text file read through ``LineSentence``.
    """
    model_path = filename + '.model'

    # Stage 1: word-level model.
    word_model = Word2Vec(
        LineSentence(filename),
        size=512, window=5, sg=0, min_count=5, workers=8)
    word_model.save(model_path)
    word_model.save_word2vec_format(filename + '-01.vec')

    # Stage 2: sentence-level model built on the word model saved above.
    sentence_model = Sent2Vec(LineSentence(filename), model_file=model_path)
    sentence_model.save_sent2vec_format(filename + '-02.vec')
def getTextualFeature(text_reading_path):
    """Build word and sentence vectors for the text file at ``text_reading_path``.

    Saves the Word2Vec model to ``<text_reading_path>.model`` and the
    sentence vectors to ``<text_reading_path>.vec``.

    Args:
        text_reading_path: Path of the text file read through ``LineSentence``.
    """
    word_model_path = text_reading_path + '.model'

    # Word2Vec step. The dimension of the resulting feature vector is
    # controlled by 'size'; change it here if a different length is needed.
    word_model = Word2Vec(
        LineSentence(text_reading_path),
        size=500, window=5, sg=0, min_count=5, workers=8)
    word_model.save(word_model_path)

    # Sentence2Vec step, using the word model saved just above.
    sentence_model = Sent2Vec(
        LineSentence(text_reading_path), model_file=word_model_path)
    sentence_model.save_sent2vec_format(text_reading_path + '.vec')

    # NOTE(review): `program` is not used in the visible part of this
    # function; the snippet may be truncated here -- confirm before removing.
    program = os.path.basename(sys.argv[0])
Exemple #4
0
def initialise_model(data):
    """Train a Word2Vec model on the text extracted from ``data``.

    The text returned by ``get_all_text(data)`` is written to ``test.txt``,
    a Word2Vec model is trained on that file, and the model is saved as
    ``test.txt.model`` with its vectors in ``test.txt.vec``.

    Args:
        data: Passed unchanged to ``get_all_text``.
    """
    input_file = 'test.txt'

    # Build the text before opening the file, so a failure in get_all_text
    # neither truncates an existing corpus file nor leaks an open handle.
    input_txt = get_all_text(data)
    with open(input_file, 'w') as handle:
        handle.write(input_txt)

    model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0, min_count=1, workers=8)
    model.save(input_file + '.model')
    model.save_word2vec_format(input_file + '.vec')
 def sort_vectorized_sentences(self, word2vec_model, sentences, question):
     """Return ``sentences`` ordered by similarity to ``question``, best first.

     The question and the candidate sentences are written, one per line, to
     a scratch file ``my_sent.txt`` in the current working directory. A
     ``Sent2Vec`` model is built over that file and every candidate is
     scored with ``model.similarity(0, i)``.

     Args:
         word2vec_model: Passed to ``Sent2Vec`` as ``model_file``.
         sentences: Sized sequence of unicode candidate sentences.
         question: Unicode question text the candidates are compared to.

     Returns:
         A list of the candidate sentences, highest score first. Ties on
         score are broken by comparing the sentences themselves.
     """
     if len(sentences) == 0:
         # Nothing to rank: skip the scratch file and the model build.
         return []

     sent_file = "my_sent.txt"
     try:
         with io.open(sent_file, 'w', encoding='utf8') as f:
             # Line 0 is the question; lines 1..n are the candidates, so
             # candidate k is compared as similarity(0, k).
             f.write(question + u"\n")
             for sentence in sentences:
                 f.write(sentence + u"\n")
         model = Sent2Vec(LineSentence(sent_file), model_file=word2vec_model)
         sim = [(model.similarity(0, i), sentence)
                for i, sentence in enumerate(sentences, 1)]
     finally:
         # Remove the scratch file even when training or scoring fails.
         if os.path.exists(sent_file):
             os.remove(sent_file)

     sim.sort(reverse=True)
     return [sentence for _, sentence in sim]
Exemple #6
0
# Configure root logging once. The original repeated this basicConfig/info
# pair verbatim: the second basicConfig was a no-op (the root logger already
# had a handler) and the second info() only duplicated the log line.
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

category = 'Diseases_and_disorders'

# Train word vectors on the corpus of the selected category and store the
# model and its vectors next to the corpus file.
#input_file = 'test2.txt'
input_file = '../inputFile/' + category + '.corpus.txt'
model = Word2Vec(LineSentence(input_file),
                 size=50,
                 window=7,
                 sg=0,
                 min_count=3,
                 workers=8)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

# f_wv=codecs.open('../inputFile/word-vec.txt','w','utf-8')
# with open('../inputFile/vocab.txt') as textfile1, open('../inputFile/wordVectors.txt') as textfile2:
#         for x, y in izip(textfile1, textfile2):
#             x = x.strip()
#             y = y.strip()
#             f_wv.write(x+'\t'+y+'\n')
# f_wv.close()
# `path` and `course` are not defined in this part of the file; they are
# expected to be set earlier -- verify against the surrounding script.
input_file = path + '/course.txt'
sent_file = path + '/course.txt'

data = {'course.txt'}  #, 'paper.txt'}

for txt in data:

    sent_file = path + "/" + txt
    tmp_file = path + "/" + txt + ".tmp"
    mod_file = path + "/" + txt + ".model"
    if course != "":
        # Put the query text on the first line, followed by the existing
        # sentences, so the query becomes sentence 0 of the model.
        # Both handles are closed (and therefore flushed) before Sent2Vec
        # reads the file. Plain 'r' replaces 'rU', which Python 3.11+ rejects.
        with open(tmp_file, "a") as f:
            f.write(course + "\n")
            with open(sent_file, 'r') as f1:
                f.write(f1.read())

    # NOTE(review): when `course` is empty nothing is written to tmp_file
    # here, yet it is still the file the model is built from -- confirm
    # whether sent_file was intended in that case.
    try:
        model = Sent2Vec(LineSentence(tmp_file), model_file=mod_file)
    finally:
        # tmp_file is opened in append mode, so a leftover copy from a failed
        # run would be extended on the next run; always remove it.
        if course != "" and os.path.exists(tmp_file):
            os.remove(tmp_file)

    # Map the similarity of each sentence to sentence 0 onto its text.
    # NOTE(review): keys are str(similarity), so sentences with an identical
    # score overwrite each other -- kept as in the original.
    result = {}
    for i in range(0, len(model.sents)):
        result[str(model.similarity(0, i))] = model.sentences[str(i)]

    logging.info("similarity data:")
    # Report near matches (0.8 < score < 1), highest score first.
    for k, v in sorted(result.items(), key=lambda kv: float(kv[0]), reverse=True):
        if 0.8 < float(k) < 1:
            logging.info("      " + v)
Exemple #8
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""

"""

import logging
import sys
import os
from word2vec import Word2Vec, Sent2Vec, LineSentence

# Root logger: timestamp, thread name, level and message at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)
logging.info("running %s", " ".join(sys.argv))

# Train word vectors on the corpus and store the model and its vectors
# alongside it as <input_file>.model and <input_file>.vec.
input_file = 'ieee-deepwalk-2014.txt'
model = Word2Vec(
    LineSentence(input_file),
    size=100, window=5, sg=0, min_count=5, workers=8,
)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')
Exemple #9
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""

"""

import logging
import sys
import os
from word2vec import Word2Vec, Sent2Vec, LineSentence

# Root logger: timestamp, thread name, level and message at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)
logging.info("running %s", " ".join(sys.argv))

# Train word vectors on the source text. The model goes to a fixed file
# name ('S2V.model'); the vectors are written next to the input file.
input_file = 'zh_text_source.csv'
model = Word2Vec(
    LineSentence(input_file),
    size=50,
    window=5,
    sg=0,
    min_count=5,
    workers=8,
)
model.save('S2V.model')
model.save_word2vec_format(input_file + '.vec')

# sent_file = 'TOTAL_LDA_SOURCE.txt'
# model = Sent2Vec(LineSentence(sent_file), model_file=input_file + '.model')
# model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
Exemple #10
0
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""

"""

import logging
import sys
import os
from word2vec import Word2Vec, Sent2Vec, LineSentence

# Root logger: timestamp, thread name, level and message at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)
logging.info("running %s", " ".join(sys.argv))

# input_file = 'TOTAL_LDA_SOURCE.txt'
# model = Word2Vec(LineSentence(input_file), size=50, window=5, sg=0, min_count=5, workers=8)
# model.save(input_file + '.model')
# model.save_word2vec_format(input_file + '.vec')

# Build sentence vectors for the test sentences from an already saved word
# model, and write them next to the sentence file as <sent_file>.vec.
sent_file = '/Users/gaozhipeng/ML/RANK_TEST/TEST/S2V/TEST_S2V_SOURCE.txt'
model = Sent2Vec(LineSentence(sent_file),
                 model_file='/Users/gaozhipeng/ML/RANK_TEST/TEST/S2V/S2V.model')
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
Exemple #11
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""

"""

import logging
import sys
import os
from word2vec import Word2Vec, Sent2Vec, LineSentence

# Root logger: timestamp, thread name, level and message at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)
logging.info("running %s", " ".join(sys.argv))

# input_file = 'zh_text_source.csv'
# model = Word2Vec(LineSentence(input_file), size=50, window=5, sg=0, min_count=5, workers=8)
# model.save('S2V.model')
# model.save_word2vec_format(input_file + '.vec')

# Build sentence vectors for the test sentences from an already saved word
# model, and write them next to the sentence file as <sent_file>.vec.
sent_file = '/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/S2V/TEST_S2V_SOURCE.txt'
model = Sent2Vec(
    LineSentence(sent_file),
    model_file='/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/S2V/S2V.model',
)
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)
Exemple #12
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""

"""

import logging
import sys
import os
from word2vec import Word2Vec, Sent2Vec, LineSentence

# Root logger: timestamp, thread name, level and message at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)
logging.info("running %s", " ".join(sys.argv))

# input_file = 'zh_text_source.csv'
# model = Word2Vec(LineSentence(input_file), size=50, window=5, sg=0, min_count=5, workers=8)
# model.save('S2V.model')
# model.save_word2vec_format(input_file + '.vec')

# Build sentence vectors for the training sentences from the word model
# saved in the current directory, and write them to <sent_file>.vec.
sent_file = 'TRAIN_S2V_SOURCE.txt'
model = Sent2Vec(
    LineSentence(sent_file),
    model_file='./S2V.model',
)
model.save_sent2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s", program)