def testWord2vec(self):
    """Exercise the word-level word2vec model: nearest neighbours,
    odd-one-out detection and pairwise/set similarity scores."""
    base_dir = os.path.dirname(__file__)
    model = Word2Vector.get_word_model(
        os.path.join(base_dir, "./model/word2vec_word.word2vec.model"))

    # Single-word query; default neighbour count is 10.
    neighbours = model.most_similar([u"香辣蟹"])
    self.assertEqual(neighbours[0][0], u"椒盐")
    self.assertEqual(neighbours[1][0], u"排档")
    self.assertEqual(len(neighbours), 10)

    # Multi-word query with an explicit topn.
    neighbours = model.most_similar([u"香辣蟹", u"啤酒"], topn=5)
    self.assertEqual(neighbours[0][0], u"椒盐")
    self.assertEqual(neighbours[1][0], u"菜品")
    self.assertEqual(len(neighbours), 5)

    # Odd-one-out among four words.
    odd = model.doesnt_match([u"香辣蟹", u"啤酒", u"椒盐", u"地铁站"])
    self.assertEqual(odd, u"地铁站")

    # Cosine similarity between two words, pinned to a narrow band.
    cos = model.similarity(u"椒盐", u"香辣蟹")
    self.assertTrue(0.996436 < cos < 0.996437)

    # Similarity between two word sets.
    sim = model.n_similarity([u"椒盐", u"香辣蟹"], [u"地铁站", u"出去"])
    self.assertTrue(0.980 < sim < 0.981)
# Script chunk: load a phrase-level word2vec model and build a mapping from
# known tags (ktag) to the user tags (utag) already labelled with them.
# Python 2 style code: lines read from files are byte strings and are
# decoded with str.decode("utf-8").
from _collections import defaultdict
from pynlpini import Word2Vector
from pipe import select, sort, as_list
import os
import logging

logging.basicConfig(
    format=
    '%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s',
    level=logging.INFO)

# Input/output CSV paths, resolved relative to the working directory
# (not this file) — presumably the script is run from its own directory.
labelled_word_path = "../../../data/app/word_correlate_tagger/labelled_word.csv"
tag_list_path = "../../../data/app/word_correlate_tagger/tag.csv"
unlabelled_word_path = "../../../data/app/word_correlate_tagger/unlabelled_word.csv"
result_path = "../../../data/app/word_correlate_tagger/tag_result.csv"

model = Word2Vector.get_phrase_model()
# Vocabulary of the phrase model; used to filter out unknown user tags.
vocabs = set(model.vocab.keys())

# Known tag -> set of user tags labelled with it.
ktag_to_utags = defaultdict(set)
if os.path.exists(labelled_word_path):
    with open(labelled_word_path) as labelled_tag_file:
        logging.info("Processing " + labelled_word_path)
        for line in labelled_tag_file:
            line = line.strip().decode("utf-8")
            # Expect "utag<TAB>ktag"; any other shape is silently skipped.
            if len(line.split("\t")) == 2:
                utag = line.split("\t")[0]
                ktag = line.split("\t")[1]
                # Keep only user tags the model has a vector for.
                if utag in vocabs:
                    ktag_to_utags[ktag].add(utag)
if os.path.exists(tag_list_path):
    with open(tag_list_path) as tag_list_file:
        # NOTE(review): chunk is truncated here — the body of this `with`
        # continues outside the visible source; do not treat as complete.
def phrase2vec(txt, topn):
    """Return the top-N phrases most similar to the words in *txt*.

    *txt* is a whitespace-separated string of query words; the result is a
    JSON-encoded list of (phrase, score) pairs with non-ASCII text kept
    readable (ensure_ascii=False).
    """
    global phrase2vector
    if phrase2vector is None:
        # Lazy-load the phrase model on first use.
        phrase2vector = Word2Vector.get_phrase_model()
    tokens = txt.split()
    neighbours = phrase2vector.most_similar(tokens, topn=topn)
    return json.dumps(neighbours, ensure_ascii=False)
def word2vec(txt, topn):
    """Return the top-N words most similar to the words in *txt*.

    *txt* is a whitespace-separated string of query words; the result is a
    JSON-encoded list of (word, score) pairs with non-ASCII text kept
    readable (ensure_ascii=False).
    """
    global word2vector
    if word2vector is None:
        # Lazy-load the word model on first use.
        word2vector = Word2Vector.get_word_model()
    tokens = txt.split()
    neighbours = word2vector.most_similar(tokens, topn=topn)
    return json.dumps(neighbours, ensure_ascii=False)
# Script chunk (near-duplicate of an earlier chunk in this file): load a
# phrase-level word2vec model and build a known-tag -> user-tags mapping.
# Python 2 style code: file lines are byte strings decoded via str.decode.
from _collections import defaultdict
from pynlpini import Word2Vector
from pipe import select, sort, as_list
import os
import logging

logging.basicConfig(format='%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s', level=logging.INFO)

# Input/output CSV paths, resolved relative to the working directory.
labelled_word_path = "../../../data/app/word_correlate_tagger/labelled_word.csv"
tag_list_path = "../../../data/app/word_correlate_tagger/tag.csv"
unlabelled_word_path = "../../../data/app/word_correlate_tagger/unlabelled_word.csv"
result_path = "../../../data/app/word_correlate_tagger/tag_result.csv"

model = Word2Vector.get_phrase_model()
# Vocabulary of the phrase model; used to filter out unknown user tags.
vocabs = set(model.vocab.keys())

# Known tag -> set of user tags labelled with it.
ktag_to_utags = defaultdict(set)
if os.path.exists(labelled_word_path):
    with open(labelled_word_path) as labelled_tag_file:
        logging.info("Processing " + labelled_word_path)
        for line in labelled_tag_file:
            line = line.strip().decode("utf-8")
            # Expect "utag<TAB>ktag"; any other shape is silently skipped.
            if len(line.split("\t")) == 2:
                utag = line.split("\t")[0]
                ktag = line.split("\t")[1]
                # Keep only user tags the model has a vector for.
                if utag in vocabs:
                    ktag_to_utags[ktag].add(utag)
if os.path.exists(tag_list_path):
    with open(tag_list_path) as tag_list_file:
        logging.info("Processing " + tag_list_path)
        for key in tag_list_file:
            # NOTE(review): chunk appears truncated after this decode — the
            # decoded key is not used within the visible source.
            key = key.strip().decode("utf-8")
def testPhrase2vec(self):
    """Exercise the phrase-level word2vec model on a two-word query."""
    here = os.path.dirname(__file__)
    model = Word2Vector.get_phrase_model(
        os.path.join(here, "./model/word2vec_phrase.word2vec.model"))
    neighbours = model.most_similar([u"历史悠久", u"法国"])
    # The third neighbour is expected to be the latin-script token below.
    self.assertEqual(neighbours[2][0], "halohalo")