from _collections import defaultdict from pynlpini import Word2Vector from pipe import select, sort, as_list import os import logging logging.basicConfig( format= '%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s', level=logging.INFO) labelled_word_path = "../../../data/app/word_correlate_tagger/labelled_word.csv" tag_list_path = "../../../data/app/word_correlate_tagger/tag.csv" unlabelled_word_path = "../../../data/app/word_correlate_tagger/unlabelled_word.csv" result_path = "../../../data/app/word_correlate_tagger/tag_result.csv" model = Word2Vector.get_phrase_model() vocabs = set(model.vocab.keys()) ktag_to_utags = defaultdict(set) if os.path.exists(labelled_word_path): with open(labelled_word_path) as labelled_tag_file: logging.info("Processing " + labelled_word_path) for line in labelled_tag_file: line = line.strip().decode("utf-8") if len(line.split("\t")) == 2: utag = line.split("\t")[0] ktag = line.split("\t")[1] if utag in vocabs: ktag_to_utags[ktag].add(utag) if os.path.exists(tag_list_path): with open(tag_list_path) as tag_list_file:
# Word-correlate tagger, script prologue: configures logging, loads the phrase
# model, and groups the already-labelled words by their tag.
#
# NOTE(review): this fragment looks truncated -- `key` is assigned but not used
# yet, and `unlabelled_word_path`, `result_path` and the `pipe` helpers are not
# referenced here. Presumably the rest of the script follows; every
# module-level name is therefore kept as-is.
import logging
import os
# `defaultdict` comes from the public `collections` module rather than the
# private `_collections` accelerator module.
from collections import defaultdict

from pipe import select, sort, as_list
from pynlpini import Word2Vector

logging.basicConfig(
    format='%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s',
    level=logging.INFO)

# Input/output locations, relative to the current working directory.
labelled_word_path = "../../../data/app/word_correlate_tagger/labelled_word.csv"
tag_list_path = "../../../data/app/word_correlate_tagger/tag.csv"
unlabelled_word_path = "../../../data/app/word_correlate_tagger/unlabelled_word.csv"
result_path = "../../../data/app/word_correlate_tagger/tag_result.csv"

model = Word2Vector.get_phrase_model()
# Snapshot of the model vocabulary as a set, used for membership tests below.
vocabs = set(model.vocab.keys())

# Maps the second column of a labelled line (ktag) to the set of first-column
# values (utag) that are present in the model vocabulary.
ktag_to_utags = defaultdict(set)

# A missing labelled-word file is skipped silently.
if os.path.exists(labelled_word_path):
    with open(labelled_word_path) as labelled_tag_file:
        logging.info("Processing %s", labelled_word_path)
        for line in labelled_tag_file:
            # NOTE(review): `.decode` on a line from a text-mode file assumes
            # byte-string lines (Python 2 `str`) -- confirm target interpreter.
            line = line.strip().decode("utf-8")
            # Split once; only lines with exactly two tab-separated fields
            # are considered.
            fields = line.split("\t")
            if len(fields) == 2:
                utag, ktag = fields
                if utag in vocabs:
                    ktag_to_utags[ktag].add(utag)

# A missing tag-list file is skipped silently as well.
if os.path.exists(tag_list_path):
    with open(tag_list_path) as tag_list_file:
        logging.info("Processing %s", tag_list_path)
        for key in tag_list_file:
            key = key.strip().decode("utf-8")
def phrase2vec(txt, topn):
    """Return the model's `most_similar` result for the words in `txt` as JSON.

    The phrase model is loaded on first use and kept in the module-level
    `phrase2vector` variable, which must already exist (holding `None`)
    before the first call.

    :param txt: text that is split on whitespace into the query words.
    :param topn: forwarded to `most_similar` as its `topn` keyword argument.
    :return: JSON string of whatever `most_similar` returns, serialised with
        `ensure_ascii=False` so non-ASCII characters are not escaped.
    """
    global phrase2vector

    # Load the model lazily, only when no earlier call has done so.
    if phrase2vector is None:
        phrase2vector = Word2Vector.get_phrase_model()

    query_words = txt.split()
    neighbours = phrase2vector.most_similar(query_words, topn=topn)
    return json.dumps(neighbours, ensure_ascii=False)
def testPhrase2vec(self):
    """Load the phrase model stored next to this file and check one result.

    Queries `most_similar` with two phrases and asserts on the first element
    of the third returned entry.
    """
    # The model file is resolved relative to this source file's directory.
    model_path = os.path.join(
        os.path.dirname(__file__),
        "./model/word2vec_phrase.word2vec.model")
    phrase_model = Word2Vector.get_phrase_model(model_path)

    neighbours = phrase_model.most_similar([u"历史悠久", u"法国"])

    # NOTE(review): "halohalo" looks like a placeholder expectation -- confirm
    # it is the value the bundled model really yields at this position.
    third_entry_head = neighbours[2][0]
    self.assertEqual(third_entry_head, "halohalo")