import numpy as np from gensim.models import Word2Vec from gensim.models.word2vec import KeyedVectors, LineSentence import itertools import sys sys.path.append('..') import config import pickle import os MAX_WORDS_IN_BATCH = 1000 logger = logging.Logger(name="word2vec", level=logging.INFO) logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s') logging.root.setLevel(level=logging.INFO) question_path = config.getPathConfig("question", "all_seg_question") model_path = config.getPathConfig("word2vec", "word2vec") + ".teem" emb_dim = config.getIntConfig("word2vec", "emb_dim") vocab_path = config.vocab_path + ".teem" emb_path = config.emb_path + ".teem" def train_word2vec(): '''训练词项向量 ''' model = Word2Vec(sentences=LineSentence(question_path), size=emb_dim, window=5, min_count=5,
import logging
import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import KeyedVectors, LineSentence
import itertools
import sys

# Make the parent directory importable so the project-level `config` resolves.
sys.path.append('..')
import config

MAX_WORDS_IN_BATCH = 1000

# Obtain the logger through logging.getLogger(): a Logger constructed directly
# with logging.Logger(...) is not attached to the logger hierarchy, so its
# records never reach the root handler installed by basicConfig() below and
# INFO messages are silently dropped.
logger = logging.getLogger("word2vec")
logger.setLevel(logging.INFO)
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
logging.root.setLevel(level=logging.INFO)

# Paths and hyper-parameters are read from the project configuration.
kb_path = config.getPathConfig("kb", "seg_kb")
question_path = config.getPathConfig("question", "all_seg_question")
model_path = config.getPathConfig("word2vec", "word2vec")
emb_dim = config.getIntConfig("word2vec", "emb_dim")


def any2unicode(text, encoding='utf8', errors='strict'):
    """Convert a string (bytestring in `encoding` or unicode), to unicode.

    Args:
        text: A `str`, returned unchanged, or a bytes-like object to decode.
        encoding: Codec used to decode a bytes-like `text`.
        errors: Error-handling scheme passed to the decoder.

    Returns:
        The text as a `str`.
    """
    if isinstance(text, str):
        return text
    return str(text, encoding, errors=errors)


# Alias kept so callers can use either name.
to_unicode = any2unicode


class MyLineSentence(object):
    """
    Simple format: one sentence = one line; words already preprocessed and
    separated by whitespace.
    """
from gensim.models.word2vec import KeyedVectors, LineSentence
import itertools
import sys

# Make the parent directory importable so the project-level `config` resolves.
sys.path.append('..')
import config
import pickle

MAX_WORDS_IN_BATCH = 1000

# Obtain the logger through logging.getLogger(): a Logger constructed directly
# with logging.Logger(...) is not attached to the logger hierarchy, so its
# records never reach the root handler installed by basicConfig() below and
# INFO messages are silently dropped.
logger = logging.getLogger("word2vec")
logger.setLevel(logging.INFO)
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
logging.root.setLevel(level=logging.INFO)

# Paths and hyper-parameters are read from the project configuration.
kb_path = config.kb_path
question_path = config.all_seg_question_path
model_path = config.getPathConfig("word2vec", "char2vec")
emb_dim = config.getIntConfig("word2vec", "emb_dim")
char_vocab_path = config.char_vocab_path
char_emb_path = config.char_embedding


def any2unicode(text, encoding='utf8', errors='strict'):
    """Convert a string (bytestring in `encoding` or unicode), to unicode.

    Args:
        text: A `str`, returned unchanged, or a bytes-like object to decode.
        encoding: Codec used to decode a bytes-like `text`.
        errors: Error-handling scheme passed to the decoder.

    Returns:
        The text as a `str`.
    """
    if isinstance(text, str):
        return text
    return str(text, encoding, errors=errors)


# Alias kept so callers can use either name.
to_unicode = any2unicode
#coding:utf-8
'''
Mean Reciprocal Rank: the average of the reciprocal ranks.
'''
import sys

# Make the parent directory importable so the project-level `config` resolves.
sys.path.append("..")
import config
from myutils.io import read_table, read_lines

# Gold-label files and prediction-score files for the train and test splits.
train_gold_path = config.getPathConfig("data", "train_data")
train_pre_path = config.train_score_path
test_gold_path = config.getPathConfig("data", "test_data")
test_pre_path = config.test_score_path

# A gold value must be strictly greater than this threshold to count as right.
delta = 2


def is_right(gold):
    """Return True when the numeric value of `gold` exceeds `delta`.

    `gold` is a string; surrounding whitespace is stripped before it is
    parsed as a float.
    """
    return float(gold.strip()) > delta


class Metrics(object):
    def __init__(self, gold_path, pre_path):
        """Store the gold and prediction paths and initialise empty state."""
        self.gold_path = gold_path
        self.pre_path = pre_path
        self.gold = None
        self.pre = None
        # Labels and predicted scores for each question.
        self.questions = {}