import numpy as np

from fastText import util


def process_question(question, cossims, model, words, vectors):
    correct = 0
    num_qs = 0
    num_lines = 0
    for line in question:
        num_lines += 1
        qwords = line.split()
        # We lowercase all words to correspond to the preprocessing
        # we applied to our data.
        qwords = [x.lower().strip() for x in qwords]
        # If one of the words is not in the vocabulary, we skip this question.
        found = True
        for w in qwords:
            if w not in words:
                found = False
                break
        if not found:
            continue
        # The first three words form the query.
        # We retrieve their word vectors and normalize them.
        query = qwords[:3]
        query = [model.get_word_vector(x) for x in query]
        query = [x / np.linalg.norm(x) for x in query]
        # Get the query vector. Example:
        # Germany - Berlin + France
        query = query[1] - query[0] + query[2]
        # We don't need to rank all the words, only until we find
        # the first word not equal to our set of query words.
        ban_set = list(map(lambda x: words.index(x), qwords[:3]))
        if words[util.find_nearest_neighbor(
                query, vectors, ban_set, cossims=cossims)] == qwords[3]:
            correct += 1
        num_qs += 1
    return correct, num_qs, num_lines
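For context, here is a minimal sketch of how process_question might be driven. The model path ('model.bin') and the analogy file ('questions-words.txt', in the word2vec analogy format with one "A B C D" question per line) are placeholder assumptions. The vocabulary vectors are pre-normalized once, since process_question ranks neighbors against a matrix of unit-length vectors.

# A minimal sketch, assuming a trained fastText model at 'model.bin' and
# an analogy file 'questions-words.txt' (both paths are placeholders).
import numpy as np
from fastText import load_model

model = load_model('model.bin')  # hypothetical model path
words = model.get_words()

# Pre-normalize every vocabulary vector once, so each query inside
# util.find_nearest_neighbor reduces to one matrix-vector product.
vectors = np.zeros((len(words), model.get_dimension()), dtype=float)
for i, w in enumerate(words):
    wv = model.get_word_vector(w)
    vectors[i] = wv / np.linalg.norm(wv)

# Reusable buffer for the cosine similarities computed per query.
cossims = np.zeros(len(words), dtype=float)

with open('questions-words.txt', 'r', encoding='utf-8') as f:
    correct, num_qs, num_lines = process_question(
        f, cossims, model, words, vectors)
print('accuracy: {:.2%} ({} of {} questions, {} lines read)'.format(
    correct / max(num_qs, 1), correct, num_qs, num_lines))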
def build_glove_vec(glove, model, output_vec):
    '''
    Build GloVe word vectors for the corpus vocabulary.
    :param glove: path to the downloaded pre-trained GloVe vectors
    :param model: binary file of the fastText-trained word vectors
    :param output_vec: output word vectors for the competition corpus
    :return:
    '''
    from fastText import load_model
    from fastText import util
    fmodel = load_model(model)
    words_list = fmodel.get_words()
    words = set(words_list)
    glove_vec = {}
    # Collect the GloVe vector for every vocabulary word that GloVe covers;
    # whatever remains in `words` afterwards is out-of-GloVe vocabulary.
    with open(glove, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().lower().split()
            if line[0] in words:
                glove_vec[line[0]] = ' '.join(line[1:])
                words.remove(line[0])
    vectors = np.zeros((len(words_list), fmodel.get_dimension()), dtype=float)
    with open('../outdata/outvocab.txt', 'w', encoding='utf-8') as ff:
        for w in words:
            ff.write(w + '\n')
    # Normalized fastText vectors for the whole vocabulary.
    for i in range(len(words_list)):
        wv = fmodel.get_word_vector(words_list[i])
        wv = wv / np.linalg.norm(wv)
        vectors[i] = wv
    # Ban the out-of-GloVe words so a missing word is never matched to
    # another word that also lacks a GloVe vector.
    banset = list(map(lambda x: words_list.index(x), words))
    cossims = np.zeros(len(vectors), dtype=float)
    # For each missing word, borrow the GloVe vector of its nearest
    # fastText neighbor that does have one.
    for w in words:
        query = fmodel.get_word_vector(w)
        query = query / np.linalg.norm(query)
        glove_vec[w] = glove_vec[words_list[util.find_nearest_neighbor(
            query=query, vectors=vectors, ban_set=banset, cossims=cossims)]]
    with open(output_vec, 'w', encoding='utf-8') as fw:
        # Header line: vocabulary size and vector dimension (word2vec format).
        # Take the dimension from any stored vector rather than assuming a
        # specific word such as 'you' is in the vocabulary.
        dim = len(next(iter(glove_vec.values())).split())
        fw.write('{} {}\n'.format(len(glove_vec), dim))
        for w in words_list:
            fw.write(w + ' ' + glove_vec[w] + '\n')
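A hypothetical invocation of build_glove_vec follows; all three paths are placeholders for the downloaded GloVe file, the trained fastText binary, and the merged output. A quick read of the first output line checks that the word2vec-style header was written.

# Placeholder paths; substitute your own files.
import numpy as np  # build_glove_vec references np at module scope

build_glove_vec(
    glove='glove.840B.300d.txt',   # assumed: downloaded pre-trained GloVe file
    model='fasttext_model.bin',    # assumed: binary model from fastText training
    output_vec='corpus_glove.vec'  # assumed: merged output in word2vec format
)

# Sanity check: the first line should read '<vocab size> <dimension>'.
with open('corpus_glove.vec', 'r', encoding='utf-8') as f:
    print(f.readline().strip())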