Example #1
0
def process_question(question, cossims, model, words, vectors):
    '''
    Score a set of word-analogy questions against a word-vector model.

    Each line is expected to hold at least four whitespace-separated words
    "a b c d".  The query vector is norm(b) - norm(a) + norm(c); the question
    counts as correct when the nearest vocabulary word that is not one of
    a, b, c equals d.  Lines that are too short, or that contain a word
    outside the vocabulary, are counted in ``num_lines`` but not scored.

    :param question: iterable of text lines, one analogy question per line
    :param cossims: passed through unchanged to ``util.find_nearest_neighbor``
        as its ``cossims`` argument
    :param model: object providing ``get_word_vector(word)``
    :param words: sequence of vocabulary words; positions are the indices
        returned by ``util.find_nearest_neighbor``
    :param vectors: passed through unchanged to
        ``util.find_nearest_neighbor`` as the candidate vectors
    :return: tuple ``(correct, num_qs, num_lines)``: questions answered
        correctly, questions scored, and lines read
    '''
    # Build the word -> position table once so membership tests and index
    # lookups are O(1) instead of a linear scan of ``words`` per token.
    # setdefault keeps the first position, matching list.index().
    word_index = {}
    for position, word in enumerate(words):
        word_index.setdefault(word, position)

    correct = 0
    num_qs = 0
    num_lines = 0
    for line in question:
        num_lines += 1
        # We lowercase all words to correspond to the preprocessing
        # we applied to our data.
        qwords = [x.lower().strip() for x in line.split()]
        # Blank or truncated lines cannot form "a b c d"; skip them instead
        # of failing with an IndexError further down.
        if len(qwords) < 4:
            continue
        # If one of the words is not in the vocabulary we skip this question
        if any(w not in word_index for w in qwords):
            continue
        # The first three words form the query.
        # We retrieve their word vectors and normalize them.
        query = [model.get_word_vector(x) for x in qwords[:3]]
        query = [x / np.linalg.norm(x) for x in query]
        # Combine into the analogy vector: b - a + c
        query = query[1] - query[0] + query[2]
        # The query words themselves must not be returned as the answer.
        ban_set = [word_index[x] for x in qwords[:3]]
        nearest = util.find_nearest_neighbor(
            query, vectors, ban_set, cossims=cossims
        )
        if words[nearest] == qwords[3]:
            correct += 1
        num_qs += 1
    return correct, num_qs, num_lines
def build_glove_vec(glove, model, output_vec,
                    oov_vocab_path='../outdata/outvocab.txt'):
    '''
    Build GloVe vectors for the vocabulary of a fastText model.

    Every vocabulary word found in the GloVe file (after lowercasing the
    GloVe line) keeps its GloVe vector; when a token occurs more than once
    the first occurrence wins.  Every other vocabulary word borrows the
    GloVe vector of its nearest neighbour, measured on the normalized
    fastText vectors, among the words that do have a GloVe vector.

    :param glove: path to the downloaded pre-trained GloVe text file
    :param model: path to the binary file of the fastText-trained model
    :param output_vec: path of the text file to write; first line is
        "<word count> <dimension>", then one "<word> <values...>" line per
        vocabulary word
    :param oov_vocab_path: path of the file that lists the vocabulary words
        missing from GloVe, one per line
    :return: None
    :raises ValueError: if some words are missing from GloVe and no
        vocabulary word has a GloVe vector to borrow from
    '''
    from fastText import load_model
    from fastText import util
    fmodel = load_model(model)
    words_list = fmodel.get_words()
    # Word -> first position in the vocabulary; replaces repeated
    # list.index() scans and doubles as the membership set.
    word_index = {}
    for position, word in enumerate(words_list):
        word_index.setdefault(word, position)

    glove_vec = {}
    with open(glove, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().lower().split()
            # A blank line has no token to look up.
            if not fields:
                continue
            token = fields[0]
            # Keep only the first vector seen for each vocabulary word.
            if token in word_index and token not in glove_vec:
                glove_vec[token] = ' '.join(fields[1:])

    # Vocabulary words without a GloVe vector, in vocabulary order so the
    # output file is reproducible from run to run.
    missing = [w for w in word_index if w not in glove_vec]
    with open(oov_vocab_path, 'w', encoding='utf-8') as ff:
        ff.writelines(w + '\n' for w in missing)

    vectors = np.zeros((len(words_list), fmodel.get_dimension()), dtype=float)
    for i, word in enumerate(words_list):
        wv = fmodel.get_word_vector(word)
        norm = np.linalg.norm(wv)
        # Dividing a zero vector by its norm yields NaN, which would corrupt
        # every similarity ranking; leave such a vector as zeros instead.
        if norm > 0:
            wv = wv / norm
        vectors[i] = wv

    if missing:
        if not glove_vec:
            raise ValueError(
                'no vocabulary word has a GloVe vector; cannot assign '
                'vectors to the missing words')
        # Words without a GloVe vector must never be chosen as a neighbour.
        # A set keeps the membership test cheap when many words are missing.
        ban_set = {word_index[w] for w in missing}
        cossims = np.zeros(len(vectors), dtype=float)
        for w in missing:
            # The row in ``vectors`` is the normalized fastText vector of w.
            nearest = util.find_nearest_neighbor(
                query=vectors[word_index[w]], vectors=vectors,
                ban_set=ban_set, cossims=cossims)
            glove_vec[w] = glove_vec[words_list[nearest]]

    # Read the dimension from 'you' when available (as before); otherwise
    # fall back to any stored vector instead of raising KeyError.
    sample = glove_vec.get('you')
    if sample is None:
        sample = next(iter(glove_vec.values()), '')
    with open(output_vec, 'w', encoding='utf-8') as fw:
        fw.write('{} {}\n'.format(len(glove_vec), len(sample.split())))
        for w in words_list:
            fw.write(w + ' ' + glove_vec[w] + '\n')
Example #3
0
def process_question(question, cossims, model, words, vectors):
    '''
    Score a set of word-analogy questions against a word-vector model.

    Each line is expected to hold at least four whitespace-separated words
    "a b c d".  The query vector is norm(b) - norm(a) + norm(c); the question
    counts as correct when the nearest vocabulary word that is not one of
    a, b, c equals d.  Lines that are too short, or that contain a word
    outside the vocabulary, are counted in ``num_lines`` but not scored.

    :param question: iterable of text lines, one analogy question per line
    :param cossims: passed through unchanged to ``util.find_nearest_neighbor``
        as its ``cossims`` argument
    :param model: object providing ``get_word_vector(word)``
    :param words: sequence of vocabulary words; positions are the indices
        returned by ``util.find_nearest_neighbor``
    :param vectors: passed through unchanged to
        ``util.find_nearest_neighbor`` as the candidate vectors
    :return: tuple ``(correct, num_qs, num_lines)``: questions answered
        correctly, questions scored, and lines read
    '''
    # Build the word -> position table once so membership tests and index
    # lookups are O(1) instead of a linear scan of ``words`` per token.
    # setdefault keeps the first position, matching list.index().
    word_index = {}
    for position, word in enumerate(words):
        word_index.setdefault(word, position)

    correct = 0
    num_qs = 0
    num_lines = 0
    for line in question:
        num_lines += 1
        # We lowercase all words to correspond to the preprocessing
        # we applied to our data.
        qwords = [x.lower().strip() for x in line.split()]
        # Blank or truncated lines cannot form "a b c d"; skip them instead
        # of failing with an IndexError further down.
        if len(qwords) < 4:
            continue
        # If one of the words is not in the vocabulary we skip this question
        if any(w not in word_index for w in qwords):
            continue
        # The first three words form the query.
        # We retrieve their word vectors and normalize them.
        query = [model.get_word_vector(x) for x in qwords[:3]]
        query = [x / np.linalg.norm(x) for x in query]
        # Combine into the analogy vector: b - a + c
        query = query[1] - query[0] + query[2]
        # The query words themselves must not be returned as the answer.
        ban_set = [word_index[x] for x in qwords[:3]]
        nearest = util.find_nearest_neighbor(query,
                                             vectors,
                                             ban_set,
                                             cossims=cossims)
        if words[nearest] == qwords[3]:
            correct += 1
        num_qs += 1
    return correct, num_qs, num_lines