コード例 #1
0
def main():
    """Convert each pretrained GloVe text file (one per embedding
    dimensionality) into word2vec format."""
    for dimension in (50, 100, 200, 300):
        # Resolve the per-dimension source and destination paths from the
        # project's file registry; the registered paths contain a '{}'
        # placeholder for the dimensionality.
        src = data.FileFinder().get_file('GLOVE_TXT_FILE').format(dimension)
        dst = data.FileFinder().get_file('GLOVE_WORD2VEC_FILE').format(
            dimension)
        print("Converting {} to {}".format(src, dst))
        glove2word2vec(src, dst)
コード例 #2
0
def convert_large_gloves():
    """Convert the two large GloVe releases (42B and Twitter 27B) to
    word2vec format, writing the output next to the source files."""
    finder = data.FileFinder()
    base_dir = finder.data_dir
    for src_name in ('glove.42B.300d.txt', 'glove.twitter.27B.200d.txt'):
        # Build the output name by swapping the final '.txt' extension
        # for '.word2vec'.
        pieces = src_name.split('.')
        pieces[-1] = 'word2vec'
        dst_name = '.'.join(pieces)
        src_path = os.path.join(base_dir, src_name)
        dst_path = os.path.join(base_dir, dst_name)
        print("Converting {} to {}".format(src_path, dst_path))
        glove2word2vec(src_path, dst_path)
コード例 #3
0
def score_model(m):
    """Evaluate analogy model ``m`` on the SAT question set, printing a
    per-question report and a final accuracy line.

    ``m`` must provide ``score_answers(question, answers)`` returning one
    score per answer, and a ``dist_metric`` attribute equal to
    ``'euclidean'`` (lower is better) or ``'cosine'`` (higher is better).

    Questions that raise anywhere in the scoring path are reported as
    "Unknown words!" and counted as incorrect (they still increment the
    total).
    """
    sat = data.FileFinder().get_sat_data()
    n_correct = n_total = 0

    for question in sat:
        try:
            q = question['question']
            # Only take the words, not the POS
            a = [ans[0] for ans in question['answers']]
            dists = m.score_answers(q, a)
            # Sort answers by score purely for the printed report below.
            order = np.argsort(dists)
            sorted_dists = np.array(dists)[order]
            sorted_a = np.array(a)[order]

            print("------------")
            print("Question: {}".format(q))

            print("Sorted distances:")
            for dist, ans in zip(sorted_dists, sorted_a):
                print("Words: {}, score: {}".format(ans, dist))

            # Euclidean: smallest distance wins; cosine: largest score wins.
            if m.dist_metric == 'euclidean':
                best = np.argmin(dists)
            elif m.dist_metric == 'cosine':
                best = np.argmax(dists)
            # NOTE(review): if dist_metric is neither value, `best` is
            # unbound here and the resulting NameError is swallowed by the
            # except clause below — confirm this is intended.
            print('Best answer found: {}'.format(','.join(a[best])))
            print('Correct answer: {}'.format(question['correct'][0]))

            correct_letter = question['correct_letter']
            if score_correct(correct_letter, best):
                n_correct += 1
                print('Correct!')
            else:
                print('Incorrect :-(')

            n_total += 1
        except Exception as e:
            # Broad catch is deliberate: out-of-vocabulary words (and any
            # other scoring failure) count the question as incorrect.
            print(e)
            q = question['question']

            print("------------")
            print("Question: {}".format(q))
            for ans in question['answers']:
                print("Words: {}".format(ans))
            print("Unknown words!")
            n_total += 1

    # NOTE(review): raises ZeroDivisionError when `sat` is empty.
    print("Total accuracy: {}/{} == {}".format(n_correct, n_total,
                                               n_correct / n_total))
コード例 #4
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utils import data

# Dump each SAT question's POS tags plus the first token of its source
# string as rows of a small CSV file.
finder = data.FileFinder()
sat = finder.get_sat_data()

with open('pos_and_src.txt', 'w') as out:
    # Header row first; each data row is written on its own line below.
    out.write('POS1,POS2,Source')
    for question in sat:
        row = question['question_POS'] + question['source'].split()[:1]
        out.write('\n' + ','.join(row))
コード例 #5
0
import os
from utils import data
from subprocess import call

# Resolve the BATS analogy directory and the custom GloVe embedding file
# from the project's file registry.
directory = data.FileFinder().get_file('BATS_DIR')
embedding_file = data.FileFinder().get_file('CUSTOM_GLOVE')
print("Reading embedding file for vocabulary...", end='', flush=True)
# Read embedding file for vocabulary
vocab = []
embed_file = open(embedding_file, 'r')
for line in embed_file.readlines():
    # The first whitespace-separated token of each embedding line is the
    # word itself; the rest is the vector.
    vocab.append(line.strip().split()[0])
embed_file.close()
print("OK", flush=True)
print("Reading BATS files and merging...", end='', flush=True)
output_file = open(data.FileFinder().get_file('BATS_FULL_FILE'), 'w')
# Merge regular and irregular plural files
file1 = os.path.join(
    directory, "1_Inflectional_morphology/I02 [noun - plural_irreg].txt")
file2 = os.path.join(directory,
                     "1_Inflectional_morphology/I01 [noun - plural_reg].txt")
# Combined output goes into a new file alongside the two inputs.
merged = open(
    os.path.join(directory,
                 "1_Inflectional_morphology/I01 I02 [noun - plurals].txt"),
    'w')
file1r = open(file1, 'r')
file2r = open(file2, 'r')
# Copy the irregular-plural file verbatim into the merged file first.
for line in file1r.readlines():
    merged.write(line)
file1r.close()
for line in file2r.readlines():
コード例 #6
0
from utils import nn_analogy_model, data

# Load the SAT analogy questions and build the neural analogy model on top
# of 300-dimensional GloVe embeddings.
sat_data = data.FileFinder().get_sat_data()
model = nn_analogy_model.nn_analogy_model(
    embed_file="/home/fcampos/w266_final/data/glove.6B.300d.txt")
# Learning rate 0.001, four hidden layers, dropout enabled.
model.buildModel(0.001, [4096, 2048, 1024, 512], use_dropout=True)
score = 0
questions = 0
oov_questions = 0
# Map answer index to SAT letter choices.
index_to_letter = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'}
print("Running questions...", flush=True)
for question in sat_data:
    questions += 1
    alternatives = []
    d = []
    for answer in question['answers']:
        # Each alternative is (question word 1, question word 2, answer
        # word 1); `d` holds the expected second answer word.
        # NOTE(review): assumes answer[0] is a (word1, word2) pair —
        # confirm against the SAT data loader.
        alternatives.append(
            [question['question'][0], question['question'][1], answer[0][0]])
        d.append(answer[0][1])
    # predict() returns predicted words, out-of-vocabulary indices, and
    # per-alternative scores for the saved model checkpoint.
    results, oov, scores = model.predict(alternatives,
                                         "/home/fcampos/w266_model",
                                         return_scores=True)
    # Keep the indices whose predicted word matches the expected word.
    choice_index = []
    for i in range(len(question['answers'])):
        if i not in oov:
            if results[i] == d[i]:
                choice_index.append(i)
    # All five alternatives OOV and nothing matched: report and move on.
    if len(choice_index) == 0 and len(oov) == 5:
        print(
            "Question %d: Out-of-vocabulary words in question or all alternatives"
            % questions)
コード例 #7
0
def score_elmo_model():
    """Score the SAT analogy questions with ELMo embeddings.

    Tries every combination of layer choice (each of the three ELMo
    layers individually, or all of them) and embedding style:

    - 'pairs': embed the two-word question/answer as one sentence and
      take the difference of the two token embeddings;
    - 'single': embed each word as its own one-word sentence and take
      the difference of the results.

    Per-question 0/1 accuracies are accumulated into a DataFrame and
    written to elmo_accuracies.csv.
    """

    ee = ElmoEmbedder()

    sat = list(data.FileFinder().get_sat_data())

    # One row per question; accuracy columns are added per configuration.
    accuracies = pd.DataFrame(sat)

    toplayers = 3
    for chooselayer in (0, 1, 2, None):
        for style in ('pairs', 'single'):
            print('{} -- {}'.format(chooselayer, style))
            acc_array = []

            for question in tqdm(sat):
                q = question['question']
                a = [ans[0] for ans in question['answers']]
                n_answers = len(a)

                if style == "pairs":
                    # Embed the word pair as one sentence; difference of
                    # the two token embeddings, per layer.
                    # assumes embed_sentence returns (layers, tokens, dim)
                    # — TODO confirm against ElmoEmbedder docs.
                    q_embed = ee.embed_sentence(q)
                    q_layers = q_embed[:, 0] - q_embed[:, 1]

                    # Only take the words, not the POS
                    a_layers = np.array(
                        [e[:, 0] - e[:, 1] for e in ee.embed_sentences(a)])

                elif style == "single":
                    # Embed each question word alone, then reshape to
                    # (layers, words, dim). Hard-codes a 2-word question
                    # and 1024-dim, 3-layer ELMo output.
                    q_embed = np.array(
                        list(ee.embed_sentences([[w] for w in q])))
                    q_embed = np.array(q_embed).reshape(2, 3, 1024).transpose(
                        (1, 0, 2))
                    q_layers = q_embed[:, 0] - q_embed[:, 1]

                    # Same per-word treatment for the answer pairs; the
                    # reshape hard-codes 5 answers per question.
                    word1 = np.array(
                        list(ee.embed_sentences([[w[0]] for w in a])))
                    word2 = np.array(
                        list(ee.embed_sentences([[w[1]] for w in a])))
                    a_embed = word1 - word2
                    a_layers = a_embed.reshape((5, 3, 1024))

                # So that the first dimensions in both q and a is layers
                a_layers = a_layers.transpose(1, 0, 2)

                # If we just want one layer
                if chooselayer in (0, 1, 2):
                    q_layers = q_layers[chooselayer].reshape(1, 1024)
                    a_layers = a_layers[chooselayer].reshape(1, -1, 1024)

                # Take top N layers
                else:
                    q_layers = q_layers[-toplayers:]
                    a_layers = a_layers[-toplayers:]

                # Cosine distance between the question offset and every
                # answer offset, layer by layer (flattened layer-major).
                dists = []
                for i, (ql, als) in enumerate(zip(q_layers, a_layers)):
                    for al in als:
                        dists.append(cosine(ql, al))

                # NOTE(review): sorted_dists/sorted_a are computed but
                # never used; np.tile hard-codes 3 layers here even when
                # a single layer was selected above — confirm dead code.
                order = np.argsort(dists)
                sorted_dists = np.array(dists)[order]
                sorted_a = np.tile(np.array(a), (3, 1))[order]

                # Smallest cosine distance wins; modulo maps the flat
                # (layer, answer) index back to an answer index.
                best = np.argmin(dists) % n_answers

                correct_letter = question['correct_letter']
                if score_correct(correct_letter, best):
                    acc_array.append(1)
                else:
                    acc_array.append(0)

                # NOTE(review): rebinding the loop variable inside the
                # question loop — after the first question of the None
                # pass, chooselayer stays 'all' (used only for the column
                # label and the membership test above); confirm intended.
                if chooselayer is None:
                    chooselayer = 'all'

                # Column is rewritten after every question; only the final
                # assignment per configuration matters.
                accuracies['style-{}_layer-{}'.format(
                    style, chooselayer)] = pd.Series(acc_array)

        accuracies.to_csv("elmo_accuracies.csv")