예제 #1
0
def run_word_embeddings_model():
    """Score the word-embedding model on the SAT questions and print accuracy.

    Loads the questions and answer key with ``get_test_data``, loads the
    vectors from ``./vectors.txt`` into a ``Model``, and calls
    ``find_best_answer`` (with ``wmd``) once per question. A question counts
    as correct when its key answer is contained in the value returned by
    ``find_best_answer``. Progress messages and the final accuracy are
    printed; nothing is returned.

    Questions are looked up by the 1-based string keys '1' .. str(len(question)).
    If no questions are loaded, a notice is printed instead of dividing by
    zero.
    """
    print('Loading Test Data...')

    # Alternative data set:
    # 'dataset/MSR_Sentence_Completion_Challenge_V1/Data/'
    question, answer = get_test_data('dataset/SAT_Questions/')

    print('Loading Model Data...')

    m = Model('./vectors.txt')
    m.load()

    print('Predicting Answers... ')

    total = len(question)
    if total == 0:
        # Nothing to score; the accuracy ratio below would divide by zero.
        print('No questions found; accuracy is undefined.')
        return

    # Kept as a float so the ratio is a true division on any Python version.
    right = 0.0
    for i in range(1, total + 1):
        q_num = str(i)
        best = find_best_answer(question[q_num], m, wmd)
        if answer[q_num] in best:
            right += 1

    print('The accuracy is ' + str(right / total))
예제 #2
0
def fill_in_choices(datafolder):
    """Expand every question into its five candidate sentences.

    For each question (keyed '1' .. str(len(question))) and each choice
    'a'..'e', the '_____' marker in the question's 'statement' is replaced by
    the choice word and the result is passed through
    ``replace_punctuation_marks``.

    Args:
        datafolder: location handed to ``get_test_data``.

    Returns:
        A flat list built by extending with each ``replace_punctuation_marks``
        result, in question order and then choice order.
        NOTE(review): ``extend`` assumes that helper returns a list of
        sentences; if it returns a plain string, individual characters would
        be added instead -- confirm against the helper.
    """
    question, _ = get_test_data(datafolder)

    new_sentences = []
    # Keys are 1-based, so the upper bound must be len(question) + 1;
    # stopping at len(question) would silently drop the last question.
    for i in range(1, len(question) + 1):
        entry = question[str(i)]
        statement = entry['statement']
        for choice in "abcde":
            sentence = statement.replace('_____', entry[choice])
            new_sentences.extend(replace_punctuation_marks(sentence))
    return new_sentences
예제 #3
0
def _raw_data(data_path=None, backwards=False):
    """Load the training corpus and the SAT test questions.

    Builds the vocabulary from the training directory, converts both the
    training and the test text to word ids via ``_file_to_word_ids``, and
    loads the test questions with their answer key via ``get_test_data``.

    Args:
        data_path: currently ignored; the training and test locations are
            hard-coded below. Kept so existing callers keep working.
        backwards: forwarded unchanged to ``_file_to_word_ids`` for both the
            training and the test data.

    Returns:
        tuple (word_to_id, train_data, test_sentences,
        test_data_in_list_of_lists, question, answer)
    """
    train_path = "dataset/treebank2/raw/wsj/"
    test_path = "dataset/SAT_Questions/"
    question, answer = get_test_data(test_path)

    word_to_id = _build_vocab(train_path)
    # Only the first element of the training result is returned to the caller.
    train_data, _, _ = _file_to_word_ids(
        train_path, word_to_id, True, backwards)
    # The flat test id sequence is not part of the return value.
    _, test_sentences, test_data_in_list_of_lists = _file_to_word_ids(
        test_path, word_to_id, False, backwards)
    return (word_to_id, train_data, test_sentences,
            test_data_in_list_of_lists, question, answer)
예제 #4
0
def _read_test_stop_at_blank(datafolder, backwards=False):
    """Build one context string per question, cut at the blank.

    Each question's 'statement' is split at the '_____' marker. The returned
    string for a question is the five choices 'a'..'e' (each followed by a
    space) followed by:

    * ``backwards=False``: the text before the blank, passed through
      ``replace_punctuation_marks``;
    * ``backwards=True``: the text after the blank, passed through
      ``replace_punctuation_marks`` and then ``reverse_words_in_string``.

    Args:
        datafolder: location handed to ``get_test_data``.
        backwards: selects which side of the blank is used (see above).

    Returns:
        list with one string per question, ordered by question number.

    Raises:
        ValueError: if a statement does not contain exactly one '_____'
            marker (the two-way unpacking of the split fails).
    """
    question, _ = get_test_data(datafolder)

    new_sentences = []
    for i in range(1, len(question) + 1):
        entry = question[str(i)]
        forward_sentence, backward_sentence = entry['statement'].split('_____')

        # Every choice keeps its trailing space, including the last one.
        word_choices = "".join(entry[choice] + " " for choice in "abcde")

        if backwards:
            context = reverse_words_in_string(
                replace_punctuation_marks(backward_sentence))
        else:
            context = replace_punctuation_marks(forward_sentence)
        new_sentences.append(word_choices + context)
    return new_sentences
예제 #5
0
def _read_test(datafolder):
    """Return the 'statement' text of every question loaded from ``datafolder``.

    The statements are collected in the iteration order of the question
    mapping returned by ``get_test_data``; the answer key is discarded.
    """
    question, _answer = get_test_data(datafolder)
    statements = []
    for key in question:
        statements.append(question[key]['statement'])
    return statements
import sys
from NGram import get_test_data
from operator import add

# Test-set location plus the result files read and written by this script.
test_data_location = 'dataset/MSR_Sentence_Completion_Challenge_V1/Data'
forward_file = 'forward_out_MSR.txt'
# Previously this pointed at the forward file as well, so the "backward" pass
# re-read the forward results. NOTE(review): the name below assumes the
# backward run follows the same naming scheme -- confirm the actual file name.
backward_file = 'backward_out_MSR.txt'
output_file = 'bidirectional_out_MSR.txt'

# Only the answer key is kept; the questions themselves are discarded.
_, test_answer = get_test_data(test_data_location)

# probs maps a 1-based question number (as a string) to the five float
# values found on that question's line; question_nums records the order in
# which questions appear in the forward file.
probs = {}
question_nums = []

with open(forward_file, 'r') as f:
    for line in f:
        values = line.split()
        # Data lines have exactly 6 whitespace-separated fields; skip the rest.
        if len(values) != 6:
            continue
        # The first field is a 0-based index; shift it to the 1-based key.
        question_num = str(int(values[0]) + 1)
        question_nums.append(question_num)
        probs[question_num] = [float(field) for field in values[1:]]
        
with open(backward_file, 'r') as f:
    lines = f.readlines()
    for line in lines:
        values = line.split()
        if len(values) == 6:
            question_num = str(int(values[0]) + 1)
            for i in range(len(probs[question_num])):