def run_word_embeddings_model():
    """Evaluate the word-embedding model on the SAT question set.

    Loads pretrained vectors from './vectors.txt', scores each question's
    answer choices via ``find_best_answer`` using WMD, and prints the
    fraction answered correctly.
    """
    print('Loading Test Data...')
    # Alternate dataset (swap in to evaluate on MSR instead of SAT):
    # question, answer = get_test_data('dataset/MSR_Sentence_Completion_Challenge_V1/Data/')
    question, answer = get_test_data('dataset/SAT_Questions/')
    print('Loading Model Data...')
    m = Model('./vectors.txt')
    m.load()
    right = 0.0
    wrong = 0.0
    print('Predicting Answers... ')
    # Question keys are 1-based string indices ('1', '2', ...).
    for i in range(1, len(question) + 1):
        q_num = str(i)
        best = find_best_answer(question[q_num], m, wmd)
        if answer[q_num] in best:
            right += 1
        else:
            wrong += 1
    total = right + wrong
    # Guard against an empty question set: the original divided by
    # (right + wrong) unconditionally and would raise ZeroDivisionError.
    if total:
        print('The accuracy is ' + str(right / total))
    else:
        print('No questions found; accuracy undefined.')
def fill_in_choices(datafolder):
    """Expand every question into five candidate sentences, one per choice.

    For each question, substitutes each answer choice 'a'-'e' for the
    '_____' blank in the statement and normalizes punctuation.

    Args:
        datafolder: path to the directory holding the test data.

    Returns:
        Flat list of filled-in, punctuation-normalized sentences
        (5 per question, in question order).
    """
    question, answer = get_test_data(datafolder)
    n = len(question)
    new_sentences = []
    # range(1, n + 1): question keys are 1-based, so the original
    # range(1, n) silently dropped the last question.
    for i in range(1, n + 1):
        statement = question[str(i)]['statement']
        for choice in "abcde":
            filled = statement.replace('_____', question[str(i)][choice])
            # append, not extend: replace_punctuation_marks returns a string
            # (it is concatenated with strings elsewhere in this file), and
            # extend() would add it one character at a time.
            new_sentences.append(replace_punctuation_marks(filled))
    return new_sentences
def _raw_data(data_path=None, backwards=False):
    """Load training and test data and convert them to word-id form.

    Builds the vocabulary from the Penn Treebank training text, then maps
    both the training text and the SAT test questions to integer ids.

    NOTE(review): ``data_path`` is currently ignored — both directories are
    hard-coded below; kept in the signature for caller compatibility.

    Args:
        data_path: unused (see note above).
        backwards: passed through to _file_to_word_ids; presumably reverses
            word order — confirm against that helper.

    Returns:
        Tuple (word_to_id, train_data, test_sentences,
        test_data_in_list_of_lists, question, answer).
    """
    train_path = "dataset/treebank2/raw/wsj/"
    test_path = "dataset/SAT_Questions/"
    question, answer = get_test_data(test_path)
    word_to_id = _build_vocab(train_path)
    train_data, train_sentences, train_data_in_list_of_lists = _file_to_word_ids(
        train_path, word_to_id, True, backwards)
    test_data, test_sentences, test_data_in_list_of_lists = _file_to_word_ids(
        test_path, word_to_id, False, backwards)
    # Dead local removed: the original computed vocabulary = len(word_to_id)
    # but never returned or used it.
    return (word_to_id, train_data, test_sentences,
            test_data_in_list_of_lists, question, answer)
def _read_test_stop_at_blank(datafolder, backwards=False):
    """Build test sentences truncated at the blank, prefixed with all choices.

    Each question statement is split at its '_____' blank. The emitted
    sentence is the five answer choices (each followed by a space) plus
    either the text before the blank, or — when ``backwards`` — the
    word-reversed text after the blank.

    Args:
        datafolder: path to the directory holding the test data.
        backwards: use the (reversed) text after the blank instead of the
            text before it.

    Returns:
        List of one prepared sentence per question, in question order.
    """
    question, answer = get_test_data(datafolder)
    new_sentences = []
    # (Dead locals removed: the original built a 'sentences' list and an
    # unused length 'n'.)
    for i in range(1, len(question) + 1):
        q = question[str(i)]
        # Assumes each statement contains exactly one '_____' blank —
        # split() would raise otherwise. TODO confirm against the dataset.
        forward_sentence, backward_sentence = q['statement'].split('_____')
        # All five choices, each followed by a space (matches the original
        # "choice + ' '" concatenation, including the trailing space).
        word_choices = ''.join(q[choice] + " " for choice in "abcde")
        if backwards:
            tail = reverse_words_in_string(
                replace_punctuation_marks(backward_sentence))
        else:
            tail = replace_punctuation_marks(forward_sentence)
        new_sentences.append(word_choices + tail)
    return new_sentences
def _read_test(datafolder):
    """Return the raw statement text of every test question, in order."""
    question, _answer = get_test_data(datafolder)
    return [entry['statement'] for entry in question.values()]
import sys from NGram import get_test_data from operator import add test_data_location = 'dataset/MSR_Sentence_Completion_Challenge_V1/Data' forward_file = 'forward_out_MSR.txt' backward_file = 'forward_out_MSR.txt' output_file = 'bidirectional_out_MSR.txt' _, test_answer = get_test_data(test_data_location) probs = {} question_nums = [] with open(forward_file, 'r') as f: lines = f.readlines() for line in lines: values = line.split() # all data lines will have length 6 if len(values) == 6: question_num = str(int(values[0]) + 1) question_nums.append(question_num) probs[question_num] = [float(x) for x in values[1:]] with open(backward_file, 'r') as f: lines = f.readlines() for line in lines: values = line.split() if len(values) == 6: question_num = str(int(values[0]) + 1) for i in range(len(probs[question_num])):