def load_data(coref, dataset='mc500'):
    """Load the train/dev/test story splits for an MCTest dataset.

    Args:
        coref: If true, load pre-computed coreference-resolved stories
            from the pickled ``.coref`` files; otherwise parse the raw
            ``.tsv``/``.ans`` files with load_stories().
        dataset: Dataset file prefix, e.g. 'mc500' (default) or 'mc160'.
            Added as a backward-compatible generalization of the
            previously hard-coded 'mc500'.

    Returns:
        Tuple (train_stories, dev_stories, test_stories); each list is
        filtered through clean().
    """
    # `path` is a module-level constant pointing at the data root.
    data_path = os.path.join(path, 'MCTest')
    splits = []
    for split in ('train', 'dev', 'test'):
        if coref:
            # These pickles are produced by process() — see the
            # `path + '.coref'` dump there.
            coref_file = os.path.join(data_path, '%s.%s.coref' % (dataset, split))
            with open(coref_file, 'rb') as f:
                splits.append(clean(pickle.load(f)))
        else:
            splits.append(clean(load_stories(
                os.path.join(data_path, '%s.%s.tsv' % (dataset, split)),
                os.path.join(data_path, '%s.%s.ans' % (dataset, split)))))
    return tuple(splits)
def load_data(coref):
    """Return the (train, dev, test) MC160 story splits, each cleaned.

    When `coref` is true the splits come from pickled coreference-resolved
    files; otherwise they are parsed from the raw .tsv/.ans pairs.
    """
    data_path = os.path.join(path, "MCTest")
    results = []
    for split in ("train", "dev", "test"):
        if coref:
            pickle_path = os.path.join(data_path, "mc160.%s.coref" % split)
            with open(pickle_path, "rb") as f:
                stories = pickle.load(f)
            results.append(clean(stories))
        else:
            tsv_path = os.path.join(data_path, "mc160.%s.tsv" % split)
            ans_path = os.path.join(data_path, "mc160.%s.ans" % split)
            results.append(clean(load_stories(tsv_path, ans_path)))
    train_stories, dev_stories, test_stories = results
    return train_stories, dev_stories, test_stories
def process(path):
    """Run Stanford CoreNLP (through dcoref) over every story and pickle results.

    Loads stories from ``path + '.tsv'`` / ``path + '.ans'``, writes each
    story's text to a temp file, invokes CoreNLP once over the whole file
    list, parses the ``.txt.out`` annotations back into Story objects, and
    dumps the processed list to ``path + '.coref'`` (pickle protocol 2).

    Raises:
        subprocess.CalledProcessError: if the CoreNLP invocation fails.
    """
    stories = load_stories(path + '.tsv', path + '.ans')
    processed = []
    filelist = []
    # Create the temp dir BEFORE entering the try: previously mkdtemp()
    # was inside it, so a failure there made the finally clause raise a
    # NameError on `tmpdir` instead of the real error.
    tmpdir = mkdtemp()
    try:
        for i, story in enumerate(stories):
            infile = os.path.join(tmpdir, str(i) + '.txt')
            with open(infile, 'w') as f:
                f.write(story.text)
            filelist.append(infile)
        filelist_filename = os.path.join(tmpdir, 'filelist')
        with open(filelist_filename, 'w') as f:
            for name in filelist:
                f.write(name)
                f.write(os.linesep)
        # Direct CoreNLP output into tmpdir so the .txt.out files are
        # removed along with the temp directory; previously they were
        # written to (and read from) the cwd and never cleaned up.
        subprocess.check_call([
            'java', '-cp', '*', '-Xmx2g',
            'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize,ssplit,pos,lemma,ner,parse,dcoref',
            '-filelist', filelist_filename,
            '-outputDirectory', tmpdir])
        for i, story in enumerate(stories):
            outfile = os.path.join(tmpdir, str(i) + '.txt.out')
            with open(outfile) as f:
                # Keep only non-empty lines, stripped of whitespace.
                lines = [l.strip() for l in f]
            lines = [l for l in lines if l]
            processed.append(Story(parse(story.text, lines), story.queries))
    finally:
        shutil.rmtree(tmpdir)
    with open(path + '.coref', 'wb') as f:
        # Protocol 2 keeps the pickles loadable from Python 2.
        pickle.dump(processed, f, 2)
""" Runs a baseline test on a weakly supervised memory network. """ def clean(stories): return [s for s in stories if len(s.queries) > 1] def compute_accuracy(stories, model): accuracy = 0 for story in stories: score = model.predict_answer(story) accuracy += score return float(accuracy) / float(len(stories)) train_stories = clean(load_stories('MCTest/mc160.train.tsv','MCTest/mc160.train.ans')) dev_stories = clean(load_stories('MCTest/mc160.dev.tsv','MCTest/mc160.dev.ans')) test_stories = clean(load_stories('MCTest/mc160.test.tsv','MCTest/mc160.test.ans')) all_stories = train_stories + test_stories + dev_stories # initialize with all stories to get full vocab model = WeakMemoryNetwork(300, 256, all_stories, timetags=True, word2vec=False) print 'Training Accuracy prior to training: ', compute_accuracy(train_stories, model) print 'Testing Accuracy prior to training: ', compute_accuracy(test_stories, model) # Train for a certain number of epochs count = 0 for i in range(30): for story in train_stories + dev_stories: