def __init__(self, start, end, dataType, N):
    """Set up a test run over a slice of the validation set.

    Args:
        start: position of the first question to keep (inclusive).
        end: position at which to stop (exclusive).
        dataType: stored unchanged as ``self.dataType``.
        N: stored unchanged as ``self.N``.
    """
    self.LETTERS = ['A', 'B', 'C', 'D']
    self.fullTest = self.validationSet()
    self.dataType = dataType
    # Keep only the questions whose position falls in [start, end).
    self.test = [q for i, q in enumerate(self.fullTest) if start <= i < end]

    # Counters and per-question reports start out empty.
    self.correct = 0
    self.incorrect = 0
    self.answerReport = []
    self.searchAnswerReport = []
    self.timeReport = []
    self.N = N

    # instantiate mindmaps: reuse the cached copy when one exists, otherwise
    # start empty. The loaded value must be assigned, or self.mindmaps is
    # left undefined whenever the cache file is present.
    mindmapPath = cache + 'mindmaps.p'
    if os.path.isfile(mindmapPath):
        self.mindmaps = utils.loadData(mindmapPath)
    else:
        self.mindmaps = {}
def getSearchFromFile():
    '''Return every snippet from the local copy of the search results, joined into one string.'''
    # The cached payload is a JSON document; its 'items' entries each carry a 'snippet'.
    rawResults = utils.loadData(cache + 'searchResults.p')
    parsed = json.loads(rawResults)
    return ''.join(hit['snippet'] for hit in parsed['items'])
import searchText as scraper
import util
import QAUtils as utils
from Models import Test
import pickle
import os
import time

cache = '../Dropbox/ScienceQASharedCache/'

# Get local copy of freebase; fall back to an empty mapping when no cache file exists
freebaseRelations = (
    utils.loadData(cache + 'FB_relations.p')
    if os.path.isfile(cache + 'FB_relations.p')
    else {}
)

# Setup for worker pool
poolWorkerNum = 200
poolIterations = 2
poolRedundancies = False

# Get all keywords
eightGradeExam = Test(start=0, end=8132, dataType='val', N=6)
keywords = eightGradeExam.getSecondOrderKeywords()

# save second order keywords
utils.saveData(keywords, cache + 'SecondOrderKeywords.p')
print('Keywords saved.')

# Filter keywords already in local freebaseRelations
# NOTE(review): the message below says "first order" although the list comes
# from getSecondOrderKeywords() -- confirm which wording is intended.
keywords = list(filter(lambda kw: kw not in freebaseRelations, keywords))
print('Number of first order keywords left: {}'.format(len(keywords)))
# - spacy word2vec cosine distance between question and answer (own and average of four)
# - spacy word2vec cosine distance between answer option and other options (own and average of four)
print('- Basic formatting')
trainX = extractor.basicFormatFeatures(trainPairedQA)
valX = extractor.basicFormatFeatures(valPairedQA)
print(trainX.shape)

# Feature measuring proximity of a given Q-A pair to authoritative texts
# - Q-A combined into a single statement then search carried out to see distance to closest sentence in text
# - Authoritative text from wikipedia and CK12 free online textbooks for elementary school children
# - Two measures given--one requiring relatively strict matches, one allowing loose matches
# - return both absolute value as well as average of other 3 answers
print('- Text match features')
# Take the text-match features from the cache when available, otherwise
# compute them. They are kept in their own variable so the basic features
# already held in trainX are not overwritten, and the cache file is only
# read when it actually exists.
if os.path.isfile(cache + 'trainX'):
    train_textMatch = utils.loadData(cache + 'trainX')
else:
    train_textMatch = extractor.getTextMatchFeatures(trainPairedQA, kList=[100, 10, 100, 1000, 3])
print(train_textMatch.shape)
# Append the text-match columns to the basic features.
trainX = extractor.concat(trainX, train_textMatch)

# NOTE(review): the validation-set counterpart is disabled; left as found.
# if os.path.isfile(cache + 'valX'):
#     valX = extractor.concat(valX, utils.loadData(cache + 'valX'))
# else:
#     valX = extractor.getTextMatchFeatures(valPairedQA, kList=[100, 10, 100, 1000, 3])
# print(trainX.shape)

# Features from the keyword graph from the Aristo paper
# - size of question graph, size of answer graph, coherence score of answers, coherence score
#   of question keywords, number of pruned words for each Q-A pair
print('- Keyword graph features')
import os, sys
import QAUtils as utils
from whoosh.fields import *
from whoosh.index import *
from whoosh.query import *
from whoosh.qparser import QueryParser

# 0. Set global parameters
cache = '../Dropbox/ScienceQASharedCache/'

# 1. Get corpus (only the first 100 entries are used)
corpus = utils.loadData(cache + 'allTextLines')[:100]

# 2. Index using whoosh
schema = Schema(content=TEXT, stored_content=TEXT(stored=True))
# Make sure the index directory exists before creating the index in it.
# NOTE(review): the original indentation was lost; create_in is assumed to run
# unconditionally, as in the Whoosh quickstart -- confirm.
if not os.path.exists(cache + 'IRindex'):
    os.mkdir(cache + 'IRindex')
ix = create_in(cache + 'IRindex', schema)
ix = open_dir(cache + 'IRindex')

# Add every corpus line as one document, reporting progress on a single console line.
writer = ix.writer()
total = len(corpus)
for lineNo, text in enumerate(corpus, 1):
    progress = '\rAdding line {} of {} to index'.format(lineNo, total)
    sys.stdout.write(progress)
    sys.stdout.flush()
    writer.add_document(content=text, stored_content=text)
writer.commit()

# Try out a search
with ix.searcher() as searcher:
    parser = QueryParser('content', ix.schema)
    query = parser.parse('Turkey')
    results = searcher.search(query)
import util
import pickle
import copy
import time
import os
import QAUtils as utils

cache = '../Dropbox/ScienceQASharedCache/'
regentsDataPath = cache + 'Regents_Train.tsv'
trainData = cache + 'training_set.tsv'
validationData = cache + 'validation_set.tsv'

# second order keywords: use the cached mapping when present, else start empty
localKeywords = (
    utils.loadData(cache + 'keywords.p')
    if os.path.isfile(cache + 'keywords.p')
    else {}
)


class WordGraph:
    def __init__(self, question, N):
        """Initialise the graph state for one question.

        Args:
            question: the question text; also the lookup key into the
                module-level ``localKeywords`` cache.
            N: stored unchanged as ``self.N``.
        """
        self.graph = {}
        self.N = N

        extracted = util.getKeywords(question)
        self.questionKeywords = extracted
        # Every question keyword gets the same weight, 1 / (number of keywords).
        # The division stays inside the comprehension so an empty keyword list
        # yields an empty dict instead of dividing by zero.
        # (alternative kept from the original: util.getImportanceDict(question))
        self.importance = {kw: 1/len(extracted) for kw in extracted}

        # Prefer cached second-order keywords for this question; otherwise
        # compute them via bestWords().
        if question in localKeywords:
            self.secondOrderKeywords = localKeywords[question]
        else:
            self.secondOrderKeywords = self.bestWords()