Example #1
	def __init__(self, start, end, dataType, N):
		self.LETTERS = ['A', 'B', 'C', 'D']
		self.dataType = dataType
		self.fullTest = self.validationSet()
		# keep only the questions in the requested [start, end) slice
		self.test = [q for i, q in enumerate(self.fullTest) if start <= i < end]
		self.correct = 0
		self.incorrect = 0
		self.answerReport = []
		self.searchAnswerReport = []
		self.timeReport = []
		self.N = N

		# instantiate mindmaps: load the cached copy if one exists
		if os.path.isfile(cache + 'mindmaps.p'):
			self.mindmaps = utils.loadData(cache + 'mindmaps.p')
		else:
			self.mindmaps = {}
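# The load-or-initialize pattern above leans on QAUtils.loadData/saveData.
# A minimal sketch, assuming they are thin pickle wrappers (the real QAUtils
# module is not shown in these excerpts):
import pickle

def loadData(path):
	# unpickle a cached object from disk
	with open(path, 'rb') as f:
		return pickle.load(f)

def saveData(data, path):
	# pickle an object to disk; argument order matches utils.saveData(keywords, path) below
	with open(path, 'wb') as f:
		pickle.dump(data, f)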
Example #2
def getSearchFromFile():
	'''Opens the local cached copy of search results and concatenates the result snippets'''
	searchResults = utils.loadData(cache + 'searchResults.p')
	searchObject = json.loads(searchResults)
	# join the snippet of every result item into one document
	snippetDoc = ''
	for item in searchObject['items']:
		snippetDoc += item['snippet']
	return snippetDoc
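# The cached pickle is expected to hold a JSON string shaped like a Google
# Custom Search response; a hedged illustration (the field values are made up):
import json

sampleResults = json.dumps({
	'items': [
		{'snippet': 'Photosynthesis is the process by which plants...'},
		{'snippet': 'Green plants absorb carbon dioxide and release oxygen...'},
	]
})
# json.loads(sampleResults)['items'] is the list getSearchFromFile iterates over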
Example #3
import searchText as scraper
import util
import QAUtils as utils
from Models import Test
import pickle, os, time

cache = '../Dropbox/ScienceQASharedCache/'

# Get local copy of freebase
if os.path.isfile(cache + 'FB_relations.p'): freebaseRelations = utils.loadData(cache + 'FB_relations.p')
else:
	freebaseRelations = {}

# Setup for worker pool
poolWorkerNum = 200
poolIterations = 2
poolRedundancies = False

# Get all keywords
eightGradeExam = Test(start=0, end=8132, dataType='val', N=6)

keywords = eightGradeExam.getSecondOrderKeywords()

# save second order keywords
utils.saveData(keywords, cache + 'SecondOrderKeywords.p')
print('Keywords saved.')

# Filter keywords already in local freebaseRelations
keywords = [kw for kw in keywords if kw not in freebaseRelations]
print('Number of second order keywords left: {}'.format(len(keywords)))
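# The pool parameters above presumably drive a bulk Freebase lookup helper
# elsewhere in the project; a hedged sketch using a plain thread pool
# (fetchOne and fetchRelations are hypothetical names, not the project's API):
from multiprocessing.dummy import Pool

def fetchRelations(keywords, fetchOne, workers=poolWorkerNum):
	# issue many keyword lookups concurrently and collect them into a dict
	with Pool(workers) as pool:
		results = pool.map(fetchOne, keywords)
	return dict(zip(keywords, results))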
Example #4
#  - spacy word2vec cosine distance between question and answer (own and average of four)
#  - spacy word2vec cosine distance between answer option and other options (own and average of four)
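# Hedged sketch of the word2vec distance described above, using spacy's
# built-in vector similarity (the model name is an assumption):
import spacy
nlp = spacy.load('en_core_web_md')

def qaSimilarity(question, answer):
    # cosine similarity between the averaged word vectors of the two texts
    return nlp(question).similarity(nlp(answer))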
print('- Basic formatting')
trainX = extractor.basicFormatFeatures(trainPairedQA)
valX = extractor.basicFormatFeatures(valPairedQA)
print(trainX.shape)


# Feature measuring proximity of a given Q-A pair to authoritative texts
#  - Q-A combined into a single statement then search carried out to see distance to closest sentence in text
#  - Authoritative text from wikipedia and CK12 free online textbooks for elementary school children
#  - Two measures given--one requiring relatively strict matches, one allowing loose matches
#  - return both absolute value as well as average of other 3 answers
print('- Text match features')
if os.path.isfile(cache + 'trainX'):
    train_textMatch = utils.loadData(cache + 'trainX')
else:
    train_textMatch = extractor.getTextMatchFeatures(trainPairedQA, kList=[100, 10, 100, 1000, 3])
    utils.saveData(train_textMatch, cache + 'trainX')
trainX = extractor.concat(trainX, train_textMatch)
print(trainX.shape)

# if os.path.isfile(cache + 'valX'): 
#     valX = extractor.concat(valX, utils.loadData(cache + 'valX'))
# else:
#     valX = extractor.getTextMatchFeatures(valPairedQA, kList=[100, 10, 100, 1000, 3])
# print(trainX.shape)
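# The strict and loose match measures themselves live inside the extractor and
# are not shown here; a hedged toy illustration of the idea (names hypothetical):
def sentenceMatchScore(statement, sentence, loose=False):
    sWords = set(statement.lower().split())
    tWords = set(sentence.lower().split())
    overlap = len(sWords & tWords)
    if loose:
        # loose match: reward any word overlap at all
        return float(overlap > 0)
    # strict match: fraction of the statement's words found in the sentence
    return overlap / max(len(sWords), 1)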

# Features from the keyword graph from the Aristo paper
#  - size of question graph, size of answer graph, coherence score of answers, coherence score
#    of question keywords, number of pruned words for each Q-A pair
print('- Keyword graph features')
Example #5
import os, sys
import QAUtils as utils
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in
from whoosh.qparser import QueryParser

# 0. Set global parameters
cache = '../Dropbox/ScienceQASharedCache/'

# 1. Get corpus
corpus = utils.loadData(cache + 'allTextLines')[:100]

# 2. Index using whoosh
# index 'content' for searching and keep a stored copy for display
schema = Schema(content=TEXT, stored_content=TEXT(stored=True))
if not os.path.exists(cache + 'IRindex'):
	os.mkdir(cache + 'IRindex')
ix = create_in(cache + 'IRindex', schema)  # create_in returns a ready-to-use index

writer = ix.writer()
for i, line in enumerate(corpus):
	sys.stdout.write('\rAdding line {} of {} to index'.format(i+1, len(corpus)))
	sys.stdout.flush()
	writer.add_document(content=line, stored_content=line)
writer.commit()

# Try out a search and print the stored text of the top hits
with ix.searcher() as searcher:
	query = QueryParser('content', ix.schema).parse('Turkey')
	results = searcher.search(query)
	for hit in results:
		print(hit['stored_content'])
Example #6
import util
import pickle
import copy
import time
import os
import QAUtils as utils

cache = '../Dropbox/ScienceQASharedCache/'

regentsDataPath = cache + 'Regents_Train.tsv'
trainData = cache + 'training_set.tsv'
validationData = cache + 'validation_set.tsv'

# second order keywords
if os.path.isfile(cache + 'keywords.p'): localKeywords = utils.loadData(cache + 'keywords.p')
else: localKeywords = {}

class WordGraph:
	def __init__(self, question, N):
		# print('Question:', question)
		self.graph = {}
		self.N = N
		self.questionKeywords = util.getKeywords(question)
		# print('Question keywords extracted:', self.questionKeywords)

		self.importance = {kw: 1/len(self.questionKeywords) for kw in self.questionKeywords}
		# self.importance = util.getImportanceDict(question)
		# print('Keyword importance:', self.importance)

		self.secondOrderKeywords = localKeywords[question] if question in localKeywords else self.bestWords()
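# Hedged usage sketch (assumes the rest of the class, e.g. bestWords(), is
# defined as in the full source; the question text is made up):
wg = WordGraph('Which gas do green plants absorb during photosynthesis?', N=6)
print(wg.questionKeywords)
print(wg.secondOrderKeywords)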