def HasFollowingPreposition(article, text_to_span, text_tokens):
    '''
    Tell whether some longer candidate span starts with the answer text and
    continues with a token tagged as a preposition.

    @param article: article whose paragraphs/sentences the spans index into
    @param text_to_span: dict mapping span text to a
        (paragraph index, sentence index, (begin, end)) tuple
    @param text_tokens: tokens of the answer
    @return: True if a span longer than the answer begins with exactly the
        answer text and its next token has POS 'IN' or 'TO', else False
    '''
    answer_len = len(text_tokens)
    answer_text = ReconstructStrFromSpan(text_tokens, (0, answer_len))
    preposition_tags = ['IN', 'TO']

    for candidate_text, (paragraph_i, sentence_i, span) in \
            text_to_span.iteritems():
        # Only spans strictly longer than the answer can have a follower.
        if span[1] - span[0] <= answer_len or answer_text not in candidate_text:
            continue
        sentence = article.paragraphs[paragraph_i].context.sentence[sentence_i]
        candidate_tokens = sentence.token[span[0]:span[1]]
        leading_text = ReconstructStrFromSpan(
            candidate_tokens[0:answer_len], (0, answer_len))
        if leading_text != answer_text:
            continue
        # The token right after the answer prefix decides the outcome.
        if candidate_tokens[answer_len].pos in preposition_tags:
            return True
    return False
def GetSentence(paragraph, text):
    '''
    Find the first sentence of a paragraph whose reconstructed text contains
    the given string.

    @param paragraph: object exposing context.sentence, each with .token
    @param text: substring to look for
    @return: the first matching sentence, or None when no sentence matches
    '''
    for candidate in paragraph.context.sentence:
        tokens = candidate.token
        rendered = ReconstructStrFromSpan(tokens, (0, len(tokens)))
        if text in rendered:
            return candidate
    return None
def __init__(self, title, qaId, query=None, context=None, ans=None,
             ansToken=None, paraId=None, senId=None, pAnsId=None,
             ansStr=None):
    '''
    @param title: indicate which article it belongs to.
    @param qaId: the unique id for qa pairs (stored as self.id)
    @param query, context, ans: list of int representation for RNN use
    @param ansToken: the tokens of the answer (for evaluation purpose)
    @param paraId: which paragraph the qa is from (stored as self.pAnsParaId)
    @param senId: which sentence the qa is from (stored as self.pAnsSenId)
    @param pAnsId: stored unchanged as self.pAnsId
    @param ansStr: answer string; when omitted and ansToken is non-empty it
        is rebuilt from ansToken with ReconstructStrFromSpan
    '''
    self.title = title
    self.id = qaId
    self.query = query
    self.context = context
    self.ans = ans
    self.ansToken = ansToken
    # Identity tests instead of ==/!= None: container-like token objects may
    # overload equality, which makes "!= None" unreliable.
    if ansStr is None and ansToken is not None and len(ansToken) != 0:
        self.ansStr = ReconstructStrFromSpan(ansToken, (0, len(ansToken)))
    else:
        self.ansStr = ansStr
    self.pAnsParaId = paraId
    self.pAnsSenId = senId
    self.pAnsId = pAnsId
def DebugIncomplete(sentence, text): PUNCT = [' ', '.', ','] sentence_text = ReconstructStrFromSpan(sentence.token, (0, len(sentence.token))) pos = sentence_text.find(text) pos_length = len(text) pos_changed = False while pos - 1 >= 0 and sentence_text[pos - 1] not in PUNCT: pos -= 1 pos_length += 1 pos_changed = True while pos + pos_length < len(sentence_text) and sentence_text[ pos + pos_length] not in PUNCT: pos_length += 1 pos_changed = True if pos_changed and sentence_text[pos:pos + pos_length] in text_to_span: print text, '---', sentence_text[pos:pos + pos_length]
def HasNounPrefix(article, text_to_span, text_tokens):
    '''
    Tell whether some longer candidate span ends with the answer text and is
    preceded only by noun/adjective/possessive-pronoun tokens.

    @param article: article whose paragraphs/sentences the spans index into
    @param text_to_span: dict mapping span text to a
        (paragraph index, sentence index, (begin, end)) tuple
    @param text_tokens: tokens of the answer
    @return: True if such a span exists, else False
    '''
    answer_len = len(text_tokens)
    answer_text = ReconstructStrFromSpan(text_tokens, (0, answer_len))
    # Note: PRP$ gives a few percent increase.
    prefix_tags = ['PRP$', 'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']

    for candidate_text, (paragraph_i, sentence_i, span) in \
            text_to_span.iteritems():
        if span[1] - span[0] <= answer_len or answer_text not in candidate_text:
            continue
        sentence = article.paragraphs[paragraph_i].context.sentence[sentence_i]
        candidate_tokens = sentence.token[span[0]:span[1]]
        trailing_tokens = candidate_tokens[-answer_len:]
        trailing_text = ReconstructStrFromSpan(trailing_tokens,
                                               (0, len(trailing_tokens)))
        if trailing_text != answer_text:
            continue
        leading_tokens = candidate_tokens[:-answer_len]
        if all([token.pos in prefix_tags for token in leading_tokens]):
            return True
    return False
def PrintAnswer(candidate_index, features, prefix): span = example.candidate_answers[ candidate_index] sentence_tokens = annotations.article.paragraphs[ span.paragraphIndex].context.sentence[ span.sentenceIndex].token print ' ' + prefix + ' Sentence:', ReconstructStrFromSpan( sentence_tokens) print ' ' + prefix + ' Span:', BuildPrediction( annotations, span) total_weight = 0 sorted_weights = [] for feature_index in features: total_weight += weights[feature_index] sorted_weights.append( (weights[feature_index], dictionary.GetName(feature_index))) print ' ' + prefix + ' Score:', total_weight print ' ' + prefix + ' Features:' sorted_weights.sort(reverse=True) for weight, name in sorted_weights: print ' ' + str(weight), name
# Command-line flags: path of the feature dictionary and the value forwarded
# to ReadExamples as its third argument.
flags.DEFINE_string('input-featuredict',
                    'dataset/featuredictbucketized-25000.proto', '')
flags.DEFINE_integer('min-articles', None, '')

if __name__ == '__main__':
    # Debug script: print every (question, candidate) pair on which one
    # specific feature appears, and whether that candidate is the label.
    dictionary = Dictionary(FLAGS.input_featuredict)
    feature_index = dictionary.GetIndex('Dep Path NN - conj -> NN')
    examples = ReadExamples(FLAGS.input_features, dictionary,
                            FLAGS.min_articles)
    question_annotations = ReadQuestionAnnotations(FLAGS.input_articles)
    for example in examples:
        # Each row of input_indices is read as
        # [question index, candidate index, feature index].
        for i in xrange(example.input_indices.shape[0]):
            question_index = example.input_indices[i][0]
            if example.input_indices[i][2] == feature_index:
                # The candidate is correct when it equals the question label.
                correct = example.input_indices[i][1] == example.label[
                    question_index]
                annotations = question_annotations[
                    example.question_ids[question_index]]
                span = example.candidate_answers[example.input_indices[i][1]]
                sentence_tokens = annotations.article.paragraphs[
                    span.paragraphIndex].context.sentence[
                        span.sentenceIndex].token
                print 'Sentence:', ReconstructStrFromSpan(sentence_tokens)
                print 'Question:', annotations.qa.question.text
                print 'Span:', BuildPrediction(annotations, span)
                print 'Correct!' if correct else 'Wrong!'
                # Blank line between records.
                print
def TestPredict(articleLevel=True): # pass agent = GetContextRnnAgent() agent.ConstructInputNode(batchSize=512) agent.ConstructEvalInputNode() agent.ConstructGraph() agent.ConstructEvalGraph() agent.articleLevel = articleLevel # prepare for evaluation on both training and agent.PrepareEvalInput(onTrain=True) agent.PrepareEvalInput(onTrain=False) # assert the prediction is in the correct scope candInput = agent.trainCandInput candGlobalId = agent.trainCandGlobalId candidates = agent.trainCandidates origData = agent.trainOrigData sess = tf.Session() agent.session = sess agent.session.run(tf.initialize_all_variables()) predictionTrain = agent.PredictTrainSamples() candInput = agent.evalCandInput candGlobalId = agent.evalCandGlobalId candidates = agent.evalCandidates origData = agent.evalOrigData predictionEval = agent.PredictEvalSamples() print "start eval predictions" # for samples, origData, prediction in zip([agent.trainSamples, agent.evalSamples], [agent.trainOrigData, agent.evalOrigData], [predictionTrain, predictionEval] ): for samples, origData, prediction in zip([agent.evalSamples], [agent.evalOrigData], [predictionEval]): # samples = agent.trainSamples # origData = agent.trainOrigData for sample in samples: idx = sample.id title = sample.title pred = prediction[idx] found = False if articleLevel: for para in origData[title].paragraphs: for sen in para.context.sentence: contextStr = ReconstructStrFromSpan( sen.token, (0, len(sen.token))) if pred.ansStr in contextStr: found = True break if found: break assert found else: paraId = sample.pAnsParaId for sen in origData[title].paragraphs[paraId].context.sentence: contextStr = ReconstructStrFromSpan( sen.token, (0, len(sen.token))) if pred.ansStr in contextStr: found = True break assert found print "ContextRnnPrediction test passed!"
def PredictPerArticle(self, title, returnDict): ''' @param returnDict: used to get return value from different processes launched from multiprocessing ''' print "Predicting for " + title # get unigram for the context article = self.data[title] contextUni = self.GetContextUnigram(article) # get all candidate span for the context contextSpan = self.GetContextConstituentSpan(article) # flatten version of contextUni context = [] if self.articleLevel: for paraUni in contextUni: for sentenceUni in paraUni: context += sentenceUni else: for paraUni in contextUni: contextPara = [] for sentenceUni in paraUni: contextPara += sentenceUni context.append(contextPara) # the questions are organized according to paragraph # but candidates are generated from the whole passage predictions = dict() for qaParaId, paragraph in enumerate(self.data[title].paragraphs): # predByPara = list() for qa in paragraph.qas: # pred = dict() preds = list() scores = list() # bestScore = sys.float_info.min qS = qa.question.sentence[0] qUnigram = [token.word.lower() for token in qS.token] if self.articleLevel: for iPara, para, unigrams, spans \ in enumerate(zip(self.data[title].paragraphs, contextUni, contextSpan) ): # traverse each sentence in the paragraph for iSen, (s, uni, spanList) in enumerate( zip(para.context.sentence, unigrams, spans)): assert len(s.token) == len(uni) for span in spanList: beginId = span[0] endId = span[1] aUnigram = uni[beginId:endId] score = self.GetSlidingWindowScore(context, qUnigram, aUnigram) \ - self.lambDist * self.GetMinDistance(context, qUnigram, aUnigram) scores.append(score) ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] preds.append( QaPrediction(title, qaId, ansStr, iPara, iSen, ansToken=ansToken)) else: iPara = qaParaId para = self.data[title].paragraphs[iPara] unigrams = contextUni[iPara] spans = contextSpan[iPara] for iSen, (s, uni, spanList) in enumerate( zip(para.context.sentence, unigrams, spans)): assert len(s.token) == 
len(uni) for span in spanList: beginId = span[0] endId = span[1] aUnigram = uni[beginId:endId] score = self.GetSlidingWindowScore(context, qUnigram, aUnigram) \ - self.lambDist * self.GetMinDistance(context, qUnigram, aUnigram) scores.append(score) ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] preds.append( QaPrediction(title, qaId, ansStr, iPara, iSen, ansToken=ansToken, score=score)) scores = np.array(scores) preds = np.array(preds) scoreOrder = np.argsort(-scores) predictions[qaId] = preds[scoreOrder][ 0:min(self.topK, preds.size)].tolist() returnDict[title] = predictions
def PredictPerArticle(self, title, returnDict): ''' @param returnDict: used to get return value from different processes launched from multiprocessing ''' # get unigram and bigram for the context article = self.data[title] contextUni = self.GetContextUnigram(article) contextBi = self.GetContextBigram(article) # get all candidate span for the context contextSpan = self.GetContextConstituentSpan(article) # the questions are organized according to paragraph # but candidates are generated from the whole passage predictions = dict() print "Predicting for " + title for qaParaId, paragraph in enumerate(self.data[title].paragraphs): # predByPara = list() for qa in paragraph.qas: pred = dict() bestScore = sys.float_info.min bestSentence = None qS = qa.question.sentence[0] qUnigram = [token.word.lower() for token in qS.token] qBigram = self.GetBigramBySentence(qS.token) # traverse over paragraphs pred = dict() pred["answer"] = [] pred["token"] = [] # if ansSentence not in pred["sentence"]: pred["sentence"] = [] pred["sentenceToken"] = [] pred["paraId"] = [] pred["senId"] = [] for iPara, (para, unigrams, bigrams, spans) \ in enumerate(zip(self.data[title].paragraphs, contextUni, contextBi, contextSpan) ): # traverse each sentence in the paragraph if self.articleLevel == False and iPara != qaParaId: continue for iSen, (s, uni, bi, spanList) in enumerate( zip(para.context.sentence, unigrams, bigrams, spans)): assert len(s.token) == len(uni) assert len(s.token) == len(bi) + 1 for span in spanList: beginId = span[0] endId = span[1] cbUnigram = uni[0:beginId] caUnigram = uni[endId:] if beginId == 0: cbBigram = [] else: cbBigram = bi[0:(beginId - 1)] caBigram = bi[endId:] aUnigram = uni[beginId:endId] aBigram = bi[beginId:(endId - 1)] # if len(aUnigram) == 1 and (aUnigram[0] == "." or aUnigram[0] == "?" or aUnigram[0] == "!" 
or aUnigram[0] == "the" or aUnigram[0] == "The"): if len(aUnigram ) == 1 and aUnigram[0] in self.stopWords: continue if len(aUnigram) > 3: continue ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] ansSentence = ReconstructStrFromSpan( s.token, (0, len(s.token))) ansSentenceToken = s.token[0:len(s.token)] assert len(s.token) != 0 # ansSentenceToken = [token.word for token in s.token] pred["answer"].append(ansStr) pred["token"].append(ansToken) # if ansSentence not in pred["sentence"]: pred["sentence"].append(ansSentence) pred["sentenceToken"].append(ansSentenceToken) pred["paraId"].append(iPara) pred["senId"].append(iSen) # filter from the candidates for best choice preds = list() for ansToken, sentenceToken, ansStr, iPara, iSen \ in zip(pred["token"], pred["sentenceToken"], pred["answer"], pred["paraId"], pred["senId"] ): ansToken = [token.word.lower() for token in ansToken] if ansToken[-1] == ".": ansToken = ansToken[:-1] if ansToken[0] == "The" or ansToken[0] == "the": ansToken = ansToken[1:] if len(ansToken) == 0: print " zero length ans token" preds.append( QaPrediction(title, qa.id, ansStr, iPara, iSen, ansToken=ansToken)) random.shuffle(preds) predictions[qa.id] = preds[0:min(self.topK, len(preds))] predictions[qa.id] = predictions[qa.id][0].ansStr returnDict[title] = predictions
num_appears_in_long_sentence = 0 num_has_noun_prefix = 0 num_has_following_preposition = 0 num_broken = 0 for article in articles: text_to_span = {} spans = GetContextConstituentSpan(article) num_spans = 0 for paragraph_i in xrange(len(spans)): paragraph = article.paragraphs[paragraph_i] for sentence_i in xrange(len(spans[paragraph_i])): tokens = paragraph.context.sentence[sentence_i].token for span in spans[paragraph_i][sentence_i]: num_spans += 1 text = ReconstructStrFromSpan(tokens, span) text_to_span[text] = (paragraph_i, sentence_i, span) bad = [] for paragraph in article.paragraphs: for qa in paragraph.qas: num_answers += 1 num_answer_span_pairs += num_spans if len(qa.answer.sentence) > 1: num_multiple_sentences += 1 continue text_tokens = qa.answer.sentence[0].token num_answer_tokens = len(text_tokens) text = ReconstructStrFromSpan(text_tokens, (0, num_answer_tokens))
def PredictPerArticle(self, title, returnDict): # def PredictPerArticle(self, title): ''' @param returnDict: used to get return value from different processes launched from multiprocessing ''' # get unigram and bigram for the context article = self.data[title] contextUni = self.GetContextUnigram(article) contextBi = self.GetContextBigram(article) # get all candidate span for the context contextSpan = self.GetContextConstituentSpan(article) # the questions are organized according to paragraph # but candidates are generated from the whole passage predictions = dict() # print "Predicting for " + title for qaParaId, paragraph in enumerate(self.data[title].paragraphs): # predByPara = list() for qa in paragraph.qas: pred = dict() bestScore = sys.float_info.min bestSentence = None qS = qa.question.sentence[0] qUnigram = [token.word.lower() for token in qS.token] qBigram = self.GetBigramBySentence(qS.token) # traverse over paragraphs pred = dict() for iPara, (para, unigrams, bigrams, spans) \ in enumerate(zip(self.data[title].paragraphs, contextUni, contextBi, contextSpan) ): # traverse each sentence in the paragraph if self.articleLevel == False and iPara != qaParaId: continue for iSen, (s, uni, bi, spanList) in enumerate( zip(para.context.sentence, unigrams, bigrams, spans)): assert len(s.token) == len(uni) assert len(s.token) == len(bi) + 1 for span in spanList: beginId = span[0] endId = span[1] cbUnigram = uni[0:beginId] caUnigram = uni[endId:] if beginId == 0: cbBigram = [] else: cbBigram = bi[0:(beginId - 1)] caBigram = bi[endId:] aUnigram = uni[beginId:endId] aBigram = bi[beginId:(endId - 1)] # if len(aUnigram) == 1 and (aUnigram[0] == "." or aUnigram[0] == "?" or aUnigram[0] == "!" 
or aUnigram[0] == "the" or aUnigram[0] == "The"): if len(aUnigram) == 1 and aUnigram[ 0] in self.slidingWindowAgent.stopWords: continue score = self.GetContextScore( qUnigram, qBigram, cbUnigram, caUnigram, cbBigram, caBigram, aUnigram, aBigram, self.stopWords) if score > bestScore or len(pred) == 0: ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] ansSentence = ReconstructStrFromSpan( s.token, (0, len(s.token))) ansSentenceToken = s.token[0:len(s.token)] pred = { "id": qa.id, "answer": [ ansStr, ], "token": [ ansToken, ], "sentence": [ ansSentence, ], "sentenceToken": [ ansSentenceToken, ], "paraId": [ iPara, ], "senId": [ iSen, ] } bestScore = score # the current dataset has empty s.text field # assert ansStr in s.text # note we permit multiple answers elif score == bestScore: ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] ansSentence = ReconstructStrFromSpan( s.token, (0, len(s.token))) ansSentenceToken = s.token[0:len(s.token)] assert len(s.token) != 0 # ansSentenceToken = [token.word for token in s.token] pred["answer"].append(ansStr) pred["token"].append(ansToken) # if ansSentence not in pred["sentence"]: pred["sentence"].append(ansSentence) pred["sentenceToken"].append(ansSentenceToken) pred["paraId"].append(iPara) pred["senId"].append(iSen) # filter from the candidates for best choice preds = list() if self.slidingWindowAgent != None: subAgent = self.slidingWindowAgent slidingScores = [] for ansToken, sentenceToken, ansStr, iPara, iSen \ in zip(pred["token"], pred["sentenceToken"], pred["answer"], pred["paraId"], pred["senId"] ): ansToken = [token.word.lower() for token in ansToken] if ansToken[-1] == ".": ansToken = ansToken[:-1] if ansToken[0] == "The" or ansToken[0] == "the": ansToken = ansToken[1:] if len(ansToken) == 0: print " zero length ans token" sentenceToken = [ token.word.lower() for token in sentenceToken ] slidingScores.append( subAgent.GetSlidingDistScore( sentenceToken, 
qUnigram, ansToken)) preds.append( QaPrediction(title, qa.id, ansStr, iPara, iSen, ansToken=ansToken, score=slidingScores[-1])) slidingScores = np.array(slidingScores) preds = np.array(preds) scoreOrder = np.argsort(-slidingScores) predictions[qa.id] = preds[scoreOrder[ 0:min(self.topK, scoreOrder.size)]].tolist() # cut for the top 1 prediction predictions[qa.id] = predictions[qa.id][0].ansStr returnDict[title] = predictions
# Debug dump: for every question print its wh-word (if any), its text and
# dependency tree, the answer with its POS tags, and the last context
# sentence that starts at or before the answer offset.
# NOTE(review): the block nesting below was reconstructed from source whose
# line breaks were lost - confirm which statements belong to the `else`
# branch and to the final `if`.
wh_counts = defaultdict(int)
for article, paragraph, qa in questions:
    whToken = None
    question = qa.question.sentence[0]
    # First token with a wh POS tag whose lemma is not 'that'.
    for token in question.token:
        if token.pos in ['WRB', 'WP', 'WDT', 'WP$'] and token.lemma != 'that':
            whToken = token
            break
    if whToken is None:
        # root indices are shifted by one to index question.token
        # (assumes they are 1-based - TODO confirm)
        questionRoot = question.basicDependencies.root[0] - 1
    else:
        print whToken.lemma, whToken.pos
    print ReconstructStrFromSpan(question.token)
    DisplayDepTree(question)
    print
    print ReconstructStrFromSpan(qa.answer.sentence[0].token)
    # POS tags of the answer tokens on one line.
    for answer_token in qa.answer.sentence[0].token:
        sys.stdout.write(answer_token.pos + ' ')
    print
    # Keep the last sentence whose start offset does not exceed the answer's.
    last_sentence = None
    for sentence in paragraph.context.sentence:
        if sentence.characterOffsetBegin <= qa.answerOffset:
            last_sentence = sentence
    if last_sentence is not None:
        print ReconstructStrFromSpan(last_sentence.token)
        DisplayDepTree(last_sentence)
        DisplayParseTree(last_sentence.parseTree)
        print
for i in xrange(len(sentence.token)): if sentence.token[i].lemma in questionLemmas: questionI = questionLemmas[sentence.token[i].lemma] path, token = GetPath(i, correctAnswer.spanBeginIndex, correctAnswer.spanBeginIndex + correctAnswer.spanLength, sentence) root_path, _ = GetPath(sentenceRoot - 1, i, i + 1, sentence) question_root_path, _ = GetPath(questionRoot - 1, questionI, questionI + 1, question) if path is not None: output_paths = [] output_paths.append(sentence.token[i].pos + path + ' ' + token.pos) if root_path is not None: output_paths.append('S ' + sentence.token[sentenceRoot - 1].pos + root_path + ' ' + sentence.token[i].pos + path + ' ' + token.pos) if question_root_path is not None: output_paths.append('Q ' + question.token[questionRoot - 1].pos + question_root_path + ' ' + sentence.token[i].pos + path + ' ' + token.pos) for output_path in output_paths: match_dep_paths[output_path] += 1 match_dep_examples[output_path].append((sentence.token[sentenceRoot - 1].lemma, question.token[questionRoot - 1].lemma, sentence.token[i].lemma, token.lemma, ReconstructStrFromSpan(sentence.token), ReconstructStrFromSpan(question.token), ReconstructStrFromSpan(sentence.token, (correctAnswer.spanBeginIndex, correctAnswer.spanBeginIndex + correctAnswer.spanLength)))) print same_root, contains_root, total total_cnt = 0 for dep_path, cnt in sorted(match_dep_paths.items(), key=lambda x: x[1], reverse=True): total_cnt += cnt print dep_path, cnt, total_cnt examples = match_dep_examples[dep_path] for ex in xrange(0, min(5, len(examples))): sentence_root_lemma, question_root_lemma, sentence_lemma, answer_lemma, sentence, question, answer = examples[ex] if dep_path.startswith('S '): print sentence_root_lemma, '---', sentence_lemma, '---', answer_lemma, '---', sentence, '---', question, '---', answer
def Predict(self, samples, candInput, candGlobalId, candData, origData, session): ''' predict each sample in samples with the related data in candInput. @param candInput: it can be either self.trainCandInput or self.evalCandInput candInput is produced via self.PrepareEvalInput we reuse the interface of paRnnInput and pcRnnInput to get the scores @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId @param candData: either self.traincandData or self.evalcandData @param origData: either self.trainOrigData or self.evalOrigData ''' candPadded, contextPadded, candLen, contextLen = candInput prediction = dict() #set this to reuse training computational graph for evaluation batchSize = self.batchSize for iSample, sample in enumerate(samples): title = sample.title qaId = sample.id if self.articleLevel: nCand = len(candPadded[title] ) qRnnInput = np.array(sample.query).reshape( (1, len(sample.query) ) ) qSeqLen = np.array( (len(sample.query), ) ) paRnnInput = candPadded[title] paSeqLen = candLen[title] pcRnnInput = contextPadded[title] pcSeqLen = contextLen[title] else: paraId = sample.pAnsParaId nCand = len(candPadded[title][paraId] ) qRnnInput = np.array(sample.query).reshape( (1, len(sample.query) ) ) qSeqLen = np.array( (len(sample.query), ) ) paRnnInput = candPadded[title][paraId] paSeqLen = candLen[title][paraId] pcRnnInput = contextPadded[title][paraId] pcSeqLen = contextLen[title][paraId] dataBatch = {self.qRnnInputEval : qRnnInput, self.aRnnInputEval : paRnnInput, self.cRnnInputEval : pcRnnInput, self.qSeqLenEval : qSeqLen, self.aSeqLenEval : paSeqLen, self.cSeqLenEval : pcSeqLen} dataBatch = self.GetPredictBatch(dataBatch) scores = session.run(self.evalScore, feed_dict=dataBatch) predId = np.argmax(scores) if self.articleLevel == False: # from paragraph level span id to article level span id globalId = [idx for idSen in candGlobalId[title][sample.pAnsParaId] for idx in idSen] predId = globalId[predId] predInfo = 
candData[title].candidateAnswers[predId] predParaId = predInfo.paragraphIndex predSenId = predInfo.sentenceIndex predSpanStart = predInfo.spanBeginIndex predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength tokens = origData[title].paragraphs[predParaId].context.sentence[predSenId].token[predSpanStart:predSpanEnd] predStr = ReconstructStrFromSpan(tokens, (0, len(tokens) ) ) prediction[qaId] = QaPrediction(title, qaId, predStr, predParaId, predSenId) if (iSample + 1) % 500 == 0 or iSample == len(samples) - 1: print "predicted ", str(iSample + 1), " / " , str(len(samples) ), " samples!" predForDump = dict() for qaId in prediction.keys(): predForDump[qaId] = prediction[qaId].ansStr return prediction, predForDump
def Predict(self, samples, candInput, candGlobalId, candData, origData,
            session):
    '''
    predict a ranked list of at most self.predTopK answers for each sample
    in samples with the related data in candInput.

    @param samples: iterable of samples exposing title, id, query and
        pAnsParaId
    @param candInput: it can be either self.trainCandInput or
        self.evalCandInput. candInput is produced via self.PrepareEvalInput;
        we reuse the interface of paRnnInput and pcRnnInput to get the scores
    @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId
    @param candData: either self.trainCandData or self.evalCandData
    @param origData: either self.trainOrigData or self.evalOrigData
    @param session: session used to evaluate self.evalScore
    @return: dict mapping qa id to a list of QaPrediction, best score first
    '''
    candPadded, contextPadded, candLen, contextLen = candInput
    prediction = dict()
    topK = self.predTopK
    for sample in samples:
        title = sample.title
        qaId = sample.id
        query = sample.query
        # the question is fed as a single-row batch
        qRnnInput = np.array(query).reshape((1, len(query)))
        qSeqLen = np.array((len(query), ))
        if self.articleLevel:
            paRnnInput = candPadded[title]
            paSeqLen = candLen[title]
            pcRnnInput = contextPadded[title]
            pcSeqLen = contextLen[title]
            globalId = None
        else:
            # candidates are restricted to the answer's paragraph
            paraId = sample.pAnsParaId
            paRnnInput = candPadded[title][paraId]
            paSeqLen = candLen[title][paraId]
            pcRnnInput = contextPadded[title][paraId]
            pcSeqLen = contextLen[title][paraId]
            # from paragraph level span id to article level span id; built
            # once per sample instead of once per ranked candidate
            globalId = [
                idx for idSen in candGlobalId[title][paraId]
                for idx in idSen
            ]

        batchData = {
            self.qRnnInputEval: qRnnInput,
            self.aRnnInputEval: paRnnInput,
            self.cRnnInputEval: pcRnnInput,
            self.qSeqLenEval: qSeqLen,
            self.aSeqLenEval: paSeqLen,
            self.cSeqLenEval: pcSeqLen
        }
        batchData = self.GetPredictBatch(batchData)
        scores = session.run(self.evalScore, feed_dict=batchData)

        # predict a topK list: candidate ids by descending score of row 0
        predIdSort = np.argsort(-scores[0, :])
        ranked = list()
        for predId in predIdSort[0:min(topK, scores.size)]:
            if globalId is not None:
                predId = globalId[predId]
            predInfo = candData[title].candidateAnswers[predId]
            predParaId = predInfo.paragraphIndex
            predSenId = predInfo.sentenceIndex
            predSpanStart = predInfo.spanBeginIndex
            predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength
            tokens = origData[title].paragraphs[
                predParaId].context.sentence[predSenId].token[
                    predSpanStart:predSpanEnd]
            predStr = ReconstructStrFromSpan(tokens, (0, len(tokens)))
            ranked.append(
                QaPrediction(title, qaId, predStr, predParaId, predSenId,
                             ansToken=tokens))
        prediction[qaId] = ranked
    return prediction