コード例 #1
0
def HasFollowingPreposition(article, text_to_span, text_tokens):
    """Tell whether a longer candidate span starts with the answer and then
    continues with a token tagged 'IN' or 'TO'.

    article: object whose paragraphs[p].context.sentence[s].token holds the
        tokens of each sentence.
    text_to_span: dict mapping span text to (paragraph_i, sentence_i, span),
        where span is a (begin, end) token range.
    text_tokens: tokens of the answer.

    Returns True as soon as one matching span is found, False otherwise.
    """
    answer_len = len(text_tokens)
    answer_text = ReconstructStrFromSpan(text_tokens, (0, answer_len))

    for candidate_text, location in text_to_span.iteritems():
        paragraph_i, sentence_i, span = location
        # Only strictly longer spans that contain the answer text qualify.
        if span[1] - span[0] <= answer_len or answer_text not in candidate_text:
            continue

        sentence = article.paragraphs[paragraph_i].context.sentence[sentence_i]
        candidate_tokens = sentence.token[span[0]:span[1]]

        # The span must begin with exactly the answer tokens ...
        leading_text = ReconstructStrFromSpan(candidate_tokens[0:answer_len],
                                              (0, answer_len))
        if leading_text != answer_text:
            continue
        # ... and the token right after them must carry one of these tags.
        if candidate_tokens[answer_len].pos in ['IN', 'TO']:
            return True
    return False
コード例 #2
0
def GetSentence(paragraph, text):
    """Return the first sentence of `paragraph` whose reconstructed text
    contains `text`, or None when no sentence does."""
    for candidate in paragraph.context.sentence:
        tokens = candidate.token
        rendered = ReconstructStrFromSpan(tokens, (0, len(tokens)))
        if text in rendered:
            return candidate
    return None
コード例 #3
0
ファイル: agent.py プロジェクト: Barachiel/qna
 def __init__(self,
              title,
              qaId,
              query=None,
              context=None,
              ans=None,
              ansToken=None,
              paraId=None,
              senId=None,
              pAnsId=None,
              ansStr=None):
     '''
     Store the fields describing one question/answer sample.

     @param title: indicate which article it belongs to.
     @param qaId: the unique id for qa pairs (stored as self.id)
     @param query, context, ans: list of int representation for RNN use
     @param ansToken: the tokens of the answer (for evaluation purpose)
     @param paraId: which paragraph the qa is from (stored as self.pAnsParaId)
     @param senId: which sentence the qa is from (stored as self.pAnsSenId)
     @param pAnsId: stored unchanged as self.pAnsId
     @param ansStr: answer string; when it is None and ansToken is
         non-empty, it is rebuilt from ansToken with ReconstructStrFromSpan
     '''
     self.title = title
     self.id = qaId
     self.query = query
     self.context = context
     self.ans = ans
     self.ansToken = ansToken
     # Identity checks against None avoid invoking a custom __eq__ on
     # whatever container type ansToken happens to be.
     if ansStr is None and self.ansToken is not None and len(
             self.ansToken) != 0:
         self.ansStr = ReconstructStrFromSpan(ansToken, (0, len(ansToken)))
     else:
         self.ansStr = ansStr
     self.pAnsParaId = paraId
     self.pAnsSenId = senId
     self.pAnsId = pAnsId
コード例 #4
0
def DebugIncomplete(sentence, text):
    PUNCT = [' ', '.', ',']

    sentence_text = ReconstructStrFromSpan(sentence.token,
                                           (0, len(sentence.token)))
    pos = sentence_text.find(text)
    pos_length = len(text)
    pos_changed = False
    while pos - 1 >= 0 and sentence_text[pos - 1] not in PUNCT:
        pos -= 1
        pos_length += 1
        pos_changed = True
    while pos + pos_length < len(sentence_text) and sentence_text[
            pos + pos_length] not in PUNCT:
        pos_length += 1
        pos_changed = True

    if pos_changed and sentence_text[pos:pos + pos_length] in text_to_span:
        print text, '---', sentence_text[pos:pos + pos_length]
コード例 #5
0
def HasNounPrefix(article, text_to_span, text_tokens):
    """Tell whether a longer candidate span ends with the answer and is
    preceded, inside that span, only by tokens with an allowed POS tag.

    article: object whose paragraphs[p].context.sentence[s].token holds the
        tokens of each sentence.
    text_to_span: dict mapping span text to (paragraph_i, sentence_i, span),
        where span is a (begin, end) token range.
    text_tokens: tokens of the answer.

    Returns True as soon as one matching span is found, False otherwise.
    """
    # Note: PRP$ gives a few percent increase.
    allowed_prefix_pos = [
        'PRP$', 'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'
    ]
    answer_len = len(text_tokens)
    answer_text = ReconstructStrFromSpan(text_tokens, (0, answer_len))

    for candidate_text, location in text_to_span.iteritems():
        paragraph_i, sentence_i, span = location
        # Only strictly longer spans that contain the answer text qualify.
        if span[1] - span[0] <= answer_len or answer_text not in candidate_text:
            continue

        sentence = article.paragraphs[paragraph_i].context.sentence[sentence_i]
        candidate_tokens = sentence.token[span[0]:span[1]]

        # The span has to end with exactly the answer tokens.
        tail_tokens = candidate_tokens[-answer_len:]
        tail_text = ReconstructStrFromSpan(tail_tokens, (0, len(tail_tokens)))
        if tail_text != answer_text:
            continue

        # Everything in front of the answer must carry an allowed tag.
        prefix_tokens = candidate_tokens[:-answer_len]
        if all([token.pos in allowed_prefix_pos for token in prefix_tokens]):
            return True

    return False
コード例 #6
0
ファイル: eval.py プロジェクト: sheetalsh456/q-net
                            def PrintAnswer(candidate_index, features, prefix):
                                span = example.candidate_answers[
                                    candidate_index]
                                sentence_tokens = annotations.article.paragraphs[
                                    span.paragraphIndex].context.sentence[
                                        span.sentenceIndex].token
                                print '  ' + prefix + ' Sentence:', ReconstructStrFromSpan(
                                    sentence_tokens)
                                print '  ' + prefix + ' Span:', BuildPrediction(
                                    annotations, span)

                                total_weight = 0
                                sorted_weights = []
                                for feature_index in features:
                                    total_weight += weights[feature_index]
                                    sorted_weights.append(
                                        (weights[feature_index],
                                         dictionary.GetName(feature_index)))

                                print '  ' + prefix + ' Score:', total_weight
                                print '  ' + prefix + ' Features:'
                                sorted_weights.sort(reverse=True)
                                for weight, name in sorted_weights:
                                    print '    ' + str(weight), name
コード例 #7
0
# Flags declared in this excerpt. FLAGS.input_features and
# FLAGS.input_articles, which are read below, are not declared here.
flags.DEFINE_string('input-featuredict',
                    'dataset/featuredictbucketized-25000.proto', '')
flags.DEFINE_integer('min-articles', None, '')

# Debug script: for every occurrence of one specific feature, print the
# sentence, question and candidate span it fired on, and whether that
# candidate is the labelled answer.
if __name__ == '__main__':
    dictionary = Dictionary(FLAGS.input_featuredict)
    # Index of the single feature being inspected.
    feature_index = dictionary.GetIndex('Dep Path NN - conj -> NN')

    examples = ReadExamples(FLAGS.input_features, dictionary,
                            FLAGS.min_articles)
    question_annotations = ReadQuestionAnnotations(FLAGS.input_articles)

    for example in examples:
        # Each row of input_indices is read as
        # [question index, candidate index, feature index].
        for i in xrange(example.input_indices.shape[0]):
            question_index = example.input_indices[i][0]
            if example.input_indices[i][2] == feature_index:
                # The candidate is correct when it equals the label stored
                # for its question.
                correct = example.input_indices[i][1] == example.label[
                    question_index]

                annotations = question_annotations[
                    example.question_ids[question_index]]
                span = example.candidate_answers[example.input_indices[i][1]]
                sentence_tokens = annotations.article.paragraphs[
                    span.paragraphIndex].context.sentence[
                        span.sentenceIndex].token
                print 'Sentence:', ReconstructStrFromSpan(sentence_tokens)
                print 'Question:', annotations.qa.question.text
                print 'Span:', BuildPrediction(annotations, span)
                print 'Correct!' if correct else 'Wrong!'
                # Blank line between reported occurrences.
                print
コード例 #8
0
ファイル: test_context_rnn.py プロジェクト: Barachiel/qna
def TestPredict(articleLevel=True):
    """Smoke-test prediction of the context RNN agent.

    Builds the agent and its graphs, initializes the variables, runs
    prediction on the train and eval samples, and asserts that every eval
    prediction's answer string occurs in some sentence of the scope it was
    predicted from.

    articleLevel: when true, the whole article of the sample is searched;
        otherwise only the paragraph given by sample.pAnsParaId.
    """
    agent = GetContextRnnAgent()
    agent.ConstructInputNode(batchSize=512)
    agent.ConstructEvalInputNode()
    agent.ConstructGraph()
    agent.ConstructEvalGraph()
    agent.articleLevel = articleLevel
    # prepare for evaluation on both training and evaluation data
    agent.PrepareEvalInput(onTrain=True)
    agent.PrepareEvalInput(onTrain=False)

    sess = tf.Session()
    agent.session = sess
    agent.session.run(tf.initialize_all_variables())
    # The train predictions are only computed to exercise that code path;
    # the scope check below is done on the eval predictions.
    agent.PredictTrainSamples()
    predictionEval = agent.PredictEvalSamples()

    print("start eval predictions")

    # assert the prediction is in the correct scope
    origData = agent.evalOrigData
    for sample in agent.evalSamples:
        pred = predictionEval[sample.id]
        article = origData[sample.title]
        if articleLevel:
            paragraphs = article.paragraphs
        else:
            paragraphs = [article.paragraphs[sample.pAnsParaId]]

        found = any(
            pred.ansStr in ReconstructStrFromSpan(sen.token,
                                                  (0, len(sen.token)))
            for para in paragraphs
            for sen in para.context.sentence)
        assert found, "prediction for sample %s is outside its scope" % (
            sample.id, )

    print("ContextRnnPrediction test passed!")
コード例 #9
0
ファイル: sliding_window.py プロジェクト: Barachiel/qna
    def PredictPerArticle(self, title, returnDict):
        '''
        Rank the candidate spans of one article for each of its questions.

        Every candidate span is scored as
        GetSlidingWindowScore(...) - self.lambDist * GetMinDistance(...),
        and the self.topK best-scoring QaPrediction objects are kept per
        question. With self.articleLevel set, candidates come from every
        paragraph; otherwise only from the paragraph owning the question.

        @param title: key of the article in self.data
        @param returnDict: used to get return value from
        different processes launched from multiprocessing;
        returnDict[title] is set to a dict mapping qa.id to the list of
        top predictions, best score first
        '''
        print("Predicting for " + title)
        article = self.data[title]
        paragraphs = article.paragraphs
        # get unigram for the context
        contextUni = self.GetContextUnigram(article)
        # get all candidate span for the context
        contextSpan = self.GetContextConstituentSpan(article)

        # flatten version of contextUni: one flat list for the article, or
        # one flat list per paragraph when working at paragraph level
        context = []
        if self.articleLevel:
            for paraUni in contextUni:
                for sentenceUni in paraUni:
                    context += sentenceUni
        else:
            for paraUni in contextUni:
                contextPara = []
                for sentenceUni in paraUni:
                    contextPara += sentenceUni
                context.append(contextPara)
        # NOTE(review): at paragraph level `context` is a list of
        # per-paragraph lists and is passed whole to the scoring helpers
        # below; confirm they do not expect context[iPara] instead.

        allParaItems = list(
            enumerate(zip(paragraphs, contextUni, contextSpan)))

        # the questions are organized according to paragraph
        # but candidates are generated from the whole passage
        predictions = dict()
        for qaParaId, paragraph in enumerate(paragraphs):
            for qa in paragraph.qas:
                preds = list()
                scores = list()
                qS = qa.question.sentence[0]
                qUnigram = [token.word.lower() for token in qS.token]

                if self.articleLevel:
                    paraItems = allParaItems
                else:
                    paraItems = [(qaParaId, (paragraphs[qaParaId],
                                             contextUni[qaParaId],
                                             contextSpan[qaParaId]))]

                # enumerate() yields (index, item) pairs, so the zipped
                # item has to be unpacked as a nested tuple.
                for iPara, (para, unigrams, spans) in paraItems:
                    # traverse each sentence in the paragraph
                    for iSen, (s, uni, spanList) in enumerate(
                            zip(para.context.sentence, unigrams, spans)):
                        assert len(s.token) == len(uni)
                        for span in spanList:
                            aUnigram = uni[span[0]:span[1]]

                            score = self.GetSlidingWindowScore(context, qUnigram, aUnigram) \
                                - self.lambDist * self.GetMinDistance(context, qUnigram, aUnigram)

                            scores.append(score)
                            ansStr = ReconstructStrFromSpan(s.token, span)
                            ansToken = s.token[span[0]:span[1]]
                            preds.append(
                                QaPrediction(title,
                                             qa.id,
                                             ansStr,
                                             iPara,
                                             iSen,
                                             ansToken=ansToken,
                                             score=score))

                # Sort indices by descending score and pick predictions
                # from the plain list, so the prediction objects never have
                # to be packed into a numpy array.
                scoreOrder = np.argsort(-np.array(scores))
                predictions[qa.id] = [
                    preds[i]
                    for i in scoreOrder[0:min(self.topK, len(preds))]
                ]
        returnDict[title] = predictions
コード例 #10
0
    def PredictPerArticle(self, title, returnDict):
        '''
        Pick a random candidate span as the answer of each question.

        Candidates are the constituent spans of the article (or only of the
        question's own paragraph when self.articleLevel is false), except
        single-token spans that are stop words and spans longer than three
        tokens. The candidates are shuffled and the answer string of the
        first one is stored.

        @param title: key of the article in self.data
        @param returnDict: used to get return value from
        different processes launched from multiprocessing;
        returnDict[title] is set to a dict mapping qa.id to the chosen
        answer string
        '''
        # get unigram and bigram for the context
        article = self.data[title]
        contextUni = self.GetContextUnigram(article)
        contextBi = self.GetContextBigram(article)
        # get all candidate span for the context
        contextSpan = self.GetContextConstituentSpan(article)
        # the questions are organized according to paragraph
        # but candidates are generated from the whole passage
        predictions = dict()
        print("Predicting for " + title)
        for qaParaId, paragraph in enumerate(article.paragraphs):
            for qa in paragraph.qas:
                # (ansStr, ansToken, iPara, iSen) of every kept candidate
                candidates = []
                for iPara, (para, unigrams, bigrams, spans) \
                    in enumerate(zip(article.paragraphs, contextUni, contextBi, contextSpan)):
                    # at paragraph level only the question's paragraph counts
                    if not self.articleLevel and iPara != qaParaId:
                        continue
                    # traverse each sentence in the paragraph
                    for iSen, (s, uni, bi, spanList) in enumerate(
                            zip(para.context.sentence, unigrams, bigrams,
                                spans)):
                        assert len(s.token) == len(uni)
                        assert len(s.token) == len(bi) + 1
                        for span in spanList:
                            aUnigram = uni[span[0]:span[1]]
                            if len(aUnigram
                                   ) == 1 and aUnigram[0] in self.stopWords:
                                continue
                            if len(aUnigram) > 3:
                                continue
                            candidates.append(
                                (ReconstructStrFromSpan(s.token, span),
                                 s.token[span[0]:span[1]], iPara, iSen))

                preds = list()
                for ansStr, ansToken, iPara, iSen in candidates:
                    ansWords = [token.word.lower() for token in ansToken]
                    # Drop a trailing period and a leading article. Each
                    # step checks for emptiness first because the previous
                    # one may have removed the only token.
                    if ansWords and ansWords[-1] == ".":
                        ansWords = ansWords[:-1]
                    if ansWords and ansWords[0] == "the":
                        ansWords = ansWords[1:]
                    if len(ansWords) == 0:
                        print(" zero length ans token")
                    preds.append(
                        QaPrediction(title,
                                     qa.id,
                                     ansStr,
                                     iPara,
                                     iSen,
                                     ansToken=ansWords))
                random.shuffle(preds)

                # NOTE(review): raises IndexError when a question has no
                # candidate at all (or self.topK is 0), as before.
                topPreds = preds[0:min(self.topK, len(preds))]
                predictions[qa.id] = topPreds[0].ansStr
        returnDict[title] = predictions
コード例 #11
0
    num_appears_in_long_sentence = 0
    num_has_noun_prefix = 0
    num_has_following_preposition = 0
    num_broken = 0
    for article in articles:
        text_to_span = {}

        spans = GetContextConstituentSpan(article)
        num_spans = 0
        for paragraph_i in xrange(len(spans)):
            paragraph = article.paragraphs[paragraph_i]
            for sentence_i in xrange(len(spans[paragraph_i])):
                tokens = paragraph.context.sentence[sentence_i].token
                for span in spans[paragraph_i][sentence_i]:
                    num_spans += 1
                    text = ReconstructStrFromSpan(tokens, span)
                    text_to_span[text] = (paragraph_i, sentence_i, span)

        bad = []
        for paragraph in article.paragraphs:
            for qa in paragraph.qas:
                num_answers += 1
                num_answer_span_pairs += num_spans
                if len(qa.answer.sentence) > 1:
                    num_multiple_sentences += 1
                    continue

                text_tokens = qa.answer.sentence[0].token
                num_answer_tokens = len(text_tokens)
                text = ReconstructStrFromSpan(text_tokens,
                                              (0, num_answer_tokens))
コード例 #12
0
ファイル: predictor.py プロジェクト: Barachiel/qna
    def PredictPerArticle(self, title, returnDict):
        '''
        Predict one answer string for each question of an article.

        Stage 1 scores every candidate span with self.GetContextScore and
        keeps all spans tied for the best score. Stage 2, run only when
        self.slidingWindowAgent is set, re-ranks those spans with the
        agent's GetSlidingDistScore and stores the answer string of the
        best one.

        @param title: key of the article in self.data
        @param returnDict: used to get return value from
        different processes launched from multiprocessing;
        returnDict[title] is set to a dict mapping qa.id to the predicted
        answer string
        '''
        # get unigram and bigram for the context
        article = self.data[title]
        contextUni = self.GetContextUnigram(article)
        contextBi = self.GetContextBigram(article)
        # get all candidate span for the context
        contextSpan = self.GetContextConstituentSpan(article)
        # the questions are organized according to paragraph
        # but candidates are generated from the whole passage
        predictions = dict()
        for qaParaId, paragraph in enumerate(article.paragraphs):
            for qa in paragraph.qas:
                bestScore = sys.float_info.min
                qS = qa.question.sentence[0]
                qUnigram = [token.word.lower() for token in qS.token]
                qBigram = self.GetBigramBySentence(qS.token)
                # stays empty until the first candidate has been scored
                pred = dict()
                # traverse over paragraphs
                for iPara, (para, unigrams, bigrams, spans) \
                    in enumerate(zip(article.paragraphs, contextUni, contextBi, contextSpan)):
                    # at paragraph level only the question's paragraph counts
                    if not self.articleLevel and iPara != qaParaId:
                        continue
                    # traverse each sentence in the paragraph
                    for iSen, (s, uni, bi, spanList) in enumerate(
                            zip(para.context.sentence, unigrams, bigrams,
                                spans)):
                        assert len(s.token) == len(uni)
                        assert len(s.token) == len(bi) + 1
                        for span in spanList:
                            beginId = span[0]
                            endId = span[1]
                            # unigrams / bigrams before (cb), after (ca)
                            # and inside (a) the candidate span
                            cbUnigram = uni[0:beginId]
                            caUnigram = uni[endId:]
                            # bi[0:-1] would be wrong for beginId == 0
                            if beginId == 0:
                                cbBigram = []
                            else:
                                cbBigram = bi[0:(beginId - 1)]
                            caBigram = bi[endId:]
                            aUnigram = uni[beginId:endId]
                            aBigram = bi[beginId:(endId - 1)]
                            # NOTE(review): the stop words come from
                            # self.slidingWindowAgent here but from self in
                            # the GetContextScore call below, and this
                            # access happens before the agent is checked
                            # for None further down; confirm the intent.
                            if len(aUnigram) == 1 and aUnigram[
                                    0] in self.slidingWindowAgent.stopWords:
                                continue
                            score = self.GetContextScore(
                                qUnigram, qBigram, cbUnigram, caUnigram,
                                cbBigram, caBigram, aUnigram, aBigram,
                                self.stopWords)

                            if score > bestScore or len(pred) == 0:
                                # new best score: restart the tie list
                                pred = {
                                    "id": qa.id,
                                    "answer": [],
                                    "token": [],
                                    "sentence": [],
                                    "sentenceToken": [],
                                    "paraId": [],
                                    "senId": []
                                }
                                bestScore = score
                            elif score != bestScore:
                                continue
                            # note we permit multiple answers: every span
                            # reaching the best score is recorded
                            # the current dataset has empty s.text field,
                            # so the strings are rebuilt from the tokens
                            pred["answer"].append(
                                ReconstructStrFromSpan(s.token, span))
                            pred["token"].append(s.token[span[0]:span[1]])
                            pred["sentence"].append(
                                ReconstructStrFromSpan(
                                    s.token, (0, len(s.token))))
                            pred["sentenceToken"].append(
                                s.token[0:len(s.token)])
                            pred["paraId"].append(iPara)
                            pred["senId"].append(iSen)

                # filter from the candidates for best choice
                if self.slidingWindowAgent is not None:
                    subAgent = self.slidingWindowAgent
                    preds = list()
                    slidingScores = []
                    # NOTE(review): pred is still empty when no span was
                    # scored, so the lookups below raise KeyError, as before.
                    for ansToken, sentenceToken, ansStr, iPara, iSen \
                        in zip(pred["token"], pred["sentenceToken"], pred["answer"], pred["paraId"], pred["senId"]):
                        ansWords = [token.word.lower() for token in ansToken]
                        # Drop a trailing period and a leading article. Each
                        # step checks for emptiness first because the
                        # previous one may have removed the only token.
                        if ansWords and ansWords[-1] == ".":
                            ansWords = ansWords[:-1]
                        if ansWords and ansWords[0] == "the":
                            ansWords = ansWords[1:]
                        if len(ansWords) == 0:
                            print(" zero length ans token")
                        sentenceWords = [
                            token.word.lower() for token in sentenceToken
                        ]
                        slidingScores.append(
                            subAgent.GetSlidingDistScore(
                                sentenceWords, qUnigram, ansWords))
                        preds.append(
                            QaPrediction(title,
                                         qa.id,
                                         ansStr,
                                         iPara,
                                         iSen,
                                         ansToken=ansWords,
                                         score=slidingScores[-1]))

                    # Sort indices by descending sliding score and pick the
                    # predictions from the plain list.
                    scoreOrder = np.argsort(-np.array(slidingScores))
                    topPreds = [
                        preds[i]
                        for i in scoreOrder[0:min(self.topK, scoreOrder.size)]
                    ]
                    # cut for the top 1 prediction
                    predictions[qa.id] = topPreds[0].ansStr
        returnDict[title] = predictions
コード例 #13
0
ファイル: dep2.py プロジェクト: Barachiel/qna
# Debug dump: for every question containing a wh-word, print the question,
# its answer, and the last context sentence starting at or before the
# answer offset.
# NOTE: wh_counts is created here but not updated anywhere in this excerpt.
wh_counts = defaultdict(int)
for article, paragraph, qa in questions:
    whToken = None    
    question = qa.question.sentence[0]
    
    # First token with a wh-type POS tag, ignoring the lemma 'that'.
    for token in question.token:
        if token.pos in ['WRB', 'WP', 'WDT', 'WP$'] and token.lemma != 'that':
            whToken = token
            break
    
    if whToken is None:
        # The stored root index has 1 subtracted before use.
        # questionRoot is not used anywhere in this excerpt.
        questionRoot = question.basicDependencies.root[0] - 1

    else:
        print whToken.lemma, whToken.pos
        print ReconstructStrFromSpan(question.token)
        DisplayDepTree(question)
        print
        print ReconstructStrFromSpan(qa.answer.sentence[0].token)
        # POS tags of the answer tokens on one line.
        for answer_token in qa.answer.sentence[0].token:
            sys.stdout.write(answer_token.pos + ' ')
        print
        # Keep the last sentence whose start offset is <= the answer offset.
        last_sentence = None
        for sentence in paragraph.context.sentence:
            if sentence.characterOffsetBegin <= qa.answerOffset:
                last_sentence = sentence
        if last_sentence is not None:
            print ReconstructStrFromSpan(last_sentence.token)
            DisplayDepTree(last_sentence)
            DisplayParseTree(last_sentence.parseTree)
        print
コード例 #14
0
ファイル: dep.py プロジェクト: Barachiel/qna
            for i in xrange(len(sentence.token)):
                if sentence.token[i].lemma in questionLemmas:
                    questionI = questionLemmas[sentence.token[i].lemma]
                    path, token = GetPath(i, correctAnswer.spanBeginIndex, correctAnswer.spanBeginIndex + correctAnswer.spanLength, sentence)
                    root_path, _ = GetPath(sentenceRoot - 1, i, i + 1, sentence)
                    question_root_path, _ = GetPath(questionRoot - 1, questionI, questionI + 1, question)
                    if path is not None:
                        output_paths = []
                        output_paths.append(sentence.token[i].pos + path + ' ' + token.pos)
                        if root_path is not None:
                            output_paths.append('S ' + sentence.token[sentenceRoot - 1].pos + root_path + ' ' + sentence.token[i].pos + path + ' ' + token.pos)
                        if question_root_path is not None:
                            output_paths.append('Q ' + question.token[questionRoot - 1].pos + question_root_path + ' ' + sentence.token[i].pos + path + ' ' + token.pos)
                        for output_path in output_paths:
                            match_dep_paths[output_path] += 1
                            match_dep_examples[output_path].append((sentence.token[sentenceRoot - 1].lemma, question.token[questionRoot - 1].lemma, sentence.token[i].lemma, token.lemma, ReconstructStrFromSpan(sentence.token), ReconstructStrFromSpan(question.token), ReconstructStrFromSpan(sentence.token, (correctAnswer.spanBeginIndex, correctAnswer.spanBeginIndex + correctAnswer.spanLength))))
                        
                
            

# Summary counters; they are computed outside this excerpt.
print same_root, contains_root, total

# Report the dependency paths from the most to the least frequent, with a
# running total and up to five stored examples per path.
total_cnt = 0
for dep_path, cnt in sorted(match_dep_paths.items(), key=lambda x: x[1], reverse=True):
    total_cnt += cnt
    print dep_path, cnt, total_cnt
    examples = match_dep_examples[dep_path]
    for ex in xrange(0, min(5, len(examples))):
        sentence_root_lemma, question_root_lemma, sentence_lemma, answer_lemma, sentence, question, answer = examples[ex]
        # Examples are only printed for paths carrying the 'S ' prefix.
        if dep_path.startswith('S '):
            print sentence_root_lemma, '---', sentence_lemma, '---', answer_lemma, '---', sentence, '---', question, '---', answer
コード例 #15
0
    def Predict(self, samples, candInput, candGlobalId, candData, origData, session):
        '''
        Predict the single best-scoring candidate answer for each sample.

        @param samples: sequence of QA samples; each one is read for its
        title, id and query (plus pAnsParaId when self.articleLevel is false)
        @param candInput: it can be either self.trainCandInput or self.evalCandInput
        candInput is produced via self.PrepareEvalInput
        we reuse the interface of paRnnInput and pcRnnInput to get the scores.
        It unpacks to (candPadded, contextPadded, candLen, contextLen), each
        indexed by title and, at paragraph level, also by paragraph id
        @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId
        @param candData: either self.traincandData or self.evalcandData
        @param origData: either self.trainOrigData or self.evalOrigData
        @param session: object whose run(fetches, feed_dict=...) evaluates
        self.evalScore (presumably a TensorFlow session -- confirm)
        @return: tuple (prediction, predForDump). prediction maps each qa id
        to a QaPrediction; predForDump maps each qa id to the ansStr of that
        prediction. Both are empty dicts when samples is empty.
        '''
        candPadded, contextPadded, candLen, contextLen = candInput
        prediction = dict()
        for iSample, sample in enumerate(samples):
            title = sample.title
            qaId = sample.id

            # the query is always fed as a batch holding one sequence
            qRnnInput = np.array(sample.query).reshape((1, len(sample.query)))
            qSeqLen = np.array((len(sample.query), ))
            if self.articleLevel:
                # candidates are looked up per article
                paRnnInput = candPadded[title]
                paSeqLen = candLen[title]
                pcRnnInput = contextPadded[title]
                pcSeqLen = contextLen[title]
            else:
                # candidates are looked up per paragraph of the article
                paraId = sample.pAnsParaId
                paRnnInput = candPadded[title][paraId]
                paSeqLen = candLen[title][paraId]
                pcRnnInput = contextPadded[title][paraId]
                pcSeqLen = contextLen[title][paraId]

            dataBatch = {self.qRnnInputEval: qRnnInput,
                         self.aRnnInputEval: paRnnInput,
                         self.cRnnInputEval: pcRnnInput,
                         self.qSeqLenEval: qSeqLen,
                         self.aSeqLenEval: paSeqLen,
                         self.cSeqLenEval: pcSeqLen}
            dataBatch = self.GetPredictBatch(dataBatch)

            scores = session.run(self.evalScore, feed_dict=dataBatch)
            # index (into the flattened scores) of the highest score
            predId = np.argmax(scores)
            if not self.articleLevel:
                # from paragraph level span id to article level span id
                globalId = [idx for idSen in candGlobalId[title][sample.pAnsParaId] for idx in idSen]
                predId = globalId[predId]
            predInfo = candData[title].candidateAnswers[predId]
            predParaId = predInfo.paragraphIndex
            predSenId = predInfo.sentenceIndex
            predSpanStart = predInfo.spanBeginIndex
            predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength

            tokens = origData[title].paragraphs[predParaId].context.sentence[predSenId].token[predSpanStart:predSpanEnd]
            predStr = ReconstructStrFromSpan(tokens, (0, len(tokens)))
            prediction[qaId] = QaPrediction(title, qaId, predStr, predParaId, predSenId)
            if (iSample + 1) % 500 == 0 or iSample == len(samples) - 1:
                # Single-argument print(...) emits the same text as the
                # former space-separated print statement and parses under
                # both Python 2 and Python 3.
                print(" ".join(("predicted ", str(iSample + 1), " / ", str(len(samples)), " samples!")))

        # Build the dump view once, after all predictions exist. It used to
        # be rebuilt for every sample and was undefined for empty input.
        predForDump = dict((key, pred.ansStr) for key, pred in prediction.items())
        return prediction, predForDump
コード例 #16
0
    def Predict(self, samples, candInput, candGlobalId, candData, origData,
                session):
        '''
        Predict a ranked list of up to self.predTopK answers for each sample.

        @param samples: iterable of QA samples; each one is read for its
        title, id and query (plus pAnsParaId when self.articleLevel is false)
        @param candInput: it can be either self.trainCandInput or self.evalCandInput
        candInput is produced via self.PrepareEvalInput
        we reuse the interface of paRnnInput and pcRnnInput to get the scores.
        It unpacks to (candPadded, contextPadded, candLen, contextLen), each
        indexed by title and, at paragraph level, also by paragraph id
        @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId
        @param candData: either self.traincandData or self.evalcandData
        @param origData: either self.trainOrigData or self.evalOrigData
        @param session: object whose run(fetches, feed_dict=...) evaluates
        self.evalScore (presumably a TensorFlow session -- confirm)
        @return: dict mapping each qa id to a list of QaPrediction objects
        ordered from highest to lowest score; the list holds at most
        self.predTopK entries. The dict is empty when samples is empty.
        '''
        candPadded, contextPadded, candLen, contextLen = candInput
        prediction = dict()
        topK = self.predTopK

        for sample in samples:
            title = sample.title
            qaId = sample.id

            # the query is always fed as a batch holding one sequence
            qRnnInput = np.array(sample.query).reshape(
                (1, len(sample.query)))
            qSeqLen = np.array((len(sample.query), ))
            if self.articleLevel:
                # candidates are looked up per article
                paRnnInput = candPadded[title]
                paSeqLen = candLen[title]
                pcRnnInput = contextPadded[title]
                pcSeqLen = contextLen[title]
            else:
                # candidates are looked up per paragraph of the article
                paraId = sample.pAnsParaId
                paRnnInput = candPadded[title][paraId]
                paSeqLen = candLen[title][paraId]
                pcRnnInput = contextPadded[title][paraId]
                pcSeqLen = contextLen[title][paraId]

            batchData = {
                self.qRnnInputEval: qRnnInput,
                self.aRnnInputEval: paRnnInput,
                self.cRnnInputEval: pcRnnInput,
                self.qSeqLenEval: qSeqLen,
                self.aSeqLenEval: paSeqLen,
                self.cSeqLenEval: pcSeqLen
            }
            batchData = self.GetPredictBatch(batchData)
            scores = session.run(self.evalScore, feed_dict=batchData)

            # The local -> article-level id table is the same for every
            # rank, so it is built once per sample rather than once per rank.
            globalId = None
            if not self.articleLevel:
                # from paragraph level span id to article level span id
                globalId = [
                    idx for idSen in candGlobalId[title][sample.pAnsParaId]
                    for idx in idSen
                ]

            # predict a topK list: negating the scores makes the ascending
            # argsort list candidate indices from best to worst
            predIdSort = np.argsort(-scores[0, :])
            prediction[qaId] = list()
            for rank in range(min(topK, scores.size)):
                predId = predIdSort[rank]
                if globalId is not None:
                    predId = globalId[predId]
                predInfo = candData[title].candidateAnswers[predId]
                predParaId = predInfo.paragraphIndex
                predSenId = predInfo.sentenceIndex
                predSpanStart = predInfo.spanBeginIndex
                predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength
                tokens = origData[title].paragraphs[
                    predParaId].context.sentence[predSenId].token[
                        predSpanStart:predSpanEnd]
                predStr = ReconstructStrFromSpan(tokens, (0, len(tokens)))
                prediction[qaId].append(
                    QaPrediction(title,
                                 qaId,
                                 predStr,
                                 predParaId,
                                 predSenId,
                                 ansToken=tokens))
        return prediction