Example #1
import re

# QABase and get_sentences come from the course's qa_engine helpers
# (see the imports and the get_sentences definition at the end of this page).
def get_text_question(question):
    driver = QABase()

    qid = question["qid"]
    q = driver.get_question(qid)
    question = q["text"]

    raw_tags = get_sentences(question)[0]
    word_array = []
    tag_array = []

    for word, tag in raw_tags:
        word_array.append(word)
        tag_array.append(tag)

    questions = []

    for i in range(len(word_array)):  # was len(word_array) - 1, which dropped the last token

        tag = tag_array[i]

        """
        if tag_array[i] == 'WRB':
            word = str(word_array[i]).lower()
            questions.append((word, tag_array[i]))

        if tag_array[i] == 'WP':
            word = str(word_array[i]).lower()
            questions.append((word, tag_array[i]))

        """

        # keep verbs (VB*), nouns (NN*), and adjectives (JJ*);
        # equivalent to the original three findall/equality checks
        if re.fullmatch(r'(VB|NN|JJ)\w?', tag):
            questions.append((str(word_array[i]).lower(), tag))

    # print(len(questions))
    # print(questions)
    # print("\n")

    return questions
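
A minimal usage sketch (the qid is hypothetical and assumes the course's question database is in place); the function returns the question's content words as (word, tag) pairs:

if __name__ == '__main__':
    # hypothetical qid; any id present in the question database would do
    print(get_text_question({"qid": "fables-01-1"}))
    # expected shape: [('sat', 'VBD'), ('crow', 'NN'), ...]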
Example #2
def get_better_answer(q):
    # GRAMMAR and the find_* helpers are sketched after this example
    answer = None
    driver = QABase()
    q = driver.get_question(q["qid"])  # was: undefined question_id
    story = driver.get_story(q["sid"])
    text = story["text"]
    sentences = get_sentences(text)
    state_question = baseline_stub.reformulate_question(q)
    parsed_dic = parsed_question_dic(q)
    subj = parsed_dic["nsubj"]
    verb = parsed_dic["verb"]
    # find the sentences mentioning the question's subject and verb, then chunk them;
    # in the original these three lines ran before subj/verb/sentences existed
    crow_sentences = find_sentences([subj, verb], sentences)
    chunker = nltk.RegexpParser(GRAMMAR)
    locations = find_candidates(crow_sentences, chunker)
    if 'somewhere' in state_question:
        # "where" questions: answer with the PP (location) chunks
        answer = locations  # was: find_locations(tree) with tree undefined
    if 'sometime' in state_question:
        pass  # "when" questions: not implemented yet
    if 'someone' in state_question:
        if state_question.startswith('someone'):  # was: startwith (typo)
            answer = find_subj(sentences)
    if 'somewhat' in state_question:
        pass  # direct/indirect-object and verb questions: not implemented yet
    if 'somewhy' in state_question:
        pass  # "why" questions: not implemented yet
    return answer
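
Examples #2, #4, #6, #8, and #10 rely on GRAMMAR and several find_* helpers that this listing never defines. The sketch below is consistent with how they are called, but it is an assumption based on the usual PP-chunking scaffold, not the original implementations (Examples #6 and #10 additionally pass the question text to find_candidates, presumably to filter candidates; that variant is omitted here):

import re

import nltk

# assumed chunk grammar; PP chunks serve as location candidates
GRAMMAR = r"""
            N: {<PRP>|<NN.*>}
            V: {<V.*>}
            ADJ: {<JJ.*>}
            NP: {<DT>? <ADJ>* <N>+}
            PP: {<IN> <NP>}
            VP: {<TO>? <V> (<NP>|<PP>)*}
            """


def find_sentences(patterns, sentences):
    # keep the tagged sentences whose raw text contains every pattern
    raw_sentences = [" ".join(tok[0] for tok in sent) for sent in sentences]
    return [sent for sent, raw in zip(sentences, raw_sentences)
            if all(re.search(pat, raw) for pat in patterns)]


def find_locations(tree):
    # collect every PP chunk from a chunked sentence
    return list(tree.subtrees(filter=lambda t: t.label() == "PP"))


def find_candidates(sentences, chunker):
    # chunk each candidate sentence and pull out its PP chunks
    candidates = []
    for sent in sentences:
        candidates.extend(find_locations(chunker.parse(sent)))
    return candidates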
Example #3
def main():
    driver = QABase()
    q = driver.get_question("fables-01_Q1")
    story = driver.get_story(q["storyid"])
    print("sentence selected:{}".format(story))

    tree = story[0]['const_parse']
    print("const tree:{}".format(tree))

    # Create our pattern
    pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")

    # Match our pattern to the tree
    subtree = pattern_matcher(pattern, tree)
    if subtree is None:
        print("no match found for the pattern")
        return
    print(" ".join(subtree.leaves()))

    # create a new pattern to match a smaller subset of subtree
    pattern2 = nltk.ParentedTree.fromstring("(PP)")

    # Find and print the answer
    subtree2 = pattern_matcher(pattern2, subtree)
    if subtree2 is not None:
        print(" ".join(subtree2.leaves()))
Example #4
if __name__ == '__main__':

    # Our tools
    chunker = nltk.RegexpParser(GRAMMAR)
    # lmtzr = WordNetLemmatizer()

    question_id = "blogs-01-3"
    # question_id = "fables-02-1"
    # question_id = "mc500.train.0.12"
    # question_id = "fables-02-3"
    # question_id = "blogs-01-5"
    # question_id = "fables-02-1"
    # question_id = "fables-01-3"

    driver = QABase()
    q = driver.get_question(question_id)
    story = driver.get_story(q["sid"])
    # sentences = story["story_par"]
    text = story["text"]
    # print(text)
    # Apply the standard NLP pipeline we've seen before

    # sentences = get_sentences(text)
    sentences = get_sentences_without_quotes(text)
    # print(sentences)
    # answer = find_subj(sentences)
    where = find_where(sentences)
    for whe in where:
        print(" ".join([token[0] for token in whe.leaves()]))
Example #5
    # snode (defined earlier in the full module) is the sentence's root/verb
    # node; look for an "nmod" dependent of it and return that modifier phrase
    for node in sgraph.nodes.values():
        if node.get('head', None) == snode["address"]:
            print("===node=====:{}".format(node))
            if node['rel'] == "nmod":
                deps = get_dependents(node, sgraph)
                deps = sorted(deps + [node],
                              key=operator.itemgetter("address"))
                return " ".join(dep["word"] for dep in deps)

    # raise NotImplementedError

    return None


if __name__ == '__main__':
    driver = QABase()

    # Get the first question and its story
    q = driver.get_question("fables-01_Q1")
    print("question:", q["question"])
    qgraph = q['dep_parse']

    #print(qgraph)
    # raise NotImplementedError

    story = driver.get_story(q["storyid"])
    print("sentence selected: ", story[0]['sentence'])

    sgraph = story[0]['dep_parse']
    nodes = list(sgraph.nodes.values())
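
get_dependents is used above but never defined. A sketch that recursively walks an nltk DependencyGraph's 'deps' lists; this matches how the result is sorted by 'address' above, but it is an assumed reconstruction:

def get_dependents(node, graph):
    # collect the node's dependents, depth-first, as node dicts
    results = []
    for rel in node["deps"]:
        for address in node["deps"][rel]:
            dep = graph.nodes[address]
            results.append(dep)
            results += get_dependents(dep, graph)
    return results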
Example #6
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from qa_engine.base import QABase

# GRAMMAR and the helpers (get_sentences, get_bow, baseline, find_sentences,
# find_candidates, score_answers) are assumed to be defined elsewhere in the module.


def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str


    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of the question.
        sid --  The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions
                of the .
        qid  --  The id of the question.


    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of
                    the story version.
        sch_dep -- list of dependency graphs for each sentence of
                    the sch version.
        sch_par -- list of constituency parses for each sentence of
                    the sch version.
        story_par -- list of constituency parses for each sentence of
                    the story version.
        sch --  the raw text for the sch version.
        text -- the raw text for the story version.
        sid --  the story id


    """
    ###     Your Code Goes Here         ###
    # Our tools
    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()

    driver = QABase()
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])
    text = story["text"]

    # Apply the standard NLP pipeline we've seen before
    sentences = get_sentences(text)
    # print(sentences)
    # print(question["text"])

    # tokenize questions, also removing punctuations to extract keywords
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_question_text = tokenizer.tokenize(question["text"])
    tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)
    # remove stopwords
    tagged_keywords_list = []
    stopwords = set(nltk.corpus.stopwords.words("english"))
    for word, tag in tagged_tokenized_question_text:
        if word not in stopwords:
            tagged_keywords_list.append((word, tag))

    # stem keywords (a SnowballStemmer, despite the "lemmatized" names below)
    # TODO: keywords must be in a specific order; this list's order is effectively random
    # TODO: single words are usually tagged as nouns even when they should be verbs
    lemmatized_keywords_list = []

    for keyword, tag in tagged_keywords_list:
        lemmatized_keywords_list.append(stemmer.stem(keyword))

    # sort into noun, verb order

    crow_sentences = find_sentences(lemmatized_keywords_list, sentences)
    # crow_sentences = find_sentences(keywords_list, sentences)
    # print(crow_sentences)
    # Extract the candidate locations from these sentences
    locations = find_candidates(crow_sentences, chunker, question["text"])
    print("sentences:", len(sentences))
    print("orignal keywords:", tagged_keywords_list)
    print("keywords:", lemmatized_keywords_list)

    print("crow_sentences:", len(crow_sentences))
    print(question["text"], locations)

    if question["difficulty"] == 'Easy' and len(locations) != 0:
        '''
        if story["sid"] == "fables-01":
            print("-----------------------------------------------------")
            print(crow_sentences)
            print("keywords:", keywords_list)
            print("questions:", question["text"])
            print("loc:", locations)
            # Print them out
            for loc in locations:
                print(loc)
                print(" ".join([token[0] for token in loc.leaves()]))
            print("-----------------------------------------------------")
        '''
        answer = []

        for loc in locations:
            answer.append(" ".join([token[0] for token in loc.leaves()]))
        answer = " ".join(answer)

    elif question["difficulty"] == 'Easy':
        text = story["text"]
        questions = question["text"]
        stopwords = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(questions)[0], stopwords)
        # get_bow filters stopwords and returns a bag of words;
        # get_sentences returns tagged sentences, and [0] takes the first (the question itself),
        # so qbow is a list of tagged question words without stopwords

        sentences = get_sentences(text)
        answer_tuples = baseline(qbow, sentences, stopwords)
        answer = " ".join(t[0] for t in answer_tuples)

        # print("question:", questions)
        # print(answer)
    elif question["difficulty"] == 'Medium':


    ###     End of Your Code         ###
    return answer



#############################################################
###     Don't change the code below here
#############################################################

class QAEngine(QABase):
    @staticmethod
    def answer_question(question, story):
        answer = get_answer(question, story)
        return answer


def run_qa():
    QA = QAEngine()
    QA.run() #reads questions, iterates over questions
    QA.save_answers()

def main():
    run_qa()
    # Evaluate the answers here, or run score_answers.py separately
    score_answers()

if __name__ == "__main__":
    main()
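
get_bow and baseline are not defined in this listing. A sketch matching how the Easy fallback above uses them (a standard word-overlap baseline; an assumption, not the original):

import operator


def get_bow(tagged_tokens, stopwords):
    # bag of lowercased words with stopwords removed
    return set(t[0].lower() for t in tagged_tokens
               if t[0].lower() not in stopwords)


def baseline(qbow, sentences, stopwords):
    # return the tagged sentence with the largest word overlap with the question
    scored = [(len(qbow & get_bow(sent, stopwords)), sent) for sent in sentences]
    return max(scored, key=operator.itemgetter(0))[1]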
Example #7
import csv

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer

from qa_engine.base import QABase


def get_sentence():  # Eventual change: take a question_id; if type == 'sch', use the Scheherazade interpretation
    tag_list = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ']
    with open('data/hw6-answers.csv') as qfile:
        readCSV = csv.reader(qfile, delimiter=',')
        qids = [row[2] for row in readCSV]

    del qids[0]  # drop the CSV header row


    driver = QABase()
    lemmatizer = WordNetLemmatizer()

    correct = 0
    equal_scores = []

    for question_id in qids:

        q = driver.get_question(question_id)
        story = driver.get_story(q['sid'])
        text = story['text']
        question = q['text']

        text_sentences = nltk.sent_tokenize(text)
        words = [nltk.word_tokenize(x) for x in text_sentences]

        question_words = nltk.word_tokenize(question)
        question_tagged = nltk.pos_tag(question_words)

        scores = [0] * len(text_sentences)

        for i, x in enumerate(words):
            for y in x:
                if y not in ['was']:
                    for z in question_tagged:
                        if lemmatizer.lemmatize(y) == lemmatizer.lemmatize(z[0]):
                            if z[1] in tag_list:
                                scores[i] += 5  # noun/verb overlap counts more
                            else:
                                scores[i] += 1

        skip = False
        sent_index = np.argmax(scores)
        if sent_index != len(scores) - 1:
            for i in range(sent_index + 1,len(scores)):
                if scores[i] == scores[sent_index]:
                    equal_scores.append(question_id)
                    skip = True

        if not skip:
            print(sent_index)
            print(scores)
            print('question: ' + question)
            print(text_sentences[sent_index])

            is_corr = input('correct? ')
            correct += int(is_corr)

            print('*************')


            print(correct)
Example #8
def get_better_answer(q):
    # GRAMMAR and the helper functions are assumed to be defined elsewhere
    chunker = nltk.RegexpParser(GRAMMAR)
    answers = []
    driver = QABase()

    q = driver.get_question(q["qid"])  # was: undefined question_id
    story = driver.get_story(q["sid"])
    text = story["text"]
    question = q['text']
    sentences = get_sentences(text)

    # unparsed_sent contains the sentence containing the answer
    unparsed_sent = QAmatching_combined(question, text)
    index = find_index(unparsed_sent, text)

    state_question = baseline_stub.reformulate_question(q)
    parsed_dic = parsed_question_dic(q)
    subj = parsed_dic["nsubj"]
    verb = parsed_dic["verb"]

    lmtzr = WordNetLemmatizer()
    subj_stem = lmtzr.lemmatize(subj, "n")
    verb_stem = lmtzr.lemmatize(verb, "v")
    crow_sentences = find_sentences([subj_stem, verb_stem], sentences)
    # was computed before crow_sentences existed
    locations = find_answer(crow_sentences, chunker)

    if 'story' in state_question and 'about' in state_question:  # was: 'story' and 'about' in ...
        special_cases(q)
    if 'somewhere' in state_question:
        where = find_where_answer(q["dep"], story["sch_dep"][index])
        if where is None:
            # fall back to the location chunks when the dependency lookup finds
            # nothing (the original checked len(answers) == 0 right after appending,
            # which could never be true)
            if verb_stem or subj_stem:
                answers.extend(locations)  # was: find_locations(tree) with tree undefined
        else:
            answers.append(where)
            answers.append(unparsed_sent)
    if 'sometime' in state_question:
        answers.append(unparsed_sent)
    if 'someone' in state_question:
        if state_question.startswith('someone'):  # was: startwith (typo)
            answers.append(find_subj_answer(q["dep"], story["sch_dep"][index]))
        # chunk_demo for an alternate answer
        else:
            answers.append(unparsed_sent)
    if 'something' in state_question:
        if state_question.startswith("something"):
            answers.append(find_dobj(q, unparsed_sent, text, story))
        elif "do" in question and ("did" in question or "does" in question):
            answers.append(find_verb(sentences))  # was: answer.append(answer)
        else:
            # was: undefined sentence_with_answer
            answers.append(find_iobj(q, unparsed_sent, text, story))
    if 'somewhy' in state_question:
        answers.append(subj)
        answers.append(verb)
    return answers  # was: return answer, which ignored everything collected here
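
QAmatching_combined and find_index are not shown. A plausible sketch of find_index, assuming it returns the position of the answer-bearing sentence within the story text (note that the code above then uses this index into story["sch_dep"], which only lines up when the sch and story versions have the same sentence count):

def find_index(sentence, text):
    # position of the given sentence within the story's sentence list
    for i, sent in enumerate(nltk.sent_tokenize(text)):
        if sent == sentence:
            return i
    return 0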
Example #9
import string

import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer

from qa_engine.base import QABase

# GRAMMAR, DATA_DIR, NEGATIONS, and the helpers (get_sentences, get_bow,
# get_ordered_bow, best_overlap_index, pattern_matcher, doBaseline,
# load_wordnet_ids) are assumed to be defined elsewhere in the module.


def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str


    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of the question.
        sid --  The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions
                of the .
        qid  --  The id of the question.


    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of
                    the story version.
        sch_dep -- list of dependency graphs for each sentence of
                    the sch version.
        sch_par -- list of constituency parses for each sentence of
                    the sch version.
        story_par -- list of constituency parses for each sentence of
                    the story version.
        sch --  the raw text for the sch version.
        text -- the raw text for the story version.
        sid --  the story id


    """
    ###     Your Code Goes Here         ###
    # Our tools

    # stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)

    driver = QABase()

    # question["qid"] returns the form: "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])

    #############################################
    # if question["qid"] == 'blogs-03-1':
    #     print(question["text"])
    #     print(sent_tokenized_text[0])
    #     print("++++++++++++++++++++++++++++++++++++++++++++++")
    ############################################

    stopwords = set(
        nltk.corpus.stopwords.words("english") + list(string.punctuation))

    if question["difficulty"] == 'Medium' or question["difficulty"] == 'Easy':

        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)

        # prepare qbow for word-overlapping
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        # print(stemmed_qbow)

        # make ordered qbow for bigram and trigram matching
        stemmed_ordered_qbow = get_ordered_bow(
            get_sentences(question["text"])[0], stopwords)

        # prepare pattern_qbow for pattern overlapping
        # pattern_qbow = get_pattern_bow(get_sentences(question["text"])[0], stopwords)

        # if question["qid"] == 'mc500.train.18.18':
        #     print("stemmed_qbow:", stemmed_qbow)
        #     print("pattern_qbow:", pattern_qbow)

        best_idx = best_overlap_index(stemmed_ordered_qbow, stemmed_qbow,
                                      all_stemmed_sentences, stopwords,
                                      question)
        # print(question["qid"], best_idx)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #############################################
        # if question["qid"] == 'blogs-03-13':
        #     print(Q)
        #     print(tree)
        #     print("++++++++++++++++++++++++++++++++++++++++++++++")
        ############################################
        # print(tree)
        # Create our pattern

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        # print(Q[0])
        if ('where' in Q) or ('when' in Q):
            pattern = nltk.ParentedTree.fromstring("(PP)")
        elif 'who' in Q or ('which' in Q):
            pattern = nltk.ParentedTree.fromstring("(NP (DT) (*) (NN))")
        elif ('what' in Q):
            pattern = nltk.ParentedTree.fromstring("(VP (*) (NP))")
        elif 'why' in Q:
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif 'how' in Q:
            pattern = nltk.ParentedTree.fromstring("(RB)")
            # don't know how to deal with 'did' questions
        elif 'did' in Q:
            pattern = nltk.ParentedTree.fromstring("(ROOT)")
        else:
            return doBaseline(question, story)

        subtree1 = pattern_matcher(pattern, tree)

        ############################################
        # if question["qid"] == 'mc500.train.25.3':
        #     print(Q)
        #     print(tree)
        #     print("subtree1")
        #     print(subtree1)
        ############################################
        if subtree1 is None:
            #######################################
            answer = doBaseline(question, story)
            # answer = "doBaseline"
            #######################################
        else:
            if ('where' in Q) or ('when' in Q):
                pattern = nltk.ParentedTree.fromstring("(PP)")
            elif 'who' in Q or ('which' in Q):
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'what' in Q:
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'why' in Q:
                pattern = nltk.ParentedTree.fromstring("(SBAR)")
            elif 'how' in Q:
                pattern = nltk.ParentedTree.fromstring("(RB)")

                # don't know how to deal with 'did' questions
            elif 'did' in Q:
                pattern = nltk.ParentedTree.fromstring("(ROOT)")

            # Find and make the answer
            # print(subtree)
            subtree2 = pattern_matcher(pattern, subtree1)
            if subtree2 is None:
                #######################################
                answer = doBaseline(question, story)
                # answer = "doBaseline"
                #######################################
            else:
                answer = " ".join(subtree2.leaves())

            ############################################
            # if question["qid"] == 'mc500.train.18.18':
            #     print("subtree2")
            #     print(subtree2)
            ############################################
            # cheat for dealing with 'did' questions
            if Q[0] == 'did':
                negations = len(set(nltk.word_tokenize(answer)) & NEGATIONS)
                if negations > 0:
                    answer = "no"
                else:
                    answer = "yes"

    elif question["difficulty"] == 'Hard' or question[
            "difficulty"] == 'Discourse':

        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)

        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        ordered_qbow = get_ordered_bow(
            get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)

        stemmed_ordered_qbow = get_ordered_bow(
            get_sentences(question["text"])[0], stopwords)

        joined_grams = []
        # create bigrams and trigrams, then find collocations
        if len(stemmed_qbow) >= 2:
            bigrams = list(nltk.bigrams(stemmed_ordered_qbow))
            joined_grams += ['_'.join(b) for b in bigrams]
        if len(stemmed_qbow) > 2:
            trigrams = list(nltk.trigrams(stemmed_ordered_qbow))
            joined_grams += ['_'.join(t) for t in trigrams]

        stemmed_qbow = stemmed_qbow.union(set(joined_grams))

        #######################################
        # Collect hypernyms, hyponyms, lemmas #
        #######################################
        noun_ids = load_wordnet_ids("{}/{}".format(DATA_DIR,
                                                   "Wordnet_nouns.csv"))
        verb_ids = load_wordnet_ids("{}/{}".format(DATA_DIR,
                                                   "Wordnet_verbs.csv"))

        # {synset_id : {synset_offset: X, noun/verb: Y, stories: set(Z)}}, ...}
        # e.g. {help.v.01: {synset_offset: 2547586, noun: aid, stories: set(Z)}}, ...
        # noun_ids = pickle.load(open("Wordnet_nouns.dict", "rb"))
        # verb_ids = pickle.load(open("Wordnet_verbs.dict", "rb"))

        ####################################################################################
        # My own code documentation:
        # items is a dictionary, per synset_id, we have:
        #                   {'synset_offset': '7-digit number',
        #                   'story_noun': 'each noun word correlated with synset_id',
        #                   'stories': "'story-id.vgl'"}
        ####################################################################################

        # iterate through dictionary
        for synset_id, items in noun_ids.items():
            noun = items['story_noun']
            stories = items['stories']
            # print(noun, stories)
            # get lemmas, hyponyms, hypernyms

        for synset_id, items in verb_ids.items():
            verb = items['story_verb']
            stories = items['stories']
            # print(verb, stories)
            # get lemmas, hyponyms, hypernyms

        hypo_dict = {}
        hyper_dict = {}
        lemma_dict = {}

        for word in stemmed_qbow:
            word_synsets = wn.synsets(word)

            # hyponyms
            temp_this_word_hyponyms = []
            for word_synset in word_synsets:
                word_hypo = word_synset.hyponyms()
                temp_curr_hyponyms = []
                for hypo in word_hypo:
                    temp_curr_hyponyms.append(
                        hypo.name()[0:hypo.name().index(".")])
                temp_this_word_hyponyms += temp_curr_hyponyms
            hypo_dict[word] = temp_this_word_hyponyms

            # hypernyms
            temp_this_word_hypernyms = []
            for word_synset in word_synsets:
                word_hyper = word_synset.hypernyms()
                temp_curr_hypernyms = []
                for hyper in word_hyper:
                    temp_curr_hypernyms.append(
                        hyper.name()[0:hyper.name().index(".")])
                temp_this_word_hypernyms += temp_curr_hypernyms
            hyper_dict[word] = temp_this_word_hypernyms

            # lemmas
            temp_this_word_lemmas = [word]
            for word_synset in word_synsets:
                temp_this_word_lemmas.append(
                    word_synset.name()[0:word_synset.name().index(".")])
            lemma_dict[word] = temp_this_word_lemmas

        # combine hyponyms, hypernyms, lemmas with stemmed_qbow

        # hyponyms
        syn_list = set([])
        for stemmed_qbow_word in stemmed_qbow:
            for hypo in hypo_dict[stemmed_qbow_word]:
                syn_list.add(hypo)

        # hypernyms
        for stemmed_qbow_word in stemmed_qbow:
            for hyper in hyper_dict[stemmed_qbow_word]:
                syn_list.add(hyper)
                # if question["qid"] == 'fables-06-14':
                #     print(stemmed_qbow_word)
                #     print(hyper)

        # lemmas
        for stemmed_qbow_word in stemmed_qbow:
            for lemma in lemma_dict[stemmed_qbow_word]:
                syn_list.add(lemma)

        stemmed_qbow = stemmed_qbow.union(syn_list)

        best_idx = best_overlap_index(stemmed_ordered_qbow, stemmed_qbow,
                                      all_stemmed_sentences, stopwords,
                                      question)
        # print(question["qid"], best_idx)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #############################################
        # if question["qid"] == 'blogs-03-13':
        #     print(Q)
        #     print(tree)
        #     print("++++++++++++++++++++++++++++++++++++++++++++++")
        ############################################
        # print(tree)
        # Create our pattern

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        # print(Q[0])
        if ('where' in Q) or ('when' in Q):
            pattern = nltk.ParentedTree.fromstring("(PP (*) (NP))")
        elif 'who' in Q or ('which' in Q):
            pattern = nltk.ParentedTree.fromstring("(NP (DT) (*) (NN))")
        elif 'what' in Q:
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif 'why' in Q:
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif 'how' in Q:
            pattern = nltk.ParentedTree.fromstring("(RB)")
            # don't know how to deal with 'did' questions
        elif 'did' in Q:
            pattern = nltk.ParentedTree.fromstring("(ROOT)")
        else:
            return doBaseline(question, story)

        subtree1 = pattern_matcher(pattern, tree)

        #################################################
        # who_qs = ["fables-03-22", "fables-03-23", "fables-03-25", "fables-03-26", "mc500.train.25.3"]
        where_qs = [
            "blogs-03-15", "blogs-03-19", "blogs-05-18", "fables-03-27",
            "mc500.train.0.23", "mc500.train.0.24", "mc500.train.18.23",
            "mc500.train.18.25", "mc500.train.111.5"
        ]
        if question["qid"] in where_qs:
            print(Q)
            print(tree)
            print("subtree1")
            print(subtree1)
        ######################################################################
        if subtree1 is None:
            #######################################
            answer = doBaseline(question, story)
            # answer = "doBaseline"
            #######################################
        else:
            # create a new pattern to match a smaller subset of subtrees
            if ('where' in Q) or ('when' in Q):
                pattern = nltk.ParentedTree.fromstring("(PP)")
            elif 'who' in Q or ('which' in Q):
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'what' in Q:
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'why' in Q:
                pattern = nltk.ParentedTree.fromstring("(SBAR)")
            elif 'how' in Q:
                pattern = nltk.ParentedTree.fromstring("(RB)")

                # don't know how to deal with 'did' questions
            elif 'did' in Q:
                pattern = nltk.ParentedTree.fromstring("(ROOT)")

            # Find and make the answer
            # print(subtree)
            subtree2 = pattern_matcher(pattern, subtree1)

            ####################################
            if question["qid"] in where_qs:
                print(pattern)
                print("subtree2")
                print(subtree2)
            ###################################

            if subtree2 is None:
                #######################################
                answer = doBaseline(question, story)
                # answer = "doBaseline"
                #######################################
            else:
                answer = " ".join(subtree2.leaves())

            ############################################
            # if question["qid"] == 'mc500.train.18.18':
            #     print("subtree2")
            #     print(subtree2)
            ############################################
            # cheat for dealing with 'did' questions
            if Q[0] == 'did':
                negations = len(set(nltk.word_tokenize(answer)) & NEGATIONS)
                if negations > 0:
                    answer = "no"
                else:
                    answer = "yes"

    else:
        #########################################
        answer = doBaseline(question, story)
        # answer = "doBaseline"
        #########################################

    ###     End of Your Code         ###

    return answer
Example #10
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from qa_engine.base import QABase

# GRAMMAR and the helpers (get_sentences, get_bow, best_overlap_index,
# find_sentences, find_candidates, pattern_matcher, doBaseline) are assumed
# to be defined elsewhere in the module.


def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str


    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of the question.
        sid --  The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions
                of the .
        qid  --  The id of the question.


    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of
                    the story version.
        sch_dep -- list of dependency graphs for each sentence of
                    the sch version.
        sch_par -- list of constituency parses for each sentence of
                    the sch version.
        story_par -- list of constituency parses for each sentence of
                    the story version.
        sch --  the raw text for the sch version.
        text -- the raw text for the story version.
        sid --  the story id


    """
    ###     Your Code Goes Here         ###
    # Our tools

    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()

    driver = QABase()

    # question["qid"] returns the form: "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])
    text = story["text"]

    # Apply the standard NLP pipeline we've seen before
    sentences = get_sentences(text)

    # tokenize questions, also removing punctuations to extract keywords
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_question_text = tokenizer.tokenize(question["text"])
    tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)

    # remove stopwords
    tagged_keywords_list = []
    stopwords = set(nltk.corpus.stopwords.words("english"))
    for word, tag in tagged_tokenized_question_text:
        if word not in stopwords:
            tagged_keywords_list.append((word, tag))

    # lemmatize keywords
    lemmatized_keywords_list = []
    for keyword, tag in tagged_keywords_list:
        lemmatized_keywords_list.append(stemmer.stem(keyword))

    # Find the sentences that have all of our keywords in them
    target_sentences = find_sentences(lemmatized_keywords_list, sentences)
    # Extract the candidate locations from these sentences
    candidates_forest = find_candidates(target_sentences, chunker,
                                        question["text"])

    if (question["difficulty"] == 'Easy' and len(candidates_forest) != 0):

        possible_answers_list = []

        # locations is a list of trees
        for candidate in candidates_forest:
            # candidate.draw()
            possible_answers_list.append(" ".join(
                [token[0] for token in candidate.leaves()]))
        answer = " ".join(possible_answers_list)

        ###########################################
        # currently, possible_answer contains the actual needed answer,
        # plus some garbage words around it from chunking,
        # we might be able to filter this out SOMEHOW
        # possible_answer is a list of strings
        ###########################################

    elif question["difficulty"] == 'Medium':

        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)
        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences,
                                      stopwords)
        # print(question["qid"], best_idx)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #############################################
        # if question["qid"] == 'blogs-03-13':
        #     print(Q)
        #     print(tree)
        #     print("++++++++++++++++++++++++++++++++++++++++++++++")
        ############################################
        # print(tree)
        # Create our pattern

        # First level subtree matching
        # candidate_sents = []
        #
        # for sub in tree:
        #     subsent = " ".join(sub.leaves())
        #     candidate_sents.append(subsent)
        #
        # stemmed_candidate_sents = []
        # for s in candidate_sents:
        #     temp_candidate_sents = []
        #     s = nltk.word_tokenize(s)
        #     s = nltk.pos_tag(s)
        #
        #     for w, p in s:
        #         temp_candidate_sents.append((stemmer.stem(w), p))
        #     stemmed_candidate_sents.append(temp_candidate_sents)
        #
        # best_idx = best_overlap_index(stemmed_qbow, stemmed_candidate_sents, stopwords)
        # tree = tree[best_idx]
        # if question["qid"] == 'mc500.train.18.18':
        #     print(tree)

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        # print(Q[0])
        if Q[0] == 'where' or Q[0] == 'when':
            pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")
        elif Q[0] == 'who':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'what':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'why':
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif Q[0] == 'how':
            pattern = nltk.ParentedTree.fromstring("(RB)")

        # don't know how to deal with 'did' questions
        elif Q[0] == 'did':
            pattern = nltk.ParentedTree.fromstring("(S)")
        else:
            # unknown question word: fall back so `pattern` is never unassigned
            return doBaseline(question, story)

        subtree1 = pattern_matcher(pattern, tree)

        ############################################
        # if question["qid"] == 'blogs-03-13':
        #     print("subtree1")
        #     print(subtree1)
        ############################################
        if subtree1 is None:
            #######################################
            answer = doBaseline(question, story)
            # answer = "doBaseline"
            #######################################
        else:
            # create a new pattern to match a smaller subset of subtrees
            if Q[0] == 'where' or Q[0] == 'when':
                pattern = nltk.ParentedTree.fromstring("(VP)")
            elif Q[0] == 'who':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'what':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'why':
                pattern = nltk.ParentedTree.fromstring("(SBAR (IN) (S))")
            elif Q[0] == 'how':
                pattern = nltk.ParentedTree.fromstring("(RB)")

            # don't know how to deal with 'did' questions
            elif Q[0] == 'did':
                pattern = nltk.ParentedTree.fromstring("(S)")

            # Find and make the answer
            # print(subtree)
            subtree2 = pattern_matcher(pattern, subtree1)
            if subtree2 is None:
                # guard: the refinement pattern may not match
                answer = doBaseline(question, story)
            else:
                answer = " ".join(subtree2.leaves())

            ############################################
            # if question["qid"] == 'mc500.train.18.18':
            #     print("subtree2")
            #     print(subtree2)
            ############################################
            # cheat for dealing with 'did' questions
            if Q[0] == 'did':
                answer = "yes"

    else:
        #########################################
        answer = doBaseline(question, story)
        # answer = "doBaseline"
        #########################################

    ###     End of Your Code         ###
    return answer
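
best_overlap_index is never shown. Below is a sketch of the three-argument form used in the Medium branch above, reusing the get_bow sketch from Example #6 (Example #9 calls an extended variant that also receives the ordered bag of words and the question dict); it is an assumption, not the original:

def best_overlap_index(qbow, sentences, stopwords):
    # index of the sentence with the highest bag-of-words overlap with the question
    best_idx, best_overlap = 0, -1
    for idx, sent in enumerate(sentences):
        overlap = len(qbow & get_bow(sent, stopwords))
        if overlap > best_overlap:
            best_idx, best_overlap = idx, overlap
    return best_idx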
#!/usr/bin/env python
'''
Created on May 14, 2014
@author: reid

Modified on May 21, 2015
'''

import sys, nltk, operator
from qa_engine.base import QABase
porterrrr = nltk.PorterStemmer()
#porterrrr = nltk.stem.snowball.SnowballStemmer("english")
driver = QABase()
preStemmed = {}

debug = False


# The standard NLTK pipeline for POS tagging a document
def get_sentences(text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]

    # NOTE: stemmedSents was presumably meant to hold stemmed copies (see
    # porterrrr above), but it never stems and is never used afterwards
    stemmedSents = []
    for sent in sentences:
        stemmedSents.append([word for word in sent])
    sentences = [nltk.pos_tag(sent) for sent in sentences]

    return sentences