def get_text_question(question):
    """Extract the content words (verbs, nouns, adjectives) from a question's
    text, returned as a list of lowercased (word, tag) pairs."""
    driver = QABase()
    qid = question["qid"]
    q = driver.get_question(qid)
    question_text = q["text"]
    raw_tags = get_sentences(question_text)[0]

    keywords = []
    # Skip the final token (usually the sentence-final punctuation).
    for word, tag in raw_tags[:-1]:
        # Previously question words were collected here as well:
        # if tag in ('WRB', 'WP'):
        #     keywords.append((str(word).lower(), tag))

        # Keep any verb (VB*), noun (NN*), or adjective (JJ*) token.
        if re.match(r'(VB|NN|JJ)\w*$', tag):
            keywords.append((str(word).lower(), tag))
    return keywords
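# Hedged sketch (illustrative, not part of the original pipeline): the same
# POS-filtering idea as get_text_question, but self-contained so it runs
# without QABase or get_sentences. The sample question below is made up.
import re

import nltk


def demo_keyword_extraction(question_text):
    """Return lowercase (word, tag) pairs for verbs, nouns, and adjectives."""
    tagged = nltk.pos_tag(nltk.word_tokenize(question_text))
    return [(w.lower(), t) for w, t in tagged
            if re.match(r'(VB|NN|JJ)\w*$', t)]

# Example (exact tags depend on the tagger model):
# demo_keyword_extraction("Where did the crow sit?")
# -> [('did', 'VBD'), ('crow', 'NN'), ('sit', 'VB')]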
def get_better_answer(q):
    """Dispatch to a type-specific extractor based on the question's
    reformulated statement form ('somewhere', 'sometime', 'someone', ...)."""
    driver = QABase()
    story = driver.get_story(q["sid"])
    text = story["text"]
    sentences = get_sentences(text)

    state_question = baseline_stub.reformulate_question(q)
    parsed_dic = parsed_question_dic(q)
    subj = parsed_dic["nsubj"]
    verb = parsed_dic["verb"]

    chunker = nltk.RegexpParser(GRAMMAR)
    crow_sentences = find_sentences([subj, verb], sentences)
    locations = find_candidates(crow_sentences, chunker)

    answer = None
    if 'somewhere' in state_question:
        # TODO: find_locations expects a chunked tree; wire in the candidate
        # chunks from find_candidates here.
        answer = find_locations(locations)
    elif 'sometime' in state_question:
        pass  # 'when' questions: not implemented yet
    elif 'someone' in state_question:
        if state_question.startswith('someone'):
            answer = find_subj(sentences)
    elif 'somewhat' in state_question:
        pass  # direct_object / indirect_object / verb subcases: not implemented yet
    elif 'somewhy' in state_question:
        pass  # 'why' questions: not implemented yet
    return answer
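# Hedged sketch: find_sentences and find_candidates are defined elsewhere in
# this repo; these minimal versions show one plausible shape of each.
# find_sentences keeps the tagged sentences containing every keyword, and
# find_candidates chunks those sentences and yields matching chunk subtrees.
def find_sentences_sketch(keywords, sentences):
    """Return the tagged sentences that contain all of the keywords."""
    selected = []
    for sent in sentences:
        words = {w.lower() for w, tag in sent}
        if all(kw.lower() in words for kw in keywords):
            selected.append(sent)
    return selected


def find_candidates_sketch(sentences, chunker, label="PP"):
    """Chunk each tagged sentence and collect subtrees with the given label."""
    candidates = []
    for sent in sentences:
        tree = chunker.parse(sent)
        candidates.extend(tree.subtrees(filter=lambda t: t.label() == label))
    return candidates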
def main():
    driver = QABase()
    q = driver.get_question("fables-01_Q1")
    story = driver.get_story(q["storyid"])
    print("sentence selected: {}".format(story))

    tree = story[0]['const_parse']
    print("const tree: {}".format(tree))

    # Create our pattern
    pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")

    # Match our pattern to the tree
    subtree = pattern_matcher(pattern, tree)
    print(" ".join(subtree.leaves()))

    # Create a new pattern to match a smaller subset of the subtree
    pattern2 = nltk.ParentedTree.fromstring("(PP)")

    # Find and print the answer
    subtree2 = pattern_matcher(pattern2, subtree)
    print(" ".join(subtree2.leaves()))
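# Hedged sketch: pattern_matcher is used above but not defined in this
# section. This is the usual recursive formulation for this style of
# assignment (a "*" label matches any node; other labels must match, and
# children are compared pairwise), offered as an assumption rather than the
# canonical implementation.
import nltk


def matches_sketch(pattern, root):
    """Return root if the pattern tree matches at root, else None."""
    if pattern is None or root is None:
        return None
    plabel = pattern if isinstance(pattern, str) else pattern.label()
    rlabel = root if isinstance(root, str) else root.label()
    if plabel == "*":
        return root
    if plabel == rlabel:
        if isinstance(pattern, nltk.Tree) and isinstance(root, nltk.Tree):
            for pchild, rchild in zip(pattern, root):
                if matches_sketch(pchild, rchild) is None:
                    return None
        return root
    return None


def pattern_matcher_sketch(pattern, tree):
    """Return the first subtree of tree that matches pattern, else None."""
    for subtree in tree.subtrees():
        if matches_sketch(pattern, subtree) is not None:
            return subtree
    return None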
if __name__ == '__main__':
    # Our tools
    chunker = nltk.RegexpParser(GRAMMAR)
    # lmtzr = WordNetLemmatizer()

    # Alternate ids used while testing:
    # "fables-02-1", "mc500.train.0.12", "fables-02-3",
    # "blogs-01-5", "fables-01-3"
    question_id = "blogs-01-3"

    driver = QABase()
    q = driver.get_question(question_id)
    story = driver.get_story(q["sid"])
    text = story["text"]

    # Apply the standard NLP pipeline we've seen before, dropping quoted
    # speech so dialogue doesn't pollute the candidates.
    # sentences = get_sentences(text)
    sentences = get_sentences_without_quotes(text)

    # answer = find_subj(sentences)
    where = find_where(sentences)
    for whe in where:
        print(" ".join([token[0] for token in whe.leaves()]))
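# Hedged sketch: get_sentences_without_quotes is used above but not defined
# in this section. A plausible minimal version, assuming it mirrors
# get_sentences (sentence split + tokenize + POS tag) after stripping
# double-quoted speech; the real helper may handle quoting differently.
import re

import nltk


def get_sentences_without_quotes_sketch(text):
    text = re.sub(r'"[^"]*"', ' ', text)  # drop quoted speech
    return [nltk.pos_tag(nltk.word_tokenize(s))
            for s in nltk.sent_tokenize(text)]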
if node['rel'] == "nmod": deps = get_dependents(node, sgraph) deps = sorted(deps + [node], key=operator.itemgetter("address")) return " ".join(dep["word"] for dep in deps) #raise NotImplemented return None if __name__ == '__main__': driver = QABase() # Get the first question and its story q = driver.get_question("fables-01_Q1") print("question:", q["question"]) qgraph = q['dep_parse'] #print(qgraph) #raise NotImplemented story = driver.get_story(q["storyid"]) print("sentence selected: ", story[0]['sentence']) sgraph = story[0]['dep_parse'] nodes = list(sgraph.nodes.values()) #print(nodes) #raise NotImplemented
def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str

    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of the question.
        sid -- The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions of the story.
        qid -- The id of the question.

    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of the story version.
        sch_dep -- list of dependency graphs for each sentence of the sch version.
        sch_par -- list of constituency parses for each sentence of the sch version.
        story_par -- list of constituency parses for each sentence of the story version.
        sch -- the raw text for the sch version.
        text -- the raw text for the story version.
        sid -- the story id
    """
    ### Your Code Goes Here ###

    # Our tools
    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()

    driver = QABase()
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])
    text = story["text"]

    # Apply the standard NLP pipeline we've seen before
    sentences = get_sentences(text)

    # Tokenize the question, dropping punctuation, to extract keywords
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_question_text = tokenizer.tokenize(question["text"])
    tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)

    # Remove stopwords
    tagged_keywords_list = []
    stopwords = set(nltk.corpus.stopwords.words("english"))
    for word, tag in tagged_tokenized_question_text:
        if word not in stopwords:
            tagged_keywords_list.append((word, tag))

    # Stem the keywords.
    # TODO: keywords should be kept in a meaningful order; this is effectively
    # unordered. Also, the tagger usually treats isolated words as nouns even
    # when they should be verbs.
    lemmatized_keywords_list = []
    for keyword, tag in tagged_keywords_list:
        lemmatized_keywords_list.append(stemmer.stem(keyword))

    # Find the sentences containing our keywords, then extract the candidate
    # locations from them.
    crow_sentences = find_sentences(lemmatized_keywords_list, sentences)
    locations = find_candidates(crow_sentences, chunker, question["text"])

    # Debugging:
    # print("sentences:", len(sentences))
    # print("original keywords:", tagged_keywords_list)
    # print("keywords:", lemmatized_keywords_list)
    # print("crow_sentences:", len(crow_sentences))
    # print(question["text"], locations)

    answer = ""
    if question["difficulty"] == 'Easy' and len(locations) != 0:
        answer_parts = []
        for loc in locations:
            answer_parts.append(" ".join([token[0] for token in loc.leaves()]))
        answer = " ".join(answer_parts)
    elif question["difficulty"] == 'Easy':
        # Fall back to the sentence-overlap baseline. get_bow filters
        # stopwords from the tagged question (only its first sentence), so
        # qbow is the question's bag of content words without stopwords.
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        answer_tuples = baseline(qbow, get_sentences(text), stopwords)
        answer = " ".join(t[0] for t in answer_tuples)
    elif question["difficulty"] == 'Medium':
        pass  # Not implemented yet.

    ### End of Your Code ###
    return answer


#############################################################
###     Don't change the code below here
#############################################################

class QAEngine(QABase):
    @staticmethod
    def answer_question(question, story):
        answer = get_answer(question, story)
        return answer


def run_qa():
    QA = QAEngine()
    QA.run()  # reads questions and iterates over them
    QA.save_answers()


def main():
    run_qa()
    # You can uncomment this next line to evaluate your
    # answers, or you can run score_answers.py
    score_answers()


if __name__ == "__main__":
    main()
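# Hedged sketch: get_bow and baseline are used above but defined elsewhere in
# this repo (baseline_stub). These minimal versions show the standard
# word-overlap baseline: score each story sentence by how many of the
# question's content words it shares, and return the best-scoring tagged
# sentence.
def get_bow_sketch(tagged_tokens, stopwords):
    """Bag of lowercased non-stopword words from a tagged sentence."""
    return {w.lower() for w, tag in tagged_tokens if w.lower() not in stopwords}


def baseline_sketch(qbow, sentences, stopwords):
    """Return the tagged sentence with the largest word overlap with qbow."""
    scored = []
    for sent in sentences:
        sbow = get_bow_sketch(sent, stopwords)
        scored.append((len(qbow & sbow), sent))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[0][1]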
def get_sentence():
    # Eventual change: input question_id; if type == 'sch', use the
    # Scheherazade interpretation.
    tag_list = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ']

    # Read the question ids from the answer key (third CSV column),
    # skipping the header row.
    qfile = open('data/hw6-answers.csv')
    readCSV = csv.reader(qfile, delimiter=',')
    qids = [row[2] for row in readCSV]
    del qids[0]

    driver = QABase()
    lemmatizer = WordNetLemmatizer()
    correct = 0
    equal_scores = []

    for question_id in qids:
        q = driver.get_question(question_id)
        story = driver.get_story(q['sid'])
        text = story['text']
        question = q['text']

        text_sentences = nltk.sent_tokenize(text)
        words = [nltk.word_tokenize(x) for x in text_sentences]
        question_words = nltk.word_tokenize(question)
        question_tagged = nltk.pos_tag(question_words)

        # Score each story sentence: +5 per lemma match on a content tag
        # (noun/verb), +1 per other lemma match; 'was' is ignored as noise.
        scores = [0 for _ in range(len(text_sentences))]
        for i, x in enumerate(words):
            for y in x:
                if y in ['was']:
                    continue
                for z in question_tagged:
                    if lemmatizer.lemmatize(y) == lemmatizer.lemmatize(z[0]):
                        if z[1] in tag_list:
                            scores[i] += 5
                        else:
                            scores[i] += 1

        # Skip questions where the best score is tied, recording them instead.
        skip = False
        sent_index = np.argmax(scores)
        if sent_index != len(scores) - 1:
            for i in range(sent_index + 1, len(scores)):
                if scores[i] == scores[sent_index]:
                    equal_scores.append(question_id)
                    skip = True
        if not skip:
            print(sent_index)
            print(scores)
            print('question: ' + question)
            print(text_sentences[sent_index])
            is_corr = input('correct? ')
            correct += int(is_corr)
            print('*************')

    print(correct)
def get_better_answer(q):
    """Pick a type-specific answer extractor after locating the sentence
    that most likely contains the answer."""
    driver = QABase()
    story = driver.get_story(q["sid"])
    text = story["text"]
    question = q['text']

    # unparsed_sent contains the sentence containing the answer
    unparsed_sent = QAmatching_combined(question, text)
    index = find_index(unparsed_sent, text)

    sentences = get_sentences(text)
    state_question = baseline_stub.reformulate_question(q)
    parsed_dic = parsed_question_dic(q)
    subj = parsed_dic["nsubj"]
    verb = parsed_dic["verb"]

    lmtzr = WordNetLemmatizer()
    subj_stem = lmtzr.lemmatize(subj, "n")
    verb_stem = lmtzr.lemmatize(verb, "v")
    crow_sentences = find_sentences([subj_stem, verb_stem], sentences)

    chunker = nltk.RegexpParser(GRAMMAR)
    locations = find_answer(crow_sentences, chunker)

    answers = []
    if 'story' in state_question and 'about' in state_question:
        special_cases(q)

    if 'somewhere' in state_question:
        where = find_where_answer(q["dep"], story["sch_dep"][index])
        if where:
            answers.append(where)
        elif verb_stem or subj_stem:
            # If a verb (or failing that, a subject) exists, search for
            # locations anchored on it.
            answers.append(find_locations(locations))  # TODO: verify input
        else:
            answers.append(unparsed_sent)
    if 'sometime' in state_question:
        answers.append(unparsed_sent)
    if 'someone' in state_question:
        if state_question.startswith('someone'):
            answers.append(find_subj_answer(q["dep"], story["sch_dep"][index]))
            # chunk_demo for an alternate answer
        else:
            answers.append(unparsed_sent)
    if 'something' in state_question:
        if state_question.startswith("something"):
            answers.append(find_dobj(q, unparsed_sent, text, story))
        elif "do" in question and ("did" in question or "does" in question):
            answers.append(find_verb(sentences))
        else:
            answers.append(find_iobj(q, unparsed_sent, text, story))
    if 'somewhy' in state_question:
        answers.append(subj)
        answers.append(verb)
    return answers
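# Hedged sketch: find_where_answer is used above but not defined in this
# section. Judging from the "nmod" fragment earlier in this section, one
# plausible shape is to scan the story sentence's dependency graph for a node
# attached by the "nmod" relation and return it with its dependents. The
# qgraph parameter is accepted only to mirror the real call signature.
import operator


def find_where_answer_sketch(qgraph, sgraph):
    def dependents(node):
        found = []
        for rel in node["deps"]:
            for address in node["deps"][rel]:
                dep = sgraph.nodes[address]
                found.append(dep)
                found.extend(dependents(dep))
        return found

    for node in sgraph.nodes.values():
        if node.get("rel") == "nmod":
            span = sorted(dependents(node) + [node],
                          key=operator.itemgetter("address"))
            return " ".join(dep["word"] for dep in span)
    return None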
def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str

    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of the question.
        sid -- The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions of the story.
        qid -- The id of the question.

    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of the story version.
        sch_dep -- list of dependency graphs for each sentence of the sch version.
        sch_par -- list of constituency parses for each sentence of the sch version.
        story_par -- list of constituency parses for each sentence of the story version.
        sch -- the raw text for the sch version.
        text -- the raw text for the story version.
        sid -- the story id
    """
    ### Your Code Goes Here ###

    # Our tools
    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)

    driver = QABase()
    # question["qid"] has the form "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])

    stopwords = set(
        nltk.corpus.stopwords.words("english") + list(string.punctuation))

    if question["difficulty"] == 'Medium' or question["difficulty"] == 'Easy':
        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)

        # Prepare qbow for word overlap
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = set(stemmer.stem(w) for w in qbow)

        # Ordered qbow for bigram and trigram matching
        stemmed_ordered_qbow = get_ordered_bow(
            get_sentences(question["text"])[0], stopwords)

        best_idx = best_overlap_index(stemmed_ordered_qbow, stemmed_qbow,
                                      all_stemmed_sentences, stopwords,
                                      question)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        if ('where' in Q) or ('when' in Q):
            pattern = nltk.ParentedTree.fromstring("(PP)")
        elif ('who' in Q) or ('which' in Q):
            pattern = nltk.ParentedTree.fromstring("(NP (DT) (*) (NN))")
        elif 'what' in Q:
            pattern = nltk.ParentedTree.fromstring("(VP (*) (NP))")
        elif 'why' in Q:
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif 'how' in Q:
            pattern = nltk.ParentedTree.fromstring("(RB)")
        elif 'did' in Q:
            # We don't have a good pattern for 'did' questions yet.
            pattern = nltk.ParentedTree.fromstring("(ROOT)")
        else:
            return doBaseline(question, story)

        subtree1 = pattern_matcher(pattern, tree)

        if subtree1 is None:
            answer = doBaseline(question, story)
        else:
            # Create a new pattern to match a smaller subset of the subtree
            if ('where' in Q) or ('when' in Q):
                pattern = nltk.ParentedTree.fromstring("(PP)")
            elif ('who' in Q) or ('which' in Q):
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'what' in Q:
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'why' in Q:
                pattern = nltk.ParentedTree.fromstring("(SBAR)")
            elif 'how' in Q:
                pattern = nltk.ParentedTree.fromstring("(RB)")
            elif 'did' in Q:
                pattern = nltk.ParentedTree.fromstring("(ROOT)")

            # Find and make the answer
            subtree2 = pattern_matcher(pattern, subtree1)
            if subtree2 is None:
                answer = doBaseline(question, story)
            else:
                answer = " ".join(subtree2.leaves())

        # Cheat for dealing with 'did' questions: answer "no" iff the matched
        # span contains a negation word.
        if Q[0] == 'did':
            negations = len(set(nltk.word_tokenize(answer)) & NEGATIONS)
            if negations > 0:
                answer = "no"
            else:
                answer = "yes"

    elif (question["difficulty"] == 'Hard'
          or question["difficulty"] == 'Discourse'):
        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)

        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = set(stemmer.stem(w) for w in qbow)
        stemmed_ordered_qbow = get_ordered_bow(
            get_sentences(question["text"])[0], stopwords)

        # Create bigrams and trigrams, then add them as joined collocations
        joined_grams = []
        if len(stemmed_qbow) >= 2:
            bigrams = list(nltk.bigrams(stemmed_ordered_qbow))
            joined_grams += ['_'.join(b) for b in bigrams]
        if len(stemmed_qbow) > 2:
            trigrams = list(nltk.trigrams(stemmed_ordered_qbow))
            joined_grams += ['_'.join(t) for t in trigrams]
        stemmed_qbow = stemmed_qbow.union(set(joined_grams))

        #######################################
        # Collect hypernyms, hyponyms, lemmas #
        #######################################
        noun_ids = load_wordnet_ids("{}/{}".format(DATA_DIR, "Wordnet_nouns.csv"))
        verb_ids = load_wordnet_ids("{}/{}".format(DATA_DIR, "Wordnet_verbs.csv"))
        # {synset_id: {synset_offset: X, noun/verb: Y, stories: set(Z)}, ...}
        # e.g. {help.v.01: {synset_offset: 2547586, noun: aid, stories: set(Z)}, ...}
        # Per synset_id, items holds:
        #   'synset_offset': a 7-digit number
        #   'story_noun'/'story_verb': the word correlated with the synset_id
        #   'stories': "story-id.vgl"

        # Iterate through the dictionaries. (The per-story lemmas, hyponyms,
        # and hypernyms are not used yet.)
        for synset_id, items in noun_ids.items():
            noun = items['story_noun']
            stories = items['stories']
        for synset_id, items in verb_ids.items():
            verb = items['story_verb']
            stories = items['stories']

        # For every word in the question bag, collect its hyponym, hypernym,
        # and lemma names from WordNet (synset names trimmed before the
        # first '.').
        hypo_dict = {}
        hyper_dict = {}
        lemma_dict = {}
        for word in stemmed_qbow:
            word_synsets = wn.synsets(word)

            # hyponyms
            this_word_hyponyms = []
            for word_synset in word_synsets:
                for hypo in word_synset.hyponyms():
                    this_word_hyponyms.append(
                        hypo.name()[0:hypo.name().index(".")])
            hypo_dict[word] = this_word_hyponyms

            # hypernyms
            this_word_hypernyms = []
            for word_synset in word_synsets:
                for hyper in word_synset.hypernyms():
                    this_word_hypernyms.append(
                        hyper.name()[0:hyper.name().index(".")])
            hyper_dict[word] = this_word_hypernyms

            # lemmas
            this_word_lemmas = [word]
            for word_synset in word_synsets:
                this_word_lemmas.append(
                    word_synset.name()[0:word_synset.name().index(".")])
            lemma_dict[word] = this_word_lemmas

        # Combine hyponyms, hypernyms, and lemmas with stemmed_qbow
        syn_list = set()
        for stemmed_qbow_word in stemmed_qbow:
            syn_list.update(hypo_dict[stemmed_qbow_word])
            syn_list.update(hyper_dict[stemmed_qbow_word])
            syn_list.update(lemma_dict[stemmed_qbow_word])
        stemmed_qbow = stemmed_qbow.union(syn_list)

        best_idx = best_overlap_index(stemmed_ordered_qbow, stemmed_qbow,
                                      all_stemmed_sentences, stopwords,
                                      question)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        if ('where' in Q) or ('when' in Q):
            pattern = nltk.ParentedTree.fromstring("(PP (*) (NP))")
        elif ('who' in Q) or ('which' in Q):
            pattern = nltk.ParentedTree.fromstring("(NP (DT) (*) (NN))")
        elif 'what' in Q:
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif 'why' in Q:
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif 'how' in Q:
            pattern = nltk.ParentedTree.fromstring("(RB)")
        elif 'did' in Q:
            # We don't have a good pattern for 'did' questions yet.
            pattern = nltk.ParentedTree.fromstring("(ROOT)")
        else:
            return doBaseline(question, story)

        subtree1 = pattern_matcher(pattern, tree)

        # Debugging for a handful of 'where' questions:
        where_qs = [
            "blogs-03-15", "blogs-03-19", "blogs-05-18", "fables-03-27",
            "mc500.train.0.23", "mc500.train.0.24", "mc500.train.18.23",
            "mc500.train.18.25", "mc500.train.111.5"
        ]
        if question["qid"] in where_qs:
            print(Q)
            print(tree)
            print("subtree1")
            print(subtree1)

        if subtree1 is None:
            answer = doBaseline(question, story)
        else:
            # Create a new pattern to match a smaller subset of the subtree
            if ('where' in Q) or ('when' in Q):
                pattern = nltk.ParentedTree.fromstring("(PP)")
            elif ('who' in Q) or ('which' in Q):
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'what' in Q:
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif 'why' in Q:
                pattern = nltk.ParentedTree.fromstring("(SBAR)")
            elif 'how' in Q:
                pattern = nltk.ParentedTree.fromstring("(RB)")
            elif 'did' in Q:
                pattern = nltk.ParentedTree.fromstring("(ROOT)")

            # Find and make the answer
            subtree2 = pattern_matcher(pattern, subtree1)
            if question["qid"] in where_qs:
                print(pattern)
                print("subtree2")
                print(subtree2)

            if subtree2 is None:
                answer = doBaseline(question, story)
            else:
                answer = " ".join(subtree2.leaves())

        # Cheat for dealing with 'did' questions
        if Q[0] == 'did':
            negations = len(set(nltk.word_tokenize(answer)) & NEGATIONS)
            if negations > 0:
                answer = "no"
            else:
                answer = "yes"

    else:
        answer = doBaseline(question, story)

    ### End of Your Code ###
    return answer
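# Hedged sketch: a self-contained illustration of the WordNet expansion used
# in the Hard/Discourse branch above. For one word it gathers lemma, hyponym,
# and hypernym names (trimmed before the first '.'), which the code above
# unions into the question bag of words. The example word is arbitrary.
from nltk.corpus import wordnet as wn


def expand_word_sketch(word):
    expanded = {word}
    for synset in wn.synsets(word):
        expanded.add(synset.name().split(".")[0])  # lemma-style synset name
        expanded.update(h.name().split(".")[0] for h in synset.hyponyms())
        expanded.update(h.name().split(".")[0] for h in synset.hypernyms())
    return expanded

# Example: expand_word_sketch("crow") should include a hypernym such as
# "corvine_bird"; exact contents depend on the installed WordNet version.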
def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str

    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of the question.
        sid -- The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions of the story.
        qid -- The id of the question.

    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of the story version.
        sch_dep -- list of dependency graphs for each sentence of the sch version.
        sch_par -- list of constituency parses for each sentence of the sch version.
        story_par -- list of constituency parses for each sentence of the story version.
        sch -- the raw text for the sch version.
        text -- the raw text for the story version.
        sid -- the story id
    """
    ### Your Code Goes Here ###

    # Our tools
    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()

    driver = QABase()
    # question["qid"] has the form "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])
    text = story["text"]

    # Apply the standard NLP pipeline we've seen before
    sentences = get_sentences(text)

    # Tokenize the question, dropping punctuation, to extract keywords
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_question_text = tokenizer.tokenize(question["text"])
    tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)

    # Remove stopwords
    tagged_keywords_list = []
    stopwords = set(nltk.corpus.stopwords.words("english"))
    for word, tag in tagged_tokenized_question_text:
        if word not in stopwords:
            tagged_keywords_list.append((word, tag))

    # Stem the keywords
    lemmatized_keywords_list = []
    for keyword, tag in tagged_keywords_list:
        lemmatized_keywords_list.append(stemmer.stem(keyword))

    # Find the sentences that have all of our keywords in them
    target_sentences = find_sentences(lemmatized_keywords_list, sentences)

    # Extract the candidate locations from these sentences
    candidates_forest = find_candidates(target_sentences, chunker,
                                        question["text"])

    if question["difficulty"] == 'Easy' and len(candidates_forest) != 0:
        # candidates_forest is a list of chunk trees. The joined answer still
        # carries some garbage words around the target span from chunking,
        # which we might be able to filter out somehow.
        possible_answers_list = []
        for candidate in candidates_forest:
            possible_answers_list.append(" ".join(
                [token[0] for token in candidate.leaves()]))
        answer = " ".join(possible_answers_list)
    elif question["difficulty"] == 'Medium':
        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)

        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = set(stemmer.stem(w) for w in qbow)

        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences,
                                      stopwords)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        # (An earlier experiment matched first-level subtrees against the
        # question bag of words before pattern matching; it is omitted here.)

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        if Q[0] == 'where' or Q[0] == 'when':
            pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")
        elif Q[0] == 'who':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'what':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'why':
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif Q[0] == 'how':
            pattern = nltk.ParentedTree.fromstring("(RB)")
        elif Q[0] == 'did':
            # We don't have a good pattern for 'did' questions yet.
            pattern = nltk.ParentedTree.fromstring("(S)")
        else:
            return doBaseline(question, story)

        subtree1 = pattern_matcher(pattern, tree)

        if subtree1 is None:
            answer = doBaseline(question, story)
        else:
            # Create a new pattern to match a smaller subset of the subtree
            if Q[0] == 'where' or Q[0] == 'when':
                pattern = nltk.ParentedTree.fromstring("(VP)")
            elif Q[0] == 'who':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'what':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'why':
                pattern = nltk.ParentedTree.fromstring("(SBAR (IN) (S))")
            elif Q[0] == 'how':
                pattern = nltk.ParentedTree.fromstring("(RB)")
            elif Q[0] == 'did':
                pattern = nltk.ParentedTree.fromstring("(S)")

            # Find and make the answer
            subtree2 = pattern_matcher(pattern, subtree1)
            if subtree2 is None:
                answer = doBaseline(question, story)
            else:
                answer = " ".join(subtree2.leaves())

        # Cheat for dealing with 'did' questions
        if Q[0] == 'did':
            answer = "yes"
    else:
        answer = doBaseline(question, story)

    ### End of Your Code ###
    return answer