def answer_where(s):
    """Extract a location answer from sentence *s*.

    Scans the prepositional phrases of the parse tree and returns the
    text of the first PP whose tokens carry a LOCATION or ORGANIZATION
    NER tag.  Returns "" when no such phrase is found.
    """
    tree = tree_parser.sent_to_tree(s)
    for phrase in tree_parser.get_phrases(tree, "PP", False, True):
        phrase_text = tree_parser.tree_to_sent(phrase)
        tagged = tagger.tag(nltk.tokenize.word_tokenize(phrase_text))
        # NOTE(review): tagger presumably is a Stanford NER tagger — verify.
        if any(tag == "LOCATION" or tag == "ORGANIZATION" for _, tag in tagged):
            return phrase_text.strip()
    return ""
def preprocess_sents(sents):
    """Normalise raw sentences into predicate sentences.

    A sentence containing an appositive is expanded into several simple
    sentences; any other sentence is reduced to its predicate form.
    Returns the resulting list.
    """
    results = []
    for sentence in sents:
        parsed = tree_parser.sent_to_tree(sentence)
        if tree_parser.contains_appos(parsed):
            results.extend(tree_parser.appps_to_sents(parsed))
        else:
            results.append(tree_parser.sent_to_predicate(parsed))
    return results
def answer_who(q, s):
    """Answer a who/what question *q* using candidate sentence *s*.

    The question's SQ phrase decides whether this is a "definition" or a
    "specific" question, and the matching specialised answerer is used.
    Returns "" for any other question type.
    """
    question_tree = tree_parser.sent_to_tree(q)
    sq_phrases = tree_parser.get_phrases(question_tree, "SQ", False, False)
    question_kind = get_what_type(sq_phrases)
    noun_phrases = tree_parser.get_phrases(question_tree, "NP", True, True)
    if question_kind == "definition":
        return answer_definitions(s, noun_phrases)
    if question_kind == "specific":
        return answer_non_definitions(s, noun_phrases)
    return ""
def answer_non_definitions(s, main_nps):
    """Answer a "specific" who/what question from sentence *s*.

    Splits *s* around its main verb and returns a noun phrase from the
    side of the split opposite the question's main NP.  Returns "" when
    no answer can be extracted.

    :param s: candidate answer sentence (plain string)
    :param main_nps: main noun-phrase subtrees taken from the question
    """
    if is_definition(s):
        return answer_definitions(s, main_nps)
    if len(main_nps) == 0:
        return ""
    main_np = tree_parser.tree_to_sent(main_nps[0])
    parsed_s = tree_parser.sent_to_tree(s)
    vps = tree_parser.get_phrases(parsed_s, "VP", True, True)
    # Pick the first verb phrase that is not a past participle (VBN).
    # BUG FIX: previously main_vp could be left unbound (NameError) when
    # every VP carried the VBN label; bail out explicitly instead.
    main_vp = None
    for vp in vps:
        if vp.label() != "VBN":
            main_vp = vp
            break
    if main_vp is None:
        return ""
    verb = get_main_verb(main_vp)
    candidates = s.split(" " + verb)
    if len(candidates) <= 1:
        # The verb text did not split the sentence — nothing to extract.
        return ""
    # if main_np in candidates[1]:
    if is_overlap(main_np, candidates[1]):
        ans_tree = tree_parser.sent_to_tree(candidates[1])
        s_nps = tree_parser.get_phrases(ans_tree, "NP", True, False)
        if len(s_nps) > 0:
            return tree_parser.tree_to_sent(s_nps[0])
        return candidates[0]
    ans_tree = tree_parser.sent_to_tree(candidates[0])
    s_nps = tree_parser.get_phrases(ans_tree, "NP", True, False)
    if len(s_nps) > 0:
        return tree_parser.tree_to_sent(s_nps[0])
    return candidates[1]
def answer_when(s):
    """Extract a date/time answer from sentence *s*.

    First preference: return a whole prepositional phrase whose tokens
    include a DATE or TIME NER tag.  Fallback: return the first run of
    consecutive DATE/TIME tokens in the sentence.  Returns "" when no
    temporal expression is found.
    """
    parsed_s = tree_parser.sent_to_tree(s)
    pps = tree_parser.get_phrases(parsed_s, "PP", False, True)
    for pp in pps:
        sent_pp = tree_parser.tree_to_sent(pp)
        tagged_pp = tagger.tag(nltk.tokenize.word_tokenize(sent_pp))
        for tup in tagged_pp:
            if tup[1] == "DATE" or tup[1] == "TIME":
                return sent_pp.strip()
    # Fallback: scan the raw sentence token by token.
    tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(s))
    for i in xrange(0, len(tagged_sent)):
        tup = tagged_sent[i]
        if tup[1] == "DATE" or tup[1] == "TIME":
            ans = ""
            j = i
            # BUG FIX: bound j so a DATE/TIME run that ends the sentence
            # no longer raises IndexError.
            while j < len(tagged_sent) and (tagged_sent[j][1] == "DATE" or tagged_sent[j][1] == "TIME"):
                ans += tagged_sent[j][0] + " "
                j += 1
            return ans.strip()
    return ""
# Ad-hoc experiment script: print POS and NER tags for a sample sentence
# and display its constituency parse tree in a Tk window.
import nltk
import stanford_utils
import tree_parser

ner_tagger = stanford_utils.new_NERtagger()

sent = "The Second World War happens from 1953 to 1962."
# sent = "I will have classes on Tuesday and Thursday"
# sent = "I will have breakfast at home"

sent_tokens = nltk.word_tokenize(sent)
pos_sent = nltk.pos_tag(sent_tokens)
ner_sent = ner_tagger.tag(sent_tokens)
parsed_sent = tree_parser.sent_to_tree(sent)

print pos_sent[:]
print ner_sent
parsed_sent.draw()  # opens an interactive Tk window showing the parse tree


def get_when(tree):
    # stub: not implemented yet — always returns the empty string
    question = ""
    return question


print get_when(parsed_sent)
# "With Prisoner of Azkaban, production of the Harry Potter films switched to an eighteen-month cycle, which producer David Heyman explained was \"to give each [film] the time it required.\"", # "A small section of the triple-decker bus scene, where it weaves in between traffic, was filmed in North London's Palmers Green.", # "Tottenham Hotspur was the first club he played for.", # "Harry then threatens to curse Vernon when he tries to discipline him but flees, fed up with his life at Privet Drive.", # "They unknowingly share a compartment with the new Defence Against the Dark Arts teacher, Remus Lupin, who is sleeping.", # "As the Gryffindor Dormitory has been compromised, the students sleep in the main hall which allows Harry to overhear an argument between Snape and Dumbledore about Lupin's suspected role.", # "Hermione reveals that she possesses a time-turner that she has used all year to take multiple classes simultaneously."] s = [ "It is a West Germanic language that was first spoken in early medieval England and is now a global lingua franca.", "As the Dementors overpower Black and his earlier self, Harry realises that he himself was the one to cast the Patronus, and rushes to do so.", "I am a student", "Beckham is a master" ] for q in s: print q tree = tree_parser.sent_to_tree(q) for t in tree: print(t) print # test = "Harry, Ron and Hermione head back to school on the Hogwarts Express. " # ner_tagger = stanford_utils.new_NERtagger() # tagged = ner_tagger.tag(test.split(" ")) # parsed = tree_parser.sent_to_tree(test) # # def contains_name(tagged_sent): # for tup in tagged_sent: # if tup[1] == "PERSON": # return True # elif tup[0].lower() == "he" or tup[0].lower() == "she": # return True
def main(wiki_path, n):
    """Generate and print up to *n* ranked questions for a wiki article.

    Sentences of moderate length are scored (length bonus, PP-count
    bonus, question-type bonus, grammar-error deduction), routed to the
    matching question generators, and the top *n* non-empty questions
    are written to stdout.
    """
    title, sents = doc_parser.doc_to_sents(wiki_path)
    questions = []
    # keep only medium-length sentences (10-30 spaces ~= words)
    sents = [sent for sent in sents if 10 <= sent.count(" ") <= 30]
    sents = sents[:3 * n]
    for sent in sents:
        parsed_sent = tree_parser.sent_to_tree(sent)
        pps = tree_parser.get_phrases(parsed_sent, "PP", False, False)
        tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(sent))
        # bonus for average len
        score = (20 - math.fabs(sent.count(" ") - 10)) * 0.5
        # bonus for more pps
        score += len(pps) - 1
        # bonus for question difficulties: route the sentence to each
        # applicable generator below.
        # why
        if contains_reason(tagged_sent):
            question = ask.get_why(sent).capitalize()
            # correct grammar and find errors; deduct per error
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 5))
        # how-many
        elif contains_quant(sent, tagged_sent):
            question = ask.get_howmany(sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 5))
        # when
        if contains_time(tagged_sent):
            question = ask.get_when(sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            # drop overly short when-questions
            if (len(question) > 29):
                questions.append((question, score - errs + 4))
        # where
        if contains_loc(tagged_sent):
            question = ask.get_where(sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 4))
        # who/what
        if contains_name(tagged_sent):
            question = ask.get_who(parsed_sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 3))
        else:
            question = ask.get_what(parsed_sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 2))
        # binary question
        binary_q = ask.get_binary(sent, twist=False).capitalize()
        binary_q, errs = grammar_checker.correct_sent(binary_q)
        questions.append((binary_q, score - errs + 2))
    # rank by score descending, break ties alphabetically
    ranked_questions = sorted(questions, key=lambda x: (-x[1], x[0]))
    ranked_questions = [q for q in ranked_questions if len(q[0]) > 0][:n]
    for item in ranked_questions:
        sys.stdout.write(item[0] + " " + "\n")
# "In Ancient Rome, Manilius and Ovid called the constellation Litoreus (shore-inhabiting).", # "With Prisoner of Azkaban, production of the Harry Potter films switched to an eighteen-month cycle, which producer David Heyman explained was \"to give each [film] the time it required.\"", # "A small section of the triple-decker bus scene, where it weaves in between traffic, was filmed in North London's Palmers Green.", # "Tottenham Hotspur was the first club he played for.", # "Harry then threatens to curse Vernon when he tries to discipline him but flees, fed up with his life at Privet Drive.", # "They unknowingly share a compartment with the new Defence Against the Dark Arts teacher, Remus Lupin, who is sleeping.", # "As the Gryffindor Dormitory has been compromised, the students sleep in the main hall which allows Harry to overhear an argument between Snape and Dumbledore about Lupin's suspected role.", # "Hermione reveals that she possesses a time-turner that she has used all year to take multiple classes simultaneously."] s = ["It is a West Germanic language that was first spoken in early medieval England and is now a global lingua franca.", "As the Dementors overpower Black and his earlier self, Harry realises that he himself was the one to cast the Patronus, and rushes to do so.", "I am a student", "Beckham is a master"] for q in s: print q tree = tree_parser.sent_to_tree(q) for t in tree: print(t) print # test = "Harry, Ron and Hermione head back to school on the Hogwarts Express. " # ner_tagger = stanford_utils.new_NERtagger() # tagged = ner_tagger.tag(test.split(" ")) # parsed = tree_parser.sent_to_tree(test) # # def contains_name(tagged_sent): # for tup in tagged_sent: # if tup[1] == "PERSON": # return True # elif tup[0].lower() == "he" or tup[0].lower() == "she": # return True
def main(wiki_path, n):
    """Produce *n* ranked questions from the article at *wiki_path*.

    Medium-length sentences are scored and dispatched to the why /
    how-many / when / where / who / what / binary question generators;
    the highest-scoring non-empty questions are printed to stdout.
    """
    title, sents = doc_parser.doc_to_sents(wiki_path)
    questions = []
    # restrict to sentences with 10-30 spaces (roughly word count)
    sents = [sent for sent in sents if 10 <= sent.count(" ") <= 30]
    sents = sents[:3 * n]
    for sent in sents:
        parsed_sent = tree_parser.sent_to_tree(sent)
        pps = tree_parser.get_phrases(parsed_sent, "PP", False, False)
        tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(sent))
        # length bonus: peaks near 10 words
        score = (20 - math.fabs(sent.count(" ") - 10)) * 0.5
        # extra credit per prepositional phrase
        score += len(pps) - 1
        # why-questions (mutually exclusive with how-many)
        if contains_reason(tagged_sent):
            question = ask.get_why(sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 5))
        # how-many questions
        elif contains_quant(sent, tagged_sent):
            question = ask.get_howmany(sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 5))
        # when-questions (only kept when long enough)
        if contains_time(tagged_sent):
            question = ask.get_when(sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            if (len(question) > 29):
                questions.append((question, score - errs + 4))
        # where-questions
        if contains_loc(tagged_sent):
            question = ask.get_where(sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 4))
        # who-questions when a person is mentioned, otherwise what
        if contains_name(tagged_sent):
            question = ask.get_who(parsed_sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 3))
        else:
            question = ask.get_what(parsed_sent).capitalize()
            question, errs = grammar_checker.correct_sent(question)
            questions.append((question, score - errs + 2))
        # a yes/no question is always generated
        binary_q = ask.get_binary(sent, twist=False).capitalize()
        binary_q, errs = grammar_checker.correct_sent(binary_q)
        questions.append((binary_q, score - errs + 2))
    # highest score first; alphabetical tie-break; drop empties; top n
    ranked_questions = sorted(questions, key=lambda x: (-x[1], x[0]))
    ranked_questions = [q for q in ranked_questions if len(q[0]) > 0][:n]
    for entry in ranked_questions:
        sys.stdout.write(entry[0] + " " + "\n")