def answer_who(q, s):
    """Answer the who/what question ``q`` from the sentence ``s``.

    The question is parsed, its "SQ" phrases are classified by
    ``get_what_type`` and the question's "NP" phrases are handed, together
    with ``s``, to the answerer that matches the classification.

    Returns the chosen answerer's result, or "" when the classification is
    neither "definition" nor "specific".
    """
    question_tree = tree_parser.sent_to_tree(q)
    sq_phrases = tree_parser.get_phrases(question_tree, "SQ", False, False)
    kind = get_what_type(sq_phrases)
    main_nps = tree_parser.get_phrases(question_tree, "NP", True, True)

    if kind == "definition":
        return answer_definitions(s, main_nps)
    if kind == "specific":
        return answer_non_definitions(s, main_nps)
    return ""
def answer_where(s):
    """Return the first "PP" phrase of ``s`` that carries a place tag.

    Each "PP" phrase is turned back into text, tokenized and tagged; the
    first phrase with a token tagged "LOCATION" or "ORGANIZATION" is
    returned with surrounding whitespace stripped.  Returns "" when no
    phrase qualifies.
    """
    place_tags = ("LOCATION", "ORGANIZATION")
    sentence_tree = tree_parser.sent_to_tree(s)
    for phrase in tree_parser.get_phrases(sentence_tree, "PP", False, True):
        phrase_text = tree_parser.tree_to_sent(phrase)
        tokens = nltk.tokenize.word_tokenize(phrase_text)
        if any(pair[1] in place_tags for pair in tagger.tag(tokens)):
            return phrase_text.strip()
    return ""
def get_when(tree):
    """Build a "when ...?" question from ``tree``.

    The binary form of the sentence is obtained from ``get_binary``.  Every
    "PP" phrase is NER-tagged; a phrase containing a "TIME" or "DATE" token
    becomes the text to cut out of the question.  The scan does not stop at
    the first hit, so the last matching phrase is the one removed.  With no
    match nothing is removed.

    Returns "when " + the trimmed question (trailing commas, then trailing
    periods dropped) + "?".
    """
    time_tags = ("TIME", "DATE")
    question = get_binary(tree)
    when = ""
    for phrase in tree_parser.get_phrases(tree, "PP", sort=True, reversed=True):
        tagged = ner_tagger.tag(phrase.leaves())
        if any(tag in time_tags for (_word, tag) in tagged):
            when = " ".join(phrase.leaves())

    remainder = question.replace(when, "")
    remainder = remainder.rstrip(',').rstrip('.')
    return "when " + remainder + "?"
def get_where(tree):
    """Build a "where ...?" question from ``tree``.

    The binary form of the sentence is obtained from ``get_binary``.  Every
    "PP" phrase is NER-tagged; a phrase containing a "LOCATION" or
    "ORGANIZATION" token becomes the text to cut out of the question.  The
    scan does not stop at the first hit, so the last matching phrase is the
    one removed.  With no match nothing is removed.

    Returns "where " + the trimmed question (whitespace stripped, then
    trailing commas, then trailing periods dropped) + "?".
    """
    place_tags = ("LOCATION", "ORGANIZATION")
    question = get_binary(tree)
    where = ""
    for phrase in tree_parser.get_phrases(tree, "PP"):
        tagged = ner_tagger.tag(phrase.leaves())
        if any(tag in place_tags for (_word, tag) in tagged):
            where = " ".join(phrase.leaves())

    remainder = question.replace(where, "").strip()
    remainder = remainder.rstrip(',').rstrip('.')
    return "where " + remainder + "?"
def answer_non_definitions(s, main_nps):
    """Answer a non-definition who/what question from the sentence ``s``.

    ``main_nps`` are the noun phrases taken from the question; only the
    first one is used.

    Steps:
      1. If ``is_definition(s)`` holds, delegate to ``answer_definitions``.
      2. Pick the first "VP" phrase of ``s`` whose label is not "VBN" and
         ask ``get_main_verb`` for its verb.
      3. Split ``s`` on " " + verb.  The piece after the first occurrence
         is searched for an answer when ``is_overlap`` reports that it
         overlaps the question's noun phrase; otherwise the piece before
         the verb is searched.
      4. Return the first "NP" phrase of the searched piece, or the *other*
         piece verbatim when the searched piece has no "NP".

    Returns "" when there is no question noun phrase, no usable verb
    phrase, or the verb does not split the sentence.
    """
    if is_definition(s):
        return answer_definitions(s, main_nps)
    if len(main_nps) == 0:
        return ""

    main_np = tree_parser.tree_to_sent(main_nps[0])
    parsed_s = tree_parser.sent_to_tree(s)
    vps = tree_parser.get_phrases(parsed_s, "VP", True, True)

    # Start from None so that "no VPs at all" and "no VP passed the filter"
    # both end in a clean "" instead of an unbound-variable error below.
    main_vp = None
    for vp in vps:
        if vp.label() != "VBN":
            main_vp = vp
            break
    if main_vp is None:
        return ""

    verb = get_main_verb(main_vp)
    candidates = s.split(" " + verb)
    if len(candidates) <= 1:
        return ""

    # Choose which side of the verb to search; the other side is the
    # fallback answer when the searched side yields no noun phrase.
    if is_overlap(main_np, candidates[1]):
        searched, fallback = candidates[1], candidates[0]
    else:
        searched, fallback = candidates[0], candidates[1]

    ans_tree = tree_parser.sent_to_tree(searched)
    s_nps = tree_parser.get_phrases(ans_tree, "NP", True, False)
    if len(s_nps) > 0:
        return tree_parser.tree_to_sent(s_nps[0])
    return fallback
def get_howmany(tree):
    """Build a "How many ...?" question from ``tree``.

    The "NP" phrases of ``tree`` are scanned for the first pair of adjacent
    children labelled "CD" followed by "NNS".  The "CD" words become the
    number and the "NNS" words the counted object.  The number (plus one
    following space) and the object are removed from the binary form of the
    sentence returned by ``get_binary``.

    Returns "How many " + object + " " + the trimmed question (whitespace
    stripped, then trailing commas, then trailing periods dropped) + "?".
    When no CD/NNS pair exists the object is empty and the binary question
    is left intact.
    """
    num = ""
    obj = ""
    question = get_binary(tree)

    for noun_phrase in tree_parser.get_phrases(tree, "NP"):
        # Look at each child together with its right-hand neighbour.
        for i in range(len(noun_phrase) - 1):
            first, second = noun_phrase[i], noun_phrase[i + 1]
            if first.label() == "CD" and second.label() == "NNS":
                num = " ".join(first.leaves())
                obj = " ".join(second.leaves())
                break
        if num and obj:
            break

    # Only strip what was actually found: with an empty ``num`` the pattern
    # would be a bare " " and every space in the question would be deleted.
    if num:
        question = question.replace(num + " ", "")
    if obj:
        question = question.replace(obj, "")

    question = question.strip().rstrip(',').rstrip('.')
    return "How many " + obj + " " + question + "?"
def answer_when(s):
    """Return the part of sentence ``s`` that says *when*.

    First choice: the first "PP" phrase of ``s`` containing a token tagged
    "DATE" or "TIME", returned as stripped text.

    Fallback: the whole sentence is tagged and the first run of consecutive
    "DATE"/"TIME" tokens is returned, joined by single spaces.

    Returns "" when the sentence has no "DATE"/"TIME" token at all.
    """
    time_tags = ("DATE", "TIME")

    parsed_s = tree_parser.sent_to_tree(s)
    for pp in tree_parser.get_phrases(parsed_s, "PP", False, True):
        sent_pp = tree_parser.tree_to_sent(pp)
        tagged_pp = tagger.tag(nltk.tokenize.word_tokenize(sent_pp))
        if any(tup[1] in time_tags for tup in tagged_pp):
            return sent_pp.strip()

    tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(s))
    run = []
    for tup in tagged_sent:
        if tup[1] in time_tags:
            run.append(tup[0])
        elif run:
            # The first run has ended; later runs are ignored.
            break
    # Iterating (rather than indexing past the run) keeps a run that
    # reaches the final token from walking off the end of the list.
    return " ".join(run).strip()
def _graded(raw_question, base_score, bonus):
    """Capitalize and grammar-check ``raw_question``; return (question, score).

    The score is ``base_score`` minus the error count reported by
    ``grammar_checker.correct_sent`` plus the per-question-type ``bonus``.
    """
    question, errs = grammar_checker.correct_sent(raw_question.capitalize())
    return question, base_score - errs + bonus


def main(wiki_path, n):
    """Generate questions from the document at ``wiki_path`` and print ``n``.

    Sentences with 10 to 30 spaces are kept and only the first ``3 * n`` of
    them are processed.  Each sentence gets a base score of
    ``(20 - |spaces - 10|) * 0.5`` plus ``len(pps) - 1`` for its "PP"
    phrases, and is offered to the question generators in ``ask``:

      * why (bonus 5), otherwise how-many (bonus 5)
      * when (bonus 4, kept only if the question is longer than 29 chars)
      * where (bonus 4)
      * who (bonus 3), otherwise what (bonus 2)
      * binary (bonus 2), always

    Questions are ranked by descending score, ties broken by question text;
    empty questions are dropped and the top ``n`` are written to stdout,
    one per line, each followed by a space.
    """
    _title, sents = doc_parser.doc_to_sents(wiki_path)
    sents = [sent for sent in sents if 10 <= sent.count(" ") <= 30]
    sents = sents[:3 * n]

    questions = []
    for sent in sents:
        parsed_sent = tree_parser.sent_to_tree(sent)
        pps = tree_parser.get_phrases(parsed_sent, "PP", False, False)
        tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(sent))

        # bonus for average len
        score = (20 - math.fabs(sent.count(" ") - 10)) * 0.5
        # bonus for more pps
        score += len(pps) - 1

        # why / how-many (mutually exclusive)
        if contains_reason(tagged_sent):
            questions.append(_graded(ask.get_why(sent), score, 5))
        elif contains_quant(sent, tagged_sent):
            questions.append(_graded(ask.get_howmany(sent), score, 5))

        # when: very short results are discarded
        if contains_time(tagged_sent):
            when_q = _graded(ask.get_when(sent), score, 4)
            if len(when_q[0]) > 29:
                questions.append(when_q)

        # where
        if contains_loc(tagged_sent):
            questions.append(_graded(ask.get_where(sent), score, 4))

        # who / what
        if contains_name(tagged_sent):
            questions.append(_graded(ask.get_who(parsed_sent), score, 3))
        else:
            questions.append(_graded(ask.get_what(parsed_sent), score, 2))

        # binary question
        questions.append(_graded(ask.get_binary(sent, twist=False), score, 2))

    ranked_questions = sorted(questions, key=lambda x: (-x[1], x[0]))
    ranked_questions = [q for q in ranked_questions if len(q[0]) > 0][:n]
    for question in ranked_questions:
        sys.stdout.write(question[0] + " " + "\n")