Beispiel #1
0
def answer_who(q, s):
    """Answer the "who" question ``q`` using the candidate sentence ``s``.

    The question is parsed, its "SQ" phrases are classified by
    ``get_what_type``, and the work is handed to the matching answerer
    together with the question's "NP" phrases.

    Returns:
        Whatever ``answer_definitions`` / ``answer_non_definitions`` returns
        for the types "definition" / "specific"; "" for any other type.
    """
    question_tree = tree_parser.sent_to_tree(q)
    sq_phrases = tree_parser.get_phrases(question_tree, "SQ", False, False)
    question_type = get_what_type(sq_phrases)
    # The NPs are collected before dispatching, even if no branch uses them.
    noun_phrases = tree_parser.get_phrases(question_tree, "NP", True, True)

    if question_type == "definition":
        return answer_definitions(s, noun_phrases)
    if question_type == "specific":
        return answer_non_definitions(s, noun_phrases)
    return ""
Beispiel #2
0
def answer_where(s):
    """Return the first "PP" phrase of ``s`` that mentions a place.

    Each prepositional phrase is turned back into text, tokenized and
    tagged; the first phrase containing a token tagged "LOCATION" or
    "ORGANIZATION" is returned with surrounding whitespace stripped.

    Returns:
        The matching phrase text, or "" when no phrase qualifies.
    """
    place_tags = ("LOCATION", "ORGANIZATION")
    sentence_tree = tree_parser.sent_to_tree(s)
    for phrase in tree_parser.get_phrases(sentence_tree, "PP", False, True):
        phrase_text = tree_parser.tree_to_sent(phrase)
        tokens = nltk.tokenize.word_tokenize(phrase_text)
        # Each tagged item is indexed as (token, tag); only the tag matters.
        if any(pair[1] in place_tags for pair in tagger.tag(tokens)):
            return phrase_text.strip()
    return ""
Beispiel #3
0
def get_when(tree):
    """Build a "when ...?" question from a parsed sentence ``tree``.

    Starts from the yes/no question produced by ``get_binary(tree)`` and
    removes the text of a "PP" phrase that contains a token tagged "TIME"
    or "DATE".

    Returns:
        "when " + the trimmed question + "?". When no phrase matches,
        nothing is removed from the question.
    """
    time_phrase = ""
    question = get_binary(tree)
    for pp in tree_parser.get_phrases(tree, "PP", sort=True, reversed=True):
        words = pp.leaves()
        tagged = ner_tagger.tag(words)
        # NOTE(review): every PP is inspected, so the LAST matching phrase
        # in the returned order is the one kept - confirm this is intended.
        if any(tag == "TIME" or tag == "DATE" for (_word, tag) in tagged):
            time_phrase = " ".join(words)

    remainder = question.replace(time_phrase, "")
    remainder = remainder.rstrip(',').rstrip('.')
    return "when " + remainder + "?"
Beispiel #4
0
def get_where(tree):
    """Build a "where ...?" question from a parsed sentence ``tree``.

    Starts from the yes/no question produced by ``get_binary(tree)`` and
    removes the text of a "PP" phrase that contains a token tagged
    "LOCATION" or "ORGANIZATION".

    Returns:
        "where " + the trimmed question + "?". When no phrase matches,
        nothing is removed from the question.
    """
    place_phrase = ""
    question = get_binary(tree)
    for pp in tree_parser.get_phrases(tree, "PP"):
        words = pp.leaves()
        tagged = ner_tagger.tag(words)
        # NOTE(review): every PP is inspected, so the LAST matching phrase
        # in the returned order is the one kept - confirm this is intended.
        if any(tag == "LOCATION" or tag == "ORGANIZATION"
               for (_word, tag) in tagged):
            place_phrase = " ".join(words)

    remainder = question.replace(place_phrase, "").strip()
    remainder = remainder.rstrip(',').rstrip('.')
    return "where " + remainder + "?"
Beispiel #5
0
def answer_non_definitions(s, main_nps):
    """Answer a non-definition question from the candidate sentence ``s``.

    If ``is_definition(s)`` holds, the call is forwarded to
    ``answer_definitions``. Otherwise the sentence is split at its main
    verb and a noun phrase is extracted from one side of the split.

    Args:
        s: Candidate answer sentence (a string; it is split on the verb).
        main_nps: "NP" phrases of the question; only the first one is used.

    Returns:
        The answer text, or "" when ``main_nps`` is empty, no usable verb
        phrase is found, or the sentence does not split at the verb.
    """
    if is_definition(s):
        return answer_definitions(s, main_nps)
    if not main_nps:
        return ""

    main_np = tree_parser.tree_to_sent(main_nps[0])
    parsed_s = tree_parser.sent_to_tree(s)
    vps = tree_parser.get_phrases(parsed_s, "VP", True, True)

    # Pick the first verb phrase whose label is not "VBN". An explicit
    # sentinel keeps main_vp defined even when every phrase is rejected;
    # previously that case raised UnboundLocalError further down.
    main_vp = None
    for vp in vps:
        if vp.label() != "VBN":
            main_vp = vp
            break
    if main_vp is None:
        return ""

    verb = get_main_verb(main_vp)
    candidates = s.split(" " + verb)
    if len(candidates) <= 1:
        # The verb (preceded by a space) does not occur in the sentence.
        return ""

    before_verb, after_verb = candidates[0], candidates[1]
    # The side that overlaps the question's NP (per is_overlap) is searched
    # for a noun phrase; the other side is the fallback answer.
    if is_overlap(main_np, after_verb):
        searched, fallback = after_verb, before_verb
    else:
        searched, fallback = before_verb, after_verb

    ans_tree = tree_parser.sent_to_tree(searched)
    s_nps = tree_parser.get_phrases(ans_tree, "NP", True, False)
    if len(s_nps) > 0:
        return tree_parser.tree_to_sent(s_nps[0])
    return fallback
Beispiel #6
0
def get_howmany(tree):
    """Build a "How many ...?" question from a parsed sentence ``tree``.

    Looks through the "NP" phrases for a child labelled "CD" immediately
    followed by a child labelled "NNS". The first such pair supplies the
    number and the counted object, both of which are removed from the
    yes/no question returned by ``get_binary(tree)``.

    Returns:
        "How many " + object + " " + the trimmed question + "?". When no
        CD+NNS pair exists the object is empty and the question is left
        untouched.
    """
    num = ""
    obj = ""
    question = get_binary(tree)
    nps = tree_parser.get_phrases(tree, "NP")

    for np in nps:
        children = list(np)
        # Walk adjacent child pairs instead of indexing with i / i+1.
        for left, right in zip(children, children[1:]):
            if left.label() == "CD" and right.label() == "NNS":
                num = " ".join(left.leaves())
                obj = " ".join(right.leaves())
                break
        if len(num) > 0 and len(obj) > 0:
            break

    remainder = question
    # Only strip the number when one was found: with an empty num the old
    # replace(num + " ", "") removed every space from the question.
    if num:
        remainder = remainder.replace(num + " ", "")
    if obj:
        remainder = remainder.replace(obj, "")
    remainder = remainder.strip().rstrip(',').rstrip('.')
    return "How many " + obj + " " + remainder + "?"
Beispiel #7
0
def answer_when(s):
    """Return the part of sentence ``s`` that says when something happened.

    First choice is a "PP" phrase containing a token tagged "DATE" or
    "TIME"; the phrase text is returned stripped. If no phrase qualifies,
    the whole sentence is tagged and the first contiguous run of
    DATE/TIME tokens is returned, joined by single spaces.

    Returns:
        The time expression, or "" when nothing is tagged DATE/TIME.
    """
    time_tags = ("DATE", "TIME")

    parsed_s = tree_parser.sent_to_tree(s)
    pps = tree_parser.get_phrases(parsed_s, "PP", False, True)
    for pp in pps:
        sent_pp = tree_parser.tree_to_sent(pp)
        tagged_pp = tagger.tag(nltk.tokenize.word_tokenize(sent_pp))
        if any(tup[1] in time_tags for tup in tagged_pp):
            return sent_pp.strip()

    # Fallback: collect the first run of consecutive DATE/TIME tokens.
    # Iterating the list directly keeps the scan in bounds; the previous
    # index-based while loop raised IndexError when the run reached the
    # last token of the sentence.
    tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(s))
    run = []
    for tup in tagged_sent:
        if tup[1] in time_tags:
            run.append(tup[0])
        elif run:
            break
    return " ".join(run).strip()
Beispiel #8
0
def main(wiki_path, n):
    """Generate questions from a document and print the ``n`` best ones.

    Sentences with 10 to 30 spaces are kept (at most ``3 * n`` of them).
    Each one is offered to the question generators in ``ask``; every
    generated question is capitalized, passed through
    ``grammar_checker.correct_sent`` and scored. Questions are ranked by
    descending score, ties broken by question text, empty ones dropped,
    and the top ``n`` written to stdout, each followed by " " and a
    newline.

    Args:
        wiki_path: Path handed to ``doc_parser.doc_to_sents``.
        n: Number of questions to print.
    """
    def rate(raw_question, base_score, bonus):
        # Capitalize and grammar-correct one question, then score it:
        # sentence score minus reported errors plus the per-type bonus.
        fixed, n_errs = grammar_checker.correct_sent(raw_question.capitalize())
        return fixed, base_score - n_errs + bonus

    _title, all_sents = doc_parser.doc_to_sents(wiki_path)
    candidates = []

    usable = [text for text in all_sents if 10 <= text.count(" ") <= 30]
    usable = usable[:3 * n]

    for sent in usable:
        parsed_sent = tree_parser.sent_to_tree(sent)
        pps = tree_parser.get_phrases(parsed_sent, "PP", False, False)
        tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(sent))

        # Length component is highest for a sentence with 10 spaces;
        # each PP adds one point (offset by -1).
        score = (20 - math.fabs(sent.count(" ") - 10)) * 0.5
        score += len(pps) - 1

        # "why" and "how many" are mutually exclusive; "why" wins.
        if contains_reason(tagged_sent):
            candidates.append(rate(ask.get_why(sent), score, 5))
        elif contains_quant(sent, tagged_sent):
            candidates.append(rate(ask.get_howmany(sent), score, 5))

        # "when" questions are kept only if longer than 29 characters.
        if contains_time(tagged_sent):
            when_q = rate(ask.get_when(sent), score, 4)
            if len(when_q[0]) > 29:
                candidates.append(when_q)

        if contains_loc(tagged_sent):
            candidates.append(rate(ask.get_where(sent), score, 4))

        # Exactly one of "who" / "what" is generated per sentence.
        if contains_name(tagged_sent):
            candidates.append(rate(ask.get_who(parsed_sent), score, 3))
        else:
            candidates.append(rate(ask.get_what(parsed_sent), score, 2))

        # A yes/no question is always generated.
        candidates.append(rate(ask.get_binary(sent, twist=False), score, 2))

    candidates.sort(key=lambda item: (-item[1], item[0]))
    best = [item for item in candidates if len(item[0]) > 0][:n]
    for text, _rank in best:
        sys.stdout.write(text + " " + "\n")
Beispiel #9
0
def main(wiki_path, n):
    """Print the ``n`` highest-scoring questions generated from a document.

    The document is split into sentences by ``doc_parser.doc_to_sents``;
    only sentences containing 10 to 30 spaces are used, capped at
    ``3 * n``. For every sentence the applicable generators in ``ask``
    are run, each result is capitalized and grammar-corrected, and its
    score is the sentence score minus the reported error count plus a
    bonus per question type (why / how-many 5, when / where 4, who 3,
    what / binary 2).

    Output: the non-empty questions sorted by descending score and then by
    text, truncated to ``n``, each written to stdout followed by " " and
    a newline.
    """
    def polish(raw_question):
        # Returns the (corrected question, error count) pair produced by
        # the grammar checker for the capitalized question.
        return grammar_checker.correct_sent(raw_question.capitalize())

    _title, sentences = doc_parser.doc_to_sents(wiki_path)
    scored = []

    sentences = [one for one in sentences if 10 <= one.count(" ") <= 30]
    sentences = sentences[:3*n]

    for sent in sentences:
        parsed_sent = tree_parser.sent_to_tree(sent)
        prep_phrases = tree_parser.get_phrases(parsed_sent, "PP", False, False)
        tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(sent))

        # Base score: a length term that is largest at 10 spaces, plus
        # the number of PPs minus one.
        space_count = sent.count(" ")
        score = (20 - math.fabs(space_count - 10)) * 0.5
        score += len(prep_phrases) - 1

        # why, otherwise how-many (never both)
        if contains_reason(tagged_sent):
            text, errs = polish(ask.get_why(sent))
            scored.append((text, score - errs + 5))
        elif contains_quant(sent, tagged_sent):
            text, errs = polish(ask.get_howmany(sent))
            scored.append((text, score - errs + 5))

        # when: short results (29 characters or fewer) are discarded
        if contains_time(tagged_sent):
            text, errs = polish(ask.get_when(sent))
            if len(text) > 29:
                scored.append((text, score - errs + 4))

        # where
        if contains_loc(tagged_sent):
            text, errs = polish(ask.get_where(sent))
            scored.append((text, score - errs + 4))

        # who when a name is present, what otherwise
        if contains_name(tagged_sent):
            text, errs = polish(ask.get_who(parsed_sent))
            scored.append((text, score - errs + 3))
        else:
            text, errs = polish(ask.get_what(parsed_sent))
            scored.append((text, score - errs + 2))

        # binary question, generated for every sentence
        text, errs = polish(ask.get_binary(sent, twist=False))
        scored.append((text, score - errs + 2))

    ranked = sorted(scored, key=lambda pair: (-pair[1], pair[0]))
    non_empty = [pair for pair in ranked if len(pair[0]) > 0]
    for pair in non_empty[:n]:
        sys.stdout.write(pair[0] + " " + "\n")