Example 1
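# Answer a "where" question: NER-tag each prepositional phrase of the
# candidate sentence and return the first PP containing a LOCATION or
# ORGANIZATION entity. Assumes module-level `tree_parser` and a Stanford
# NER `tagger` (constructed as in Example 6).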
def answer_where(s):
    parsed_s = tree_parser.sent_to_tree(s)
    pps = tree_parser.get_phrases(parsed_s, "PP", False, True)
    for pp in pps:
        sent_pp = tree_parser.tree_to_sent(pp)
        tagged_pp = tagger.tag(nltk.tokenize.word_tokenize(sent_pp))
        for tup in tagged_pp:
            if tup[1] == "LOCATION" or tup[1] == "ORGANIZATION":
                return sent_pp.strip()
    return ""
Example 2
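# Normalise input sentences before question generation: sentences containing
# an appositive are split into simpler sentences, all others are reduced to
# their main predicate.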
def preprocess_sents(sents):
    preds = []
    for sent in sents:
        tree = tree_parser.sent_to_tree(sent)
        if tree_parser.contains_appos(tree):
            preds += tree_parser.appps_to_sents(tree)
        else:
            pred = tree_parser.sent_to_predicate(tree)
            preds.append(pred)
    return preds
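# A usage sketch (hypothetical output; the exact sentences produced depend on
# tree_parser.appps_to_sents and sent_to_predicate):
# >>> preprocess_sents(["Remus Lupin, the new teacher, is sleeping."])
# ['Remus Lupin is sleeping', 'Remus Lupin is the new teacher']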
Example 3
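# Decide how to answer a who/what question: classify it as asking for a
# definition or for a specific fact, then dispatch to the matching answerer.
# get_what_type, answer_definitions and answer_non_definitions (Example 4)
# are helpers defined elsewhere in the module.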
def answer_who(q, s):
    parsed_q = tree_parser.sent_to_tree(q)
    sq = tree_parser.get_phrases(parsed_q, "SQ", False, False)
    what_type = get_what_type(sq)
    main_nps = tree_parser.get_phrases(parsed_q, "NP", True, True)

    if what_type == "definition":
        return answer_definitions(s, main_nps)
    elif what_type == "specific":
        return answer_non_definitions(s, main_nps)
    else:
        return ""
Example 4
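# Answer a non-definition question: find the main verb of the candidate
# sentence, split the sentence around it, and return the first noun phrase
# of the half that overlaps the question's main NP, falling back to the raw
# text of the opposite half when no NP can be parsed out.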
def answer_non_definitions(s, main_nps):
    if is_definition(s):
        return answer_definitions(s, main_nps)
    if len(main_nps) == 0:
        return ""
    main_np = tree_parser.tree_to_sent(main_nps[0])
    parsed_s = tree_parser.sent_to_tree(s)
    vps = tree_parser.get_phrases(parsed_s, "VP", True, True)
    # pick the first non-participial VP; give up if every VP is labelled VBN
    main_vp = None
    for vp in vps:
        if vp.label() != "VBN":
            main_vp = vp
            break
    if main_vp is None:
        return ""

    verb = get_main_verb(main_vp)

    candidates = s.split(" "+verb)

    if len(candidates) > 1:
        # if main_np in candidates[1]:
        if is_overlap(main_np, candidates[1]):
            ans_tree = tree_parser.sent_to_tree(candidates[1])
            s_nps = tree_parser.get_phrases(ans_tree, "NP", True, False)
            if len(s_nps) > 0:
                return tree_parser.tree_to_sent(s_nps[0])
            else:
                return candidates[0]
        else:
            ans_tree = tree_parser.sent_to_tree(candidates[0])
            s_nps = tree_parser.get_phrases(ans_tree, "NP", True, False)
            if len(s_nps) > 0:
                return tree_parser.tree_to_sent(s_nps[0])
            else:
                return candidates[1]
    else:
        return ""
Example 5
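# Answer a "when" question: prefer a prepositional phrase containing a DATE
# or TIME entity; if none is found, fall back to the first run of
# consecutive DATE/TIME tokens anywhere in the sentence.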
def answer_when(s):
    parsed_s = tree_parser.sent_to_tree(s)
    pps = tree_parser.get_phrases(parsed_s, "PP", False, True)
    for pp in pps:
        sent_pp = tree_parser.tree_to_sent(pp)
        tagged_pp = tagger.tag(nltk.tokenize.word_tokenize(sent_pp))
        for tup in tagged_pp:
            if tup[1] == "DATE" or tup[1] == "TIME":
                return sent_pp.strip()
    tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(s))
    ans = ""
    for i in xrange(len(tagged_sent)):
        tup = tagged_sent[i]
        if tup[1] == "DATE" or tup[1] == "TIME":
            # collect the full run of consecutive DATE/TIME tokens,
            # stopping at the end of the sentence
            j = i
            while j < len(tagged_sent) and tagged_sent[j][1] in ("DATE", "TIME"):
                ans += tagged_sent[j][0] + " "
                j += 1
            return ans.strip()
    return ""
Example 6
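# Scratch script for inspecting the POS tags, NER tags and parse tree of a
# test sentence; get_when is an unfinished stub for building a "when"
# question from the tree.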
import nltk
import stanford_utils
import tree_parser

ner_tagger = stanford_utils.new_NERtagger()
sent = "The Second World War happens from 1953 to 1962."
# sent = "I will have classes on Tuesday and Thursday"
# sent = "I will have breakfast at home"
sent_tokens = nltk.word_tokenize(sent)
pos_sent = nltk.pos_tag(sent_tokens)
ner_sent = ner_tagger.tag(sent_tokens)
parsed_sent = tree_parser.sent_to_tree(sent)

print pos_sent
print ner_sent
parsed_sent.draw()

def get_when(tree):
    question = ""

    return question

print get_when(parsed_sent)

Example 7
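# End-to-end question generation: keep medium-length sentences from the
# article, route each one to the question generators whose trigger (reason,
# quantity, time, location, person) it satisfies, grammar-check and score
# each question, and print the n best. Assumes module-level imports (sys,
# math, nltk, doc_parser, tree_parser, ask, grammar_checker), a Stanford NER
# `tagger` instance, and the contains_* predicates defined alongside it.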
def main(wiki_path, n):
    title, sents = doc_parser.doc_to_sents(wiki_path)
    questions = []

    sents = [sent for sent in sents if 10 <= sent.count(" ") <= 30]
    sents = sents[:3 * n]
    # preds = []
    # for sent in sents:
    #     tree = tree_parser.sent_to_tree(sent)
    #     if tree_parser.contains_appos(tree):
    #         preds += tree_parser.appps_to_sents(tree)
    #     else:
    #         pred = tree_parser.sent_to_predicate(tree)
    #         if 10 <= pred.count(" ") <= 30:
    #             preds.append(pred)
    #         if len(preds) > 2*n:
    #             break
    # for pred in preds:
    #     print pred
    for sent in sents:
        parsed_sent = tree_parser.sent_to_tree(sent)
        pps = tree_parser.get_phrases(parsed_sent, "PP", False, False)

        tagged_sent = tagger.tag(nltk.tokenize.word_tokenize(sent))

        # length bonus: shorter sentences inside the 10-30 token window score higher
        score = (20 - math.fabs(sent.count(" ") - 10)) * 0.5
        # bonus for more pps
        score += len(pps) - 1

        # bonus for question difficulties
        # distribute sents to generators
        # why
        if contains_reason(tagged_sent):
            question = ask.get_why(sent).capitalize()
            # correct grammar and find errors
            question, errs = grammar_checker.correct_sent(question)
            # deductions for errors
            questions.append((question, score - errs + 5))

        # how-many
        elif contains_quant(sent, tagged_sent):
            question = ask.get_howmany(sent).capitalize()
            # correct grammar and find errors
            question, errs = grammar_checker.correct_sent(question)
            # deductions for errors
            questions.append((question, score - errs + 5))

        # when
        if contains_time(tagged_sent):
            question = ask.get_when(sent).capitalize()
            # correct grammar and find errors
            question, errs = grammar_checker.correct_sent(question)
            # deductions for errors
            # keep only sufficiently long "when" questions
            if len(question) > 29:
                questions.append((question, score - errs + 4))
        # where
        if contains_loc(tagged_sent):
            question = ask.get_where(sent).capitalize()
            # correct grammar and find errors
            question, errs = grammar_checker.correct_sent(question)
            # deductions for errors
            questions.append((question, score - errs + 4))

        # who/what
        if contains_name(tagged_sent):
            question = ask.get_who(parsed_sent).capitalize()
            # correct grammar and find errors
            question, errs = grammar_checker.correct_sent(question)
            # deductions for errors
            questions.append((question, score - errs + 3))
        else:
            question = ask.get_what(parsed_sent).capitalize()
            # correct grammar and find errors
            question, errs = grammar_checker.correct_sent(question)
            # deductions for errors
            questions.append((question, score - errs + 2))

        # binary question
        binary_q = ask.get_binary(sent, twist=False).capitalize()
        binary_q, errs = grammar_checker.correct_sent(binary_q)
        # deductions for errors
        questions.append((binary_q, score - errs + 2))

    ranked_questions = sorted(questions, key=lambda x: (-x[1], x[0]))
    ranked_questions = [q for q in ranked_questions if len(q[0]) > 0][:n]
    for question in ranked_questions:
        sys.stdout.write(question[0] + " " + "\n")
Example 8
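# Scratch script for inspecting how tree_parser handles assorted test
# sentences (assumes the same imports as Example 6); the commented-out lines
# are extra test inputs and an early draft of the contains_name predicate
# used by the question ranker in Example 7.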
#     "In Ancient Rome, Manilius and Ovid called the constellation Litoreus (shore-inhabiting).",
#     "With Prisoner of Azkaban, production of the Harry Potter films switched to an eighteen-month cycle, which producer David Heyman explained was \"to give each [film] the time it required.\"",
#     "A small section of the triple-decker bus scene, where it weaves in between traffic, was filmed in North London's Palmers Green.",
#     "Tottenham Hotspur was the first club he played for.",
#     "Harry then threatens to curse Vernon when he tries to discipline him but flees, fed up with his life at Privet Drive.",
#     "They unknowingly share a compartment with the new Defence Against the Dark Arts teacher, Remus Lupin, who is sleeping.",
#     "As the Gryffindor Dormitory has been compromised, the students sleep in the main hall which allows Harry to overhear an argument between Snape and Dumbledore about Lupin's suspected role.",
#     "Hermione reveals that she possesses a time-turner that she has used all year to take multiple classes simultaneously."]

s = ["It is a West Germanic language that was first spoken in early medieval England and is now a global lingua franca.",
     "As the Dementors overpower Black and his earlier self, Harry realises that he himself was the one to cast the Patronus, and rushes to do so.",
     "I am a student",
     "Beckham is a master"]
for q in s:
    print q
    tree = tree_parser.sent_to_tree(q)
    for t in tree:
        print(t)
    print

# test = "Harry, Ron and Hermione head back to school on the Hogwarts Express. "
# ner_tagger = stanford_utils.new_NERtagger()
# tagged = ner_tagger.tag(test.split(" "))
# parsed = tree_parser.sent_to_tree(test)
#
# def contains_name(tagged_sent):
#     for tup in tagged_sent:
#         if tup[1] == "PERSON":
#             return True
#         elif tup[0].lower() == "he" or tup[0].lower() == "she":
#             return True