def localized_statement_pipeline(localized_statement):
    """Run the shared preprocessing for a statement: dependency labels,
    the index of the ROOT token, and per-token NER tags."""
    localized_dep_parse = util_service.get_dependency_parse(
        localized_statement)
    root_idx = localized_dep_parse.index("ROOT")
    ner_tokens = util_service.get_ner_per_token(localized_statement)

    return localized_dep_parse, root_idx, ner_tokens
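
util_service itself is not part of these listings. Judging by the entity labels used throughout (GPE, PERSON, FAC, DATE), it is most likely a thin wrapper around spaCy. The sketch below is a hypothetical reconstruction, with return shapes inferred from the call sites rather than taken from the original module:

# Hypothetical stand-in for the util_service helpers these examples call,
# assuming a spaCy backend:
#   get_ner_per_token    -> [[token_text, entity_label_or_""], ...]
#   get_ner              -> [[entity_text, start_char], ...]
#   get_dependency_parse -> [dep_label, ...] (root token labeled "ROOT")
import spacy

_nlp = spacy.load("en_core_web_sm")

def get_ner_per_token(text):
    doc = _nlp(text)
    return [[tok.text, tok.ent_type_] for tok in doc]

def get_ner(text):
    doc = _nlp(text)
    return [[ent.text, ent.start_char] for ent in doc.ents]

def get_dependency_parse(text):
    doc = _nlp(text)
    return [tok.dep_ for tok in doc]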
Example #2
def get_answer(question, localized_statement):
    """
    This is super bad. It finds the possible answers,
    (aka everything past the first named entity, then splits
    that for stuff before and after the or statements in the question),
    then looks for the statement that occurred first. I want to make it
    so it looks for similarity and not a perfect comparison of strings
    """
    ner_only = util_service.get_ner(question)

    probably_the_subject = ner_only[0][0]

    subject_start_i = question.find(probably_the_subject)
    subject_end_i = subject_start_i + len(probably_the_subject)
    # Drop the first named entity and everything before it
    q_no_sub = question[subject_end_i:]
    ner_tokens_q_no_sub = util_service.get_ner_per_token(q_no_sub)

    # This gets the possible answers by finding the words right before "or"
    # Does this on the string without the named entity, ie:
    #   "Is Pittsburgh a city" --> "a city"
    possible_answers = []
    last_or_index = 0
    j = 0
    for i in range(len(ner_tokens_q_no_sub)):
        if ner_tokens_q_no_sub[i][0] == "or":
            last_or_index = i
            question_str = ""
            while j < i:
                question_str = question_str + ner_tokens_q_no_sub[j][0] + " "
                j += 1
            possible_answers.append(question_str)
            j = last_or_index + 1  # skip the "or" token itself

    # For the last answer after the last or
    j = last_or_index + 1
    question_str = ""
    while j < len(ner_tokens_q_no_sub):
        question_str = question_str + ner_tokens_q_no_sub[j][0] + " "
        j += 1
    question_str = question_str[:-2]  # drop the trailing "? "
    possible_answers.append(question_str)

    # str.strip() returns a new string rather than mutating in place,
    # so build a stripped copy of each candidate
    new_pos_answers = [answer.strip() for answer in possible_answers]

    possible_ans_index = 0

    current_closest = len(localized_statement)
    for index, answer in enumerate(new_pos_answers):
        answer_index = localized_statement.find(answer)
        if answer_index != -1 and answer_index < current_closest:  # find() returns -1 on a miss
            current_closest = answer_index
            possible_ans_index = index

    # return probably_the_subject + " " + new_pos_answers[possible_ans_index]
    return new_pos_answers[possible_ans_index]
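
The docstring above already flags the exact string comparison as the weak spot. A dependency-free way to soften it is word-overlap scoring; the following is a sketch of a possible replacement for the final find()-based loop, not code from the original project:

# Hypothetical similarity-based answer picker: scores each candidate by the
# fraction of its words that appear in the statement, instead of requiring
# a verbatim substring hit.
def pick_answer_by_similarity(possible_answers, localized_statement):
    statement_words = set(localized_statement.lower().split())
    best_answer, best_score = None, -1.0
    for answer in possible_answers:
        answer_words = set(answer.lower().split())
        if not answer_words:
            continue
        score = len(answer_words & statement_words) / len(answer_words)
        if score > best_score:
            best_answer, best_score = answer, score
    return best_answer

For "Is Pittsburgh a city or a state" against the statement "Pittsburgh is a large city in Pennsylvania", "a city" scores 1.0 and "a state" scores 0.5, so "a city" wins even though it never appears verbatim in the statement.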
Example #3
def generate_question(sentence):
    """
    Takes in a string sentence and generates a "Is" question.
    Puts "Is the " + named entity + rest_of_sentence + "?"
    """

    ner_tags = util_service.get_ner_per_token(sentence)

    # Find the copula, then rebuild the sentence around it as a question:
    # is_was_are_were is one of "Is ", "Are ", "Was ", or "Were ", and
    # is_are_index is where that token occurs in ner_tags
    is_was_are_were, is_are_index = which_acomp(ner_tags)
    before_sentence_2, after_sentence_2 = build_sides(ner_tags, is_are_index)
    sentence_2 = is_was_are_were + before_sentence_2 + after_sentence_2

    sentence_2 = sentence_2[:-1] + "?"  # swap the trailing space for "?"
    return [sentence_2]
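
which_acomp and build_sides are not included in these listings. From the comments above (which_acomp returns one of "Is ", "Are ", "Was ", "Were " plus a token index) and from how the caller re-punctuates the result, a plausible reconstruction is:

# Plausible reconstructions of the missing helpers; assumptions, not the
# original implementations.
COPULAS = {"is": "Is ", "are": "Are ", "was": "Was ", "were": "Were "}

def which_acomp(ner_tokens):
    # Capitalized copula (with trailing space) and its token index,
    # or ("", -1) if the sentence has none
    for i, (token, _label) in enumerate(ner_tokens):
        if token.lower() in COPULAS:
            return COPULAS[token.lower()], i
    return "", -1

def build_sides(ner_tokens, verb_index):
    # Tokens before and after the copula, space-joined with a trailing
    # space; the final "." is dropped so the caller can append "?"
    tokens = [tok for tok, _label in ner_tokens]
    if tokens and tokens[-1] == ".":
        tokens = tokens[:-1]
    before = "".join(t + " " for t in tokens[:verb_index])
    after = "".join(t + " " for t in tokens[verb_index + 1:])
    return before, after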
Example #4
def generate_question(sentence):

    # "is", "as a", "FAC",
    # NEEDS "was", "were", "are"

    ner_tokens_sentence = util_service.get_ner_per_token(sentence)
    ner_only = util_service.get_ner(sentence)

    if len(ner_only) == 0:
        return []

    ner_start = ner_only[0][1]
    list_of_the = ["The", "the"]
    # sentence[ner_start - 4:ner_start - 1] grabs the three characters
    # before the space preceding the entity, i.e. a possible "the"/"The"
    if ner_start >= 4 and sentence[ner_start - 4:ner_start - 1] in list_of_the:
        ner_only[0][0] = "the " + ner_only[0][0]

    # Find the copula; flag is one of "Is ", "Are ", "Was ", or "Were ",
    # and is_are_index is where that token occurs in ner_tokens_sentence

    flag, is_are_index = which_acomp(ner_tokens_sentence)
    if flag == "":
        return []

    flag = flag.lower()
    new_question = "What " + flag + ner_only[0][0] + "?"

    # Callers expect a list of candidate questions
    return [new_question]
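
Assuming the helper sketches above, this version turns a copular sentence into a definition-style question. Indicatively (exact output depends on the NER model behind util_service):

print(generate_question("Pittsburgh is a city in Pennsylvania."))
# e.g. ['What is Pittsburgh?']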
Example #5
def generate_question(sentence):
    """
    generates what question based on the sentence
        :param sentence: 

    
    """
    # sentence = "Old Kingdom is most commonly regarded as the period from the Third Dynasty through to the Sixth Dynasty ."
    # sentence = "King Djoser's architect, Imhotep is credited with the development of building with stone and with the conception of the new architectural form—the Step Pyramid."
    # sentence = "The Old Kingdom is perhaps best known for the large number of pyramids constructed at this time as burial places for Egypt's kings."
    # sentence = 'For this reason, the Old Kingdom is frequently referred to as "the Age of the Pyramids."'
    # sentence = "The first is called the Meidum pyramid, named for its location in Egypt."
    # sentence = "There were military expeditions into Canaan and Nubia, with Egyptian influence reaching up the Nile into what is today the Sudan."
    # sentence = "She is a forward for the Orlando Pride and the United States women's national soccer team."
    # sentence = """Alexandra "Alex" Patricia Morgan Carrasco (born July 2, 1989), née Alexandra Patricia Morgan, is an American soccer player, Olympic gold medalist, and FIFA Women's World Cup champion."""

    # sentence = "List of Olympic medalists in football"
    # util_service.get_dep_parse_tree(sentence)[1]
    # util_service.get_pos(sentence)

    sent_tokens = sentence.split()

    # Index of each keyword in the whitespace tokens, or -1 if absent
    def _find_token(word):
        try:
            return sent_tokens.index(word)
        except ValueError:
            return -1

    is_idx = _find_token("is")
    born_idx = _find_token("born")
    was_idx = _find_token("was")
    in_idx = _find_token("in")

    ner_only = util_service.get_ner(sentence)
    ner_tags = util_service.get_ner_per_token(sentence)

    was_idx_ner = -1
    is_idx_ner = -1
    in_idx_ner = -1  # initialize: read below even when in_idx == -1

    if was_idx != -1:
        was_idx_ner = get_was_idx_from_ner(ner_tags)
    if is_idx != -1:
        is_idx_ner = get_is_idx_from_ner(ner_tags)
    if in_idx != -1:
        in_idx_ner = get_in_idx_from_ner(ner_tags)

    # Specific to birthplaces: "Who was born in <location>?"
    idx_ner = max(was_idx_ner, is_idx_ner)
    if idx_ner != -1 and ner_tags[idx_ner - 1][1] == "PERSON":
        if born_idx != -1 and in_idx_ner != -1:
            location = get_location(ner_tags, in_idx_ner, ner_only)
            return ["Who was born in " + location + "?"]

    # This tries to flip the end to the beginning, see following example:
    # "Some sets for the film were built in Glen Coe, Scotland, near the Clachaig Inn." # sentence 1
    # "Were some sets for the film built in Glen Coe, Scotland, near the Clachaig Inn?" # sentence 2
    # "Were some sets for the film built in Glen Coe, Scotland, near where?" # sentence 3
    # "Where were some sets for the film built in Glen Coe, Scotland, near?" # sentence 4

    try:
        is_was_are_were, is_are_index = which_acomp(ner_tags)
        before_sentence_2, after_sentence_2 = build_sides(
            ner_tags, is_are_index)
        sentence_2 = is_was_are_were + before_sentence_2 + after_sentence_2
        last_named_entity = ner_only[-1]
        sentence_4, ended_with_where_or_what = change_2_to_4(
            sentence_2, last_named_entity)
        if not ended_with_where_or_what:
            return []
        return [sentence_4]
    except Exception:
        # Any parse failure (e.g. no named entities) means "no question"
        return []
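
get_is_idx_from_ner and its siblings are also not shown; from their call sites they just scan the per-token NER list for a literal word. A minimal sketch (get_location and change_2_to_4 are omitted, as their behavior is harder to recover from the call sites alone):

# Assumed reconstructions of the token-index helpers used above
def _token_index(ner_tags, word):
    for i, (token, _label) in enumerate(ner_tags):
        if token.lower() == word:
            return i
    return -1

def get_is_idx_from_ner(ner_tags):
    return _token_index(ner_tags, "is")

def get_was_idx_from_ner(ner_tags):
    return _token_index(ner_tags, "was")

def get_in_idx_from_ner(ner_tags):
    return _token_index(ner_tags, "in")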
Example #6
def generate_question(sentence):
    """
    generates what question based on the sentence
        :param sentence: 
    """
    # sentence = "Old Kingdom is most commonly regarded as the period from the Third Dynasty through to the Sixth Dynasty ."
    # sentence = "King Djoser's architect, Imhotep is credited with the development of building with stone and with the conception of the new architectural form—the Step Pyramid."
    # sentence = "The Old Kingdom is perhaps best known for the large number of pyramids constructed at this time as burial places for Egypt's kings."
    # sentence = 'For this reason, the Old Kingdom is frequently referred to as "the Age of the Pyramids."'
    # sentence = "The first is called the Meidum pyramid, named for its location in Egypt."
    # sentence = "There were military expeditions into Canaan and Nubia, with Egyptian influence reaching up the Nile into what is today the Sudan."
    # sentence = "She is a forward for the Orlando Pride and the United States women's national soccer team."
    # sentence = """Alexandra "Alex" Patricia Morgan Carrasco (born July 2, 1989), née Alexandra Patricia Morgan, is an American soccer player, Olympic gold medalist, and FIFA Women's World Cup champion."""

    # sentence = "List of Olympic medalists in football"
    # util_service.get_dep_parse_tree(sentence)[1]
    # util_service.get_pos(sentence)

    sent_tokens = sentence.split()

    # Without an "is" there is no question to build
    try:
        is_idx = sent_tokens.index("is")
    except ValueError:
        return []

    # Question body: everything from "is" onward, skipping an
    # immediately following "of"
    passed_tokens = []
    for i, token in enumerate(sent_tokens):
        if i >= is_idx:
            if i == is_idx + 1 and token == "of":
                continue
            passed_tokens.append(token)

    # Replace a final "." with "?" whether it is its own token or glued
    # to the last word; otherwise append "?"
    if passed_tokens[-1] == ".":
        passed_tokens[-1] = "?"
    elif passed_tokens[-1].endswith("."):
        passed_tokens[-1] = re.sub(r"(.*)\.", r"\1?", passed_tokens[-1])
    else:
        passed_tokens.append("?")

    # Identify the question type from NER: GPE -> "What";
    # PERSON (or a preceding pronoun) -> "Who". The dependency parse was
    # not reliable here ("Old Kingdom" came back as a proper noun),
    # so NER tags are used instead.
    q_type = ""

    pos_tags = util_service.get_pos(sentence)
    ner_tags = util_service.get_ner_per_token(sentence)
    is_idx_ner = get_is_idx_from_ner(ner_tags)
    if is_idx_ner != -1:
        if ner_tags[is_idx_ner - 1][1] == "GPE":
            q_type = "What"
        elif ner_tags[is_idx_ner - 1][1] == "PERSON":
            q_type = "Who"
        elif ner_tags[is_idx_ner - 1][0] == "," and ner_tags[is_idx_ner - 2][1] == "PERSON":
            q_type = "Who"
        # Pronoun before "is"? Use the NER-token index rather than the
        # whitespace-split index, since the tokenizations can differ
        elif pos_tags[is_idx_ner - 1][1] == "PRON":
            q_type = "Who"
    if q_type == "":
        return []

    q = q_type + " " + " ".join(passed_tokens)
    return [q]
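
An indicative run of this version on one of the sample sentences it was developed against (outputs depend on the POS/NER models):

print(generate_question(
    "She is a forward for the Orlando Pride and the United States "
    "women's national soccer team."))
# e.g. ["Who is a forward for the Orlando Pride and the United States
#        women's national soccer team?"]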
Example #7
def generate_question(sentence):
    # print("ORIGINAL SENTENCE: ", sentence)
    # ner_only = util_service.get_ner(sentence)
    # print(ner_only)
    try:
        sent_tokens = sentence.split()

        # Index of "is"/"was" in the whitespace tokens, or -1 if absent
        is_idx = -1
        was_idx = -1
        try:
            is_idx = sent_tokens.index("is")
        except ValueError:
            pass
        try:
            was_idx = sent_tokens.index("was")
        except ValueError:
            pass

        # Question body: everything after the copula, skipping an
        # immediately following "of"
        pivot_idx = is_idx if is_idx != -1 else was_idx
        if pivot_idx == -1:
            return []
        passed_tokens = []
        for i, token in enumerate(sent_tokens):
            if i > pivot_idx:
                if i == pivot_idx + 1 and token == "of":
                    continue
                passed_tokens.append(token)

        # Replace a final "." with "?" whether it is its own token or glued
        # to the last word; otherwise append "?"
        if passed_tokens[-1] == ".":
            passed_tokens[-1] = "?"
        elif passed_tokens[-1].endswith("."):
            passed_tokens[-1] = re.sub(r"(.*)\.", r"\1?", passed_tokens[-1])
        else:
            passed_tokens.append("?")

        ner_tags = util_service.get_ner_per_token(sentence)
        ner_only = util_service.get_ner(sentence)
        is_idx_ner = get_is_idx_from_ner(ner_tags)
        was_idx_ner = get_was_idx_from_ner(ner_tags)

        substance_of_sent = " ".join(passed_tokens)

        # Prefer whichever copula appears later in the sentence
        if is_idx_ner > was_idx_ner:
            acomp_idx_ner = is_idx_ner
            acomp_word = "Is"
        else:
            acomp_idx_ner = was_idx_ner
            acomp_word = "Was"
        if acomp_idx_ner == -1:
            return []

        # Pick a plausible wrong answer matching the entity type of the
        # token before the copula
        entity_type = ner_tags[acomp_idx_ner - 1][1]
        if entity_type == "ORG":
            wrong = get_random_org()
        elif entity_type == "GPE":
            wrong = get_random_gpe()
        elif entity_type == "PERSON":
            wrong = get_random_name()
        elif entity_type == "DATE":
            wrong = get_diff_date(ner_tags[acomp_idx_ner - 1][0])
        elif entity_type == "LOC":
            wrong = get_random_loc()
        elif entity_type == "QUANTITY":
            wrong = get_random_number()
        elif entity_type == "MONEY":
            wrong = "$42"
        elif entity_type == "PERCENT":
            wrong = "42%"
        else:
            return []  # "wrong" would be unbound for other entity types
        
        q = (acomp_word + " " + ner_only[0][0] + " or " + wrong + " "
             + substance_of_sent)
        return [q]
    except Exception:
        # Any parse failure means "no question"
        return []
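
The distractor helpers (get_random_org, get_random_name, get_diff_date, and the rest) are also missing from these listings; their only contract is to return a plausible wrong answer of the matching entity type. Hypothetical minimal stand-ins:

# Hypothetical distractor helpers; a real implementation would draw from
# larger pools or a gazetteer.
import random
import re

def get_random_name():
    return random.choice(["Mia Hamm", "Abby Wambach", "Sydney Leroux"])

def get_random_gpe():
    return random.choice(["Pittsburgh", "Cairo", "Glasgow"])

def get_diff_date(date_text):
    # Shift any 4-digit year by one so the distractor stays well-formed
    return re.sub(r"\d{4}", lambda m: str(int(m.group(0)) + 1), date_text)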
Example #8
def generate_question(sentence):
    # print("-=-==-=-")
    # print(sentence)
    # print("-=-==-=-")

    ## pattern 1: sentences opening with a date phrase, e.g.
    ## "In 2013, Morgan appeared in television commercials for Bridgestone."

    # TODO: pattern 2 (trailing date phrase) to be built later, e.g.
    # "Morgan married Servando Carrasco, also a soccer player, on December 31, 2014."

    pos_tags = util_service.get_pos(sentence)
    ner_tags = util_service.get_ner_per_token(sentence)

    sent_tokens = util_service.get_tokenized_form(sentence)

    lemma_tokens = util_service.get_lemmatize_form(sentence)

    ## Pattern: "In {date}, <subject> <verb> ..."
    ## Only this pattern is handled; if it matches, questions are returned.
    ## Other patterns are to be identified and added later.

    # Greedily consume the leading date span: an opening preposition,
    # DATE-tagged tokens, and commas
    date_span = []
    for i, sent_token in enumerate(sent_tokens):
        if i == 0 and sent_token.lower() in ["in", "on", "throughout", "at"]:
            date_span.append(i)
            continue
        if ner_tags[i][1] == "DATE":
            date_span.append(i)
            continue
        if sent_token == ",":
            date_span.append(i)
            continue
        break

    if not date_span:
        return []

    question_sent = []
    first_verb_flag = False
    next_idx = date_span[-1] + 1
    if next_idx < len(pos_tags) and pos_tags[next_idx][1] == "PROPN":
        for i, sent_token in enumerate(sent_tokens):
            if i <= date_span[-1]:
                continue
            # Lemmatize only the first verb ("appeared" -> "appear");
            # lemmatizing later verbs overfits, e.g. "after defeating"
            # should not become "after defeat"
            if pos_tags[i][1] == "VERB" and not first_verb_flag:
                if sent_token != "was":  # skip the passive-voice auxiliary
                    question_sent.append(lemma_tokens[i])
                    first_verb_flag = True
                    continue
                continue
            if sent_token == "\n":
                continue
            question_sent.append(sent_token)

    if not question_sent:
        return []

    # Replace a final "." with "?" whether it is its own token or glued
    # to the last word; otherwise append "?"
    if question_sent[-1] == ".":
        question_sent[-1] = "?"
    elif question_sent[-1].endswith("."):
        question_sent[-1] = re.sub(r"(.*)\.", r"\1?", question_sent[-1])
    else:
        question_sent.append("?")

    q_when = "When did " + " ".join(question_sent)
    q_did = "Did " + " ".join(question_sent)

    return [q_when, q_did]
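
Finally, an indicative run of the date-pattern generator on the kind of sentence it targets (note the token join leaves a space before the "?"):

print(generate_question(
    "In 2013, Morgan appeared in television commercials for Bridgestone."))
# e.g. ['When did Morgan appear in television commercials for Bridgestone ?',
#       'Did Morgan appear in television commercials for Bridgestone ?']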