import re  # used by the question generators below

# util_service and the which_acomp / build_sides / get_*_idx_from_ner /
# get_random_* helpers used below are project-level utilities assumed to be
# importable in this module.


def localized_statement_pipeline(localized_statement):
    """Runs the shared NLP steps for a localized statement: dependency
    parse, index of the ROOT label, and per-token NER tags."""
    localized_dep_parse = util_service.get_dependency_parse(localized_statement)
    root_idx = localized_dep_parse.index("ROOT")
    ner_tokens = util_service.get_ner_per_token(localized_statement)
    return localized_dep_parse, root_idx, ner_tokens
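# A usage sketch; the exact return shapes depend on util_service (assumed
# here: the dependency parse is a list of labels containing "ROOT", and
# ner_tokens is a list of (token, tag) pairs aligned with it):
#
#   dep_parse, root_idx, ner_tokens = localized_statement_pipeline(
#       "Pittsburgh is a city in Pennsylvania.")
#   root_token = ner_tokens[root_idx][0]  # token aligned with the ROOT label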
def get_answer(question, localized_statement):
    """
    This is super bad. It finds the possible answers (aka everything past
    the first named entity, then splits that for the stuff before and after
    the "or" statements in the question), then looks for the candidate that
    occurred first in the statement. I want to make it so it looks for
    similarity and not a perfect comparison of strings.
    """
    ner_only = util_service.get_ner(question)
    probably_the_subject = ner_only[0][0]
    subject_start_i = question.find(probably_the_subject)
    subject_end_i = subject_start_i + len(probably_the_subject)
    # removes the first named entity
    q_no_sub = question[subject_end_i:]
    ner_tokens_q_no_sub = util_service.get_ner_per_token(q_no_sub)

    # This gets the possible answers by finding the words right before "or".
    # Does this on the string without the named entity, i.e.:
    # "Is Pittsburgh a city" --> "a city"
    possible_answers = []
    last_or_index = 0
    j = 0
    for i in range(len(ner_tokens_q_no_sub)):
        if ner_tokens_q_no_sub[i][0] == "or":
            last_or_index = i
            question_str = ""
            while j < i:
                question_str = question_str + ner_tokens_q_no_sub[j][0] + " "
                j += 1
            possible_answers.append(question_str)
            j = i + 1  # skip the "or" token so it doesn't start the next answer

    # For the last answer after the last "or"
    j = last_or_index + 1
    question_str = ""
    while j < len(ner_tokens_q_no_sub):
        question_str = question_str + ner_tokens_q_no_sub[j][0] + " "
        j += 1
    question_str = question_str[0:len(question_str) - 2]  # drops the trailing "? "
    possible_answers.append(question_str)

    # lists have no .strip(); strip each element individually
    new_pos_answers = []
    for answer in possible_answers:
        new_pos_answers.append(answer.strip())

    # MAKE BETTER: exact find() matching; answers not found return -1 and
    # are skipped by the > 0 check.
    possible_ans_index = 0
    current_closest = len(localized_statement)
    for index, answer in enumerate(new_pos_answers):
        answer_index = localized_statement.find(answer)
        if answer_index < current_closest and answer_index > 0:
            current_closest = answer_index
            possible_ans_index = index
    # return probably_the_subject + " " + new_pos_answers[possible_ans_index]
    return new_pos_answers[possible_ans_index]
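# The docstring above asks for similarity rather than an exact string
# comparison. A minimal sketch of that idea using difflib from the standard
# library; pick_closest_answer is a hypothetical replacement for the
# find()-based loop at the end of get_answer, not an existing helper.
import difflib


def pick_closest_answer(possible_answers, localized_statement):
    """Return the candidate whose best fuzzy match against any same-length
    window of the statement is highest, instead of requiring an exact hit."""
    best_answer, best_score = None, 0.0
    statement = localized_statement.lower()
    for answer in possible_answers:
        candidate = answer.lower()
        # Slide a candidate-sized window across the statement and keep the
        # best SequenceMatcher ratio seen for this candidate.
        for start in range(max(1, len(statement) - len(candidate) + 1)):
            window = statement[start:start + len(candidate)]
            score = difflib.SequenceMatcher(None, candidate, window).ratio()
            if score > best_score:
                best_score, best_answer = score, answer
    return best_answer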
def generate_question(sentence): """ Takes in a string sentence and generates a "Is" question. Puts "Is the " + named entity + rest_of_sentence + "?" """ ner_tags = util_service.get_ner_per_token(sentence) # ner_only = util_service.get_ner(sentence) # # No questions to be made without any named entities # if len(ner_only) == 0: # return [] # # Right now instead of this block it just adds "the " in the question later # # This block would add "the " to the named entity if "the" is # # in the sentence right before where the named entity is # ner_start = ner_only[0][1] # list_of_the = ["The", "the"] # # sentence[ner_start - 4:ner_start] looks for "the" # if sentence[ner_start - 4 : ner_start-1] in list_of_the: # ner_only[0][0] = "the " + ner_only[0][0] # print(ner_only) # print(ner_tokens_sentence) # Finds where the is word is, then is just going to append everything after it # into a question is_are_index = -1 # is_was_are_were is a string (one of "Is ", "Are ", "Was ", or "Were ") # is_are_index is where this string occurs in the tokens # is_was_are_were, is_are_index = which_acomp(ner_tokens_sentence) is_was_are_were, is_are_index = which_acomp(ner_tags) before_sentence_2, after_sentence_2 = build_sides(ner_tags, is_are_index) sentence_2 = is_was_are_were + before_sentence_2 + after_sentence_2 # # Could be elif here # if len(ner_only) > 0: # new_question = is_was_are_were # new_question += ner_only[0][0] + " " # if ("the" in ner_only[0][0] # or "The" in ner_only[0][0]): # new_question += ner_only[0][0] + " " # else: # new_question += ner_only[0][0] + " " # for j in range(is_are_index+1, len(ner_tokens_sentence) -1): #up to -1 as the last token is "." # new_question += ner_tokens_sentence[j][0] + " " # new_question = new_question[0:len(new_question)-1] # removes whitespace at end # new_question += "?" # return_list = [] # Is the return supposed to be a list? I guess multiple possible questions # return_list.append(new_question) sentence_2 = sentence_2[0:len(sentence_2) - 1] + "?" return [sentence_2]
def generate_question(sentence): # "is", "as a", "FAC", # NEEDS "was", "were", "are" ner_tokens_sentence = util_service.get_ner_per_token(sentence) ner_only = util_service.get_ner(sentence) if len(ner_only) == 0: return [] ner_start = ner_only[0][1] list_of_the = ["The", "the"] # sentence[ner_start - 4:ner_start] looks for "the" if sentence[ner_start - 4:ner_start - 1] in list_of_the: ner_only[0][0] = "the " + ner_only[0][0] # Finds where the is word is, then is just going to append everything after it # into a question is_are_index = -1 # flag is a string (one of "Is ", "Are ", "Was ", or "Were ") # is_are_index is where this string occurs in the tokens flag, is_are_index = which_acomp(ner_tokens_sentence) if flag == "": return [] flag = flag.lower() new_question = "What " + flag + ner_only[0][0] + "?" # q_dep_parse = util_service.get_dependency_parse(sentence) # print(q_dep_parse) # print(ner_tokens_sentence) return_list = [ ] # Is the return supposed to be a list? I guess multiple possible questions return_list.append(new_question) return return_list
def generate_question(sentence): """ generates what question based on the sentence :param sentence: """ # sentence = "Old Kingdom is most commonly regarded as the period from the Third Dynasty through to the Sixth Dynasty ." # sentence = "King Djoser's architect, Imhotep is credited with the development of building with stone and with the conception of the new architectural form—the Step Pyramid." # sentence = "The Old Kingdom is perhaps best known for the large number of pyramids constructed at this time as burial places for Egypt's kings." # sentence = 'For this reason, the Old Kingdom is frequently referred to as "the Age of the Pyramids."' # sentence = "The first is called the Meidum pyramid, named for its location in Egypt." # sentence = "There were military expeditions into Canaan and Nubia, with Egyptian influence reaching up the Nile into what is today the Sudan." # sentence = "She is a forward for the Orlando Pride and the United States women's national soccer team." # sentence = """Alexandra "Alex" Patricia Morgan Carrasco (born July 2, 1989), née Alexandra Patricia Morgan, is an American soccer player, Olympic gold medalist, and FIFA Women's World Cup champion.""" # sentence = "List of Olympic medalists in football" # util_service.get_dep_parse_tree(sentence)[1] # util_service.get_pos(sentence) is_idx = -1 was_idx = -1 born_idx = -1 in_idx = -1 sent_tokens = sentence.split() # get index of is try: is_idx = sent_tokens.index("is") except: pass try: born_idx = sent_tokens.index("born") except: pass try: was_idx = sent_tokens.index("was") except: pass try: in_idx = sent_tokens.index("in") except: pass ner_only = util_service.get_ner(sentence) # print(ner_only) ner_tags = util_service.get_ner_per_token(sentence) # print(ner_tags) # dep_parse = util_service.get_dep_parse_tree_Evan(sentence) #heads children # print(dep_parse) was_idx_ner = -1 is_idx_ner = -1 if was_idx != -1: was_idx_ner = get_was_idx_from_ner(ner_tags) if is_idx != -1: is_idx_ner = get_is_idx_from_ner(ner_tags) if in_idx != -1: in_idx_ner = get_in_idx_from_ner(ner_tags) # This is specific to where someone was born (rather, who was born in _location_) idx_ner = max(was_idx_ner, is_idx_ner) if (ner_tags[idx_ner - 1][1] == "PERSON"): if born_idx != -1: location = get_location(ner_tags, in_idx_ner, ner_only) # print("Loc: ", location) q = "Who was born in " + location + "?" # print([q]) return [q] else: q_type = "" # This tries to flip the end to the beginning, see following example: # "Some sets for the film were built in Glen Coe, Scotland, near the Clachaig Inn." # sentence 1 # "Were some sets for the film built in Glen Coe, Scotland, near the Clachaig Inn?" # sentence 2 # "Were some sets for the film built in Glen Coe, Scotland, near where?" # sentence 3 # "Where were some sets for the film built in Glen Coe, Scotland, near?" # sentence 4 try: is_was_are_were, is_are_index = which_acomp(ner_tags) before_sentence_2, after_sentence_2 = build_sides( ner_tags, is_are_index) sentence_2 = is_was_are_were + before_sentence_2 + after_sentence_2 # print(sentence_2) last_named_entity = ner_only[len(ner_only) - 1] sentence_4, ended_with_where_or_what = change_2_to_4( sentence_2, last_named_entity) if not ended_with_where_or_what: return [] return [sentence_4] # sentence_4 = change_2_to_4(sentence_3) # print(sentence_4) except: return [] return []
def generate_question(sentence): """ generates what question based on the sentence :param sentence: """ # sentence = "Old Kingdom is most commonly regarded as the period from the Third Dynasty through to the Sixth Dynasty ." # sentence = "King Djoser's architect, Imhotep is credited with the development of building with stone and with the conception of the new architectural form—the Step Pyramid." # sentence = "The Old Kingdom is perhaps best known for the large number of pyramids constructed at this time as burial places for Egypt's kings." # sentence = 'For this reason, the Old Kingdom is frequently referred to as "the Age of the Pyramids."' # sentence = "The first is called the Meidum pyramid, named for its location in Egypt." # sentence = "There were military expeditions into Canaan and Nubia, with Egyptian influence reaching up the Nile into what is today the Sudan." # sentence = "She is a forward for the Orlando Pride and the United States women's national soccer team." # sentence = """Alexandra "Alex" Patricia Morgan Carrasco (born July 2, 1989), née Alexandra Patricia Morgan, is an American soccer player, Olympic gold medalist, and FIFA Women's World Cup champion.""" # sentence = "List of Olympic medalists in football" # util_service.get_dep_parse_tree(sentence)[1] # util_service.get_pos(sentence) is_idx = 0 sent_tokens = sentence.split() # get index of is try: is_idx = sent_tokens.index("is") except: return [] # getting the end of question passed_tokens = [] for i, token in enumerate(sent_tokens): if i >= is_idx: if (i == is_idx+1): if token in ["of"]: continue passed_tokens.append(token) # replaces . with ? if it is last token or it is a part of last token if passed_tokens[-1] == ".": passed_tokens[-1] = "?" elif passed_tokens[-1].endswith("."): passed_tokens[-1] = re.sub("(.*)\.", "\\1?", passed_tokens[-1]) else: passed_tokens.append("?") # need to identify q_type # could be who, what # from NER # if it is GPE then "what" # if it is "PERSON" or "PRON" then "who" # dep parse is not providing expected output Old Kingdom is coming as pnoun q_type = "" # dep_parse = util_service.get_dep_parse_tree(sentence)[1] pos_tags = util_service.get_pos(sentence) ner_tags = util_service.get_ner_per_token(sentence) is_idx_ner = get_is_idx_from_ner(ner_tags) if is_idx_ner != -1: if ner_tags[is_idx_ner - 1][1] == "GPE": q_type = "What" elif ner_tags[is_idx_ner - 1][1] == "PERSON": q_type = "Who" elif ner_tags[is_idx_ner - 1][0] == "," and ner_tags[is_idx_ner - 2][1] == "PERSON": q_type = "Who" # checks if there is a pronoun before is word from pos tags elif pos_tags[is_idx - 1][1] == "PRON": q_type = "Who" if q_type == "": return [] q = q_type + " " + " ".join(passed_tokens) return [q]
def generate_question(sentence): # print("ORIGINAL SENTENCE: ", sentence) # ner_only = util_service.get_ner(sentence) # print(ner_only) try: is_idx = -1 was_idx = -1 sent_tokens = sentence.split() # get index of is try: is_idx = sent_tokens.index("is") except: pass try: was_idx = sent_tokens.index("was") except: pass # print("Passed try except") # print(is_idx) # print(was_idx) # getting the end of question if is_idx != -1: # print("is idx") passed_tokens = [] for i, token in enumerate(sent_tokens): if i > is_idx: if (i == is_idx+1): if token in ["of"]: continue passed_tokens.append(token) elif was_idx != -1: # print("was idx") passed_tokens = [] for i, token in enumerate(sent_tokens): if i > was_idx: if (i == was_idx+1): if token in ["of"]: continue passed_tokens.append(token) # print("Replacing last token") # print(passed_tokens) # replaces . with ? if it is last token or it is a part of last token if passed_tokens[-1] == ".": passed_tokens[-1] = "?" elif passed_tokens[-1].endswith("."): passed_tokens[-1] = re.sub("(.*)\.", "\\1?", passed_tokens[-1]) else: passed_tokens.append("?") # print("Doing ner_tags") ner_tags = util_service.get_ner_per_token(sentence) # print("doing ner_only") ner_only = util_service.get_ner(sentence) # print("doing is_idx") is_idx_ner = get_is_idx_from_ner(ner_tags) # print("Doing was_idx") was_idx_ner = get_was_idx_from_ner(ner_tags) # print("substance_of_sent now") substance_of_sent = " ".join(passed_tokens) # print(ner_only) # print(is_idx_ner) # print(was_idx_ner) acomp_idx_ner = -1 if is_idx_ner > was_idx_ner: acomp_idx_ner = is_idx_ner acomp_word = "Is" else: acomp_idx_ner = was_idx_ner acomp_word = "Was" # acomp_idx_ner = max(is_idx_ner, was_idx_ner) if acomp_idx_ner != -1: # print(ner_tags[acomp_idx_ner - 1][1]) if ner_tags[acomp_idx_ner - 1][1] == "ORG": wrong = get_random_org() elif ner_tags[acomp_idx_ner - 1][1] == "GPE": wrong = get_random_gpe() elif ner_tags[acomp_idx_ner - 1][1] == "PERSON": # print("Person") wrong = get_random_name() # print(wrong) elif ner_tags[acomp_idx_ner - 1][1] == "DATE": wrong = get_diff_date(ner_tags[acomp_idx_ner - 1][0]) elif ner_tags[acomp_idx_ner - 1][1] == "LOC": # q_type = "Who" wrong = get_random_loc() elif ner_tags[acomp_idx_ner - 1][1] == "QUANTITY": # q_type = "Who" wrong = get_random_number() elif ner_tags[acomp_idx_ner - 1][1] == "MONEY": # q_type = "Who" wrong = "$42" elif ner_tags[acomp_idx_ner - 1][1] == "PERCENT": # q_type = "Who" wrong = "42%" else: return [] # q = "Is " + ner_tags[acomp_idx_ner - 1][0] + " " + " ".join(passed_tokens) + " or " + wrong + "?" # print(ner_only) q = acomp_word + " " + ner_only[0][0] + " or " + wrong + " " + substance_of_sent # print(q) return [q] except: return []
def generate_question(sentence): # print("-=-==-=-") # print(sentence) # print("-=-==-=-") ## pattern 1 # sentence = "In January 2012, Morgan and national teammate Heather Mitts became brand ambassadors for health product company, GNC." # sentence = "In July 2011, Morgan signed a one-year endorsement deal with Bank of America." # sentence = "In 2013, Morgan appeared in television commercials for Bridgestone." # sentence = "In 2015, Morgan starred in a Nationwide Mutual Insurance Company commercial that that was broadcast nationwide in the United States." # sentence = "In May 2015, Morgan was featured on the cover of ESPN The Magazine with teammates Abby Wambach and Sydney Leroux." # sentence = "In 2013, Morgan appeared in the ESPN documentary series, Nine for IX." # sentence = "In May of the same year, Morgan likeness appeared on The Simpsons along with Christen Press and Abby Wambach." # sentence = "On August 31, 2013, Portland captured the inaugural National Women’s Soccer League championship title after defeating regular season champions Western New York Flash 2–0." # sentence = "Throughout the 2011 season, Morgan played in 14 matches and scored 4 goals." # sentence = "At age 17, Morgan was called up to the United States" # sentence = "In the 2012 London Olympics She scored the game-winning goal in the 123rd minute of the semifinal game against Canada." # TODO: # ## pattern 2 to be built later # sentence = "Morgan married Servando Carrasco, also a soccer player, on December 31, 2014." util_service.get_dep_parse_tree(sentence)[1] pos_tags = util_service.get_pos(sentence) ner_tags = util_service.get_ner_per_token(sentence) sent_tokens = util_service.get_tokenized_form(sentence) lemma_tokens = util_service.get_lemmatize_form(sentence) ## in {date}, Nsub Verb ## look for the above pattern only, if result found return ## other patterns to be identified and invoked later date_span = [] for i, sent_token in enumerate(sent_tokens): if i == 0 and sent_tokens[i].lower() in [ "in", "on", "throughout", "at" ]: date_span.append(i) continue if ner_tags[i][1] == "DATE": date_span.append(i) continue if sent_tokens[i] == ",": date_span.append(i) continue break if not len(date_span): return [] question_sent = [] first_verb_flag = False if pos_tags[date_span[-1] + 1][1] in ["PROPN"]: for i, sent_token in enumerate(sent_tokens): if i <= date_span[-1]: continue # convert only 1st verb to lemma # overfitting for sentences consiting two verbs, eg second verb defeating, would change to defeat -> On August 31, 2013, Portland captured the inaugural National Women’s Soccer League championship title after defeating regular season champions Western New York Flash 2–0. if pos_tags[i][1] == "VERB" and not first_verb_flag: if sent_token != "was": # check for passive voice question_sent.append(lemma_tokens[i]) first_verb_flag = True continue continue if sent_token == "\n": continue question_sent.append(sent_token) if not len(question_sent): return [] # replaces . with ? if it is last token or it is a part of last token if question_sent[-1] == ".": question_sent[-1] = "?" elif question_sent[-1].endswith("."): question_sent[-1] = re.sub("(.*)\.", "\\1?", question_sent[-1]) else: question_sent.append("?") q_when = "When did " + " ".join(question_sent) q_did = "Did " + " ".join(question_sent) return [q_when, q_did]