def preprocess_mention(self, m):
    """
    Responsible for preprocessing a mention and making sure we find
    a set of matching candidates in our database.

    :return: mention
    """

    # TODO: This can be optimised (fewer db calls required).
    cur_m = modify_uppercase_phrase(m)
    freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

    if not freq_lookup_cur_m:
        cur_m = m

    freq_lookup_m = self.wiki_db.wiki(m, "wiki", "freq")
    freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

    if freq_lookup_m and (freq_lookup_m > freq_lookup_cur_m):
        # Cases like 'U.S.' are handled badly by modify_uppercase_phrase.
        cur_m = m
        freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

    # If we cannot find the exact mention in our index, we try our luck
    # in a case-insensitive index.
    if not freq_lookup_cur_m:
        # Neither cur_m nor m was found; verify whether the lower-cased
        # version can be found.
        find_lower = self.wiki_db.wiki(m.lower(), "wiki", "lower")

        if find_lower:
            cur_m = find_lower
            freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

    # Try to remove leading or trailing punctuation (e.g. 'Washington,'
    # to 'Washington'). To avoid errors, we only try this if no match was
    # found thus far, else this might get in the way of 'U.S.' converting
    # to 'US'. This could be done recursively; interesting to explore in
    # future work.
    if not freq_lookup_cur_m:
        temp = re.sub(r"[\(.|,|!|')]", "", m).strip()
        simple_lookup = self.wiki_db.wiki(temp, "wiki", "freq")

        if simple_lookup:
            cur_m = temp

    return cur_m
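# Usage sketch (hypothetical; assumes an initialised instance `gen` of this
# class whose `wiki_db` is populated). It illustrates the fallback chain:
# uppercase-normalised lookup, exact lookup, case-insensitive lookup, and
# finally punctuation stripping. The instance name and example outputs are
# illustrative and depend on the contents of the frequency index.
#
#   gen.preprocess_mention("Washington,")  # e.g. -> "Washington" (punctuation
#                                          #         stripped as a last resort)
#   gen.preprocess_mention("U.S.")         # e.g. -> "U.S." (exact form kept if
#                                          #         it is the more frequent one)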
def process_aida(self, dataset):
    """
    Preprocesses AIDA into a format such that it can be used for training
    and evaluating the local ED model.

    :return: AIDA dataset with respective ground-truth values. In the case
        of AIDA-A/B (val and test respectively), this function returns both
        in a dictionary.
    """

    if dataset == "train":
        dataset = "aida_train.txt"
    elif dataset == "test":
        dataset = "testa_testb_aggregate_original"

    file_path = "{}{}".format(self.aida_path, dataset)
    sentences = {}

    sentence = []
    gt_sent = []
    contents = {}
    i_sent = 0

    total_cnt = 0
    missing_gt = 0
    doc_name = None
    prev_doc_name = None
    doc_cnt = 0
    cnt_replaced = 0

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            if "-DOCSTART-" in line:
                if len(sentence) > 0:
                    sentence_words = " ".join(sentence)
                    for gt in gt_sent:
                        assert (
                            sentence_words[gt[2]:gt[2] + len(gt[3])].lower()
                            == gt[3].lower()
                        ), "AIDA ground-truth incorrect position. {};{};{}".format(
                            sentence_words, gt[2], gt[3]
                        )
                    sentences[i_sent] = [sentence_words, gt_sent]

                    for _, _, pos, ment in gt_sent:
                        find_ment = sentence_words[pos:pos + len(ment)]
                        assert (
                            ment.lower() == find_ment.lower()
                        ), "Mention not found on position.. {}, {}, {}, {}".format(
                            ment, find_ment, pos, sentence_words
                        )

                if len(sentences) > 0:
                    contents[doc_name] = sentences

                words = split_in_words_mention(line)
                for w in words:
                    if ("testa" in w) or ("testb" in w):
                        doc_name = w.replace("(", "").replace(")", "")
                        break
                    else:
                        doc_name = line[12:]

                if ("testb" in doc_name) and ("testa" in prev_doc_name):
                    self.__save(self.__format(contents), "aida_testA")
                    contents = {}

                prev_doc_name = doc_name
                sentences = {}
                sentence = []
                gt_sent = []
                i_sent = 0
            else:
                parts = line.split("\t")
                assert len(parts) in [0, 1, 4, 6, 7], line

                if len(parts) <= 0:
                    continue

                if len(parts) in [7, 6] and parts[1] == "B":
                    y = parts[4].find("/wiki/") + len("/wiki/")
                    ent_title = parts[4][y:].replace("_", " ")
                    mention_gt = parts[2]
                    total_cnt += 1

                    if (
                        ent_title
                        not in self.wikipedia.wiki_id_name_map["ent_name_to_id"]
                    ):
                        ent_title_temp = self.wikipedia.preprocess_ent_name(
                            ent_title
                        )
                        if (
                            ent_title_temp
                            in self.wikipedia.wiki_id_name_map["ent_name_to_id"]
                        ):
                            ent_title = ent_title_temp
                            cnt_replaced += 1

                    pos_mention_gt = (
                        len(" ".join(sentence)) + 1 if len(sentence) > 0 else 0
                    )  # +1 for the space between the mention and the sentence.

                    gt_sent.append(
                        [
                            self.preprocess_mention(mention_gt),
                            ent_title,
                            pos_mention_gt,
                            mention_gt,
                        ]
                    )
                    words = mention_gt

                if len(parts) >= 2 and parts[1] == "B":
                    words = [
                        modify_uppercase_phrase(x)
                        for x in split_in_words_mention(parts[2])
                    ]
                elif len(parts) >= 2 and parts[1] == "I":
                    # Continuation of a mention that was already added in a
                    # prior iteration, so we skip it.
                    continue
                else:
                    words = [
                        modify_uppercase_phrase(w)
                        for w in split_in_words_mention(parts[0])
                    ]  # WAS _mention

                if (parts[0] == ".") and (len(sentence) > 0):
                    # End of sentence: store the sentence and any additional
                    # ground-truth mentions.
                    sentence_words = " ".join(sentence)
                    if i_sent in sentences:
                        i_sent += 1

                    sentences[i_sent] = [
                        sentence_words,
                        gt_sent,
                    ]  # unidecode.unidecode(sentence_words)
                    i_sent += 1

                    sentence = []
                    gt_sent = []
                elif len(words) > 0:
                    sentence += words

    if len(sentence) > 0:
        sentence_words = " ".join(sentence)
        sentences[i_sent] = [sentence_words, gt_sent]
    if len(sentences) > 0:
        contents[doc_name] = sentences

    if "train" in dataset:
        self.__save(self.__format(contents), "aida_train")
    else:
        self.__save(self.__format(contents), "aida_testB")

    print("Replaced {} ground truth entities".format(cnt_replaced))
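# Usage sketch (hypothetical; assumes `gen` is an initialised instance whose
# `aida_path` points at a directory containing the AIDA-CoNLL files named
# above). The instance name is illustrative.
#
#   gen.process_aida("train")  # reads aida_train.txt, saves "aida_train"
#   gen.process_aida("test")   # reads testa_testb_aggregate_original,
#                              # saves "aida_testA" and "aida_testB"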