def add_mention(self, qid: str, mention: str, score: float):
    """Add a mention to a QID with the associated score.

    If the mention is already associated with the QID, a warning is logged
    and nothing is changed; use ``set_score`` to update the score of an
    existing mention-QID pair. If the mention already has the maximum number
    of candidates, its last (lowest-scoring) candidate is removed to make
    room for the new QID.

    Args:
        qid: QID
        mention: mention
        score: score
    """
    # Cast to lower and stripped for aliases
    mention = utils.get_lnrm(mention, strip=True, lower=True)
    # If the mention is in the mapping, make sure the QID is not
    if mention in self._alias2qids:
        if qid in set(map(lambda x: x[0], self._alias2qids[mention])):
            logger.warning(
                f"The QID {qid} is already associated with {mention}. Use set_score if you want to change "
                f"the score of an existing mention-qid pair"
            )
            return
    # If the mention is not in the mapping, add it
    if mention not in self._alias2qids:
        self._alias2qids[mention] = []
        new_al_id = self.max_alid + 1
        self.max_alid += 1
        assert (
            new_al_id not in self._id2alias
        ), f"{new_al_id} already in self._id2alias"
        self._alias2id[mention] = new_al_id
        self._id2alias[new_al_id] = mention
        # msg = f"You have added a new mention to the dataset. You MUST reprep your data for this to "
        #       f"take effect. Set data_config.overwrite_preprocessed_data to be True. This warning "
        #       f"will now be suppressed."
        # logger.warning(msg)
        # warnings.filterwarnings("ignore", message=msg)
    assert (
        mention not in self._qid2aliases[qid]
    ), f"{mention} was a mention for {qid} despite the alias mapping saying otherwise"
    # If adding will go beyond max candidates, remove the last candidate. Even if the removed
    # candidate's score is higher than the new one, the user still wants this mention added.
    if len(self._alias2qids[mention]) >= self.max_candidates:
        qid_to_remove = self._alias2qids[mention][-1][0]
        self.remove_mention(qid_to_remove, mention)
        assert (
            len(self._alias2qids[mention]) < self.max_candidates
        ), f"Invalid state: {mention} still has more than {self.max_candidates} candidates after removal"
    # Add the pair and keep the candidate list sorted by descending score
    self._alias2qids[mention].append([qid, score])
    self._alias2qids[mention] = sorted(
        self._alias2qids[mention], key=lambda x: x[1], reverse=True
    )
    self._qid2aliases[qid].add(mention)
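# Usage sketch (illustrative, not part of the original class): assuming `profile` is an
# instance of the entity-profile class this method belongs to, and the QID/mention/score
# values below are hypothetical:
#
#     profile.add_mention("Q312", "apple inc", 10.0)  # adds the pair, candidates stay sorted
#     profile.add_mention("Q312", "apple inc", 12.0)  # logs a warning: pair exists, use set_score
#     profile.set_score("Q312", "apple inc", 12.0)    # the correct way to update a score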
from collections import defaultdict

import jsonlines
import ujson
from tqdm import tqdm

# parse_args and get_lnrm are defined elsewhere in this script/package.


def main():
    args = parse_args()
    alias2qids_dict = defaultdict(set)
    qid2freq = defaultdict(int)
    with jsonlines.open(args.train_file) as f:
        for line in f:
            # this includes weakly labelled aliases
            for qid, alias in zip(line["qids"], line["aliases"]):
                # aliases are lower-cased and normalized
                alias2qids_dict[get_lnrm(alias, strip=True, lower=True)].add(qid)
                qid2freq[qid] += 1
    alias2qids = {}
    for al in tqdm(alias2qids_dict):
        qid_cands = [[q, qid2freq[q]] for q in alias2qids_dict[al]]
        qid_cands = sorted(qid_cands, key=lambda x: x[1], reverse=True)
        alias2qids[al] = qid_cands
    with open(args.alias2qids_file, "w") as f:
        ujson.dump(alias2qids, f)
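# Illustrative input/output (QIDs and counts are made up): given train lines like
#     {"qids": ["Q312", "Q89"], "aliases": ["Apple", "apple"]}
# the output JSON maps each normalized alias to its candidate QIDs sorted by
# descending frequency across the train file, e.g.
#     {"apple": [["Q312", 5], ["Q89", 2]]}
# The exact CLI flags depend on parse_args (defined elsewhere); --train_file and
# --alias2qids_file are implied by the attribute names used above.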
import json
from collections import Counter

import nltk
from tqdm import tqdm

# get_lnrm is defined elsewhere; all_aliases is set as a module-level global
# (e.g., by a multiprocessing initializer) before this worker runs.


def compute_occurrences_single(args, max_alias_len=6):
    data_file, lower, strip = args
    num_lines = sum(1 for _ in open(data_file))
    global all_aliases
    # entity histogram
    ent_occurrences = Counter()
    # alias histogram
    alias_occurrences = Counter()
    # alias text occurrences
    alias_text_occurrences = Counter()
    # number of aliases per sentence
    alias_pair_occurrences = Counter()
    # alias|entity histogram
    alias_entity_pair = Counter()
    with open(data_file, "r") as in_file:
        for line in tqdm(in_file, total=num_lines):
            line = json.loads(line.strip())
            for n in range(max_alias_len + 1, 0, -1):
                grams = nltk.ngrams(line["sentence"].split(), n)
                for gram_words in grams:
                    gram_attempt = get_lnrm(" ".join(gram_words), lower, strip)
                    if gram_attempt in all_aliases:
                        alias_text_occurrences[gram_attempt] += 1
            # Get aliases in Wikipedia _before_ the swapping - these represent the true textual aliases
            aliases = line["unswap_aliases"]
            qids = line["qids"]
            for qid, alias in zip(qids, aliases):
                ent_occurrences[qid] += 1
                alias_occurrences[alias] += 1
                alias_entity_pair[alias + "|" + qid] += 1
            alias_pair_occurrences[len(aliases)] += 1
    results = {
        "ent_occurrences": ent_occurrences,
        "alias_occurrences": alias_occurrences,
        "alias_text_occurrences": alias_text_occurrences,
        "alias_pair_occurrences": alias_pair_occurrences,
        "alias_entity_pair": alias_entity_pair,
    }
    return results
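# Worker-style usage sketch (names illustrative): the packed-tuple signature suggests this
# function is mapped over files with multiprocessing, with `all_aliases` populated as a
# module-level global in each worker beforehand:
#
#     all_aliases = load_all_aliases(...)  # hypothetical loader; a set/trie of aliases
#     results = compute_occurrences_single(("train.jsonl", True, True))  # (file, lower, strip)
#     print(results["alias_occurrences"].most_common(10))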
# nlp (spaCy pipeline), table, KEEP_POS, PLURAL, PUNC, ALL_STOPWORDS, get_lnrm, and
# get_new_to_old_dict are module-level objects defined elsewhere in this file.


def find_aliases_in_sentence_tag(sentence, all_aliases, max_alias_len=6):
    """Mention extraction function.

    Args:
        sentence: text
        all_aliases: Trie of all aliases in our saved alias mapping
        max_alias_len: maximum length (in words) of an alias

    Returns:
        list of aliases, list of span offsets
    """
    used_aliases = []
    # Remove multiple spaces and replace with single - tokenization eats multiple spaces but
    # ngrams doesn't, which can cause parse issues
    sentence = " ".join(sentence.strip().split())
    doc = nlp(sentence)
    split_sent = sentence.split()
    new_to_old_span = get_new_to_old_dict(split_sent)
    # Find the largest aliases first
    for n in range(max_alias_len + 1, 0, -1):
        grams = nltk.ngrams(doc, n)
        j_st = -1
        j_end = n - 1
        for gram_words in grams:
            j_st += 1
            j_end += 1
            j_st_adjusted = new_to_old_span[j_st]
            j_end_adjusted = new_to_old_span[j_end]
            # Check if nlp has split the word and we are looking at a subword mention - which we don't want
            is_subword = j_st_adjusted == j_end_adjusted
            if j_st > 0:
                is_subword = is_subword | (j_st_adjusted == new_to_old_span[j_st - 1])
            # j_end is exclusive and should be a new word from the previous j_end - 1
            is_subword = is_subword | (j_end_adjusted == new_to_old_span[j_end - 1])
            if is_subword:
                continue
            # Assert we are a full word
            assert (
                j_st_adjusted != j_end_adjusted
            ), f"Something went wrong getting mentions for {sentence}"
            # If single word and not in a POS we care about, skip
            if len(gram_words) == 1 and gram_words[0].pos_ not in KEEP_POS:
                continue
            # If multiple words and no word is in a POS we care about, skip
            if len(gram_words) > 1 and not any(g.pos_ in KEEP_POS for g in gram_words):
                continue
            # If we are part of a proper noun, make sure there isn't another part of the proper noun to the
            # left or right - this means we didn't have the entire name in our alias and we should skip
            if len(gram_words) == 1 and gram_words[0].pos_ == "PROPN":
                if j_st > 0 and doc[j_st - 1].pos_ == "PROPN":
                    continue
                # End spans are exclusive so no +1
                if j_end < len(doc) and doc[j_end].pos_ == "PROPN":
                    continue
            # Skip mentions that begin or end with a plural marker, or that begin with a stopword,
            # unless the stopword is capitalized and isn't the first word of the sentence
            if (
                gram_words[-1].text in PLURAL
                or gram_words[0].text in PLURAL
                or (
                    gram_words[0].text.lower() in ALL_STOPWORDS
                    and (not gram_words[0].text[0].isupper() or j_st == 0)
                )
            ):
                continue
            # If the gram starts or ends with punctuation that is separated from the word by a space,
            # skip; keep it if the punctuation is part of the word boundary
            if (
                gram_words[0].text in PUNC
                and (j_st + 1 >= len(doc) or new_to_old_span[j_st] != new_to_old_span[j_st + 1])
            ) or (
                gram_words[-1].text in PUNC
                and (j_end - 2 < 0 or new_to_old_span[j_end - 1] != new_to_old_span[j_end - 2])
            ):
                continue
            joined_gram = " ".join(split_sent[j_st_adjusted:j_end_adjusted])
            # If 's is in the alias, make sure we remove the space and try that alias, too
            joined_gram_merged_plural = joined_gram.replace(" 's", "'s")
            # If PUNC is in the alias, make sure we remove the punctuation and try that alias, too
            joined_gram_merged_nopunc = joined_gram_merged_plural.translate(table)
            gram_attempt = get_lnrm(joined_gram, strip=True, lower=True)
            gram_attempt_merged_plural = get_lnrm(
                joined_gram_merged_plural, strip=True, lower=True
            )
            gram_attempt_merged_nopunc = get_lnrm(
                joined_gram_merged_nopunc, strip=True, lower=True
            )
            # Remove numbers
            if (
                gram_attempt.isnumeric()
                or joined_gram_merged_plural.isnumeric()
                or gram_attempt_merged_nopunc.isnumeric()
            ):
                continue
            final_gram = None
            if gram_attempt in all_aliases:
                final_gram = gram_attempt
            elif gram_attempt_merged_plural in all_aliases:
                final_gram = gram_attempt_merged_plural
            elif gram_attempt_merged_nopunc in all_aliases:
                final_gram = gram_attempt_merged_nopunc
            if final_gram is not None:
                keep = True
                # We start from the largest n-grams and go down in size. This prevents us from adding an
                # alias that is a subset of another. For example: "Tell me about the mother on how I met
                # your mother" will find "the mother" as an alias and "mother". We want to only take
                # "the mother" and not "mother" as it's likely more descriptive of the real entity.
                for u_al in used_aliases:
                    u_j_st = u_al[1]
                    u_j_end = u_al[2]
                    if j_st_adjusted < u_j_end and j_end_adjusted > u_j_st:
                        keep = False
                        break
                if not keep:
                    continue
                used_aliases.append(tuple([final_gram, j_st_adjusted, j_end_adjusted]))
    # Sort based on span order
    aliases_for_sorting = sorted(used_aliases, key=lambda elem: [elem[1], elem[2]])
    used_aliases = [a[0] for a in aliases_for_sorting]
    spans = [[a[1], a[2]] for a in aliases_for_sorting]
    assert all([sp[1] <= len(doc) for sp in spans]), f"{spans} {sentence}"
    return used_aliases, spans
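# Behavior sketch (sentence and alias contents illustrative): because the loop tries the
# largest n-grams first and rejects spans that overlap an already-kept alias, a trie
# containing both "the mother" and "mother" yields only the longer span:
#
#     aliases, spans = find_aliases_in_sentence_tag(
#         "Tell me about the mother on How I Met Your Mother", all_aliases
#     )
#     # aliases == ["the mother", ...]; spans are [start, end) offsets into the
#     # whitespace-split sentence (mapped back through new_to_old_span)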