    def add_mention(self, qid: str, mention: str, score: float):
        """Add a mention to the QID with the associated score.

        If the mention-QID pair already exists, a warning is logged and nothing
        is changed; use ``set_score`` to change the score of an existing pair.
        If the mention already has the maximum number of candidates, the
        lowest-scoring candidate is removed to make room for the QID.

        Args:
            qid: QID
            mention: mention
            score: score
        """
        # Normalize the mention (lowercased and stripped) to match how aliases are stored
        mention = utils.get_lnrm(mention, strip=True, lower=True)

        # If the mention is already in the mapping, make sure the QID is not already a candidate
        if mention in self._alias2qids:
            if qid in set(map(lambda x: x[0], self._alias2qids[mention])):
                logger.warning(
                    f"The QID {qid} is already associated with {mention}. Use set_score if you want to change "
                    f"the score of an existing mention-qid pair")
                return
        # If mention is not in mapping, add it
        if mention not in self._alias2qids:
            self._alias2qids[mention] = []
            new_al_id = self.max_alid + 1
            self.max_alid += 1
            assert (
                new_al_id
                not in self._id2alias), f"{new_al_id} already in self._id2alias"
            self._alias2id[mention] = new_al_id
            self._id2alias[new_al_id] = mention
            # msg = f"You have added a new mention to the dataset. You MUST reprep you data for this to take effect.
            # Set data_config.overwrite_preprocessed_data to be True. This warning will now be supressed."
            # logger.warning(msg)
            # warnings.filterwarnings("ignore", message=msg)

        assert (
            mention not in self._qid2aliases[qid]
        ), f"{mention} was a mention for {qid} despite the alias mapping saying otherwise"
        # If adding would exceed max candidates, remove the lowest-scoring (last) candidate. Even if the
        # removed candidate's score is higher than the new one, the user still wants this mention added.
        if len(self._alias2qids[mention]) >= self.max_candidates:
            qid_to_remove = self._alias2qids[mention][-1][0]
            self.remove_mention(qid_to_remove, mention)
            assert (
                len(self._alias2qids[mention]) < self.max_candidates
            ), f"Invalid state: {mention} still has more than {self.max_candidates} candidates after removal"
        # Add pair
        self._alias2qids[mention].append([qid, score])
        self._alias2qids[mention] = sorted(self._alias2qids[mention],
                                           key=lambda x: x[1],
                                           reverse=True)
        self._qid2aliases[qid].add(mention)
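

# --- Usage sketch (hypothetical, not from the source): the candidate-list invariant that
# add_mention maintains. Per normalized mention, at most `max_candidates` [QID, score]
# pairs are kept, sorted by descending score; at capacity the lowest-scoring pair is
# evicted before the new QID is appended. The helper and QIDs below are illustrative only.
def _add_candidate_sketch(alias2qids, mention, qid, score, max_candidates=2):
    """Append [qid, score] to the mention's candidates, evicting the lowest score at capacity."""
    cands = alias2qids.setdefault(mention, [])
    if any(existing_qid == qid for existing_qid, _ in cands):
        return  # pair already present; a real caller would use set_score instead
    if len(cands) >= max_candidates:
        cands.pop()  # list is kept sorted descending, so the last entry has the lowest score
    cands.append([qid, score])
    cands.sort(key=lambda x: x[1], reverse=True)


# With max_candidates=2, adding a third, lower-scoring QID evicts the score-3.0 candidate:
# _a2q = {}
# _add_candidate_sketch(_a2q, "paris", "Q90", 10.0)
# _add_candidate_sketch(_a2q, "paris", "Q167646", 3.0)
# _add_candidate_sketch(_a2q, "paris", "Q830149", 5.0)
# _a2q == {"paris": [["Q90", 10.0], ["Q830149", 5.0]]}
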
def main():
    args = parse_args()
    alias2qids_dict = defaultdict(set)
    qid2freq = defaultdict(int)
    with jsonlines.open(args.train_file) as f:
        for line in f:
            # this includes weakly labelled aliases
            for qid, alias in zip(line["qids"], line["aliases"]):
                # aliases are lower-cased
                alias2qids_dict[get_lnrm(alias, strip=True,
                                         lower=True)].add(qid)
                qid2freq[qid] += 1

    alias2qids = {}
    for al in tqdm(alias2qids_dict):
        qid_cands = [[q, qid2freq[q]] for q in alias2qids_dict[al]]
        qid_cands = sorted(qid_cands, key=lambda x: x[1], reverse=True)
        alias2qids[al] = qid_cands

    with open(args.alias2qids_file, "w") as f:
        ujson.dump(alias2qids, f)
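

# --- Minimal sketch (assumed data) of the frequency-ranked candidate build done in main().
# The training lines, aliases, and QIDs below are made up, and get_lnrm is replaced by a
# plain lower/strip so the sketch runs without the project utilities.
from collections import defaultdict


def _build_alias2qids_sketch(lines):
    """Rank each alias's candidate QIDs by how often each QID occurs across all lines."""
    alias2qids_dict = defaultdict(set)
    qid2freq = defaultdict(int)
    for line in lines:
        for qid, alias in zip(line["qids"], line["aliases"]):
            alias2qids_dict[alias.strip().lower()].add(qid)
            qid2freq[qid] += 1
    return {
        al: sorted([[q, qid2freq[q]] for q in qids], key=lambda x: x[1], reverse=True)
        for al, qids in alias2qids_dict.items()
    }


# "lincoln" ranks Q91 first because it occurs twice across the (made-up) corpus:
# _build_alias2qids_sketch([
#     {"qids": ["Q91"], "aliases": ["Lincoln"]},
#     {"qids": ["Q91", "Q28260"], "aliases": ["Abraham Lincoln", "Lincoln"]},
# ]) == {"lincoln": [["Q91", 2], ["Q28260", 1]], "abraham lincoln": [["Q91", 2]]}
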
def compute_occurrences_single(args, max_alias_len=6):
    data_file, lower, strip = args
    num_lines = sum(1 for _ in open(data_file))
    global all_aliases
    # entity histogram
    ent_occurrences = Counter()
    # alias histogram
    alias_occurrences = Counter()
    # alias text occurrences
    alias_text_occurrences = Counter()
    # number of aliases per sentence
    alias_pair_occurrences = Counter()
    # alias|entity histogram
    alias_entity_pair = Counter()
    with open(data_file, "r") as in_file:
        for line in tqdm(in_file, total=num_lines):
            line = json.loads(line.strip())
            for n in range(max_alias_len + 1, 0, -1):
                grams = nltk.ngrams(line["sentence"].split(), n)
                for gram_words in grams:
                    gram_attempt = get_lnrm(" ".join(gram_words), lower, strip)
                    if gram_attempt in all_aliases:
                        alias_text_occurrences[gram_attempt] += 1
            # Get aliases in wikipedia _before_ the swapping - these represent the true textual aliases
            aliases = line["unswap_aliases"]
            qids = line["qids"]
            for qid, alias in zip(qids, aliases):
                ent_occurrences[qid] += 1
                alias_occurrences[alias] += 1
                alias_entity_pair[alias + "|" + qid] += 1
            alias_pair_occurrences[len(aliases)] += 1
    results = {
        "ent_occurrences": ent_occurrences,
        "alias_occurrences": alias_occurrences,
        "alias_text_occurrences": alias_text_occurrences,
        "alias_pair_occurrences": alias_pair_occurrences,
        "alias_entity_pair": alias_entity_pair,
    }
    return results
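

# --- Hypothetical aggregation step: compute_occurrences_single is presumably mapped over
# several data files (e.g. with multiprocessing.Pool) and the per-file Counters merged.
# That driver is not part of this excerpt; the reducer below is only a sketch of the merge.
from collections import Counter


def merge_occurrence_results(results_list):
    """Sum the Counter fields of several compute_occurrences_single outputs key by key."""
    merged = {}
    for results in results_list:
        for key, counter in results.items():
            merged[key] = merged.get(key, Counter()) + counter
    return merged
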
def find_aliases_in_sentence_tag(sentence, all_aliases, max_alias_len=6):
    """Mention extraction function.

    Args:
        sentence: text
        all_aliases: Trie of all aliases in our saved alias mapping
        max_alias_len: maximum length (in words) of an alias

    Returns: list of aliases, list of span offsets
    """
    used_aliases = []
    # Collapse runs of whitespace into single spaces - tokenization eats multiple spaces but
    # ngrams doesn't, which can cause parse issues
    sentence = " ".join(sentence.strip().split())

    doc = nlp(sentence)
    split_sent = sentence.split()
    new_to_old_span = get_new_to_old_dict(split_sent)
    # find largest aliases first
    for n in range(max_alias_len + 1, 0, -1):
        grams = nltk.ngrams(doc, n)
        j_st = -1
        j_end = n - 1
        for gram_words in grams:
            j_st += 1
            j_end += 1
            j_st_adjusted = new_to_old_span[j_st]
            j_end_adjusted = new_to_old_span[j_end]
            # Check if nlp has split the word and we are looking at a subword mention - which we don't want
            is_subword = j_st_adjusted == j_end_adjusted
            if j_st > 0:
                is_subword = is_subword | (j_st_adjusted
                                           == new_to_old_span[j_st - 1])
            # j_end is exclusive; if it maps to the same original word as j_end - 1, the gram ends mid-word
            is_subword = is_subword | (j_end_adjusted
                                       == new_to_old_span[j_end - 1])
            if is_subword:
                continue
            # Assert we are a full word
            assert (j_st_adjusted != j_end_adjusted
                    ), f"Something went wrong getting mentions for {sentence}"
            # If single word and not in a POS we care about, skip
            if len(gram_words) == 1 and gram_words[0].pos_ not in KEEP_POS:
                continue
            # If multiple word and not any word in a POS we care about, skip
            if len(gram_words) > 1 and not any(g.pos_ in KEEP_POS
                                               for g in gram_words):
                continue
            # print("@", gram_words, [g.pos_ for g in gram_words])
            # If we are part of a proper noun, make sure there isn't another part of the proper noun to the
            # left or right - this means we didn't have the entire name in our alias and we should skip
            if len(gram_words) == 1 and gram_words[0].pos_ == "PROPN":
                if j_st > 0 and doc[j_st - 1].pos_ == "PROPN":
                    continue
                # End spans are exclusive so no +1
                if j_end < len(doc) and doc[j_end].pos_ == "PROPN":
                    continue
            # print("3", j_st, gram_words, [g.pos_ for g in gram_words])
            # Skip grams that start or end with a possessive token, or that start with a stopword,
            # unless the stopword is capitalized and is not the first word of the sentence
            if (gram_words[-1].text in PLURAL or gram_words[0].text in PLURAL
                    or (gram_words[0].text.lower() in ALL_STOPWORDS and
                        (not gram_words[0].text[0].isupper() or j_st == 0))):
                continue
            # If the gram starts or ends with punctuation that is separated by a space (i.e. not attached
            # to a word), skip it; keep it when the punctuation is part of the word boundary
            if (gram_words[0].text in PUNC and
                (j_st + 1 >= len(doc)
                 or new_to_old_span[j_st] != new_to_old_span[j_st + 1])) or (
                     gram_words[-1].text in PUNC and
                     (j_end - 2 < 0 or new_to_old_span[j_end - 1] !=
                      new_to_old_span[j_end - 2])):
                continue
            joined_gram = " ".join(split_sent[j_st_adjusted:j_end_adjusted])
            # If 's in alias, make sure we remove the space and try that alias, too
            joined_gram_merged_plural = joined_gram.replace(" 's", "'s")
            # If punctuation is in the alias, also try the alias with the punctuation removed
            joined_gram_merged_nopunc = joined_gram_merged_plural.translate(
                table)
            gram_attempt = get_lnrm(joined_gram, strip=True, lower=True)
            gram_attempt_merged_plural = get_lnrm(joined_gram_merged_plural,
                                                  strip=True,
                                                  lower=True)
            gram_attempt_merged_nopunc = get_lnrm(joined_gram_merged_nopunc,
                                                  strip=True,
                                                  lower=True)
            # Remove numbers
            if (gram_attempt.isnumeric()
                    or joined_gram_merged_plural.isnumeric()
                    or gram_attempt_merged_nopunc.isnumeric()):
                continue
            final_gram = None
            # print("4", gram_attempt, [g.pos_ for g in gram_words])
            if gram_attempt in all_aliases:
                final_gram = gram_attempt
            elif gram_attempt_merged_plural in all_aliases:
                final_gram = gram_attempt_merged_plural
            elif gram_attempt_merged_nopunc in all_aliases:
                final_gram = gram_attempt_merged_nopunc
                # print("5", final_gram, [g.pos_ for g in gram_words])
            # print("FINAL GRAM", final_gram)
            if final_gram is not None:
                keep = True
                # We start from the largest n-grams and go down in size. This prevents us from adding an alias that
                # is a subset of another. For example: "Tell me about the mother on How I Met Your Mother" will find
                # "the mother" as an alias as well as "mother". We only want to take "the mother" and not "mother",
                # as it's likely more descriptive of the real entity.
                for u_al in used_aliases:
                    u_j_st = u_al[1]
                    u_j_end = u_al[2]
                    if j_st_adjusted < u_j_end and j_end_adjusted > u_j_st:
                        keep = False
                        break
                if not keep:
                    continue
                used_aliases.append(
                    tuple([final_gram, j_st_adjusted, j_end_adjusted]))
    # sort based on span order
    aliases_for_sorting = sorted(used_aliases,
                                 key=lambda elem: [elem[1], elem[2]])
    used_aliases = [a[0] for a in aliases_for_sorting]
    spans = [[a[1], a[2]] for a in aliases_for_sorting]
    assert all([sp[1] <= len(doc) for sp in spans]), f"{spans} {sentence}"
    return used_aliases, spans
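

# --- Standalone sketch of the overlap filter used above (illustrative spans only). Candidate
# mentions are generated from the largest n-grams down, and a new span is kept only if it
# does not overlap an already accepted span, so e.g. "the mother" suppresses the nested
# "mother". The helper below isolates that check on made-up (alias, start, end) tuples
# with exclusive end offsets.
def keep_largest_nonoverlapping(candidates):
    """Accept spans in the given (largest-first) order, dropping any span that overlaps an accepted one."""
    kept = []
    for alias, start, end in candidates:
        if all(end <= k_start or start >= k_end for _, k_start, k_end in kept):
            kept.append((alias, start, end))
    return sorted(kept, key=lambda t: (t[1], t[2]))


# "the mother" (span [3, 5)) is accepted first; the nested "mother" (span [4, 5)) overlaps and is dropped:
# keep_largest_nonoverlapping([("the mother", 3, 5), ("mother", 4, 5)]) == [("the mother", 3, 5)]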