def preprocess_mention(self, m):
        """
        Normalise a mention so that it matches an entry in the candidate database.

        Variants of the mention are tried in the following order; a later step is
        only attempted while the current choice has no "freq" entry in ``self.wiki_db``:

        1. ``modify_uppercase_phrase(m)``, unless that form is unknown or the raw
           mention ``m`` has a strictly higher frequency, in which case ``m`` is used.
        2. The value stored for ``m.lower()`` in the "lower" (case-insensitive) index.
        3. ``m`` with the characters ``( ) . , ! ' |`` removed and surrounding
           whitespace stripped.

        Every frequency is kept next to the string it belongs to, so a string is not
        queried again merely because ``cur_m`` was reassigned. This assumes the
        lookups are deterministic for the duration of the call.

        :param m: mention text as it appears in the document.
        :return: the selected variant of the mention. If no variant has a "freq"
            entry, this is the "lower" index hit when there is one, else ``m``.
        """

        cur_m = modify_uppercase_phrase(m)
        freq_m = self.wiki_db.wiki(m, "wiki", "freq")
        # No second query needed when the normalisation left the mention untouched.
        freq_cur_m = (
            freq_m if cur_m == m else self.wiki_db.wiki(cur_m, "wiki", "freq")
        )

        # Fall back to the raw mention when the normalised form is unknown, or when
        # the raw form is more frequent (cases like 'U.S.' are handled badly by
        # modify_uppercase_phrase). The short-circuit keeps the comparison from ever
        # seeing a falsy (e.g. missing) frequency on the right-hand side.
        if not freq_cur_m or (freq_m and freq_m > freq_cur_m):
            cur_m = m
            freq_cur_m = freq_m

        # If we cannot find the exact mention in our index, we try our luck to
        # find it in a case insensitive index.
        if not freq_cur_m:
            # cur_m and m both not found, verify if lower-case version can be found.
            find_lower = self.wiki_db.wiki(m.lower(), "wiki", "lower")

            if find_lower:
                cur_m = find_lower
                freq_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

        # Try removing punctuation (e.g. 'Washington,' to 'Washington').
        # To be less error prone, we only try this if no match was found thus far,
        # else this might get in the way of 'U.S.' converting to 'US'.
        # Could do this recursively, interesting to explore in future work.
        if not freq_cur_m:
            # NOTE: inside a character class '|' is a literal pipe, so pipes are
            # removed as well; matches are removed anywhere in the mention, not
            # only at its first or last position.
            temp = re.sub(r"[\(.|,|!|')]", "", m).strip()

            if self.wiki_db.wiki(temp, "wiki", "freq"):
                cur_m = temp

        return cur_m
Exemple #2
0
    def process_aida(self, dataset):
        """
        Preprocesses AIDA into a format that can be used for training and evaluating the local ED model.

        The input file is read line by line:

        * A line containing ``-DOCSTART-`` closes the current document and opens a new one.
        * Any other line is split on tabs; column 0 is the token.
        * ``B`` in column 1 starts a mention whose surface form is in column 2. When the
          line has 6 or 7 columns, the entity title is taken from the text following
          ``/wiki/`` in column 4 and a ground-truth entry
          ``[preprocessed mention, entity title, character offset, raw mention]`` is recorded.
        * ``I`` in column 1 continues a mention that was already added and is skipped.
        * A token ``.`` closes the current sentence (if it is non-empty).

        Per document, sentences are stored as ``{sentence index: [sentence text, ground truth]}``.

        :param dataset: "train" or "test" (mapped to the corresponding AIDA file name), or
            a file name relative to ``self.aida_path``.
        :return: None. The collected documents are passed through ``self.__format`` and handed
            to ``self.__save``: as "aida_train" when the file name contains "train"; otherwise
            as "aida_testA" (at the point where the documents switch from testa to testb) and
            "aida_testB" (everything collected after that point).
        """

        if dataset == "train":
            dataset = "aida_train.txt"
        elif dataset == "test":
            dataset = "testa_testb_aggregate_original"

        file_path = "{}{}".format(self.aida_path, dataset)
        ent_name_to_id = self.wikipedia.wiki_id_name_map["ent_name_to_id"]

        contents = {}  # doc name -> sentences of that document
        sentences = {}  # sentence index -> [sentence text, ground truth]
        sentence = []  # words of the sentence currently being built
        gt_sent = []  # ground-truth entries of the sentence currently being built
        i_sent = 0

        doc_name = None
        prev_doc_name = None
        cnt_replaced = 0

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()

                if "-DOCSTART-" in line:
                    # Flush a sentence that was not terminated by "." before the
                    # previous document ended.
                    if sentence:
                        sentence_words = " ".join(sentence)
                        # Every recorded offset must point at its mention (case-insensitive).
                        for _, _, pos, ment in gt_sent:
                            assert (
                                sentence_words[pos:pos + len(ment)].lower() ==
                                ment.lower()
                            ), "AIDA ground-truth incorrect position. {};{};{}".format(
                                sentence_words, pos, ment)
                        sentences[i_sent] = [sentence_words, gt_sent]

                    if sentences:
                        contents[doc_name] = sentences

                    # Test documents are named after the word containing testa/testb;
                    # otherwise the name is whatever follows the first 12 characters
                    # (presumably the "-DOCSTART- (" prefix).
                    words = split_in_words_mention(line)
                    for w in words:
                        if ("testa" in w) or ("testb" in w):
                            doc_name = w.replace("(", "").replace(")", "")
                            break
                        else:
                            doc_name = line[12:]

                    # First testb document after the testa documents: everything
                    # collected so far is the testA split. The truthiness guards avoid
                    # a TypeError on the first document, when prev_doc_name is None.
                    if (doc_name and prev_doc_name and ("testb" in doc_name)
                            and ("testa" in prev_doc_name)):
                        self.__save(self.__format(contents), "aida_testA")
                        contents = {}

                    prev_doc_name = doc_name
                    sentences = {}
                    sentence = []
                    gt_sent = []
                    i_sent = 0
                else:
                    # str.split with an explicit separator always yields at least one
                    # element, so a blank line arrives here as [""].
                    parts = line.split("\t")
                    assert len(parts) in [0, 1, 4, 6, 7], line

                    if len(parts) >= 2 and parts[1] == "I":
                        # Continuation of mention, which we have added prior
                        # to this iteration, so we skip it.
                        continue

                    if len(parts) >= 2 and parts[1] == "B":
                        if len(parts) in [6, 7]:
                            # Linked mention: record its ground truth.
                            y = parts[4].find("/wiki/") + len("/wiki/")
                            ent_title = parts[4][y:].replace("_", " ")
                            mention_gt = parts[2]

                            if ent_title not in ent_name_to_id:
                                ent_title_temp = self.wikipedia.preprocess_ent_name(
                                    ent_title)
                                if ent_title_temp in ent_name_to_id:
                                    ent_title = ent_title_temp
                                    cnt_replaced += 1

                            # + 1 for space between mention and sentence
                            pos_mention_gt = (
                                len(" ".join(sentence)) + 1 if sentence else 0
                            )
                            gt_sent.append([
                                self.preprocess_mention(mention_gt),
                                ent_title,
                                pos_mention_gt,
                                mention_gt,
                            ])

                        # The whole mention (column 2) is added to the sentence at once.
                        words = [
                            modify_uppercase_phrase(x)
                            for x in split_in_words_mention(parts[2])
                        ]
                    else:
                        words = [
                            modify_uppercase_phrase(w)
                            for w in split_in_words_mention(parts[0])
                        ]

                    if (parts[0] == ".") and sentence:
                        # End of sentence, store sentence and additional ground truth mentions.
                        sentence_words = " ".join(sentence)
                        if i_sent in sentences:
                            i_sent += 1
                        sentences[i_sent] = [sentence_words, gt_sent]
                        i_sent += 1
                        sentence = []
                        gt_sent = []
                    elif words:
                        sentence += words

        # Flush whatever is left after the last line of the file.
        if sentence:
            sentence_words = " ".join(sentence)
            sentences[i_sent] = [sentence_words, gt_sent]
        if sentences:
            contents[doc_name] = sentences

        if "train" in dataset:
            self.__save(self.__format(contents), "aida_train")
        else:
            self.__save(self.__format(contents), "aida_testB")
        print("Replaced {} ground truth entites".format(cnt_replaced))