Example #1
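    # Assumed imports for this snippet (not shown in the original listing):
    #   from collections import Counter
    #   from contextlib import closing
    #   from multiprocessing import Pool
    #   import numpy as np
    #   from marisa_trie import Trie, RecordTrie
    # WikiDumpReader, WikiExtractor, EntityDB, _process_page and normalize
    # are project-local helpers.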
    def build(dump_file, pool_size, chunk_size):
        dump_reader = WikiDumpReader(dump_file)

        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for (page, links) in pool.imap_unordered(_process_page,
                                                     dump_reader,
                                                     chunksize=chunk_size):
                titles.append(normalize(page.title))
                if page.is_redirect:
                    redirects[normalize(page.title)] = page.redirect

                for link_obj in links:
                    title_counter[normalize(link_obj.title)] += 1

        title_dict = Trie(titles)

        redirect_items = []
        for (title, dest_title) in redirects.items():
            if dest_title in title_dict:
                redirect_items.append((title, (title_dict[dest_title], )))

        redirect_dict = RecordTrie('<I', redirect_items)

        # Fold link counts for redirect pages into their destination titles.
        # Iterate over a snapshot because the counter is mutated in the loop.
        for (title, count) in list(title_counter.items()):
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                title_counter[title_dict.restore_key(dest_obj[0][0])] += count
                del title_counter[title]

        inlink_arr = np.zeros(len(title_dict), dtype=np.int64)
        for (title, count) in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)
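
Both examples store redirects in a RecordTrie built with the '<I' format
string, which packs each record as one little-endian unsigned 32-bit
integer: the key id of the destination title in the title trie. A minimal
round-trip sketch, assuming only that marisa-trie is installed:

    from marisa_trie import Trie, RecordTrie

    title_dict = Trie(['Tokyo', 'Edo'])
    # 'Edo' redirects to 'Tokyo'; store Tokyo's key id as the record.
    redirect_dict = RecordTrie('<I', [('Edo', (title_dict['Tokyo'],))])

    record = redirect_dict.get('Edo')            # [(key_id_of_Tokyo,)]
    print(title_dict.restore_key(record[0][0]))  # -> 'Tokyo'
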
Example #2
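    # Same assumed imports as in Example #1.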
    def build(dump_file, pool_size, chunk_size):
        dump_reader = WikiDumpReader(dump_file)

        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for (page, links) in pool.imap_unordered(
                _process_page, dump_reader, chunksize=chunk_size
            ):
                titles.append(page.title)
                if page.is_redirect:
                    redirects[page.title] = page.redirect

                for link_obj in links:
                    title_counter[link_obj.title] += 1

        title_dict = Trie(titles)

        redirect_items = []
        for (title, dest_title) in redirects.items():
            if dest_title in title_dict:
                redirect_items.append((title, (title_dict[dest_title],)))

        redirect_dict = RecordTrie('<I', redirect_items)

        # Iterate over a snapshot: the counter is mutated inside the loop.
        for (title, count) in list(title_counter.items()):
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                title_counter[title_dict.restore_key(dest_obj[0][0])] += count
                del title_counter[title]

        inlink_arr = np.zeros(len(title_dict), dtype=np.int64)
        for (title, count) in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)
Example #3
class Vocab(object):
    def __init__(self, dic, start_index=0):
        if isinstance(dic, Trie):
            self._dic = dic
        else:
            self._dic = Trie(dic)

        self._start_index = start_index

    @property
    def size(self):
        return len(self)

    def __len__(self):
        return len(self._dic)

    def __iter__(self):
        return iter(self._dic)

    def __contains__(self, key):
        return key in self._dic

    def get_index(self, key, default=None):
        try:
            return self._dic.key_id(key) + self._start_index
        except KeyError:
            return default

    def get_key_by_index(self, index):
        return self._dic.restore_key(index - self._start_index)

    def save(self, out_file):
        joblib.dump(self.serialize(), out_file)

    def serialize(self):
        return dict(dic=self._dic.tobytes(), start_index=self._start_index)
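

# Usage sketch for Vocab (not part of the original listing; assumes only
# that marisa-trie is installed):
if __name__ == '__main__':
    demo_vocab = Vocab(['apple', 'banana', 'cherry'], start_index=1)
    banana_idx = demo_vocab.get_index('banana')  # key_id('banana') + 1
    assert demo_vocab.get_key_by_index(banana_idx) == 'banana'
    print(demo_vocab.size, banana_idx)
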
class Preprocessor:
    def __init__(self, args):
        self.args = args
        self.all_titles = self._all_titles_collector()
        self.redirects = _extract_pages(self.args.path_for_raw_xml)
        self.nlp = nlp_returner(args=self.args)

        self.entity_dict = Trie(self.all_titles)

        self.redirect_dict = RecordTrie(
            '<I', [(title, (self.entity_dict[dest_title], ))
                   for (title, dest_title) in self.redirects
                   if dest_title in self.entity_dict])

    def entire_annotation_retriever(self):
        dirpath_after_wikiextractor_preprocessing = self.args.dirpath_after_wikiextractor_preprocessing
        file_paths = glob(dirpath_after_wikiextractor_preprocessing + '**/*')

        if self.args.debug:
            file_paths = file_paths[:16]

        entire_annotations = list()
        doc_title2sents = {}

        debug_idx = 0

        if self.args.multiprocessing:
            n_cores = multi.cpu_count()
            with Pool(n_cores) as pool:
                imap = pool.imap_unordered(self._one_wikifile_process,
                                           file_paths)
                result = list(tqdm(imap, total=len(file_paths)))
        else:
            for file in tqdm(file_paths):
                with open(file, 'r') as f:
                    for idx, line in tqdm(
                            enumerate(f)):  # TODO: multiprocessing
                        line = line.strip()
                        line = json.loads(line)
                        title = _normalize_title(html.unescape(line['title']))
                        one_page_text = html.unescape(line['text'])
                        annotations, sents = self._one_page_text_preprocessor(
                            title=title, text=one_page_text)
                        sents = self._section_anchor_remover(sents)
                        entire_annotations += annotations
                        if sents != list():
                            doc_title2sents.update({title: sents})
                        debug_idx += 1

                        if self.args.debug and debug_idx == 500:
                            break
                    else:
                        continue
                    break  # for debug

            print('all annotations:', len(entire_annotations))

            with open(
                    self.args.annotated_dataset_dir + self.args.world +
                    '_annotation.json', 'w') as f:
                json.dump(entire_annotations,
                          f,
                          ensure_ascii=False,
                          indent=4,
                          sort_keys=False,
                          separators=(',', ': '))

            with open(
                    self.args.annotated_dataset_dir + self.args.world +
                    '_title2doc.json', 'w') as g:
                json.dump(doc_title2sents,
                          g,
                          ensure_ascii=False,
                          indent=4,
                          sort_keys=False,
                          separators=(',', ': '))

    def _one_wikifile_process(self, file_path):
        partial_annotations = list()
        partial_doc_title2sents = {}

        with open(file_path, 'r') as f:
            for idx, line in tqdm(enumerate(f)):  # TODO: multiprocessing
                line = line.strip()
                line = json.loads(line)
                title = _normalize_title(html.unescape(line['title']))
                one_page_text = html.unescape(line['text'])
                annotations, sents = self._one_page_text_preprocessor(
                    title=title, text=one_page_text)
                sents = self._section_anchor_remover(sents)
                partial_annotations += annotations
                if sents != list():
                    partial_doc_title2sents.update({title: sents})
        d_json = {
            'annotations': partial_annotations,
            'doc_title2sents': partial_doc_title2sents
        }
        new_path = file_path.replace(
            self.args.dirpath_after_wikiextractor_preprocessing,
            self.args.annotated_dataset_dir).split('/')
        suffix = new_path[3]
        new_path = '/'.join(new_path[:3])
        if not os.path.exists(self.args.annotated_dataset_dir):
            os.mkdir(self.args.annotated_dataset_dir)
        if not os.path.exists(new_path):
            os.mkdir(new_path)
        new_path += '/'
        new_path += suffix
        new_path += '.json'

        with open(new_path, 'w') as dj:
            json.dump(d_json,
                      dj,
                      ensure_ascii=False,
                      indent=4,
                      sort_keys=False,
                      separators=(',', ': '))

        return 1

    def _all_titles_collector(self):
        dirpath_after_wikiextractor_preprocessing = self.args.dirpath_after_wikiextractor_preprocessing
        file_paths = glob(dirpath_after_wikiextractor_preprocessing + '**/*')
        titles = list()

        if self.args.debug:
            file_paths = file_paths[:200]

        for file in tqdm(file_paths):
            with open(file, 'r') as f:
                for line in f:
                    line = line.strip()
                    line = json.loads(line)
                    title = line['title']
                    if '/Gallery' not in title and 'List of' not in title:
                        titles.append(title)

        return list(set(titles))

    def _one_page_text_preprocessor(self, text: str, title: str):
        sections_and_sentences = self._single_newline_to_sentences(text)

        # sections_and_sentences = self._no_sentence_remover(sections_and_sentences)
        sections_and_sentences = self._section_anchor_remover(
            sections_and_sentences)
        sections_and_sentences = [
            self._external_link_remover_from_one_sentence(sentence=sentence)
            for sentence in sections_and_sentences
        ]
        # coref_link_counts_in_one_page = self._coref_link_counts(sections_and_sentences)
        annotations = list()
        sentences_in_one_doc = list()
        for sentence in sections_and_sentences:
            a_tag_remain_text, entities = self._from_anchor_tags_to_entities(
                text=sentence)
            a_tag_no_remaining_text, positions = self._convert_a_tag_to_start_and_end_position(
                text_which_may_contain_a_tag=a_tag_remain_text)
            annotation_json, sents = self._sentence_splitter_with_hyperlink_annotations(
                title, a_tag_no_remaining_text, positions, entities)
            if self.args.augmentation_with_title_set_string_match:
                annotation_json = self._from_entire_titles_distant_augmentation(
                    annotation_json=annotation_json,
                    sents=sents,
                    document_title=title)

            if self.args.in_document_augmentation_with_its_title:
                annotation_json = self._indocument_augmentation_with_its_title(
                    annotation_json=annotation_json,
                    sents=sents,
                    document_title=title)

            # TODO: Coreference resolution
            # if self.args.coref_augmentation:
            #     annotation_json = self._coref_augmentation(annotation_json, title, sents)

            sentences_in_one_doc += sents

            if annotation_json != {}:
                for _, annotation in annotation_json.items():
                    annotations.append(annotation)

        return annotations, sentences_in_one_doc

    def _coref_augmentation(self, annotation_json, title, sents):
        '''Add annotations from she/he/his/her matches (not yet implemented).'''
        return annotation_json

    def _indocument_augmentation_with_its_title(self, annotation_json, sents,
                                                document_title):
        # str.lower() already returns a new string, so no copy is needed.
        lower_document_title = document_title.lower().split(' ')
        its_partial_name = [
            name for name in lower_document_title if name not in COMMON_WORDS
        ]
        capitalized = [
            name.capitalize() for name in its_partial_name
            if name.capitalize() not in self.args.stopwords_for_augmentation
        ]

        if len(capitalized) == 0:
            return annotation_json

        for sent in sents:
            # Escape names so regex metacharacters match literally.
            match_result_with_distant_supervision = re.finditer(
                '|'.join(re.escape(name) for name in capitalized), sent)

            for result in match_result_with_distant_supervision:
                span = result.span()
                mention = sent[span[0]:span[1]]
                start, end = span[0], span[1]

                same_annotation_flag = 0
                for idx, original_annotation_from_doc in annotation_json.items():
                    mention_from_annotation = original_annotation_from_doc[
                        'mention']
                    span_start_from_annotation = original_annotation_from_doc[
                        'original_sentence_mention_start']
                    span_end_from_annotation = original_annotation_from_doc[
                        'original_sentence_mention_end']
                    if (mention in mention_from_annotation
                            and span_start_from_annotation <= start
                            and end <= span_end_from_annotation):
                        same_annotation_flag += 1
                        break

                if same_annotation_flag:
                    continue

                if sent[start] == ' ':
                    sent_annotated = sent[:start] + '<a>' + sent[
                        start:end] + ' </a>' + sent[end:]
                else:
                    sent_annotated = sent[:start] + '<a> ' + sent[
                        start:end] + ' </a>' + sent[end:]

                annotation_json.update({
                    len(annotation_json): {
                        'document_title': document_title,
                        'anchor_sent': sent_annotated,
                        'annotation_doc_entity_title': document_title,
                        'mention': sent[span[0]:span[1]],
                        'original_sentence': sent,
                        'original_sentence_mention_start': span[0],
                        'original_sentence_mention_end': span[1]
                    }
                })
        return annotation_json

    def _from_entire_titles_distant_augmentation(self, annotation_json, sents,
                                                document_title):
        '''
        Augment annotations from title collections. Strict string match is used here.
        :param annotation_json:
        :param sents:
        :param document_title:
        :return:
        '''
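        # Example (hypothetical): with all_titles = ['Gawr Gura'], the
        # sentence 'Gawr Gura debuted in 2020.' gains a new annotation,
        # while a match inside 'Gawr Gurashi' is rejected by the boundary
        # check below (sent[end] is neither a space nor an apostrophe).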
        # Escape titles so regex metacharacters (e.g. parentheses in titles
        # like 'Foo (bar)') match literally instead of breaking the pattern.
        regex_pattern_for_all_titles = '|'.join(
            re.escape(title) for title in self.all_titles)
        for sent in sents:

            match_result_with_distant_supervision = re.finditer(
                regex_pattern_for_all_titles, sent)

            for result in match_result_with_distant_supervision:
                span = result.span()
                mention = sent[span[0]:span[1]]
                start, end = span[0], span[1]

                if end != len(sent) and sent[end] not in [" ", "'"]:
                    continue

                same_annotation_flag = 0
                for idx, original_annotation_from_doc in annotation_json.items():
                    mention_from_annotation = original_annotation_from_doc[
                        'mention']
                    span_start_from_annotation = original_annotation_from_doc[
                        'original_sentence_mention_start']
                    span_end_from_annotation = original_annotation_from_doc[
                        'original_sentence_mention_end']
                    if (mention_from_annotation == mention
                            and start == span_start_from_annotation
                            and end == span_end_from_annotation):
                        same_annotation_flag += 1
                        # print('duplicated distant supervised annotation: skipped')
                        break

                if same_annotation_flag:
                    continue

                if sent[start] == ' ':
                    sent_annotated = sent[:start] + '<a>' + sent[
                        start:end] + ' </a>' + sent[end:]
                else:
                    sent_annotated = sent[:start] + '<a> ' + sent[
                        start:end] + ' </a>' + sent[end:]

                annotation_json.update({
                    len(annotation_json): {
                        'document_title': document_title,
                        'anchor_sent': sent_annotated,
                        'annotation_doc_entity_title':
                        self.get_entity(mention),  # Redirects are resolved.
                        'mention': sent[span[0]:span[1]],
                        'original_sentence': sent,
                        'original_sentence_mention_start': span[0],
                        'original_sentence_mention_end': span[1]
                    }
                })

        return annotation_json

    def _sentence_splitter_with_hyperlink_annotations(
            self, title: str, a_tag_no_remaining_text: str, positions: list,
            entities: list):
        if self.args.language == 'en':
            if self.args.multiprocessing:
                raise NotImplementedError('Currently not implemented.')
                # sents = nltk_sentencizer(a_tag_no_remaining_text)
            else:
                doc = self.nlp(a_tag_no_remaining_text)
                sents = [sentence.text for sentence in doc.sents]

            # Currently spaCy can't be used with multiprocessing, so pysbd or
            # nltk would be needed instead. But they have a bug: a space is
            # appended to the end of each split sentence.
            # sents = pysbd_sentencizer(a_tag_no_remaining_text)

        elif self.args.language == 'ja':
            t = SentenceTokenizer()
            sents = t.tokenize(a_tag_no_remaining_text)
        else:
            raise ValueError(
                "sentencizer for {} is currently not implemented".format(
                    self.args.language))

        annotation_id2its_annotations = {}
        sent_initial_length = 0

        for sent in sents:
            if self.args.language == 'en':
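                # +1 re-adds the single separating space that sentence.text
                # drops (assumption: sentences are separated by one space).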
                sent_length = len(sent) + 1
            elif self.args.language == 'ja':
                sent_length = len(sent)  # ints are immutable; no copy needed
            else:
                raise ValueError(
                    "sentencizer for {} is currently not implemented".format(
                        self.args.language))
            initial_char_idx = sent_initial_length
            end_char_idx = initial_char_idx + sent_length

            to_be_considered_annotations = list()
            for annotation, entity in zip(positions, entities):
                start = annotation[0]
                end = annotation[1]
                if initial_char_idx <= start and end <= end_char_idx:
                    to_be_considered_annotations.append(
                        (start - sent_initial_length,
                         end - sent_initial_length, entity))

            for shift_annotation in to_be_considered_annotations:
                start = shift_annotation[0]
                end = shift_annotation[1]
                entity = shift_annotation[2]

                if entity == 'Infobox':
                    continue

                try:
                    if self.args.language == 'ja':
                        sent_annotated = sent[:start] + '<a>' + sent[
                            start:end] + '</a>' + sent[end:]
                    elif sent[start] == ' ':
                        sent_annotated = sent[:start] + '<a>' + sent[
                            start:end] + ' </a>' + sent[end:]
                    else:
                        sent_annotated = sent[:start] + '<a> ' + sent[
                            start:end] + ' </a>' + sent[end:]
                except IndexError:
                    # sent[start] is out of range when the annotation falls
                    # outside this sentence; skip it.
                    print('annotation error')
                    continue

                # TODO: add an assertion check here
                annotation_id2its_annotations.update({
                    len(annotation_id2its_annotations): {
                        'document_title': title,
                        'anchor_sent': sent_annotated,
                        'annotation_doc_entity_title': entity,
                        'mention': sent[start:end],
                        'original_sentence': sent,
                        'original_sentence_mention_start': start,
                        'original_sentence_mention_end': end,
                    }
                })

            sent_initial_length += sent_length

        return annotation_id2its_annotations, sents

    def _convert_a_tag_to_start_and_end_position(
            self, text_which_may_contain_a_tag: str):
        a_tag_regex = "<a>(.+?)</a>"
        pattern = re.compile(a_tag_regex)

        a_tag_remaining_text = text_which_may_contain_a_tag  # str is immutable
        mention_positions = list()

        while '<a>' in a_tag_remaining_text:
            result = re.search(pattern=pattern, string=a_tag_remaining_text)
            if result is None:
                break

            original_start, original_end = result.span()
            a_tag_removed_start = original_start
            # 7 == len('<a>') + len('</a>'), the markup stripped from the match
            a_tag_removed_end = original_end - 7

            mention = result.group(1)

            original_text_before_mention = a_tag_remaining_text[:original_start]
            original_text_after_mention = a_tag_remaining_text[original_end:]

            one_mention_a_tag_removed_text = (original_text_before_mention +
                                              mention +
                                              original_text_after_mention)
            assert mention == one_mention_a_tag_removed_text[
                a_tag_removed_start:a_tag_removed_end]

            mention_positions.append((a_tag_removed_start, a_tag_removed_end))
            a_tag_remaining_text = one_mention_a_tag_removed_text

        return a_tag_remaining_text, mention_positions
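
    # Worked example for the method above (hypothetical input):
    #   _convert_a_tag_to_start_and_end_position('See <a>Kizuna AI</a> now')
    #   -> ('See Kizuna AI now', [(4, 13)])
    # The match span in the tagged text is (4, 20); subtracting the 7
    # characters of '<a>' + '</a>' gives the span in the clean text.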

    def _from_anchor_tags_to_entities(self, text: str):
        '''
        :param text: text which contains <a> tags
        :return: (text with href attributes stripped, list of resolved entities)

        Sample text:
        'She used to be the second most subscribed Virtual Youtuber on YouTube after <a href="Kizuna%20AI">Kizuna AI</a> until <a href="Gawr%20Gura">Gawr Gura</a> and others surpassed her in 2020.'

        Returns:
        ('She used to be the second most subscribed Virtual Youtuber on YouTube after <a>Kizuna AI</a> until <a>Gawr Gura</a> and others surpassed her in 2020.', ['Kizuna AI', 'Gawr Gura'])
        '''
        soup = BeautifulSoup(text, "html.parser")

        entities = list()
        for link in soup.find_all("a"):
            try:
                entity = unquote(link.get("href"))
                entities.append(
                    self.get_entity(entity))  # Redirects are resolved.
                del link['href']
            except Exception as e:
                print("exception args:", e.args)
                continue

        return str(soup), entities

    def _coref_link_counts(self, sentences):
        entire_hyperlink_counts = 0
        for sentence in sentences:
            soup = BeautifulSoup(sentence, "html.parser")
            link_counts_in_one_sentence = len(soup.find_all("a"))
            entire_hyperlink_counts += link_counts_in_one_sentence

            coref_dict = {'he': 0, 'she': 0, 'his': 0, 'her': 0}

            for word in sentence.lower().split(' '):
                if word.strip() in coref_dict:
                    coref_dict[word.strip()] += 1

            coref_link_sum = sum(coref_dict.values())
            entire_hyperlink_counts += coref_link_sum

        return entire_hyperlink_counts

    def _external_link_remover_from_one_sentence(self, sentence: str):
        '''
        https://stackoverflow.com/questions/19080957/how-to-remove-all-a-href-tags-from-text
        https://senablog.com/python-bs4-modification/
        :param sentence:
        :return:
        '''

        soup = BeautifulSoup(sentence, "html.parser")

        for link in soup.find_all("a"):
            try:
                if "http" in link.get("href"):
                    link.unwrap()
            except:
                continue

        return str(soup)

    def _double_newline_replacer(self, text):
        return text.replace('\n\n', '\n')

    def _single_newline_to_sentences(self, text):
        return text.split('\n')

    def _no_sentence_remover(self, sentences):
        new_sentences = list()
        for sentence in sentences:
            if sentence.strip() == '':
                continue
            new_sentences.append(sentence)

        return new_sentences

    def _section_anchor_remover(self, sentences):
        # Leftover XML fragments from the MediaWiki dump (e.g. </ns>,
        # </revision>) are dropped along with near-empty lines.
        xml_tail_fragments = ('ns>', 'model>', 'format>', 'timestamp>',
                              'contributor>', 'username>', 'comment>',
                              'revision>', 'parentid>')
        new_sentences = list()
        for sentence in sentences:
            stripped = sentence.replace(' ', '')
            if stripped.endswith(xml_tail_fragments):
                continue
            if sentence.endswith(' />') and sentence.startswith('<mainpage-'):
                continue
            if len(sentence.strip()) <= 2:
                continue
            if stripped.endswith('minor') and stripped.startswith('<minor'):
                continue
            new_sentences.append(sentence)

        return new_sentences

    def get_entity_index(self, title, resolve_redirect=True):
        '''
        Derived from https://github.com/wikipedia2vec/wikipedia2vec/blob/master/wikipedia2vec/dictionary.pyx
        '''
        if resolve_redirect:
            try:
                index = self.redirect_dict[title][0][0]
                return index
            except KeyError:
                pass
        try:
            index = self.entity_dict[title]
            return index

        except KeyError:
            return -1

    def get_entity(self, title, resolve_redirect=True, default=None):
        '''
        Derived from https://github.com/wikipedia2vec/wikipedia2vec/blob/master/wikipedia2vec/dictionary.pyx
        '''
        index = self.get_entity_index(title, resolve_redirect=resolve_redirect)
        if index == -1:
            return default
        return self.entity_dict.restore_key(index)
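
# Usage sketch (hypothetical data, not in the original listing): with
#   entity_dict   = Trie(['Tokyo'])
#   redirect_dict = RecordTrie('<I', [('Edo', (entity_dict['Tokyo'],))])
# get_entity('Edo') resolves the redirect first and returns 'Tokyo', while
# get_entity('Kyoto') falls through both lookups and returns the default.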