Example #1
    def consolidate_parallel(self, output_dir, num_workers=128):
        # Run bulk requests
        url2qid = {
            self.WIKIDATA_ALIASES_API % qid: qid
            for qid in self.qid2ner
        }
        responses = multi_get_batch(list(url2qid.keys()),
                                    num_workers,
                                    timeout=10)
        assert len(responses) == len(url2qid)
        total_entities = 0
        for url, response in responses:
            qid = url2qid[url]
            try:
                aliases = response.json()['entities'][qid]['aliases'][
                    self.lang_code]
                if aliases:
                    self.qid2ner[qid]['entities'].update(
                        set(a['value'] for a in aliases))
            except Exception:
                # Skip QIDs with no aliases in this language or with a bad/failed response
                pass
            # Convert set to list since sets are not JSON-serializable
            self.qid2ner[qid]['entities'] = list(self.qid2ner[qid]['entities'])
            total_entities += len(self.qid2ner[qid]['entities'])

        print('We now have a NER dataset of %d QIDs and %d entities!' %
              (len(self.qid2ner), total_entities))
        # TODO: Any better format to save?
        os.makedirs(output_dir, exist_ok=True)
        dataset_file = os.path.join(output_dir, 'ner_dataset.json')
        print('Writing final dataset to:', dataset_file)
        pretty_write_json(self.qid2ner, dataset_file)
        return
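
The batched lookup above depends on a multi_get_batch helper that is not part of this example: it takes the list of URLs, a worker count, and a timeout, and must return one (url, response) pair per URL (the assert relies on that). A minimal sketch of such a helper, assuming it is built on requests and a thread pool; only the call signature comes from the code above, the body is a guess:

import requests
from concurrent.futures import ThreadPoolExecutor

def multi_get_batch(urls, num_workers, timeout=10):
    """Fetch all URLs in parallel; return a list of (url, response) pairs in input order."""
    def fetch(url):
        try:
            return url, requests.get(url, timeout=timeout)
        except requests.RequestException:
            return url, None  # caller is expected to handle failed requests

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        # map() preserves input order, so len(result) == len(urls)
        return list(pool.map(fetch, urls))
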
    def generate(self, output_dir, consolidate=True, train_split=False):
        save_to = os.path.join(output_dir, 'cloze_set')
        # Intentionally not passing exist_ok=True: fail fast if the folder
        # already exists; delete it yourself before re-running.
        os.makedirs(save_to)
        total_data_count = 0
        for article_file in tqdm(self.articles_json,
                                 desc='Generating cloze',
                                 unit=' articles'):
            try:
                with open(article_file, encoding='utf-8') as f:
                    article = json.load(f)
            except Exception:
                print(traceback.format_exc())
                print('Unable to parse:', article_file)
                continue

            cloze_list = self.generate_for_article(article)
            if cloze_list:  # Save the cloze for this article
                save_filepath = get_verified_path(save_to, article['title'],
                                                  '.json')
                pretty_write_json(cloze_list, save_filepath)
                total_data_count += len(cloze_list)

        print('SUCCESS: Generated a total of %d cloze questions!' %
              total_data_count)
        print('For individual results, check the folder:', save_to, '\n')
        if consolidate:
            self.consolidate(save_to, output_dir, train_split)
        return
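
generate and the consolidation code also rely on two small helpers, pretty_write_json and get_verified_path, that are not shown. A plausible sketch, assuming the first pretty-prints UTF-8 JSON and the second turns an arbitrary article title into a safe, length-limited filename; both bodies are assumptions, only the names and call signatures appear above:

import json
import os
import re

def pretty_write_json(obj, filepath):
    # Indented, UTF-8 JSON so non-ASCII titles and text stay readable
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def get_verified_path(folder, title, extension, max_name_len=200):
    # Strip characters that are unsafe in filenames and cap the length,
    # since 255 bytes is the usual filename limit on Linux filesystems
    safe = re.sub(r'[^\w\- ]', '_', title).strip()[:max_name_len]
    return os.path.join(folder, safe + extension)
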
    def consolidate(self, articles_dir, output_dir, train_split=False):

        article_files = sorted(glob(os.path.join(articles_dir, '*.json')))
        data = []  # WARNING: accumulates the entire dataset in memory; can consume a lot of RAM
        for article_file in tqdm(article_files,
                                 desc='Consolidating',
                                 unit=' articles'):
            with open(article_file, encoding='utf-8') as f:
                cloze_list = json.load(f)
            data += cloze_list

        dataset = {
            'params': self.get_params_dict(),
            'metadata': {
                'TOTAL_CLOZES': len(data),
                'PROCESSED_WIKI_ARTICLES': len(self.articles_json),
                'GENERATED_TIMESTAMP': str(datetime.now())
            },
            'cloze_data': data
        }
        dataset_file = os.path.join(output_dir, 'cloze_dataset.json')
        pretty_write_json(dataset, dataset_file)
        print('Final dataset written to:', dataset_file, '\n')

        # Dump a sample of few questions
        random.seed(666)
        random.shuffle(data)
        sample_file = os.path.join(output_dir, 'cloze_sample.json')
        pretty_write_json(data[:20], sample_file)
        print('Sample dataset written to:', sample_file, '\n')

        if train_split:
            train_split_len = int(self.TRAIN_SPLIT * len(data))
            pretty_write_json(data[:train_split_len],
                              os.path.join(output_dir, 'cloze_train_set.json'))

            dev_split_len = int(self.DEV_SPLIT * len(data))
            pretty_write_json(
                data[train_split_len:train_split_len + dev_split_len],
                os.path.join(output_dir, 'cloze_dev_set.json'))

            test_split_len = len(data) - (train_split_len + dev_split_len)
            pretty_write_json(data[train_split_len + dev_split_len:],
                              os.path.join(output_dir, 'cloze_test_set.json'))

            print('Dataset split into Train-Dev-Test and saved at:', output_dir)
            print('Split ratio %.2f:%.2f:%.2f and count %d:%d:%d\n' %
                  (self.TRAIN_SPLIT, self.DEV_SPLIT, self.TEST_SPLIT,
                   train_split_len, dev_split_len, test_split_len))
        return
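
The split at the end uses TRAIN_SPLIT, DEV_SPLIT and TEST_SPLIT constants defined on the class but not shown here. A hypothetical driver, with an assumed 80/10/10 split and a made-up class name, to illustrate how these methods fit together:

from glob import glob

class ClozeDatasetBuilder:  # hypothetical name for the class these methods belong to
    TRAIN_SPLIT, DEV_SPLIT, TEST_SPLIT = 0.8, 0.1, 0.1  # assumed ratios; must sum to 1.0

    def __init__(self, articles_json, lang_code):
        self.articles_json = articles_json  # list of per-article JSON files
        self.lang_code = lang_code

    # generate(), consolidate(), get_params_dict(), ... as in the example above

# builder = ClozeDatasetBuilder(glob('wiki_out/articles/*.json'), lang_code='en')
# builder.generate('cloze_out', consolidate=True, train_split=True)
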
Example #4
    def process_titles_serial(self, txt_file, save_to):
        # Read list of all page titles
        with open(txt_file, encoding='utf-8') as f:
            # Filter out the empty entry left by the trailing newline
            titles = [t for t in f.read().splitlines() if t]

        ner_data = {}
        for title in tqdm(titles,
                          desc='Performing NER from WikiData',
                          unit=' entities'):
            self.fetch_ner_wiki(title, ner_data)

        os.makedirs(save_to, exist_ok=True)
        ner_file = os.path.join(save_to, 'ner_list.json')
        pretty_write_json(ner_data, ner_file)
        return
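
The per-title work is delegated to fetch_ner_wiki, which is not shown; only its call signature is known from the loop above. One way it could work is to resolve each Wikipedia title to its Wikidata item through the public wbgetentities endpoint and record the item's 'instance of' (P31) values as a coarse entity type. Everything below except the name and arguments is an assumption:

import requests

WIKIDATA_API = 'https://www.wikidata.org/w/api.php'

def fetch_ner_wiki(self, title, ner_data, timeout=10):
    """Sketch of a method on the same class: resolve one title to its Wikidata item."""
    if not title:
        return
    params = {
        'action': 'wbgetentities',
        'sites': '%swiki' % self.lang_code,  # e.g. 'enwiki'
        'titles': title,
        'props': 'claims',
        'format': 'json',
    }
    try:
        entities = requests.get(WIKIDATA_API, params=params,
                                timeout=timeout).json().get('entities', {})
    except (requests.RequestException, ValueError):
        return  # network error or non-JSON reply; skip this title
    for qid, entity in entities.items():
        if not qid.startswith('Q'):
            continue  # a '-1' placeholder means the page has no Wikidata item
        # 'instance of' (P31) values give a coarse entity type for the title
        instance_of = [c['mainsnak']['datavalue']['value']['id']
                       for c in entity.get('claims', {}).get('P31', [])
                       if 'datavalue' in c.get('mainsnak', {})]
        ner_data[title] = {'qid': qid, 'instance_of': instance_of}
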
Example #5
    def process_wiki_xml(self, save_to):
        os.makedirs(save_to, exist_ok=True)
        articles_path = os.path.join(save_to, 'articles')
        os.makedirs(articles_path, exist_ok=True)
        cleaner = Cleaner()
        page_titles = set()
        for title, text in tqdm(iterate(self.wiki_xml),
                                desc='Wikipedia processing',
                                unit=' articles'):
            # Clean each article to get plain-text and links
            try:
                text = cleaner.clean_text(text)
                cleaned_text, links = cleaner.build_links(text)
            except Exception:
                print(traceback.format_exc())
                print('Failed to parse article:', title)
                continue

            # Store article as JSON. Note: 255 bytes is the max filename length on Linux filesystems
            json_path = get_verified_path(articles_path, title, '.json')
            article = {
                'title': title,
                'body': cleaned_text,
                'links': links,
                'lang_code': self.lang_code
            }
            pretty_write_json(article, json_path)

            # Record this page's title (unless it is a redirect) and every link
            # target in the article, so NER can be run on them later
            if not cleaned_text.startswith('REDIRECT'):
                page_titles.add(title.strip())
            for l in links:
                entity = l['link'].strip()
                if entity:
                    page_titles.add(entity)

        print('Written all articles to:', articles_path)

        # Write all the page titles as txt to perform NER later
        entities_txt = os.path.join(save_to, 'page_titles.txt')
        with open(entities_txt, 'w', encoding='utf-8') as f:
            f.write('\n'.join(page_titles) + '\n')

        print('Written %d potential Wiki Entities to:' % len(page_titles),
              entities_txt)
        return
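
The iterate and Cleaner calls above match the API of the wiki_dump_reader package (clean_text and build_links, where each link is a dict with a 'link' key). Assuming that is the dependency in use, a minimal self-contained usage sketch looks like this:

from wiki_dump_reader import Cleaner, iterate  # assumed source of iterate/Cleaner

def preview_dump(wiki_xml, limit=3):
    """Print a quick summary of the first few articles in an uncompressed XML dump."""
    cleaner = Cleaner()
    for i, (title, text) in enumerate(iterate(wiki_xml)):
        if i >= limit:
            break
        cleaned_text, links = cleaner.build_links(cleaner.clean_text(text))
        print('%s: %d chars, %d links' % (title, len(cleaned_text), len(links)))
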
Example #6
    def consolidate(self, output_dir):
        os.makedirs(output_dir, exist_ok=True)
        dataset_file = os.path.join(output_dir, 'ner_dataset.json')
        total_entities = 0
        for qid in tqdm(self.qid2ner,
                        desc='Querying WikiData for aliases',
                        unit=' QIDs'):
            aliases = self.get_wikidata_aliases(qid)
            if aliases:
                self.qid2ner[qid]['entities'].update(aliases)
            # Convert set to list since sets are not JSON-serializable
            self.qid2ner[qid]['entities'] = list(self.qid2ner[qid]['entities'])
            total_entities += len(self.qid2ner[qid]['entities'])

        print('We now have a NER dataset of %d QIDs and %d entities!' %
              (len(self.qid2ner), total_entities))
        # TODO: Any better format to save?
        print('Writing final dataset to:', dataset_file)
        pretty_write_json(self.qid2ner, dataset_file)
        return
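
The get_wikidata_aliases helper is not shown. Judging from the JSON path parsed in the parallel variant in Example #1 (entities -> QID -> aliases -> lang_code, each alias a dict with a 'value' key), it fetches per-QID aliases from Wikidata; below is a sketch against the public wbgetentities API. The project's actual WIKIDATA_ALIASES_API template is not shown, so the URL here is an assumption:

import requests

def get_wikidata_aliases(self, qid, timeout=10):
    """Sketch: return the set of alias strings for a QID in self.lang_code, or an empty set."""
    url = ('https://www.wikidata.org/w/api.php?action=wbgetentities'
           '&ids=%s&props=aliases&languages=%s&format=json' % (qid, self.lang_code))
    try:
        aliases = requests.get(url, timeout=timeout).json()[
            'entities'][qid]['aliases'][self.lang_code]
        return {a['value'] for a in aliases}
    except Exception:
        return set()  # no aliases for this language, or a failed request
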
Example #7
    def process_titles_parallel(self, txt_file, save_to, num_workers=16):
        # Read list of all page titles
        with open(txt_file, encoding='utf-8') as f:
            # Filter out the empty entry left by the trailing newline
            titles = [t for t in f.read().splitlines() if t]

        # Prepare variables for the workers
        results = [{} for _ in range(num_workers)]
        # Ceiling division so every title is assigned to exactly one worker
        titles_per_thread = (len(titles) + num_workers - 1) // num_workers
        threads = []
        self.threads_counter = [0] * num_workers

        # Start all worker threads
        for t_id in range(num_workers):
            t = Thread(target=self.ner_wiki_worker,
                       args=(t_id, titles[t_id * titles_per_thread:(t_id + 1) *
                                          titles_per_thread], results[t_id]))
            t.start()
            threads.append(t)

        # Start the status printing thread
        self.print_worker_status = True
        printer_thread = Thread(target=self.worker_status_printer,
                                args=(num_workers, ))
        printer_thread.start()

        # Wait till all threads are complete
        for t_id in range(num_workers):
            threads[t_id].join()

        self.print_worker_status = False

        ner_data = {}
        for t_id in range(num_workers):
            ner_data.update(results[t_id])

        os.makedirs(save_to, exist_ok=True)
        ner_file = os.path.join(save_to, 'ner_list.json')
        print('All workers finished. Saving results to:', ner_file)
        pretty_write_json(ner_data, ner_file)
        return
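
Neither ner_wiki_worker nor worker_status_printer is included in the example. Given how they are wired up above (each worker gets a private result dict and its own slot in threads_counter, and the printer runs until print_worker_status is cleared), plausible sketches of these two methods look like this:

import time

def ner_wiki_worker(self, t_id, titles, result):
    # Each worker owns its own slice of titles and its own result dict, so no
    # locking is needed; only threads_counter[t_id] is updated for progress.
    for title in titles:
        self.fetch_ner_wiki(title, result)
        self.threads_counter[t_id] += 1

def worker_status_printer(self, num_workers, interval=5):
    # Periodically report per-worker progress until the main thread clears the flag
    while self.print_worker_status:
        done = sum(self.threads_counter)
        print('Progress: %d titles processed (%s)' %
              (done, ', '.join(str(c) for c in self.threads_counter)))
        time.sleep(interval)
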