    def _parse_pack(
            self, collection: Tuple[str, Dict[str, List[state_type]]]
    ) -> Iterator[DataPack]:
        resource_name, info_box_data = collection

        # Follow any redirect so the info box attaches to the canonical resource.
        if resource_name in self.redirects:
            resource_name = self.redirects[resource_name]

        if resource_name in self.pack_index:
            print_progress(f'Add infobox to resource: [{resource_name}]')

            pack_path = os.path.join(
                self.pack_dir,
                self.pack_index[resource_name]
            )

            if os.path.exists(pack_path):
                with open(pack_path) as pack_file:
                    pack = data_utils.deserialize(
                        self._pack_manager, pack_file.read())

                    # Attach the literal and object info box statements, then
                    # the plain properties, to the deserialized pack.
                    add_info_boxes(pack, info_box_data['literals'])
                    add_info_boxes(pack, info_box_data['objects'])
                    add_property(pack, info_box_data['properties'])
                    yield pack
        else:
            print_notice(f"Resource {resource_name} is not in the raw packs.")
            self.logger.warning("Resource %s is not in the raw packs.",
                                resource_name)
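For reference, a minimal sketch of the collection tuple this method consumes, assuming the bucket names used above ('literals', 'objects', 'properties'); the resource name and the empty statement lists are purely illustrative:

sample_collection = (
    'Mount_Everest',  # hypothetical resource name
    {
        # Each bucket holds statement tuples of the reader's state_type;
        # the empty lists here are placeholders for illustration only.
        'literals': [],
        'objects': [],
        'properties': [],
    },
)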
Example #2
    def _collect(
        self,
        nif_context: str  # type: ignore
    ) -> Iterator[Tuple[Dict[str, str], Dict[str, List[state_type]]]]:
        # Note: these dicts are reused across yields, so each yielded item
        # should be consumed before the iterator advances.
        str_data: Dict[str, str] = {}
        node_data: Dict[str, List[state_type]] = {}

        for context_statements in NIFParser(nif_context):
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f'Collecting DBpedia context: [{c.identifier}]')

                if nif_type and nif_type == "context" and get_resource_fragment(
                        v) == 'isString':
                    str_data['text'] = o.toPython()
                    str_data['doc_name'] = get_resource_name(s)
                    str_data['oldid'] = get_resource_attribute(
                        c.identifier, 'oldid')

                    node_data['struct'] = self.struct_reader.get(c)
                    node_data['links'] = self.link_reader.get(c)

                    yield str_data, node_data
        print(' ..Done')
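To make the yielded shape concrete, here is a hypothetical example of one (str_data, node_data) pair produced for a single NIF context; the key names come from the code above, while every value is made up for illustration:

example_str_data = {
    'text': 'Mount Everest is the highest mountain above sea level ...',
    'doc_name': 'Mount_Everest',   # from get_resource_name(s)
    'oldid': '123456789',          # revision id parsed from the context IRI
}
example_node_data = {
    'struct': [],   # page-structure statements from struct_reader.get(c)
    'links': [],    # anchor/link statements from link_reader.get(c)
}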
Example #3
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs: str, output_path: str):
    # Load redirects.
    print_progress('Loading redirects', '\n')
    logging.info("Loading redirects")
    redirect_pickle = os.path.join(output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)
    print_progress('\nDone loading redirects.', '\n')
    logging.info("Done loading.")

    # The datasets are read in two steps.
    raw_pack_dir = os.path.join(output_path, 'nif_raw')

    # First, we create the NIF reader, which reads the NIF statements in order.
    nif_pl = Pipeline[DataPack]()
    nif_pl.resource.update(redirects=redirect_map)

    nif_pl.set_reader(DBpediaWikiReader(), config=Config(
        {
            'redirect_path': redirects,
            'nif_page_structure': nif_page_structure,
            'nif_text_links': nif_text_links,
        },
        DBpediaWikiReader.default_configs()
    ))

    nif_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': raw_pack_dir,
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))

    nif_pl.initialize()
    logging.info('Start running the DBpedia text pipeline.')
    print_progress('Start running the DBpedia text pipeline.', '\n')
    nif_pl.run(nif_context)

    # Second, we add info boxes to the packs created from the NIF data.
    ib_pl = Pipeline[DataPack]()
    ib_pl.resource.update(redirects=redirect_map)
    ib_pl.set_reader(DBpediaInfoBoxReader(), config=Config(
        {
            'pack_index': os.path.join(raw_pack_dir, 'article.idx'),
            'pack_dir': raw_pack_dir,
            'mapping_literals': mapping_literals,
            'mapping_objects': mapping_objects,
            'reading_log': os.path.join(output_path, 'infobox.log')
        },
        DBpediaInfoBoxReader.default_configs()
    ))

    ib_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': os.path.join(output_path, 'nif_info_box'),
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))

    # Now we initialize and run the info box pipeline.
    ib_pl.initialize()
    ib_pl.run(info_boxs)
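A minimal sketch of how this script might be invoked; the DBpedia NIF dump file names below are assumptions for illustration, not part of the original code:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Hypothetical paths to locally downloaded DBpedia/NIF dump files.
    main(
        nif_context='nif_context_en.tql.bz2',
        nif_page_structure='nif_page_structure_en.tql.bz2',
        mapping_literals='mappingbased_literals_en.tql.bz2',
        mapping_objects='mappingbased_objects_en.tql.bz2',
        nif_text_links='nif_text_links_en.tql.bz2',
        redirects='redirects_en.tql.bz2',
        info_boxs='infobox_properties_en.tql.bz2',
        output_path='wiki_output',
    )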