def main(
    nif_context: str,
    nif_page_structure: str,
    mapping_literals: str,
    mapping_objects: str,
    nif_text_links: str,
    redirects: str,
    info_boxs_properties: str,
    base_output_path: str,
):
    """Run the wikipedia NIF/DBpedia ingestion pipeline.

    Each step reads one dataset dump and writes packs into a new
    directory derived from the previous step's output directory.

    Args:
        nif_context: Path to the NIF context (wiki text) dump.
        nif_page_structure: Path to the NIF page-structure dump.
        mapping_literals: Path to the mapped info-box literals dump.
        mapping_objects: Path to the mapped info-box objects dump.
        nif_text_links: Path to the NIF text-link (anchor) dump.
        redirects: Path to the redirect dump.
        info_boxs_properties: Path to the raw info-box properties dump.
        base_output_path: Directory where all outputs are written.
    """
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        # Fix: the original did `pickle.load(open(...))`, leaking the
        # file handle; a context manager closes it deterministically.
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        # Cache the redirect map so later runs skip the slow load.
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add the rest of wiki page structures:
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    property_dir = link_dir + '_property'
    add_wiki_info(WikiPropertyReader(), resources, info_boxs_properties,
                  link_dir, property_dir, 'info_box_properties', True)
    print_progress("Done reading wikipedia info-boxes.", '\n')

    literal_dir = property_dir + '_literals'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_literals,
                  property_dir, literal_dir, 'literals', True)
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    mapping_dir = literal_dir + '_objects'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_objects,
                  literal_dir, mapping_dir, 'objects', True)
    print_progress("Done reading wikipedia info-boxes objects.", '\n')
def test_struct(self):
    """Verify that page-structure annotations produce one indexed pack."""
    pipeline = Pipeline[DataPack](self.resources)

    # Point the reader at the raw packs produced by the earlier step.
    reader_config = {
        "pack_index": os.path.join(self.raw_output, "article.idx"),
        "pack_dir": self.raw_output,
    }
    pipeline.set_reader(WikiStructReader(), config=reader_config)

    out_path: str = os.path.join(self.output_dir.name, "struct")
    write_results(
        pipeline,
        out_path,
        os.path.join(self.data_dir, "nif_page_structure.tql"),
    )

    # Exactly one pack should be written and indexed.
    self.num_packs_check(out_path, 1)
    self.num_indexed(out_path, 1)
def main(
    nif_context: str,
    nif_page_structure: str,
    mapping_literals: str,
    mapping_objects: str,
    nif_text_links: str,
    redirects: str,
    info_boxs_properties: str,
    base_output_path: str,
):
    """Run the wikipedia NIF/DBpedia ingestion pipeline.

    Steps 1-3 each write to a fresh directory; the info-box steps (4.x)
    write back into the previous directory with ``overwrite=True`` and a
    per-step index file.

    Args:
        nif_context: Path to the NIF context (wiki text) dump.
        nif_page_structure: Path to the NIF page-structure dump.
        mapping_literals: Path to the mapped info-box literals dump.
        mapping_objects: Path to the mapped info-box objects dump.
        nif_text_links: Path to the NIF text-link (anchor) dump.
        redirects: Path to the redirect dump.
        info_boxs_properties: Path to the raw info-box properties dump.
        base_output_path: Directory where all outputs are written.
    """
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        # Fix: the original did `pickle.load(open(...))`, leaking the
        # file handle; a context manager closes it deterministically.
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        # Cache the redirect map so later runs skip the slow load.
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    # 4. The following steps add info boxes:
    # 4.1 Add un-mapped infobox, we directly write to the previous directory.
    property_dir = link_dir
    add_wiki_info(
        WikiPropertyReader(), resources, info_boxs_properties, link_dir,
        property_dir, 'info_box_properties', skip_existing=True,
        overwrite=True, output_index_file_name='properties.idx')
    print_progress("Done reading wikipedia info-boxes properties.", '\n')

    # 4.2 Add mapped literal, we directly write to the previous directory.
    literal_dir = property_dir
    add_wiki_info(
        WikiInfoBoxReader(), resources, mapping_literals, property_dir,
        literal_dir, 'literals', skip_existing=True, overwrite=True,
        output_index_file_name='literals.idx')
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    # 4.3 Add mapped object, we directly write to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(
        WikiInfoBoxReader(), resources, mapping_objects, literal_dir,
        mapping_dir, 'objects', skip_existing=True, overwrite=True,
        output_index_file_name='objects.idx')
    print_progress("Done reading wikipedia info-boxes objects.", '\n')
def main(
    nif_context: str,
    nif_page_structure: str,
    mapping_literals: str,
    mapping_objects: str,
    nif_text_links: str,
    redirects: str,
    info_boxs_properties: str,
    categories: str,
    base_output_path: str,
    resume_existing: bool,
):
    """Run the wikipedia NIF/DBpedia ingestion pipeline with resume support.

    All writers share the index of the raw packs (``article.idx``). The
    info-box and category steps (4.x) write back into the previous
    directory, each with its own output index file.

    Args:
        nif_context: Path to the NIF context (wiki text) dump.
        nif_page_structure: Path to the NIF page-structure dump.
        mapping_literals: Path to the mapped info-box literals dump.
        mapping_objects: Path to the mapped info-box objects dump.
        nif_text_links: Path to the NIF text-link (anchor) dump.
        redirects: Path to the redirect dump.
        info_boxs_properties: Path to the raw info-box properties dump.
        categories: Path to the categories dump.
        base_output_path: Directory where all outputs are written.
        resume_existing: If True, resume partially finished steps instead
            of skipping steps whose output already exists.
    """
    # Whether to skip the whole step: when resuming, a step with existing
    # output must still run. (Idiom fix: direct negation replaces the
    # original if/else assignment — identical values.)
    skip_existing = not resume_existing

    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress("Loading redirects", "\n")
    redirect_map: Dict[str, str] = cache_redirects(base_output_path, redirects)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", "\n")

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, "nif_raw")
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", "\n")

    # Use the same index structure for all writers.
    main_index = os.path.join(raw_pack_dir, "article.idx")

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + "_struct"
    add_wiki_info(
        WikiStructReader(),
        resources,
        nif_page_structure,
        raw_pack_dir,
        struct_dir,
        "page_structures",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia structures.", "\n")

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + "_links"
    add_wiki_info(
        WikiAnchorReader(),
        resources,
        nif_text_links,
        struct_dir,
        link_dir,
        "anchor_links",
        use_input_index=True,
        # NOTE(review): steps 3+ hardcode skip_existing=True while step 2
        # uses the computed flag — looks intentional (later steps are
        # incremental), but confirm against add_wiki_info's semantics.
        skip_existing=True,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia anchors.", "\n")

    # 4. The following steps add info boxes:
    # 4.1 Add un-mapped infobox, we directly write to the previous directory.
    property_dir = link_dir
    add_wiki_info(
        WikiPropertyReader(),
        resources,
        info_boxs_properties,
        link_dir,
        property_dir,
        "info_box_properties",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="properties.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes properties.", "\n")

    # 4.2 Add mapped literal, we directly write to the previous directory.
    literal_dir = property_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_literals,
        property_dir,
        literal_dir,
        "literals",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="literals.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes literals.", "\n")

    # 4.3 Add mapped object, we directly write to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_objects,
        literal_dir,
        mapping_dir,
        "objects",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="objects.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes objects.", "\n")

    # 4.4 Add category, directly write to previous directory.
    category_dir = mapping_dir
    add_wiki_info(
        WikiCategoryReader(),
        resources,
        categories,
        mapping_dir,
        category_dir,
        "categories",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="categories.idx",
        input_index_file_path=main_index,
    )