コード例 #1
0
ファイル: wiki_dump_parse.py プロジェクト: gxchris95/forte
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    """Run the wiki dump parsing steps one after another.

    Each step reads the packs written by the previous step and writes its
    result into a new directory whose name is the previous directory name
    plus a suffix (``nif_raw`` -> ``nif_raw_struct`` -> ``..._links`` ->
    ``..._property`` -> ``..._literals`` -> ``..._objects``).

    Args:
        nif_context: Input handed to ``read_wiki_text`` for the raw text step.
        nif_page_structure: Input for the ``WikiStructReader`` step.
        mapping_literals: Input for the first ``WikiInfoBoxReader`` step
            (named ``'literals'``).
        mapping_objects: Input for the second ``WikiInfoBoxReader`` step
            (named ``'objects'``).
        nif_text_links: Input for the ``WikiAnchorReader`` step.
        redirects: Input handed to ``load_redirects`` when no cached
            redirect pickle exists yet.
        info_boxs_properties: Input for the ``WikiPropertyReader`` step.
        base_output_path: Directory holding the redirect cache and all
            output directories.
    """
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        # NOTE: unpickling executes arbitrary code; this cache file must only
        # ever be one written by this script.
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        # Cache the parsed redirects so later runs can skip the parsing.
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add the rest of wiki page structures:
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    # 3. Add the anchor links on top of the structure output.
    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    # 4. Add the info-box properties on top of the link output.
    property_dir = link_dir + '_property'
    add_wiki_info(WikiPropertyReader(), resources, info_boxs_properties,
                  link_dir, property_dir, 'info_box_properties', True)
    print_progress("Done reading wikipedia info-boxes.", '\n')

    # 5. Add the info-box literals on top of the property output.
    literal_dir = property_dir + '_literals'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_literals,
                  property_dir, literal_dir, 'literals', True)
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    # 6. Add the info-box objects on top of the literal output.
    mapping_dir = literal_dir + '_objects'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_objects, literal_dir,
                  mapping_dir, 'objects', True)
    print_progress("Done reading wikipedia info-boxes objects.", '\n')
コード例 #2
0
    def test_struct(self):
        """Run a ``WikiStructReader`` pipeline and check the written output.

        The reader is configured with the ``article.idx`` index and pack
        directory under ``self.raw_output``; results are written to a
        ``struct`` directory under the temporary output directory, and the
        test expects exactly one pack and one index entry there.
        """
        pipeline = Pipeline[DataPack](self.resources)

        struct_reader = WikiStructReader()
        reader_config = {
            "pack_index": os.path.join(self.raw_output, "article.idx"),
            "pack_dir": self.raw_output,
        }
        pipeline.set_reader(struct_reader, config=reader_config)

        output: str = os.path.join(self.output_dir.name, "struct")
        structure_input = os.path.join(self.data_dir, "nif_page_structure.tql")
        write_results(pipeline, output, structure_input)

        # One pack written and one entry indexed are expected.
        self.num_packs_check(output, 1)
        self.num_indexed(output, 1)
コード例 #3
0
ファイル: wiki_dump_parse.py プロジェクト: gaurav5590/forte
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    """Run the wiki dump parsing steps one after another.

    The text, structure and link steps each write to a new directory
    (``nif_raw`` -> ``nif_raw_struct`` -> ``nif_raw_struct_links``). The three
    info-box steps reuse the link directory as both input and output, each
    with its own output index file name.

    Args:
        nif_context: Input handed to ``read_wiki_text`` for the raw text step.
        nif_page_structure: Input for the ``WikiStructReader`` step.
        mapping_literals: Input for the ``WikiInfoBoxReader`` step named
            ``'literals'``.
        mapping_objects: Input for the ``WikiInfoBoxReader`` step named
            ``'objects'``.
        nif_text_links: Input for the ``WikiAnchorReader`` step.
        redirects: Input handed to ``load_redirects`` when no cached
            redirect pickle exists yet.
        info_boxs_properties: Input for the ``WikiPropertyReader`` step.
        base_output_path: Directory holding the redirect cache and all
            output directories.
    """
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        # NOTE: unpickling executes arbitrary code; this cache file must only
        # ever be one written by this script.
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        # Cache the parsed redirects so later runs can skip the parsing.
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    # 4. The following steps add info boxes:
    # 4.1 Add un-mapped infobox, we directly write to the previous directory.
    property_dir = link_dir
    add_wiki_info(WikiPropertyReader(),
                  resources,
                  info_boxs_properties,
                  link_dir,
                  property_dir,
                  'info_box_properties',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='properties.idx')
    print_progress("Done reading wikipedia info-boxes properties.", '\n')

    # 4.2 Add mapped literal, we directly write to the previous directory.
    literal_dir = property_dir
    add_wiki_info(WikiInfoBoxReader(),
                  resources,
                  mapping_literals,
                  property_dir,
                  literal_dir,
                  'literals',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='literals.idx')
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    # 4.3 Add mapped object, we directly write to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(WikiInfoBoxReader(),
                  resources,
                  mapping_objects,
                  literal_dir,
                  mapping_dir,
                  'objects',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='objects.idx')
    print_progress("Done reading wikipedia info-boxes objects.", '\n')
コード例 #4
0
def main(
    nif_context: str,
    nif_page_structure: str,
    mapping_literals: str,
    mapping_objects: str,
    nif_text_links: str,
    redirects: str,
    info_boxs_properties: str,
    categories: str,
    base_output_path: str,
    resume_existing: bool,
):
    """Run the wiki dump parsing steps one after another.

    The text, structure and link steps each write to a new directory
    (``nif_raw`` -> ``nif_raw_struct`` -> ``nif_raw_struct_links``). The
    info-box and category steps read from and write to the link directory,
    each with its own output index file name. Every ``add_wiki_info`` step
    uses the ``article.idx`` file in the raw pack directory as input index.

    Args:
        nif_context: Input handed to ``read_wiki_text`` for the raw text step.
        nif_page_structure: Input for the ``WikiStructReader`` step.
        mapping_literals: Input for the ``WikiInfoBoxReader`` step named
            ``"literals"``.
        mapping_objects: Input for the ``WikiInfoBoxReader`` step named
            ``"objects"``.
        nif_text_links: Input for the ``WikiAnchorReader`` step.
        redirects: Input handed to ``cache_redirects``.
        info_boxs_properties: Input for the ``WikiPropertyReader`` step.
        categories: Input for the ``WikiCategoryReader`` step.
        base_output_path: Directory handed to ``cache_redirects`` and used
            as the parent of the output directories.
        resume_existing: Forwarded as ``resume_from_last`` to every
            ``add_wiki_info`` step; its negation is the ``skip_existing``
            value of the page-structure step.
    """
    # Whether to skip the whole step: only when we are not resuming.
    # Only the page-structure step consumes this flag; the later steps
    # always pass skip_existing=True.
    skip_existing = not resume_existing

    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress("Loading redirects", "\n")

    redirect_map: Dict[str, str] = cache_redirects(base_output_path, redirects)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", "\n")

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, "nif_raw")
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", "\n")

    # Use the same index structure for all writers.
    main_index = os.path.join(raw_pack_dir, "article.idx")

    def enrich(reader, dump_path, input_dir, output_dir, step_name, **extra):
        # Forward one step to add_wiki_info with the arguments every step
        # shares; step-specific keyword arguments come in through `extra`.
        add_wiki_info(
            reader,
            resources,
            dump_path,
            input_dir,
            output_dir,
            step_name,
            use_input_index=True,
            resume_from_last=resume_existing,
            input_index_file_path=main_index,
            **extra,
        )

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + "_struct"
    enrich(
        WikiStructReader(),
        nif_page_structure,
        raw_pack_dir,
        struct_dir,
        "page_structures",
        skip_existing=skip_existing,
    )
    print_progress("Done reading wikipedia structures.", "\n")

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + "_links"
    enrich(
        WikiAnchorReader(),
        nif_text_links,
        struct_dir,
        link_dir,
        "anchor_links",
        skip_existing=True,
    )
    print_progress("Done reading wikipedia anchors.", "\n")

    # 4. The remaining steps all read from and write to the link directory.
    infobox_dir = link_dir

    # 4.1 Add un-mapped infobox.
    enrich(
        WikiPropertyReader(),
        info_boxs_properties,
        infobox_dir,
        infobox_dir,
        "info_box_properties",
        skip_existing=True,
        output_index_file_name="properties.idx",
    )
    print_progress("Done reading wikipedia info-boxes properties.", "\n")

    # 4.2 Add mapped literal.
    enrich(
        WikiInfoBoxReader(),
        mapping_literals,
        infobox_dir,
        infobox_dir,
        "literals",
        skip_existing=True,
        output_index_file_name="literals.idx",
    )
    print_progress("Done reading wikipedia info-boxes literals.", "\n")

    # 4.3 Add mapped object.
    enrich(
        WikiInfoBoxReader(),
        mapping_objects,
        infobox_dir,
        infobox_dir,
        "objects",
        skip_existing=True,
        output_index_file_name="objects.idx",
    )
    print_progress("Done reading wikipedia info-boxes objects.", "\n")

    # 4.4 Add category.
    enrich(
        WikiCategoryReader(),
        categories,
        infobox_dir,
        infobox_dir,
        "categories",
        skip_existing=True,
        output_index_file_name="categories.idx",
    )