Example #1
0
def stage_descriptions(env):
    """Download descriptions and apply them to each non-world mwm.

    Steps, in order:
      1. Run the generator tool with the ``dump_wikipedia_urls`` and
         ``idToWikidata`` options pointing at the corresponding env paths.
      2. Call the Wikipedia and Wikidata downloaders for a fixed tuple of
         languages, both targeting ``env.descriptions_path``.
      3. For every name returned by ``env.get_mwm_names()`` that is not in
         ``WORLDS_NAMES``, run the generator tool (with recovery) with
         ``wikipedia_pages=env.descriptions_path``, fanned out over a
         thread pool.
    """
    run_gen_tool(
        env.gen_tool,
        out=env.get_subprocess_out(),
        err=env.get_subprocess_out(),
        intermediate_data_path=env.intermediate_path,
        user_resource_path=env.user_resource_path,
        dump_wikipedia_urls=env.wiki_url_path,
        idToWikidata=env.id_to_wikidata_path,
    )

    languages = ("en", "ru", "es")
    popularity_checker = check_and_get_checker(env.popularity_path)
    download_from_wikipedia_tags(
        env.wiki_url_path,
        env.descriptions_path,
        languages,
        popularity_checker,
    )
    download_from_wikidata_tags(
        env.id_to_wikidata_path,
        env.descriptions_path,
        languages,
        popularity_checker,
    )

    # The nested function keeps its name and signature: the decorator wraps
    # it and may rely on either.
    @country_stage_log
    def stage_write_descriptions(env, country, **kwargs):
        stages.run_gen_tool_with_recovery_country(
            env,
            env.gen_tool,
            out=env.get_subprocess_out(country),
            err=env.get_subprocess_out(country),
            data_path=env.mwm_path,
            user_resource_path=env.user_resource_path,
            wikipedia_pages=env.descriptions_path,
            idToWikidata=env.id_to_wikidata_path,
            output=country,
            **kwargs,
        )

    # Entries listed in WORLDS_NAMES are excluded from this step.
    country_names = [
        name for name in env.get_mwm_names() if name not in WORLDS_NAMES
    ]
    write_for_country = partial(stage_write_descriptions, env)
    with ThreadPool() as workers:
        workers.map(write_for_country, country_names)
Example #2
0
def main():
    """Command-line entry point for downloading descriptions.

    Takes the Wikipedia file, Wikidata file, output directory, popularity
    file and languages from ``parse_args()``, creates the output directory
    if it is missing, and calls the Wikipedia downloader. The Wikidata
    downloader is called only when a Wikidata file was given and exists on
    disk; otherwise a warning describing the actual reason is logged.
    """
    log.setLevel(logging.WARNING)
    wikipediaapi.log.setLevel(logging.WARNING)
    args = parse_args()
    wikipedia_file = args.wikipedia
    wikidata_file = args.wikidata
    output_dir = args.output_dir
    popularity_file = args.popularity
    # args.langs is an iterable of iterables; flatten it into one list.
    langs = list(itertools.chain.from_iterable(args.langs))
    os.makedirs(output_dir, exist_ok=True)
    checker = check_and_get_checker(popularity_file)
    download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)
    if wikidata_file is None:
        log.warning("Wikidata file not set.")
    elif os.path.exists(wikidata_file):
        download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
    else:
        # A path was supplied but nothing exists there; report that
        # instead of repeating the "not set" message.
        log.warning(f"Wikidata file ({wikidata_file}) does not exist.")
    def apply(self, env: Env):
        """Run the generator tool, then call both description downloaders.

        The generator tool is invoked with the ``dump_wikipedia_urls`` and
        ``idToWikidata`` options set to the matching ``env.paths`` entries.
        The Wikipedia and Wikidata downloaders are then called with a fixed
        tuple of languages, both targeting ``env.paths.descriptions_path``.

        NOTE(review): the enclosing class is not visible here, and this
        method may continue beyond the lines shown - confirm before relying
        on this summary being complete.
        """
        run_gen_tool(
            env.gen_tool,
            out=env.get_subprocess_out(),
            err=env.get_subprocess_out(),
            intermediate_data_path=env.paths.intermediate_data_path,
            user_resource_path=env.paths.user_resource_path,
            dump_wikipedia_urls=env.paths.wiki_url_path,
            idToWikidata=env.paths.id_to_wikidata_path,
        )

        # Fixed set of language codes passed to both downloaders.
        langs = ("en", "ru", "es", "fr", "de")
        # Built from the popularity file path; passed to both downloaders.
        checker = check_and_get_checker(env.paths.popularity_path)
        download_from_wikipedia_tags(env.paths.wiki_url_path,
                                     env.paths.descriptions_path, langs,
                                     checker)
        download_from_wikidata_tags(env.paths.id_to_wikidata_path,
                                    env.paths.descriptions_path, langs,
                                    checker)