Example #1
def _upload(catalog, entity, to_deprecate, to_add, sandbox):
    catalog_qid = target_database.get_catalog_qid(catalog)
    LOGGER.info('Starting deprecation of %s IDs ...', catalog)
    wikidata_bot.delete_or_deprecate_identifiers('deprecate', catalog, entity,
                                                 to_deprecate, sandbox)
    LOGGER.info('Starting addition of statements to Wikidata ...')
    wikidata_bot.add_people_statements(to_add, catalog_qid, sandbox)
    return catalog_qid
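A minimal usage sketch, assuming the surrounding module context (LOGGER, wikidata_bot, target_database) is available; the catalog and entity strings, the identifiers, and the statement tuple shape are placeholders, since this snippet does not show what add_people_statements actually expects.

# Hypothetical call to _upload; every literal below is a placeholder.
to_deprecate = {'0001': ['Q123']}           # catalog ID -> QIDs whose claims get deprecated
to_add = [('Q123', 'P569', '1950-01-01')]   # assumed (QID, PID, value) statements
catalog_qid = _upload('my_catalog', 'person', to_deprecate, to_add, sandbox=True)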
Example #2
import csv
from typing import Iterable, Tuple


def _handle_result(
    result: Iterable[Tuple[str, str, str]],
    origin: str,
    catalog: str,
    path_out: str,
    upload: bool,
    sandbox: bool,
):
    if upload:
        to_upload = set()  # In-memory copy of the result generator

    with open(path_out, 'w', 1) as fout:  # buffering=1: line-buffered output
        writer = csv.writer(fout)
        for statement in result:
            writer.writerow(statement)
            if upload:
                to_upload.add(statement)

    if upload:
        wikidata_bot.add_people_statements(to_upload, catalog, sandbox)

    LOGGER.info('%s %s dumped to %s', catalog, origin, path_out)
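A hedged driver for _handle_result with an in-memory result; the statement triples match the Iterable[Tuple[str, str, str]] annotation, while the origin label, catalog name, and output path are made up for illustration.

# Hypothetical input, kept offline (upload=False) so nothing is written to Wikidata.
fake_result = [
    ('Q42', 'P569', '1952-03-11'),
    ('Q42', 'P570', '2001-05-11'),
]
_handle_result(
    fake_result,
    origin='works',            # free-text label, only used in the final log line
    catalog='my_catalog',      # placeholder catalog name
    path_out='/tmp/statements.csv',
    upload=False,
    sandbox=False,
)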
Example #3
def _upload_result(catalog, entity, to_deprecate, urls_to_add, ext_ids_to_add,
                   sandbox):
    catalog_qid = _upload(catalog, entity, to_deprecate, urls_to_add, sandbox)
    LOGGER.info('Starting addition of external IDs to Wikidata ...')
    wikidata_bot.add_people_statements(ext_ids_to_add, catalog_qid, sandbox)
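Again purely hypothetical, with placeholder identifiers; the tuples follow the column layouts documented in Example #5's docstring (QID, P2888, URL, catalog ID and QID, third-party PID, third-party ID, catalog ID).

# All literals are placeholders; P2888 ('exact match') comes from the documented URL layout.
urls_to_add = [('Q123', 'P2888', 'https://www.example.org/artist/1', '0001')]
ext_ids_to_add = [('Q123', 'P1234', 'abc123', '0001')]  # P1234 stands in for a third-party PID
_upload_result('my_catalog', 'person', {'0001': ['Q123']},
               urls_to_add, ext_ids_to_add, sandbox=True)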
Example #4
def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
    """Validate identifiers against biographical data.

    Look for birth/death dates, birth/death places, gender.

    Dump 4 output files:

    1. catalog IDs to be deprecated.
    JSON format: {catalog_ID: [list of QIDs]}

    2. statements to be added.
    CSV format: QID,PID,value,catalog_ID

    3. shared statements to be referenced.
    Same format as file #2

    4. statements found in Wikidata but not in the target catalog.
    CSV format: catalog_ID,PID_URL,value,QID_URL

    You can pass the '-u' flag to upload the output to Wikidata.
    """
    criterion = 'bio'
    # Output paths
    deprecate_path = os.path.join(
        dir_io,
        IDS_TO_BE_DEPRECATED_FNAME.format(catalog=catalog,
                                          entity=entity,
                                          criterion=criterion),
    )
    add_path = os.path.join(
        dir_io,
        BIO_STATEMENTS_TO_BE_ADDED_FNAME.format(catalog=catalog,
                                                entity=entity),
    )
    ref_path = os.path.join(
        dir_io,
        SHARED_STATEMENTS_FNAME.format(catalog=catalog,
                                       entity=entity,
                                       criterion=criterion),
    )
    wd_stmts_path = os.path.join(
        dir_io,
        WD_STATEMENTS_FNAME.format(criterion=criterion,
                                   catalog=catalog,
                                   entity=entity),
    )
    wd_cache_path = os.path.join(
        dir_io,
        WD_CACHE_FNAME.format(catalog=catalog,
                              entity=entity,
                              criterion=criterion),
    )

    # Wikidata cache
    wd_cache = None
    if os.path.isfile(wd_cache_path):
        with open(wd_cache_path, 'rb') as cin:
            wd_cache = pickle.load(cin)
        LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)

    # Run validation
    result = bio(catalog, entity, wd_cache=wd_cache)

    # Nothing to do: the catalog doesn't contain biographical data
    if result is None:
        return

    # Unpack the result tuple
    deprecate, add, reference, wd_stmts, wd_cache = result
    # Dump output files
    _dump_deprecated(deprecate, deprecate_path)
    _dump_csv_output(add, add_path, 'statements to be added')
    _dump_csv_output(reference, ref_path, 'shared statements to be referenced')
    _dump_csv_output(
        wd_stmts,
        wd_stmts_path,
        f'statements in Wikidata but not in {catalog} {entity}',
    )

    # Dump Wikidata cache
    if dump_wikidata:
        try:
            with open(wd_cache_path, 'wb') as cout:
                # Using the highest protocol available for the current Python
                # version should be the most efficient solution
                pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL)
            LOGGER.info(
                'Biographical data gathered from Wikidata dumped to %s',
                wd_cache_path,
            )
        except MemoryError:
            LOGGER.warning('Could not pickle the Wikidata cache: memory error')

    # Upload the output to Wikidata:
    # deprecate, add, reference
    if upload:
        if sandbox:
            LOGGER.info(
                'Running on the Wikidata sandbox item %s ...',
                vocabulary.SANDBOX_2,
            )
        LOGGER.info('Starting deprecation of %s IDs ...', catalog)
        wikidata_bot.delete_or_deprecate_identifiers('deprecate', catalog,
                                                     entity, deprecate,
                                                     sandbox)
        LOGGER.info('Starting addition of extra statements to Wikidata ...')
        wikidata_bot.add_people_statements(catalog, add, criterion, sandbox)
        LOGGER.info(
            'Starting referencing of shared statements in Wikidata ...')
        wikidata_bot.add_people_statements(catalog, reference, criterion,
                                           sandbox)
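The docstring above fixes the on-disk formats; here is a small, hypothetical read-back of the first two output files. The file names are placeholders: the real ones come from IDS_TO_BE_DEPRECATED_FNAME and BIO_STATEMENTS_TO_BE_ADDED_FNAME.

import csv
import json

# 1. Catalog IDs to be deprecated: {catalog_ID: [list of QIDs]}
with open('ids_to_be_deprecated.json') as fin:  # placeholder path
    deprecate = json.load(fin)

# 2. Statements to be added: QID,PID,value,catalog_ID rows
with open('bio_statements_to_be_added.csv', newline='') as fin:  # placeholder path
    for qid, pid, value, catalog_id in csv.reader(fin):
        print(qid, pid, value, catalog_id)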
Example #5
def links_cli(catalog, entity, blacklist, upload, sandbox, dump_wikidata,
              dir_io):
    """Validate identifiers against links.

    Dump 6 output files:

    1. catalog IDs to be deprecated.
    JSON format: {catalog_ID: [list of QIDs]}

    2. third-party IDs to be added.
    CSV format: QID,third-party_PID,third-party_ID,catalog_ID

    3. URLs to be added.
    CSV format: QID,P2888,URL,catalog_ID

    4. third-party IDs to be referenced.
    Same format as file #2

    5. URLs to be referenced.
    Same format as file #3

    6. URLs found in Wikidata but not in the target catalog.
    CSV format: catalog_ID,URL,QID_URL

    You can pass the '-u' flag to upload the output to Wikidata.

    The '-b' flag applies a URL blacklist of low-quality Web domains to file #3.
    """
    criterion = 'links'
    # Output paths
    deprecate_path = os.path.join(
        dir_io,
        IDS_TO_BE_DEPRECATED_FNAME.format(catalog=catalog,
                                          entity=entity,
                                          criterion=criterion),
    )
    add_ext_ids_path = os.path.join(
        dir_io,
        EXT_IDS_FNAME.format(catalog=catalog, entity=entity, task='added'),
    )
    add_urls_path = os.path.join(
        dir_io, URLS_FNAME.format(catalog=catalog, entity=entity,
                                  task='added'))
    ref_ext_ids_path = os.path.join(
        dir_io,
        EXT_IDS_FNAME.format(catalog=catalog, entity=entity,
                             task='referenced'),
    )
    ref_urls_path = os.path.join(
        dir_io,
        URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'),
    )
    wd_urls_path = os.path.join(
        dir_io,
        WD_STATEMENTS_FNAME.format(criterion=criterion,
                                   catalog=catalog,
                                   entity=entity),
    )
    wd_cache_path = os.path.join(
        dir_io,
        WD_CACHE_FNAME.format(catalog=catalog,
                              entity=entity,
                              criterion=criterion),
    )

    # Wikidata cache
    wd_cache = None
    if os.path.isfile(wd_cache_path):
        with open(wd_cache_path, 'rb') as cin:
            wd_cache = pickle.load(cin)
        LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)

    # Run validation
    result = links(catalog, entity, url_blacklist=blacklist, wd_cache=wd_cache)

    # Nothing to do: the catalog doesn't contain links
    if result is None:
        return

    # Unpack the result tuple
    (
        deprecate,
        add_ext_ids,
        add_urls,
        ref_ext_ids,
        ref_urls,
        wd_urls,
        wd_cache,
    ) = result
    # Dump output files
    _dump_deprecated(deprecate, deprecate_path)
    _dump_csv_output(add_ext_ids, add_ext_ids_path,
                     'third-party IDs to be added')
    _dump_csv_output(add_urls, add_urls_path, 'URLs to be added')
    _dump_csv_output(ref_ext_ids, ref_ext_ids_path,
                     'shared third-party IDs to be referenced')
    _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
    _dump_csv_output(wd_urls, wd_urls_path,
                     f'Wikidata URLs not in {catalog} {entity}')

    # Dump Wikidata cache
    if dump_wikidata:
        try:
            with open(wd_cache_path, 'wb') as cout:
                # Using the highest protocol available for the current Python
                # version should be the most efficient solution
                pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL)
            LOGGER.info('URLs gathered from Wikidata dumped to %s',
                        wd_cache_path)
        except MemoryError:
            LOGGER.warning('Could not pickle the Wikidata cache: memory error')

    # Upload the output to Wikidata
    if upload:
        if sandbox:
            LOGGER.info(
                'Running on the Wikidata sandbox item %s ...',
                vocabulary.SANDBOX_2,
            )
        LOGGER.info('Starting deprecation of %s IDs ...', catalog)
        wikidata_bot.delete_or_deprecate_identifiers('deprecate', catalog,
                                                     entity, deprecate,
                                                     sandbox)
        LOGGER.info('Starting addition of external IDs to Wikidata ...')
        wikidata_bot.add_people_statements(catalog, add_ext_ids, criterion,
                                           sandbox)
        LOGGER.info('Starting addition of URLs to Wikidata ...')
        wikidata_bot.add_people_statements(catalog, add_urls, criterion,
                                           sandbox)
        LOGGER.info(
            'Starting referencing of shared external IDs in Wikidata ...')
        wikidata_bot.add_people_statements(catalog, ref_ext_ids, criterion,
                                           sandbox)
        LOGGER.info('Starting referencing of shared URLs in Wikidata ...')
        wikidata_bot.add_people_statements(catalog, ref_urls, criterion,
                                           sandbox)
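The '-b' option described in the docstring amounts to dropping URL statements whose domain is on a blacklist of low-quality websites; the sketch below is a self-contained approximation of that idea, not the project's actual implementation, and the blacklist contents are invented.

from urllib.parse import urlparse

# Hypothetical blacklist; the real one ships with the project.
URL_BLACKLIST = {'linkfarm.example.com', 'low-quality.example.org'}

def keep_url(statement):
    # statement follows the documented layout: QID, P2888, URL, catalog_ID
    _, _, url, _ = statement
    return urlparse(url).netloc not in URL_BLACKLIST

add_urls = [
    ('Q123', 'P2888', 'https://linkfarm.example.com/page', '0001'),
    ('Q123', 'P2888', 'https://artist.example.net/', '0001'),
]
add_urls = [stmt for stmt in add_urls if keep_url(stmt)]  # only the second row survives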