Ejemplo n.º 1
0
def _gather_wd_data(catalog, entity, works, people):
    """Gather target catalog IDs for works and for people.

    Two ``data_gathering.gather_target_ids`` calls are made: the first
    uses the work type and work PID looked up for ``catalog`` and is given
    ``works``; the second uses ``entity`` itself with the person PID and
    is given ``people``.

    :param catalog: catalog passed to the ``target_database`` lookups
    :param entity: entity used to look up the work type, and used
      directly for the people call
    :param works: passed through as the last argument of the works call
    :param people: passed through as the last argument of the people call
    """
    # Works IDs: resolve the work-specific type and PID first
    work_type = target_database.get_work_type(catalog, entity)
    work_pid = target_database.get_work_pid(catalog)
    data_gathering.gather_target_ids(work_type, catalog, work_pid, works)

    # People IDs: the entity is used as is
    person_pid = target_database.get_person_pid(catalog)
    data_gathering.gather_target_ids(entity, catalog, person_pid, people)
Ejemplo n.º 2
0
def add_people_statements(catalog: str, statements: Iterable, criterion: str,
                          sandbox: bool) -> None:
    """Add statements to existing Wikidata people.

    Statements typically come from validation criteria 2 or 3
    as per :func:`soweego.validator.checks.links` and
    :func:`soweego.validator.checks.bio`.

    Each statement is logged and then handed to ``_add_or_reference``,
    together with the record linkage heuristic, the catalog QID,
    the catalog person PID, and the edit summary picked from the criterion.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param statements: iterable of
      (subject, predicate, value, catalog ID) tuples
    :param criterion: ``{'links', 'bio'}``. A supported validation criterion
    :param sandbox: whether to perform edits on the Wikidata `sandbox 2`_ item
      instead of on each statement's own subject
    :raises ValueError: if ``criterion`` is neither ``'links'`` nor ``'bio'``
    """
    # Reject an unsupported criterion up front, before any lookup
    if criterion not in ('links', 'bio'):
        raise ValueError(f"Invalid criterion: '{criterion}'. "
                         "Please use either 'links' or 'bio'")
    edit_summary = (
        LINKS_VALIDATION_SUMMARY
        if criterion == 'links'
        else BIO_VALIDATION_SUMMARY
    )

    sandbox_item = vocabulary.SANDBOX_2
    catalog_qid = target_database.get_catalog_qid(catalog)
    person_pid = target_database.get_person_pid(catalog)
    heuristic = vocabulary.RECORD_LINKAGE

    if sandbox:
        LOGGER.info('Running on the Wikidata sandbox item %s ...',
                    sandbox_item)

    for statement in statements:
        subject, predicate, value, catalog_id = statement
        LOGGER.info(
            'Processing (%s, %s, %s, %s) statement ...',
            subject,
            predicate,
            value,
            catalog_id,
        )
        # In sandbox mode every edit lands on the sandbox item
        target = sandbox_item if sandbox else subject
        _add_or_reference(
            (target, predicate, value),
            heuristic,
            catalog_qid=catalog_qid,
            catalog_pid=person_pid,
            catalog_id=catalog_id,
            edit_summary=edit_summary,
        )
Ejemplo n.º 3
0
def people_cli(catalog, statements, criterion, sandbox):
    """Add statements to Wikidata people.

    STATEMENTS must be a CSV file.
    Format: person_QID, PID, value, person_catalog_ID

    Blank lines in the file are skipped.

    If the claim already exists, just add a reference.

    Example:

    $ echo Q312387,P463,Q483407,264375 > joey.csv

    $ python -m soweego ingester people discogs joey.csv

    Result:

    claim (Joey Ramone, member of, Ramones)

    reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 264375), (retrieved, today)
    """
    # NOTE(review): the docstring above presumably doubles as the command's
    # help text, so its wording is user-facing - confirm before rephrasing.
    sandbox_item = vocabulary.SANDBOX_2
    # See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005)
    heuristic = vocabulary.RECORD_LINKAGE
    catalog_qid = target_database.get_catalog_qid(catalog)
    catalog_pid = target_database.get_person_pid(catalog)

    # Any criterion other than 'links' or 'bio' leaves the edit summary unset
    if criterion == 'links':
        edit_summary = LINKS_VALIDATION_SUMMARY
    elif criterion == 'bio':
        edit_summary = BIO_VALIDATION_SUMMARY
    else:
        edit_summary = None

    if sandbox:
        LOGGER.info('Running on the Wikidata sandbox item %s ...',
                    sandbox_item)

    stmt_reader = csv.reader(statements)
    for row in stmt_reader:
        # csv.reader yields an empty list for a blank line: unpacking it
        # into four names would raise ValueError and abort the whole run
        if not row:
            continue
        person, predicate, value, catalog_id = row
        # In sandbox mode every edit lands on the sandbox item
        subject = person if not sandbox else sandbox_item
        _add_or_reference(
            (subject, predicate, value),
            heuristic,
            catalog_qid=catalog_qid,
            catalog_pid=catalog_pid,
            catalog_id=catalog_id,
            edit_summary=edit_summary,
        )
Ejemplo n.º 4
0
def _get_works_args(catalog):
    """Build the catalog-dependent arguments for works processing.

    :param catalog: catalog passed to the ``target_database`` lookups
      and compared against ``IMDB``
    :return: ``(catalog QID, IMDb flag, person PID)`` tuple, where the
      flag is ``True`` only when ``catalog`` equals ``IMDB``
    """
    qid = target_database.get_catalog_qid(catalog)
    pid = target_database.get_person_pid(catalog)
    # The middle item is the boolean used to run IMDb-specific checks
    return qid, catalog == IMDB, pid