def add_identifiers(
    identifiers: dict, catalog: str, entity: str, sandbox: bool
) -> None:
    """Add identifier statements to existing Wikidata items.

    Every ``QID -> catalog identifier`` pair is passed to
    ``_add_or_reference``, using the catalog property as predicate
    and the catalog item as the source.

    :param identifiers: a ``{QID: catalog_identifier}`` dictionary
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param sandbox: whether to perform edits on the
      `Wikidata sandbox <https://www.wikidata.org/wiki/Q4115189>`_ item
    """
    stated_in = target_database.get_catalog_qid(catalog)
    identifier_pid = target_database.get_catalog_pid(catalog, entity)

    for wd_item, target_id in identifiers.items():
        LOGGER.info('Processing %s match: %s -> %s', catalog, wd_item, target_id)

        # Real item by default; swapped for the sandbox item on dry runs
        subject = wd_item
        if sandbox:
            LOGGER.debug(
                'Using Wikidata sandbox item %s as subject, instead of %s',
                vocabulary.SANDBOX_1,
                wd_item,
            )
            subject = vocabulary.SANDBOX_1

        _add_or_reference(subject, identifier_pid, target_id, stated_in)
def people_cli(catalog, statements, sandbox):
    """Add statements to Wikidata people.

    STATEMENTS must be a CSV file.
    Format: person_QID, PID, value

    If the claim already exists, just add a reference.

    Example:

    $ echo Q312387,P463,Q483407 > joey.csv

    $ python -m soweego ingester people discogs joey.csv

    Result:

    claim (Joey Ramone, member of, Ramones)

    reference (stated in, Discogs), (retrieved, today)
    """
    catalog_qid = target_database.get_catalog_qid(catalog)

    if sandbox:
        LOGGER.info('Running on the Wikidata sandbox item ...')

    for line in statements:
        # Each line must hold exactly 3 comma-separated fields,
        # otherwise the unpacking below raises ValueError
        person, predicate, value = line.rstrip().split(',')
        # On sandbox runs, the sandbox item replaces the person as subject
        subject = vocabulary.SANDBOX_1 if sandbox else person
        _add_or_reference(subject, predicate, value, catalog_qid)
def _upload(catalog, entity, to_deprecate, to_add, sandbox):
    """Deprecate identifiers and add statements through ``wikidata_bot``.

    Deprecation runs first, then the addition of statements.

    :param catalog: catalog name, used to look up the catalog QID
    :param entity: entity type, forwarded to the deprecation call
    :param to_deprecate: identifiers forwarded to
      ``wikidata_bot.delete_or_deprecate_identifiers``
    :param to_add: statements forwarded to
      ``wikidata_bot.add_people_statements``
    :param sandbox: flag forwarded to both ``wikidata_bot`` calls
    :return: the catalog QID given by ``target_database.get_catalog_qid``
    """
    stated_in = target_database.get_catalog_qid(catalog)

    LOGGER.info('Starting deprecation of %s IDs ...', catalog)
    wikidata_bot.delete_or_deprecate_identifiers(
        'deprecate', catalog, entity, to_deprecate, sandbox
    )

    LOGGER.info('Starting addition of statements to Wikidata ...')
    # NOTE(review): the catalog QID (not the catalog name) is passed here;
    # confirm it matches the signature of wikidata_bot.add_people_statements
    wikidata_bot.add_people_statements(to_add, stated_in, sandbox)

    return stated_in
def _set_catalog_fields(db_entity, name_field, catalog, entity):
    """Fill the catalog-related attributes of ``db_entity`` in place.

    :param db_entity: object whose attributes get assigned
    :param name_field: value stored in ``db_entity.name``
    :param catalog: catalog name, used to look up the catalog type,
      QID and PID
    :param entity: entity type, used to look up the catalog PID
    """
    db_entity.name = name_field
    db_entity.active = 1
    db_entity.note = NOTE_FIELD
    # Catalogs without a known type get an empty string
    db_entity.type = CATALOG_TYPES.get(catalog, '')

    # Only the numeric part of the QID is stored
    source_qid = target_database.get_catalog_qid(catalog)
    db_entity.source_item = int(source_qid.lstrip('Q'))

    # Same for the PID
    catalog_pid = target_database.get_catalog_pid(catalog, entity)
    db_entity.wd_prop = int(catalog_pid.lstrip('P'))

    db_entity.search_wp = SEARCH_WP_FIELD
def add_people_statements(
    catalog: str, statements: Iterable, criterion: str, sandbox: bool
) -> None:
    """Add statements to existing Wikidata people.

    Statements typically come from validation criteria 2 or 3
    as per :func:`soweego.validator.checks.links` and
    :func:`soweego.validator.checks.bio`.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param statements: iterable of
      (subject, predicate, value, catalog ID) tuples
    :param criterion: ``{'links', 'bio'}``. A supported validation criterion
    :param sandbox: whether to perform edits on the Wikidata `sandbox 2`_ item
    :raises ValueError: if ``criterion`` is neither ``'links'`` nor ``'bio'``
    """
    # Reject an unsupported criterion before any catalog lookup
    if criterion not in ('links', 'bio'):
        raise ValueError(
            f"Invalid criterion: '{criterion}'. "
            "Please use either 'links' or 'bio'"
        )
    edit_summary = (
        LINKS_VALIDATION_SUMMARY
        if criterion == 'links'
        else BIO_VALIDATION_SUMMARY
    )

    sandbox_item = vocabulary.SANDBOX_2
    catalog_qid = target_database.get_catalog_qid(catalog)
    person_pid = target_database.get_person_pid(catalog)
    heuristic = vocabulary.RECORD_LINKAGE

    if sandbox:
        LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item)

    for subject, predicate, value, catalog_id in statements:
        LOGGER.info(
            'Processing (%s, %s, %s, %s) statement ...',
            subject,
            predicate,
            value,
            catalog_id,
        )
        # On sandbox runs, the sandbox item replaces the real subject
        target = sandbox_item if sandbox else subject
        _add_or_reference(
            (target, predicate, value),
            heuristic,
            catalog_qid=catalog_qid,
            catalog_pid=person_pid,
            catalog_id=catalog_id,
            edit_summary=edit_summary,
        )
def people_cli(catalog, statements, criterion, sandbox):
    """Add statements to Wikidata people.

    STATEMENTS must be a CSV file.
    Format: person_QID, PID, value, person_catalog_ID

    If the claim already exists, just add a reference.

    Example:

    $ echo Q312387,P463,Q483407,264375 > joey.csv

    $ python -m soweego ingester people discogs joey.csv

    Result:

    claim (Joey Ramone, member of, Ramones)

    reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 264375), (retrieved, today)
    """
    sandbox_item = vocabulary.SANDBOX_2
    # See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005)
    heuristic = vocabulary.RECORD_LINKAGE
    stated_in = target_database.get_catalog_qid(catalog)
    person_pid = target_database.get_person_pid(catalog)

    # Any criterion other than 'links' or 'bio' yields no custom summary
    edit_summary = None
    if criterion == 'links':
        edit_summary = LINKS_VALIDATION_SUMMARY
    elif criterion == 'bio':
        edit_summary = BIO_VALIDATION_SUMMARY

    if sandbox:
        LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item)

    for row in csv.reader(statements):
        # Rows must hold exactly 4 fields
        person, predicate, value, catalog_id = row
        # On sandbox runs, the sandbox item replaces the person as subject
        target = sandbox_item if sandbox else person
        _add_or_reference(
            (target, predicate, value),
            heuristic,
            catalog_qid=stated_in,
            catalog_pid=person_pid,
            catalog_id=catalog_id,
            edit_summary=edit_summary,
        )
def works_cli(catalog, statements, sandbox):
    """Add statements to Wikidata works.

    STATEMENTS must be a CSV file.
    Format: work_QID, PID, person_QID, person_target_ID

    If the claim already exists, just add a reference.

    Example:

    $ echo Q4354548,P175,Q5969,139984 > cmon.csv

    $ python -m soweego ingester works discogs cmon.csv

    Result:

    claim (C'mon Everybody, performer, Eddie Cochran)

    reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 139984), (retrieved, today)
    """
    sandbox_item = vocabulary.SANDBOX_2
    # `_get_works_args` returns (catalog QID, IMDb flag, person PID):
    # unpack all three, so no separate catalog QID lookup is needed
    catalog_qid, is_imdb, person_pid = _get_works_args(catalog)
    heuristic = vocabulary.RECORD_LINKAGE

    if sandbox:
        LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item)

    stmt_reader = csv.reader(statements)
    # Rows must hold exactly 4 fields
    for work, predicate, person, person_id in stmt_reader:
        # On sandbox runs, the sandbox item replaces the work as subject
        subject = work if not sandbox else sandbox_item
        _add_or_reference_works(
            (subject, predicate, person),
            heuristic,
            catalog_qid,
            person_pid,
            person_id,
            is_imdb=is_imdb,
            edit_summary=WORKS_SUMMARY,
        )
def add_works_statements(
    statements: Iterable, catalog: str, sandbox: bool
) -> None:
    """Add statements to existing Wikidata works.

    Statements typically come from
    :func:`soweego.validator.enrichment.generate_statements`.

    :param statements: iterable of
      (work QID, predicate, person QID, person target ID) tuples
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param sandbox: whether to perform edits on the Wikidata `sandbox 2`_ item
    """
    sandbox_item = vocabulary.SANDBOX_2
    # `_get_works_args` returns (catalog QID, IMDb flag, person PID):
    # unpack all three, so no separate catalog QID lookup is needed
    catalog_qid, is_imdb, person_pid = _get_works_args(catalog)
    heuristic = vocabulary.RECORD_LINKAGE

    if sandbox:
        LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item)

    for work, predicate, person, person_id in statements:
        LOGGER.info(
            'Processing (%s, %s, %s, %s) statement',
            work,
            predicate,
            person,
            person_id,
        )
        # On sandbox runs, the sandbox item replaces the work as subject
        subject = work if not sandbox else sandbox_item
        _add_or_reference_works(
            (subject, predicate, person),
            heuristic,
            catalog_qid,
            person_pid,
            person_id,
            is_imdb=is_imdb,
            edit_summary=WORKS_SUMMARY,
        )
def add_people_statements(
    statements: Iterable, catalog: str, sandbox: bool
) -> None:
    """Add statements to existing Wikidata people.

    Statements typically come from validation criteria 2 or 3
    as per :func:`soweego.validator.checks.links` and
    :func:`soweego.validator.checks.bio`.

    :param statements: iterable of (subject, predicate, value) triples
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param sandbox: whether to perform edits on the
      `Wikidata sandbox <https://www.wikidata.org/wiki/Q4115189>`_ item
    """
    stated_in = target_database.get_catalog_qid(catalog)

    for person, predicate, value in statements:
        LOGGER.info('Processing (%s, %s, %s) statement', person, predicate, value)
        # On sandbox runs, the sandbox item replaces the person as subject
        subject = vocabulary.SANDBOX_1 if sandbox else person
        _add_or_reference(subject, predicate, value, stated_in)
def _get_works_args(catalog):
    """Collect the catalog-dependent arguments for works statements.

    :param catalog: catalog name, compared against ``IMDB`` and used
      for the ``target_database`` lookups
    :return: a ``(catalog QID, IMDb flag, person PID)`` tuple
    """
    # Flag that switches on IMDb-specific checks
    run_imdb_checks = catalog == IMDB
    stated_in = target_database.get_catalog_qid(catalog)
    pid = target_database.get_person_pid(catalog)
    return stated_in, run_imdb_checks, pid