def _gather_wd_data(catalog, entity, works, people):
    # Works IDs
    data_gathering.gather_target_ids(
        target_database.get_work_type(catalog, entity),
        catalog,
        target_database.get_work_pid(catalog),
        works,
    )
    # People IDs
    data_gathering.gather_target_ids(
        entity, catalog, target_database.get_person_pid(catalog), people
    )
def bio(
    catalog: str, entity: str, wd_cache=None
) -> Tuple[DefaultDict, Iterator, Dict]:
    """Validate identifiers against available biographical data.

    Look for:

    - birth and death dates
    - birth and death places
    - gender

    Also generate statements based on additional data found in the given
    catalog. They can be used to enrich Wikidata items.

    **How it works:**

    1. gather data from the given catalog
    2. gather data from relevant Wikidata items
    3. look for shared data between pairs of Wikidata and catalog items:

       - when the pair does not share any data, the catalog identifier
         should be marked with a deprecated rank
       - when the catalog item has more data than the Wikidata one,
         it should be added to the latter

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of biographical data gathered from
      Wikidata in a previous run
    :return: 3 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``generator`` of statements that should be added
      3. ``dict`` of biographical data gathered from Wikidata

    """
    # Target catalog side first:
    # enable early return in case of no target data
    target_bio = data_gathering.gather_target_biodata(entity, catalog)
    if target_bio is None:
        return None, None, None

    to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set)

    # Wikidata side
    if wd_cache is None:
        wd_bio = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_bio,
        )
        data_gathering.gather_wikidata_biodata(wd_bio)
    else:
        wd_bio = wd_cache

    # Validation
    _validate(keys.BIODATA, wd_bio, target_bio, to_be_deprecated, to_be_added)

    return to_be_deprecated, _bio_to_be_added_generator(to_be_added), wd_bio
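# Illustrative usage sketch for ``bio`` (not part of the module): it assumes a
# populated target database and the module's own environment; the
# 'discogs' / 'musician' pair is just one choice among the supported values
# listed in the docstring.
deprecate, add_statements, wd_bio_cache = bio('discogs', 'musician')
if deprecate is not None:
    print(len(deprecate), 'identifiers to be deprecated')
    for statement in add_statements:  # generator of statements to be added
        print(statement)
    # Reuse the cached Wikidata biographical data to skip re-gathering
    deprecate, add_statements, _ = bio('discogs', 'musician', wd_cache=wd_bio_cache)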
def links(
    catalog: str, entity: str, wd_cache=None
) -> Tuple[DefaultDict, List, List, Dict]:
    """Validate identifiers against available links.

    Also generate statements based on additional links found in the given
    catalog. They can be used to enrich Wikidata items.

    **How it works:**

    1. gather links from the given catalog
    2. gather links from relevant Wikidata items
    3. look for shared links between pairs of Wikidata and catalog items:

       - when the pair does not share any link, the catalog identifier
         should be marked with a deprecated rank
       - when the catalog item has more links than the Wikidata one,
         they should be added to the latter

    4. try to extract third-party identifiers from extra links

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
      in a previous run
    :return: 4 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``list`` of third-party identifiers that should be added
      3. ``list`` of URLs that should be added
      4. ``dict`` of links gathered from Wikidata

    """
    # Target catalog side first:
    # enable early return in case of no target links
    target_links = data_gathering.gather_target_links(entity, catalog)
    if target_links is None:
        return None, None, None, None

    to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set)

    # Wikidata side
    url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()
    if wd_cache is None:
        wd_links = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_links,
        )
        data_gathering.gather_wikidata_links(
            wd_links, url_pids, ext_id_pids_to_urls
        )
    else:
        wd_links = wd_cache

    # Validation
    _validate(keys.LINKS, wd_links, target_links, to_be_deprecated, to_be_added)

    # Separate external IDs from URLs
    ext_ids_to_be_added, urls_to_be_added = data_gathering.extract_ids_from_urls(
        to_be_added, ext_id_pids_to_urls
    )

    LOGGER.info(
        'Validation completed. Target: %s %s. '
        'IDs to be deprecated: %d. '
        'Third-party IDs to be added: %d. '
        'URL statements to be added: %d',
        catalog,
        entity,
        len(to_be_deprecated),
        len(ext_ids_to_be_added),
        len(urls_to_be_added),
    )

    return to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, wd_links
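# Illustrative usage sketch for this 4-output version of ``links`` (not part
# of the module): 'imdb' / 'actor' is an example pair, and the Wikidata links
# cache returned by the first call can be reused across validation runs.
deprecate, ext_ids_to_add, urls_to_add, wd_links_cache = links('imdb', 'actor')
if deprecate is not None:
    print(len(ext_ids_to_add), 'third-party IDs and', len(urls_to_add), 'URLs to add')
    # Follow-up run without hitting Wikidata again
    links('imdb', 'actor', wd_cache=wd_links_cache)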
def dead_ids(
    catalog: str, entity: str, wd_cache=None
) -> Tuple[DefaultDict, Dict]:
    """Look for dead identifiers in Wikidata.

    An identifier is dead if it does not exist in the given catalog
    when this function is executed.

    Dead identifiers should be marked with a deprecated rank in Wikidata.

    **How it works:**

    1. gather identifiers of the given catalog from relevant Wikidata items
    2. look them up in the given catalog
    3. if an identifier is not in the given catalog anymore,
       it should be deprecated

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of identifiers gathered from
      Wikidata in a previous run
    :return: the ``dict`` pair of dead identifiers
      and identifiers gathered from Wikidata
    """
    dead = defaultdict(set)
    db_entity = target_database.get_main_entity(catalog, entity)

    # Wikidata side
    if wd_cache is None:
        wd_ids = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_ids,
        )
    else:
        wd_ids = wd_cache

    # Target catalog side
    session = DBManager.connect_to_db()
    try:
        for qid in wd_ids:
            for tid in wd_ids[qid][keys.TID]:
                existing = (
                    session.query(db_entity.catalog_id)
                    .filter_by(catalog_id=tid)
                    .count()
                )
                if existing == 0:
                    LOGGER.debug(
                        '%s %s identifier %s is dead', qid, catalog, tid
                    )
                    dead[tid].add(qid)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of target catalog identifiers due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
    finally:
        session.close()

    LOGGER.info(
        'Check completed. Target: %s %s. Total dead identifiers: %d',
        catalog,
        entity,
        len(dead),
    )

    return dead, wd_ids
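# Illustrative usage sketch for ``dead_ids`` (not part of the module): assumes
# a reachable target database; 'musicbrainz' / 'band' is an example pair.
dead, wd_ids_cache = dead_ids('musicbrainz', 'band')
for tid, qids in dead.items():
    # Each dead target identifier maps to the Wikidata QIDs that still hold it
    print(tid, 'is dead; found on', len(qids), 'Wikidata item(s)')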
def build_wikidata(
    goal: str, catalog: str, entity: str, dir_io: str
) -> JsonReader:
    """Build a Wikidata dataset for training or classification purposes:
    workflow step 1.

    Data is gathered from the
    `SPARQL endpoint <https://query.wikidata.org/>`_ and the
    `Web API <https://www.wikidata.org/w/api.php>`_.

    **How it works:**

    1. gather relevant Wikidata items that *hold* (for *training*)
       or *lack* (for *classification*) identifiers of the given catalog
    2. gather relevant item data
    3. dump the dataset to a gzipped `JSON Lines <http://jsonlines.org/>`_ file
    4. read the dataset into a generator of :class:`pandas.DataFrame` chunks
       for memory-efficient processing

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    qids_and_tids, wd_io_path = _handle_goal(goal, catalog, entity, dir_io)
    catalog_pid = target_database.get_catalog_pid(catalog, entity)

    if not os.path.isfile(wd_io_path):
        LOGGER.info(
            "Building Wikidata %s set for %s %s, output file '%s' ...",
            goal,
            catalog,
            entity,
            wd_io_path,
        )

        # Make working folders
        os.makedirs(os.path.dirname(wd_io_path), exist_ok=True)

        # 1. Gather Wikidata items
        if goal == 'training':
            # WITH target IDs
            data_gathering.gather_target_ids(
                entity, catalog, catalog_pid, qids_and_tids
            )
            qids = qids_and_tids.keys()
        elif goal == 'classification':
            # WITHOUT target IDs
            qids = data_gathering.gather_qids(entity, catalog, catalog_pid)

        # 2. Collect relevant data, and 3. dump to gzipped JSON Lines
        url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()
        with gzip.open(wd_io_path, 'wt') as wd_io:
            api_requests.get_data_for_linker(
                catalog,
                entity,
                qids,
                url_pids,
                ext_id_pids_to_urls,
                qids_and_tids,
                wd_io,
            )

    # Cached dataset, for development purposes
    else:
        LOGGER.info(
            "Will reuse existing Wikidata %s set: '%s'", goal, wd_io_path
        )
        if goal == 'training':
            _reconstruct_qids_and_tids(wd_io_path, qids_and_tids)

    LOGGER.info('Wikidata %s set built', goal)

    return pd.read_json(wd_io_path, lines=True, chunksize=1000)
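# Illustrative usage sketch for ``build_wikidata`` (not part of the module):
# assumes network access to the Wikidata SPARQL endpoint and Web API.
# 'training' / 'discogs' / 'musician' are example values; 'work_dir' is a
# hypothetical input/output directory.
wd_chunks = build_wikidata('training', 'discogs', 'musician', 'work_dir')
for chunk in wd_chunks:
    # Each chunk is a pandas.DataFrame of up to 1,000 Wikidata items
    print(chunk.shape)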
def links(
    catalog: str, entity: str, url_blacklist=False, wd_cache=None
) -> Optional[Tuple[defaultdict, list, list, list, list, list, dict]]:
    """Validate identifiers against available links.

    Also generate statements based on additional links found in the target
    catalog. They can be used to enrich Wikidata items.

    **How it works:**

    1. gather links from the target catalog
    2. gather links from relevant Wikidata items
    3. look for shared links between pairs of Wikidata and catalog items:

       - when the pair does not share any link, the catalog identifier
         should be marked with a deprecated rank
       - when the catalog item has more links than the Wikidata one,
         they should be added to the latter

    4. try to extract third-party identifiers from extra links

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param url_blacklist: (optional) whether to apply a blacklist
      of URL domains. Default: ``False``
    :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
      in a previous run. Default: ``None``
    :return: 7 objects

      1. ``dict`` of identifiers that should be deprecated
      2. ``list`` of third-party identifiers that should be added
      3. ``list`` of URLs that should be added
      4. ``list`` of third-party identifiers that should be referenced
      5. ``list`` of URLs that should be referenced
      6. ``list`` of URLs found in Wikidata but not in the target catalog
      7. ``dict`` of links gathered from Wikidata

      or ``None`` if the target catalog has no links.

    """
    # Target catalog side first:
    # enable early return in case of no target links
    target_links = data_gathering.gather_target_links(entity, catalog)
    if target_links is None:
        return None

    deprecate, add = defaultdict(set), defaultdict(set)
    reference, wd_only = defaultdict(set), defaultdict(set)

    # Wikidata side
    url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids()
    if wd_cache is None:
        wd_links = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_links,
        )
        data_gathering.gather_wikidata_links(
            wd_links, url_pids, ext_id_pids_to_urls
        )
    else:
        wd_links = wd_cache

    # Validation
    _validate(
        keys.LINKS, wd_links, target_links, deprecate, add, reference, wd_only
    )

    # URLs to be added:
    # 1. Separate external IDs from URLs
    add_ext_ids, add_urls = data_gathering.extract_ids_from_urls(
        add, ext_id_pids_to_urls
    )
    # 2. Apply URL blacklist
    if url_blacklist:
        add_urls = _apply_url_blacklist(add_urls)

    # URLs to be referenced: separate external IDs from URLs
    ref_ext_ids, ref_urls = data_gathering.extract_ids_from_urls(
        reference, ext_id_pids_to_urls
    )

    # Wikidata-only URLs: convert into a list of statements
    # with complete Wikidata item URLs
    wd_only_urls = []
    for (qid, tid), urls in wd_only.items():
        for url in urls:
            wd_only_urls.append((tid, url, QID_PREFIX + qid))

    LOGGER.info(
        'Validation completed. Target: %s %s. '
        'IDs to be deprecated: %d. '
        'Third-party IDs to be added: %d. '
        'URL statements to be added: %d. '
        'Third-party IDs to be referenced: %d. '
        'URL statements to be referenced: %d. '
        'URLs in Wikidata but not in the target: %d',
        catalog,
        entity,
        len(deprecate),
        len(add_ext_ids),
        len(add_urls),
        len(ref_ext_ids),
        len(ref_urls),
        len(wd_only_urls),
    )

    return (
        deprecate,
        add_ext_ids,
        add_urls,
        ref_ext_ids,
        ref_urls,
        wd_only_urls,
        wd_links,
    )
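# Illustrative usage sketch for this 7-output version of ``links`` (not part
# of the module): 'discogs' / 'band' is an example pair; enabling the URL
# blacklist filters the URLs to be added by domain.
result = links('discogs', 'band', url_blacklist=True)
if result is not None:
    (deprecate, add_ext_ids, add_urls,
     ref_ext_ids, ref_urls, wd_only_urls, wd_links_cache) = result
    # wd_only_urls holds (target ID, URL, Wikidata item URL) triples
    print(len(wd_only_urls), 'URLs found in Wikidata only')
    # Reuse the Wikidata cache in a follow-up run
    links('discogs', 'band', url_blacklist=True, wd_cache=wd_links_cache)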