def gather_target_links(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info('Gathering %s %s links ...', catalog, entity_type)
    link_entity = catalog_entity['link_entity']

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        if count == 0:
            LOGGER.warning(
                "No links available for %s %s. Stopping validation here",
                catalog, entity_type)
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity_type)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None
    for row in result:
        yield row.catalog_id, row.url
def gather_target_metadata(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    entity = catalog_entity['entity']
    # Base metadata
    query_fields = _build_metadata_query_fields(entity, entity_type, catalog)

    session = DBManager.connect_to_db()
    result = None
    try:
        result = _run_metadata_query(
            session, query_fields, entity, catalog, entity_type)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if not result:
        return None
    return _parse_target_metadata_query_result(result)
def check_links_cli(catalog: str):
    """
    Check for rotten URLs of an imported catalog.

    :param catalog: one of the keys of constants.TARGET_CATALOGS
    """
    for entity_type in target_database.supported_entities_for_target(catalog):
        LOGGER.info("Validating %s %s links...", catalog, entity_type)
        entity = target_database.get_link_entity(catalog, entity_type)
        if not entity:
            LOGGER.info(
                "%s %s does not have a links table. Skipping...",
                catalog,
                entity_type,
            )
            continue

        session = DBManager.connect_to_db()
        total = session.query(entity).count()
        removed = 0

        with Pool() as pool:
            # Validate each link
            for resolved, res_entity in tqdm(
                pool.imap_unordered(_resolve_url, session.query(entity)),
                total=total,
            ):
                if not resolved:
                    # Drop the link from the DB when it does not resolve
                    session_delete = DBManager.connect_to_db()
                    session_delete.delete(res_entity)
                    try:
                        session_delete.commit()
                        removed += 1
                    except:
                        session_delete.rollback()
                        raise
                    finally:
                        session_delete.close()

        session.close()
        LOGGER.info(
            "Removed %s/%s from %s %s", removed, total, catalog, entity_type)
def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False)

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
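# Illustrative usage sketch for build_target (not part of the original module).
# It assumes the Toolforge DB credentials are already configured for DBManager
# and that `target_ids` holds catalog identifiers gathered elsewhere.
def _example_build_target(target_ids):
    # Stream the classification set for Discogs musicians in 1000-row chunks
    for chunk in build_target('classification', 'discogs', 'musician',
                              target_ids):
        # Each chunk is a pandas.DataFrame joining base, link, and NLP tables
        print(chunk.shape)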
def perfect_name_search(target_entity: T, to_search: str) -> Iterable[T]:
    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(
                target_entity.name == to_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
def perfect_name_search(target_entity: constants.DB_ENTITY,
                        to_search: str) -> Iterable[constants.DB_ENTITY]:
    session = DBManager.connect_to_db()
    try:
        for r in (session.query(target_entity).filter(
                target_entity.name == to_search).all()):
            yield r
    except:
        session.rollback()
        raise
    finally:
        session.close()
def name_fulltext_search(target_entity: T, query: str) -> Iterable[T]:
    ft_search = target_entity.name.match(query)

    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(ft_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
def tokens_fulltext_search(
    target_entity: constants.DB_ENTITY,
    boolean_mode: bool,
    tokens: Iterable[str],
    where_clause=None,
    limit: int = 10,
) -> Iterable[constants.DB_ENTITY]:
    if issubclass(target_entity, models.base_entity.BaseEntity):
        column = target_entity.name_tokens
    elif issubclass(target_entity, models.base_link_entity.BaseLinkEntity):
        column = target_entity.url_tokens
    elif issubclass(target_entity, models.base_nlp_entity.BaseNlpEntity):
        column = target_entity.description_tokens
    else:
        LOGGER.critical('Bad target entity class: %s', target_entity)
        raise ValueError('Bad target entity class: %s' % target_entity)

    tokens = filter(None, tokens)
    terms = (' '.join(map('+{0}'.format, tokens))
             if boolean_mode else ' '.join(tokens))
    ft_search = column.match(terms)

    session = DBManager.connect_to_db()
    try:
        if where_clause is None:
            query = session.query(target_entity).filter(ft_search).limit(limit)
        else:
            query = (session.query(target_entity).filter(ft_search).filter(
                where_clause).limit(limit))

        count = query.count()
        if count == 0:
            LOGGER.debug(
                "No result from full-text index query to %s. Terms: '%s'",
                target_entity.__name__,
                terms,
            )
            session.commit()
        else:
            for row in query:
                yield row
            session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
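# Hedged usage sketch for tokens_fulltext_search (assumption:
# DiscogsMusicianEntity is an imported BaseEntity subclass). In boolean mode
# every token is prefixed with '+', so all tokens must match the MySQL
# full-text index on the name_tokens column.
def _example_tokens_search():
    for candidate in tokens_fulltext_search(
            DiscogsMusicianEntity, True, ['john', 'lennon'], limit=5):
        print(candidate.catalog_id, candidate.name)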
def check_existence(class_or_occupation_query, class_qid, catalog_pid,
                    entity: BaseEntity):
    query_type = 'identifier', class_or_occupation_query
    session = DBManager.connect_to_db()
    invalid = defaultdict(set)
    count = 0

    for result in sparql_queries.run_identifier_or_links_query(
            query_type, class_qid, catalog_pid, 0):
        for qid, target_id in result.items():
            results = session.query(entity).filter(
                entity.catalog_id == target_id).all()
            if not results:
                LOGGER.warning('%s identifier %s is invalid', qid, target_id)
                invalid[target_id].add(qid)
                count += 1

    LOGGER.info('Total invalid identifiers = %d', count)
    # Sets are not serializable to JSON, so cast them to lists
    return {target_id: list(qids) for target_id, qids in invalid.items()}
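# Illustrative call of check_existence (sketch, values are placeholders):
# check Wikidata items with the musician occupation (Q639669) carrying the
# MusicBrainz artist ID property (P434) against the imported
# MusicbrainzArtistEntity table.
def _example_check_existence():
    return check_existence(
        'occupation', 'Q639669', 'P434', MusicbrainzArtistEntity)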
def gather_target_links(entity, catalog):
    LOGGER.info('Gathering %s %s links ...', catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    # Early return when the links table doesn't exist
    if link_entity is None:
        LOGGER.warning(
            'No links table available in the database for %s %s. '
            'Stopping validation here',
            catalog,
            entity,
        )
        return None

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        # Early return when no links
        if count == 0:
            LOGGER.warning(
                'No links available for %s %s. Stopping validation here',
                catalog,
                entity,
            )
            return None

        LOGGER.info('Got %d links from %s %s', count, catalog, entity)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None
    for row in result:
        yield row.catalog_id, row.url
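# Minimal consumption sketch for the gather_target_links generator
# (hypothetical caller, not from the original code base). Since the function
# contains `yield`, the early `return None` branches simply produce an empty
# generator, and the warnings are only emitted once iteration starts.
def _example_gather_links():
    for catalog_id, url in gather_target_links('musician', 'discogs'):
        print(catalog_id, url)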
def tokens_fulltext_search(target_entity: T, boolean_mode: bool,
                           tokens: Iterable[str]) -> Iterable[T]:
    if boolean_mode:
        query = ' '.join(map('+{0}'.format, tokens))
    else:
        query = ' '.join(tokens)
    ft_search = target_entity.tokens.match(query)

    session = DBManager.connect_to_db()
    result = []
    try:
        result = session.query(target_entity).filter(ft_search).all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        # Always release the connection, also on the happy path
        session.close()

    if not result:
        return []
    return result
def _gather_target_data(
    catalog,
    entity,
    total_queries,
    works_buckets,
    works_inverted,
    people_buckets,
    people_inverted,
):
    claim_pid = vocabulary.WORKS_BY_PEOPLE_MAPPING[catalog][entity]
    db_entity = target_database.get_relationship_entity(catalog, entity)
    session = DBManager().connect_to_db()

    # Leverage works-people relationships
    try:
        for works, people in tqdm(
                product(works_buckets, people_buckets), total=total_queries):
            works_to_people = session.query(db_entity).filter(
                and_(
                    db_entity.from_catalog_id.in_(works),
                    db_entity.to_catalog_id.in_(people),
                ))
            for result in works_to_people:
                yield (works_inverted[result.from_catalog_id],
                       claim_pid,
                       people_inverted[result.to_catalog_id],
                       result.to_catalog_id)
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of works-people relationships due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        return None
    finally:
        session.close()
def gather_target_biodata(entity, catalog):
    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    db_entity = target_database.get_main_entity(catalog, entity)
    # Base biodata
    query_fields = _build_biodata_query_fields(db_entity, entity, catalog)

    session = DBManager.connect_to_db()
    query = session.query(*query_fields).filter(
        or_(db_entity.born.isnot(None), db_entity.died.isnot(None)))
    result = None
    try:
        raw_result = _run_query(query, catalog, entity)
        if raw_result is None:
            return None
        result = _parse_target_biodata_query_result(raw_result)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    return result
def extract_and_populate(self, dump_file_path):
    dump_path = os.path.join(
        os.path.dirname(os.path.abspath(dump_file_path)),
        "%s_%s" % (os.path.basename(dump_file_path), 'extracted'))

    if not os.path.isdir(dump_path):
        with tarfile.open(dump_file_path, "r:bz2") as tar:
            tar.extractall(dump_path)

    tables = [MusicbrainzArtistEntity, MusicbrainzBandEntity]
    db_manager = DBManager()
    db_manager.drop(tables)
    db_manager.create(tables)

    artist_count = 0
    for artist in self._artist_generator(dump_path):
        artist_count = artist_count + 1
        session = db_manager.new_session()
        session.add(artist)
        session.commit()
    LOGGER.debug("Added %s artist records" % artist_count)

    db_manager.drop([MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity])
    db_manager.create([MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity])

    link_count = 0
    for link in self._link_generator(dump_path):
        link_count = link_count + 1
        session = db_manager.new_session()
        session.add(link)
        session.commit()
    LOGGER.debug("Added %s link records" % link_count)

    isni_link_count = 0
    for link in self._isni_link_generator(dump_path):
        isni_link_count = isni_link_count + 1
        session = db_manager.new_session()
        session.add(link)
        session.commit()
    LOGGER.debug("Added %s ISNI link records" % isni_link_count)

    db_manager.drop([MusicBrainzArtistBandRelationship])
    db_manager.create([MusicBrainzArtistBandRelationship])

    relationships_count = 0
    relationships_total = 0
    for relationship in self._artist_band_relationship_generator(dump_path):
        try:
            relationships_total = relationships_total + 1
            session = db_manager.new_session()
            session.add(relationship)
            session.commit()
            relationships_count = relationships_count + 1
        except IntegrityError as i:
            LOGGER.warning(str(i))

    LOGGER.debug("Added %s/%s relationships records" %
                 (relationships_count, relationships_total))
def add_matches(
    file_path: str,
    catalog_id: int,
    catalog: str,
    entity: str,
    confidence_range: Tuple[float, float],
) -> None:
    """Add or update matches to an existing catalog.
    Curated matches found in the catalog are kept as is.

    :param file_path: path to a file with matches
    :param catalog_id: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param confidence_range: a pair of floats indicating the minimum and
      maximum confidence scores of matches that will be added/updated.
    """
    success = True  # Flag to log that everything went fine

    class_qid, url_prefix = _handle_metadata(catalog, entity)
    matches = _handle_matches(file_path, confidence_range)

    LOGGER.info(
        "Starting import of %s %s matches (catalog ID: %d) "
        "into the mix'n'match DB ...",
        catalog,
        entity,
        catalog_id,
    )
    start = datetime.now()
    session = DBManager(MNM_DB).new_session()

    # Note that the session is kept open after this operation
    curated, success = _sync_matches(session, catalog_id, success)

    # Filter curated matches:
    # rows with tids that are NOT (~) in curated tids
    matches = matches[~matches[keys.TID].isin(curated)]

    n_matches = len(matches)
    matches_reader = matches.itertuples(index=False, name=None)
    batch = []

    try:
        _import_matches(
            batch,
            catalog,
            catalog_id,
            class_qid,
            entity,
            matches_reader,
            n_matches,
            session,
            url_prefix,
        )

        LOGGER.info(
            'Adding last batch of %d %s %s matches, this may take a while ...',
            len(batch),
            catalog,
            entity,
        )
        # Commit remaining entities
        session.bulk_save_objects(batch)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed addition/update due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.ingester DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        success = False
    finally:
        session.close()

    if success:
        end = datetime.now()
        LOGGER.info(
            'Import of %s %s matches (catalog ID: %d) completed in %s. '
            'Total matches: %d',
            catalog,
            entity,
            catalog_id,
            end - start,
            n_matches,
        )
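# Hedged example invocation of add_matches (sketch, the file path and catalog
# ID are placeholders, not real values): import Discogs musician matches
# scored between 0.8 and 1.0.
def _example_add_matches():
    add_matches(
        '/path/to/discogs_musician_matches.csv',  # hypothetical file path
        1234,                                     # hypothetical catalog ID
        'discogs',
        'musician',
        (0.8, 1.0),
    )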
def add_catalog(catalog: str, entity: str) -> int:
    """Add or update a catalog.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :return: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database
    """
    name_field = f'{catalog.title()} {entity}'
    session = DBManager(MNM_DB).new_session()
    try:
        existing = (session.query(
            mix_n_match.MnMCatalog).filter_by(name=name_field).first())
        if existing is None:
            LOGGER.info(
                "Adding %s %s catalog to the mix'n'match DB ... ",
                catalog,
                entity,
            )
            db_entity = mix_n_match.MnMCatalog()
            _set_catalog_fields(db_entity, name_field, catalog, entity)
            session.add(db_entity)
            session.commit()
            catalog_id = db_entity.id
        else:
            LOGGER.info('Updating %s %s catalog ... ', catalog, entity)
            catalog_id = existing.id
            _set_catalog_fields(existing, name_field, catalog, entity)
            session.add(existing)
            session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed catalog addition/update due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.ingester DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        return None
    finally:
        session.close()

    LOGGER.info(
        'Catalog addition/update went fine. Internal ID: %d', catalog_id)
    return catalog_id
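# Typical sequence (illustrative sketch): register the catalog first, then
# feed the returned ID to add_matches. The return value is the `id` field of
# the mix'n'match `catalog` table, or None when the insert/update fails.
# `matches_file` is a hypothetical path.
def _example_register_and_import(matches_file):
    catalog_id = add_catalog('discogs', 'musician')
    if catalog_id is not None:
        add_matches(matches_file, catalog_id, 'discogs', 'musician', (0.5, 1.0))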
def dead_ids(catalog: str, entity: str,
             wd_cache=None) -> Tuple[DefaultDict, Dict]:
    """Look for dead identifiers in Wikidata.
    An identifier is dead if it does not exist in the given catalog
    when this function is executed.

    Dead identifiers should be marked with a deprecated rank in Wikidata.

    **How it works:**

    1. gather identifiers of the given catalog from relevant Wikidata items
    2. look them up in the given catalog
    3. if an identifier is not in the given catalog anymore,
       it should be deprecated

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of identifiers gathered from
      Wikidata in a previous run
    :return: the ``dict`` pair of dead identifiers
      and identifiers gathered from Wikidata
    """
    dead = defaultdict(set)
    db_entity = target_database.get_main_entity(catalog, entity)

    # Wikidata side
    if wd_cache is None:
        wd_ids = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_ids,
        )
    else:
        wd_ids = wd_cache

    # Target catalog side
    session = DBManager.connect_to_db()
    try:
        for qid in wd_ids:
            for tid in wd_ids[qid][keys.TID]:
                existing = (session.query(
                    db_entity.catalog_id).filter_by(catalog_id=tid).count())
                if existing == 0:
                    LOGGER.debug(
                        '%s %s identifier %s is dead', qid, catalog, tid)
                    dead[tid].add(qid)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of target catalog identifiers due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
    finally:
        session.close()

    LOGGER.info(
        'Check completed. Target: %s %s. Total dead identifiers: %d',
        catalog,
        entity,
        len(dead),
    )
    return dead, wd_ids
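# Illustrative use of dead_ids (sketch, not original code): reuse the Wikidata
# cache returned by the first call to skip the SPARQL gathering step on a
# later re-check of the same catalog/entity pair.
def _example_dead_ids():
    dead, wd_cache = dead_ids('musicbrainz', 'musician')
    # Second call reuses the cached Wikidata identifiers
    dead_again, _ = dead_ids('musicbrainz', 'musician', wd_cache=wd_cache)
    return dead, dead_again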
def _process_masters_dump(self, dump_file_path):
    LOGGER.info("Starting import of masters from Discogs dump '%s'",
                dump_file_path)
    start = datetime.now()
    tables = [DiscogsMasterEntity, DiscogsMasterArtistRelationship]
    db_manager = DBManager()
    LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        'SQL tables dropped and re-created: %s',
        [table.__tablename__ for table in tables],
    )

    extracted_path = '.'.join(dump_file_path.split('.')[:-1])
    # Extract dump file if it has not yet been extracted
    if not os.path.exists(extracted_path):
        LOGGER.info('Extracting dump file')
        with gzip.open(dump_file_path, 'rb') as f_in:
            with open(extracted_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # count number of entries
    n_rows = sum(
        1 for _ in self._g_process_et_items(extracted_path, 'master'))
    session = db_manager.new_session()
    entity_array = []  # array to which we'll add the entities
    relationships_set = set()
    self.total_entities = 0

    for _, node in tqdm(self._g_process_et_items(extracted_path, 'master'),
                        total=n_rows):
        if not node.tag == 'master':
            continue

        self.total_entities += 1
        entity = self._extract_from_master_node(node, relationships_set)
        entity_array.append(entity)

        # commit in batches of `self._sqlalchemy_commit_every`
        if len(entity_array) >= self._sqlalchemy_commit_every:
            LOGGER.info('Adding batch of entities to the database, '
                        'this will take a while. '
                        'Progress will resume soon.')
            insert_start_time = datetime.now()
            session.bulk_save_objects(entity_array)
            session.commit()
            session.expunge_all()  # clear session
            entity_array.clear()  # clear entity array
            LOGGER.debug(
                'It took %s to add %s entities to the database',
                datetime.now() - insert_start_time,
                self._sqlalchemy_commit_every,
            )

    # finally commit remaining entities in session
    # (if any), and close session
    session.bulk_save_objects(entity_array)
    session.bulk_save_objects([
        DiscogsMasterArtistRelationship(id1, id2)
        for id1, id2 in relationships_set
    ])
    session.commit()
    session.close()

    end = datetime.now()
    LOGGER.info(
        'Import completed in %s. Total entities: %d. '
        'Total relationships %s.',
        end - start,
        self.total_entities,
        len(relationships_set),
    )

    # once the import process is complete,
    # we can safely delete the extracted discogs dump
    os.remove(extracted_path)
def _process_artists_dump(self, dump_file_path, resolve):
    LOGGER.info(
        "Starting import of musicians and bands from Discogs dump '%s'",
        dump_file_path,
    )
    start = datetime.now()
    tables = [
        DiscogsMusicianEntity,
        DiscogsMusicianNlpEntity,
        DiscogsMusicianLinkEntity,
        DiscogsGroupEntity,
        DiscogsGroupNlpEntity,
        DiscogsGroupLinkEntity,
    ]
    db_manager = DBManager()
    LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        'SQL tables dropped and re-created: %s',
        [table.__tablename__ for table in tables],
    )

    extracted_path = '.'.join(dump_file_path.split('.')[:-1])
    # Extract dump file if it has not yet been extracted
    if not os.path.exists(extracted_path):
        LOGGER.info('Extracting dump file')
        with gzip.open(dump_file_path, 'rb') as f_in:
            with open(extracted_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # count number of entries
    n_rows = sum(
        1 for _ in self._g_process_et_items(extracted_path, 'artist'))
    session = db_manager.new_session()
    entity_array = []  # array to which we'll add the entities

    for _, node in tqdm(self._g_process_et_items(extracted_path, 'artist'),
                        total=n_rows):
        if not node.tag == 'artist':
            continue

        infos = self._extract_from_artist_node(node, resolve)
        if infos is None:
            continue

        if 'groups' in infos:
            entity = DiscogsMusicianEntity()
            self._populate_musician(entity_array, entity, infos)
        # Band
        elif 'members' in infos:
            entity = DiscogsGroupEntity()
            self._populate_band(entity_array, entity, infos)

        # commit in batches of `self._sqlalchemy_commit_every`
        if len(entity_array) >= self._sqlalchemy_commit_every:
            LOGGER.info('Adding batch of entities to the database, '
                        'this will take a while. '
                        'Progress will resume soon.')
            insert_start_time = datetime.now()
            session.bulk_save_objects(entity_array)
            session.commit()
            session.expunge_all()  # clear session
            entity_array.clear()  # clear entity array
            LOGGER.debug(
                'It took %s to add %s entities to the database',
                datetime.now() - insert_start_time,
                self._sqlalchemy_commit_every,
            )

    # finally commit remaining entities in session
    # (if any), and close session
    session.bulk_save_objects(entity_array)
    session.commit()
    session.close()

    end = datetime.now()
    LOGGER.info(
        'Import completed in %s. '
        'Total entities: %d - %d musicians with %d links - %d bands'
        ' with %d links - %d discarded dead links.',
        end - start,
        self.total_entities,
        self.musicians,
        self.musician_links,
        self.bands,
        self.band_links,
        self.dead_links,
    )

    # once the import process is complete,
    # we can safely delete the extracted discogs dump
    os.remove(extracted_path)
def extract_and_populate(self, dump_file_paths: List[str],
                         resolve: bool) -> None:
    """
    Extracts the data in the dumps (person and movie) and processes them.
    It then proceeds to add the appropriate data to the database.

    See :ref:`soweego.importer.models.imdb_entity` module to see the
    SQLAlchemy definition of the entities we use to save IMDB data.

    :param dump_file_paths: the absolute paths of the already
        downloaded dump files.
    """
    # the order of these files is specified in `self.get_dump_download_urls`
    person_file_path = dump_file_paths[0]
    movies_file_path = dump_file_paths[1]

    LOGGER.debug('Path to movie info dump: %s', movies_file_path)
    LOGGER.debug('Path to person info dump: %s', person_file_path)

    start = datetime.datetime.now()

    tables = [
        imdb_entity.ImdbActorEntity,
        imdb_entity.ImdbDirectorEntity,
        imdb_entity.ImdbMovieEntity,
        imdb_entity.ImdbMusicianEntity,
        imdb_entity.ImdbProducerEntity,
        imdb_entity.ImdbWriterEntity,
        imdb_entity.ImdbMoviePersonRelationship,
    ]

    db_manager = DBManager()
    LOGGER.info('Connected to database: %s', db_manager.get_engine().url)

    db_manager.drop(tables)
    db_manager.create(tables)

    LOGGER.info(
        'SQL tables dropped and re-created: %s',
        [table.__tablename__ for table in tables],
    )

    LOGGER.info('Starting import of movies ...')

    # Here we open the movie dump file, and add everything to the DB
    for movie_info, entity_array in self._loop_through_entities(
            movies_file_path):

        # create the movie SQLAlchemy entity and populate it
        movie_entity = imdb_entity.ImdbMovieEntity()
        movie_entity.catalog_id = movie_info.get('tconst')
        movie_entity.title_type = movie_info.get('titleType')
        if movie_info.get('primaryTitle') is not None:
            movie_entity.name = movie_info.get('primaryTitle')
            movie_entity.name_tokens = ' '.join(
                text_utils.tokenize(movie_info.get('primaryTitle')))
        movie_entity.is_adult = (True if movie_info.get('isAdult') == '1'
                                 else False)
        try:
            movie_entity.born = datetime.date(
                year=int(movie_info.get('startYear')), month=1, day=1)
            movie_entity.born_precision = 9
        except (KeyError, TypeError):
            LOGGER.debug('No start year value for %s', movie_entity)
        try:
            movie_entity.died = datetime.date(
                year=int(movie_info.get('endYear')), month=1, day=1)
            movie_entity.died_precision = 9
        except (KeyError, TypeError):
            LOGGER.debug('No end year value for %s', movie_entity)
        movie_entity.runtime_minutes = movie_info.get('runtimeMinutes')

        if movie_info.get('genres'):  # if movie has a genre specified
            movie_entity.genres = ' '.join(
                text_utils.tokenize(movie_info.get('genres')))

        # Creates entity for alias
        alias = movie_info.get('originalTitle')
        if alias is not None and movie_entity.name != alias:
            alias_entity = copy.deepcopy(movie_entity)
            alias_entity.name = alias
            alias_entity.name_tokens = ' '.join(text_utils.tokenize(alias))
            entity_array.append(alias_entity)

        entity_array.append(movie_entity)

        self.n_movies += 1

    # mark end for movie import process
    end = datetime.datetime.now()
    LOGGER.info(
        'Movie import completed in %s. '
        'Total movies imported: %d',
        end - start,
        self.n_movies,
    )

    LOGGER.info('Starting import of people ...')

    # reset timer for persons import
    start = datetime.datetime.now()

    for person_info, entity_array in self._loop_through_entities(
            person_file_path):

        # IMDb saves the list of professions as a comma separated string
        professions = person_info.get('primaryProfession')

        # if person has no professions then ignore it
        if not professions:
            LOGGER.debug('Person %s has no professions',
                         person_info.get('nconst'))
            continue

        professions = professions.split(',')

        # each person can be added to multiple tables in the DB,
        # each table stands for one of the main professions
        types_of_entities = []

        if 'actor' in professions or 'actress' in professions:
            self.n_actors += 1
            types_of_entities.append(imdb_entity.ImdbActorEntity())

        if 'director' in professions:
            self.n_directors += 1
            types_of_entities.append(imdb_entity.ImdbDirectorEntity())

        if 'producer' in professions:
            self.n_producers += 1
            types_of_entities.append(imdb_entity.ImdbProducerEntity())

        if any(prof in [
                'sound_department',
                'composer',
                'music_department',
                'soundtrack',
        ] for prof in professions):
            self.n_musicians += 1
            types_of_entities.append(imdb_entity.ImdbMusicianEntity())

        if 'writer' in professions:
            self.n_writers += 1
            types_of_entities.append(imdb_entity.ImdbWriterEntity())

        # if the only profession a person has is `miscellaneous` then we
        # add it to all tables
        if professions == ['miscellaneous']:
            self.n_misc += 1
            types_of_entities = [
                imdb_entity.ImdbActorEntity(),
                imdb_entity.ImdbDirectorEntity(),
                imdb_entity.ImdbMusicianEntity(),
                imdb_entity.ImdbProducerEntity(),
                imdb_entity.ImdbWriterEntity(),
            ]

        # add person to every matching table
        for etype in types_of_entities:
            self._populate_person(etype, person_info, entity_array)

        # if person is known for any movies then add these to the
        # database as well
        if person_info.get('knownForTitles'):
            self.n_person_movie_links += 1
            self._populate_person_movie_relations(person_info, entity_array)

        self.n_persons += 1

    # mark the end time for the person import process
    end = datetime.datetime.now()
    LOGGER.info(
        'Person import completed in %s. '
        'Total people imported: %d - '
        'Actors: %d - Directors: %d - Musicians: %d - '
        'Producers: %d - Writers: %d - Misc: %d',
        end - start,
        self.n_persons,
        self.n_actors,
        self.n_directors,
        self.n_musicians,
        self.n_producers,
        self.n_writers,
        self.n_misc,
    )
def _loop_through_entities(
        self, file_path: str) -> Generator[Tuple[Dict, List], None, None]:
    """
    Generator that, given an IMDb dump file (which should be in
    ".tsv.gz" format), loops through every entry and yields it.

    :return: a generator which yields a Tuple[entity_info, entity_array].
        The consumer of this generator will take `entity_info`, create an
        SQLAlchemy entity, and append this to the `entity_array`
    """
    db_manager = DBManager()

    with gzip.open(file_path, 'rt') as ddump:
        session = db_manager.new_session()

        # count number of rows for TQDM, so we can display how
        # much is missing to complete the process. Then go back
        # to the start of the file with `.seek(0)`
        n_rows = sum(1 for line in ddump)
        ddump.seek(0)

        entity_array = []

        LOGGER.debug('Dump "%s" has %d entries', file_path, n_rows)

        reader = csv.DictReader(ddump, delimiter='\t')

        # for every entry in the file..
        for entity_info in tqdm(reader, total=n_rows):
            # clean the entry
            self._normalize_null(entity_info)

            # yield the cleaned dict
            yield entity_info, entity_array

            # every `_sqlalchemy_commit_every` loops we commit the
            # session to the DB. This is more efficient than committing
            # every loop, and is not so hard on the memory requirements
            # as would be adding everything to session and committing once
            # the for loop is done
            if len(entity_array) >= self._sqlalchemy_commit_every:
                LOGGER.info(
                    'Adding batch of entities to the database, '
                    'this will take a while. Progress will resume soon.')

                insert_start_time = datetime.datetime.now()

                session.bulk_save_objects(entity_array)
                session.commit()
                # Log the batch size before clearing the array,
                # otherwise it would always report 0
                LOGGER.debug(
                    'It took %s to add %s entities to the database',
                    datetime.datetime.now() - insert_start_time,
                    len(entity_array),
                )

                session.expunge_all()  # clear session
                entity_array.clear()  # clear entity array

        # commit remaining entities
        session.bulk_save_objects(entity_array)
        session.commit()

        # clear list reference since it might still be available in
        # the scope where this generator was used.
        entity_array.clear()
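# Consumption pattern for _loop_through_entities (illustrative sketch):
# the caller builds an ORM entity from each yielded dict and appends it to the
# shared batch list, which the generator flushes to the DB every
# `_sqlalchemy_commit_every` rows. `SomeImdbEntity` is a placeholder for one
# of the imdb_entity classes.
#
#     for info, batch in self._loop_through_entities(dump_path):
#         entity = SomeImdbEntity()
#         entity.catalog_id = info.get('nconst')
#         batch.append(entity)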
def extract_and_populate(self, dump_file_paths: List[str], resolve: bool):
    """Extract relevant data from the *artist* (people) and
    *release group* (works) MusicBrainz dumps, preprocess them, populate
    `SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist
    them to a database instance.

    See :mod:`~soweego.importer.models.musicbrainz_entity`
    for the ORM definitions.

    :param dump_file_paths: paths to downloaded catalog dumps
    :param resolve: whether to resolve URLs found in catalog dumps or not
    """
    dump_file_path = dump_file_paths[0]
    dump_path = os.path.join(
        os.path.dirname(os.path.abspath(dump_file_path)),
        f"{os.path.basename(dump_file_path)}_extracted",
    )

    if not os.path.isdir(dump_path):
        with tarfile.open(dump_file_path, "r:bz2") as tar:
            LOGGER.info("Extracting dump %s in %s", dump_file_path, dump_path)
            tar.extractall(dump_path)
            LOGGER.info("Extracted dump %s in %s", dump_file_path, dump_path)

    db_manager = DBManager()

    tables = [MusicBrainzReleaseGroupEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing release groups")
    release_groups_count = self._add_entities_from_generator(
        db_manager, self._release_group_generator, dump_path)
    LOGGER.debug("Added %s/%s release group records", *release_groups_count)

    def release_artist_relationships_uniqueness_filter():
        """Remove duplicates from
        _release_group_artist_relationship_generator
        """
        yield from [
            MusicBrainzReleaseGroupArtistRelationship(item[0], item[1])
            for item in set(
                self._release_group_artist_relationship_generator(dump_path))
        ]

    tables = [MusicBrainzReleaseGroupArtistRelationship]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing relationships release-artist/band")
    relationships_count = self._add_entities_from_generator(
        db_manager, release_artist_relationships_uniqueness_filter)
    LOGGER.debug("Added %s/%s relationships records", *relationships_count)

    tables = [MusicBrainzReleaseGroupLinkEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing release groups links")
    link_count = self._add_entities_from_generator(
        db_manager, self._release_group_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s release group link records", *link_count)

    tables = [MusicBrainzArtistEntity, MusicBrainzBandEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing artists and bands")
    artist_count = self._add_entities_from_generator(
        db_manager, self._artist_generator, dump_path)
    LOGGER.debug("Added %s/%s artist records", *artist_count)

    tables = [MusicBrainzArtistLinkEntity, MusicBrainzBandLinkEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing links")
    link_count = self._add_entities_from_generator(
        db_manager, self._artist_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s link records", *link_count)

    LOGGER.info("Importing ISNIs")
    isni_link_count = self._add_entities_from_generator(
        db_manager, self._isni_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count)

    tables = [MusicBrainzArtistBandRelationship]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing relationships artist-band")

    def artist_band_relationships_uniqueness_filter():
        yield from [
            MusicBrainzArtistBandRelationship(item[0], item[1])
            for item in set(
                self._artist_band_relationship_generator(dump_path))
        ]

    relationships_count = self._add_entities_from_generator(
        db_manager, artist_band_relationships_uniqueness_filter)
    LOGGER.debug("Added %s/%s relationships records", *relationships_count)

    shutil.rmtree(dump_path, ignore_errors=True)
def _get_credentials_from_file() -> Tuple[Union[str, None], Union[str, None]]:
    credentials = DBManager.get_credentials()
    return (
        credentials[keys.WIKIDATA_API_USER],
        credentials[keys.WIKIDATA_API_PASSWORD],
    )
def handle(file_path: str, mappings: dict, entity_type, link_type):
    """Generic .nt file dump handler for the dump import into the DB.

    Assumptions: each entity must be represented in a compact block
    of ''adjacent'' lines
    """
    db_manager = DBManager(
        get_path('soweego.importer.resources', 'db_credentials.json'))
    db_manager.drop(link_type)
    db_manager.drop(entity_type)
    session = db_manager.new_session()

    if not os.path.isfile(file_path):
        LOGGER.warning("file: %s not found", file_path)

    with open(file_path) as file:
        current_key = None
        current_entity = entity_type()
        for row in file.readlines():
            try:
                # split triples
                row_chunks = []
                split_regex = r'(?<=["\<])[^<>"]*(?=["\>])'
                for match in re.finditer(split_regex, row, re.I):
                    row_chunks.append(match.group(0))

                if len(row_chunks) != 3:
                    raise Exception(loc.MALFORMED_ROW)

                if row_chunks[1] in mappings:
                    current_field = mappings[row_chunks[1]]
                else:
                    raise Exception(loc.FIELD_NOT_MAPPED % row_chunks[1])

                if current_field == 'url':
                    link = link_type()
                    link.url = row_chunks[2]
                    link.tokens = row_chunks[2].replace("/", " ")  # TODO
                    link.catalog_id = current_entity.catalog_id
                    session.add(link)

                if current_key is not None and current_key != row_chunks[0]:
                    session.add(current_entity)
                    current_entity = entity_type()

                current_key = row_chunks[0]
                current_value = row_chunks[2]
                setattr(current_entity, current_field, current_value)
            except Exception as e:
                LOGGER.warning('Error at row %s \n %s', row, str(e))

        session.add(current_entity)

        try:
            db_manager.create(entity_type)
            db_manager.create(link_type)
            session.commit()
        except Exception as e:
            LOGGER.warning(loc.WRONG_MAPPINGS, str(e))
def check_urls_cli(catalog, drop, dir_io):
    """Check for rotten URLs of an imported catalog.

    For every catalog entity, dump rotten URLs to a file.
    CSV format: URL,catalog_ID

    Use '-d' to drop rotten URLs from the DB on the fly.
    """
    for entity in target_database.supported_entities_for_target(catalog):
        out_path = os.path.join(
            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity))

        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)
        link_entity = target_database.get_link_entity(catalog, entity)
        if not link_entity:
            LOGGER.info(
                '%s %s does not have a links table. Skipping ...',
                catalog,
                entity,
            )
            continue

        query_session = DBManager.connect_to_db()
        total = query_session.query(link_entity).count()

        rotten = 0
        if drop:
            removed = 0

        # Parallel operation
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            writer = csv.writer(fout)
            try:
                # Resolve every URL
                for resolved, result in tqdm(
                    pool.imap_unordered(
                        _resolve, query_session.query(link_entity)),
                    total=total,
                ):
                    if not resolved:
                        # Dump
                        writer.writerow((result.url, result.catalog_id))
                        rotten += 1

                        # Drop from DB
                        if drop:
                            delete_session = DBManager.connect_to_db()
                            delete_session.delete(result)
                            try:
                                delete_session.commit()
                                removed += 1
                            except SQLAlchemyError as error:
                                LOGGER.error(
                                    'Failed deletion of %s: %s',
                                    result,
                                    error.__class__.__name__,
                                )
                                LOGGER.debug(error)
                                delete_session.rollback()
                            finally:
                                delete_session.close()
            except SQLAlchemyError as error:
                LOGGER.error(
                    '%s while querying %s %s URLs',
                    error.__class__.__name__,
                    catalog,
                    entity,
                )
                LOGGER.debug(error)
                query_session.rollback()
            finally:
                query_session.close()

        LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
        LOGGER.info(
            "Total %s %s rotten URLs dumped to '%s': %d / %d",
            catalog,
            entity,
            out_path,
            rotten,
            total,
        )
        if drop:
            LOGGER.info(
                'Total %s %s rotten URLs dropped from the DB: %d / %d',
                catalog,
                entity,
                rotten,
                removed,
            )
def extract_and_populate(self, dump_file_paths: Iterable[str], resolve: bool):
    dump_file_path = dump_file_paths[0]
    dump_path = os.path.join(
        os.path.dirname(os.path.abspath(dump_file_path)),
        f"{os.path.basename(dump_file_path)}_extracted",
    )

    if not os.path.isdir(dump_path):
        with tarfile.open(dump_file_path, "r:bz2") as tar:
            LOGGER.info("Extracting dump %s in %s", dump_file_path, dump_path)
            tar.extractall(dump_path)
            LOGGER.info("Extracted dump %s in %s", dump_file_path, dump_path)

    db_manager = DBManager()

    tables = [MusicbrainzReleaseGroupEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing release groups")
    release_groups_count = self._add_entities_from_generator(
        db_manager, self._release_group_generator, dump_path)
    LOGGER.debug("Added %s/%s release group records", *release_groups_count)

    def release_artist_relationships_uniqueness_filter():
        """Remove duplicates from
        _release_group_artist_relationship_generator
        """
        yield from [
            MusicBrainzReleaseGroupArtistRelationship(item[0], item[1])
            for item in set(
                self._release_group_artist_relationship_generator(dump_path))
        ]

    tables = [MusicBrainzReleaseGroupArtistRelationship]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing relationships release-artist/band")
    relationships_count = self._add_entities_from_generator(
        db_manager, release_artist_relationships_uniqueness_filter)
    LOGGER.debug("Added %s/%s relationships records", *relationships_count)

    tables = [MusicbrainzReleaseGroupLinkEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing release groups links")
    link_count = self._add_entities_from_generator(
        db_manager, self._release_group_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s release group link records", *link_count)

    tables = [MusicbrainzArtistEntity, MusicbrainzBandEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing artists and bands")
    artist_count = self._add_entities_from_generator(
        db_manager, self._artist_generator, dump_path)
    LOGGER.debug("Added %s/%s artist records", *artist_count)

    tables = [MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing links")
    link_count = self._add_entities_from_generator(
        db_manager, self._artist_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s link records", *link_count)

    LOGGER.info("Importing ISNIs")
    isni_link_count = self._add_entities_from_generator(
        db_manager, self._isni_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count)

    tables = [MusicBrainzArtistBandRelationship]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing relationships artist-band")

    def artist_band_relationships_uniqueness_filter():
        yield from [
            MusicBrainzArtistBandRelationship(item[0], item[1])
            for item in set(
                self._artist_band_relationship_generator(dump_path))
        ]

    relationships_count = self._add_entities_from_generator(
        db_manager, artist_band_relationships_uniqueness_filter)
    LOGGER.debug("Added %s/%s relationships records", *relationships_count)

    shutil.rmtree(dump_path, ignore_errors=True)