def handle(file_path: str, mappings: dict, entity_type, link_type):
    """Generic .nt file dump handler for the dump import into the DB.

    Assumption: each entity is represented in a compact block of *adjacent* lines.
    """
    db_manager = DBManager(
        get_path('soweego.importer.resources', 'db_credentials.json'))
    db_manager.drop(link_type)
    db_manager.drop(entity_type)
    session = db_manager.new_session()

    if not os.path.isfile(file_path):
        LOGGER.warning("file: %s not found", file_path)
        return

    with open(file_path) as file:
        current_key = None
        current_entity = entity_type()

        for row in file.readlines():
            try:
                # Split the triple into subject, predicate, object chunks
                row_chunks = []
                split_regex = r'(?<=["\<])[^<>"]*(?=["\>])'
                for match in re.finditer(split_regex, row, re.I):
                    row_chunks.append(match.group(0))

                if len(row_chunks) != 3:
                    raise Exception(loc.MALFORMED_ROW)

                if row_chunks[1] in mappings:
                    current_field = mappings[row_chunks[1]]
                else:
                    raise Exception(loc.FIELD_NOT_MAPPED % row_chunks[1])

                if current_field == 'url':
                    link = link_type()
                    link.url = row_chunks[2]
                    link.tokens = row_chunks[2].replace("/", " ")  # TODO
                    link.catalog_id = current_entity.catalog_id
                    session.add(link)

                # A new subject marks the end of the previous entity's block
                if current_key is not None and current_key != row_chunks[0]:
                    session.add(current_entity)
                    current_entity = entity_type()

                current_key = row_chunks[0]
                current_value = row_chunks[2]
                setattr(current_entity, current_field, current_value)
            except Exception as e:
                LOGGER.warning('Error at row %s \n %s', row, str(e))

        session.add(current_entity)

    try:
        db_manager.create(entity_type)
        db_manager.create(link_type)
        session.commit()
    except Exception as e:
        LOGGER.warning(loc.WRONG_MAPPINGS, str(e))
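# A quick, self-contained sanity check of the triple-splitting regex used in
# handle() above. The sample N-Triples row is made up for illustration; only
# the regex itself comes from the handler.
import re

SPLIT_REGEX = r'(?<=["\<])[^<>"]*(?=["\>])'
sample_row = ('<http://example.org/person/123> '
              '<http://schema.org/name> "Jane Doe" .')
chunks = [m.group(0) for m in re.finditer(SPLIT_REGEX, sample_row, re.I)]
print(chunks)
# Expected output: exactly three chunks, i.e.
# ['http://example.org/person/123', 'http://schema.org/name', 'Jane Doe']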
def extract_and_populate(self, dump_file_path):
    dump_path = os.path.join(
        os.path.dirname(os.path.abspath(dump_file_path)),
        "%s_%s" % (os.path.basename(dump_file_path), 'extracted'))

    if not os.path.isdir(dump_path):
        with tarfile.open(dump_file_path, "r:bz2") as tar:
            tar.extractall(dump_path)

    tables = [MusicbrainzArtistEntity, MusicbrainzBandEntity]
    db_manager = DBManager()
    db_manager.drop(tables)
    db_manager.create(tables)

    artist_count = 0
    for artist in self._artist_generator(dump_path):
        artist_count = artist_count + 1
        session = db_manager.new_session()
        session.add(artist)
        session.commit()

    LOGGER.debug("Added %s artist records" % artist_count)

    db_manager.drop([MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity])
    db_manager.create([MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity])

    link_count = 0
    for link in self._link_generator(dump_path):
        link_count = link_count + 1
        session = db_manager.new_session()
        session.add(link)
        session.commit()

    LOGGER.debug("Added %s link records" % link_count)

    isni_link_count = 0
    for link in self._isni_link_generator(dump_path):
        isni_link_count = isni_link_count + 1
        session = db_manager.new_session()
        session.add(link)
        session.commit()

    LOGGER.debug("Added %s ISNI link records" % isni_link_count)

    db_manager.drop([MusicBrainzArtistBandRelationship])
    db_manager.create([MusicBrainzArtistBandRelationship])

    relationships_count = 0
    relationships_total = 0
    for relationship in self._artist_band_relationship_generator(dump_path):
        try:
            relationships_total = relationships_total + 1
            session = db_manager.new_session()
            session.add(relationship)
            session.commit()
            relationships_count = relationships_count + 1
        except IntegrityError as i:
            LOGGER.warning(str(i))

    LOGGER.debug("Added %s/%s relationships records"
                 % (relationships_count, relationships_total))
def _process_masters_dump(self, dump_file_path):
    LOGGER.info("Starting import of masters from Discogs dump '%s'",
                dump_file_path)
    start = datetime.now()

    tables = [DiscogsMasterEntity, DiscogsMasterArtistRelationship]
    db_manager = DBManager()
    LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        'SQL tables dropped and re-created: %s',
        [table.__tablename__ for table in tables],
    )

    extracted_path = '.'.join(dump_file_path.split('.')[:-1])
    # Extract the dump file if it has not been extracted yet
    if not os.path.exists(extracted_path):
        LOGGER.info('Extracting dump file')
        with gzip.open(dump_file_path, 'rb') as f_in:
            with open(extracted_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # Count the number of entries
    n_rows = sum(
        1 for _ in self._g_process_et_items(extracted_path, 'master'))
    session = db_manager.new_session()
    entity_array = []  # array to which we'll add the entities
    relationships_set = set()
    self.total_entities = 0

    for _, node in tqdm(self._g_process_et_items(extracted_path, 'master'),
                        total=n_rows):
        if not node.tag == 'master':
            continue

        self.total_entities += 1
        entity = self._extract_from_master_node(node, relationships_set)
        entity_array.append(entity)

        # Commit in batches of `self._sqlalchemy_commit_every`
        if len(entity_array) >= self._sqlalchemy_commit_every:
            LOGGER.info('Adding batch of entities to the database, '
                        'this will take a while. '
                        'Progress will resume soon.')
            insert_start_time = datetime.now()
            session.bulk_save_objects(entity_array)
            session.commit()
            session.expunge_all()  # clear the session
            entity_array.clear()  # clear the entity array
            LOGGER.debug(
                'It took %s to add %s entities to the database',
                datetime.now() - insert_start_time,
                self._sqlalchemy_commit_every,
            )

    # Finally commit the remaining entities in the session
    # (if any), and close it
    session.bulk_save_objects(entity_array)
    session.bulk_save_objects([
        DiscogsMasterArtistRelationship(id1, id2)
        for id1, id2 in relationships_set
    ])
    session.commit()
    session.close()

    end = datetime.now()
    LOGGER.info(
        'Import completed in %s. Total entities: %d. '
        'Total relationships: %s.',
        end - start,
        self.total_entities,
        len(relationships_set),
    )

    # Once the import process is complete,
    # we can safely delete the extracted Discogs dump
    os.remove(extracted_path)
def _process_artists_dump(self, dump_file_path, resolve):
    LOGGER.info(
        "Starting import of musicians and bands from Discogs dump '%s'",
        dump_file_path,
    )
    start = datetime.now()

    tables = [
        DiscogsMusicianEntity,
        DiscogsMusicianNlpEntity,
        DiscogsMusicianLinkEntity,
        DiscogsGroupEntity,
        DiscogsGroupNlpEntity,
        DiscogsGroupLinkEntity,
    ]
    db_manager = DBManager()
    LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        'SQL tables dropped and re-created: %s',
        [table.__tablename__ for table in tables],
    )

    extracted_path = '.'.join(dump_file_path.split('.')[:-1])
    # Extract the dump file if it has not been extracted yet
    if not os.path.exists(extracted_path):
        LOGGER.info('Extracting dump file')
        with gzip.open(dump_file_path, 'rb') as f_in:
            with open(extracted_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # Count the number of entries
    n_rows = sum(
        1 for _ in self._g_process_et_items(extracted_path, 'artist'))
    session = db_manager.new_session()
    entity_array = []  # array to which we'll add the entities

    for _, node in tqdm(self._g_process_et_items(extracted_path, 'artist'),
                        total=n_rows):
        if not node.tag == 'artist':
            continue

        infos = self._extract_from_artist_node(node, resolve)
        if infos is None:
            continue

        # Musician
        if 'groups' in infos:
            entity = DiscogsMusicianEntity()
            self._populate_musician(entity_array, entity, infos)
        # Band
        elif 'members' in infos:
            entity = DiscogsGroupEntity()
            self._populate_band(entity_array, entity, infos)

        # Commit in batches of `self._sqlalchemy_commit_every`
        if len(entity_array) >= self._sqlalchemy_commit_every:
            LOGGER.info('Adding batch of entities to the database, '
                        'this will take a while. '
                        'Progress will resume soon.')
            insert_start_time = datetime.now()
            session.bulk_save_objects(entity_array)
            session.commit()
            session.expunge_all()  # clear the session
            entity_array.clear()  # clear the entity array
            LOGGER.debug(
                'It took %s to add %s entities to the database',
                datetime.now() - insert_start_time,
                self._sqlalchemy_commit_every,
            )

    # Finally commit the remaining entities in the session
    # (if any), and close it
    session.bulk_save_objects(entity_array)
    session.commit()
    session.close()

    end = datetime.now()
    LOGGER.info(
        'Import completed in %s. '
        'Total entities: %d - %d musicians with %d links - %d bands'
        ' with %d links - %d discarded dead links.',
        end - start,
        self.total_entities,
        self.musicians,
        self.musician_links,
        self.bands,
        self.band_links,
        self.dead_links,
    )

    # Once the import process is complete,
    # we can safely delete the extracted Discogs dump
    os.remove(extracted_path)
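# The two Discogs loops above iterate over `self._g_process_et_items(path, tag)`,
# a helper whose body is not shown here. A plausible minimal sketch, assuming it
# streams (event, element) pairs with xml.etree.ElementTree.iterparse and frees
# memory as it goes (in the project it is a method; `self` is dropped here for
# brevity), could look like this:
import xml.etree.ElementTree as ET


def _g_process_et_items(path, tag):
    """Yield (event, element) pairs for every closed element matching `tag`.

    Sketch only: the project's actual helper may differ.
    """
    for event, element in ET.iterparse(path, events=('end',)):
        if element.tag == tag:
            yield event, element
            # Drop the parsed subtree once the caller has consumed it,
            # so the whole XML dump never has to fit in memory.
            element.clear()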
def extract_and_populate(self, dump_file_paths: Iterable[str],
                         resolve: bool):
    dump_file_path = dump_file_paths[0]
    dump_path = os.path.join(
        os.path.dirname(os.path.abspath(dump_file_path)),
        f"{os.path.basename(dump_file_path)}_extracted",
    )

    if not os.path.isdir(dump_path):
        with tarfile.open(dump_file_path, "r:bz2") as tar:
            LOGGER.info("Extracting dump %s in %s", dump_file_path, dump_path)
            tar.extractall(dump_path)
            LOGGER.info("Extracted dump %s in %s", dump_file_path, dump_path)

    db_manager = DBManager()

    tables = [MusicbrainzReleaseGroupEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing release groups")
    release_groups_count = self._add_entities_from_generator(
        db_manager, self._release_group_generator, dump_path)
    LOGGER.debug("Added %s/%s release group records", *release_groups_count)

    def release_artist_relationships_uniqueness_filter():
        """Remove duplicates from
        _release_group_artist_relationship_generator
        """
        yield from [
            MusicBrainzReleaseGroupArtistRelationship(item[0], item[1])
            for item in set(
                self._release_group_artist_relationship_generator(
                    dump_path))
        ]

    tables = [MusicBrainzReleaseGroupArtistRelationship]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing relationships release-artist/band")
    relationships_count = self._add_entities_from_generator(
        db_manager, release_artist_relationships_uniqueness_filter)
    LOGGER.debug("Added %s/%s relationships records", *relationships_count)

    tables = [MusicbrainzReleaseGroupLinkEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing release groups links")
    link_count = self._add_entities_from_generator(
        db_manager, self._release_group_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s release group link records", *link_count)

    tables = [MusicbrainzArtistEntity, MusicbrainzBandEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing artists and bands")
    artist_count = self._add_entities_from_generator(
        db_manager, self._artist_generator, dump_path)
    LOGGER.debug("Added %s/%s artist records", *artist_count)

    tables = [MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing links")
    link_count = self._add_entities_from_generator(
        db_manager, self._artist_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s link records", *link_count)

    LOGGER.info("Importing ISNIs")
    isni_link_count = self._add_entities_from_generator(
        db_manager, self._isni_link_generator, dump_path, resolve)
    LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count)

    tables = [MusicBrainzArtistBandRelationship]
    db_manager.drop(tables)
    db_manager.create(tables)
    LOGGER.info(
        "Dropped and created tables %s",
        [table.__tablename__ for table in tables],
    )

    LOGGER.info("Importing relationships artist-band")

    def artist_band_relationships_uniqueness_filter():
        yield from [
            MusicBrainzArtistBandRelationship(item[0], item[1])
            for item in set(
                self._artist_band_relationship_generator(dump_path))
        ]

    relationships_count = self._add_entities_from_generator(
        db_manager, artist_band_relationships_uniqueness_filter)
    LOGGER.debug("Added %s/%s relationships records", *relationships_count)

    shutil.rmtree(dump_path, ignore_errors=True)
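# Every import step above delegates to `self._add_entities_from_generator`,
# whose body is not shown. Judging by the "Added %s/%s ..." log calls, it
# returns an (added, total) pair. A hedged sketch of what such a helper might
# look like (in the project it is a method; the real implementation may batch
# its commits differently):
from sqlalchemy.exc import IntegrityError


def _add_entities_from_generator(db_manager, entity_generator, *args):
    """Add every entity yielded by `entity_generator(*args)` to the database.

    Return an (added, total) tuple. Sketch only, not the project's code.
    """
    added, total = 0, 0
    session = db_manager.new_session()
    for entity in entity_generator(*args):
        total += 1
        try:
            session.add(entity)
            session.commit()
            added += 1
        except IntegrityError as error:
            # Duplicate keys are logged and skipped
            LOGGER.warning(str(error))
            session.rollback()
    session.close()
    return added, total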
def extract_and_populate(self, dump_file_paths: List[str], resolve: bool) -> None: """ Extracts the data in the dumps (person and movie) and processes them. It then proceeds to add the appropriate data to the database. See :ref:`soweego.importer.models.imdb_entity` module to see the SQLAlchemy definition of the entities we use to save IMDB data. :param dump_file_paths: the absolute paths of the already downloaded dump files. """ # the order of these files is specified in `self.get_dump_download_urls` person_file_path = dump_file_paths[0] movies_file_path = dump_file_paths[1] LOGGER.debug('Path to movie info dump: %s', movies_file_path) LOGGER.debug('Path to person info dump: %s', person_file_path) start = datetime.datetime.now() tables = [ imdb_entity.ImdbActorEntity, imdb_entity.ImdbDirectorEntity, imdb_entity.ImdbMovieEntity, imdb_entity.ImdbMusicianEntity, imdb_entity.ImdbProducerEntity, imdb_entity.ImdbWriterEntity, imdb_entity.ImdbMoviePersonRelationship, ] db_manager = DBManager() LOGGER.info('Connected to database: %s', db_manager.get_engine().url) db_manager.drop(tables) db_manager.create(tables) LOGGER.info( 'SQL tables dropped and re-created: %s', [table.__tablename__ for table in tables], ) LOGGER.info('Starting import of movies ...') # Here we open the movie dump file, and add everything to the DB for movie_info, entity_array in self._loop_through_entities( movies_file_path): # create the movie SQLAlchemy entity and populate it movie_entity = imdb_entity.ImdbMovieEntity() movie_entity.catalog_id = movie_info.get('tconst') movie_entity.title_type = movie_info.get('titleType') if movie_info.get('primaryTitle') is not None: movie_entity.name = movie_info.get('primaryTitle') movie_entity.name_tokens = ' '.join( text_utils.tokenize(movie_info.get('primaryTitle'))) movie_entity.is_adult = (True if movie_info.get('isAdult') == '1' else False) try: movie_entity.born = datetime.date(year=int( movie_info.get('startYear')), month=1, day=1) movie_entity.born_precision = 9 except (KeyError, TypeError): LOGGER.debug('No start year value for %s', movie_entity) try: movie_entity.died = datetime.date(year=int( movie_info.get('endYear')), month=1, day=1) movie_entity.died_precision = 9 except (KeyError, TypeError): LOGGER.debug('No end year value for %s', movie_entity) movie_entity.runtime_minutes = movie_info.get('runtimeMinutes') if movie_info.get('genres'): # if movie has a genre specified movie_entity.genres = ' '.join( text_utils.tokenize(movie_info.get('genres'))) # Creates entity for alias alias = movie_info.get('originalTitle') if alias is not None and movie_entity.name != alias: alias_entity = copy.deepcopy(movie_entity) alias_entity.name = alias alias_entity.name_tokens = ' '.join(text_utils.tokenize(alias)) entity_array.append(alias_entity) entity_array.append(movie_entity) self.n_movies += 1 # mark end for movie import process end = datetime.datetime.now() LOGGER.info( 'Movie import completed in %s. 
' 'Total movies imported: %d', end - start, self.n_movies, ) LOGGER.info('Starting import of people ...') # reset timer for persons import start = datetime.datetime.now() for person_info, entity_array in self._loop_through_entities( person_file_path): # IMDb saves the list of professions as a comma separated # string professions = person_info.get('primaryProfession') # if person has no professions then ignore it if not professions: LOGGER.debug('Person %s has no professions', person_info.get('nconst')) continue professions = professions.split(',') # each person can be added to multiple tables in the DB, # each table stands for one of the main professions types_of_entities = [] if 'actor' in professions or 'actress' in professions: self.n_actors += 1 types_of_entities.append(imdb_entity.ImdbActorEntity()) if 'director' in professions: self.n_directors += 1 types_of_entities.append(imdb_entity.ImdbDirectorEntity()) if 'producer' in professions: self.n_producers += 1 types_of_entities.append(imdb_entity.ImdbProducerEntity()) if any(prof in [ 'sound_department', 'composer', 'music_department', 'soundtrack', ] for prof in professions): self.n_musicians += 1 types_of_entities.append(imdb_entity.ImdbMusicianEntity()) if 'writer' in professions: self.n_writers += 1 types_of_entities.append(imdb_entity.ImdbWriterEntity()) # if the only profession a person has is `miscellaneous` then we # add it to all tables if professions == ['miscellaneous']: self.n_misc += 1 types_of_entities = [ imdb_entity.ImdbActorEntity(), imdb_entity.ImdbDirectorEntity(), imdb_entity.ImdbMusicianEntity(), imdb_entity.ImdbProducerEntity(), imdb_entity.ImdbWriterEntity(), ] # add person to every matching table for etype in types_of_entities: self._populate_person(etype, person_info, entity_array) # if person is known for any movies then add these to the # database as well if person_info.get('knownForTitles'): self.n_person_movie_links += 1 self._populate_person_movie_relations(person_info, entity_array) self.n_persons += 1 # mark the end time for the person import process end = datetime.datetime.now() LOGGER.info( 'Person import completed in %s. ' 'Total people imported: %d - ' 'Actors: %d - Directors: %d - Musicians: %d - ' 'Producers: %d - Writers: %d - Misc: %d', end - start, self.n_persons, self.n_actors, self.n_directors, self.n_musicians, self.n_producers, self.n_writers, self.n_misc, )
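# Both loops above consume `self._loop_through_entities`, which is not shown.
# IMDb publishes its dumps as gzipped, tab-separated files that use the literal
# string '\N' for missing values. A minimal sketch of the parsing side of such
# a helper (the real one also manages the entity batches and periodic database
# commits; `iter_imdb_rows` is an illustrative name, not a project helper):
import csv
import gzip


def iter_imdb_rows(dump_path):
    """Stream one dict per row of an IMDb *.tsv.gz dump, mapping '\\N' to None.

    Sketch only, under the assumptions stated above.
    """
    with gzip.open(dump_path, 'rt', encoding='utf-8') as dump_file:
        # IMDb fields may contain stray quote characters, so disable quoting
        reader = csv.DictReader(dump_file, delimiter='\t',
                                quoting=csv.QUOTE_NONE)
        for row in reader:
            yield {key: (None if value == r'\N' else value)
                   for key, value in row.items()}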
def extract_and_populate(self, dump_file_paths: List[str], resolve: bool): """Extract relevant data from the *artist* (people) and *release group* (works) MusicBrainz dumps, preprocess them, populate `SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist them to a database instance. See :mod:`~soweego.importer.models.musicbrainz_entity` for the ORM definitions. :param dump_file_paths: paths to downloaded catalog dumps :param resolve: whether to resolve URLs found in catalog dumps or not """ dump_file_path = dump_file_paths[0] dump_path = os.path.join( os.path.dirname(os.path.abspath(dump_file_path)), f"{os.path.basename(dump_file_path)}_extracted", ) if not os.path.isdir(dump_path): with tarfile.open(dump_file_path, "r:bz2") as tar: LOGGER.info("Extracting dump %s in %s", dump_file_path, dump_path) tar.extractall(dump_path) LOGGER.info("Extracted dump %s in %s", dump_file_path, dump_path) db_manager = DBManager() tables = [MusicBrainzReleaseGroupEntity] db_manager.drop(tables) db_manager.create(tables) LOGGER.info( "Dropped and created tables %s", [table.__tablename__ for table in tables], ) LOGGER.info("Importing release groups") release_groups_count = self._add_entities_from_generator( db_manager, self._release_group_generator, dump_path) LOGGER.debug("Added %s/%s release group records", *release_groups_count) def release_artist_relationships_uniqueness_filter(): """Remove duplicates from _release_group_artist_relationship_generator """ yield from [ MusicBrainzReleaseGroupArtistRelationship(item[0], item[1]) for item in set( self._release_group_artist_relationship_generator( dump_path)) ] tables = [MusicBrainzReleaseGroupArtistRelationship] db_manager.drop(tables) db_manager.create(tables) LOGGER.info( "Dropped and created tables %s", [table.__tablename__ for table in tables], ) LOGGER.info("Importing relationships release-artist/band") relationships_count = self._add_entities_from_generator( db_manager, release_artist_relationships_uniqueness_filter) LOGGER.debug("Added %s/%s relationships records", *relationships_count) tables = [MusicBrainzReleaseGroupLinkEntity] db_manager.drop(tables) db_manager.create(tables) LOGGER.info( "Dropped and created tables %s", [table.__tablename__ for table in tables], ) LOGGER.info("Importing release groups links") link_count = self._add_entities_from_generator( db_manager, self._release_group_link_generator, dump_path, resolve) LOGGER.debug("Added %s/%s release group link records", *link_count) tables = [MusicBrainzArtistEntity, MusicBrainzBandEntity] db_manager.drop(tables) db_manager.create(tables) LOGGER.info( "Dropped and created tables %s", [table.__tablename__ for table in tables], ) LOGGER.info("Importing artists and bands") artist_count = self._add_entities_from_generator( db_manager, self._artist_generator, dump_path) LOGGER.debug("Added %s/%s artist records", *artist_count) tables = [MusicBrainzArtistLinkEntity, MusicBrainzBandLinkEntity] db_manager.drop(tables) db_manager.create(tables) LOGGER.info( "Dropped and created tables %s", [table.__tablename__ for table in tables], ) LOGGER.info("Importing links") link_count = self._add_entities_from_generator( db_manager, self._artist_link_generator, dump_path, resolve) LOGGER.debug("Added %s/%s link records", *link_count) LOGGER.info("Importing ISNIs") isni_link_count = self._add_entities_from_generator( db_manager, self._isni_link_generator, dump_path, resolve) LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count) tables = [MusicBrainzArtistBandRelationship] 
db_manager.drop(tables) db_manager.create(tables) LOGGER.info( "Dropped and created tables %s", [table.__tablename__ for table in tables], ) LOGGER.info("Importing relationships artist-band") def artist_band_relationships_uniqueness_filter(): yield from [ MusicBrainzArtistBandRelationship(item[0], item[1]) for item in set( self._artist_band_relationship_generator(dump_path)) ] relationships_count = self._add_entities_from_generator( db_manager, artist_band_relationships_uniqueness_filter) LOGGER.debug("Added %s/%s relationships records", *relationships_count) shutil.rmtree(dump_path, ignore_errors=True)
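# Hypothetical driver snippet showing how the importer above might be invoked.
# The MusicBrainzDumpExtractor class name and the dump path are assumptions
# made for illustration; only extract_and_populate's signature comes from the
# code above.
extractor = MusicBrainzDumpExtractor()
extractor.extract_and_populate(
    ['/path/to/musicbrainz_dump.tar.bz2'],  # list of downloaded dump files
    resolve=False,  # skip resolving URLs found in the dump
)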