Example #1
def gather_target_links(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info('Gathering %s %s links ...', catalog, entity_type)
    link_entity = catalog_entity['link_entity']

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        if count == 0:
            LOGGER.warning(
                "No links available for %s %s. Stopping validation here",
                catalog, entity_type)
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity_type)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None
    for row in result:
        yield row.catalog_id, row.url
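
A minimal consumption sketch of the generator above; 'discogs' and 'musician' are assumed to be a valid catalog/entity pair for the local database:

# Hypothetical consumer: build a {catalog_id: url} lookup from the yielded pairs
url_index = {
    catalog_id: url
    for catalog_id, url in gather_target_links('musician', 'discogs')
}
LOGGER.info('Collected %d links', len(url_index))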
Example #2
def gather_target_metadata(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    entity = catalog_entity['entity']
    # Base metadata
    query_fields = _build_metadata_query_fields(entity, entity_type, catalog)

    session = DBManager.connect_to_db()
    result = None
    try:
        result = _run_metadata_query(session, query_fields, entity, catalog,
                                     entity_type)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if not result:
        return None
    return _parse_target_metadata_query_result(result)
Example #3
def check_links_cli(catalog: str):
    """
    Check for rotten URLs of an imported catalog.

    :param catalog: one of the keys of constants.TARGET_CATALOGS
    """
    for entity_type in target_database.supported_entities_for_target(catalog):

        LOGGER.info("Validating %s %s links...", catalog, entity_type)
        entity = target_database.get_link_entity(catalog, entity_type)
        if not entity:
            LOGGER.info(
                "%s %s does not have a links table. Skipping...",
                catalog,
                entity_type,
            )
            continue

        session = DBManager.connect_to_db()
        total = session.query(entity).count()
        removed = 0

        with Pool() as pool:
            # Validate each link
            for resolved, res_entity in tqdm(
                    pool.imap_unordered(_resolve_url, session.query(entity)),
                    total=total,
            ):
                if not resolved:
                    session_delete = DBManager.connect_to_db()
                    # if not valid delete
                    session_delete.delete(res_entity)
                    try:
                        session_delete.commit()
                        removed += 1
                    except:
                        # Roll back the deletion session, not the query one
                        session_delete.rollback()
                        raise
                    finally:
                        session_delete.close()

        session.close()
        LOGGER.info("Removed %s/%s from %s %s", removed, total, catalog,
                    entity_type)
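
The _resolve_url worker passed to Pool.imap_unordered is not shown in this example. A hypothetical sketch of a compatible worker, assuming each row exposes a url attribute and that requests is an acceptable dependency (the real soweego implementation may differ):

import requests


def _resolve_url(link_row):
    # Hypothetical worker: returns the (resolved, row) pair consumed by the loop above
    try:
        response = requests.head(link_row.url, allow_redirects=True, timeout=10)
        return response.status_code < 400, link_row
    except requests.RequestException:
        return False, link_row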
Example #4
def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False
    )

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
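
A sketch of how the returned chunk generator might be consumed; pandas and a previously built identifiers set are assumed:

import pandas as pd

# `identifiers` is a previously gathered set of catalog IDs.
# Stream 1000-row chunks from the database and concatenate them;
# for very large catalogs, process each chunk instead of concatenating.
chunks = build_target('training', 'discogs', 'musician', identifiers)
dataset = pd.concat(chunks, ignore_index=True)
LOGGER.info('Built dataset with shape %s', dataset.shape)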
Example #5
def perfect_name_search(target_entity: T, to_search: str) -> Iterable[T]:
    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(
                target_entity.name == to_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
Example #6
def perfect_name_search(target_entity: constants.DB_ENTITY,
                        to_search: str) -> Iterable[constants.DB_ENTITY]:
    session = DBManager.connect_to_db()
    try:
        for r in (session.query(target_entity).filter(
                target_entity.name == to_search).all()):
            yield r

    except:
        session.rollback()
        raise
    finally:
        session.close()
Example #7
def name_fulltext_search(target_entity: T, query: str) -> Iterable[T]:
    ft_search = target_entity.name.match(query)

    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(ft_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
Example #8
def tokens_fulltext_search(
    target_entity: constants.DB_ENTITY,
    boolean_mode: bool,
    tokens: Iterable[str],
    where_clause=None,
    limit: int = 10,
) -> Iterable[constants.DB_ENTITY]:
    if issubclass(target_entity, models.base_entity.BaseEntity):
        column = target_entity.name_tokens
    elif issubclass(target_entity, models.base_link_entity.BaseLinkEntity):
        column = target_entity.url_tokens
    elif issubclass(target_entity, models.base_nlp_entity.BaseNlpEntity):
        column = target_entity.description_tokens
    else:
        LOGGER.critical('Bad target entity class: %s', target_entity)
        raise ValueError('Bad target entity class: %s' % target_entity)

    tokens = filter(None, tokens)
    terms = (' '.join(map('+{0}'.format, tokens))
             if boolean_mode else ' '.join(tokens))
    ft_search = column.match(terms)

    session = DBManager.connect_to_db()
    try:
        if where_clause is None:
            query = session.query(target_entity).filter(ft_search).limit(limit)
        else:
            query = (session.query(target_entity).filter(ft_search).filter(
                where_clause).limit(limit))

        count = query.count()
        if count == 0:
            LOGGER.debug(
                "No result from full-text index query to %s. Terms: '%s'",
                target_entity.__name__,
                terms,
            )
            session.commit()
        else:
            for row in query:
                yield row
            session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
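
The boolean_mode branch prefixes every token with '+', so the MySQL full-text MATCH requires all of them. A small standalone illustration of the resulting terms string:

tokens = ['john', 'lennon']

boolean_terms = ' '.join(map('+{0}'.format, tokens))
natural_terms = ' '.join(tokens)

assert boolean_terms == '+john +lennon'
assert natural_terms == 'john lennon'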
Example #9
def check_existence(class_or_occupation_query, class_qid, catalog_pid,
                    entity: BaseEntity):
    query_type = 'identifier', class_or_occupation_query
    session = DBManager.connect_to_db()
    invalid = defaultdict(set)
    count = 0

    for result in sparql_queries.run_identifier_or_links_query(
            query_type, class_qid, catalog_pid, 0):
        for qid, target_id in result.items():
            results = session.query(entity).filter(
                entity.catalog_id == target_id).all()
            if not results:
                LOGGER.warning('%s identifier %s is invalid', qid, target_id)
                invalid[target_id].add(qid)
                count += 1

    session.close()
    LOGGER.info('Total invalid identifiers = %d', count)
    # Sets are not serializable to JSON, so cast them to lists
    return {target_id: list(qids) for target_id, qids in invalid.items()}
Example #10
def gather_target_links(entity, catalog):
    LOGGER.info('Gathering %s %s links ...', catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    # Early return when the links table doesn't exist
    if link_entity is None:
        LOGGER.warning(
            'No links table available in the database for %s %s. '
            'Stopping validation here',
            catalog,
            entity,
        )
        return None

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        # Early return when no links
        if count == 0:
            LOGGER.warning(
                'No links available for %s %s. Stopping validation here',
                catalog,
                entity,
            )
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None
    for row in result:
        yield row.catalog_id, row.url
Example #11
def tokens_fulltext_search(target_entity: T, boolean_mode: bool,
                           tokens: Iterable[str]) -> Iterable[T]:
    query = None
    if boolean_mode:
        query = ' '.join(map('+{0}'.format, tokens))
    else:
        query = ' '.join(tokens)

    ft_search = target_entity.tokens.match(query)

    session = DBManager.connect_to_db()
    result = []
    try:
        result = session.query(target_entity).filter(ft_search).all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if not result:
        return []
    return result
Example #12
def _gather_target_data(
    catalog,
    entity,
    total_queries,
    works_buckets,
    works_inverted,
    people_buckets,
    people_inverted,
):
    claim_pid = vocabulary.WORKS_BY_PEOPLE_MAPPING[catalog][entity]
    db_entity = target_database.get_relationship_entity(catalog, entity)
    session = DBManager().connect_to_db()

    # Leverage works-people relationships
    try:
        for works, people in tqdm(product(works_buckets, people_buckets),
                                  total=total_queries):
            works_to_people = session.query(db_entity).filter(
                and_(
                    db_entity.from_catalog_id.in_(works),
                    db_entity.to_catalog_id.in_(people),
                ))

            for result in works_to_people:
                yield works_inverted[
                    result.from_catalog_id], claim_pid, people_inverted[
                        result.to_catalog_id], result.to_catalog_id
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of works-people relationships due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)

        session.rollback()
        return None
    finally:
        session.close()
Example #13
def gather_target_biodata(entity, catalog):
    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    db_entity = target_database.get_main_entity(catalog, entity)
    # Base biodata
    query_fields = _build_biodata_query_fields(db_entity, entity, catalog)

    session = DBManager.connect_to_db()
    query = session.query(*query_fields).filter(
        or_(db_entity.born.isnot(None), db_entity.died.isnot(None)))
    result = None
    try:
        raw_result = _run_query(query, catalog, entity)
        if raw_result is None:
            return None
        result = _parse_target_biodata_query_result(raw_result)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
    return result
Example #14
    def extract_and_populate(self, dump_file_path):
        dump_path = os.path.join(
            os.path.dirname(os.path.abspath(dump_file_path)),
            '%s_%s' % (os.path.basename(dump_file_path), 'extracted'),
        )

        if not os.path.isdir(dump_path):
            with tarfile.open(dump_file_path, "r:bz2") as tar:
                tar.extractall(dump_path)

        tables = [MusicbrainzArtistEntity,
                  MusicbrainzBandEntity]

        db_manager = DBManager()
        db_manager.drop(tables)
        db_manager.create(tables)

        artist_count = 0
        for artist in self._artist_generator(dump_path):
            artist_count = artist_count + 1
            session = db_manager.new_session()
            session.add(artist)
            session.commit()

        LOGGER.debug("Added %s artist records" % artist_count)

        db_manager.drop([MusicbrainzArtistLinkEntity,
                         MusicbrainzBandLinkEntity])
        db_manager.create([MusicbrainzArtistLinkEntity,
                           MusicbrainzBandLinkEntity])

        link_count = 0
        for link in self._link_generator(dump_path):
            link_count = link_count + 1
            session = db_manager.new_session()
            session.add(link)
            session.commit()

        LOGGER.debug("Added %s link records" % link_count)

        isni_link_count = 0
        for link in self._isni_link_generator(dump_path):
            isni_link_count = isni_link_count + 1
            session = db_manager.new_session()
            session.add(link)
            session.commit()

        LOGGER.debug("Added %s ISNI link records" % isni_link_count)

        db_manager.drop([MusicBrainzArtistBandRelationship])
        db_manager.create([MusicBrainzArtistBandRelationship])

        relationships_count = 0
        relationships_total = 0
        for relationship in self._artist_band_relationship_generator(dump_path):
            try:
                relationships_total = relationships_total + 1
                session = db_manager.new_session()
                session.add(relationship)
                session.commit()
                relationships_count = relationships_count + 1
            except IntegrityError as i:
                LOGGER.warning(str(i))

        LOGGER.debug("Added %s/%s relationships records" %
                     (relationships_count, relationships_total))
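
The importer above opens a session and commits once per record, which is much slower than batching. The Discogs and IMDb importers later in this collection buffer entities and flush them with bulk_save_objects; a minimal sketch of that pattern, assuming the same db_manager and any of the generators above:

        BATCH_SIZE = 10_000  # stand-in for self._sqlalchemy_commit_every

        session = db_manager.new_session()
        batch = []
        for entity in self._artist_generator(dump_path):
            batch.append(entity)
            if len(batch) >= BATCH_SIZE:
                session.bulk_save_objects(batch)  # one round trip per batch
                session.commit()
                batch.clear()

        # Persist the remainder and close the session
        session.bulk_save_objects(batch)
        session.commit()
        session.close()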
Example #15
def add_matches(
    file_path: str,
    catalog_id: int,
    catalog: str,
    entity: str,
    confidence_range: Tuple[float, float],
) -> None:
    """Add or update matches to an existing catalog.
    Curated matches found in the catalog are kept as is.

    :param file_path: path to a file with matches
    :param catalog_id: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param confidence_range: a pair of floats indicating
      the minimum and maximum confidence scores of matches
      that will be added/updated.
    """
    success = True  # Flag to log that everything went fine
    class_qid, url_prefix = _handle_metadata(catalog, entity)
    matches = _handle_matches(file_path, confidence_range)

    LOGGER.info(
        "Starting import of %s %s matches (catalog ID: %d) into the mix'n'match DB ...",
        catalog,
        entity,
        catalog_id,
    )

    start = datetime.now()
    session = DBManager(MNM_DB).new_session()

    # Note that the session is kept open after this operation
    curated, success = _sync_matches(session, catalog_id, success)

    # Filter curated matches:
    # rows with tids that are NOT (~) in curated tids
    matches = matches[~matches[keys.TID].isin(curated)]

    n_matches = len(matches)
    matches_reader = matches.itertuples(index=False, name=None)
    batch = []

    try:
        _import_matches(
            batch,
            catalog,
            catalog_id,
            class_qid,
            entity,
            matches_reader,
            n_matches,
            session,
            url_prefix,
        )

        LOGGER.info(
            'Adding last batch of %d %s %s matches, this may take a while ...',
            len(batch),
            catalog,
            entity,
        )
        # Commit remaining entities
        session.bulk_save_objects(batch)
        session.commit()

    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed addition/update due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.ingester DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        success = False

    finally:
        session.close()

    if success:
        end = datetime.now()
        LOGGER.info(
            'Import of %s %s matches (catalog ID: %d) completed in %s. '
            'Total matches: %d',
            catalog,
            entity,
            catalog_id,
            end - start,
            n_matches,
        )
Example #16
def add_catalog(catalog: str, entity: str) -> int:
    """Add or update a catalog.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :return: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database,
      or ``None`` if the addition/update failed
    """
    name_field = f'{catalog.title()} {entity}'

    session = DBManager(MNM_DB).new_session()
    try:
        existing = (session.query(
            mix_n_match.MnMCatalog).filter_by(name=name_field).first())
        if existing is None:
            LOGGER.info(
                "Adding %s %s catalog to the mix'n'match DB ... ",
                catalog,
                entity,
            )
            db_entity = mix_n_match.MnMCatalog()
            _set_catalog_fields(db_entity, name_field, catalog, entity)
            session.add(db_entity)
            session.commit()
            catalog_id = db_entity.id
        else:
            LOGGER.info('Updating %s %s catalog ... ', catalog, entity)
            catalog_id = existing.id
            _set_catalog_fields(existing, name_field, catalog, entity)
            session.add(existing)
            session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed catalog addition/update due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.ingester DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        return None
    finally:
        session.close()

    LOGGER.info('Catalog addition/update went fine. Internal ID: %d',
                catalog_id)
    return catalog_id
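
A usage sketch chaining the two ingester entry points from Examples #15 and #16: register (or update) the catalog first, then import matches under the returned ID. The file path and confidence range are placeholders:

catalog_id = add_catalog('discogs', 'musician')
if catalog_id is not None:
    add_matches(
        'discogs_musician_matches.csv',  # placeholder path to a matches file
        catalog_id,
        'discogs',
        'musician',
        (0.7, 1.0),  # only add/update matches scored in this confidence range
    )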
Example #17
def dead_ids(catalog: str,
             entity: str,
             wd_cache=None) -> Tuple[DefaultDict, Dict]:
    """Look for dead identifiers in Wikidata.
    An identifier is dead if it does not exist in the given catalog
    when this function is executed.

    Dead identifiers should be marked with a deprecated rank in Wikidata.

    **How it works:**

    1. gather identifiers of the given catalog from relevant Wikidata items
    2. look them up in the given catalog
    3. if an identifier is not in the given catalog anymore,
       it should be deprecated

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of identifiers gathered from Wikidata
      in a previous run
    :return: the ``dict`` pair of dead identifiers
      and identifiers gathered from Wikidata
    """
    dead = defaultdict(set)
    db_entity = target_database.get_main_entity(catalog, entity)

    # Wikidata side
    if wd_cache is None:
        wd_ids = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_ids,
        )
    else:
        wd_ids = wd_cache

    # Target catalog side
    session = DBManager.connect_to_db()

    try:
        for qid in wd_ids:
            for tid in wd_ids[qid][keys.TID]:
                existing = (session.query(
                    db_entity.catalog_id).filter_by(catalog_id=tid).count())
                if existing == 0:
                    LOGGER.debug('%s %s identifier %s is dead', qid, catalog,
                                 tid)
                    dead[tid].add(qid)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of target catalog identifiers due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
    finally:
        session.close()

    LOGGER.info(
        'Check completed. Target: %s %s. Total dead identifiers: %d',
        catalog,
        entity,
        len(dead),
    )
    return dead, wd_ids
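
A usage sketch of dead_ids, reusing the Wikidata cache returned by a first call so that a later run does not query Wikidata again:

# First run: gather identifiers from Wikidata and check them against the catalog
dead, wd_cache = dead_ids('discogs', 'musician')

# Later run (e.g., after a re-import): reuse the cached Wikidata identifiers
dead_again, _ = dead_ids('discogs', 'musician', wd_cache=wd_cache)

for tid, qids in dead.items():
    LOGGER.info('%s is dead, referenced by %s', tid, sorted(qids))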
Example #18
    def _process_masters_dump(self, dump_file_path):
        LOGGER.info("Starting import of masters from Discogs dump '%s'",
                    dump_file_path)
        start = datetime.now()
        tables = [DiscogsMasterEntity, DiscogsMasterArtistRelationship]
        db_manager = DBManager()
        LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            'SQL tables dropped and re-created: %s',
            [table.__tablename__ for table in tables],
        )
        extracted_path = '.'.join(dump_file_path.split('.')[:-1])
        # Extract dump file if it has not yet been extracted
        if not os.path.exists(extracted_path):
            LOGGER.info('Extracting dump file')

            with gzip.open(dump_file_path, 'rb') as f_in:
                with open(extracted_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

        # count number of entries
        n_rows = sum(
            1 for _ in self._g_process_et_items(extracted_path, 'master'))
        session = db_manager.new_session()
        entity_array = []  # array to which we'll add the entities
        relationships_set = set()
        self.total_entities = 0
        for _, node in tqdm(self._g_process_et_items(extracted_path, 'master'),
                            total=n_rows):

            if not node.tag == 'master':
                continue

            self.total_entities += 1
            entity = self._extract_from_master_node(node, relationships_set)
            entity_array.append(entity)
            # commit in batches of `self._sqlalchemy_commit_every`
            if len(entity_array) >= self._sqlalchemy_commit_every:
                LOGGER.info('Adding batch of entities to the database, '
                            'this will take a while. '
                            'Progress will resume soon.')

                insert_start_time = datetime.now()

                session.bulk_save_objects(entity_array)
                session.commit()
                session.expunge_all()  # clear session

                entity_array.clear()  # clear entity array

                LOGGER.debug(
                    'It took %s to add %s entities to the database',
                    datetime.now() - insert_start_time,
                    self._sqlalchemy_commit_every,
                )
        # finally commit remaining entities in session
        # (if any), and close session
        session.bulk_save_objects(entity_array)
        session.bulk_save_objects([
            DiscogsMasterArtistRelationship(id1, id2)
            for id1, id2 in relationships_set
        ])
        session.commit()
        session.close()

        end = datetime.now()
        LOGGER.info(
            'Import completed in %s. Total entities: %d. '
            'Total relationships %s.',
            end - start,
            self.total_entities,
            len(relationships_set),
        )
        # once the import process is complete,
        # we can safely delete the extracted discogs dump
        os.remove(extracted_path)
Example #19
    def _process_artists_dump(self, dump_file_path, resolve):
        LOGGER.info(
            "Starting import of musicians and bands from Discogs dump '%s'",
            dump_file_path,
        )
        start = datetime.now()
        tables = [
            DiscogsMusicianEntity,
            DiscogsMusicianNlpEntity,
            DiscogsMusicianLinkEntity,
            DiscogsGroupEntity,
            DiscogsGroupNlpEntity,
            DiscogsGroupLinkEntity,
        ]
        db_manager = DBManager()
        LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            'SQL tables dropped and re-created: %s',
            [table.__tablename__ for table in tables],
        )
        extracted_path = '.'.join(dump_file_path.split('.')[:-1])
        # Extract dump file if it has not yet been extracted
        if not os.path.exists(extracted_path):
            LOGGER.info('Extracting dump file')

            with gzip.open(dump_file_path, 'rb') as f_in:
                with open(extracted_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

        # count number of entries
        n_rows = sum(
            1 for _ in self._g_process_et_items(extracted_path, 'artist'))
        session = db_manager.new_session()
        entity_array = []  # array to which we'll add the entities
        for _, node in tqdm(self._g_process_et_items(extracted_path, 'artist'),
                            total=n_rows):

            if not node.tag == 'artist':
                continue

            infos = self._extract_from_artist_node(node, resolve)

            if infos is None:
                continue

            if 'groups' in infos:
                entity = DiscogsMusicianEntity()
                self._populate_musician(entity_array, entity, infos)
            # Band
            elif 'members' in infos:
                entity = DiscogsGroupEntity()
                self._populate_band(entity_array, entity, infos)

            # commit in batches of `self._sqlalchemy_commit_every`
            if len(entity_array) >= self._sqlalchemy_commit_every:
                LOGGER.info('Adding batch of entities to the database, '
                            'this will take a while. '
                            'Progress will resume soon.')

                insert_start_time = datetime.now()

                session.bulk_save_objects(entity_array)
                session.commit()
                session.expunge_all()  # clear session

                entity_array.clear()  # clear entity array

                LOGGER.debug(
                    'It took %s to add %s entities to the database',
                    datetime.now() - insert_start_time,
                    self._sqlalchemy_commit_every,
                )
        # finally commit remaining entities in session
        # (if any), and close session
        session.bulk_save_objects(entity_array)
        session.commit()
        session.close()
        end = datetime.now()
        LOGGER.info(
            'Import completed in %s. '
            'Total entities: %d - %d musicians with %d links - %d bands'
            ' with %d links - %d discarded dead links.',
            end - start,
            self.total_entities,
            self.musicians,
            self.musician_links,
            self.bands,
            self.band_links,
            self.dead_links,
        )
        # once the import process is complete,
        # we can safely delete the extracted discogs dump
        os.remove(extracted_path)
Example #20
    def extract_and_populate(self, dump_file_paths: List[str],
                             resolve: bool) -> None:
        """
        Extracts the data in the dumps (person and movie) and processes them.
        It then proceeds to add the appropriate data to the database.

        See
        :ref:`soweego.importer.models.imdb_entity` module to see the SQLAlchemy
        definition of the entities we use to save IMDB data.

        :param dump_file_paths: the absolute paths of the already downloaded
        dump files.
        """

        # the order of these files is specified in `self.get_dump_download_urls`
        person_file_path = dump_file_paths[0]
        movies_file_path = dump_file_paths[1]

        LOGGER.debug('Path to movie info dump: %s', movies_file_path)
        LOGGER.debug('Path to person info dump: %s', person_file_path)

        start = datetime.datetime.now()

        tables = [
            imdb_entity.ImdbActorEntity,
            imdb_entity.ImdbDirectorEntity,
            imdb_entity.ImdbMovieEntity,
            imdb_entity.ImdbMusicianEntity,
            imdb_entity.ImdbProducerEntity,
            imdb_entity.ImdbWriterEntity,
            imdb_entity.ImdbMoviePersonRelationship,
        ]

        db_manager = DBManager()
        LOGGER.info('Connected to database: %s', db_manager.get_engine().url)

        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            'SQL tables dropped and re-created: %s',
            [table.__tablename__ for table in tables],
        )

        LOGGER.info('Starting import of movies ...')

        # Here we open the movie dump file, and add everything to the DB
        for movie_info, entity_array in self._loop_through_entities(
                movies_file_path):

            # create the movie SQLAlchemy entity and populate it
            movie_entity = imdb_entity.ImdbMovieEntity()
            movie_entity.catalog_id = movie_info.get('tconst')
            movie_entity.title_type = movie_info.get('titleType')
            if movie_info.get('primaryTitle') is not None:
                movie_entity.name = movie_info.get('primaryTitle')
                movie_entity.name_tokens = ' '.join(
                    text_utils.tokenize(movie_info.get('primaryTitle')))
            movie_entity.is_adult = (True if movie_info.get('isAdult') == '1'
                                     else False)
            try:
                movie_entity.born = datetime.date(year=int(
                    movie_info.get('startYear')),
                                                  month=1,
                                                  day=1)
                movie_entity.born_precision = 9
            except (KeyError, TypeError):
                LOGGER.debug('No start year value for %s', movie_entity)
            try:
                movie_entity.died = datetime.date(year=int(
                    movie_info.get('endYear')),
                                                  month=1,
                                                  day=1)
                movie_entity.died_precision = 9
            except (KeyError, TypeError):
                LOGGER.debug('No end year value for %s', movie_entity)
            movie_entity.runtime_minutes = movie_info.get('runtimeMinutes')

            if movie_info.get('genres'):  # if movie has a genre specified
                movie_entity.genres = ' '.join(
                    text_utils.tokenize(movie_info.get('genres')))

            # Creates entity for alias
            alias = movie_info.get('originalTitle')
            if alias is not None and movie_entity.name != alias:
                alias_entity = copy.deepcopy(movie_entity)
                alias_entity.name = alias
                alias_entity.name_tokens = ' '.join(text_utils.tokenize(alias))
                entity_array.append(alias_entity)

            entity_array.append(movie_entity)

            self.n_movies += 1

        # mark end for movie import process
        end = datetime.datetime.now()
        LOGGER.info(
            'Movie import completed in %s. '
            'Total movies imported: %d',
            end - start,
            self.n_movies,
        )

        LOGGER.info('Starting import of people ...')

        # reset timer for persons import
        start = datetime.datetime.now()

        for person_info, entity_array in self._loop_through_entities(
                person_file_path):

            # IMDb saves the list of professions as a comma separated
            # string
            professions = person_info.get('primaryProfession')

            # if person has no professions then ignore it
            if not professions:
                LOGGER.debug('Person %s has no professions',
                             person_info.get('nconst'))
                continue

            professions = professions.split(',')

            # each person can be added to multiple tables in the DB,
            # each table stands for one of the main professions
            types_of_entities = []

            if 'actor' in professions or 'actress' in professions:
                self.n_actors += 1
                types_of_entities.append(imdb_entity.ImdbActorEntity())

            if 'director' in professions:
                self.n_directors += 1
                types_of_entities.append(imdb_entity.ImdbDirectorEntity())

            if 'producer' in professions:
                self.n_producers += 1
                types_of_entities.append(imdb_entity.ImdbProducerEntity())

            if any(prof in [
                    'sound_department',
                    'composer',
                    'music_department',
                    'soundtrack',
            ] for prof in professions):
                self.n_musicians += 1
                types_of_entities.append(imdb_entity.ImdbMusicianEntity())

            if 'writer' in professions:
                self.n_writers += 1
                types_of_entities.append(imdb_entity.ImdbWriterEntity())

            # if the only profession a person has is `miscellaneous` then we
            # add it to all tables
            if professions == ['miscellaneous']:
                self.n_misc += 1
                types_of_entities = [
                    imdb_entity.ImdbActorEntity(),
                    imdb_entity.ImdbDirectorEntity(),
                    imdb_entity.ImdbMusicianEntity(),
                    imdb_entity.ImdbProducerEntity(),
                    imdb_entity.ImdbWriterEntity(),
                ]

            # add person to every matching table
            for etype in types_of_entities:
                self._populate_person(etype, person_info, entity_array)

            # if person is known for any movies then add these to the
            # database as well
            if person_info.get('knownForTitles'):
                self.n_person_movie_links += 1
                self._populate_person_movie_relations(person_info,
                                                      entity_array)

            self.n_persons += 1

        # mark the end time for the person import process
        end = datetime.datetime.now()
        LOGGER.info(
            'Person import completed in %s. '
            'Total people imported: %d - '
            'Actors: %d - Directors: %d - Musicians: %d - '
            'Producers: %d - Writers: %d - Misc: %d',
            end - start,
            self.n_persons,
            self.n_actors,
            self.n_directors,
            self.n_musicians,
            self.n_producers,
            self.n_writers,
            self.n_misc,
        )
Example #21
    def _loop_through_entities(
            self, file_path: str) -> Generator[Tuple[Dict, List], None, None]:
        """
        Generator that given an IMDb dump file (which
        should be ".tsv.gz" format) it loops through every
        entry and yields it.

        :return: a generator which yields a Tuple[entity_info, entity_array]
        the consumer of this generator will take `entity_info`, create an
        SQLAlchemy entity, and append this to the `entity_array`
        """
        db_manager = DBManager()

        with gzip.open(file_path, 'rt') as ddump:
            session = db_manager.new_session()

            # count number of rows for TQDM, so we can display how
            # much is missing to complete the process. Then go back
            # to the start of the file with `.seek(0)`
            n_rows = sum(1 for line in ddump)
            ddump.seek(0)

            entity_array = []
            LOGGER.debug('Dump "%s" has %d entries', file_path, n_rows)

            reader = csv.DictReader(ddump, delimiter='\t')

            # for every entry in the file..
            for entity_info in tqdm(reader, total=n_rows):
                # clean the entry
                self._normalize_null(entity_info)

                # yield the cleaned dict
                yield entity_info, entity_array

                # every `_sqlalchemy_commit_every` loops we commit the
                # session to the DB. This is more efficient than committing
                # on every loop, and is not as hard on memory as adding
                # everything to the session and committing once the
                # loop is done
                if len(entity_array) >= self._sqlalchemy_commit_every:
                    LOGGER.info(
                        'Adding batch of entities to the database, '
                        'this will take a while. Progress will resume soon.')

                    insert_start_time = datetime.datetime.now()

                    session.bulk_save_objects(entity_array)
                    session.commit()
                    session.expunge_all()  # clear session

                    entity_array.clear()  # clear entity array

                    LOGGER.debug(
                        'It took %s to add %s entities to the database',
                        datetime.datetime.now() - insert_start_time,
                        self._sqlalchemy_commit_every,
                    )

            # commit remaining entities
            session.bulk_save_objects(entity_array)
            session.commit()

            # clear list reference since it might still be available in
            # the scope where this generator was used.
            entity_array.clear()
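
As the docstring says, the consumer of this generator is responsible for building the ORM entity and appending it to entity_array. A stripped-down sketch of that contract, assuming an importer instance and the ImdbMovieEntity model used in Example #20:

# `importer` and `movies_file_path` are assumed to come from the surrounding setup
for entity_info, entity_array in importer._loop_through_entities(movies_file_path):
    movie = imdb_entity.ImdbMovieEntity()
    movie.catalog_id = entity_info.get('tconst')
    movie.name = entity_info.get('primaryTitle')
    # Appending hands the entity back to the generator's batched commits
    entity_array.append(movie)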
Example #22
    def extract_and_populate(self, dump_file_paths: List[str], resolve: bool):
        """Extract relevant data from the *artist* (people) and *release group*
        (works) MusicBrainz dumps, preprocess them, populate
        `SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist
        them to a database instance.

        See :mod:`~soweego.importer.models.musicbrainz_entity`
        for the ORM definitions.

        :param dump_file_paths: paths to downloaded catalog dumps
        :param resolve: whether to resolve URLs found in catalog dumps or not
        """
        dump_file_path = dump_file_paths[0]
        dump_path = os.path.join(
            os.path.dirname(os.path.abspath(dump_file_path)),
            f"{os.path.basename(dump_file_path)}_extracted",
        )

        if not os.path.isdir(dump_path):
            with tarfile.open(dump_file_path, "r:bz2") as tar:
                LOGGER.info("Extracting dump %s in %s", dump_file_path,
                            dump_path)
                tar.extractall(dump_path)
                LOGGER.info("Extracted dump %s in %s", dump_file_path,
                            dump_path)

        db_manager = DBManager()

        tables = [MusicBrainzReleaseGroupEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups")

        release_groups_count = self._add_entities_from_generator(
            db_manager, self._release_group_generator, dump_path)

        LOGGER.debug("Added %s/%s release group records",
                     *release_groups_count)

        def release_artist_relationships_uniqueness_filter():
            """Remove duplicates from
            _release_group_artist_relationship_generator """
            yield from [
                MusicBrainzReleaseGroupArtistRelationship(item[0], item[1])
                for item in set(
                    self._release_group_artist_relationship_generator(
                        dump_path))
            ]

        tables = [MusicBrainzReleaseGroupArtistRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships release-artist/band")

        relationships_count = self._add_entities_from_generator(
            db_manager, release_artist_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)

        tables = [MusicBrainzReleaseGroupLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups links")

        link_count = self._add_entities_from_generator(
            db_manager, self._release_group_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s release group link records", *link_count)

        tables = [MusicBrainzArtistEntity, MusicBrainzBandEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing artists and bands")

        artist_count = self._add_entities_from_generator(
            db_manager, self._artist_generator, dump_path)

        LOGGER.debug("Added %s/%s artist records", *artist_count)

        tables = [MusicBrainzArtistLinkEntity, MusicBrainzBandLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing links")

        link_count = self._add_entities_from_generator(
            db_manager, self._artist_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s link records", *link_count)
        LOGGER.info("Importing ISNIs")

        isni_link_count = self._add_entities_from_generator(
            db_manager, self._isni_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count)

        tables = [MusicBrainzArtistBandRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships artist-band")

        def artist_band_relationships_uniqueness_filter():
            yield from [
                MusicBrainzArtistBandRelationship(item[0], item[1])
                for item in set(
                    self._artist_band_relationship_generator(dump_path))
            ]

        relationships_count = self._add_entities_from_generator(
            db_manager, artist_band_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)
        shutil.rmtree(dump_path, ignore_errors=True)
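
_add_entities_from_generator is not shown in these examples. Judging from the calls above, it takes the DBManager, a generator (plus optional arguments) and returns an (added, total) pair used in the 'Added %s/%s ...' log lines. A hypothetical sketch, not the actual soweego implementation:

from sqlalchemy.exc import IntegrityError


def _add_entities_from_generator(db_manager, generator, *args):
    # Hypothetical helper: persist every entity yielded by generator(*args)
    # and return (added, total) counts
    added, total = 0, 0
    session = db_manager.new_session()
    try:
        for entity in generator(*args):
            total += 1
            try:
                session.add(entity)
                session.commit()
                added += 1
            except IntegrityError as error:
                LOGGER.warning(str(error))
                session.rollback()
    finally:
        session.close()
    return added, total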
Example #23
def _get_credentials_from_file() -> Tuple[Union[str, None], Union[str, None]]:
    credentials = DBManager.get_credentials()
    return (
        credentials[keys.WIKIDATA_API_USER],
        credentials[keys.WIKIDATA_API_PASSWORD],
    )
Example #24
def handle(file_path: str, mappings: dict, entity_type, link_type):
    """Generic .nt file dump handler for the dump import into the DB.
    Assumptions: each entity must be represented in a compact block of ''adjacent'' lines
    """
    db_manager = DBManager(
        get_path('soweego.importer.resources', 'db_credentials.json'))
    db_manager.drop(link_type)
    db_manager.drop(entity_type)
    session = db_manager.new_session()

    if not os.path.isfile(file_path):
        LOGGER.warning("file: %s not found", file_path)
        return

    with open(file_path) as file:
        current_key = None
        current_entity = entity_type()

        for row in file.readlines():
            try:
                # split triples
                row_chunks = []
                split_regex = r'(?<=["\<])[^<>"]*(?=["\>])'
                for match in re.finditer(split_regex, row, re.I):
                    row_chunks.append(match.group(0))

                if len(row_chunks) != 3:
                    raise Exception(loc.MALFORMED_ROW)
                if row_chunks[1] in mappings:
                    current_field = mappings[row_chunks[1]]
                else:
                    raise Exception(loc.FIELD_NOT_MAPPED % row_chunks[1])

                if current_field == 'url':
                    link = link_type()
                    link.url = row_chunks[2]
                    link.tokens = row_chunks[2].replace("/", " ")  # TODO
                    link.catalog_id = current_entity.catalog_id
                    session.add(link)

                if current_key is not None and current_key != row_chunks[0]:
                    session.add(current_entity)
                    current_entity = entity_type()

                current_key = row_chunks[0]
                current_value = row_chunks[2]

                setattr(current_entity, current_field, current_value)

            except Exception as e:
                LOGGER.warning('Error at row %s \n %s', row, str(e))
        session.add(current_entity)
    try:
        db_manager.create(entity_type)
        db_manager.create(link_type)
        session.commit()
    except Exception as e:
        LOGGER.warning(loc.WRONG_MAPPINGS, str(e))
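
The split_regex above captures whatever sits between double quotes or angle brackets, cutting each N-Triples row into its subject, predicate and object. A standalone illustration on a sample row (the URIs are placeholders):

import re

split_regex = r'(?<=["\<])[^<>"]*(?=["\>])'
row = '<http://example.org/entity/123> <http://example.org/prop/name> "Some Name" .'

row_chunks = [match.group(0) for match in re.finditer(split_regex, row, re.I)]
assert row_chunks == [
    'http://example.org/entity/123',
    'http://example.org/prop/name',
    'Some Name',
]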
Example #25
def check_urls_cli(catalog, drop, dir_io):
    """Check for rotten URLs of an imported catalog.

    For every catalog entity, dump rotten URLs to a file.
    CSV format: URL,catalog_ID

    Use '-d' to drop rotten URLs from the DB on the fly.
    """
    for entity in target_database.supported_entities_for_target(catalog):
        out_path = os.path.join(
            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity)
        )

        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)
        link_entity = target_database.get_link_entity(catalog, entity)
        if not link_entity:
            LOGGER.info(
                '%s %s does not have a links table. Skipping ...',
                catalog,
                entity,
            )
            continue

        query_session = DBManager.connect_to_db()
        total = query_session.query(link_entity).count()

        rotten = 0
        if drop:
            removed = 0

        # Parallel operation
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            writer = csv.writer(fout)
            try:
                # Resolve every URL
                for resolved, result in tqdm(
                    pool.imap_unordered(_resolve, query_session.query(link_entity)),
                    total=total,
                ):
                    if not resolved:
                        # Dump
                        writer.writerow((result.url, result.catalog_id))
                        rotten += 1

                        # Drop from DB
                        if drop:
                            delete_session = DBManager.connect_to_db()
                            delete_session.delete(result)
                            try:
                                delete_session.commit()
                                removed += 1
                            except SQLAlchemyError as error:
                                LOGGER.error(
                                    'Failed deletion of %s: %s',
                                    result,
                                    error.__class__.__name__,
                                )
                                LOGGER.debug(error)
                                delete_session.rollback()
                            finally:
                                delete_session.close()
            except SQLAlchemyError as error:
                LOGGER.error(
                    '%s while querying %s %s URLs',
                    error.__class__.__name__,
                    catalog,
                    entity,
                )
                LOGGER.debug(error)
                query_session.rollback()
            finally:
                query_session.close()

        LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
        LOGGER.info(
            "Total %s %s rotten URLs dumped to '%s': %d / %d",
            catalog,
            entity,
            out_path,
            rotten,
            total,
        )

        if drop:
            LOGGER.info(
                'Total %s %s rotten URLs dropped from the DB: %d / %d',
                catalog,
                entity,
                removed,
                rotten,
            )
Example #26
    def extract_and_populate(self, dump_file_paths: Iterable[str],
                             resolve: bool):
        dump_file_path = dump_file_paths[0]
        dump_path = os.path.join(
            os.path.dirname(os.path.abspath(dump_file_path)),
            f"{os.path.basename(dump_file_path)}_extracted",
        )

        if not os.path.isdir(dump_path):
            with tarfile.open(dump_file_path, "r:bz2") as tar:
                LOGGER.info("Extracting dump %s in %s", dump_file_path,
                            dump_path)
                tar.extractall(dump_path)
                LOGGER.info("Extracted dump %s in %s", dump_file_path,
                            dump_path)

        db_manager = DBManager()

        tables = [MusicbrainzReleaseGroupEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups")

        release_groups_count = self._add_entities_from_generator(
            db_manager, self._release_group_generator, dump_path)

        LOGGER.debug("Added %s/%s release group records",
                     *release_groups_count)

        def release_artist_relationships_uniqueness_filter():
            """Remove duplicates from
            _release_group_artist_relationship_generator """
            yield from [
                MusicBrainzReleaseGroupArtistRelationship(item[0], item[1])
                for item in set(
                    self._release_group_artist_relationship_generator(
                        dump_path))
            ]

        tables = [MusicBrainzReleaseGroupArtistRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships release-artist/band")

        relationships_count = self._add_entities_from_generator(
            db_manager, release_artist_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)

        tables = [MusicbrainzReleaseGroupLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups links")

        link_count = self._add_entities_from_generator(
            db_manager, self._release_group_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s release group link records", *link_count)

        tables = [MusicbrainzArtistEntity, MusicbrainzBandEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing artists and bands")

        artist_count = self._add_entities_from_generator(
            db_manager, self._artist_generator, dump_path)

        LOGGER.debug("Added %s/%s artist records", *artist_count)

        tables = [MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing links")

        link_count = self._add_entities_from_generator(
            db_manager, self._artist_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s link records", *link_count)
        LOGGER.info("Importing ISNIs")

        isni_link_count = self._add_entities_from_generator(
            db_manager, self._isni_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count)

        tables = [MusicBrainzArtistBandRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships artist-band")

        def artist_band_relationships_uniqueness_filter():
            yield from [
                MusicBrainzArtistBandRelationship(item[0], item[1])
                for item in set(
                    self._artist_band_relationship_generator(dump_path))
            ]

        relationships_count = self._add_entities_from_generator(
            db_manager, artist_band_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)
        shutil.rmtree(dump_path, ignore_errors=True)