Ejemplo n.º 1
0
def add_catalog(catalog: str, entity: str) -> int:
    """Insert the catalog row for a catalog/entity pair, or refresh it.

    The row is looked up by name, which is the title-cased catalog
    followed by a space and the entity. If no row has that name, a new
    one is created; otherwise the fields of the existing row are set again.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :return: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database,
      or ``None`` when an SQLAlchemy error occurred
      (the transaction is rolled back in that case)
    """
    catalog_name = f'{catalog.title()} {entity}'
    session = DBManager(MNM_DB).new_session()

    try:
        by_name = session.query(mix_n_match.MnMCatalog).filter_by(
            name=catalog_name)
        row = by_name.first()

        if row is not None:
            # Known catalog: remember its ID, then overwrite its fields
            LOGGER.info('Updating %s %s catalog ... ', catalog, entity)
            catalog_id = row.id
            _set_catalog_fields(row, catalog_name, catalog, entity)
            session.add(row)
            session.commit()
        else:
            # Unknown catalog: the ID is only available after the commit
            LOGGER.info(
                "Adding %s %s catalog to the mix'n'match DB ... ",
                catalog,
                entity,
            )
            row = mix_n_match.MnMCatalog()
            _set_catalog_fields(row, catalog_name, catalog, entity)
            session.add(row)
            session.commit()
            catalog_id = row.id
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed catalog addition/update due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.ingester DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        return None
    finally:
        # Runs on success and failure alike
        session.close()

    LOGGER.info(
        'Catalog addition/update went fine. Internal ID: %d', catalog_id
    )
    return catalog_id
Ejemplo n.º 2
0
def _gather_target_data(
    catalog,
    entity,
    total_queries,
    works_buckets,
    works_inverted,
    people_buckets,
    people_inverted,
):
    """Generate works-people relationships stored in the target database.

    Each pairing of a works bucket with a people bucket triggers one
    query for relationship rows whose ``from_catalog_id`` is in the works
    bucket and whose ``to_catalog_id`` is in the people bucket.

    :param catalog: key into ``vocabulary.WORKS_BY_PEOPLE_MAPPING``,
      also used to pick the relationship entity
    :param entity: second-level key into the same mapping,
      also used to pick the relationship entity
    :param total_queries: total passed to the progress bar
    :param works_buckets: iterable of collections of work catalog IDs
    :param works_inverted: mapping looked up by ``from_catalog_id``
    :param people_buckets: iterable of collections of people catalog IDs
    :param people_inverted: mapping looked up by ``to_catalog_id``
    :return: a generator of 4-tuples
      ``(works_inverted[from_id], claim_pid, people_inverted[to_id], to_id)``.
      It stops early, after logging and rolling back,
      when an SQLAlchemy error occurs
    """
    claim_pid = vocabulary.WORKS_BY_PEOPLE_MAPPING[catalog][entity]
    db_entity = target_database.get_relationship_entity(catalog, entity)
    session = DBManager().connect_to_db()

    # Leverage works-people relationships
    try:
        bucket_pairs = product(works_buckets, people_buckets)
        for works, people in tqdm(bucket_pairs, total=total_queries):
            query = session.query(db_entity)
            from_works = db_entity.from_catalog_id.in_(works)
            to_people = db_entity.to_catalog_id.in_(people)
            relationships = query.filter(and_(from_works, to_people))

            for row in relationships:
                mapped_work = works_inverted[row.from_catalog_id]
                person_id = row.to_catalog_id
                mapped_person = people_inverted[person_id]
                yield (mapped_work, claim_pid, mapped_person, person_id)
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of works-people relationships due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)

        session.rollback()
        return
    finally:
        # Also reached when the consumer abandons the generator
        session.close()
Ejemplo n.º 3
0
def add_matches(
    file_path: str,
    catalog_id: int,
    catalog: str,
    entity: str,
    confidence_range: Tuple[float, float],
) -> None:
    """Add or update matches to an existing catalog.
    Curated matches found in the catalog are kept as is.

    SQLAlchemy errors raised while importing are logged and the
    transaction is rolled back; they are not propagated to the caller.
    The database session is always closed before returning,
    even when a step preceding the import raises.

    :param file_path: path to a file with matches
    :param catalog_id: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param confidence_range: a pair of floats indicating
      the minimum and maximum confidence scores of matches
      that will be added/updated.
    """
    success = True  # Flag to log that everything went fine
    class_qid, url_prefix = _handle_metadata(catalog, entity)
    matches = _handle_matches(file_path, confidence_range)

    LOGGER.info(
        "Starting import of %s %s matches (catalog ID: %d) into the mix'n'match DB ...",
        catalog,
        entity,
        catalog_id,
    )

    start = datetime.now()
    session = DBManager(MNM_DB).new_session()

    # Everything that uses the session lives under this try/finally, so the
    # session is released even if syncing or filtering raises before the
    # import starts. Such exceptions still propagate to the caller.
    try:
        # Note that the session is kept open after this operation
        curated, success = _sync_matches(session, catalog_id, success)

        # Filter curated matches:
        # rows with tids that are NOT (~) in curated tids
        matches = matches[~matches[keys.TID].isin(curated)]

        n_matches = len(matches)
        matches_reader = matches.itertuples(index=False, name=None)
        batch = []

        try:
            _import_matches(
                batch,
                catalog,
                catalog_id,
                class_qid,
                entity,
                matches_reader,
                n_matches,
                session,
                url_prefix,
            )

            LOGGER.info(
                'Adding last batch of %d %s %s matches, this may take a while ...',
                len(batch),
                catalog,
                entity,
            )
            # Commit remaining entities
            session.bulk_save_objects(batch)
            session.commit()

        except SQLAlchemyError as error:
            LOGGER.error(
                "Failed addition/update due to %s. "
                "You can enable the debug log with the CLI option "
                "'-l soweego.ingester DEBUG' for more details",
                error.__class__.__name__,
            )
            LOGGER.debug(error)
            session.rollback()
            success = False

    finally:
        session.close()

    if success:
        end = datetime.now()
        LOGGER.info(
            'Import of %s %s matches (catalog ID: %d) completed in %s. '
            'Total matches: %d',
            catalog,
            entity,
            catalog_id,
            end - start,
            n_matches,
        )