Example #1
def gather_target_metadata(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    entity = catalog_entity['entity']
    # Base metadata
    query_fields = _build_metadata_query_fields(entity, entity_type, catalog)

    session = DBManager.connect_to_db()
    result = None
    try:
        result = _run_metadata_query(session, query_fields, entity, catalog,
                                     entity_type)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if not result:
        return None
    return _parse_target_metadata_query_result(result)
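
Every example in this listing repeats the same commit-on-success, rollback-on-error, close-always dance around DBManager.connect_to_db(). A minimal sketch of how that boilerplate could be factored out, assuming only the DBManager API shown above (the helper name db_session is hypothetical, not part of the project):

from contextlib import contextmanager

@contextmanager
def db_session():
    # Yield a session, committing on success and rolling back on error
    session = DBManager.connect_to_db()
    try:
        yield session
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

With such a helper, the try/except/finally block above would collapse into a single "with db_session() as session:" statement.
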
Example #2
def gather_target_links(entity_type, catalog):
    catalog_constants = _get_catalog_constants(catalog)
    catalog_entity = _get_catalog_entity(entity_type, catalog_constants)

    LOGGER.info('Gathering %s %s links ...', catalog, entity_type)
    link_entity = catalog_entity['link_entity']

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        if count == 0:
            LOGGER.warning(
                "No links available for %s %s. Stopping validation here",
                catalog, entity_type)
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity_type)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None
    for row in result:
        yield row.catalog_id, row.url
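
Note that gather_target_links is a generator function: even when it hits an early "return None", the caller receives an ordinary generator that simply yields nothing; it never receives None itself. A hypothetical caller (the entity type and catalog values are illustrative):

for catalog_id, url in gather_target_links('band', 'discogs'):
    LOGGER.debug('%s -> %s', catalog_id, url)
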
Example #3
def check_links_cli(catalog: str):
    """
    Check for rotten URLs of an imported catalog.

    :param catalog: one of the keys of constants.TARGET_CATALOGS
    """
    for entity_type in target_database.supported_entities_for_target(catalog):

        LOGGER.info("Validating %s %s links...", catalog, entity_type)
        entity = target_database.get_link_entity(catalog, entity_type)
        if not entity:
            LOGGER.info(
                "%s %s does not have a links table. Skipping...",
                catalog,
                entity_type,
            )
            continue

        session = DBManager.connect_to_db()
        total = session.query(entity).count()
        removed = 0

        with Pool() as pool:
            # Validate each link
            for resolved, res_entity in tqdm(
                    pool.imap_unordered(_resolve_url, session.query(entity)),
                    total=total,
            ):
                if not resolved:
                    session_delete = DBManager.connect_to_db()
                    # Drop the link whose URL did not resolve
                    session_delete.delete(res_entity)
                    try:
                        session_delete.commit()
                        removed += 1
                    except:
                        session_delete.rollback()
                        raise
                    finally:
                        session_delete.close()

        session.close()
        LOGGER.info("Removed %s/%s from %s %s", removed, total, catalog,
                    entity_type)
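
Each rotten link above is deleted through its own short-lived session, so the long-running query session stays read-only. The _resolve_url worker itself is not shown in this excerpt; a minimal sketch of a compatible implementation, assuming it must return a (resolved, row) pair as consumed by the loop (the use of requests is an assumption, not necessarily what the project does):

import requests

def _resolve_url(link_row):
    # Return (resolved, row) so the caller can drop rows whose URL is dead
    try:
        response = requests.head(link_row.url, allow_redirects=True, timeout=10)
        return response.status_code < 400, link_row
    except requests.RequestException:
        return False, link_row
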
Example #4
def perfect_name_search(target_entity: T, to_search: str) -> Iterable[T]:
    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(
                target_entity.name == to_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
Example #5
def perfect_name_search(target_entity: constants.DB_ENTITY,
                        to_search: str) -> Iterable[constants.DB_ENTITY]:
    session = DBManager.connect_to_db()
    try:
        for r in (session.query(target_entity).filter(
                target_entity.name == to_search).all()):
            yield r

    except:
        session.rollback()
        raise
    finally:
        session.close()
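
Both variants of perfect_name_search are generators, so the "finally: session.close()" only runs once the generator is exhausted or garbage-collected. A hypothetical caller should therefore consume the results promptly, for instance by materializing them (MusicianEntity is an illustrative entity class):

matches = list(perfect_name_search(MusicianEntity, 'John Doe'))
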
Example #6
def name_fulltext_search(target_entity: T, query: str) -> Iterable[T]:
    ft_search = target_entity.name.match(query)

    session = DBManager.connect_to_db()
    try:
        for r in session.query(target_entity).filter(ft_search).all():
            yield r
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
Example #7
def tokens_fulltext_search(
    target_entity: constants.DB_ENTITY,
    boolean_mode: bool,
    tokens: Iterable[str],
    where_clause=None,
    limit: int = 10,
) -> Iterable[constants.DB_ENTITY]:
    if issubclass(target_entity, models.base_entity.BaseEntity):
        column = target_entity.name_tokens
    elif issubclass(target_entity, models.base_link_entity.BaseLinkEntity):
        column = target_entity.url_tokens
    elif issubclass(target_entity, models.base_nlp_entity.BaseNlpEntity):
        column = target_entity.description_tokens
    else:
        LOGGER.critical('Bad target entity class: %s', target_entity)
        raise ValueError('Bad target entity class: %s' % target_entity)

    tokens = filter(None, tokens)
    terms = (' '.join(map('+{0}'.format, tokens))
             if boolean_mode else ' '.join(tokens))
    ft_search = column.match(terms)

    session = DBManager.connect_to_db()
    try:
        if where_clause is None:
            query = session.query(target_entity).filter(ft_search).limit(limit)
        else:
            query = (session.query(target_entity).filter(ft_search).filter(
                where_clause).limit(limit))

        count = query.count()
        if count == 0:
            LOGGER.debug(
                "No result from full-text index query to %s. Terms: '%s'",
                target_entity.__name__,
                terms,
            )
            session.commit()
        else:
            for row in query:
                yield row
            session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
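
The '+{0}'.format mapping prepends the MySQL boolean-mode "must contain" operator to each token, after filter(None, tokens) has dropped empty ones. A worked example of the resulting search terms:

tokens = ['john', '', 'doe']
print(' '.join(map('+{0}'.format, filter(None, tokens))))  # prints: +john +doe
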
Example #8
def check_existence(class_or_occupation_query, class_qid, catalog_pid,
                    entity: BaseEntity):
    query_type = 'identifier', class_or_occupation_query
    session = DBManager.connect_to_db()
    invalid = defaultdict(set)
    count = 0

    for result in sparql_queries.run_identifier_or_links_query(
            query_type, class_qid, catalog_pid, 0):
        for qid, target_id in result.items():
            results = session.query(entity).filter(
                entity.catalog_id == target_id).all()
            if not results:
                LOGGER.warning('%s identifier %s is invalid', qid, target_id)
                invalid[target_id].add(qid)
                count += 1

    # Release the connection before returning
    session.close()
    LOGGER.info('Total invalid identifiers = %d', count)
    # Sets are not serializable to JSON, so cast them to lists
    return {target_id: list(qids) for target_id, qids in invalid.items()}
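
The final cast is needed because json.dumps rejects sets. A quick demonstration of the failure mode and of the fix used above:

import json
from collections import defaultdict

invalid = defaultdict(set)
invalid['123'].add('Q42')
# json.dumps(invalid) raises TypeError: Object of type set is not JSON serializable
print(json.dumps({target_id: list(qids) for target_id, qids in invalid.items()}))
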
Example #9
def gather_target_links(entity, catalog):
    LOGGER.info('Gathering %s %s links ...', catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    # Early return when the links table doesn't exist
    if link_entity is None:
        LOGGER.warning(
            'No links table available in the database for %s %s. '
            'Stopping validation here',
            catalog,
            entity,
        )
        return None

    session = DBManager.connect_to_db()
    result = None
    try:
        query = session.query(link_entity.catalog_id, link_entity.url)
        count = query.count()
        # Early return when no links
        if count == 0:
            LOGGER.warning(
                'No links available for %s %s. Stopping validation here',
                catalog,
                entity,
            )
            return None
        LOGGER.info('Got %d links from %s %s', count, catalog, entity)
        result = query.all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

    if result is None:
        return None
    for row in result:
        yield row.catalog_id, row.url
Example #10
def tokens_fulltext_search(target_entity: T, boolean_mode: bool,
                           tokens: Iterable[str]) -> Iterable[T]:
    if boolean_mode:
        query = ' '.join(map('+{0}'.format, tokens))
    else:
        query = ' '.join(tokens)

    ft_search = target_entity.tokens.match(query)

    session = DBManager.connect_to_db()
    result = []
    try:
        result = session.query(target_entity).filter(ft_search).all()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        # Always release the connection, even on success
        session.close()

    return result
Example #11
def gather_target_biodata(entity, catalog):
    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    db_entity = target_database.get_main_entity(catalog, entity)
    # Base biodata
    query_fields = _build_biodata_query_fields(db_entity, entity, catalog)

    session = DBManager.connect_to_db()
    query = session.query(*query_fields).filter(
        or_(db_entity.born.isnot(None), db_entity.died.isnot(None)))
    result = None
    try:
        raw_result = _run_query(query, catalog, entity)
        if raw_result is None:
            return None
        result = _parse_target_biodata_query_result(raw_result)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
    return result
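
_run_query is referenced but not defined in this excerpt. Judging from the count-then-fetch pattern of gather_target_links above, a plausible sketch (purely hypothetical; the real helper may differ):

def _run_query(query, catalog, entity):
    count = query.count()
    if count == 0:
        LOGGER.warning('No biodata available for %s %s. Stopping validation here',
                       catalog, entity)
        return None
    LOGGER.info('Got %d results from %s %s', count, catalog, entity)
    return query.all()
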
Example #12
def check_urls_cli(catalog, drop, dir_io):
    """Check for rotten URLs of an imported catalog.

    For every catalog entity, dump rotten URLs to a file.
    CSV format: URL,catalog_ID

    Use '-d' to drop rotten URLs from the DB on the fly.
    """
    for entity in target_database.supported_entities_for_target(catalog):
        out_path = os.path.join(
            dir_io, ROTTEN_URLS_FNAME.format(catalog=catalog, entity=entity)
        )

        LOGGER.info('Starting check of %s %s URLs ...', catalog, entity)
        link_entity = target_database.get_link_entity(catalog, entity)
        if not link_entity:
            LOGGER.info(
                '%s %s does not have a links table. Skipping ...',
                catalog,
                entity,
            )
            continue

        query_session = DBManager.connect_to_db()
        total = query_session.query(link_entity).count()

        rotten = 0
        if drop:
            removed = 0

        # Parallel operation
        with Pool() as pool, open(out_path, 'w', buffering=1) as fout:
            writer = csv.writer(fout)
            try:
                # Resolve every URL
                for resolved, result in tqdm(
                    pool.imap_unordered(_resolve, query_session.query(link_entity)),
                    total=total,
                ):
                    if not resolved:
                        # Dump
                        writer.writerow((result.url, result.catalog_id))
                        rotten += 1

                        # Drop from DB
                        if drop:
                            delete_session = DBManager.connect_to_db()
                            delete_session.delete(result)
                            try:
                                delete_session.commit()
                                removed += 1
                            except SQLAlchemyError as error:
                                LOGGER.error(
                                    'Failed deletion of %s: %s',
                                    result,
                                    error.__class__.__name__,
                                )
                                LOGGER.debug(error)
                                delete_session.rollback()
                            finally:
                                delete_session.close()
            except SQLAlchemyError as error:
                LOGGER.error(
                    '%s while querying %s %s URLs',
                    error.__class__.__name__,
                    catalog,
                    entity,
                )
                LOGGER.debug(error)
                query_session.rollback()
            finally:
                query_session.close()

        LOGGER.debug('Cache information: %s', url_utils.resolve.cache_info())
        LOGGER.info(
            "Total %s %s rotten URLs dumped to '%s': %d / %d",
            catalog,
            entity,
            out_path,
            rotten,
            total,
        )

        if drop:
            LOGGER.info(
                'Total %s %s rotten URLs dropped from the DB: %d / %d',
                catalog,
                entity,
                removed,
                rotten,
            )
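
A hypothetical consumer of the rotten-URLs dump written above (the actual file name is built from ROTTEN_URLS_FNAME, so the one below is illustrative; the CSV format is URL,catalog_ID as stated in the docstring):

import csv

with open('discogs_band_rotten_urls.csv') as fin:
    for url, catalog_id in csv.reader(fin):
        print(catalog_id, url)
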
Example #13
def dead_ids(catalog: str,
             entity: str,
             wd_cache=None) -> Tuple[DefaultDict, Dict]:
    """Look for dead identifiers in Wikidata.
    An identifier is dead if it does not exist in the given catalog
    when this function is executed.

    Dead identifiers should be marked with a deprecated rank in Wikidata.

    **How it works:**

    1. gather identifiers of the given catalog from relevant Wikidata items
    2. look them up in the given catalog
    3. if an identifier is not in the given catalog anymore,
       it should be deprecated

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of identifiers gathered from Wikidata
      in a previous run
    :return: the ``dict`` pair of dead identifiers
      and identifiers gathered from Wikidata
    """
    dead = defaultdict(set)
    db_entity = target_database.get_main_entity(catalog, entity)

    # Wikidata side
    if wd_cache is None:
        wd_ids = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_ids,
        )
    else:
        wd_ids = wd_cache

    # Target catalog side
    session = DBManager.connect_to_db()

    try:
        for qid in wd_ids:
            for tid in wd_ids[qid][keys.TID]:
                existing = (session.query(
                    db_entity.catalog_id).filter_by(catalog_id=tid).count())
                if existing == 0:
                    LOGGER.debug('%s %s identifier %s is dead', qid, catalog,
                                 tid)
                    dead[tid].add(qid)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of target catalog identifiers due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
    finally:
        session.close()

    LOGGER.info(
        'Check completed. Target: %s %s. Total dead identifiers: %d',
        catalog,
        entity,
        len(dead),
    )
    return dead, wd_ids
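
A hypothetical two-pass usage that reuses the Wikidata cache returned by the first call, as the wd_cache parameter suggests:

dead, wd_cache = dead_ids('discogs', 'musician')
# The second run skips the expensive Wikidata gathering step
dead_again, _ = dead_ids('discogs', 'musician', wd_cache=wd_cache)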