Example #1
def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False
    )

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
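A minimal usage sketch for ``build_target``, assuming the soweego modules above are importable; the catalog identifiers are hypothetical placeholders.

# Hypothetical call: stream Discogs musician data for two made-up catalog IDs
for df_chunk in build_target(
    'classification', 'discogs', 'musician', {'123', '456'}
):
    # Each chunk is a pandas.DataFrame of at most 1,000 rows (chunksize=1000)
    print(df_chunk.shape)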
Example #2
def _classification_set_generator(
        catalog, entity,
        dir_io) -> Iterator[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Yield (Wikidata chunk, target chunk, feature vectors) triples
    for the classification task."""
    goal = 'classification'

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Collect samples via queries to the target DB
        samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(samples.get_level_values(keys.TID)))

        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        # Extract features
        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i))
        feature_vectors = workflow.extract_features(samples, wd_chunk,
                                                    target_chunk,
                                                    features_path)

        yield wd_chunk, target_chunk, feature_vectors

        LOGGER.info('Chunk %d classified', i)
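A sketch of how the private generator above might be consumed; the I/O directory is a placeholder.

# Hypothetical consumption of the classification set generator
for wd_chunk, target_chunk, feature_vectors in _classification_set_generator(
    'discogs', 'musician', '/tmp/soweego'  # placeholder I/O directory
):
    print(wd_chunk.shape, target_chunk.shape, feature_vectors.shape)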
Example #3
def gather_target_biodata(entity, catalog):
    """Gather birth/death dates, birth/death places, and gender metadata
    from the given catalog."""
    LOGGER.info(
        'Gathering %s birth/death dates/places and gender metadata ...',
        catalog)
    db_entity = target_database.get_main_entity(catalog, entity)
    # Base biodata
    query_fields = _build_biodata_query_fields(db_entity, entity, catalog)

    session = DBManager.connect_to_db()
    query = session.query(*query_fields).filter(
        or_(db_entity.born.isnot(None), db_entity.died.isnot(None)))
    result = None
    try:
        raw_result = _run_query(query, catalog, entity)
        if raw_result is None:
            return None
        result = _parse_target_biodata_query_result(raw_result)
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()
    return result
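A hedged sketch of calling ``gather_target_biodata``; note that it returns ``None`` when the underlying query yields nothing.

# Hypothetical call: gather IMDb director biographical metadata
biodata = gather_target_biodata('director', 'imdb')
if biodata is None:
    print('No biodata gathered')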
Example #4
def build_training_set(
    catalog: str, entity: str, dir_io: str
) -> Tuple[pd.DataFrame, pd.MultiIndex]:
    """Build a training set.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the feature vectors and positive samples pair.
      Features are computed by comparing *(QID, catalog ID)* pairs.
      Positive samples are catalog IDs available in Wikidata
    """
    goal = 'training'

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    positive_samples, feature_vectors = None, None

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Positive samples come from Wikidata
        if positive_samples is None:
            positive_samples = wd_chunk[keys.TID]
        else:
            # Concatenate the current chunk and rebind `positive_samples`
            # at each iteration, instead of appending each chunk to a list
            # and concatenating it once after the loop.
            # Reason: keeping many small pandas objects around
            # is less memory-efficient
            positive_samples = pd.concat([positive_samples, wd_chunk[keys.TID]])

        # All samples come from queries to the target DB
        # and include negative ones
        all_samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from all samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(all_samples.get_level_values(keys.TID))
        )
        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )

        # Extract features from all samples
        chunk_fv = workflow.extract_features(
            all_samples, wd_chunk, target_chunk, features_path
        )

        if feature_vectors is None:
            feature_vectors = chunk_fv
        else:
            feature_vectors = pd.concat([feature_vectors, chunk_fv], sort=False)

    # Final positive samples index
    positive_samples_index = pd.MultiIndex.from_tuples(
        zip(positive_samples.index, positive_samples),
        names=[keys.QID, keys.TID],
    )

    LOGGER.info('Built positive samples index from Wikidata')

    feature_vectors = feature_vectors.fillna(constants.FEATURE_MISSING_VALUE)

    return feature_vectors, positive_samples_index
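A minimal sketch of building a training set; the I/O directory is a placeholder.

# Hypothetical call: build a MusicBrainz band training set
feature_vectors, positive_samples_index = build_training_set(
    'musicbrainz', 'band', '/tmp/soweego'  # placeholder I/O directory
)
# Features are computed over (QID, catalog ID) pairs;
# positive samples come from Wikidata
print(feature_vectors.shape, len(positive_samples_index))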
Example #5
def dead_ids(catalog: str,
             entity: str,
             wd_cache=None) -> Tuple[DefaultDict, Dict]:
    """Look for dead identifiers in Wikidata.
    An identifier is dead if it does not exist in the given catalog
    when this function is executed.

    Dead identifiers should be marked with a deprecated rank in Wikidata.

    **How it works:**

    1. gather identifiers of the given catalog from relevant Wikidata items
    2. look them up in the given catalog
    3. if an identifier is not in the given catalog anymore,
       it should be deprecated

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param wd_cache: (optional) a ``dict`` of identifiers gathered from Wikidata
      in a previous run
    :return: the ``dict`` pair of dead identifiers
      and identifiers gathered from Wikidata
    """
    dead = defaultdict(set)
    db_entity = target_database.get_main_entity(catalog, entity)

    # Wikidata side
    if wd_cache is None:
        wd_ids = {}
        data_gathering.gather_target_ids(
            entity,
            catalog,
            target_database.get_catalog_pid(catalog, entity),
            wd_ids,
        )
    else:
        wd_ids = wd_cache

    # Target catalog side
    session = DBManager.connect_to_db()

    try:
        for qid in wd_ids:
            for tid in wd_ids[qid][keys.TID]:
                existing = (session.query(
                    db_entity.catalog_id).filter_by(catalog_id=tid).count())
                if existing == 0:
                    LOGGER.debug('%s %s identifier %s is dead', qid, catalog,
                                 tid)
                    dead[tid].add(qid)
        session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of target catalog identifiers due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
    finally:
        session.close()

    LOGGER.info(
        'Check completed. Target: %s %s. Total dead identifiers: %d',
        catalog,
        entity,
        len(dead),
    )
    return dead, wd_ids
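A sketch showing how the Wikidata cache returned by ``dead_ids`` can be reused in a later run through the ``wd_cache`` parameter, as the docstring suggests.

# First run: gather identifiers from Wikidata, then check them against the catalog
dead, wd_cache = dead_ids('discogs', 'musician')

# Later run: reuse the cache to skip the Wikidata gathering step
dead_again, _ = dead_ids('discogs', 'musician', wd_cache=wd_cache)

print(len(dead), 'dead identifiers found')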
Example #6
def _run(catalog, entity, rule, check_dates, upload, sandbox, dir_io):
    """Run the given baseline matching rule ('perfect', 'links', 'names',
    or 'all') for a catalog and entity pair."""
    wd_io_path = os.path.join(
        dir_io, constants.WD_CLASSIFICATION_SET.format(catalog, entity))
    base_entity = target_database.get_main_entity(catalog, entity)
    link_entity = target_database.get_link_entity(catalog, entity)

    if rule == 'links' and link_entity is None:
        LOGGER.warning(
            "No links available for %s %s. Stopping baseline here ...",
            catalog,
            entity,
        )
        return

    pid = target_database.get_catalog_pid(catalog, entity)

    with gzip.open(wd_io_path, 'rt') as wd_io:
        if rule in ('perfect', 'all'):
            wd_io.seek(0)

            LOGGER.info('Starting perfect names linker ...')

            result = _perfect_names_linker(wd_io, base_entity, pid,
                                           check_dates)

            perfect_path = os.path.join(
                dir_io, constants.BASELINE_PERFECT.format(catalog, entity))
            os.makedirs(os.path.dirname(perfect_path), exist_ok=True)
            _handle_result(result, rule, catalog, perfect_path, upload,
                           sandbox)

        if rule == 'all' and link_entity is None:
            LOGGER.warning(
                "No links available for %s %s. Won't run the 'links' rule ...",
                catalog,
                entity,
            )

        if rule in ('links', 'all') and link_entity is not None:
            wd_io.seek(0)

            LOGGER.info('Starting similar link tokens linker ...')

            result = _similar_tokens_linker(
                wd_io,
                link_entity,
                (keys.URL, keys.URL_TOKENS),
                pid,
                False,
                url_utils.tokenize,
            )

            links_path = os.path.join(
                dir_io, constants.BASELINE_LINKS.format(catalog, entity))
            os.makedirs(os.path.dirname(links_path), exist_ok=True)
            _handle_result(result, rule, catalog, links_path, upload, sandbox)

        if rule in ('names', 'all'):
            wd_io.seek(0)

            LOGGER.info('Starting similar name tokens linker ...')

            result = _similar_tokens_linker(
                wd_io,
                base_entity,
                (keys.NAME, keys.NAME_TOKENS),
                pid,
                check_dates,
                text_utils.tokenize,
            )

            names_path = os.path.join(
                dir_io, constants.BASELINE_NAMES.format(catalog, entity))
            os.makedirs(os.path.dirname(names_path), exist_ok=True)
            _handle_result(result, rule, catalog, names_path, upload, sandbox)
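A hedged sketch of invoking the baseline runner above; the flag values and the I/O directory are illustrative.

# Hypothetical baseline run: perfect name matches for Discogs musicians,
# with date checks, no upload to Wikidata, and a placeholder I/O directory
_run(
    'discogs', 'musician', 'perfect',
    check_dates=True, upload=False, sandbox=False, dir_io='/tmp/soweego',
)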
Example #7
def execute(
    model_path: str,
    catalog: str,
    entity: str,
    threshold: float,
    name_rule: bool,
    dir_io: str,
) -> Iterator[pd.Series]:
    """Run a supervised linker.

    1. Build the classification set relevant to the given catalog and entity
    2. Generate links between Wikidata items and catalog identifiers

    :param model_path: path to a trained model file
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param threshold: minimum confidence score for generated links.
      Those below this value are discarded.
      Must be a float between 0 and 1
    :param name_rule: whether to enable the rule on full names or not:
      if *True*, links with different full names
      are discarded after classification
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the generator yielding chunks of links
    """
    goal = 'classification'
    classifier = joblib.load(model_path)

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Collect samples via queries to the target DB
        samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(samples.get_level_values(keys.TID))
        )

        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        # Extract features
        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )
        feature_vectors = workflow.extract_features(
            samples, wd_chunk, target_chunk, features_path
        )

        # The classification set must have the same feature space
        # as the training one
        _add_missing_feature_columns(classifier, feature_vectors)

        predictions = (
            # LSVM doesn't support probability scores
            classifier.predict(feature_vectors)
            if isinstance(classifier, rl.SVMClassifier)
            else classifier.prob(feature_vectors)
        )

        # Full name rule: if names differ, it's not a link
        if name_rule:
            LOGGER.info('Applying full names rule ...')
            predictions = pd.DataFrame(predictions).apply(
                _zero_when_different_names,
                axis=1,
                args=(wd_chunk, target_chunk),
            )

        # Wikidata URL rule: if the target ID has a Wikidata URL, it's a link
        if target_chunk.get(keys.URL) is not None:
            predictions = pd.DataFrame(predictions).apply(
                _one_when_wikidata_link_correct, axis=1, args=(target_chunk,)
            )

        LOGGER.info('Chunk %d classified', i)

        # Filter by threshold
        above_threshold = predictions[predictions >= threshold]

        # Remove duplicates
        yield above_threshold[~above_threshold.index.duplicated()]
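A sketch of consuming the ``execute`` generator and collecting all links above the threshold; the model path and I/O directory are placeholders.

# Hypothetical run: link Discogs musicians with a previously trained model
links = []
for chunk in execute(
    '/tmp/soweego/discogs_musician_model.pkl',  # placeholder model path
    'discogs', 'musician',
    threshold=0.5, name_rule=True, dir_io='/tmp/soweego',
):
    links.append(chunk)

all_links = pd.concat(links) if links else pd.Series(dtype=float)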