Example #1
def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False
    )

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
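
A minimal usage sketch, assuming ``build_target`` is exposed by ``soweego.linker.workflow`` (as the cross-references in the docstrings suggest) and that a ToolsDB connection is configured for ``DBManager``; the catalog IDs below are placeholders.

# Hypothetical usage of build_target: the module path and the IDs are
# assumptions, and a working database connection is required.
from soweego.linker import workflow

discogs_ids = {'123456', '654321'}  # placeholder Discogs musician IDs
reader = workflow.build_target('training', 'discogs', 'musician', discogs_ids)

# read_sql with chunksize streams the result: iterate lazily over
# 1000-row DataFrame chunks instead of loading everything in memory
for chunk in reader:
    print(chunk.shape)
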
Example #2
def find_samples(
    goal: str,
    catalog: str,
    wikidata_column: pd.Series,
    chunk_number: int,
    target_db_entity: constants.DB_ENTITY,
    dir_io: str,
) -> pd.MultiIndex:
    """Build a blocking index by looking up target catalog identifiers given a
    Wikidata dataset column. The column should hold strings
    for the search to be meaningful.

    Under the hood, run
    `full-text search <https://mariadb.com/kb/en/library/full-text-index-overview/>`_
    in *natural language mode* against the target catalog database.

    This function uses multithreaded parallel processing.

    :param goal: ``{'training', 'classification'}``.
      Whether the samples are for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param wikidata_column: a Wikidata dataset column holding values suitable
      for full-text search against the target database
    :param chunk_number: which Wikidata chunk will undergo blocking.
      Typically returned by calling :func:`enumerate` over
      :func:`preprocess_wikidata() <soweego.linker.workflow.preprocess_wikidata>`
    :param target_db_entity: an ORM entity (AKA table) of the target catalog
      database that full-text search should aim at
    :param dir_io: input/output directory where index chunks
      will be read/written
    :return: the blocking index holding candidate pairs
    """
    check_goal_value(goal)

    samples_path = os.path.join(
        dir_io,
        constants.SAMPLES.format(catalog, target_db_entity.__name__, goal,
                                 chunk_number),
    )

    # Return cached samples early, for development purposes
    if os.path.isfile(samples_path):
        LOGGER.info(
            "Will reuse existing %s %s samples index, chunk %d: '%s'",
            catalog,
            goal,
            chunk_number,
            samples_path,
        )
        return pd.read_pickle(samples_path)

    LOGGER.info(
        "Blocking on Wikidata column '%s' "
        "via full-text search to find all samples ...",
        wikidata_column.name,
    )

    wikidata_column.dropna(inplace=True)

    samples = _fire_queries(wikidata_column, target_db_entity)
    samples_index = pd.MultiIndex.from_tuples(samples,
                                              names=[keys.QID, keys.TID])

    LOGGER.debug(
        '%s %s samples index chunk %d random example:\n%s',
        catalog,
        goal,
        chunk_number,
        samples_index.to_series().sample(5),
    )

    os.makedirs(os.path.dirname(samples_path), exist_ok=True)
    pd.to_pickle(samples_index, samples_path)

    LOGGER.info(
        "%s %s samples index chunk %d dumped to '%s'",
        catalog,
        goal,
        chunk_number,
        samples_path,
    )

    LOGGER.info('Built blocking index of all samples, chunk %d', chunk_number)

    return samples_index
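
A sketch of driving ``find_samples`` over preprocessed Wikidata chunks, assuming it lives in a ``blocking`` module next to ``soweego.linker.workflow``; the ORM entity class, the ``name_tokens`` column, and the I/O directory are hypothetical.

# Hypothetical driver: the module paths, the ORM entity class, the
# 'name_tokens' column, and the I/O directory are all assumptions.
from soweego.importer.models.discogs_entity import DiscogsMusicianEntity
from soweego.linker import blocking, workflow


def block_all_chunks(wikidata_reader) -> list:
    """Run blocking on every preprocessed Wikidata chunk."""
    indexes = []
    for chunk_number, chunk in enumerate(
        workflow.preprocess_wikidata('training', wikidata_reader)
    ):
        candidate_pairs = blocking.find_samples(
            'training',
            'discogs',
            chunk['name_tokens'],   # a string column fit for full-text search
            chunk_number,
            DiscogsMusicianEntity,  # assumed ORM entity for Discogs musicians
            '/tmp/soweego',         # hypothetical input/output directory
        )
        indexes.append(candidate_pairs)
    return indexes
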
Example #3
def preprocess_target(goal: str,
                      target_reader: Iterator[pd.DataFrame]) -> pd.DataFrame:
    """Preprocess a target catalog dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_target`.

    **Preprocessing actions:**

    1. drop unneeded columns holding target DB primary keys
    2. rename non-null catalog ID columns & drop others
    3. drop columns with null values only
    4. pair dates with their precision and drop precision columns
       when applicable
    5. aggregate denormalized data on target ID
    6. *(shared with* :func:`preprocess_wikidata` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param target_reader: a dataset reader as returned by
      :func:`build_target`
    :return: the preprocessed target dataset as a single
      :class:`pandas.DataFrame`
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing target ...')

    # Target data is denormalized, so we must consume the input generator
    # to perform consistent aggregations later
    target = pd.concat([chunk for chunk in target_reader], sort=False)

    # 1. Drop target DB internal ID columns
    LOGGER.info("Dropping '%s' columns ...", keys.INTERNAL_ID)
    target.drop(columns=keys.INTERNAL_ID, inplace=True)
    log_dataframe_info(LOGGER, target,
                       f"Dropped '{keys.INTERNAL_ID}'' columns")

    # 2. Rename non-null catalog ID column & drop others
    _rename_or_drop_tid_columns(target)

    # 3. Drop columns with null values only
    LOGGER.info('Dropping columns with null values only ...')
    _drop_null_columns(target)

    # 4. Pair dates with their precision & drop precision columns
    _pair_dates(target)

    # 5. Aggregate denormalized data on target ID
    # TODO Token lists may contain duplicate tokens
    LOGGER.info("Aggregating denormalized data on '%s' column ...", keys.TID)
    target = target.groupby(keys.TID).agg(lambda x: list(set(x)))
    log_dataframe_info(LOGGER, target,
                       f"Data indexed and aggregated on '{keys.TID}' column")

    # 6. Shared preprocessing
    target = _shared_preprocessing(
        target,
        _will_handle_birth_dates(target),
        _will_handle_death_dates(target),
    )

    LOGGER.info('Target preprocessing done')

    return target
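
Steps 1 and 2 of the target side are meant to be pipelined; a sketch under the same assumptions as above (module path and MusicBrainz IDs are placeholders).

# Hypothetical pipeline: build_target feeds preprocess_target.
from soweego.linker import workflow

mb_ids = {'band-id-1', 'band-id-2'}  # placeholder MusicBrainz band IDs
reader = workflow.build_target('classification', 'musicbrainz', 'band', mb_ids)
target_dataset = workflow.preprocess_target('classification', reader)

# One row per target ID, with denormalized values aggregated into lists
print(target_dataset.head())
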
Example #4
def preprocess_wikidata(goal: str,
                        wikidata_reader: JsonReader) -> Iterator[pd.DataFrame]:
    """Preprocess a Wikidata dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_wikidata`.

    **Preprocessing actions:**

    1. set QIDs as :class:`pandas.core.indexes.base.Index` of the chunk
    2. drop columns with null values only
    3. *(training)* ensure one target ID per QID
    4. tokenize names, URLs, genres, when applicable
    5. *(shared with* :func:`preprocess_target` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param wikidata_reader: a dataset reader as returned by
      :func:`build_wikidata`
    :return: the generator yielding preprocessed
      :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing Wikidata %s set ...', goal)

    for i, chunk in enumerate(wikidata_reader, 1):
        # 1. QID as index
        chunk.set_index(keys.QID, inplace=True)
        log_dataframe_info(LOGGER, chunk,
                           f"Built index from '{keys.QID}' column")

        # 2. Drop columns with null values only
        _drop_null_columns(chunk)

        # 3. Training only: ensure 1 target ID
        if goal == 'training':
            # This wipes out QIDs with > 1 positive samples,
            # but the impact can be neglected
            chunk[keys.TID] = chunk[keys.TID].map(
                lambda cell: cell[0] if isinstance(cell, list) else cell)

        # 4a. Tokenize names
        for column in constants.NAME_FIELDS:
            if chunk.get(column) is not None:
                chunk[f'{column}_tokens'] = chunk[column].apply(
                    _tokenize_values, args=(text_utils.tokenize, ))

        # 4b. Tokenize genres if available
        if chunk.get(keys.GENRES) is not None:
            chunk[keys.GENRES] = chunk[keys.GENRES].apply(
                _tokenize_values, args=(text_utils.tokenize, ))

        # 4c. Tokenize URLs
        chunk[keys.URL_TOKENS] = chunk[keys.URL].apply(
            _tokenize_values, args=(url_utils.tokenize, ))

        # 5. Shared preprocessing
        chunk = _shared_preprocessing(
            chunk,
            _will_handle_birth_dates(chunk),
            _will_handle_death_dates(chunk),
        )

        LOGGER.info('Chunk %d done', i)

        yield chunk
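
``preprocess_wikidata`` yields chunks as well; a sketch of consuming them into a single frame, with the reader assumed to come from ``build_wikidata`` (whose call is not reproduced here).

# Hypothetical consumer: `wikidata_reader` is assumed to be the pandas
# JsonReader returned by build_wikidata (call not shown in this excerpt).
import pandas as pd

from soweego.linker import workflow


def consume_wikidata(wikidata_reader) -> pd.DataFrame:
    """Concatenate all preprocessed Wikidata chunks into one DataFrame."""
    chunks = workflow.preprocess_wikidata('classification', wikidata_reader)
    return pd.concat(list(chunks), sort=False)
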