def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False
    )

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
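# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the production workflow):
# stream a target dataset chunk by chunk. The catalog IDs below are
# hypothetical placeholders, and a working ToolsDB connection is assumed to
# be already configured for DBManager.
# ---------------------------------------------------------------------------
def _example_build_target_usage() -> None:
    identifiers = {'123', '456', '789'}  # hypothetical catalog IDs

    # Each yielded chunk is a pandas.DataFrame of at most 1,000 rows
    for chunk in build_target('training', 'discogs', 'musician', identifiers):
        LOGGER.info('Got a target chunk with shape %s', chunk.shape)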
def find_samples(
    goal: str,
    catalog: str,
    wikidata_column: pd.Series,
    chunk_number: int,
    target_db_entity: constants.DB_ENTITY,
    dir_io: str,
) -> pd.MultiIndex:
    """Build a blocking index by looking up target catalog identifiers
    given a Wikidata dataset column. A meaningful column should hold strings.

    Under the hood, run
    `full-text search <https://mariadb.com/kb/en/library/full-text-index-overview/>`_
    in *natural language mode* against the target catalog database.

    This function uses multithreaded parallel processing.

    :param goal: ``{'training', 'classification'}``.
      Whether the samples are for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param wikidata_column: a Wikidata dataset column holding values
      suitable for full-text search against the target database
    :param chunk_number: which Wikidata chunk will undergo blocking.
      Typically returned by calling :func:`enumerate` over
      :func:`preprocess_wikidata() <soweego.linker.workflow.preprocess_wikidata>`
    :param target_db_entity: an ORM entity (AKA table) of the target
      catalog database that full-text search should aim at
    :param dir_io: input/output directory where index chunks
      will be read/written
    :return: the blocking index holding candidate pairs
    """
    check_goal_value(goal)

    samples_path = os.path.join(
        dir_io,
        constants.SAMPLES.format(
            catalog, target_db_entity.__name__, goal, chunk_number
        ),
    )

    # Early return cached samples, for development purposes
    if os.path.isfile(samples_path):
        LOGGER.info(
            "Will reuse existing %s %s samples index, chunk %d: '%s'",
            catalog,
            goal,
            chunk_number,
            samples_path,
        )
        return pd.read_pickle(samples_path)

    LOGGER.info(
        "Blocking on Wikidata column '%s' "
        "via full-text search to find all samples ...",
        wikidata_column.name,
    )

    wikidata_column.dropna(inplace=True)

    samples = _fire_queries(wikidata_column, target_db_entity)
    samples_index = pd.MultiIndex.from_tuples(
        samples, names=[keys.QID, keys.TID]
    )

    LOGGER.debug(
        '%s %s samples index chunk %d random example:\n%s',
        catalog,
        goal,
        chunk_number,
        samples_index.to_series().sample(5),
    )

    os.makedirs(os.path.dirname(samples_path), exist_ok=True)
    pd.to_pickle(samples_index, samples_path)
    LOGGER.info(
        "%s %s samples index chunk %d dumped to '%s'",
        catalog,
        goal,
        chunk_number,
        samples_path,
    )

    LOGGER.info('Built blocking index of all samples, chunk %d', chunk_number)

    return samples_index
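# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): block preprocessed Wikidata chunks
# against a target catalog table, following the docstring above. It relies on
# `preprocess_wikidata` (documented further down; in the upstream project it
# lives in soweego.linker.workflow). The column name, ORM entity, and I/O
# directory passed in are assumptions of the caller, not fixed values.
# ---------------------------------------------------------------------------
def _example_find_samples_usage(
    wikidata_reader: JsonReader,
    target_db_entity: constants.DB_ENTITY,
    column_name: str,
) -> None:
    for chunk_number, chunk in enumerate(
        preprocess_wikidata('training', wikidata_reader)
    ):
        candidate_pairs = find_samples(
            'training',
            'discogs',
            chunk[column_name],  # a string column, e.g. names
            chunk_number,
            target_db_entity,
            '/tmp/soweego',  # hypothetical working directory
        )
        LOGGER.info(
            'Chunk %d: %d candidate (QID, TID) pairs',
            chunk_number,
            len(candidate_pairs),
        )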
def preprocess_target(
    goal: str, target_reader: Iterator[pd.DataFrame]
) -> pd.DataFrame:
    """Preprocess a target catalog dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_target`.

    **Preprocessing actions:**

    1. drop unneeded columns holding target DB primary keys
    2. rename non-null catalog ID columns & drop others
    3. drop columns with null values only
    4. pair dates with their precision and drop precision columns
       when applicable
    5. aggregate denormalized data on target ID
    6. *(shared with* :func:`preprocess_wikidata` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param target_reader: a dataset reader as returned by
      :func:`build_target`
    :return: the preprocessed dataset as a single :class:`pandas.DataFrame`
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing target ...')

    # Target data is denormalized, so we must consume the input generator
    # to perform consistent aggregations later
    target = pd.concat([chunk for chunk in target_reader], sort=False)

    # 1. Drop target DB internal ID columns
    LOGGER.info("Dropping '%s' columns ...", keys.INTERNAL_ID)
    target.drop(columns=keys.INTERNAL_ID, inplace=True)
    log_dataframe_info(
        LOGGER, target, f"Dropped '{keys.INTERNAL_ID}' columns"
    )

    # 2. Rename non-null catalog ID column & drop others
    _rename_or_drop_tid_columns(target)

    # 3. Drop columns with null values only
    LOGGER.info('Dropping columns with null values only ...')
    _drop_null_columns(target)

    # 4. Pair dates with their precision & drop precision columns
    _pair_dates(target)

    # 5. Aggregate denormalized data on target ID
    # TODO Token lists may contain duplicate tokens
    LOGGER.info("Aggregating denormalized data on '%s' column ...", keys.TID)
    target = target.groupby(keys.TID).agg(lambda x: list(set(x)))
    log_dataframe_info(
        LOGGER, target, f"Data indexed and aggregated on '{keys.TID}' column"
    )

    # 6. Shared preprocessing
    target = _shared_preprocessing(
        target,
        _will_handle_birth_dates(target),
        _will_handle_death_dates(target),
    )

    LOGGER.info('Target preprocessing done')

    return target
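# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): pipeline workflow steps 1 and 2 for a
# target catalog. The catalog IDs are hypothetical placeholders.
# ---------------------------------------------------------------------------
def _example_preprocess_target_usage() -> None:
    identifiers = {'123', '456'}  # hypothetical catalog IDs
    reader = build_target('classification', 'imdb', 'actor', identifiers)

    # Consumes the whole generator and returns a single DataFrame,
    # indexed and aggregated on the target ID column
    target = preprocess_target('classification', reader)
    LOGGER.info('Preprocessed target shape: %s', target.shape)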
def preprocess_wikidata(
    goal: str, wikidata_reader: JsonReader
) -> Iterator[pd.DataFrame]:
    """Preprocess a Wikidata dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_wikidata`.

    **Preprocessing actions:**

    1. set QIDs as :class:`pandas.core.indexes.base.Index` of the chunk
    2. drop columns with null values only
    3. *(training)* ensure one target ID per QID
    4. tokenize names, URLs, genres, when applicable
    5. *(shared with* :func:`preprocess_target` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param wikidata_reader: a dataset reader as returned by
      :func:`build_wikidata`
    :return: the generator yielding preprocessed
      :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing Wikidata %s set ...', goal)

    for i, chunk in enumerate(wikidata_reader, 1):
        # 1. QID as index
        chunk.set_index(keys.QID, inplace=True)
        log_dataframe_info(
            LOGGER, chunk, f"Built index from '{keys.QID}' column"
        )

        # 2. Drop columns with null values only
        _drop_null_columns(chunk)

        # 3. Training only: ensure 1 target ID
        if goal == 'training':
            # This wipes out QIDs with > 1 positive samples,
            # but the impact can be neglected
            chunk[keys.TID] = chunk[keys.TID].map(
                lambda cell: cell[0] if isinstance(cell, list) else cell
            )

        # 4a. Tokenize names
        for column in constants.NAME_FIELDS:
            if chunk.get(column) is not None:
                chunk[f'{column}_tokens'] = chunk[column].apply(
                    _tokenize_values, args=(text_utils.tokenize,)
                )

        # 4b. Tokenize genres if available
        if chunk.get(keys.GENRES) is not None:
            chunk[keys.GENRES] = chunk[keys.GENRES].apply(
                _tokenize_values, args=(text_utils.tokenize,)
            )

        # 4c. Tokenize URLs
        chunk[keys.URL_TOKENS] = chunk[keys.URL].apply(
            _tokenize_values, args=(url_utils.tokenize,)
        )

        # 5. Shared preprocessing
        chunk = _shared_preprocessing(
            chunk,
            _will_handle_birth_dates(chunk),
            _will_handle_death_dates(chunk),
        )

        LOGGER.info('Chunk %d done', i)

        yield chunk
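# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): consume preprocessed Wikidata chunks.
# The reader is assumed to come from `build_wikidata`, as stated in the
# docstring above.
# ---------------------------------------------------------------------------
def _example_preprocess_wikidata_usage(wikidata_reader: JsonReader) -> None:
    for chunk in preprocess_wikidata('classification', wikidata_reader):
        # Each chunk is indexed on QIDs and carries tokenized names and URLs
        LOGGER.info('Preprocessed Wikidata chunk shape: %s', chunk.shape)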