def _rename_or_drop_tid_columns(target):
    LOGGER.info(
        "Renaming '%s' column with no null values to '%s' "
        "& dropping '%s' columns with null values ...",
        keys.CATALOG_ID,
        keys.TID,
        keys.CATALOG_ID,
    )

    # If `catalog_id` is one column (i.e., a `Series`),
    # then it won't have None values
    if isinstance(target[keys.CATALOG_ID], pd.Series):
        target[keys.TID] = target[keys.CATALOG_ID]
    else:
        no_nulls = target[keys.CATALOG_ID].dropna(axis=1)
        # It may happen that more than 1 column has no null values:
        # in this case, they must be identical, so take the first one
        target[keys.TID] = (
            no_nulls.iloc[:, 0]
            if isinstance(no_nulls, pd.DataFrame)
            else no_nulls
        )

    target.drop(columns=keys.CATALOG_ID, inplace=True)

    log_dataframe_info(
        LOGGER,
        target,
        f"Renamed '{keys.CATALOG_ID}' column with no null values to "
        f"'{keys.TID}' & dropped '{keys.CATALOG_ID}' columns with null values",
    )
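
# A self-contained sketch of the duplicate-column branch above (a hedged
# illustration, not part of the pipeline; the literal names 'catalog_id'
# and 'tid' stand in for keys.CATALOG_ID and keys.TID):
def _demo_rename_or_drop_tid_columns():
    import pandas as pd  # local import keeps the sketch self-contained

    # Denormalizing SQL joins can yield several 'catalog_id' columns,
    # some of which hold nulls
    frame = pd.DataFrame(
        [['t1', 't1', None], ['t2', 't2', None]],
        columns=['catalog_id', 'catalog_id', 'catalog_id'],
    )
    # Selecting a duplicate label returns a DataFrame, not a Series
    no_nulls = frame['catalog_id'].dropna(axis=1)
    # The surviving columns are identical, so the first one is enough
    tid = no_nulls.iloc[:, 0] if isinstance(no_nulls, pd.DataFrame) else no_nulls
    assert list(tid) == ['t1', 't2']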
def _handle_dates(df, column):
    # Datasets are hitting pandas timestamp limitations, see
    # http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
    # Parse into Period instead, see
    # http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-oob
    if df.get(column) is None:
        LOGGER.warning(
            "No '%s' column in DataFrame, won't handle its dates. "
            "Perhaps it was dropped because it contained null values only",
            column,
        )
        # Bail out: without this, the lookup below would raise a KeyError
        return

    df[column] = df[column].map(_parse_dates_list, na_action='ignore')

    log_dataframe_info(LOGGER, df, 'Parsed dates')
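
# Why Period instead of Timestamp: a minimal sketch of the out-of-bounds
# issue the links above describe. This only illustrates the idea; the
# actual parsing lives in _parse_dates_list, defined elsewhere in this
# module:
def _demo_out_of_bounds_dates():
    import pandas as pd  # local import keeps the sketch self-contained

    # Nanosecond-resolution Timestamps only cover roughly 1677-2262,
    # so historical dates can raise OutOfBoundsDatetime ...
    try:
        pd.Timestamp('1000-01-01')
    except pd.errors.OutOfBoundsDatetime:
        pass
    # ... while Periods handle them fine
    assert pd.Period('1000-01-01', freq='D').year == 1000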
def _pair_dates(target):
    if _will_handle_birth_dates(target):
        LOGGER.info('Pairing birth date columns with precision ones ...')

        target[keys.DATE_OF_BIRTH] = list(
            zip(target[keys.DATE_OF_BIRTH], target[keys.BIRTH_PRECISION])
        )
        target.drop(columns=[keys.BIRTH_PRECISION], inplace=True)

        log_dataframe_info(
            LOGGER, target, 'Paired birth date columns with precision ones'
        )

    if _will_handle_death_dates(target):
        LOGGER.info('Pairing death date columns with precision ones ...')

        target[keys.DATE_OF_DEATH] = list(
            zip(target[keys.DATE_OF_DEATH], target[keys.DEATH_PRECISION])
        )
        target.drop(columns=[keys.DEATH_PRECISION], inplace=True)

        log_dataframe_info(
            LOGGER, target, 'Paired death date columns with precision ones'
        )
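
# Toy reproduction of the pairing idiom above: each date cell becomes a
# (date, precision) tuple. The column names and the Wikidata-style
# integer precision (e.g., 9 = year, 11 = day) are illustrative
# assumptions:
def _demo_pair_dates():
    import pandas as pd  # local import keeps the sketch self-contained

    frame = pd.DataFrame({'born': ['1961-01-01'], 'born_precision': [11]})
    frame['born'] = list(zip(frame['born'], frame['born_precision']))
    frame.drop(columns=['born_precision'], inplace=True)
    assert frame.at[0, 'born'] == ('1961-01-01', 11)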
def _drop_null_columns(target):
    target.dropna(axis=1, how='all', inplace=True)
    log_dataframe_info(LOGGER, target, 'Dropped columns with null values only')
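
# Quick illustrative check: how='all' only removes columns that are
# entirely null, so partially filled ones survive:
def _demo_drop_null_columns():
    import pandas as pd  # local import keeps the sketch self-contained

    frame = pd.DataFrame({'partial': [1, None], 'empty': [None, None]})
    frame.dropna(axis=1, how='all', inplace=True)
    assert list(frame.columns) == ['partial']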
def preprocess_target(
    goal: str, target_reader: Iterator[pd.DataFrame]
) -> pd.DataFrame:
    """Preprocess a target catalog dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_target`.

    **Preprocessing actions:**

    1. drop unneeded columns holding target DB primary keys
    2. rename non-null catalog ID columns & drop others
    3. drop columns with null values only
    4. pair dates with their precision and drop precision columns
       when applicable
    5. aggregate denormalized data on target ID
    6. *(shared with* :func:`preprocess_wikidata` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param target_reader: a dataset reader as returned by
      :func:`build_target`
    :return: the preprocessed target dataset
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing target ...')

    # Target data is denormalized, so we must consume the input generator
    # to perform consistent aggregations later
    target = pd.concat([chunk for chunk in target_reader], sort=False)

    # 1. Drop target DB internal ID columns
    LOGGER.info("Dropping '%s' columns ...", keys.INTERNAL_ID)
    target.drop(columns=keys.INTERNAL_ID, inplace=True)
    log_dataframe_info(
        LOGGER, target, f"Dropped '{keys.INTERNAL_ID}' columns"
    )

    # 2. Rename non-null catalog ID column & drop others
    _rename_or_drop_tid_columns(target)

    # 3. Drop columns with null values only
    LOGGER.info('Dropping columns with null values only ...')
    _drop_null_columns(target)

    # 4. Pair dates with their precision & drop precision columns
    _pair_dates(target)

    # 5. Aggregate denormalized data on target ID
    # TODO Token lists may contain duplicate tokens
    LOGGER.info("Aggregating denormalized data on '%s' column ...", keys.TID)
    target = target.groupby(keys.TID).agg(lambda x: list(set(x)))
    log_dataframe_info(
        LOGGER, target, f"Data indexed and aggregated on '{keys.TID}' column"
    )

    # 6. Shared preprocessing
    target = _shared_preprocessing(
        target,
        _will_handle_birth_dates(target),
        _will_handle_death_dates(target),
    )

    LOGGER.info('Target preprocessing done')
    return target
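
# Step 5 above collapses denormalized rows into lists of unique values
# per target ID. A toy reproduction of the groupby/agg idiom, with the
# literal 'tid' standing in for keys.TID:
def _demo_aggregate_on_tid():
    import pandas as pd  # local import keeps the sketch self-contained

    frame = pd.DataFrame(
        {'tid': ['t1', 't1', 't2'], 'name': ['foo', 'bar', 'foo']}
    )
    aggregated = frame.groupby('tid').agg(lambda x: list(set(x)))
    assert sorted(aggregated.at['t1', 'name']) == ['bar', 'foo']
    assert aggregated.at['t2', 'name'] == ['foo']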
def preprocess_wikidata(
    goal: str, wikidata_reader: JsonReader
) -> Iterator[pd.DataFrame]:
    """Preprocess a Wikidata dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_wikidata`.

    **Preprocessing actions:**

    1. set QIDs as :class:`pandas.core.indexes.base.Index` of the chunk
    2. drop columns with null values only
    3. *(training)* ensure one target ID per QID
    4. tokenize names, URLs, genres, when applicable
    5. *(shared with* :func:`preprocess_target` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param wikidata_reader: a dataset reader as returned by
      :func:`build_wikidata`
    :return: the generator yielding preprocessed
      :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing Wikidata %s set ...', goal)

    for i, chunk in enumerate(wikidata_reader, 1):
        # 1. QID as index
        chunk.set_index(keys.QID, inplace=True)
        log_dataframe_info(
            LOGGER, chunk, f"Built index from '{keys.QID}' column"
        )

        # 2. Drop columns with null values only
        _drop_null_columns(chunk)

        # 3. Training only: ensure 1 target ID per QID
        if goal == 'training':
            # This wipes out QIDs with > 1 positive sample,
            # but the impact can be neglected
            chunk[keys.TID] = chunk[keys.TID].map(
                lambda cell: cell[0] if isinstance(cell, list) else cell
            )

        # 4a. Tokenize names when applicable
        for column in constants.NAME_FIELDS:
            if chunk.get(column) is not None:
                chunk[f'{column}_tokens'] = chunk[column].apply(
                    _tokenize_values, args=(text_utils.tokenize,)
                )

        # 4b. Tokenize genres when applicable
        if chunk.get(keys.GENRES) is not None:
            chunk[keys.GENRES] = chunk[keys.GENRES].apply(
                _tokenize_values, args=(text_utils.tokenize,)
            )

        # 4c. Tokenize URLs
        chunk[keys.URL_TOKENS] = chunk[keys.URL].apply(
            _tokenize_values, args=(url_utils.tokenize,)
        )

        # 5. Shared preprocessing
        chunk = _shared_preprocessing(
            chunk,
            _will_handle_birth_dates(chunk),
            _will_handle_death_dates(chunk),
        )

        LOGGER.info('Chunk %d done', i)
        yield chunk
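
# Toy reproduction of the training-only step above: when a QID maps to
# several target IDs, only the first one is kept (an illustration, not
# part of the pipeline):
def _demo_ensure_one_tid():
    import pandas as pd  # local import keeps the sketch self-contained

    tids = pd.Series([['t1', 't2'], 't3'])
    flattened = tids.map(
        lambda cell: cell[0] if isinstance(cell, list) else cell
    )
    assert list(flattened) == ['t1', 't3']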