def apply(self, df: dd, scheduler: Scheduler = "processes") -> np.ndarray:
    """Run the labeling functions over a Dask DataFrame and collect labels.

    Each row of every partition is passed to ``apply_lfs_to_data_point``
    together with the LFs stored on this applier. The per-row results are
    computed with the requested scheduler, converted with
    ``rows_to_triplets`` and assembled by ``self._numpy_from_row_data``.

    Parameters
    ----------
    df
        Dask DataFrame containing data points to be labeled by LFs
    scheduler
        A Dask scheduling configuration: either a string option or
        a ``Client``. For more information, see
        https://docs.dask.org/en/stable/scheduling.html#

    Returns
    -------
    np.ndarray
        Matrix of labels emitted by LFs
    """
    # Bind the LFs once so the row-wise callback takes only the data point.
    label_row = partial(apply_lfs_to_data_point, lfs=self._lfs)

    def label_partition(partition):
        # axis=1 hands each row (not each column) to the labeling callback.
        return partition.apply(label_row, axis=1)

    computed_rows = df.map_partitions(label_partition).compute(
        scheduler=scheduler
    )
    return self._numpy_from_row_data(rows_to_triplets(computed_rows))
def gather_papers_data(metadata_dd: dd) -> dd:
    """Attach a ``body`` column to every partition of ``metadata_dd``.

    For each partition, the ``pdf_json_files`` column is handed to
    ``retrieve_paper_body_text_for_series`` and the result is stored in a
    new ``body`` column via ``assign``.

    Parameters
    ----------
    metadata_dd
        Partitioned dataframe exposing ``map_partitions`` and holding a
        ``pdf_json_files`` column.

    Returns
    -------
    dd
        The result of ``map_partitions``, with ``body`` added.
    """

    def attach_body(partition):
        body_text = retrieve_paper_body_text_for_series(
            partition.pdf_json_files
        )
        return partition.assign(body=body_text)

    return metadata_dd.map_partitions(attach_body)
def preprocess_dataset(ddf: dataframe) -> dataframe:
    """Preprocess the text columns of a dataframe, partition by partition.

    Steps, in order:
    - constant missing value replacement (``"_missing_"``)
    - lower case
    - strip accentuated characters (via ``unidecode``)
    - extract the year from the title, then simplify the title to avoid
      redundancy
    - stop words removal and stemming of the description (delegated to
      ``stem_description``, defined elsewhere)

    Parameters
    ----------
    ddf: dataframe
        the dataframe to be processed. It must expose ``map_partitions``
        (e.g. a Dask DataFrame) and contain the columns listed in
        ``text_cols`` below plus ``title`` and ``description``.

    Returns
    -------
    dataframe
        The input with the text columns normalised, ``title`` and
        ``description`` cleaned, and a new float ``year`` column.
    """
    text_cols = [
        'country', 'designation', 'province', 'region_1', 'region_2',
        'taster_name', 'variety', 'winery'
    ]

    def normalize(series):
        # Shared clean-up: fill missing values, lower-case, strip accents.
        return series.fillna("_missing_").str.lower().apply(unidecode)

    for col in text_cols:
        # ``col=col`` binds the current column name at definition time;
        # a plain closure would see only the last value of the loop variable.
        ddf = ddf.map_partitions(
            lambda d, col=col: d.assign(**{col: normalize(d[col])}))

    # Get year from the title. This must run before the title is cleaned
    # below, because that step removes digit runs.
    ddf = ddf.map_partitions(lambda d: d.assign(
        year=d['title'].str.extract(r'(\d{4,})', expand=False).astype(float)))

    # Remove year and geographical info from the title. They are already in
    # other columns. ``regex=True`` is explicit because pandas >= 2.0 treats
    # the pattern as a literal string by default, which would make these
    # replacements silently do nothing.
    ddf = ddf.map_partitions(lambda d: d.assign(
        title=normalize(d['title'])
        .str.replace(r'(\d+ )', '', regex=True)         # digits followed by a space
        .str.replace(r'\((.+)\)\s*$', '', regex=True)   # trailing "(...)" group
        .str.replace(r'\s{2,}', ' ', regex=True)        # collapse whitespace runs
        .fillna("_missing_")))

    ddf = ddf.map_partitions(lambda d: d.assign(
        description=d['description'].fillna("_missing_").str.lower()
        .apply(stem_description).fillna("_missing_")))

    return ddf