Exemple #1
0
    def apply(self, df: dd, scheduler: Scheduler = "processes") -> np.ndarray:
        """Run every labeling function over the rows of a Dask DataFrame.

        Parameters
        ----------
        df
            Dask DataFrame whose rows are the data points to label
        scheduler
            Dask scheduling configuration, forwarded unchanged to
            ``compute``: either a string option or a ``Client``. See
            https://docs.dask.org/en/stable/scheduling.html#

        Returns
        -------
        np.ndarray
            Matrix of labels emitted by LFs
        """
        # Bind this applier's LFs once so each row only needs the data point.
        label_row = partial(apply_lfs_to_data_point, lfs=self._lfs)

        def _label_partition(partition):
            # Row-wise application (axis=1) within a single partition.
            return partition.apply(label_row, axis=1)

        lazy_labels = df.map_partitions(_label_partition)
        # Nothing is evaluated until compute() runs under the chosen scheduler.
        computed_labels = lazy_labels.compute(scheduler=scheduler)
        return self._numpy_from_row_data(rows_to_triplets(computed_labels))
def gather_papers_data(metadata_dd: dd) -> dd:
    """Add a ``body`` column to every partition of ``metadata_dd``.

    The column is produced by calling ``retrieve_paper_body_text_for_series``
    on each partition's ``pdf_json_files`` column (presumably it loads the
    paper body text for each referenced file -- confirm against that helper).

    Parameters
    ----------
    metadata_dd
        Dask DataFrame that exposes a ``pdf_json_files`` column

    Returns
    -------
    dd
        The lazily evaluated result of ``map_partitions`` with ``body`` added
    """

    def _with_body(partition):
        # One helper call per partition, on the whole column at once.
        body_text = retrieve_paper_body_text_for_series(partition.pdf_json_files)
        return partition.assign(body=body_text)

    return metadata_dd.map_partitions(_with_body)
# Free-text columns that all receive the same normalization.
_TEXT_COLUMNS = (
    'country', 'designation', 'province', 'region_1', 'region_2',
    'taster_name', 'variety', 'winery',
)


def _normalize_text(series):
    """Fill missing values, lower-case, then pass each value to ``unidecode``."""
    return series.fillna("_missing_").str.lower().apply(unidecode)


def _preprocess_partition(part):
    """Apply every preprocessing step to a single partition."""
    part = part.assign(
        **{col: _normalize_text(part[col]) for col in _TEXT_COLUMNS})

    # The year must be read from the raw title, before digits are stripped
    # from it below. Missing titles yield NaN, hence the float dtype.
    year = part['title'].str.extract(r'(\d{4,})', expand=False).astype(float)

    # Remove year and geographical info from the title; they already live in
    # other columns. ``regex=True`` is explicit because pandas >= 2.0 treats
    # the pattern as a literal string by default, which would silently turn
    # these three replacements into no-ops.
    title = (
        _normalize_text(part['title'])
        .str.replace(r'(\d+ )', '', regex=True)         # digit runs + space
        .str.replace(r'\((.+)\)\s*$', '', regex=True)   # trailing "(...)"
        .str.replace(r'\s{2,}', ' ', regex=True)        # collapse whitespace
        .fillna("_missing_")
    )

    description = (
        part['description']
        .fillna("_missing_")
        .str.lower()
        .apply(stem_description)
        .fillna("_missing_")
    )

    # ``year`` is a new column and is appended last; ``title`` and
    # ``description`` are overwritten in place.
    return part.assign(year=year, title=title, description=description)


def preprocess_dataset(ddf: dataframe) -> dataframe:
    """Preprocess a dataframe, partition by partition:
        - constant missing value replacement (``"_missing_"``)
        - lower case
        - ``unidecode`` applied to the text columns and the title
        - extract year from title and simplify the title to avoid redundancy
        - ``stem_description`` applied to the description

    Parameters
    ----------
    ddf: dataframe
        the dataframe to be processed. It must support ``map_partitions``
        and contain the columns ``country``, ``designation``, ``province``,
        ``region_1``, ``region_2``, ``taster_name``, ``variety``, ``winery``,
        ``title`` and ``description``.

    Returns
    -------
    dataframe
        the processed dataframe, with an additional float ``year`` column.
    """
    return ddf.map_partitions(_preprocess_partition)