Example #1
    def _preprocess(
            self, df: "dask.DataFrame",
            inferencing: bool) -> Tuple["dask.DataFrame", "dask.DataFrame"]:
        df = df.loc[:, df.columns != "index"]
        # remove nulls and/or NaNs scalably with dask
        print(f"step1: drop nulls from rows")
        df = df.dropna(subset=["nullable_feature"])

        print(f"step2: creating new_col and updatingfeature_1")
        df["new_col"] = (df["feature_1"] - 2 * df["feature_2"] +
                         df["feature_3"]) / 3.
        df["feature_1"] = 2. * df["feature_1"] + 0.1
        # TODO: this doesn't work with more than 1 parquet file
        # df['mean_by_fruit'] = df.groupby('fruit')['feature_1'].transform('mean')

        print(f"step3: one-hot encoding fruit")
        df = df.astype({"fruit": "category"})
        df = df.categorize()
        df = df.persist()

        if inferencing:
            assert self.column_transformer is not None
            df_fruits = self.column_transformer.transform(df)
        else:
            assert self.column_transformer is None
            self.column_transformer = ColumnTransformer([
                ("one-hot", OneHotEncoder(sparse=False), ["fruit"])
            ])
            df_fruits = self.column_transformer.fit_transform(df)

        df_data = df.loc[:, (df.columns != "label") & (df.columns != "fruit")]
        df_data = dd.concat([df_data, df_fruits], axis=1)

        assert df_data.isnull().sum().sum().compute(
        ) == 0, "There are nulls or Nans in the data!"

        if inferencing:
            print(f"step4: standardrize inference dataset")
            assert self.scaler is not None
            df_data_inference = self.scaler.transform(df_data)
            return df_data_inference, None
        else:
            print(f"step4: standardrize train dataset")
            df_labels = df.loc[:, df.columns == "label"]
            df_data_train, df_data_test, df_label_train, df_label_test = train_test_split(
                df_data, df_labels)
            df_data_train = df_data_train.persist()
            assert self.scaler is None
            self.scaler = StandardScaler(
            )  # this just turns col values to z-scores
            df_data_train = self.scaler.fit_transform(df_data_train)
            df_data_test = self.scaler.transform(df_data_test)
            df_train = dd.concat([df_data_train, df_label_train], axis=1)
            df_test = dd.concat([df_data_test, df_label_test], axis=1)
            return df_train, df_test
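A minimal sketch of one way around the TODO above (groupby().transform('mean') failing across multiple parquet files): compute the per-group means, bring them into memory, and map them back onto the rows. This assumes the number of groups is small; the toy DataFrame and column names are illustrative, not the author's data.

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"fruit": ["apple", "apple", "pear", "pear"],
                    "feature_1": [1.0, 3.0, 2.0, 6.0]})
df = dd.from_pandas(pdf, npartitions=2)

# per-group means as a small in-memory pandas Series
means = df.groupby("fruit")["feature_1"].mean().compute()
# map the means back onto each row; meta silences dask's dtype-inference warning
df["mean_by_fruit"] = df["fruit"].map(means, meta=("fruit", "float64"))
print(df.compute())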
Example #2
def scale_numerics(train_paths, test_paths, out_path):
    train = dd.read_parquet(train_paths)
    test = dd.read_parquet(test_paths)

    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    if os.path.exists(out_path):
        shutil.rmtree(out_path)

    train.to_parquet(out_path + '/train')
    test.to_parquet(out_path + '/test')
Example #3
def train_scaler(dset, varname=None, row_dim='time', transform=True):
    """
    Train a Standard Scaler on a Dataset along the time dimension 
    and return the Standardised dataset (optionally)

    If dask-ml is present, then try and train a Dask ML StandardScaler, 
    otherwise reverts to scikit-learn 

    Parameters
    ----------

    Return 
    ------ 

    dset : the dataset with variable standardized 
            None if `transform` = False 

    scaler : the trained standard scaler

    Example
    ------- 

    dset, scaler = train_scaler(dset, varname = 't2m', transform=True)

    or 

    _, scaler = train_scaler(dset, varname='t2m, transform=False) 
    """

    try:
        from dask_ml.preprocessing import StandardScaler
    except ImportError:
        from sklearn.preprocessing import StandardScaler

    dset = dset[varname]
    space_dims = tuple(x for x in dset.dims if x != row_dim)
    dset_stack = dset.stack(z=space_dims)
    scaler = StandardScaler()
    if transform:
        data_std = scaler.fit_transform(dset_stack.data)
        dset_stack.data = data_std
        dset = dset_stack.unstack()
        return dset, scaler
    else:
        return None, scaler
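A hedged usage sketch for train_scaler above, on a tiny synthetic xarray Dataset; the variable name 't2m', the dimensions, and the random values are illustrative only.

import numpy as np
import xarray as xr

dset = xr.Dataset(
    {"t2m": (("time", "lat", "lon"), np.random.rand(10, 3, 4))},
    coords={"time": np.arange(10), "lat": np.arange(3), "lon": np.arange(4)},
)
std, scaler = train_scaler(dset, varname="t2m", transform=True)
print(std.dims, scaler.mean_.shape)  # ('time', 'lat', 'lon') and one mean per grid point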
Example #4
    def set(self, purpose):
        """Set a preprocessing method

        Parameters
        ----------
        purpose : str
            Supported purposes are: 'training', 'inference'.

        Returns
        -------
            Preprocessor object.
        """

        logger.info("")

        if self.preprocessing == "minmaxscaler" and purpose == "training":
            from dask_ml.preprocessing import MinMaxScaler

            if self.kwargs is None:
                self.kwargs = {"feature_range": (-1, 1)}
            self.preprocessor = MinMaxScaler(**self.kwargs)
            preprocessor_name = "MinMaxScaler"

        elif self.preprocessing == "standardscaler" and purpose == "training":
            from dask_ml.preprocessing import StandardScaler

            if self.kwargs is None:
                self.kwargs = {}
            self.preprocessor = StandardScaler(**self.kwargs)
            preprocessor_name = "StandardScaler"

        elif self.preprocessing == "normalizer" and purpose == "training":
            if self.kwargs is None:
                self.kwargs = {"norm": "l2"}
            from sklearn.preprocessing import Normalizer

            self.preprocessor = Normalizer(**self.kwargs)
            preprocessor_name = "Normalizer"

        elif self.preprocessing is not None and purpose == "inference":
            self.preprocessor = joblib.load(self.preprocessing)

        else:
            logger.warning("Preprocessor is not supported.")
            self.preprocessor = preprocessor_name = None

        if purpose == "training" and preprocessor_name is not None:
            logger.info("Data preprocessing")
            logger.info("------------------")
            logger.info("Preprocessor: {}.".format(preprocessor_name))
            logger.info("Options:")
            for k, v in self.kwargs.items():
                logger.info("    - {}: {}.".format(k, v))

            logger.info(" ")

        return self.preprocessor
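The 'inference' branch above reloads a previously fitted preprocessor from disk with joblib, so self.preprocessing is presumably the path of a scaler dumped after training. A minimal sketch of that save/load round trip; the file name and toy data are illustrative, not part of the original class.

import joblib
import numpy as np
import dask.array as da
from dask_ml.preprocessing import MinMaxScaler

X = da.from_array(np.array([[0.0, 10.0], [5.0, 20.0], [2.5, 15.0]]), chunks=2)
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X)   # what the 'training' branch builds
joblib.dump(scaler, "preprocessor.joblib")            # persist it after training
restored = joblib.load("preprocessor.joblib")         # what the 'inference' branch does
print(restored.transform(X).compute())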
Example #5
def standardize(dataset: DataSet,
                inplace: bool = False) -> (DataSet, Series, Series):
    scaler = StandardScaler(copy=not inplace)

    data_by_columns = np.array(
        [instance.data.values.flatten() for instance in dataset.instances])
    data_by_columns = data_by_columns.reshape(-1, dataset.num_of_columns)

    data_by_columns = scaler.fit_transform(data_by_columns)

    data_by_instances = data_by_columns.reshape(dataset.num_of_instances, -1)
    standardized_array = np.array([
        instance.reshape(-1, dataset.num_of_columns)
        for instance in data_by_instances
    ])

    return DataSet(standardized_array), Series(
        scaler.mean_,
        dataset.columns,
    ), Series(scaler.scale_, dataset.columns)
Example #6
def transformations():
    dask_data = fetch_data('adult')
    numerical_features = [
        'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'
    ]
    numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    categorical_features = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex'
    ]
    categorical_transformer = Pipeline(
        steps=[('cat', Categorizer(
            columns=categorical_features)), ('onehot', OneHotEncoder())])
    preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numerical_features
                       ), ('cat', categorical_transformer,
                           categorical_features)])
    processed_data = preprocessor.fit_transform(dask_data)
    processed_data['target'] = dask_data['target']

    return processed_data, preprocessor
Example #7
    


# In[17]:


# with ProgressBar():
#     X = dask_cudf.from_dask_dataframe(X)


# In[19]:

print ("scaling")

with ProgressBar():
    scaler = StandardScaler()
    scaler.fit(X)
    scaled_data = scaler.transform(X)


# In[20]:


X_train, X_test, y_train, y_test = train_test_split(scaled_data, y['p.ERK_c'], test_size=0.33, random_state=101)


# In[21]:


params = {
  'num_rounds':   100,
Example #8
    a = len(y[y[m].isna()])
    print(m, a)


# In[7]:


y = y.persist()


# In[8]:



print("Scaler")
scaler = StandardScaler()

scaler.fit(X)

X_scaled = scaler.transform(X)

# In[7]:

X_train, X_test, y_train, y_test = train_test_split(X_scaled , 
                                                        y["p.ERK"], 
                                                        test_size=0.33, 
                                                        random_state=101,shuffle=True)


# In[9]:
Example #9
Y = dataset['Y']
data = []
for pipeline in dataset.keys():
    if pipeline in config.pipelines:
        data.append(dataset[pipeline])
        if config.visualize:
            for j in range(5):
                i = random.randint(0, dataset[pipeline].shape[0] - 1)
                plt.title(pipeline + str(Y[i].compute()))
                plt.plot(dataset[pipeline][i, :].compute())
                plt.savefig('./figures/' + pipeline +
                            str(Y[i].compute()) + '.png')
X = functions.da.concatenate(data, axis=1)

######################################make NN ################################################################
scaler = StandardScaler(copy=True)
le = LabelEncoder()
enc = OneHotEncoder(sparse=False)
epochs = 20
functions.np.random.seed(0)
print(Y[0:10].compute())
encY = le.fit_transform(Y.ravel())
dummy_y = enc.fit_transform(encY.reshape(-1, 1))

print(dummy_y)
print('scaling X')
if config.scaleX:
    X = scaler.fit_transform(X)
print('Done')
inputdim = X.shape[1]
outputdim = dummy_y.shape[1]
Example #10
               y_slice].rechunk(chunks=(-1, 10 * coarsen, 10 * coarsen))
coarseIVs = IVs[:, ::coarsen, ::coarsen].reshape(
    (IVs.shape[0], -1)).T.persist()
IVs
# -

# Get metadata from netCDF file for plotting
EGY = xdata.Energy_set
multiplier = xdata.multiplier
plt.plot(EGY, multiplier)

# ## Principal Component Analysis
# To reduce the data to a reasonable number of dimensions, we use a pipeline of a StandardScaler and PCA:

pca = PCA(n_components=dimensions, whiten=True, random_state=4)
pipe = make_pipeline(StandardScaler(), pca)
pipe_names = '_'.join(pipe.named_steps.keys())
pipe.fit(
    coarseIVs)  # Fit the standard scaler and PCA vectors to the coarsened data

plt.figure(figsize=[3.5, 3.5])
scree = np.concatenate([[0],
                        pipe.named_steps['pca'].explained_variance_ratio_])
plt.scatter(np.arange(dimensions) + 1,
            scree[1:],
            label='relative',
            facecolors='none',
            edgecolors=colors,
            linewidth=2)
plt.scatter(np.arange(dimensions + 1),
            np.cumsum(scree),
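The fragment above (cut off mid-plot) builds its dimensionality-reduction pipeline from a StandardScaler followed by PCA. A self-contained sketch of that pattern, assuming dask-ml's StandardScaler with scikit-learn's make_pipeline and PCA, and random data standing in for coarseIVs:

import dask.array as da
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler

X = da.random.random((200, 16), chunks=(50, 16))  # stand-in for coarseIVs
pipe = make_pipeline(StandardScaler(),
                     PCA(n_components=4, whiten=True, random_state=4))
pipe.fit(X)  # note: scikit-learn's PCA pulls the scaled data into memory here
print(pipe.named_steps['pca'].explained_variance_ratio_)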
Example #11
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask
    
    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g., "sklearn.ensemble.RandomForestClassifier",
                                    or a json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train/validation set proportion out of the full dataset.
    :param sample:                  (1.0) Select a sample from the dataset (n rows / % of total); rows are randomized by default.
    :param models_dest:             (models) Models subfolder on the artifact path.
    :param test_set_key:            (test_set) MLRun db key of the held-out data in the artifact store.
    :param plots_dest:              (plots) Plots subfolder on the artifact path.
    :param dask_key:                (dask key) Key of the dataframe in the dask client "datasets" attribute.
    :param dask_persist:            (False) Whether the data should be persisted (through `client.persist`).
    :param scheduler_key:           (scheduler) Dask scheduler configuration; json is also logged as an artifact.
    :param file_ext:                (parquet) Format for the test_set_key held-out data.
    :param random_state:            (42) sklearn seed.
    """

    if scheduler_key:
        client = Client(scheduler_key)

    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute():
        raise Exception('NA values found')

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # collect the unique label values from the dask Series
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):

        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):

        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed,
                y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed,
                  y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:

                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
Example #12
    def __init__(self, train, test, scaler=StandardScaler(copy=False)):

        self.train = train
        self.test = test
        self.scaler = scaler
Example #13
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression

if __name__ == "__main__":
    client = Client()
    data = Path('./data')
    df = pd.read_csv(data / "01_heights_weights_genders.csv")
    y = 1 * (df.Gender == "Male").values
    X = df[['Height', 'Weight']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline = make_pipeline(StandardScaler(), LogisticRegression())
    grid = GridSearchCV(pipeline,
                        param_grid={'logisticregression__C': [.1, 1, 10, 100]},
                        cv=5)
    grid.fit(X_train, y_train)
    print("Score", grid.score(X_test, y_test))
Example #14
start = datetime.now()

# Split Train/Test
from dask_ml.model_selection import train_test_split

X = prepared_data.loc[:, prepared_data.columns != 'new_confirmed']

#### 7th change: for creating the Y labels, conversion from Series to Dask array
y = prepared_data['new_confirmed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Scale the values
from dask_ml.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

regr = MLPRegressor(max_iter=10,
                    hidden_layer_sizes=(100, 50, 25, 10, 5),
                    verbose=True)

#### 8th change: parallelism for training and prediction
with joblib.parallel_backend('dask'):
    regr.fit(X_train, y_train)

# Prediction and score
with joblib.parallel_backend('dask'):
    score = regr.score(X_test, y_test)
Example #15
	data = []
	for pipeline in config.pipelines:
		data.append(dataset[pipeline])
		if config.visualize:
			for j in range(5):
				i = random.randint(0, dataset[pipeline].shape[0] - 1)
				plt.title(pipeline + str(Y[i].compute()))
				plt.plot(dataset[pipeline][i, :].compute())
				plt.savefig('./figures/' + pipeline + str(Y[i].compute()) + '.png')
	X = functions.da.concatenate(data, axis=1)

######################################make NN ################################################################
if config.make_networkmodel:

	from dask_ml.preprocessing import StandardScaler
	scaler = StandardScaler()
	functions.np.random.seed(0)
	# encode class values as integers
	encoder = functions.LabelEncoder()
	encoder.fit(Y.ravel())
	encoded_Y = encoder.transform(Y.ravel())
	dummy_y = functions.utils.to_categorical(encoded_Y)
	print(dummy_y)
	inputdim=X.shape[1]
	print('scaling X')
	X = scaler.fit_transform(X, Y)
	print('Done')
	outputdim = dummy_y.shape[1]
	print(inputdim)
	print(outputdim)
	#output a configured model function with no inputs
def scale_dataset(df):
    # calling dask StandardScaler
    ss = StandardScaler()
    scaled_df = ss.fit_transform(df)
    return scaled_df
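A minimal usage sketch for scale_dataset above, with the import it relies on and a toy dask DataFrame; the column names are illustrative.

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import StandardScaler

ddf = dd.from_pandas(pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                                   "b": [10.0, 20.0, 30.0, 40.0]}), npartitions=2)
print(scale_dataset(ddf).compute())  # each column now has mean 0 and unit variance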