def _preprocess(
    self, df: "dask.DataFrame", inferencing: bool
) -> Tuple["dask.DataFrame", "dask.DataFrame"]:
    df = df.loc[:, df.columns != "index"]

    # remove nulls and/or NaNs scalably with dask
    print("step1: drop rows with nulls")
    df = df.dropna(subset=["nullable_feature"])

    print("step2: creating new_col and updating feature_1")
    df["new_col"] = (df["feature_1"] - 2 * df["feature_2"] + df["feature_3"]) / 3.
    df["feature_1"] = 2. * df["feature_1"] + 0.1

    # TODO: this doesn't work with more than 1 parquet file
    # df['mean_by_fruit'] = df.groupby('fruit')['feature_1'].transform('mean')

    print("step3: one-hot encoding fruit")
    df = df.astype({"fruit": "category"})
    df = df.categorize()
    df = df.persist()  # persist() returns a new collection; assign it back

    if inferencing:
        assert self.column_transformer is not None
        df_fruits = self.column_transformer.transform(df)
    else:
        assert self.column_transformer is None
        self.column_transformer = ColumnTransformer(
            [("one-hot", OneHotEncoder(sparse=False), ["fruit"])]
        )
        df_fruits = self.column_transformer.fit_transform(df)

    df_data = df.loc[:, (df.columns != "label") & (df.columns != "fruit")]
    df_data = dd.concat([df_data, df_fruits], axis=1)
    assert df_data.isnull().sum().sum().compute() == 0, \
        "There are nulls or NaNs in the data!"

    if inferencing:
        print("step4: standardize inference dataset")
        assert self.scaler is not None
        df_data_inference = self.scaler.transform(df_data)
        return df_data_inference, None
    else:
        print("step4: standardize train dataset")
        df_labels = df.loc[:, df.columns == "label"]
        df_data_train, df_data_test, df_label_train, df_label_test = \
            train_test_split(df_data, df_labels)
        df_data_train = df_data_train.persist()

        assert self.scaler is None
        self.scaler = StandardScaler()  # this just turns col values into z-scores
        df_data_train = self.scaler.fit_transform(df_data_train)
        df_data_test = self.scaler.transform(df_data_test)

        df_train = dd.concat([df_data_train, df_label_train], axis=1)
        df_test = dd.concat([df_data_test, df_label_test], axis=1)
        return df_train, df_test
def scale_numerics(train_paths, test_paths, out_path):
    train = dd.read_parquet(train_paths)
    test = dd.read_parquet(test_paths)

    # fit the scaler on the training set only, then apply it to both splits
    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    train.to_parquet(out_path + '/train')
    test.to_parquet(out_path + '/test')
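# Usage sketch for scale_numerics (assumption: the tiny parquet datasets written
# below stand in for real train/test data; the "tmp_*" paths are illustrative only).
import dask.dataframe as dd
import pandas as pd

train_df = dd.from_pandas(pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}), npartitions=2)
test_df = dd.from_pandas(pd.DataFrame({"a": [5.0, 6.0]}), npartitions=1)
train_df.to_parquet("tmp_train")
test_df.to_parquet("tmp_test")

# the scaler is fit on the training partitions only, then reused for the test split
scale_numerics("tmp_train", "tmp_test", "tmp_scaled")
print(dd.read_parquet("tmp_scaled/train").compute())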
def train_scaler(dset, varname=None, row_dim='time', transform=True):
    """
    Train a StandardScaler on a Dataset along the time dimension and
    (optionally) return the standardized dataset.

    If dask-ml is present, a dask-ml StandardScaler is trained; otherwise
    this falls back to the scikit-learn StandardScaler.

    Parameters
    ----------
    dset : xarray Dataset
        The dataset containing the variable to standardize.
    varname : str
        Name of the variable to standardize.
    row_dim : str
        Dimension treated as the sample (row) dimension, default 'time'.
    transform : bool
        If True, also return the standardized dataset.

    Returns
    -------
    dset : the dataset with the variable standardized,
        or None if `transform` is False
    scaler : the trained standard scaler

    Example
    -------
    dset, scaler = train_scaler(dset, varname='t2m', transform=True)

    or

    _, scaler = train_scaler(dset, varname='t2m', transform=False)
    """
    try:
        from dask_ml.preprocessing import StandardScaler
    except ImportError:
        from sklearn.preprocessing import StandardScaler

    dset = dset[varname]
    space_dims = tuple(x for x in dset.dims if x != row_dim)
    dset_stack = dset.stack(z=space_dims)

    scaler = StandardScaler()

    if transform:
        data_std = scaler.fit_transform(dset_stack.data)
        dset_stack.data = data_std
        dset = dset_stack.unstack()
        return dset, scaler
    else:
        return None, scaler
def set(self, purpose):
    """Set a preprocessing method

    Parameters
    ----------
    purpose : str
        Supported purposes are: 'training', 'inference'.

    Returns
    -------
    Preprocessor object.
    """
    logger.info("")
    if self.preprocessing == "minmaxscaler" and purpose == "training":
        from dask_ml.preprocessing import MinMaxScaler

        if self.kwargs is None:
            self.kwargs = {"feature_range": (-1, 1)}
        self.preprocessor = MinMaxScaler(**self.kwargs)
        preprocessor_name = "MinMaxScaler"
    elif self.preprocessing == "standardscaler" and purpose == "training":
        from dask_ml.preprocessing import StandardScaler

        if self.kwargs is None:
            self.kwargs = {}
        self.preprocessor = StandardScaler(**self.kwargs)
        preprocessor_name = "StandardScaler"
    elif self.preprocessing == "normalizer" and purpose == "training":
        from sklearn.preprocessing import Normalizer

        if self.kwargs is None:
            self.kwargs = {"norm": "l2"}
        self.preprocessor = Normalizer(**self.kwargs)
        preprocessor_name = "Normalizer"
    elif self.preprocessing is not None and purpose == "inference":
        self.preprocessor = joblib.load(self.preprocessing)
        preprocessor_name = None
    else:
        logger.warning("Preprocessor is not supported.")
        self.preprocessor = preprocessor_name = None

    if purpose == "training" and preprocessor_name is not None:
        logger.info("Data preprocessing")
        logger.info("------------------")
        logger.info("Preprocessor: {}.".format(preprocessor_name))
        logger.info("Options:")
        for k, v in self.kwargs.items():
            logger.info("  - {}: {}.".format(k, v))
        logger.info(" ")

    return self.preprocessor
def standardize(dataset: DataSet,
                inplace: bool = False) -> (DataSet, Series, Series):
    scaler = StandardScaler(copy=not inplace)

    # flatten each instance, stack them row-wise, and scale column by column
    data_by_columns = np.array(
        [instance.data.values.flatten() for instance in dataset.instances])
    data_by_columns = data_by_columns.reshape(-1, dataset.num_of_columns)
    data_by_columns = scaler.fit_transform(data_by_columns)

    # reshape back to one row per instance
    data_by_instances = data_by_columns.reshape(dataset.num_of_instances, -1)
    standardized_array = np.array([
        instance.reshape(-1, dataset.num_of_columns)
        for instance in data_by_instances
    ])

    return DataSet(standardized_array), Series(
        scaler.mean_,
        dataset.columns,
    ), Series(scaler.scale_, dataset.columns)
def transformations():
    dask_data = fetch_data('adult')

    numerical_features = [
        'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'
    ]
    numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    categorical_features = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex'
    ]
    categorical_transformer = Pipeline(
        steps=[('cat', Categorizer(columns=categorical_features)),
               ('onehot', OneHotEncoder())])

    preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numerical_features),
                      ('cat', categorical_transformer, categorical_features)])

    processed_data = preprocessor.fit_transform(dask_data)
    processed_data['target'] = dask_data['target']
    return processed_data, preprocessor
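# Usage sketch: fit once, then reuse the fitted preprocessor on a fresh batch
# with the same schema (assumes `fetch_data` returns a dask DataFrame with the
# columns listed above, as the function body implies).
processed_data, preprocessor = transformations()
print(processed_data.head())

new_batch = fetch_data('adult')                     # illustrative fresh batch, same schema
new_processed = preprocessor.transform(new_batch)   # reuses the fitted transformers, no re-fit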
# In[17]:

# with ProgressBar():
#     X = dask_cudf.from_dask_dataframe(X)

# In[19]:

print("scaling")
with ProgressBar():
    scaler = StandardScaler()
    scaler.fit(X)
    scaled_data = scaler.transform(X)

# In[20]:

X_train, X_test, y_train, y_test = train_test_split(
    scaled_data, y['p.ERK_c'], test_size=0.33, random_state=101)

# In[21]:

params = {
    'num_rounds': 100,
a = len(y[y[m].isna() == True])
print(m, a)

# In[7]:

y = y.persist()

# In[8]:

print("Scaler")
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# In[7]:

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y["p.ERK"], test_size=0.33, random_state=101, shuffle=True)

# In[9]:
Y = dataset['Y']

data = []
for pipeline in dataset.keys():
    if pipeline in config.pipelines:
        data.append(dataset[pipeline])
        if config.visualize == True:
            for j in range(5):
                i = random.randint(0, dataset[pipeline].shape[0])
                plt.title(pipeline + str(Y[i].compute()))
                plt.plot(dataset[pipeline][i, :].compute())
                plt.savefig('./figures/' + pipeline + str(Y[i].compute()) + '.png')

X = functions.da.concatenate(data, axis=1)

###################################### make NN ################################################################

scaler = StandardScaler(copy=True)
le = LabelEncoder()
enc = OneHotEncoder(sparse=False)
epochs = 20
functions.np.random.seed(0)

print(Y[0:10].compute())
encY = le.fit_transform(Y.ravel())
dummy_y = enc.fit_transform(encY.reshape(-1, 1))
print(dummy_y)

print('scaling X')
if config.scaleX == True:
    X = scaler.fit_transform(X)
print('Done')

inputdim = X.shape[1]
outputdim = dummy_y.shape[1]
             y_slice].rechunk(chunks=(-1, 10 * coarsen, 10 * coarsen))
coarseIVs = IVs[:, ::coarsen, ::coarsen].reshape((IVs.shape[0], -1)).T.persist()
IVs
# -

# Get metadata from netCDF file for plotting
EGY = xdata.Energy_set
multiplier = xdata.multiplier
plt.plot(EGY, multiplier)

# ## Principal Component Analysis
# To reduce the data to a reasonable number of dimensions, we use a pipeline
# of a StandardScaler and PCA:

pca = PCA(n_components=dimensions, whiten=True, random_state=4)
pipe = make_pipeline(StandardScaler(), pca)
pipe_names = '_'.join(pipe.named_steps.keys())
pipe.fit(coarseIVs)  # Fit the standard scaler and PCA vectors to the coarsened data

plt.figure(figsize=[3.5, 3.5])
scree = np.concatenate([[0], pipe.named_steps['pca'].explained_variance_ratio_])
plt.scatter(np.arange(dimensions) + 1,
            scree[1:],
            label='relative',
            facecolors='none',
            edgecolors=colors,
            linewidth=2)
plt.scatter(np.arange(dimensions + 1),
            np.cumsum(scree),
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask

    :param context:               Function context.
    :param dataset:               Raw data file.
    :param model_pkg_class:       Model to train, e.g., "sklearn.ensemble.RandomForestClassifier",
                                  or json model config.
    :param label_column:          (label) Ground-truth y labels.
    :param train_validation_size: (0.75) Train validation set proportion out of the full dataset.
    :param sample:                (1.0) Select sample from dataset (n-rows/% of total); randomize rows by default.
    :param models_dest:           (models) Models subfolder on artifact path.
    :param test_set_key:          (test_set) Mlrun db key of held-out data in artifact store.
    :param plots_dest:            (plots) Plot subfolder on artifact path.
    :param dask_key:              (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:          (False) Should the data be persisted (through `client.persist`).
    :param scheduler_key:         (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:              (parquet) Format for the test_set_key held-out data.
    :param random_state:          (42) sklearn seed.
    """
    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)
    if df.isna().any().any().compute():
        raise Exception('NA values found')
    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])
    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class, context.parameters.items())
    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })
        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    context.log_results({
                        score_name + "-" + score_class:
                            viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot
    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(test_set_key,
                        df=pd.DataFrame(df_to_save, columns=df_header),  # improve log dataset ability
                        format=file_ext,
                        index=False,
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
def __init__(self, train, test, scaler=StandardScaler(copy=False)):
    # note: the default StandardScaler instance is created once at definition
    # time and is therefore shared by all objects built without an explicit scaler
    self.train = train
    self.test = test
    self.scaler = scaler
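# A minimal sketch of how a wrapper like this might be used. The class name
# `ScaledSplit` and the `fit_scale` helper are hypothetical, not from the
# original source; only the constructor mirrors the snippet above.
import dask.dataframe as dd
import pandas as pd
from dask_ml.preprocessing import StandardScaler


class ScaledSplit:
    def __init__(self, train, test, scaler=StandardScaler(copy=False)):
        self.train = train
        self.test = test
        self.scaler = scaler

    def fit_scale(self):
        # fit on the training split only, then apply the same statistics to test
        train_scaled = self.scaler.fit_transform(self.train)
        test_scaled = self.scaler.transform(self.test)
        return train_scaled, test_scaled


train = dd.from_pandas(pd.DataFrame({"x": [1.0, 2.0, 3.0]}), npartitions=1)
test = dd.from_pandas(pd.DataFrame({"x": [4.0, 5.0]}), npartitions=1)
train_scaled, test_scaled = ScaledSplit(train, test).fit_scale()
print(train_scaled.compute())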
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression

if __name__ == "__main__":
    client = Client()

    data = Path('./data')
    df = pd.read_csv(data / "01_heights_weights_genders.csv")
    y = 1 * (df.Gender == "Male").values
    X = df[['Height', 'Weight']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    pipeline = make_pipeline(StandardScaler(), LogisticRegression())

    grid = GridSearchCV(pipeline,
                        param_grid={'logisticregression__C': [.1, 1, 10, 100]},
                        cv=5)
    grid.fit(X_train, y_train)
    print("Score", grid.score(X_test, y_test))
start = datetime.now()

# Train/test split
from dask_ml.model_selection import train_test_split

X = prepared_data.loc[:, prepared_data.columns != 'new_confirmed']
#### 7th change: for building the Y labels, convert the Series to a Dask array
y = prepared_data['new_confirmed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Scale the values
from dask_ml.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

regr = MLPRegressor(max_iter=10,
                    hidden_layer_sizes=(100, 50, 25, 10, 5),
                    verbose=True)

#### 8th change: parallelism for training and prediction
with joblib.parallel_backend('dask'):
    regr.fit(X_train, y_train)

# Prediction and score
with joblib.parallel_backend('dask'):
    score = regr.score(X_test, y_test)
data = []
for pipeline in config.pipelines:
    data.append(dataset[pipeline])
    if config.visualize == True:
        for j in range(5):
            i = random.randint(0, dataset[pipeline].shape[0])
            plt.title(pipeline + str(Y[i].compute()))
            plt.plot(dataset[pipeline][i, :].compute())
            plt.savefig('./figures/' + pipeline + str(Y[i].compute()) + '.png')

X = functions.da.concatenate(data, axis=1)

###################################### make NN ################################################################

if config.make_networkmodel == True:
    from dask_ml.preprocessing import StandardScaler

    scaler = StandardScaler()
    functions.np.random.seed(0)

    # encode class values as integers
    encoder = functions.LabelEncoder()
    encoder.fit(Y.ravel())
    encoded_Y = encoder.transform(Y.ravel())
    dummy_y = functions.utils.to_categorical(encoded_Y)
    print(dummy_y)

    inputdim = X.shape[1]

    print('scaling X')
    X = scaler.fit_transform(X, Y)
    print('Done')

    outputdim = dummy_y.shape[1]
    print(inputdim)
    print(outputdim)

    # output a configured model function with no inputs
def scale_dataset(df):
    # calling the dask-ml StandardScaler on the whole frame
    ss = StandardScaler()
    scaled_df = ss.fit_transform(df)
    return scaled_df
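# Usage sketch: scale_dataset expects an all-numeric dask DataFrame; the toy
# frame below is only for illustration.
import dask.dataframe as dd
import pandas as pd

toy = dd.from_pandas(
    pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [10.0, 20.0, 30.0, 40.0]}),
    npartitions=2)
scaled = scale_dataset(toy)

# each column now has (approximately) zero mean and unit variance
print(scaled.mean().compute())
print(scaled.std().compute())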