def table_summary(context: MLClientCtx,
                  dask_client: Union[DataItem, str],
                  dask_key: str = 'my_dask_dataframe',
                  target_path: str = '',
                  name: str = 'table_summary.csv',
                  key: str = 'table_summary') -> None:
    """Summarize a table

    :param context:     the function context
    :param dask_client: path to the dask client scheduler json file, as
                        string or artifact
    :param dask_key:    key of dataframe in dask client 'datasets' attribute
    :param target_path: destination folder for table summary file
    :param name:        name of table summary file (with extension like .csv)
    :param key:         key of table summary in artifact store
    """
    print(context.__dict__)
    dask_client = Client(scheduler_file=str(dask_client))
    # look up the dataframe by the given key (the original passed the literal
    # string 'dask_key' here, which would never match a published dataset)
    df = dask_client.get_dataset(dask_key)
    print(df.head())
    dscr = df.describe()
    filepath = os.path.join(target_path, name)
    dd.to_csv(dscr, filepath, single_file=True, index=False)
    context.log_artifact(key, target_path=filepath)
def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1:      A model parameter.
    :param p2:      Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f'Run: {context.name} (uid={context.uid})')
    print(f'Params: p1={p1}, p2={p2}')
    context.logger.info('started training')

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result('accuracy', p1 * 2)
    context.log_result('loss', p1 * 3)

    # add a label/tag to this run
    context.set_label('category', 'tests')

    # log a simple artifact + label the artifact
    # if you want to upload a local file to the artifact repo add src_path=<local-path>
    context.log_artifact('model',
                         body=b'abc is 123',
                         local_path='model.txt',
                         labels={'framework': 'tfkeras'})
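# How a handler like the above is typically invoked -- a minimal sketch, not
# from the original file; the new_task/run_local helpers and the file name
# training.py are assumptions based on MLRun's API of this era:
from mlrun import new_task, run_local

task = new_task(name='demo-training', params={'p1': 5, 'p2': 10})
run = run_local(task, command='training.py', handler='training')
print(run.outputs)  # the logged results and artifacts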
def create_classification_data(context: MLClientCtx,
                               n_samples: int,
                               m_features: int,
                               k_classes: int,
                               header: Optional[List[str]],
                               label_column: Optional[str] = 'labels',
                               weight: float = 0.5,
                               random_state: int = 1,
                               filename: Optional[str] = None,
                               key: str = 'classifier-data',
                               file_ext: str = 'pqt',
                               sk_params={}):
    """Create a binary classification sample dataset and save.

    If no filename is given it will default to:
    'simdata-{n_samples}X{m_features}.parquet'.

    Additional scikit-learn parameters can be set using **sk_params, please see
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
    for more details.

    :param context:      function context
    :param n_samples:    number of rows/samples
    :param m_features:   number of cols/features
    :param k_classes:    number of classes
    :param header:       header for features array
    :param label_column: column name of ground-truth series
    :param weight:       fraction of sample negative value (ground-truth=0)
    :param random_state: rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param filename:     optional name for saving simulated data file
    :param key:          key of data in artifact store
    :param file_ext:     (pqt) extension for parquet file
    :param sk_params:    additional parameters for `sklearn.datasets.make_classification`

    outputs filename of created data (includes path) in the artifact store.
    """
    if not filename:
        name = f"simdata-{n_samples:0.0e}X{m_features}.{file_ext}".replace("+", "")
        filename = os.path.join(context.artifact_path, name)
    else:
        filename = os.path.join(context.artifact_path, filename)

    # make_classification expects the class weights as a list, not a scalar
    features, labels = make_classification(n_samples=n_samples,
                                           n_features=m_features,
                                           weights=[weight],
                                           n_classes=k_classes,
                                           random_state=random_state,
                                           **sk_params)

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not header:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = header
    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    pq.write_table(pa.Table.from_pandas(data), filename)
    # `name` is only set in the default branch, so log the basename instead
    context.log_artifact(key, local_path=os.path.basename(filename))
def get_toy_data(context: MLClientCtx, dataset: str, params: dict = {}) -> None:
    """Loads a scikit-learn toy dataset for classification or regression

    The following datasets are available ('name' : description):

        'boston'   : boston house-prices dataset (regression)
        'iris'     : iris dataset (classification)
        'diabetes' : diabetes dataset (regression)
        'digits'   : digits dataset (classification)
        'linnerud' : linnerud dataset (multivariate regression)
        'wine'     : wine dataset (classification)
        'cancer'   : breast cancer wisconsin dataset (classification)

    The scikit-learn functions return a data bunch including the following items:

        - data           the features matrix
        - target         the ground-truth labels
        - DESCR          a description of the dataset
        - feature_names  header for data

    The features (and their names) are stored with the target labels in a
    DataFrame.

    For further details see
    https://scikit-learn.org/stable/datasets/index.html#toy-datasets

    :param context: function execution context
    :param dataset: name of the dataset to load
    :param params:  params of the sklearn load_data method
    """
    filepath = os.path.join(context.artifact_path, dataset) + '.pqt'

    # check to see if we haven't already downloaded the file
    if not os.path.isfile(filepath):
        artifact_path = context.artifact_path

        # reach into module and import the appropriate load_xxx function
        pkg_module = 'sklearn.datasets'
        fname = f'load_{dataset}'

        pkg_module = __import__(pkg_module, fromlist=[fname])
        load_data_fn = getattr(pkg_module, fname)

        data = load_data_fn(**params)
        feature_names = data['feature_names']

        # save
        xy = np.concatenate([data['data'], data['target'].reshape(-1, 1)],
                            axis=1)
        feature_names.append('labels')
        df = pd.DataFrame(data=xy, columns=feature_names)
        df.to_parquet(filepath, engine='pyarrow', index=False)

    # either we just downloaded the file, or it exists, log it:
    context.log_artifact(dataset, local_path=os.path.basename(filepath))
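# The dynamic-import pattern used above, in isolation (a minimal sketch, not
# from the original file): __import__ with a fromlist returns the submodule
# itself, so getattr can fetch the load_xxx function by name.
pkg_module = __import__('sklearn.datasets', fromlist=['load_iris'])
load_data_fn = getattr(pkg_module, 'load_iris')
data = load_data_fn()
print(data['feature_names'])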
def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load parquet dataset into dask cluster

    If no cluster is found, start a new one and persist the data to it. It
    should not be necessary to create a new cluster when the function is run
    as a 'dask' job.

    :param context:      the function context
    :param parquet_url:  url of the parquet file or partitioned dataset as
                         either artifact DataItem, string, or path object
                         (see pandas read_csv)
    :param inc_cols:     include only these columns (very fast)
    :param index_cols:   list of index column names (can be a long-running
                         process)
    :param shards:       number of workers to launch
    :param threads_per:  number of threads per worker
    :param processes:    (False) use processes instead of threads for the
                         workers
    :param memory_limit: ('2GB') memory limit per worker
    :param persist:      (True) persist the dataframe on the cluster
    :param dask_key:     key under which the dataframe is published in the
                         dask client 'datasets' attribute
    :param target_path:  destination folder for the scheduler file
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    df = dd.read_parquet(parquet_url)

    if persist and context:
        df = dask_client.persist(df)
        # publish under the given key (the original passed dask_key=df, which
        # published the dataframe under the literal name 'dask_key')
        dask_client.publish_dataset(df, name=dask_key)
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

    print(df.head())
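# Round trip of the Dask published-dataset mechanism used above and consumed
# by table_summary -- a minimal sketch with illustrative names:
from dask.distributed import Client
import dask.dataframe as dd
import pandas as pd

client = Client()  # spins up a local cluster
ddf = dd.from_pandas(pd.DataFrame({'a': [1, 2, 3]}), npartitions=1)
client.publish_dataset(ddf, name='my_dask_dataframe')
# any client connected to the same scheduler can now fetch it by name
print(client.get_dataset('my_dask_dataframe').head())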
def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1:      A model parameter.
    :param p2:      Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f"Run: {context.name} (uid={context.uid})")
    print(f"Params: p1={p1}, p2={p2}")
    context.logger.info("started training")

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result("accuracy", p1 * 2)
    context.log_result("loss", p1 * 3)

    # add a label/tag to this run
    context.set_label("category", "tests")

    # log a simple artifact + label the artifact
    # if you want to upload a local file to the artifact repo add src_path=<local-path>
    context.log_artifact("somefile", body=b"abc is 123", local_path="myfile.txt")

    # create a dataframe artifact
    df = pd.DataFrame([{"A": 10, "B": 100}, {"A": 11, "B": 110},
                       {"A": 12, "B": 120}])
    context.log_dataset("mydf", df=df)

    # log an ML model artifact, add metrics, params, and labels to it
    # and place it in a subdir ('models') under artifacts path
    context.log_model(
        "mymodel",
        body=b"abc is 123",
        model_file="model.txt",
        metrics={"accuracy": 0.85},
        parameters={"xx": "abc"},
        labels={"framework": "xgboost"},
        artifact_path=context.artifact_subpath("models"),
    )
def validation(context: MLClientCtx, model: DataItem) -> None:
    """Model validation.

    Dummy validation function.

    :param context: The runtime context object.
    :param model:   The estimated model object.
    """
    # access input metadata, values, files, and secrets (passwords)
    print(f'Run: {context.name} (uid={context.uid})')
    print(f'file - {model.url}:\n{model.get()}\n')
    context.logger.info('started validation')
    context.log_artifact('validation',
                         body=b'<b> validated </b>',
                         format='html')
def load_dask(
    context: MLClientCtx,
    src_data: DataItem,
    dask_key: str = "dask_key",
    inc_cols: Optional[List[str]] = None,
    index_cols: Optional[List[str]] = None,
    dask_persist: bool = True,
    refresh_data: bool = True,
    scheduler_key: str = "scheduler",
) -> None:
    """Load dataset into an existing dask cluster

    dask jobs define the dask client parameters at the job level; this method
    will raise an error if no client is detected.

    :param context:       the function context
    :param src_data:      url of the data file or partitioned dataset as
                          either artifact DataItem, string, or path object
                          (similar to pandas read_csv)
    :param dask_key:      destination key of data on dask cluster and
                          artifact store
    :param inc_cols:      include only these columns (very fast)
    :param index_cols:    list of index column names (can be a long-running
                          process)
    :param dask_persist:  (True) should the data be persisted (through the
                          `client.persist` op)
    :param refresh_data:  (True) if the dask_key already exists in the dask
                          cluster and this is False, raise an Exception;
                          set to True to replace the existing cluster data
    :param scheduler_key: (scheduler) the dask scheduler configuration, json
                          also logged as an artifact
    """
    if hasattr(context, "dask_client"):
        dask_client = context.dask_client
    else:
        raise Exception("a dask client was not found in the execution context")

    df = src_data.as_df(df_module=dd)

    if dask_persist:
        df = dask_client.persist(df)

    # honor refresh_data as documented: replace an existing dataset only when
    # a refresh is requested, otherwise raise (the original unpublished
    # unconditionally and never consulted refresh_data)
    if dask_client.datasets and dask_key in dask_client.datasets:
        if refresh_data:
            dask_client.unpublish_dataset(dask_key)
        else:
            raise Exception(
                f"dataset {dask_key} already published on the cluster; "
                "set refresh_data=True to replace it")
    dask_client.publish_dataset(df, name=dask_key)

    if context:
        context.dask_client = dask_client

    # share the scheduler, whether data is persisted or not
    dask_client.write_scheduler_file(scheduler_key + ".json")

    # we don't use log_dataset here until it can take into account
    # dask origin and apply dask describe.
    context.log_artifact(scheduler_key, local_path=scheduler_key + ".json")
def learning_curves(context: MLClientCtx,
                    results: dict,
                    figsz: Tuple[int, int] = (10, 10),
                    plots_dest: str = "plots") -> None:
    """Plot xgb learning curves.

    This will also log the model's learning curves as an artifact.

    :param context:    the function context
    :param results:    dict of per-split metric histories, as returned by
                       xgboost's evals_result()
    :param figsz:      matplotlib figure size
    :param plots_dest: plot subfolder on artifact path
    """
    plt.clf()
    plt.figure(figsize=figsz)
    plt.plot(results["train"]["my_rmsle"], label="train-my-rmsle")
    plt.plot(results["valid"]["my_rmsle"], label="valid-my-rmsle")
    plt.title("learning curves")
    plt.legend()

    context.log_artifact(PlotArtifact("learning-curves", body=plt.gcf()),
                         local_path=f"{plots_dest}/learning-curves.html")
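# Shape of the `results` dict this function expects -- xgboost's
# evals_result() output for watchlist entries named "train"/"valid" with a
# custom metric called my_rmsle (metric values below are illustrative):
results = {
    "train": {"my_rmsle": [0.91, 0.78, 0.66, 0.59]},
    "valid": {"my_rmsle": [0.95, 0.84, 0.77, 0.74]},
}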
def pandas_profiling_report(
    context: MLClientCtx,
    data: DataItem,
) -> None:
    """Create a Pandas Profiling Report for a dataset.

    :param context: the function context
    :param data:    Dataset to create report for
    """
    df = data.as_df()

    profile = df.profile_report(title="Pandas Profiling Report")

    context.log_artifact(
        "Pandas Profiling Report",
        body=profile.to_html(),
        local_path="pandas_profiling_report.html",
    )
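# Equivalent standalone use of pandas-profiling -- a sketch; assumes the
# pandas_profiling package, whose import also registers the .profile_report
# dataframe accessor used above (data.csv is an illustrative path):
import pandas as pd
from pandas_profiling import ProfileReport

df = pd.read_csv("data.csv")
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file("pandas_profiling_report.html")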
def validation(context: MLClientCtx, model: DataItem) -> None:
    """Model validation.

    Dummy validation function.

    :param context: The runtime context object.
    :param model:   The estimated model object.
    """
    # access input metadata, values, files, and secrets (passwords)
    print(f"Run: {context.name} (uid={context.uid})")
    context.logger.info("started validation")

    # get the model file, class (metadata), and extra_data (dict of key: DataItem)
    model_file, model_obj, _ = get_model(model)

    # update model object elements and data
    update_model(model_obj, parameters={"one_more": 5})

    print(f"path to local copy of model file - {model_file}")
    print("parameters:", model_obj.parameters)
    print("metrics:", model_obj.metrics)
    context.log_artifact("validation",
                         body=b"<b> validated </b>",
                         format="html")
def plot_confusion_matrix(context: MLClientCtx,
                          labels,
                          predictions,
                          key: str = "confusion_matrix",
                          plots_dir: str = "plots",
                          colormap: str = "Blues",
                          fmt: str = "png",
                          sample_weight=None):
    """Create a confusion matrix.

    Plot and save a confusion matrix using test data from a pipeline step.

    See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

    TODO: fix label alignment
    TODO: consider using another packaged version
    TODO: refactor to take params dict for plot options

    :param context:       function context
    :param labels:        validation data ground-truth labels
    :param predictions:   validation data predictions
    :param key:           key of confusion matrix in artifact store
    :param plots_dir:     relative path of plots in artifact store
    :param colormap:      colourmap for confusion matrix
    :param fmt:           plot format
    :param sample_weight: sample weights
    """
    _gcf_clear(plt)

    # pass the given sample weights through (the original hardcoded None here)
    cm = metrics.confusion_matrix(labels, predictions,
                                  sample_weight=sample_weight)
    sns.heatmap(cm, annot=True, cmap=colormap, square=True)

    fig = plt.gcf()
    fname = f"{plots_dir}/{key}.{fmt}"
    fig.savefig(os.path.join(context.artifact_path, fname))
    context.log_artifact(PlotArtifact(key, body=fig), local_path=fname)
def open_archive(
    context: MLClientCtx,
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content",
    target_path: str = None,
):
    """Open a file/object archive into a target directory

    Currently supports zip and tar.gz

    :param context:     function execution context
    :param archive_url: url of archive file
    :param subdir:      path within artifact store where extracted files
                        are stored
    :param key:         key of archive contents in artifact store
    :param target_path: file system path to store extracted files
                        (use either this or subdir)
    """
    os.makedirs(target_path or subdir, exist_ok=True)

    archive_url = archive_url.local()
    if archive_url.endswith("gz"):
        with tarfile.open(archive_url, mode="r|gz") as ref:
            ref.extractall(target_path or subdir)
    elif archive_url.endswith("zip"):
        with zipfile.ZipFile(archive_url, "r") as ref:
            ref.extractall(target_path or subdir)
    else:
        raise ValueError(f"unsupported archive type in {archive_url}")

    if target_path:
        kwargs = {"target_path": target_path}
    else:
        kwargs = {"local_path": subdir}
    context.log_artifact(key, **kwargs)
def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    data_key: Union[DataItem, str],
    sample: int,
    label_column: str,
    model_key: str = "model",
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    test_set_key: str = "test_set",
    rng: int = 1,
    models_dir: str = "models",
    plots_dir: str = "plots",
    score_method: str = "micro",
    class_params_updates: Union[DataItem, dict] = {},
    fit_params_updates: Union[DataItem, dict] = {},
) -> None:
    """train a classifier.

    :param context:              the function context
    :param model_pkg_class:      the model to train, e.g,
                                 'sklearn.neural_network.MLPClassifier'
    :param data_key:             ("raw") name of raw data file
    :param sample:               selects the first n rows as a contiguous
                                 sample; if negative, selects a random sample
                                 of abs(sample) rows
    :param label_column:         ground-truth (y) labels
    :param model_key:            ('model') name of model in artifact store,
                                 points to a directory
    :param test_size:            (0.05) test set size
    :param train_val_split:      (0.75) once the test set has been removed,
                                 the training set gets this proportion
    :param test_set_key:         store the test data set under this key in
                                 the artifact store
    :param rng:                  (1) sklearn rng seed
    :param models_dir:           models subfolder on artifact path
    :param plots_dir:            plot subfolder on artifact path
    :param score_method:         for multiclass classification
    :param class_params_updates: update these scikit-learn classifier params,
                                 input as a dict
    :param fit_params_updates:   update scikit-learn fit parameters, input as
                                 a dict
    """
    # the original referenced an undefined `base_path`; anchor it on the
    # context's artifact path
    base_path = context.artifact_path

    # extract file name from DataItem
    srcfilepath = str(data_key)

    # TODO: this should be part of data's metadata, dealt with in another
    # step: get a data set, sample, etc...
    # get all data or a sample
    if (sample == -1) or (sample >= 1):
        # get all rows, or a contiguous sample starting at row 0
        raw = pq.read_table(srcfilepath).to_pandas().dropna()
        labels = raw.pop(label_column)
        if sample >= 1:
            raw = raw.iloc[:sample, :]
            labels = labels.iloc[:sample]
    else:
        # grab a random sample
        raw = pq.read_table(srcfilepath).to_pandas().dropna().sample(sample * -1)
        labels = raw.pop(label_column)

    # TODO: this should be part of data's metadata, dealt with in another step
    context.header = raw.columns.values

    # TODO: all of this should be part of a splitter component that does cv
    # too, dealt with in another step

    # make a hot encode copy of labels before the split (the original used
    # classes=list(range(raw.shape[1])), i.e. the feature count, not classes)
    yb = label_binarize(labels, classes=sorted(labels.unique()))

    # double split to generate 3 data sets: train, validation and test
    # with xtest, ytest set aside; the binarized labels ride along as extra
    # columns (axis=1, not axis=0 as the original had it)
    x, xtest, y, ytest = train_test_split(np.concatenate([raw, yb], axis=1),
                                          labels,
                                          test_size=test_size,
                                          random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(
        x, y, train_size=train_val_split, random_state=rng)

    # extract the hot_encoded labels
    ytrainb = xtrain[:, -yb.shape[1]:].copy()
    xtrain = xtrain[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    yvalidb = xvalid[:, -yb.shape[1]:].copy()
    xvalid = xvalid[:, :-yb.shape[1]].copy()
    # extract the hot_encoded labels
    ytestb = xtest[:, -yb.shape[1]:].copy()
    xtest = xtest[:, :-yb.shape[1]].copy()

    # set-aside test_set
    test_set = pd.concat(
        [
            pd.DataFrame(data=xtest, columns=context.header),
            pd.DataFrame(data=ytest, columns=[label_column]),
            pd.DataFrame(data=ytestb, columns=[label_column]),
        ],
        axis=1,
    )
    filepath = os.path.join(base_path, test_set_key + ".pqt")
    test_set.to_parquet(filepath, index=False)
    context.log_artifact(test_set_key, local_path=test_set_key + ".pqt")

    # load the model config
    model_config = get_model_configs(model_pkg_class)

    # get update params if any
    if isinstance(class_params_updates, DataItem):
        class_params_updates = json.loads(class_params_updates.get())
    if isinstance(fit_params_updates, DataItem):
        fit_params_updates = json.loads(fit_params_updates.get())

    # update the parameters; add data to fit params
    fit_params_updates.update({'X': xtrain, 'y': ytrain})
    model_config = update_model_config(model_config, class_params_updates,
                                       fit_params_updates)

    # create class and fit (the original referenced undefined `class_params`
    # and `fit_params`; use the updated model config instead)
    ClassifierClass = _create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])

    # save model
    os.makedirs(os.path.join(base_path, models_dir), exist_ok=True)
    filepath = os.path.join(base_path, f"{models_dir}/{model_key}.pkl")
    with open(filepath, "wb") as f:
        dump(model, f)
    context.log_artifact(model_key, local_path=models_dir)

    # compute validation metrics
    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)

    average_precision = average_precision_score(yvalidb,
                                                y_score,
                                                average=score_method)
    context.log_result("accuracy", float(model.score(xvalid, yvalid)))
    context.log_result("rocauc", roc_auc_score(yvalidb, y_score))
    context.log_result("f1_score",
                       f1_score(yvalid, ypred, average=score_method))
    context.log_result("avg_precscore", average_precision)

    # validation plots
    plot_roc(context, yvalidb, y_score)
    plot_confusion_matrix(context, yvalid, ypred, key="confusion", fmt="png")
def summarize(
    context: MLClientCtx,
    table: DataItem,
    label_column: str = None,
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots",
    update_dataset=False,
) -> None:
    """Summarize a table

    :param context:        the function context
    :param table:          MLRun input pointing to pandas dataframe
                           (csv/parquet file path)
    :param label_column:   ground-truth column label
    :param class_labels:   label for each class in tables and plots
    :param plot_hist:      (True) set this to False for large tables
    :param plots_dest:     destination folder of summary plots
                           (relative to artifact_path)
    :param update_dataset: when the table is a registered dataset, update the
                           charts in-place
    """
    df = table.as_df()
    header = df.columns.values
    extra_data = {}

    try:
        gcf_clear(plt)
        snsplt = sns.pairplot(df, hue=label_column)  # , diag_kws={"bw": 1.5})
        extra_data["histograms"] = context.log_artifact(
            PlotArtifact("histograms", body=plt.gcf()),
            local_path=f"{plots_dest}/hist.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.error(
            f"Failed to create pairplot histograms due to: {e}")

    try:
        gcf_clear(plt)
        plot_cols = 3
        plot_rows = int((len(header) - 1) / plot_cols) + 1
        fig, ax = plt.subplots(plot_rows, plot_cols, figsize=(15, 4))
        fig.tight_layout(pad=2.0)
        for i in range(plot_rows * plot_cols):
            if i < len(header):
                sns.violinplot(
                    x=df[header[i]],
                    ax=ax[int(i / plot_cols)][i % plot_cols],
                    orient="h",
                    width=0.7,
                    inner="quartile",
                )
            else:
                fig.delaxes(ax[int(i / plot_cols)][i % plot_cols])
        extra_data["violin"] = context.log_artifact(
            PlotArtifact("violin", body=plt.gcf(), title="Violin Plot"),
            local_path=f"{plots_dest}/violin.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.warn(
            f"Failed to create violin distribution plots due to: {e}")

    if label_column:
        labels = df.pop(label_column)
        imbtable = labels.value_counts(normalize=True).sort_index()
        try:
            gcf_clear(plt)
            balancebar = imbtable.plot(kind="bar",
                                       title="class imbalance - labels")
            balancebar.set_xlabel("class")
            balancebar.set_ylabel("proportion of total")
            extra_data["imbalance"] = context.log_artifact(
                PlotArtifact("imbalance", body=plt.gcf()),
                local_path=f"{plots_dest}/imbalance.html",
            )
        except Exception as e:
            context.logger.warn(
                f"Failed to create class imbalance plot due to: {e}")
        context.log_artifact(
            TableArtifact("imbalance-weights-vec",
                          df=pd.DataFrame({"weights": imbtable})),
            local_path=f"{plots_dest}/imbalance-weights-vec.csv",
            db_key=False,
        )

    tblcorr = df.corr()
    # np.bool was removed from numpy; the builtin bool is the correct dtype
    mask = np.zeros_like(tblcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    dfcorr = pd.DataFrame(data=tblcorr, columns=header, index=header)
    dfcorr = dfcorr[
        np.arange(dfcorr.shape[0])[:, None] > np.arange(dfcorr.shape[1])]
    context.log_artifact(
        TableArtifact("correlation-matrix", df=tblcorr, visible=True),
        local_path=f"{plots_dest}/correlation-matrix.csv",
        db_key=False,
    )

    try:
        gcf_clear(plt)
        ax = plt.axes()
        sns.heatmap(tblcorr, ax=ax, mask=mask, annot=False, cmap=plt.cm.Reds)
        ax.set_title("features correlation")
        extra_data["correlation"] = context.log_artifact(
            PlotArtifact("correlation", body=plt.gcf(),
                         title="Correlation Matrix"),
            local_path=f"{plots_dest}/corr.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.warn(
            f"Failed to create features correlation plot due to: {e}")

    gcf_clear(plt)

    if update_dataset and table.meta and table.meta.kind == "dataset":
        from mlrun.artifacts import update_dataset_meta
        update_dataset_meta(table.meta, extra_data=extra_data)
def describe(
    context: MLClientCtx,
    table: Union[DataItem, str],
    label_column: str,
    class_labels: List[str],
    key: str = "table-summary",
) -> None:
    """Summarize a table

    TODO: merge with dask version

    :param context:      the function context
    :param table:        pandas dataframe
    :param label_column: ground-truth column label
    :param class_labels: label for each class in tables and plots
    :param key:          key of table summary in artifact store
    """
    _gcf_clear(plt)

    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)
    os.makedirs(base_path + "/plots", exist_ok=True)

    print(f'TABLE {table}')
    table = pd.read_parquet(str(table))
    header = table.columns.values

    # describe table
    sumtbl = table.describe()
    sumtbl = sumtbl.append(len(table.index) - table.count(),
                           ignore_index=True)
    sumtbl.insert(
        0, "metric",
        ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "nans"])

    sumtbl.to_csv(os.path.join(base_path, key + ".csv"), index=False)
    context.log_artifact(key, local_path=key + ".csv")

    # plot class balance, record relative class weight
    _gcf_clear(plt)

    labels = table.pop(label_column)
    class_balance_model = ClassBalance(labels=class_labels)
    class_balance_model.fit(labels)

    scale_pos_weight = (class_balance_model.support_[0] /
                        class_balance_model.support_[1])
    # context.log_artifact("scale_pos_weight", f"{scale_pos_weight:0.2f}")
    context.log_artifact("scale_pos_weight", str(scale_pos_weight))

    class_balance_model.show(
        outpath=os.path.join(base_path, "plots/imbalance.png"))
    context.log_artifact(PlotArtifact("imbalance", body=plt.gcf()),
                         local_path="plots/imbalance.html")

    # plot feature correlation
    _gcf_clear(plt)
    tblcorr = table.corr()
    ax = plt.axes()
    sns.heatmap(tblcorr, ax=ax, annot=False, cmap=plt.cm.Reds)
    ax.set_title("features correlation")
    plt.savefig(os.path.join(base_path, "plots/corr.png"))
    context.log_artifact(PlotArtifact("correlation", body=plt.gcf()),
                         local_path="plots/corr.html")

    # plot histogram
    _gcf_clear(plt)
def data_clean(
    context: MLClientCtx,
    src: DataItem,
    file_ext: str = "csv",
    models_dest: str = "models/encoders",
    cleaned_key: str = "cleaned-data",
    encoded_key: str = "encoded-data",
):
    """process a raw churn data file

    Data has 3 states here: `raw`, `cleaned` and `encoded`

    * `raw` kept by default, the pipeline begins with a raw data artifact
    * `cleaned` kept for charts, presentations
    * `encoded` is input for a cross validation and training function

    steps (not necessarily in correct order, some parallel)

    * column name maps
    * deal with nans and other types of missings/junk
    * label encode binary and ordinal category columns
    * create category ranges from numerical columns

    And finally,

    * test

    Why don't we one-hot encode here? One-hot encoding isn't a necessary step
    for all algorithms. It can also generate a very large feature matrix that
    doesn't need to be serialized (even if sparse). So we leave
    one-hot encoding for the training step.

    What about scaling numerical columns? Same as why we don't one-hot encode
    here. Do we scale before the train-test split? IMHO, no. Scaling before
    splitting introduces a type of data leakage. In addition, many estimators
    are completely immune to the monotonic transformations implied by scaling,
    so why waste the cycles?

    TODO:
        * parallelize where possible
        * more abstraction (more parameters, chain sklearn transformers)
        * convert to marketplace function

    :param context:     the function execution context
    :param src:         an artifact or file path
    :param file_ext:    file type for artifacts
    :param models_dest: label encoders and other preprocessing steps
                        should be saved together with other pipeline models
    :param cleaned_key: key of cleaned data table in artifact store
    :param encoded_key: key of encoded data table in artifact store
    """
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels",
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # add drop columns to logs:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)

    # VALUE transformations

    # clean: truncate reply to "No"
    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))

    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact(
        "preproc-numcat_map.json",
        body=bytes(json.dumps(tenure_map).encode("utf-8")),
        local_path=tp,
    )

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)

    # label encoding - generate a model for each column, saved in a dict;
    # some of these columns may be hot encoded in the training step
    fix_cols = [
        "gender", "partner", "deps", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "PhoneService", "MultipleLines", "PaperlessBilling", "InternetService",
        "Contract", "PaymentMethod", "labels",
    ]
    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )
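# The per-column label-encoding trick above, in isolation -- a minimal
# sketch: defaultdict hands each column its own LabelEncoder, keyed by column
# name, so every fitted encoder survives for a later inverse_transform.
from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

d = defaultdict(LabelEncoder)
frame = pd.DataFrame({"gender": ["F", "M", "F"],
                      "labels": ["No", "Yes", "No"]})
encoded = frame.apply(lambda col: d[col.name].fit_transform(col.astype(str)))
print(encoded)
print(d["labels"].classes_)  # the persisted mapping for this column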
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """Train a sklearn classifier with Dask

    :param context:               Function context.
    :param dataset:               Raw data file.
    :param model_pkg_class:       Model to train, e.g,
                                  "sklearn.ensemble.RandomForestClassifier",
                                  or json model config.
    :param label_column:          (label) Ground-truth y labels.
    :param train_validation_size: (0.75) Train validation set proportion out
                                  of the full dataset.
    :param sample:                (1.0) Select sample from dataset
                                  (n-rows/% of total); randomizes rows by
                                  default.
    :param models_dest:           (models) Models subfolder on artifact path.
    :param test_set_key:          (test_set) Mlrun db key of held out data in
                                  artifact store.
    :param plots_dest:            (plots) Plot subfolder on artifact path.
    :param dask_key:              (dask key) Key of dataframe in dask client
                                  "datasets" attribute.
    :param dask_persist:          (False) Should the data be persisted
                                  (through the `client.persist`)
    :param scheduler_key:         (scheduler) Dask scheduler configuration,
                                  json also logged as an artifact.
    :param file_ext:              (parquet) format for test_set_key hold out
                                  data
    :param random_state:          (42) sklearn seed
    """
    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute():
        raise Exception('NA values found')

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())
    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        # fit the training data to the visualizer
        viz.fit(X_train_transformed, y_train)
        # evaluate the model on the test data
        viz.score(X_test_transformed, y_test.compute())

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro"),
            })
        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
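# The joblib/Dask integration used above, in isolation -- a minimal sketch;
# importing dask.distributed registers the "dask" joblib backend, and an
# active Client routes the fit's parallel work to the cluster workers:
import joblib
from dask.distributed import Client
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

client = Client()  # local cluster
X, y = make_classification(n_samples=1000, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
with joblib.parallel_backend("dask"):
    clf.fit(X, y)  # tree fitting is farmed out to the dask workers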
def permutation_importance(
    context: MLClientCtx,
    model: DataItem,
    dataset: DataItem,
    labels: str,
    figsz=(10, 5),
    plots_dest: str = "plots",
    fitype: str = "permute",
) -> pd.DataFrame:
    """calculate change in metric

    type 'permute' uses a pre-estimated model
    type 'dropcol' uses a re-estimated model

    :param context:    the function's execution context
    :param model:      a trained model
    :param dataset:    features and ground truths, regression targets
    :param labels:     name of the ground-truths column
    :param figsz:      matplotlib figure size
    :param plots_dest: path within artifact store
    :param fitype:     type of feature importance to calculate,
                       'permute' or 'dropcol'
    """
    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
    model = load(open(str(model_file), "rb"))

    X = dataset.as_df()
    y = X.pop(labels)
    header = X.columns

    metric = _oob_classifier_accuracy

    baseline = metric(model, X, y)

    imp = []
    for col in X.columns:
        # string comparison with `is` was a bug in the original; use ==
        if fitype == "permute":
            save = X[col].copy()
            X[col] = np.random.permutation(X[col])
            m = metric(model, X, y)
            X[col] = save
            imp.append(baseline - m)
        elif fitype == "dropcol":
            X_ = X.drop(col, axis=1)
            model_ = clone(model)
            # model_.random_state = random_state
            model_.fit(X_, y)
            o = model_.oob_score_
            imp.append(baseline - o)
        else:
            raise ValueError(
                "unknown fitype, only 'permute' or 'dropcol' permitted")

    zipped = zip(imp, header)
    feature_imp = pd.DataFrame(sorted(zipped),
                               columns=["importance", "feature"])
    feature_imp.sort_values(by="importance", ascending=False, inplace=True)

    plt.clf()
    plt.figure(figsize=figsz)
    sns.barplot(x="importance", y="feature", data=feature_imp)
    plt.title(f"feature importances-{fitype}")
    plt.tight_layout()

    context.log_artifact(
        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
        local_path=f"{plots_dest}/feature-permutations.html",
    )
    context.log_dataset(f"feature-importances-{fitype}-tbl",
                        df=feature_imp,
                        index=False)
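# For comparison, scikit-learn ships a packaged permutation importance
# (sklearn.inspection.permutation_importance) -- a minimal sketch with
# illustrative data, aliased to avoid shadowing the handler above:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance as sk_permutation_importance
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=1)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
result = sk_permutation_importance(rf, X_valid, y_valid, n_repeats=5,
                                   random_state=1)
print(result.importances_mean)  # mean metric drop per permuted feature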
def data_clean(context: MLClientCtx,
               src: DataItem,
               file_ext: str = "csv",
               models_dest: str = "models/encoders",
               cleaned_key: str = "cleaned-data",
               encoded_key: str = "encoded-data"):
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    old_cols = df.columns
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels",
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # add drop columns to logs:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)

    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    tenure = df.tenure.copy(deep=True)
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))

    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact("preproc-numcat_map.json",
                         body=bytes(json.dumps(tenure_map).encode("utf-8")),
                         local_path=tp)

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)

    fix_cols = [
        "gender", "partner", "deps", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "PhoneService", "MultipleLines", "PaperlessBilling", "InternetService",
        "Contract", "PaymentMethod", "labels",
    ]
    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model("model",
                      body=model_bin,
                      artifact_path=os.path.join(context.artifact_path,
                                                 models_dest),
                      model_file="model.pkl")