from mlrun.execution import MLClientCtx


def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1:      A model parameter.
    :param p2:      Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f'Run: {context.name} (uid={context.uid})')
    print(f'Params: p1={p1}, p2={p2}')
    context.logger.info('started training')

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result('accuracy', p1 * 2)
    context.log_result('loss', p1 * 3)

    # add a label/tag to this run
    context.set_label('category', 'tests')

    # log a simple artifact + label the artifact
    # (to upload a local file to the artifact repo, add src_path=<local-path>)
    context.log_artifact('model', body=b'abc is 123', local_path='model.txt',
                         labels={'framework': 'tfkeras'})
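
# A minimal usage sketch for the handler above. 'new_function' and
# 'run(handler=..., params=..., local=True)' are real MLRun calls, but the
# exact invocation style varies across MLRun versions, so treat this as an
# assumption-laden example rather than the canonical API; the parameter
# values are hypothetical.
import mlrun

fn = mlrun.new_function()                # an "empty" local function runtime
run = fn.run(handler=training,           # pass the handler directly
             params={'p1': 5, 'p2': 8},  # hypothetical parameter values
             local=True)                 # execute in-process
print(run.outputs)                       # e.g. {'accuracy': 10, 'loss': 15, ...}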
import pandas as pd
from mlrun.execution import MLClientCtx


def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1:      A model parameter.
    :param p2:      Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f"Run: {context.name} (uid={context.uid})")
    print(f"Params: p1={p1}, p2={p2}")
    context.logger.info("started training")

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result("accuracy", p1 * 2)
    context.log_result("loss", p1 * 3)

    # add a label/tag to this run
    context.set_label("category", "tests")

    # log a simple artifact + label the artifact
    # (to upload a local file to the artifact repo, add src_path=<local-path>)
    context.log_artifact("somefile", body=b"abc is 123", local_path="myfile.txt")

    # create a dataframe artifact
    df = pd.DataFrame([{"A": 10, "B": 100}, {"A": 11, "B": 110}, {"A": 12, "B": 120}])
    context.log_dataset("mydf", df=df)

    # log an ML model artifact, add metrics, params, and labels to it,
    # and place it in a subdir ('models') under the artifacts path
    context.log_model(
        "mymodel",
        body=b"abc is 123",
        model_file="model.txt",
        metrics={"accuracy": 0.85},
        parameters={"xx": "abc"},
        labels={"framework": "xgboost"},
        artifact_path=context.artifact_subpath("models"),
    )
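
# After a run of the extended handler above, results and artifacts can be read
# back from the run object. A sketch, assuming a 'run' object returned by
# fn.run(...) as in the previous example; 'outputs' and 'artifact' exist on
# MLRun run objects, though return details vary by version.
print(run.outputs)                       # scalar results plus artifact URIs
df = run.artifact("mydf").as_df()        # fetch the logged dataset back as a DataFrame
model_uri = run.outputs.get("mymodel")   # store:// URI of the logged model artifact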
import json
import os
from pickle import dump
from typing import Union

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx


def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    data_key: Union[DataItem, str],
    sample: int,
    label_column: str,
    model_key: str = "model",
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    test_set_key: str = "test_set",
    rng: int = 1,
    models_dir: str = "models",
    plots_dir: str = "plots",
    score_method: str = "micro",
    class_params_updates: Union[DataItem, dict] = {},
    fit_params_updates: Union[DataItem, dict] = {},
) -> None:
    """Train a classifier.

    :param context:              the function context
    :param model_pkg_class:      the model to train, e.g. 'sklearn.neural_network.MLPClassifier'
    :param data_key:             ("raw") name of the raw-data file
    :param sample:               selects the first n rows (a contiguous sample starting
                                 from the first row); if negative (<= -1), selects a
                                 random sample of that size
    :param label_column:         ground-truth (y) labels
    :param model_key:            ('model') name of the model in the artifact store,
                                 points to a directory
    :param test_size:            (0.05) test-set size
    :param train_val_split:      (0.75) once the test set has been removed, the
                                 training set gets this proportion
    :param test_set_key:         store the test data set under this key in the
                                 artifact store
    :param rng:                  (1) sklearn random-state seed
    :param models_dir:           models subfolder on the artifact path
    :param plots_dir:            plots subfolder on the artifact path
    :param score_method:         score averaging method for multiclass classification
    :param class_params_updates: update these scikit-learn classifier params,
                                 input as a dict
    :param fit_params_updates:   update these scikit-learn fit parameters,
                                 input as a dict
    """
    # extract the file name from the DataItem
    srcfilepath = str(data_key)

    # local base path for outputs (assumption: the run artifact path is a
    # writable local directory)
    base_path = context.artifact_path

    # TODO: this should be part of the data's metadata, dealt with in another
    # step: get a data set, sample, etc...
    # get all the data, or a sample
    if (sample == -1) or (sample >= 1):
        # get all rows, or a contiguous sample starting at the first row
        raw = pq.read_table(srcfilepath).to_pandas().dropna()
        labels = raw.pop(label_column)
        if sample >= 1:
            raw = raw.iloc[:sample, :]
            labels = labels.iloc[:sample]
    else:
        # grab a random sample
        raw = pq.read_table(srcfilepath).to_pandas().dropna().sample(sample * -1)
        labels = raw.pop(label_column)

    # TODO: this should be part of the data's metadata, dealt with in another step
    context.header = raw.columns.values

    # TODO: all of this should be part of a splitter component that does cv too,
    # dealt with in another step

    # make a one-hot-encoded copy of the labels before the split
    # (binarize over the actual label values, not the feature count)
    yb = label_binarize(labels, classes=sorted(labels.unique()))

    # double split to generate 3 data sets: train, validation, and test,
    # with xtest/ytest set aside; the one-hot labels ride along as extra
    # columns (axis=1) so they stay aligned through the shuffles
    x, xtest, y, ytest = train_test_split(
        np.concatenate([raw, yb], axis=1), labels,
        test_size=test_size, random_state=rng,
    )
    xtrain, xvalid, ytrain, yvalid = train_test_split(
        x, y, train_size=train_val_split, random_state=rng
    )

    # extract the one-hot-encoded labels
    ytrainb = xtrain[:, -yb.shape[1]:].copy()
    xtrain = xtrain[:, :-yb.shape[1]].copy()
    yvalidb = xvalid[:, -yb.shape[1]:].copy()
    xvalid = xvalid[:, :-yb.shape[1]].copy()
    ytestb = xtest[:, -yb.shape[1]:].copy()
    xtest = xtest[:, :-yb.shape[1]].copy()

    # set aside the test set (one column per binarized class; a single
    # label_column name cannot cover the multi-column one-hot block)
    test_set = pd.concat(
        [
            pd.DataFrame(data=xtest, columns=context.header),
            pd.DataFrame(data=ytest.values, columns=[label_column]),
            pd.DataFrame(data=ytestb,
                         columns=[f"{label_column}_{i}" for i in range(ytestb.shape[1])]),
        ],
        axis=1,
    )
    filepath = os.path.join(base_path, test_set_key + ".pqt")
    test_set.to_parquet(filepath, index=False)
    context.log_artifact(test_set_key, local_path=filepath)

    # load the model config (a project helper, assumed to be defined elsewhere)
    model_config = get_model_configs(model_pkg_class)

    # get the update params, if any
    if isinstance(class_params_updates, DataItem):
        class_params_updates = json.loads(class_params_updates.get())
    if isinstance(fit_params_updates, DataItem):
        fit_params_updates = json.loads(fit_params_updates.get())

    # update the parameters: add the training data to the fit params
    fit_params_updates.update({"X": xtrain, "y": ytrain})
    model_config = update_model_config(model_config, class_params_updates,
                                       fit_params_updates)

    # create the classifier class and fit it (assumption: the config dict keeps
    # the constructor and fit kwargs under 'CLASS' and 'FIT' keys)
    ClassifierClass = _create_class(model_config["META"]["class"])
    class_params = model_config["CLASS"]
    fit_params = model_config["FIT"]
    model = ClassifierClass(**class_params)
    model.fit(**fit_params)

    # save the model
    os.makedirs(os.path.join(base_path, models_dir), exist_ok=True)
    filepath = os.path.join(base_path, f"{models_dir}/{model_key}.pkl")
    with open(filepath, "wb") as f:
        dump(model, f)
    context.log_artifact(model_key, local_path=models_dir)

    # compute validation metrics
    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)
    average_precision = average_precision_score(yvalidb, y_score,
                                                average=score_method)
    context.log_result("accuracy", float(model.score(xvalid, yvalid)))
    context.log_result("rocauc", roc_auc_score(yvalidb, y_score))
    context.log_result("f1_score", f1_score(yvalid, ypred, average=score_method))
    context.log_result("avg_precscore", average_precision)

    # validation plots (project helpers, assumed to be defined elsewhere)
    plot_roc(context, yvalidb, y_score)
    plot_confusion_matrix(context, yvalid, ypred, key="confusion", fmt="png")
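
# train_model relies on project helpers defined elsewhere (get_model_configs,
# update_model_config, _create_class, plot_roc, plot_confusion_matrix). Below
# is a minimal sketch of two of them under the assumed contract: a config dict
# shaped like {"META": {"class": pkg_class}, "CLASS": {...}, "FIT": {...}}.
# That layout is inferred from how train_model consumes the config, not taken
# from the actual helper implementations.
from importlib import import_module


def _create_class(pkg_class: str):
    """Resolve a dotted path such as 'sklearn.neural_network.MLPClassifier'
    into the class object itself."""
    module_path, class_name = pkg_class.rsplit(".", 1)
    return getattr(import_module(module_path), class_name)


def update_model_config(config: dict, class_updates: dict, fit_updates: dict) -> dict:
    """Overlay user-supplied constructor ('CLASS') and fit ('FIT') kwargs
    onto the base config and return it."""
    config["CLASS"].update(class_updates or {})
    config["FIT"].update(fit_updates or {})
    return config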