Example 1
from mlrun import MLClientCtx


def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1: A model parameter.
    :param p2: Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f'Run: {context.name} (uid={context.uid})')
    print(f'Params: p1={p1}, p2={p2}')
    context.logger.info('started training')

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result('accuracy', p1 * 2)
    context.log_result('loss', p1 * 3)

    # add a label/tag to this run
    context.set_label('category', 'tests')

    # log a simple artifact + label the artifact
    # If you want to upload a local file to the artifact repo add src_path=<local-path>
    context.log_artifact('model',
                         body=b'abc is 123',
                         local_path='model.txt',
                         labels={'framework': 'tfkeras'})
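
To try the handler without deploying anything, it can be invoked directly with a local context. A minimal sketch, assuming MLRun is installed; the expected result values follow from the formulas logged above:

from mlrun import get_or_create_ctx

# create (or fetch) a local run context and call the handler directly
context = get_or_create_ctx('training')
training(context, p1=5, p2=10)
print(context.results)  # expected: {'accuracy': 10, 'loss': 15}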
Example 2
import pandas as pd
from mlrun import MLClientCtx


def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1: A model parameter.
    :param p2: Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f"Run: {context.name} (uid={context.uid})")
    print(f"Params: p1={p1}, p2={p2}")
    context.logger.info("started training")

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result("accuracy", p1 * 2)
    context.log_result("loss", p1 * 3)

    # add a label/tag to this run
    context.set_label("category", "tests")

    # log a simple artifact + label the artifact
    # If you want to upload a local file to the artifact repo add src_path=<local-path>
    context.log_artifact("somefile",
                         body=b"abc is 123",
                         local_path="myfile.txt")

    # create a dataframe artifact
    df = pd.DataFrame([{"A": 10, "B": 100},
                       {"A": 11, "B": 110},
                       {"A": 12, "B": 120}])
    context.log_dataset("mydf", df=df)

    # Log an ML Model artifact, add metrics, params, and labels to it
    # and place it in a subdir ('models') under artifacts path
    context.log_model(
        "mymodel",
        body=b"abc is 123",
        model_file="model.txt",
        metrics={"accuracy": 0.85},
        parameters={"xx": "abc"},
        labels={"framework": "xgboost"},
        artifact_path=context.artifact_subpath("models"),
    )
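
The same handler can also be packaged and run as an MLRun job. The sketch below assumes this code lives in a file named training.py (a hypothetical name) and uses MLRun's code_to_function helper:

from mlrun import code_to_function

# build a function object from the file and run the handler locally
fn = code_to_function("trainer", filename="training.py",
                      kind="job", handler="training")
run = fn.run(params={"p1": 5, "p2": 10}, local=True)
print(run.outputs)  # scalar results plus URIs for 'somefile', 'mydf', 'mymodel'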

import json
import os
from pickle import dump
from typing import Union

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

from mlrun import DataItem, MLClientCtx

# helper utilities (get_model_configs, update_model_config, _create_class,
# plot_roc, plot_confusion_matrix) are assumed to be defined elsewhere in
# the project


def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    data_key: Union[DataItem, str],
    sample: int,
    label_column: str,
    model_key: str = "model",
    test_size: float = 0.05,
    train_val_split: float = 0.75,
    test_set_key: str = "test_set",
    rng: int = 1,
    models_dir: str = "models",
    plots_dir: str = "plots",
    score_method: str = "micro",
    class_params_updates: Union[DataItem, dict] = None,
    fit_params_updates: Union[DataItem, dict] = None,
) -> None:
    """train a classifier.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g, 'sklearn.neural_networks.MLPClassifier'
    :param data_key:          ("raw") name of raw data file
    :param sample:            Selects the first n rows, or select a sample
                              starting from the first. If negative <-1, select
                              a random sample
    :param label_column:      ground-truth (y) labels
    :param model_key:         ('model') name of model in artifact store,
                              points to a directory
    :param test_size:         (0.05) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param test_set_key:      store the test data set under this key in the
                              artifact store
    :param rng:               (1) sklearn rng seed
    :param models_dir:        models subfolder on artifact path
    :param plots_dir:         plot subfolder on artifact path
    :param score_method:      for multiclass classification
    :param class_updates:     update these scikit-learn classifier params,
                              input as a dict
    :param fit_updates:       update scikit-learn fit parameters, input as
                              a dict.
    """
    # extract file name from DataItem
    srcfilepath = str(data_key)

    # TODO: this should be part of the data's metadata, dealt with in another
    # step (get a data set, sample, etc.)
    # get all data or a sample
    if (sample == -1) or (sample >= 1):
        # get all rows, or a contiguous sample starting at the first row
        raw = pq.read_table(srcfilepath).to_pandas().dropna()
        labels = raw.pop(label_column)
        if sample >= 1:
            raw = raw.iloc[:sample, :]
            labels = labels.iloc[:sample]
    else:
        # grab a random sample
        raw = pq.read_table(srcfilepath).to_pandas().dropna().sample(-sample)
        labels = raw.pop(label_column)

    # TODO: this should be part of data's metadata dealt with in another step
    context.header = raw.columns.values

    # TODO: all of this should be part of a splitter component that does CV
    # too, dealt with in another step
    # make a one-hot encoded copy of the labels before the split
    # (binarize against the actual classes, not the feature count)
    yb = label_binarize(labels, classes=sorted(labels.unique()))
    # double split to generate 3 data sets: train, validation and test,
    # with xtest/ytest set aside; the one-hot labels ride along as extra
    # columns (axis=1) so they stay aligned through the shuffling
    x, xtest, y, ytest = train_test_split(np.concatenate([raw, yb], axis=1),
                                          labels,
                                          test_size=test_size,
                                          random_state=rng)
    xtrain, xvalid, ytrain, yvalid = train_test_split(
        x, y, train_size=train_val_split, random_state=rng)
    # split out the one-hot encoded labels appended as the trailing columns
    ytrainb = xtrain[:, -yb.shape[1]:].copy()
    xtrain = xtrain[:, :-yb.shape[1]].copy()
    yvalidb = xvalid[:, -yb.shape[1]:].copy()
    xvalid = xvalid[:, :-yb.shape[1]].copy()
    ytestb = xtest[:, -yb.shape[1]:].copy()
    xtest = xtest[:, :-yb.shape[1]].copy()
    # set aside the test set
    test_set = pd.concat(
        [
            pd.DataFrame(data=xtest, columns=context.header),
            pd.DataFrame(data=ytest.values, columns=[label_column]),
            pd.DataFrame(data=ytestb,
                         columns=[f"{label_column}_{i}"
                                  for i in range(ytestb.shape[1])]),
        ],
        axis=1,
    )
    # write the test set alongside the run's other artifacts
    base_path = context.artifact_path
    filepath = os.path.join(base_path, test_set_key + ".pqt")
    test_set.to_parquet(filepath, index=False)
    context.log_artifact(test_set_key, local_path=filepath)

    # load the model config
    model_config = get_model_configs(model_pkg_class)
    # get update params if any
    if isinstance(class_params_updates, DataItem):
        class_params_updates = json.loads(class_params_updates.get())
    if isinstance(fit_params_updates, DataItem):
        fit_params_updates = json.loads(fit_params_updates.get())
    class_params_updates = class_params_updates or {}
    fit_params_updates = fit_params_updates or {}
    # update the parameters and add the training data to the fit params
    fit_params_updates.update({'X': xtrain, 'y': ytrain})
    model_config = update_model_config(model_config, class_params_updates,
                                       fit_params_updates)

    # create the classifier class and fit it; the config layout (a 'CLASS'
    # section for constructor params and a 'FIT' section for fit params) is
    # an assumption based on the 'META' section used above
    ClassifierClass = _create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])

    # save the model under the models subfolder
    os.makedirs(os.path.join(base_path, models_dir), exist_ok=True)
    filepath = os.path.join(base_path, f"{models_dir}/{model_key}.pkl")
    with open(filepath, "wb") as f:
        dump(model, f)
    context.log_artifact(model_key,
                         local_path=os.path.join(base_path, models_dir))

    # compute validation metrics
    ypred = model.predict(xvalid)
    y_score = model.predict_proba(xvalid)

    average_precision = average_precision_score(yvalidb,
                                                y_score,
                                                average=score_method)

    context.log_result(f"accuracy", float(model.score(xvalid, yvalid)))
    context.log_result(f"rocauc", roc_auc_score(yvalidb, y_score))
    context.log_result(f"f1_score",
                       f1_score(yvalid, ypred, average=score_method))
    context.log_result(f"avg_precscore", average_precision)

    # validation plots
    plot_roc(context, yvalidb, y_score)
    plot_confusion_matrix(context, yvalid, ypred, key="confusion", fmt="png")
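
A hypothetical end-to-end invocation of train_model via MLRun; the file name and data path below are illustrative placeholders only:

from mlrun import code_to_function

# 'trainer.py' and the parquet path are hypothetical
fn = code_to_function("sklearn-trainer", filename="trainer.py",
                      kind="job", handler="train_model")
run = fn.run(
    params={
        "model_pkg_class": "sklearn.linear_model.LogisticRegression",
        "sample": -1,
        "label_column": "target",
    },
    inputs={"data_key": "/path/to/data.pqt"},
    local=True,
)
print(run.outputs)  # accuracy, rocauc, f1_score, avg_precscore, artifacts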