Example 1
# imports assumed by this snippet (not shown in the original)
import pandas as pd
from mlrun.execution import MLClientCtx

def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1: A model parameter.
    :param p2: Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f"Run: {context.name} (uid={context.uid})")
    print(f"Params: p1={p1}, p2={p2}")
    context.logger.info("started training")

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result("accuracy", p1 * 2)
    context.log_result("loss", p1 * 3)

    # add a label/tag to this run
    context.set_label("category", "tests")

    # log a simple artifact and label it
    # to upload a local file to the artifact repo, add src_path=<local-path>
    context.log_artifact("somefile",
                         body=b"abc is 123",
                         local_path="myfile.txt")

    # create a dataframe artifact
    df = pd.DataFrame([{
        "A": 10,
        "B": 100
    }, {
        "A": 11,
        "B": 110
    }, {
        "A": 12,
        "B": 120
    }])
    context.log_dataset("mydf", df=df)

    # Log an ML Model artifact, add metrics, params, and labels to it
    # and place it in a subdir ('models') under artifacts path
    context.log_model(
        "mymodel",
        body=b"abc is 123",
        model_file="model.txt",
        metrics={"accuracy": 0.85},
        parameters={"xx": "abc"},
        labels={"framework": "xgboost"},
        artifact_path=context.artifact_subpath("models"),
    )
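
A minimal sketch of running this handler with mlrun, assuming the snippet is saved as training.py and a recent mlrun version; the function name "trainer", the image, and the parameter values are illustrative, not taken from the original:

import mlrun

# wrap the file as an mlrun job and execute the handler locally
fn = mlrun.code_to_function("trainer", filename="training.py",
                            kind="job", image="mlrun/mlrun")
run = fn.run(handler="training", params={"p1": 5, "p2": 10}, local=True)
print(run.outputs)  # logged results plus store URIs for the logged artifacts
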
Example 2
# assumed imports for this snippet; get_sample, get_splits, squared_log,
# rmsle, and learning_curves are project-local helpers not shown here
from typing import Callable, List, Tuple, Union
import pandas as pd
from cloudpickle import dumps  # assumption: cloudpickle-style serializer
from xgboost import DMatrix, train
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

def fit(context: MLClientCtx,
        dataset: DataItem,
        num_boost_round: int = 10,
        evals: List[Tuple[DMatrix, str]] = [],
        obj: Union[Callable, str] = "",
        feval: Union[Callable, str] = None,
        maximize: bool = False,
        early_stopping_rounds: int = None,
        evals_result: dict = {},
        verbose_eval: bool = True,
        xgb_model: DataItem = None,
        callbacks: List[Callable] = [],
        label_column: str = "labels",
        encode_cols: dict = {},
        sample: int = -1,
        test_size: float = 0.25,
        valid_size: float = 0.75,
        random_state: int = 1994,
        models_dest: str = "models",
        plots_dest: str = "plots",
        file_ext: str = "csv",
        test_set_key: str = "test-set",
        gpus: bool = False) -> None:
    """low level xgboost train api

    for the xgboost `train` params see:
    https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.train

    Note:  the first parameter of xgboost's `train` method is a dict of parameters
           supplied to the booster (engine).  To modify one of those, simply add a
           task parameter with the prefix "XGB_" (task parameters are set on the
           mlrun NewTask you supply when running).  For example, to set the
           'tree_method' parameter to 'approx', add {"XGB_tree_method": "approx"}
           to the task params.

    :param context:           the function context
    :param dataset:           the full data set, train, valid and test will be extracted and
                              each converted to a DMatrix for input to xgboost's `train`
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns that are
                              to be one-hot encoded.
    :param sample:            selects the first n rows; if negative, a random
                              sample of size |n| is taken
    :param test_size:         (0.25) test set size
    :param valid_size:        (0.75) once the test set has been removed, the
                              training set gets this proportion
    :param random_state:      (1994) sklearn rng seed
    :param models_dest:       destination subfolder for model artifacts
    :param plots_dest:        destination subfolder for plot artifacts
    :param file_ext:          format for the test_set_key held-out data
    :param test_set_key:      (test-set) key of the held-out data in the artifact store
    :param gpus:              (False) run on gpus
    """
    raw, labels, header = get_sample(dataset, sample, label_column)

    # one-hot encode
    if encode_cols:
        raw = pd.get_dummies(raw,
                             columns=list(encode_cols.keys()),
                             prefix=list(encode_cols.values()),
                             drop_first=True)

    # split the sample into train, validation, and test sets:
    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = \
        get_splits(raw, labels, 3, test_size, valid_size, random_state)

    # save the test data as a regular dataframe, as it may be used by other processes
    context.log_dataset(test_set_key,
                        df=pd.concat([xtest, ytest], axis=1),
                        format=file_ext,
                        index=False)

    # convert to xgboost DMatrix (todo - dask, gpu)
    dtrain = DMatrix(xtrain, label=ytrain)
    dvalid = DMatrix(xvalid, label=yvalid)

    boost_params = {
        "tree_method": "gpu_hist" if gpus else "hist",
        "seed": random_state,
        "disable_default_eval_metric": 1,
        "objective": "reg:squaredlogerror",
        "eval_metric": "rmsle"
    }

    # let the user override booster params via task parameters prefixed with "XGB_"
    for k, v in context.parameters.items():
        if k.startswith('XGB_'):
            boost_params[k[4:]] = v

    # collect learning curves / training history
    results = dict()

    booster = train(
        boost_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        evals_result=results,
        obj=squared_log,
        feval=rmsle,
        maximize=maximize,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
        # xgb_model=xgb_model,
        # callbacks: List[Callable] = []
    )

    context.log_model("model",
                      body=dumps(booster),
                      model_file="model.pkl",
                      artifact_path='/User/artifacts/tttt')

    learning_curves(context, results)
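
As the docstring note above explains, booster (engine) parameters ride along as task parameters carrying an "XGB_" prefix, which fit() strips before merging them into boost_params. A hedged usage sketch with illustrative values:

import mlrun

task = mlrun.new_task(name="xgb-train", params={
    "num_boost_round": 40,
    "XGB_tree_method": "approx",  # becomes boost_params["tree_method"]
    "XGB_max_depth": 6,           # becomes boost_params["max_depth"]
})
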
Example 3
# assumed imports for this snippet; get_sample, get_splits, gen_sklearn_model,
# create_class, and eval_model_v2 are project-local helpers not shown here
import pandas as pd
from cloudpickle import dumps  # assumption: cloudpickle-style serializer
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    encode_cols: dict = {},
    sample: int = -1,
    test_size: float = 0.30,
    train_val_split: float = 0.75,
    test_set_key: str = "test_set",
    model_evaluator=None,
    models_dest: str = "",
    plots_dest: str = "plots",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    random_state: int = 1,
) -> None:
    """train a classifier
    
    An optional cutom model evaluator can be supplied that should have the signature:
    `my_custom_evaluator(context, xvalid, yvalid, model)` and return a dictionary of 
    scalar "results", a "plots" keys with a list of PlotArtifacts, and 
    and "tables" key containing a returned list of TableArtifacts.
    
    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g., "sklearn.neural_network.MLPClassifier",
                              or a json model config
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns that are
                              to be one-hot encoded.
    :param sample:            selects the first n rows; if negative, a random
                              sample of size |n| is taken
    :param test_size:         (0.30) test set size
    :param train_val_split:   (0.75) once the test set has been removed, the
                              training set gets this proportion
    :param test_set_key:      key of held out data in artifact store
    :param model_evaluator:   (None) a custom model evaluator can be specified
    :param models_dest:       ("") models subfolder on artifact path
    :param plots_dest:        plot subfolder on artifact path
    :param file_ext:          ("parquet") format for the test_set_key held-out data
    :param random_state:      (1) sklearn rng seed

    """
    models_dest = models_dest or "model"

    raw, labels, header = get_sample(dataset, sample, label_column)

    if encode_cols:
        raw = pd.get_dummies(raw,
                             columns=list(encode_cols.keys()),
                             prefix=list(encode_cols.values()),
                             drop_first=True)

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, 1 - train_val_split, random_state)

    context.log_dataset(test_set_key,
                        df=pd.concat([xtest, ytest.to_frame()], axis=1),
                        format=file_ext,
                        index=False,
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)
    if model_evaluator:
        eval_metrics = model_evaluator(context,
                                       xvalid,
                                       yvalid,
                                       model,
                                       plots_artifact_path=plots_path)
    else:
        eval_metrics = eval_model_v2(context,
                                     xvalid,
                                     yvalid,
                                     model,
                                     plots_artifact_path=plots_path)

    context.set_label('class', model_pkg_class)
    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      extra_data=eval_metrics,
                      model_file="model.pkl",
                      metrics=context.results,
                      labels={"class": model_pkg_class})
Example 4
# assumed imports for this snippet; gen_sklearn_model and create_class are
# project-local helpers not shown here
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import dask.dataframe as dd
import dask_ml.model_selection as model_selection
from cloudpickle import dumps  # assumption: cloudpickle-style serializer
from dask import delayed
from dask.distributed import Client
from dask_ml.preprocessing import LabelEncoder, StandardScaler
from yellowbrick.classifier import ROCAUC, ClassificationReport, ConfusionMatrix
from yellowbrick.model_selection import FeatureImportances
from mlrun.artifacts import PlotArtifact
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask
    
    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g., "sklearn.ensemble.RandomForestClassifier",
                                    or a json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train-validation set proportion out of the full dataset.
    :param sample:                  (1.0) Fraction of the dataset to sample (rows are randomized by default).
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask_key) Key of the dataframe in the dask client "datasets" attribute.
    :param dask_persist:            (False) Whether to persist the data (via `client.persist`).
    :param scheduler_key:           ("") Dask scheduler address; when empty, a local client is created.
    :param file_ext:                (parquet) Format for the test_set_key held-out data.
    :param random_state:            (42) sklearn seed
    """

    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute():
        raise Exception("NA values found")

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # dask has no unique(); use drop_duplicates
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):

        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed,
                y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed,
                  y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[report_name] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:

                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # materialize to pandas for logging
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
Example 5
# imports assumed by this snippet (not shown in the original)
import json
import os
from collections import defaultdict
import numpy as np
import pandas as pd
from cloudpickle import dumps  # assumption: cloudpickle-style serializer
from sklearn.preprocessing import LabelEncoder
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

def data_clean(context: MLClientCtx,
               src: DataItem,
               file_ext: str = "csv",
               models_dest: str = "models/encoders",
               cleaned_key: str = "cleaned-data",
               encoded_key: str = "encoded-data"):
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels"
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # record the dropped columns in the logged map:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)
    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))
    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact("preproc-numcat_map.json",
                         body=bytes(json.dumps(tenure_map).encode("utf-8")),
                         local_path=tp)

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)
    fix_cols = [
        "gender", "partner", "deps", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "PhoneService", "MultipleLines", "PaperlessBilling", "InternetService",
        "Contract", "PaymentMethod", "labels"
    ]

    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model("model",
                      body=model_bin,
                      artifact_path=os.path.join(context.artifact_path,
                                                 models_dest),
                      model_file="model.pkl")
Example 6
# imports assumed by this snippet (not shown in the original)
import json
import os
from collections import defaultdict
import numpy as np
import pandas as pd
from cloudpickle import dumps  # assumption: cloudpickle-style serializer
from sklearn.preprocessing import LabelEncoder
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

def data_clean(
    context: MLClientCtx,
    src: DataItem,
    file_ext: str = "csv",
    models_dest: str = "models/encoders",
    cleaned_key: str = "cleaned-data",
    encoded_key: str = "encoded-data",
):
    """process a raw churn data file

    Data has 3 states here: `raw`, `cleaned` and `encoded`

    * `raw` kept by default, the pipeline begins with a raw data artifact
    * `cleaned` kept for charts, presentations
    * `encoded` is input for a cross validation and training function

    steps (not necessarily in this order; some can run in parallel):
    * column name maps
    * deal with NaNs and other kinds of missing or junk values
    * label encode binary and ordinal category columns
    * create category ranges from numerical columns
    And finally,
    * test

    Why don't we one-hot encode here? One-hot encoding isn't a necessary
    step for all algorithms. It can also generate a very large feature
    matrix that doesn't need to be serialized (even if sparse).
    So we leave one-hot encoding to the training step.

    What about scaling numerical columns? Same reasoning as for one-hot
    encoding. Do we scale before the train-test split?  IMHO, no.  Scaling
    before splitting introduces a type of data leakage.  In addition,
    many estimators are completely immune to the monotonic transformations
    implied by scaling, so why waste the cycles?

    TODO:
        * parallelize where possible
        * more abstraction (more parameters, chain sklearn transformers)
        * convert to marketplace function

    :param context:          the function execution context
    :param src:              an artifact or file path
    :param file_ext:         file type for artifacts
    :param models_dest:      label encoders and other preprocessing steps
                             should be saved together with other pipeline
                             models
    :param cleaned_key:      key of cleaned data table in artifact store
    :param encoded_key:      key of encoded data table in artifact store
    """
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels",
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

        # record the dropped columns in the logged map:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)

    # VALUE transformations

    # clean
    # truncate reply to "No"
    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))
    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact(
        "preproc-numcat_map.json",
        body=bytes(json.dumps(tenure_map).encode("utf-8")),
        local_path=tp,
    )

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)

    # label encoding - generate model for each column saved in dict
    # some of these columns may be hot encoded in the training step
    fix_cols = [
        "gender",
        "partner",
        "deps",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "PhoneService",
        "MultipleLines",
        "PaperlessBilling",
        "InternetService",
        "Contract",
        "PaymentMethod",
        "labels",
    ]

    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )
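
The defaultdict(LabelEncoder) pattern above fits one encoder per column, keyed by column name, which makes decoding symmetric. A self-contained illustration with toy data:

from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"gender": ["F", "M", "F"], "labels": ["No", "Yes", "No"]})
d = defaultdict(LabelEncoder)
encoded = df.apply(lambda x: d[x.name].fit_transform(x.astype(str)))
decoded = encoded.apply(lambda x: d[x.name].inverse_transform(x))
print(decoded.equals(df))  # True: the round trip restores the originals
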
Example 7
# assumed imports for this snippet; get_sample, get_splits, create_class,
# eval_model_v2, and _gen_xgb_model are project-local helpers not shown here
import os
from typing import Union
import pandas as pd
from cloudpickle import dumps  # assumption: cloudpickle-style serializer
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

def train_model(
    context: MLClientCtx,
    model_type: str,
    dataset: Union[DataItem, pd.core.frame.DataFrame],
    label_column: str = "labels",
    encode_cols: dict = {},
    sample: int = -1,
    imbal_vec=[],
    test_size: float = 0.25,
    valid_size: float = 0.75,
    random_state: int = 1,
    models_dest: str = "models",
    plots_dest: str = "plots",
    eval_metrics: list = ["error", "auc"],
    file_ext: str = "parquet",
    test_set: str = "test_set",
) -> None:
    """train an xgboost model.

    Note on imbalanced data:  the `imbal_vec` parameter represents the measured
    class representations in the sample and can be used as a first step in tuning
    an XGBoost model.  This isn't a hyperparameter, merely an estimate that should
    be kept constant throughout the tuning process.

    :param context:           the function context
    :param model_type:        the model type to train, "classifier", "regressor"...
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns that are
                              to be one-hot encoded.
    :param sample:            selects the first n rows; if negative, a random
                              sample of size |n| is taken
    :param imbal_vec:         ([]) vector of class weights seen in sample
    :param test_size:         (0.25) test set size
    :param valid_size:        (0.75) once the test set has been removed, the
                              training set gets this proportion
    :param random_state:      (1) sklearn rng seed
    :param models_dest:       destination subfolder for model artifacts
    :param plots_dest:        destination subfolder for plot artifacts
    :param eval_metrics:      (["error", "auc"]) learning curve metrics
    :param file_ext:          (parquet) format for the test_set held-out data
    :param test_set:          (test_set) key of held-out data in the artifact store
    """
    models_dest = models_dest or "models"
    plots_dest = plots_dest or f"plots/{context.name}"

    raw, labels, header = get_sample(dataset, sample, label_column)

    if encode_cols:
        raw = pd.get_dummies(
            raw,
            columns=list(encode_cols.keys()),
            prefix=list(encode_cols.values()),
            drop_first=True,
        )

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, valid_size, random_state)

    context.log_dataset(test_set,
                        df=pd.concat([xtest, ytest], axis=1),
                        format=file_ext,
                        index=False)

    model_config = _gen_xgb_model(model_type, context.parameters.items())

    XGBBoostClass = create_class(model_config["META"]["class"])
    model = XGBBoostClass(**model_config["CLASS"])

    model_config["FIT"].update({
        "X": xtrain,
        "y": ytrain.values,
        "eval_set": [(xtrain, ytrain), (xvalid, yvalid)],
        "eval_metric": eval_metrics,
    })

    model.fit(**model_config["FIT"])

    eval_metrics = eval_model_v2(context, xvalid, yvalid, model)

    model_bin = dumps(model)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )
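
The META/CLASS/FIT config pattern consumed above can be sketched as follows; the exact output of _gen_xgb_model isn't shown in the source, so this shape is an assumption inferred from how the keys are used, and create_class is re-implemented here purely for illustration:

from importlib import import_module

# assumed config shape, inferred from the key accesses in train_model above
model_config = {
    "META": {"class": "xgboost.XGBClassifier"},
    "CLASS": {"max_depth": 4, "n_estimators": 100},  # constructor kwargs
    "FIT": {},  # filled with X, y, eval_set, eval_metric at train time
}

def create_class(pkg_class: str):
    # resolve a "package.module.Class" string to the class object
    module_name, class_name = pkg_class.rsplit(".", 1)
    return getattr(import_module(module_name), class_name)

model = create_class(model_config["META"]["class"])(**model_config["CLASS"])
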