Example #1
    def get_model(self, suffix=''):
        # download the model artifact locally and merge its stored
        # parameters into the server's parameter dict
        model_file, self.model_spec, extra_dataitems = get_model(
            self.model_dir, suffix)
        if self.model_spec and self.model_spec.parameters:
            for key, value in self.model_spec.parameters.items():
                self._params[key] = value
        return model_file, extra_dataitems
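This helper looks like the model-loading method of an MLRun serving class. Below is a minimal sketch of how a custom server subclass might call it from `load()`, assuming `mlrun.serving.V2ModelServer` and a cloudpickle-serialized scikit-learn estimator; the class name `ClassifierModel` is hypothetical.

# Hypothetical sketch: a serving class whose load() relies on get_model() above.
# Assumes the model artifact is a pickled (cloudpickle) scikit-learn estimator.
from cloudpickle import load

import mlrun.serving


class ClassifierModel(mlrun.serving.V2ModelServer):
    def load(self):
        # get_model() fetches the artifact locally and merges its parameters
        # into self._params (see the method above)
        model_file, extra_data = self.get_model(".pkl")
        with open(model_file, "rb") as f:
            self.model = load(f)

    def predict(self, body: dict) -> list:
        # body["inputs"] carries the request payload in the v2 protocol
        return self.model.predict(body["inputs"]).tolist()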
Example #2
def xgb_test(
    context,
    models_path: DataItem,
    test_set: DataItem,
    label_column: str,
    plots_dest: str = "plots",
    default_model: str = "model.pkl",
) -> None:
    """Test one or more classifier models against held-out dataset

    Using held-out test features, evaluates the peformance of the estimated model

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:         the function context
    :param models_path:     model artifact to be tested
    :param test_set:        test features and labels
    :param label_column:    column name for ground truth labels
    :param plots_dest:      dir for test plots
    :param default_model:   'model.pkl', default model artifact file name
    """
    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    try:
        model_file, model_obj, _ = get_model(models_path.url, suffix=".pkl")
        with open(model_file, "rb") as f:
            model_obj = load(f)
    except Exception as exc:
        raise Exception("model location likely misspecified") from exc

    eval_metrics = eval_model_v2(context, xtest, ytest.values, model_obj)
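For context, here is a hedged sketch of how a test handler like this is typically run as an MLRun job. The file name, store URIs, and label column below are placeholders, not values taken from the original code.

import mlrun

# Hypothetical invocation sketch -- all names, URIs and paths are placeholders
fn = mlrun.code_to_function(
    name="xgb-test",
    filename="xgb_test.py",      # the file containing xgb_test above
    kind="job",
    image="mlrun/mlrun",
    handler="xgb_test",
)
run = fn.run(
    inputs={
        "models_path": "store://models/my-project/xgb-model:latest",
        "test_set": "store://datasets/my-project/test-set:latest",
    },
    params={"label_column": "label"},
    local=True,
)
print(run.outputs)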
Example #3
def test_classifier(
    context,
    models_path: DataItem,
    test_set: DataItem,
    label_column: str,
    score_method: str = "micro",
    plots_dest: str = "",
    model_evaluator=None,
    default_model: str = "model.pkl",
    predictions_column: str = "yscore",
    model_update=True,
) -> None:
    """Test one or more classifier models against held-out dataset

    Using held-out test features, evaluates the peformance of the estimated model

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:            the function context
    :param models_path:        artifact models representing a file or a folder
    :param test_set:           test features and labels
    :param label_column:       column name for ground truth labels
    :param score_method:       for multiclass classification
    :param plots_dest:         dir for test plots
    :param model_evaluator:    NOT IMPLEMENTED: specific method to generate eval, passed in as string
                               or available in this folder
    :param predictions_column: column name for the predictions column on the resulted artifact
    :param model_update:       (True) update model, when running as stand alone no need in update
    """
    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    try:
        model_file, model_obj, _ = get_model(models_path, suffix=".pkl")
        with open(model_file, "rb") as f:
            model_obj = load(f)
    except Exception as exc:
        raise Exception("model location likely misspecified") from exc

    extra_data = eval_model_v2(context, xtest, ytest.values, model_obj)
    if model_obj and model_update:
        update_model(
            models_path,
            extra_data=extra_data,
            metrics=context.results,
            key_prefix="validation-",
        )

    y_hat = model_obj.predict(xtest)
    if y_hat.ndim == 1 or y_hat.shape[1] == 1:
        score_names = [predictions_column]
    else:
        score_names = [
            f"{predictions_column}_" + str(x) for x in range(y_hat.shape[1])
        ]

    df = pd.concat(
        [xtest, ytest, pd.DataFrame(y_hat, columns=score_names)], axis=1)
    context.log_dataset("test_set_preds", df=df, format="parquet", index=False)
Example #4
def cox_test(
    context,
    models_path: DataItem,
    test_set: DataItem,
    label_column: str,
    plots_dest: str = "plots",
    model_evaluator=None,
) -> None:
    """Test one or more classifier models against held-out dataset

    Using held-out test features, evaluates the peformance of the estimated model

    Can be part of a kubeflow pipeline as a test step that is run post EDA and
    training/validation cycles

    :param context:         the function context
    :param model_file:      model artifact to be tested
    :param test_set:        test features and labels
    :param label_column:    column name for ground truth labels
    :param score_method:    for multiclass classification
    :param plots_dest:      dir for test plots
    :param model_evaluator: WIP: specific method to generate eval, passed in as string
                            or available in this folder
    """
    xtest = test_set.as_df()
    ytest = xtest.pop(label_column)

    model_file, model_obj, _ = get_model(models_path.url, suffix=".pkl")
    with open(str(model_file), "rb") as f:
        model_obj = load(f)

    try:
        if not model_evaluator:
            eval_metrics = eval_class_model(context, xtest, ytest, model_obj)

        model_plots = eval_metrics.pop("plots")
        model_tables = eval_metrics.pop("tables")
        for plot in model_plots:
            context.log_artifact(plot,
                                 local_path=f"{plots_dest}/{plot.key}.html")
        for tbl in model_tables:
            context.log_artifact(tbl,
                                 local_path=f"{plots_dest}/{tbl.key}.csv")

        context.log_results(eval_metrics)
    except Exception:
        context.log_dataset("cox-test-summary",
                            df=model_obj.summary,
                            index=True,
                            format="csv")
        context.logger.info("cox tester not implemented")
Example #5
def validation(context: MLClientCtx, model: DataItem) -> None:
    """Model validation.

    Dummy validation function.

    :param context: The runtime context object.
    :param model: The estimated model object.
    """
    # access input metadata, values, files, and secrets (passwords)
    print(f"Run: {context.name} (uid={context.uid})")
    context.logger.info("started validation")

    # get the model file, class (metadata), and extra_data (dict of key: DataItem)
    model_file, model_obj, _ = get_model(model)

    # update model object elements and data
    update_model(model_obj, parameters={"one_more": 5})

    print(f"path to local copy of model file - {model_file}")
    print("parameters:", model_obj.parameters)
    print("metrics:", model_obj.metrics)
    context.log_artifact("validation",
                         body=b"<b> validated </b>",
                         format="html")
Example #6
def permutation_importance(
    context: MLClientCtx,
    model: DataItem,
    dataset: DataItem,
    labels: str,
    figsz=(10, 5),
    plots_dest: str = "plots",
    fitype: str = "permute",
) -> pd.DataFrame:
    """calculate change in metric

    type 'permute' uses a pre-estimated model
    type 'dropcol' uses a re-estimates model

    :param context:     the function's execution context
    :param model:       a trained model
    :param dataset:     features and ground truths, regression targets
    :param labels       name of the ground truths column
    :param figsz:       matplotlib figure size
    :param plots_dest:  path within artifact store
    :
    """
    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
    with open(str(model_file), "rb") as f:
        model = load(f)

    X = dataset.as_df()
    y = X.pop(labels)
    header = X.columns

    metric = _oob_classifier_accuracy

    baseline = metric(model, X, y)

    imp = []
    for col in X.columns:
        if fitype is "permute":
            save = X[col].copy()
            X[col] = np.random.permutation(X[col])
            m = metric(model, X, y)
            X[col] = save
            imp.append(baseline - m)
        elif fitype is "dropcol":
            X_ = X.drop(col, axis=1)
            model_ = clone(model)
            #model_.random_state = random_state
            model_.fit(X_, y)
            o = model_.oob_score_
            imp.append(baseline - o)
        else:
            raise ValueError(
                "unknown fitype, only 'permute' or 'dropcol' permitted")

    zipped = zip(imp, header)
    feature_imp = pd.DataFrame(sorted(zipped),
                               columns=["importance", "feature"])
    feature_imp.sort_values(by="importance", ascending=False, inplace=True)

    plt.clf()
    plt.figure(figsize=figsz)
    sns.barplot(x="importance", y="feature", data=feature_imp)
    plt.title(f"feature importances-{fitype}")
    plt.tight_layout()

    context.log_artifact(
        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
        local_path=f"{plots_dest}/feature-permutations.html",
    )
    context.log_dataset(f"feature-importances-{fitype}-tbl",
                        df=feature_imp,
                        index=False)
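The `_oob_classifier_accuracy` helper referenced above is not shown in this example. A hypothetical stand-in with the same `metric(model, X, y) -> float` call signature is sketched below; the real helper presumably uses out-of-bag estimates, which this plain-accuracy version does not.

import numpy as np

# Hypothetical stand-in for the _oob_classifier_accuracy helper referenced
# above: plain accuracy of a fitted classifier on (X, y). Only illustrates
# the expected metric(model, X, y) -> float call signature.
def _plain_classifier_accuracy(model, X, y):
    y_pred = model.predict(X)
    return float(np.mean(np.asarray(y) == np.asarray(y_pred)))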
Example #7
    def create_or_patch(access_key: str, model_endpoint: ModelEndpoint):
        """
        Create or patch a KV record with the given model_endpoint record

        :param access_key: V3IO access key for managing user permissions
        :param model_endpoint: An object representing a model endpoint
        """
        if model_endpoint.spec.model_uri or model_endpoint.status.feature_stats:
            logger.info(
                "Getting feature metadata",
                project=model_endpoint.metadata.project,
                model=model_endpoint.spec.model,
                function=model_endpoint.spec.function_uri,
                model_uri=model_endpoint.spec.model_uri,
            )

        # If model artifact was supplied, grab model meta data from artifact
        if model_endpoint.spec.model_uri:
            logger.info(
                "Getting model object, inferring column names and collecting feature stats"
            )
            model_obj: tuple = get_model(model_endpoint.spec.model_uri)

            model_obj: ModelArtifact = model_obj[1]

            if not model_endpoint.status.feature_stats:
                model_endpoint.status.feature_stats = model_obj.feature_stats

            if not model_endpoint.spec.label_names:
                model_label_names = [
                    _clean_feature_name(f.name) for f in model_obj.outputs
                ]
                model_endpoint.spec.label_names = model_label_names

            if not model_endpoint.spec.algorithm:
                model_endpoint.spec.algorithm = model_obj.algorithm

        # If feature_stats was populated either from model_uri or from manual input, keep the feature
        # names consistent: if feature_names was supplied, use it to rename the keys in feature_stats;
        # otherwise keep a cleaned version of the existing names
        if model_endpoint.status.feature_stats:
            logger.info("Feature stats found, cleaning feature names")
            if model_endpoint.spec.feature_names:
                if len(model_endpoint.status.feature_stats) != len(
                        model_endpoint.spec.feature_names):
                    raise MLRunInvalidArgumentError(
                        "feature_stats and feature_names have a different number of names, while expected to match: "
                        f"feature_stats({len(model_endpoint.status.feature_stats)}), "
                        f"feature_names({len(model_endpoint.spec.feature_names)})"
                    )
            clean_feature_stats = {}
            clean_feature_names = []
            for i, (feature, stats) in enumerate(
                    model_endpoint.status.feature_stats.items()):
                if model_endpoint.spec.feature_names:
                    clean_name = _clean_feature_name(
                        model_endpoint.spec.feature_names[i])
                else:
                    clean_name = _clean_feature_name(feature)
                clean_feature_stats[clean_name] = stats
                clean_feature_names.append(clean_name)
            model_endpoint.status.feature_stats = clean_feature_stats
            model_endpoint.spec.feature_names = clean_feature_names

            logger.info(
                "Done preparing feature names and stats",
                feature_names=model_endpoint.spec.feature_names,
            )

        # If none of the above was supplied, feature names will be assigned on first contact with the model monitoring
        # system
        logger.info("Updating model endpoint",
                    endpoint_id=model_endpoint.metadata.uid)

        write_endpoint_to_kv(
            access_key=access_key,
            endpoint=model_endpoint,
            update=True,
        )

        logger.info("Model endpoint updated",
                    endpoint_id=model_endpoint.metadata.uid)

        return model_endpoint
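The `_clean_feature_name` helper used throughout `create_or_patch` is not shown here. A minimal hypothetical sketch of what such a normalizer might do follows; the exact rules in the real helper may differ.

# Hypothetical sketch of a feature-name normalizer like _clean_feature_name,
# which is referenced above but not shown. The intent is KV-safe column
# names (no spaces or parentheses).
def _clean_feature_name_sketch(name: str) -> str:
    return name.replace(" ", "_").replace("(", "").replace(")", "")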