Example 1
def main():
    pm_options = parse_args()
    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    classifier = pickle.load(file_obj)

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0],
                                               num_samples,
                                               replace=False)]

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and automatically
    # compare them with the distributions reported during training to generate a similarity score.
    mlops.set_data_distribution_stat(test_features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples,
                   st.TIME_SERIES)

    # Predict labels
    result = classifier.predict(test_features)

    # Label distribution in prediction
    value, counts = np.unique(result, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Terminate MLOps
    mlops.done()
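The snippets in this listing are shown without their import headers. A minimal preamble for the stand-alone (non-Spark) examples might look like the sketch below; the Table and Graph paths appear verbatim in the commented-out code of Example 8, while the remaining module paths are assumptions rather than confirmed API.

# Assumed imports for the NumPy/scikit-learn snippets; only the Table and Graph
# paths are confirmed by the commented-out imports shown later in Example 8.
import pickle

import numpy as np

from parallelm.mlops import mlops
from parallelm.mlops.stats_category import StatCategory as st          # assumed path
from parallelm.mlops.predefined_stats import PredefinedStats           # assumed path
from parallelm.mlops.metrics_constants import ClassificationMetrics    # assumed path
from parallelm.mlops.stats.bar_graph import BarGraph                   # assumed path
from parallelm.mlops.stats.table import Table
from parallelm.mlops.stats.graph import Graph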
Example 2
def gen_data_dist_stats(spark_ctx):

    spark_session = SparkSession(spark_ctx)

    # Import Data
    ##################################
    K = 3  # fixed number of centers
    num_attr = 10  # fixed number of attributes
    num_rows = 60000  # number of rows in the dataset
    input_data = generate_dataset(num_attr, num_rows, K, spark_ctx)

    column_names_all = input_data.columns
    for col_index in range(0, len(column_names_all)):
        input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                  'c' + str(col_index))

    input_data = input_data.cache()

    input_train = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    # Keep every column that is not explicitly excluded
    input_col_names = [name for name in column_names if name not in exclude_cols]
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names,
                                       outputCol="features")

    kmeans_pipe = KMeans(k=K,
                         initMode="k-means||",
                         initSteps=5,
                         tol=1e-4,
                         maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    try:
        mlops.set_data_distribution_stat(data=input_train, model=model_kmeans)
        m = mlops.Model(model_format=ModelFormat.SPARKML)
        m.set_data_distribution_stat(data=input_train)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.stat")
        print(e)

    # Indicating that model statistics were reported
    mlops.set_stat(E2EConstants.MODEL_STATS_REPORTED_STAT_NAME, 1)
    return model_kmeans
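The generate_dataset helper used above is not part of the snippet. A minimal sketch of a stand-in (a hypothetical replacement, not the original helper) could draw K Gaussian blobs with scikit-learn and wrap them in a Spark DataFrame:

import pandas as pd
from pyspark.sql import SparkSession
from sklearn.datasets import make_blobs


def generate_dataset(num_attr, num_rows, k, spark_ctx):
    # Hypothetical stand-in: k Gaussian blobs with num_attr features and
    # num_rows rows, returned as a Spark DataFrame.
    features, _ = make_blobs(n_samples=num_rows, n_features=num_attr,
                             centers=k, random_state=42)
    pdf = pd.DataFrame(features, columns=["c" + str(i) for i in range(num_attr)])
    return SparkSession(spark_ctx).createDataFrame(pdf)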
Example 3
def _materialize(self, parent_data_objs, user_data):
    df_infer_set = self._gen_inf_dataset(parent_data_objs[0])
    # Initialize MLOps Library
    mlops.init()
    # Record the data distribution stats for the DataFrame
    mlops.set_data_distribution_stat(df_infer_set)
    # Terminate MLOps
    mlops.done()
    return [df_infer_set]
Example 4
def test_data_distribution_stat_api(generate_da_with_missing_data):
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)
    pm._set_api_test_mode()

    # basic test
    data = np.array([[1, 2], [3, 4]])
    pm.set_data_distribution_stat(data)

    # test with a DataFrame containing missing data
    df_with_missing = pd.read_csv(generate_da_with_missing_data)
    pm.set_data_distribution_stat(df_with_missing)
    pm.done()
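The generate_da_with_missing_data fixture is defined elsewhere in the test suite. A plausible sketch (an assumption, not the original fixture) would write a small CSV with missing values to a temporary path and return that path, so pd.read_csv can load it in the tests above and below:

import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def generate_da_with_missing_data(tmp_path):
    # Hypothetical fixture: a small frame with a few NaN/None entries,
    # persisted as a CSV file whose path is handed to the test.
    df = pd.DataFrame({"A": [1.0, np.nan, 3.0],
                       "B": [np.nan, 5.0, 6.0],
                       "C": ["x", "y", None]})
    csv_path = tmp_path / "missing_data.csv"
    df.to_csv(csv_path, index=False)
    return str(csv_path)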
Example 5
def test_data_distribution_stat_api_spark(spark_session, generate_da_with_missing_data):
    sc = spark_session.sparkContext
    pm.init(ctx=sc, mlops_mode=MLOpsMode.STAND_ALONE)
    pm._set_api_test_mode()

    pdf = pd.read_csv(generate_da_with_missing_data)
    spark_df = spark_session.createDataFrame(pdf)

    pm.set_data_distribution_stat(data=spark_df)

    sc.stop()

    pm.done()
Example 6
def main():
    ## MLOPS start
    # Initialize mlops
    mlops.init()
    ## MLOPS end

    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    print("Loading model from: {}".format(args.model_dir))
    if os.path.isdir(args.model_dir):
        print("Found model")
    else:
        print("No model found. Exiting.")
        exit(0)

    # load the model
    model = SavedModel(args.model_dir, args.sig_name)

    # get the input
    input = MnistStreamInput(args.input_dir, args.total_records, args.random)

    test_data = input._samples
    mlops.set_data_distribution_stat(test_data)

    # track confidence
    conf_tracker = ConfidenceTracker(args.track_conf, args.conf_thresh,
                                     args.conf_percent, args.output_low_conf)

    # perform inferences on the input
    infer_loop(model, input, args.output_file, args.stats_interval,
               conf_tracker)

    del model
    del input

    ### MLOPS start
    mlops.done()
    ### MLOPS end
    print("Inference batch complete")
    exit(0)
Example 7
def main():

    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    mnist_data = get_input(args.input_dir)

    X = mnist_data.train.images

    ## MLOps start
    # Initialize the mlops library
    mlops.init()

    # Report the feature distribution for the training data
    mlops.set_data_distribution_stat(X)
    ## MLOps end

    train(mnist_data, args.epochs, args.model_dir, args.display_step)

    ## MLOps start
    # Release mlops resources
    mlops.done()
Example 8
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))
    print("PM: # Classes:                   [{}]".format(
        pm_options.num_classes))

    print("PM: C:                           [{}]".format(pm_options.C))
    print("PM: Kernel:                      [{}]".format(pm_options.kernel))
    print("PM: Degree:                      [{}]".format(pm_options.degree))
    print("PM: Gamma:                       [{}]".format(pm_options.gamma))
    print("PM: Tolerance:                   [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:          [{}]".format(pm_options.max_iter))

    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    num_classes = int(pm_options.num_classes)

    # Create synthetic data using scikit learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=num_classes,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      probability=True,
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T

    # Output actual label distribution as a BarGraph using MCenter
    bar = BarGraph().name("User Defined: Actual Label Distribution") \
        .cols((label_distribution[:, 0]).astype(str).tolist()) \
        .data((label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    pos_label = 1

    # calculate classification prediction
    labels_pred = final_model.predict(features)
    # calculate decision scores [n_sample, n_class]
    labels_decision_score = final_model.decision_function(features)
    # calculate classification probabilities [n_sample, n_class]
    labels_prob = final_model.predict_proba(features)
    # calculate classification probabilities of positive labels
    label_pos_class_prob = list(map(lambda x: x[pos_label], labels_prob))
    # list of sorted labels. i.e. [0, 1, 2, ..]
    labels_ordered = sorted(set(labels))

    value_pred, counts_pred = np.unique(labels_pred, return_counts=True)
    label_distribution_pred = np.asarray((value_pred, counts_pred)).T

    # Output prediction label distribution as a BarGraph using MCenter
    bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \
        .data((label_distribution_pred[:, 1]).tolist())
    mlops.set_stat(bar_pred)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    ################################################################
    #################### Start: Output Accuracy ####################
    ################################################################

    accuracy = final_model.score(features, labels)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output accuracy of the chosen model using MCenter
    # mlops.set_stat("User Defined: Accuracy", accuracy, st.TIME_SERIES)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.ACCURACY_SCORE, accuracy)

    # OR

    # Third Way
    mlops.metrics.accuracy_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output Accuracy ####################
    ##############################################################

    ################################################################
    #################### Start: Output AUC ####################
    ################################################################

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels,
                                                     labels_pred,
                                                     pos_label=pos_label)
    auc = sklearn.metrics.auc(fpr, tpr)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output auc of the chosen model using MCenter
    # mlops.set_stat("User Defined: AUC", auc)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.AUC, auc)

    # OR

    # Third Way
    mlops.metrics.auc(x=fpr, y=tpr)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output AUC ####################
    ##############################################################

    ###############################################################################
    #################### Start: Output Average Precision Score ####################
    ###############################################################################

    # average precision is not supported for multiclass
    if len(labels_ordered) <= 2:
        aps = sklearn.metrics.average_precision_score(labels,
                                                      labels_decision_score)

        #################### OLD WAY ####################
        # First Way
        #
        # # Output aps of the chosen model using MCenter
        # mlops.set_stat("User Defined: Average Precision Score", aps)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.AVERAGE_PRECISION_SCORE, aps)

        # OR

        # Third Way
        mlops.metrics.average_precision_score(y_true=labels,
                                              y_score=labels_decision_score)
        #################### DONE NEW WAY ####################

    #############################################################################
    #################### End: Output Average Precision Score ####################
    #############################################################################

    #########################################################################
    #################### Start: Output Balanced Accuracy ####################
    #########################################################################

    bas = sklearn.metrics.balanced_accuracy_score(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output bas of the chosen model using MCenter
    # mlops.set_stat("User Defined: Balanced Accuracy Score", bas)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.BALANCED_ACCURACY_SCORE, data=bas)

    # OR

    # Third Way
    mlops.metrics.balanced_accuracy_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #######################################################################
    #################### End: Output Balanced Accuracy ####################
    #######################################################################

    ########################################################################
    #################### Start: Output Brier Score Loss ####################
    ########################################################################

    bsl = sklearn.metrics.brier_score_loss(labels,
                                           label_pos_class_prob,
                                           pos_label=pos_label)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output bsl of the chosen model using MCenter
    # mlops.set_stat("User Defined: Brier Score Loss", bsl)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.BRIER_SCORE_LOSS, data=bsl)

    # OR

    # Third Way
    mlops.metrics.brier_score_loss(y_true=labels,
                                   y_prob=label_pos_class_prob,
                                   pos_label=pos_label)
    #################### DONE NEW WAY ####################

    ######################################################################
    #################### End: Output Brier Score Loss ####################
    ######################################################################

    #############################################################################
    #################### Start: Output Classification Report ####################
    #############################################################################
    cr = sklearn.metrics.classification_report(labels, labels_pred)
    print("Classification Report\n{}".format(cr))
    #################### OLD WAY ####################
    # First Way
    #
    # from parallelm.mlops.stats.table import Table
    #
    # arrayReport = list()
    # for row in cr.split("\n"):
    #     parsed_row = [x for x in row.split("  ") if len(x) > 0]
    #     if len(parsed_row) > 0:
    #         arrayReport.append(parsed_row)
    #
    # header = arrayReport[0]
    # cr_table = Table().name("User Defined: Classification Report").cols(header)
    #
    # for index in range(1, len(arrayReport)):
    #     row_title = arrayReport[index][0]
    #     row_value = arrayReport[index][:-1]
    #     cr_table.add_row(row_title, row_value)
    #
    # # output classification report using MCenter
    # mlops.set_stat(cr_table)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.CLASSIFICATION_REPORT, data=cr)

    # OR

    # Third Way
    mlops.metrics.classification_report(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ###########################################################################
    #################### End: Output Classification Report ####################
    ###########################################################################

    #########################################################################
    #################### Start: Output Cohen Kappa Score ####################
    #########################################################################

    cks = sklearn.metrics.cohen_kappa_score(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output cks of the chosen model using MCenter
    # mlops.set_stat("User Defined: Cohen Kappa Score", cks)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.COHEN_KAPPA_SCORE, data=cks)

    # OR

    # Third Way
    mlops.metrics.cohen_kappa_score(labels, labels_pred)
    #################### DONE NEW WAY ####################

    #######################################################################
    #################### End: Output Cohen Kappa Score ####################
    #######################################################################

    ########################################################################
    #################### Start: Output Confusion Matrix ####################
    ########################################################################

    cm = sklearn.metrics.confusion_matrix(labels,
                                          labels_pred,
                                          labels=labels_ordered)

    #################### OLD WAY ####################
    # First Way
    # from parallelm.mlops.stats.table import Table

    # labels_string = [str(i) for i in labels_ordered]
    # cm_matrix = Table().name("User Defined: Confusion Matrix").cols(labels_string)
    #
    # for index in range(len(cm)):
    #     cm_matrix.add_row(labels_string[index], list(cm[index]))
    #
    # mlops.set_stat(cm_matrix)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.CONFUSION_MATRIX,
                   cm,
                   labels=labels_ordered)

    # OR

    # Third Way
    mlops.metrics.confusion_matrix(y_true=labels,
                                   y_pred=labels_pred,
                                   labels=labels_ordered)
    #################### DONE NEW WAY ####################

    ######################################################################
    #################### End: Output Confusion Matrix ####################
    ######################################################################

    ################################################################
    #################### Start: Output F1 Score ####################
    ################################################################

    f1 = sklearn.metrics.f1_score(labels,
                                  labels_pred,
                                  pos_label=pos_label,
                                  average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output f1 score of the chosen model using MCenter
    # mlops.set_stat("User Defined: F1 Score", f1)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.F1_SCORE, data=f1)

    # OR

    # Third Way
    mlops.metrics.f1_score(labels,
                           labels_pred,
                           pos_label=pos_label,
                           average=None)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output F1 Score ####################
    ##############################################################

    ################################################################
    #################### Start: Output FBeta Score ####################
    ################################################################

    fbeta = sklearn.metrics.fbeta_score(labels,
                                        labels_pred,
                                        beta=0.5,
                                        average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output fbeta score of the chosen model using MCenter
    # mlops.set_stat("User Defined: F-beta Score", fbeta)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.FBETA_SCORE, data=fbeta)

    # OR

    # Third Way
    mlops.metrics.fbeta_score(labels,
                              labels_pred,
                              pos_label=pos_label,
                              beta=0.5,
                              average=None)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Output FBeta Score ####################
    #################################################################

    ####################################################################
    #################### Start: Output Hamming Loss ####################
    ####################################################################

    hamming_loss = sklearn.metrics.hamming_loss(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output hamming loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Hamming Loss", hamming_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.HAMMING_LOSS, data=hamming_loss)

    # OR

    # Third Way
    mlops.metrics.hamming_loss(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ##################################################################
    #################### End: Output Hamming Loss ####################
    ##################################################################

    ##################################################################
    #################### Start: Output Hinge Loss ####################
    ##################################################################

    hinge_loss = sklearn.metrics.hinge_loss(labels, labels_decision_score)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output hinge loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Hinge Loss", hinge_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.HINGE_LOSS, data=hinge_loss)

    # OR

    # Third Way
    mlops.metrics.hinge_loss(labels, labels_decision_score)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Output Hinge Loss ####################
    ################################################################

    ##############################################################################
    #################### Start: Output Jaccard Similarity Score ####################
    ##############################################################################

    jaccard_sim_score = sklearn.metrics.jaccard_similarity_score(
        labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output jaccard similarity score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Jaccard Similarity Score", jaccard_sim_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.JACCARD_SIMILARITY_SCORE,
                   data=jaccard_sim_score)

    # OR

    # Third Way
    mlops.metrics.jaccard_similarity_score(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ############################################################################
    #################### End: Output Jaccard Similarity Score ####################
    ############################################################################

    ################################################################
    #################### Start: Output Log Loss ####################
    ################################################################

    log_loss = sklearn.metrics.log_loss(labels, labels_prob)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output log loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Log Loss", log_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.LOG_LOSS, data=log_loss)

    # OR

    # Third Way
    mlops.metrics.log_loss(labels, labels_prob)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output Log Loss ####################
    ##############################################################

    ########################################################################################
    #################### Start: Output Matthews Correlation Coefficient ####################
    ########################################################################################

    mcc = sklearn.metrics.matthews_corrcoef(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output mcc of the chosen model using MCenter
    # mlops.set_stat("User Defined: Matthews Correlation Coefficient", mcc)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.MATTHEWS_CORRELATION_COEFFICIENT,
                   data=mcc)

    # OR

    # Third Way
    mlops.metrics.matthews_corrcoef(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ######################################################################################
    #################### End: Output Matthews Correlation Coefficient ####################
    ######################################################################################

    ##############################################################################
    #################### Start: Output Precision Recall Curve ####################
    ##############################################################################

    # precision_recall_curve is not supported for multiclass
    if len(labels_ordered) <= 2:
        precision, recall, thresholds = sklearn.metrics.precision_recall_curve(
            labels, labels_decision_score, pos_label=pos_label)
        classes = len(labels_ordered)
        average_precision = sklearn.metrics.average_precision_score(
            labels, labels_decision_score, average="macro")

        graph_label_str = "{}-class Precision Recall Curve -- AP: {}".format(
            classes, average_precision)

        #################### OLD WAY ####################
        # First Way
        # from parallelm.mlops.stats.graph import Graph
        #
        # p_r_curve = Graph() \
        #     .name("User Defined: Precision Recall Curve") \
        #     .set_x_series(list(recall)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(precision))
        #
        # p_r_curve.x_title("Recall")
        # p_r_curve.y_title("Precision")
        # mlops.set_stat(p_r_curve)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.PRECISION_RECALL_CURVE,
                       [precision, recall],
                       legend=graph_label_str)

        # OR

        # Third Way
        mlops.metrics.precision_recall_curve(y_true=labels,
                                             probas_pred=labels_decision_score,
                                             pos_label=pos_label,
                                             average="macro")

        #################### DONE NEW WAY ####################

    ############################################################################
    #################### End: Output Precision Recall Curve ####################
    ############################################################################

    #######################################################################
    #################### Start: Output Precision Score ####################
    #######################################################################

    precision_score = sklearn.metrics.precision_score(labels,
                                                      labels_pred,
                                                      pos_label=pos_label,
                                                      average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output precision score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Precision Score", precision_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.PRECISION_SCORE, data=precision_score)

    # OR

    # Third Way
    mlops.metrics.precision_score(labels,
                                  labels_pred,
                                  pos_label=pos_label,
                                  average=None)
    #################### DONE NEW WAY ####################

    ############################################################################
    #################### End: Output Precision Score ###########################
    ############################################################################

    ####################################################################
    #################### Start: Output Recall Score ####################
    ####################################################################

    recall_score = sklearn.metrics.recall_score(labels,
                                                labels_pred,
                                                pos_label=pos_label,
                                                average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output recall score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Recall Score", recall_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.RECALL_SCORE, data=recall_score)

    # OR

    # Third Way
    mlops.metrics.recall_score(labels,
                               labels_pred,
                               pos_label=pos_label,
                               average=None)
    #################### DONE NEW WAY ####################

    #########################################################################
    #################### End: Output Recall Score ###########################
    #########################################################################

    #####################################################################
    #################### Start: Output ROC AUC Score ####################
    #####################################################################

    # roc_auc_score is not supported for multiclass
    if len(labels_ordered) <= 2:
        roc_auc_score = sklearn.metrics.roc_auc_score(labels,
                                                      labels_decision_score)

        #################### OLD WAY ####################
        # First Way
        #
        # # Output roc auc score of the chosen model using MCenter
        # mlops.set_stat("User Defined: ROC AUC Score", roc_auc_score)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.ROC_AUC_SCORE, data=roc_auc_score)

        # OR

        # Third Way
        mlops.metrics.roc_auc_score(labels, labels_decision_score)
        #################### DONE NEW WAY ####################

    ###################################################################
    #################### End: Output ROC AUC Score ####################
    ###################################################################

    #################################################################
    #################### Start: Output ROC Curve ####################
    #################################################################

    # roc_curve is not supported for multiclass
    if len(labels_ordered) <= 2:
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels,
                                                         labels_decision_score,
                                                         pos_label=pos_label)

        roc_auc_score = sklearn.metrics.roc_auc_score(labels,
                                                      labels_decision_score)

        graph_label_str = "ROC Curve, AUC: {}".format(roc_auc_score)

        #################### OLD WAY ####################
        # First Way
        # from parallelm.mlops.stats.graph import Graph
        #
        # roc_curve = Graph() \
        #     .name("User Defined: ROC Curve") \
        #     .set_x_series(list(fpr)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(tpr))
        #
        # roc_curve.x_title("False Positive Rate")
        # roc_curve.y_title("True Positive Rate")
        #
        # mlops.set_stat(roc_curve)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        mlops.set_stat(ClassificationMetrics.ROC_CURVE, [tpr, fpr],
                       legend=graph_label_str)

        # OR

        # Third Way
        mlops.metrics.roc_curve(y_true=labels,
                                y_score=labels_decision_score,
                                pos_label=pos_label)
        #################### DONE NEW WAY ####################

    ###############################################################
    #################### End: Output ROC Curve ####################
    ###############################################################

    #####################################################################
    #################### Start: Output Zero One Loss ####################
    #####################################################################

    zol = sklearn.metrics.zero_one_loss(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output zol of the chosen model using MCenter
    # mlops.set_stat("User Defined: Zero One Loss", zol)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.ZERO_ONE_LOSS, data=zol)

    # OR

    # Third Way
    mlops.metrics.zero_one_loss(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ###################################################################
    #################### End: Output Zero One Loss ####################
    ###################################################################

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOps
    mlops.done()
Example 9
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName(
        "RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # parse the arguments to component
    options = parse_args()

    # Load the model, exit gracefully if model is not found
    try:
        model_rf = \
            SparkPipelineModelHelper() \
                .set_shared_context(spark_context=spark.sparkContext) \
                .set_local_path(local_path=options.input_model) \
                .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
                .load_sparkml_model()
    except Exception as e:
        print(e)
        mlops.done()
        spark.sparkContext.stop()
        exit()

    # Generate synthetic data for inference (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))
    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0],
                                               num_samples,
                                               replace=False)]
    feature_names = [
        "".join(ascii_lowercase[a]) for a in range(num_features + 1)
    ]

    # Create a spark dataframe from the synthetic data generated
    inferenceData = spark.createDataFrame(
        pd.DataFrame(test_features, columns=feature_names[1:num_features + 1]))

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and automatically
    # compare them with the distributions reported during training to generate a similarity score
    mlops.set_data_distribution_stat(inferenceData)

    num_samples = inferenceData.count()

    # Report the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples,
                   st.TIME_SERIES)

    # Make inference predictions
    predicted_df = model_rf.transform(inferenceData)

    # Create a bar graph with label and confidence distributions
    histogram_predictions = predicted_df.groupby("prediction").count()
    prediction_values = np.array(
        histogram_predictions.select("prediction").collect())
    prediction_counts = np.array(
        histogram_predictions.select("count").collect())

    # Report label distribution as a BarGraph using MCenter
    bar_predictions = BarGraph().name("Prediction Distribution").cols(
        (prediction_values[0]).astype(str).tolist()).data(
            (prediction_counts[0]).tolist())
    mlops.set_stat(bar_predictions)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
Example 10
def main():
    pm_options = parse_args()
    print("PM: Configuration:")

    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: # Model File:                [{}]".format(
        pm_options.input_model))

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    # Loading the data
    loan_df = pd.read_csv(pm_options.input_file)
    X = loan_df

    # Report the data distribution stats, then clean NAs
    mlops.set_data_distribution_stat(loan_df)
    print("dataset_size = ", loan_df.shape[0])
    print("number of NAs per column = \n", loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows = ", loan_df.shape[0])

    # ## Inference
    pred_labels = final_model.predict(X)
    pred_probs = final_model.predict_proba(X)

    # Prediction distribution and prediction confidence distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Inference Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))
    export_bar_table(pred_label_distribution[:, 0], pred_label_distribution[:,
                                                                            1],
                     "Inference - XGBoost Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".
          format(average_confidence))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence,
                     "Validation - XGBoost Average confidence per class")

    # Feature importance comparison
    export_feature_importance(final_model, list(X.columns), 5, "XGBoost")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(pred_labels == 0)[0]
    y_test1 = np.where(pred_labels == 1)[0]
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))
    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    total_psi, psi_table = get_psi(max_pred_probs[y_test0],
                                   max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total XGBoost PSI values: \n {}".format(total_psi))
    print("XGBoost PSI Stats: \n {}".format(psi_table))
    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    if total_psi >= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # ## Finish the program
    mlops.done()
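Example 10 calls export_bar_table (as well as export_feature_importance and get_psi), which are defined outside the snippet. Judging from the BarGraph pattern used in the other examples, export_bar_table could plausibly be a thin wrapper like this sketch (an assumption, not the original helper):

import numpy as np

from parallelm.mlops import mlops
from parallelm.mlops.stats.bar_graph import BarGraph  # assumed import path


def export_bar_table(bar_names, bar_data, title_name):
    # Hypothetical helper: report a named bar graph to MCenter, mirroring the
    # BarGraph().name().cols().data() pattern used in the other examples.
    bar_graph_data = BarGraph().name(title_name) \
        .cols(np.array(bar_names).astype(str).tolist()) \
        .data(np.array(bar_data).tolist())
    mlops.set_stat(bar_graph_data)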
Example 11
def main():
    pm_options = parse_args()
    print("PM: Configuration:")
    print("PM: Data file:            [{}]".format(pm_options.data_file))
    print("PM: Output model:         [{}]".format(pm_options.output_model))
    print("PM: regularization_range:         [{}]".format(
        pm_options.regularization_range))

    mlops.init()

    # Read the Samsung datafile
    dataset = pd.read_csv(pm_options.data_file)

    # Separate into features and labels
    features = dataset.iloc[:, 1:].values
    labels = dataset.iloc[:, 0].values

    # Hyper-parameter search using k-fold cross-validation
    # Applying k_fold cross validation
    regularization_range = pm_options.regularization_range.split(',')
    regularization = [
        float(regularization_var)
        for regularization_var in regularization_range
    ]
    tune_parameters = [{'C': regularization}]

    # Initialize logistic regression algorithm
    LR = LogisticRegression(class_weight='balanced',
                            multi_class='multinomial',
                            solver='lbfgs')
    clf = GridSearchCV(LR, tune_parameters, cv=5, scoring='accuracy')
    clf.fit(features, labels)
    print("best parameter = ", clf.best_params_)
    accuracy = clf.cv_results_['mean_test_score']
    print(
        'Accuracy values: \n {0} \n for Regularization values: \n{1}'.format(
            accuracy, regularization))

    ########## Start of ParallelM instrumentation ##############
    # Report Hyper-parameter Table
    tbl = Table().name("Hyper-parameter Search Results").cols(
        ["Mean accuracy from k-fold cross-validation"])
    print("length of regularization", len(regularization))
    index_max = np.argmax(accuracy)
    for a in range(0, len(regularization)):
        print("adding row", regularization[a])
        if a == index_max:
            tbl.add_row("[Best] Regularization = " + str(regularization[a]),
                        [accuracy[a]])
        else:
            tbl.add_row("Regularization = " + str(regularization[a]),
                        [accuracy[a]])
    mlops.set_stat(tbl)
    ########## End of ParallelM instrumentation ##############

    # Label distribution in training
    label_distribution = dataset['label'].value_counts()
    column_names = np.array(label_distribution.index).astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    ########## Start of ParallelM instrumentation ##############
    # Report label distribution as a BarGraph
    bar = BarGraph().name("Label Distribution").cols(
        np.array(label_distribution.index).astype(str).tolist()).data(
            label_distribution.values.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ##############

    #################### Start of ParallelM instrumentation ################
    # Report accuracy of the chosen model
    mlops.set_stat("K-fold cross-validation Accuracy", accuracy[index_max],
                   st.TIME_SERIES)
    #################### End of ParallelM instrumentation ################

    # Histogram input
    mlops.set_data_distribution_stat(dataset)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(clf, model_file)
    model_file.close()
    mlops.done()
Example 12
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: C:                           [{}]".format(pm_options.C))
    print("PM: Kernel:                      [{}]".format(pm_options.kernel))
    print("PM: Degree:                      [{}]".format(pm_options.degree))
    print("PM: Gamma:                       [{}]".format(pm_options.gamma))
    print("PM: Tolerance:                   [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:          [{}]".format(pm_options.max_iter))

    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    # Create synthetic data using scikit learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=3,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    # Accuracy for the chosen model
    accuracy = final_model.score(features, labels)
    print("Accuracy values: \n {0}".format(accuracy))

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOps
    mlops.done()
Example 13
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # parse the arguments to component
    options = parse_args()
    print("PM: Configuration:")
    print("PM: Number of trees:                [{}]".format(options.num_trees))
    print("PM: Maximum depth:                  [{}]".format(options.max_depth))
    print("PM: Output model:                   [{}]".format(options.output_model))
    print("PM: Temp shared path:               [{}]".format(options.temp_shared_path))

    # Generate synthetic data using scikit learn
    num_samples = 50
    num_features = 20
    num_classes = 3
    X, y = make_classification(n_samples=num_samples, n_features=num_features, n_informative=2, n_redundant=1,
                               n_classes=num_classes, n_clusters_per_class=1, random_state=42)
    X = X + np.random.uniform(0, 5) * np.random.normal(0, 1, (num_samples, num_features))

    feature_names = ["".join(ascii_lowercase[a]) for a in range(num_features + 1)]
    feature_names[0] = "label"

    # Create a spark dataframe from the synthetic data generated 
    trainingData = spark.createDataFrame(
        pd.DataFrame(np.concatenate((y.reshape(-1, 1), X), axis=1), columns=feature_names))

    # Histogram of label distribution
    value, counts = np.unique(y, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols((label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(trainingData)

    # Fit a random forest classification model
    assembler = VectorAssembler(inputCols=feature_names[1:num_features + 1], outputCol="features")
    classifier = RandomForestClassifier(numTrees=int(options.num_trees), maxDepth=int(options.max_depth))

    pipeline = Pipeline(stages=[assembler, classifier])
    model = pipeline.fit(trainingData)
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute training error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Report accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Save the spark model 
    SparkPipelineModelHelper() \
        .set_shared_context(spark_context=spark.sparkContext) \
        .set_local_path(local_path=options.output_model) \
        .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
        .save_sparkml_model(model)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
Example 14
def get_data_distribution_stat(df_clean):
    """
    Record the data distribution stats for the DataFrame
    """
    mlops.set_data_distribution_stat(df_clean)
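A minimal stand-alone usage sketch for this wrapper (the DataFrame contents and the import path are illustrative assumptions):

import pandas as pd
from parallelm.mlops import mlops  # assumed import path

mlops.init()
df_clean = pd.DataFrame({"f0": [0.1, 0.2, 0.3], "f1": [1, 2, 3]})
get_data_distribution_stat(df_clean)
mlops.done()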
Example 15
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(
                column_names_all[col_index], 'c' + str(col_index))

    input_data = input_data.cache()

    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names,
                                       outputCol="features")
    kmeans_pipe = KMeans(k=int(pm_options.K),
                         initMode="k-means||",
                         initSteps=2,
                         tol=1e-4,
                         maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    # Test validation and statistics collection
    ############################################################
    predicted_df = model_kmeans.transform(input_test)

    print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

    sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
    print("Sum of Errors for Kmeans = " + str(sum_errors))

    # Shows the result.
    kmeans_centers = model_kmeans.stages[1].clusterCenters()
    print("Kmeans Centers: ")
    for center in kmeans_centers:
        print(center)

    # calculating stats
    ############################################################

    # Calculating Inter cluster distance
    inter_cluster_distance = np.zeros(
        (len(kmeans_centers), len(kmeans_centers)))

    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            inter_cluster_distance[centerIndex1, centerIndex2] =\
                eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])

    print("inter_cluster_distance = ", inter_cluster_distance)
    # Calculating Intra cluster distances and the bars for the cluster distribution
    intra_cluster_distance = np.zeros(len(kmeans_centers))
    cluster_dist = np.zeros(len(kmeans_centers))

    for centerIndex1 in range(0, len(kmeans_centers)):
        filtered_df = predicted_df.filter(
            predicted_df["prediction"] == centerIndex1)
        cluster_dist[centerIndex1] = filtered_df.count()
        if cluster_dist[centerIndex1] == 0:
            intra_cluster_distance[centerIndex1] = 0
        else:
            filtered_df =\
                filtered_df.withColumn('distance',
                                       udf(eq_dist, FloatType())(col("features"),
                                            array([lit(v) for v in kmeans_centers[centerIndex1]])))
            intra_cluster_distance[centerIndex1] =\
                filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

    # calculating Davies-Bouldin Index
    ############################################################
    # R[i,j] = (S[i] + S[j])/M[i,j]
    # D[i] = max(R[i,j]) for i !=j
    # DB = (1/K) * sum(D[i])
    r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            r_index[centerIndex1, centerIndex2] = 0
            if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                r_index[centerIndex1, centerIndex2] =\
                    (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2])\
                    / inter_cluster_distance[centerIndex1, centerIndex2]
    d_index = np.max(r_index, axis=0)
    db_index = np.sum(d_index, axis=0) / len(kmeans_centers)

    # pmml model generation
    ############################################################
    pmml_file = toPMMLBytes(spark, input_train, model_kmeans).decode("UTF-8")

    # PM stats
    ############################################################
    print("Sum of Errors for Kmeans = " + str(sum_errors))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index))
    pm.set_stat("Davies-Bouldin index", db_index, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(kmeans_centers)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row(
            str(j) + ":", ["%.2f" % x for x in inter_cluster_distance[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance])
    pm.set_stat(tbl)

    tbl_col_name1 = []
    for j in range(0, len(kmeans_centers[0])):
        tbl_col_name1.append(str(j))
    tbl = Table().name("Centers (for K<6, Attr<11)").cols(tbl_col_name1)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row("center" + str(j) + ":",
                    ["%.2f" % x for x in kmeans_centers[j]])
    pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Destribution").cols(tbl_col_name).data(
        cluster_dist.tolist())
    pm.stat(bar)

    print("PM: generating histogram from data-frame and model")
    print("PM:" + pmml_file)
    try:
        pm.set_data_distribution_stat(data=input_train, model=pmml_file)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.stat")
        print(e)

    return pmml_file
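
The Davies-Bouldin computation above follows the formula given in the comments: R[i,j] = (S[i] + S[j]) / M[i,j], D[i] = max over j != i of R[i,j], DB = (1/K) * sum(D[i]). A standalone NumPy sketch on toy values (illustration only; S and the centers are made-up stand-ins for the intra-cluster distances and cluster centers computed above):

import numpy as np

# Toy illustration of the Davies-Bouldin index
S = np.array([0.5, 0.7, 0.6])                                # intra-cluster avg. distances
centers = np.array([[0.0, 0.0], [3.0, 0.0], [0.0, 4.0]])     # cluster centers
K = len(centers)
M = np.linalg.norm(centers[:, None, :] - centers[None, :, :], axis=-1)  # inter-center distances

R = np.zeros((K, K))
for i in range(K):
    for j in range(K):
        if i != j and M[i, j] != 0:
            R[i, j] = (S[i] + S[j]) / M[i, j]
D = R.max(axis=1)                                            # worst-case ratio per cluster
db_index = D.sum() / K
print("Davies-Bouldin index (toy example):", db_index)
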
Esempio n. 16
0
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()
    print("PM: Configuration:")
    print("PM: Step size:                  [{}]".format(args.step_size))
    print("PM: Iterations:                 [{}]".format(args.iterations))
    print("PM: Model version:              [{}]".format(args.model_version))
    print("PM: Stats interval:             [{}]".format(args.stats_interval))
    print("PM: Save dir:                   [{}]".format(args.save_dir))

    # Initialize MLOps Library
    mlops.init()

    # print the number of iteration used by optimization algorithm
    print('Training for %i iterations' % args.iterations)

    # Create synthetic data using scikit learn
    num_samples = 50
    num_features = 20

    features, labels = make_classification(n_samples=num_samples,
                                           n_features=num_features,
                                           n_informative=2,
                                           n_redundant=1,
                                           n_classes=3,
                                           n_clusters_per_class=1,
                                           random_state=42)

    # Add noise to the data
    noisy_features = np.random.uniform(0, 5) * np.random.normal(
        0, 1, (num_samples, num_features))
    features = features + noisy_features

    num_features = (features.shape[1])
    num_labels = len(np.unique(labels))

    # One-hot encode labels for all data
    onehot_labels = np.eye(num_labels)[labels]

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(features)

    # Algorithm parameters parsed from arguments
    learning_rate = args.step_size
    training_epochs = args.iterations
    display_step = args.stats_interval

    # tf Graph Input
    x = tf.placeholder(tf.float32, [None, num_features], name="features")
    y = tf.placeholder(tf.float32, [None, num_labels], name="labels")

    # Set model weights
    W = tf.Variable(tf.zeros([num_features, num_labels]))
    b = tf.Variable(tf.zeros([num_labels]))

    # Store values for saving model
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')

    # Construct model
    pred = tf.nn.softmax(tf.matmul(x, W) + b, name="predictions")  # Softmax

    # Minimize error using cross entropy
    cost = tf.reduce_mean(-tf.reduce_sum(y *
                                         tf.log(pred), reduction_indices=1))

    # Gradient Descent
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

    # Evaluation
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(pred, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Start timer
    training_start_time = time.time()

    # Initialize the variables in a tf session
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    iteration_array = []
    cost_array = []
    accuracy_array = []

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0
        temp, c, a = sess.run([optimizer, cost, accuracy],
                              feed_dict={
                                  x: features,
                                  y: onehot_labels
                              })
        # Compute average loss
        avg_cost += c / num_samples
        # Display logs per epoch step
        if (epoch + 1) % display_step == 0:
            iteration_array.append(epoch)
            cost_array.append(avg_cost)
            accuracy_array.append(float(a))
            print("accuracy", a)
            print("Epoch:", '%04d' % (epoch + 1), "cost=",
                  "{:.9f}".format(avg_cost))

    # Plot the cost function using MCenter
    gg = Graph().name("Cost function across epochs").set_x_series(
        iteration_array).add_y_series(label="Cost Function Across Iterations",
                                      data=cost_array)
    gg.x_title("Iterations")
    gg.y_title("Average Cost")
    mlops.set_stat(gg)

    # Plot the accuracy function using MCenter
    gg1 = Graph().name("Accuracy across epochs").set_x_series(
        iteration_array).add_y_series(label="Accuracy Across Iterations",
                                      data=accuracy_array)
    gg1.x_title("Iterations")
    gg1.y_title("Accuracy")
    mlops.set_stat(gg1)

    # Plot accuracy and cost across epochs using MCenter
    mg = MultiGraph().name("Cost and Accuracy Progress Across Epochs")
    mg.add_series(x=iteration_array,
                  label="Cost Function Across Iterations",
                  y=cost_array)
    mg.add_series(x=iteration_array,
                  label="Accuracy across epochs",
                  y=accuracy_array)
    mlops.set_stat(mg)

    # Plot final cost and accuracy in this session using MCenter
    mlt = MultiLineGraph().name("Final Accuracy and Cost").labels(
        ["Cost", "Accuracy"])
    mlt.data([cost_array[-1], accuracy_array[-1]])
    mlops.set_stat(mlt)

    # Save the model
    export_path = args.save_dir
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    values, indices = tf.nn.top_k(y, num_labels)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(
        tf.constant([str(i) for i in range(num_labels)]))
    prediction_classes = table.lookup(tf.to_int64(indices))

    # Build the signature_def_map.
    classification_inputs = tf.saved_model.utils.build_tensor_info(
        serialized_tf_example)
    classification_outputs_classes = tf.saved_model.utils.build_tensor_info(
        prediction_classes)
    classification_outputs_scores = tf.saved_model.utils.build_tensor_info(
        values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                classification_inputs
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                classification_outputs_classes,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                classification_outputs_scores
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME
        ))

    tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
    tensor_info_y = tf.saved_model.utils.build_tensor_info(y)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': tensor_info_x},
            outputs={'outputs': tensor_info_y},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    )

    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')

    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images':
            prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save(as_text=args.use_text)
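
The TensorFlow graph above implements plain softmax regression: pred = softmax(xW + b), cost = mean cross entropy, accuracy = fraction of matching argmaxes. A small NumPy sketch of the same forward pass and metrics, useful for checking what the ops compute (all inputs here are random stand-ins):

import numpy as np

# NumPy sketch of the forward pass and loss defined in the TF graph above
num_features, num_labels, num_samples = 20, 3, 5
rng = np.random.RandomState(0)
features = rng.normal(0, 1, (num_samples, num_features))
onehot_labels = np.eye(num_labels)[rng.randint(0, num_labels, num_samples)]
W = rng.normal(0, 0.1, (num_features, num_labels))
b = np.zeros(num_labels)

logits = features.dot(W) + b
pred = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # softmax
cost = np.mean(-np.sum(onehot_labels * np.log(pred), axis=1))       # cross entropy
accuracy = np.mean(pred.argmax(axis=1) == onehot_labels.argmax(axis=1))
print("cost={:.4f}, accuracy={:.2f}".format(cost, accuracy))
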
Esempio n. 17
0
def main():
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    if args.training_iteration <= 0:
        print('Please specify a positive value for training iteration.')
        sys.exit(-1)

    # Read the train and test data sets
    mnist = mnist_input_data.read_data_sets(args.input_cache_dir, one_hot=True)

    ## MLOps start
    # Initialize the mlops library
    mlops.init()

    # Report the feature distribution for the training data
    train_images = mnist.train.images
    mlops.set_data_distribution_stat(train_images)

    # Initialize a table to track training accuracy and cost
    train_table = Table().name("Training Stats").cols(["Accuracy", "Cost"])
    ## MLOps end

    # Create the model
    sess = tf.InteractiveSession()
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
    feature_configs = {
        'x': tf.FixedLenFeature(shape=[784], dtype=tf.float32),
    }
    tf_example = tf.parse_example(serialized_tf_example, feature_configs)
    x = tf.identity(tf_example['x'],
                    name='x')  # use tf.identity() to assign name
    y_ = tf.placeholder('float', shape=[None, 10])
    w = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    sess.run(tf.global_variables_initializer())
    y = tf.nn.softmax(tf.matmul(x, w) + b, name='y')

    # Set the cost function and optimizer
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(
        cross_entropy)
    values, indices = tf.nn.top_k(y, 10)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(
        tf.constant([str(i) for i in range(10)]))
    prediction_classes = table.lookup(tf.to_int64(indices))

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Train the model
    print('Training model...')
    for i in range(args.training_iteration):
        batch = mnist.train.next_batch(50)
        _, train_cost, train_acc = sess.run(
            [train_step, cross_entropy, accuracy],
            feed_dict={
                x: batch[0],
                y_: batch[1]
            })

        # Display stats
        if (i + 1
            ) % args.display_step == 0 or i + 1 == args.training_iteration:
            # Report training accuracy and cost

            print("Training. step={}, accuracy={}, cost={}".format(
                i + 1, train_acc, train_cost))

            # MLOps start
            # multiply by 1 to convert into double
            train_table.add_row("Iterations: {}".format(i + 1),
                                [train_acc * 100, train_cost * 1])
            mlops.set_stat(train_table)
            # MLOps end

    print('Done training!')

    # Report final cost and accuracy on test set
    test_cost, test_acc = sess.run([cross_entropy, accuracy],
                                   feed_dict={
                                       x: mnist.test.images,
                                       y_: mnist.test.labels
                                   })
    print("Testing. accuracy={}, cost={}".format(test_acc, test_cost))

    ## MLOps start
    acc_table = Table().name("Test Accuracy").cols(["Accuracy"])
    acc_table.add_row("Total iterations: {}".format(args.training_iteration),
                      [test_acc])
    mlops.set_stat(acc_table)

    # Release mlops resources
    mlops.done()
    ## MLOps end

    # Export the trained model so it can be used for inference
    # WARNING(break-tutorial-inline-code): The following code snippet is
    # in-lined in tutorials, please update tutorial documents accordingly
    # whenever code changes.
    export_path = args.save_dir
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    # Build the signature_def_map.
    classification_inputs = tf.saved_model.utils.build_tensor_info(
        serialized_tf_example)
    classification_outputs_classes = tf.saved_model.utils.build_tensor_info(
        prediction_classes)
    classification_outputs_scores = tf.saved_model.utils.build_tensor_info(
        values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                classification_inputs
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                classification_outputs_classes,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                classification_outputs_scores
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME
        ))

    tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
    tensor_info_y = tf.saved_model.utils.build_tensor_info(y)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': tensor_info_x},
            outputs={'outputs': tensor_info_y},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    )

    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images':
            prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save()

    print('Done exporting!')
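
The SavedModel exported above can be reloaded for ad-hoc checks. A hedged sketch, assuming TensorFlow 1.x and the tensor names ("x:0", "y:0") defined in the graph above; the export path is a hypothetical stand-in for args.save_dir, and in practice the model would normally be served rather than reloaded like this:

import numpy as np
import tensorflow as tf

# Sketch: reload the exported SavedModel and run the raw prediction tensor
export_path = "/tmp/mnist_model"   # hypothetical path standing in for args.save_dir
with tf.Session(graph=tf.Graph()) as sess:
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_path)
    x = sess.graph.get_tensor_by_name("x:0")
    y = sess.graph.get_tensor_by_name("y:0")
    probs = sess.run(y, feed_dict={x: np.zeros((1, 784), dtype=np.float32)})
    print("Predicted class:", probs.argmax(axis=1))
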
Esempio n. 18
0
def main():
    pm_options = parse_args()
    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: # Model File:                [{}]".format(
        pm_options.input_model))

    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data  # use all columns as features

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        # Add random noise to the data randomly
        import random
        if random.randint(1, 21) % 2 == 0:
            print("Adding Random Noise!")

            noisy_features = np.random.uniform(0, 1) * \
                             np.random.normal(0, 1,
                                              (num_samples, num_features))
            X = X + noisy_features

    # Separate into features and labels
    features = X

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and compare it automatically with the ones
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features),
                   st.TIME_SERIES)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    print("Pred Labels: ", pred_labels)  # Remove printout can be huge
    print("Pred Probabilities: ", pred_probs)  # Remove printout can be huge

    # Pred Label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Pred Label distributions: \n {0}".format(pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
            (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Pred Label confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        print(" np.sum(confidence[index_class])",
              np.sum(max_pred_probs[index_class]))
        print("counts_elements[i] ", pred_counts[i])
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(
        pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[pred_labels == 1],
                  max_pred_probs[pred_labels == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    if not np.isnan(ks_stat):
        print("printing KS_stat ")
        mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)
    else:
        print("not printing KS_stat ")

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[pred_labels == 1],
                                   max_pred_probs[pred_labels == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Terminate MLOPs
    mlops.done()
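
get_psi() is a project helper whose implementation is not shown in this listing. The Population Stability Index it reports is conventionally computed per score bucket as (curr% - base%) * ln(curr% / base%), summed over buckets. A minimal NumPy sketch of that convention follows; it is an assumption about the standard formula, not the project's actual get_psi:

import numpy as np

def psi_sketch(base_scores, curr_scores, num_bins=10, eps=1e-6):
    """Minimal PSI sketch: sum((curr% - base%) * ln(curr% / base%)) over buckets.
    Hypothetical stand-in, NOT the project's get_psi helper."""
    edges = np.linspace(min(base_scores.min(), curr_scores.min()),
                        max(base_scores.max(), curr_scores.max()), num_bins + 1)
    base_pct = np.histogram(base_scores, bins=edges)[0] / float(len(base_scores))
    curr_pct = np.histogram(curr_scores, bins=edges)[0] / float(len(curr_scores))
    base_pct = np.clip(base_pct, eps, None)   # avoid log(0) / divide-by-zero
    curr_pct = np.clip(curr_pct, eps, None)
    return np.sum((curr_pct - base_pct) * np.log(curr_pct / base_pct))

base = np.random.beta(2, 5, 1000)   # stand-in for the baseline score distribution
curr = np.random.beta(2, 4, 1000)   # stand-in for the current score distribution
print("PSI (sketch):", psi_sketch(base, curr))
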
Esempio n. 19
0
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # Validation Split:          [{}]".format(
        pm_options.validation_split))

    print("PM: # AUC Threshold:             [{}]".format(
        pm_options.auc_threshold))
    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Estimators:                [{}]".format(
        pm_options.n_estimators))
    print("PM: # Max Depth:                 [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate:             [{}]".format(
        pm_options.learning_rate))
    print("PM: # Min Child Weight:          [{}]".format(
        pm_options.min_child_weight))
    print("PM: # Objective:                 [{}]".format(pm_options.objective))
    print("PM: # Gamma:                     [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step:            [{}]".format(
        pm_options.max_delta_step))
    print("PM: # Subsample:                 [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha:                 [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda:                [{}]".format(
        pm_options.reg_lambda))
    print("PM: # Scale Pos Weight:          [{}]".format(
        pm_options.scale_pos_weight))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data[:, 1:]  # select columns 1 through end
        y = data[:, 0]

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        print("Adding Random Noise!")

        noisy_features = np.random.uniform(0, 1) * \
                         np.random.normal(0, 1,
                                          (num_samples, num_features))
        X = X + noisy_features

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(pm_options.validation_split), random_state=42)

    import xgboost as xgb

    # Create a model that should be deployed into production
    final_model = xgb.XGBClassifier(
        max_depth=int(pm_options.max_depth),
        min_child_weight=int(pm_options.min_child_weight),
        learning_rate=float(pm_options.learning_rate),
        n_estimators=int(pm_options.n_estimators),
        silent=True,
        objective=str(pm_options.objective),
        gamma=float(pm_options.gamma),
        max_delta_step=int(pm_options.max_delta_step),
        subsample=float(pm_options.subsample),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(pm_options.reg_alpha),
        reg_lambda=float(pm_options.reg_lambda),
        scale_pos_weight=float(pm_options.scale_pos_weight),
        seed=1,
        missing=None)

    final_model.fit(X_train, y_train)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(X_train)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    print("Pred Labels: ", pred_labels)
    print("Pred Probabilities: ", pred_probs)

    accuracy = accuracy_score(y_test, pred_labels)
    print("Accuracy values: \n {0}".format(accuracy))
    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    # column_names = value.astype(str).tolist()
    print("Validation Actual Label distributions: \n {0}".format(
        label_distribution))

    # Output Label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Validation Actual Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Validation Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name(
        "Validation Prediction Label Distributions").cols(
            (pred_label_distribution[:, 0]).astype(str).tolist()).data(
                (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # ROC for the chosen model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("ROC AUC values: \n {}".format(roc_auc))

    #     Output ROC of the chosen model using MCenter
    mlops.set_stat("ROC AUC", roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(
                min_auc_requirement, roc_auc))

    # ROC Curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name(
        "Receiver Operating Characteristic ").set_continuous()
    cg.add_series(label='Random Curve ' '', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='ROC Curve (Area = {0:0.2f})'
                  ''.format(roc_auc),
                  x=fpr.tolist(),
                  y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    max_pred_probs = pred_probs.max(axis=1)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test == 1],
                                   max_pred_probs[y_test == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()
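
The KS statistic reported above (here and in the inference example) is the maximum distance between the empirical CDFs of the prediction scores of the two classes. A small sketch showing ks_2samp on stand-in score arrays, with a manual ECDF check of the same quantity:

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.RandomState(0)
scores_pos = rng.beta(5, 2, 1000)   # stand-in for max_pred_probs[y_test == 1]
scores_neg = rng.beta(2, 5, 1000)   # stand-in for max_pred_probs[y_test == 0]

ks = ks_2samp(scores_pos, scores_neg)
print("KS statistic={:.3f}, pValue={:.3g}".format(ks.statistic, ks.pvalue))

# Manual check: maximum distance between the two empirical CDFs
grid = np.sort(np.concatenate([scores_pos, scores_neg]))
cdf_pos = np.searchsorted(np.sort(scores_pos), grid, side="right") / float(len(scores_pos))
cdf_neg = np.searchsorted(np.sort(scores_neg), grid, side="right") / float(len(scores_neg))
print("Max CDF distance:", np.abs(cdf_pos - cdf_neg).max())
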
Esempio n. 20
0
    def _prep_and_train(self, df_dataset):
        self.min_auc_requirement = self._params["auc_threshold"]
        self.max_ks_requirement = self._params["ks_threshold"]
        self.min_psi_requirement = self._params["psi_threshold"]
        train_on_col = self._params["train_on_column"]

        #mlops Init
        mlops.init()

        y = df_dataset[train_on_col]
        self._logger.info("train_on_col= {}".format(train_on_col))
        self._logger.info("df_dataset {}".format(df_dataset.shape[1]))
        X = df_dataset.drop(train_on_col, axis=1)
        mlops.set_data_distribution_stat(X)
        self._logger.info("df_dataset {}".format(X.shape[1]))

        # Splitting the data to train and test sets:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self._params["validation_split"], random_state=42)
        All_columns = X_train.columns.tolist()
        categorical_columns = self._params["categorical_cols"]
        mapper_list = []
        for d in All_columns:
            if d in categorical_columns:
                mapper_list.append(
                    ([d], OneHotEncoder(handle_unknown='ignore')))
            else:
                mapper_list.append(([d], MinMaxScaler()))

        mapper = DataFrameMapper(mapper_list)

        ## Training
        # XGBoost Training:
        n_cpu = multiprocessing.cpu_count()

        xgboost_model = xgb.XGBClassifier(
            max_depth=int(self._params["max_depth"]),
            min_child_weight=int(self._params["min_child_weight"]),
            learning_rate=float(self._params["learning_rate"]),
            n_estimators=int(self._params["n_estimators"]),
            silent=True,
            objective=self._params["objective"],
            gamma=float(self._params["gamma"]),
            max_delta_step=int(self._params["max_delta_step"]),
            subsample=float(self._params["subsample"]),
            colsample_bytree=1,
            colsample_bylevel=1,
            reg_alpha=float(self._params["reg_alpha"]),
            reg_lambda=float(self._params["reg_lambda"]),
            scale_pos_weight=float(self._params["scale_pos_weight"]),
            seed=1,
            n_jobs=n_cpu,
            missing=None)

        final_model = Pipeline([("mapper", mapper),
                                ("xgboost", xgboost_model)])
        final_model.fit(X_train, y_train)

        # Prediction and prediction distribution
        pred_labels = final_model.predict(X_test)
        pred_probs = final_model.predict_proba(X_test)

        # Accuracy calculation
        # Accuracy for the xgboost model
        accuracy = accuracy_score(y_test, pred_labels)
        self._logger.info("XGBoost Accuracy value: {0}".format(accuracy))
        #     Output accuracy of the chosen model using MCenter
        mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

        # Label distribution:
        # Label distribution in training
        value, counts = np.unique(y_test, return_counts=True)
        label_distribution = np.asarray((value, counts)).T
        self._logger.info(
            "Validation Actual Label distributions: \n {0}".format(
                label_distribution))
        # Output Label distribution as a BarGraph using MCenter
        export_bar_table(label_distribution[:, 0], label_distribution[:, 1],
                         "Validation - Actual Label Distribution")

        # Prediction distribution and prediction confidence distribution
        # Pred Label distribution in training
        pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
        pred_label_distribution = np.asarray((pred_value, pred_counts)).T
        self._logger.info(
            "XGBoost Validation Prediction Label Distributions: \n {0}".format(
                pred_label_distribution))
        # Output Pred label distribution as a BarGraph using MCenter
        export_bar_table(pred_label_distribution[:, 0],
                         pred_label_distribution[:, 1],
                         "Validation - XGBoost Prediction Distribution")

        # Pred confidence per label
        label_number = len(pred_counts)
        average_confidence = np.zeros(label_number)
        max_pred_probs = pred_probs.max(axis=1)
        for i in range(0, label_number):
            index_class = np.where(pred_labels == i)[0]
            if pred_counts[i] > 0:
                average_confidence[i] = np.sum(
                    max_pred_probs[index_class]) / (float(pred_counts[i]))
            else:
                average_confidence[i] = 0
        self._logger.info(
            "XGBoost Validation Average Prediction confidence per label: \n {0}"
            .format(average_confidence))

        # Output Pred label distribution as a BarGraph using MCenter
        export_bar_table(pred_value, average_confidence,
                         "Validation - XGBoost Average confidence per class")

        # Confusion Matrix
        # XGBoost Confusion Matrix
        confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
        self._logger.info(
            "Confusion Matrix for XGBoost: \n {0}".format(confmat))
        # Output Confusion Matrix as a Table using MCenter
        export_confusion_table(confmat, "XGBoost")

        # Classification Report
        # XGBoost Classification Report
        class_rep = classification_report(y_true=y_test,
                                          y_pred=pred_labels,
                                          output_dict=True)
        self._logger.info(
            "XGBoost Classification Report: \n {0}".format(class_rep))

        # AUC and ROC Curves
        # ROC for XGBoost model
        roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
        self._logger.info("XGBoost ROC AUC value: {}".format(roc_auc))

        # Output ROC of the chosen model using MCenter
        mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)

        if roc_auc <= self.min_auc_requirement:
            mlops.health_alert(
                "[Training] AUC Violation From Training Node",
                "AUC Went Below {}. Current AUC Is {}".format(
                    self.min_auc_requirement, roc_auc))

        # ROC curve
        fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])

        cg = MultiGraph().name(
            "Receiver Operating Characteristic ").set_continuous()
        cg.add_series(label='Random curve ' '', x=fpr.tolist(), y=fpr.tolist())
        cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'
                      ''.format(roc_auc),
                      x=fpr.tolist(),
                      y=tpr.tolist())
        cg.x_title('False Positive Rate')
        cg.y_title('True Positive Rate')
        mlops.set_stat(cg)

        # Feature importance comparison
        # XGBoost Feature importance
        export_feature_importance(final_model, list(X_train.columns), 5,
                                  "XGBoost")

        # KS Analysis
        max_pred_probs = pred_probs.max(axis=1)
        y_test0 = np.where(y_test == 0)[0]
        y_test1 = np.where(y_test == 1)[0]

        # KS for the XGBoost model
        ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
        ks_stat = ks.statistic
        ks_pvalue = ks.pvalue
        self._logger.info(
            "KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
                ks_stat, ks_pvalue))

        # Output KS Stat of the chosen model using MCenter
        mlops.set_stat("KS Stats for CGBoost", ks_stat, st.TIME_SERIES)

        # raising alert if ks-stat goes above required threshold
        if ks_stat >= self.max_ks_requirement:
            mlops.health_alert(
                "[Training] KS Violation From Training Node",
                "KS Stat Went Above {}. Current KS Stat Is {}".format(
                    self.max_ks_requirement, ks_stat))

        ks_table = Table().name("KS Stats for XGBoost").cols(
            ["Statistic", "pValue"])
        ks_table.add_row([ks_stat, ks_pvalue])
        mlops.set_stat(ks_table)

        # PSI Analysis
        # Calculating PSI
        total_psi, psi_table = get_psi(self, max_pred_probs[y_test0],
                                       max_pred_probs[y_test1])
        psi_table_stat = Table().name("PSI Stats for XGBoost").cols([
            "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
            "Base Percent", "Curr Percent", "Segment PSI"
        ])
        row_num = 1
        for each_value in psi_table.values:
            str_values = [str(i) for i in each_value]
            psi_table_stat.add_row(str(row_num), str_values)
            row_num += 1
        mlops.set_stat(psi_table_stat)
        self._logger.info("Total XGBoost PSI values: \n {}".format(total_psi))
        #     Output Total PSI of the chosen model using MCenter
        mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)

        if total_psi <= self.min_psi_requirement:
            mlops.health_alert(
                "[Training] PSI Violation From Training Node",
                "PSI Went Below {}. Current PSI Is {}".format(
                    self.min_psi_requirement, total_psi))

        # ## Save the XGBoost Model
        model_file = open(self._params["output-model"], 'wb')
        pickle.dump(final_model, model_file)
        model_file.close()

        # ## Finish the program
        mlops.done()

        return (model_file)
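
export_bar_table, export_confusion_table and export_feature_importance are project helpers defined outside this listing. Based on the BarGraph pattern used throughout this document, export_bar_table plausibly looks like the sketch below; the function body and the import paths are assumptions, not the project's actual helper:

from parallelm.mlops import mlops                       # assumed import paths,
from parallelm.mlops.stats.bar_graph import BarGraph    # mirroring the examples above

def export_bar_table_sketch(cols, data, title):
    """Hypothetical stand-in for export_bar_table: wraps the BarGraph
    reporting pattern used throughout this document."""
    bar = BarGraph().name(title) \
        .cols([str(c) for c in cols]) \
        .data([float(d) for d in data])
    mlops.set_stat(bar)
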
Esempio n. 21
0
    def _prep_and_infer(self, df_dataset):
        # Get number of features
        self.num_features = df_dataset.shape[1]
        # Get number of samples
        self.num_samples = df_dataset.shape[0]
        #get input model
        self.input_model = self._params["input-model"]

        self._logger.info("PM: Configuration:")
        self._logger.info("PM: # Sample:                    [{}]".format(
            self.num_samples))
        self._logger.info("PM: # Features:                  [{}]".format(
            self.num_features))
        self._logger.info("PM: # Input-Model:               [{}]".format(
            self.input_model))

        # Initialize MLOps Library
        mlops.init()
        # Load the model
        if self.input_model is not None:
            try:
                filename = self._params["input-model"]
                model_file_obj = open(filename, 'rb')
                mlops.set_stat("# Model Files Used", 1)
            except Exception as e:
                #self._logger.error("Model Not Found")
                self._logger.error("Got Exception: {}".format(e))
                mlops.set_stat("# Model Files Used", 0)
                mlops.done()
                return 0

        final_model = pickle.load(model_file_obj)
        features = df_dataset

        # Output Health Statistics to MCenter
        # MLOps API to report the distribution statistics of each feature in the data
        # and compare it automatically with the ones
        mlops.set_data_distribution_stat(features)

        # Output the number of samples being processed using MCenter
        mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features),
                       st.TIME_SERIES)

        # Accuracy for the chosen model
        pred_labels = final_model.predict(features)
        pred_probs = final_model.predict_proba(features)

        self._logger.info("Pred Labels: {}".format(
            pred_labels))  # Remove printout can be huge
        self._logger.info("Pred Probabilities: {}".format(
            pred_probs))  # Remove printout can be huge

        # Pred Label distribution
        pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
        pred_label_distribution = np.asarray((pred_value, pred_counts)).T
        # pred_column_names = pred_value.astype(str).tolist()
        self._logger.info(
            "Pred Label distributions: \n {}".format(pred_label_distribution))

        # Output Pred label distribution as a BarGraph using MCenter
        pred_bar = BarGraph().name("Pred Label Distribution").cols(
            (pred_label_distribution[:, 0]).astype(str).tolist()).data(
                (pred_label_distribution[:, 1]).tolist())
        mlops.set_stat(pred_bar)

        # Pred Label confidence per label
        label_number = len(pred_counts)
        average_confidence = np.zeros(label_number)
        max_pred_probs = pred_probs.max(axis=1)
        for i in range(0, label_number):
            index_class = np.where(pred_labels == i)[0]
            self._logger.info("np.sum(confidence[index_class]) {}".format(
                np.sum(max_pred_probs[index_class])))
            self._logger.info("counts_elements[i] {}".format(pred_counts[i]))
            if pred_counts[i] > 0:
                average_confidence[i] = np.sum(
                    max_pred_probs[index_class]) / (float(pred_counts[i]))
            else:
                average_confidence[i] = 0

        # BarGraph showing confidence per class
        pred_values1 = [str(i) for i in pred_value]
        bar = BarGraph().name("Average Confidence Per Class").cols(
            pred_values1).data(average_confidence.tolist())
        mlops.set_stat(bar)
        # Terminate MLOPs
        mlops.done()

        df_result = pd.concat([
            df_dataset,
            pd.DataFrame({'predict': pred_labels}),
            pd.DataFrame({
                'probs-0': pred_probs[:, 0],
                'probs-1': pred_probs[:, 1]
            })
        ],
                              axis=1)

        df_result.insert(0,
                         'idx', [x for x in range(1, df_result.shape[0] + 1)],
                         allow_duplicates=False)

        return df_result
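
The per-class average-confidence loop above (and the identical loop in the earlier inference example) averages the maximum predicted probability over the samples assigned to each predicted class. A more compact NumPy equivalent, shown on hypothetical stand-in arrays:

import numpy as np

rng = np.random.RandomState(0)
pred_probs = rng.dirichlet([1, 1], size=100)   # stand-in 2-class probabilities
pred_labels = pred_probs.argmax(axis=1)
max_pred_probs = pred_probs.max(axis=1)

# Average of the winning-class probability, per predicted class
pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
sums = np.array([max_pred_probs[pred_labels == v].sum() for v in pred_value])
average_confidence = sums / pred_counts
print("Average confidence per class:", average_confidence)
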
Esempio n. 22
0
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))
    print("PM: # Classes:                   [{}]".format(
        pm_options.num_cluster))

    print("PM: Init:                        [{}]".format(pm_options.init))
    print("PM: N Init:                      [{}]".format(pm_options.n_init))
    print("PM: Tolerance:                   [{}]".format(pm_options.tol))
    print("PM: Maximum Iterations:          [{}]".format(pm_options.max_iter))
    print("PM: Pre-Compute Distances:       [{}]".format(
        pm_options.precompute_distances))
    print("PM: Algorithm:                   [{}]".format(pm_options.algorithm))

    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    n_samples = int(pm_options.num_samples)
    n_features = int(pm_options.num_features)
    n_clusters = int(pm_options.num_cluster)

    init = str(pm_options.init)
    n_init = int(pm_options.n_init)
    max_iter = int(pm_options.max_iter)
    tol = float(pm_options.tol)
    precompute_distances = str(pm_options.precompute_distances)
    algorithm = str(pm_options.algorithm)
    verbose = 0
    n_jobs = 1

    # Create synthetic data using scikit learn
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=10,
                               n_redundant=1,
                               n_classes=n_clusters,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels_true = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (n_samples, n_features))
    features = features + noisy_features

    kmeans_model = KMeans(n_clusters=n_clusters,
                          init=init,
                          n_init=n_init,
                          max_iter=max_iter,
                          tol=tol,
                          precompute_distances=precompute_distances,
                          verbose=verbose,
                          random_state=None,
                          copy_x=True,
                          n_jobs=n_jobs,
                          algorithm=algorithm).fit(features, labels_true)

    mlops.set_stat("User Defined: Training Inertia", kmeans_model.inertia_)
    mlops.set_stat("User Defined: Training Iteration", kmeans_model.n_iter_)

    value, counts = np.unique(labels_true, return_counts=True)
    label_distribution = np.asarray((value, counts)).T

    # Output actual label distribution as a BarGraph using MCenter
    bar_true = BarGraph().name("User Defined: Actual Label Distribution") \
        .cols((label_distribution[:, 0]).astype(str).tolist()) \
        .data((label_distribution[:, 1]).tolist())
    mlops.set_stat(bar_true)

    # prediction labels
    labels_pred = kmeans_model.predict(features)

    value_pred, counts_pred = np.unique(labels_pred, return_counts=True)
    label_distribution_pred = np.asarray((value_pred, counts_pred)).T

    # Output prediction label distribution as a BarGraph using MCenter
    bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \
        .data((label_distribution_pred[:, 1]).tolist())
    mlops.set_stat(bar_pred)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    ###########################################################################
    #################### Start: Adjusted Mutual Info Score ####################
    ###########################################################################

    adjusted_mutual_info_score = sklearn.metrics \
        .adjusted_mutual_info_score(labels_true=labels_true,
                                    labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Adjusted Mutual Info Score", adjusted_mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.ADJUSTED_MUTUAL_INFO_SCORE,
                   adjusted_mutual_info_score)

    # OR

    # Third Way
    mlops.metrics.adjusted_mutual_info_score(labels_true=labels_true,
                                             labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #########################################################################
    #################### End: Adjusted Mutual Info Score ####################
    #########################################################################

    ####################################################################
    #################### Start: Adjusted Rand Score ####################
    ####################################################################

    adjusted_rand_score = sklearn.metrics \
        .adjusted_rand_score(labels_true=labels_true,
                             labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Adjusted Rand Score", adjusted_rand_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.ADJUSTED_RAND_SCORE, adjusted_rand_score)

    # OR

    # Third Way
    mlops.metrics.adjusted_rand_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##################################################################
    #################### End: Adjusted Rand Score ####################
    ##################################################################

    #######################################################################
    #################### Start: Calinski Harabaz Score ####################
    #######################################################################

    calinski_harabaz_score = sklearn.metrics \
        .calinski_harabaz_score(X=features, labels=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Calinski Harabaz Score", calinski_harabaz_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.CALINSKI_HARABAZ_SCORE,
                   calinski_harabaz_score)

    # OR

    # Third Way
    mlops.metrics.calinski_harabaz_score(X=features, labels=labels_pred)
    #################### DONE NEW WAY ####################

    #####################################################################
    #################### End: Calinski Harabaz Score ####################
    #####################################################################

    ###################################################################
    #################### Start: Completeness Score ####################
    ###################################################################

    completeness_score = sklearn.metrics \
        .completeness_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Completeness Score", completeness_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.COMPLETENESS_SCORE, completeness_score)

    # OR

    # Third Way
    mlops.metrics.completeness_score(labels_true=labels_true,
                                     labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Completeness Score ####################
    #################################################################

    ###################################################################
    #################### Start: Contingency Matrix ####################
    ###################################################################

    contingency_matrix = sklearn.metrics.cluster \
        .contingency_matrix(labels_true, labels_pred)

    # Lists of the sorted unique labels, e.g. [0, 1, 2, ...]
    pred_labels_list = sorted(set(labels_pred))
    true_labels_list = sorted(set(labels_true))
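    # For intuition (illustrative values, not computed by this script):
    # contingency_matrix([0, 0, 1, 1], [0, 0, 1, 0]) returns
    #     [[2, 0],
    #      [1, 1]]
    # Rows follow the sorted true labels, columns the sorted predicted labels,
    # and each cell counts the samples falling into that (true, predicted) pair.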

    #################### OLD WAY ####################
    # First Way
    # from parallelm.mlops.stats.table import Table
    #
    # cm_cols_ordered_string = [str(i) for i in pred_labels_list]
    # cm_rows_ordered_string = [str(i) for i in true_labels_list]
    # cm_matrix = Table().name("User Defined: Contingency Matrix").cols(cm_cols_ordered_string)
    #
    # for index in range(len(contingency_matrix)):
    #     cm_matrix.add_row(cm_rows_ordered_string[index], list(contingency_matrix[index]))
    #
    # mlops.set_stat(cm_matrix)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.CONTINGENCY_MATRIX,
                   data=contingency_matrix,
                   true_labels=true_labels_list,
                   pred_labels=pred_labels_list)

    # OR

    # Third Way
    mlops.metrics.cluster.contingency_matrix(labels_true, labels_pred)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Contingency Matrix ####################
    #################################################################

    ######################################################################
    #################### Start: Fowlkes Mallows Score ####################
    ######################################################################

    fowlkes_mallows_score = \
        sklearn.metrics.fowlkes_mallows_score(labels_true=labels_true,
                                              labels_pred=labels_pred,
                                              sparse=False)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Fowlkes Mallows Score", fowlkes_mallows_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.FOWLKES_MALLOWS_SCORE,
                   fowlkes_mallows_score)

    # OR

    # Third Way
    mlops.metrics.fowlkes_mallows_score(labels_true=labels_true,
                                        labels_pred=labels_pred,
                                        sparse=False)
    #################### DONE NEW WAY ####################

    ####################################################################
    #################### End: Fowlkes Mallows Score ####################
    ####################################################################

    #####################################################################################
    #################### Start: Homogeneity, Completeness, V Measure ####################
    #####################################################################################

    homogeneity, completeness, v_measure = sklearn.metrics \
        .homogeneity_completeness_v_measure(labels_true=labels_true, labels_pred=labels_pred)
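    # The three values are related: with its default beta of 1 the V measure is
    # the harmonic mean of homogeneity and completeness, i.e.
    # v_measure == 2 * homogeneity * completeness / (homogeneity + completeness)
    # whenever homogeneity + completeness > 0.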
    #################### OLD WAY ####################
    # First Way
    # multiline_object = MultiLineGraph() \
    #     .name("User Defined: Homogeneity - Completeness - V Measure") \
    #     .labels(["Homogeneity", "Completeness", "V Measure"])
    #
    # multiline_object.data([homogeneity, completeness, v_measure])
    #
    # mlops.set_stat(multiline_object)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.HOMOGENEITY_COMPLETENESS_V_MEASURE,
                   data=[homogeneity, completeness, v_measure])

    # OR

    # Third Way
    mlops.metrics \
        .homogeneity_completeness_v_measure(labels_true=labels_true, labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ###################################################################################
    #################### End: Homogeneity, Completeness, V Measure ####################
    ###################################################################################

    ##################################################################
    #################### Start: Homogeneity Score ####################
    ##################################################################

    homogeneity_score = sklearn.metrics \
        .homogeneity_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Homogeneity Score", homogeneity_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.HOMOGENEITY_SCORE, homogeneity_score)

    # OR

    # Third Way
    mlops.metrics \
        .homogeneity_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Homogeneity Score ####################
    ################################################################

    ##################################################################
    #################### Start: Mutual Info Score ####################
    ##################################################################

    mutual_info_score = sklearn.metrics \
        .mutual_info_score(labels_true=labels_true, labels_pred=labels_pred, contingency=None)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Mutual Info Score", mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.MUTUAL_INFO_SCORE, mutual_info_score)

    # OR

    # Third Way
    mlops.metrics \
        .mutual_info_score(labels_true=labels_true, labels_pred=labels_pred, contingency=None)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Mutual Info Score ####################
    ################################################################

    #############################################################################
    #################### Start: Normalized Mutual Info Score ####################
    #############################################################################

    normalized_mutual_info_score = sklearn.metrics \
        .normalized_mutual_info_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Normalized Mutual Info Score", normalized_mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.NORMALIZED_MUTUAL_INFO_SCORE,
                   normalized_mutual_info_score)

    # OR

    # Third Way
    mlops.metrics \
        .normalized_mutual_info_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ###########################################################################
    #################### End: Normalized Mutual Info Score ####################
    ###########################################################################

    #################################################################
    #################### Start: Silhouette Score ####################
    #################################################################

    silhouette_score = sklearn.metrics \
        .silhouette_score(X=features, labels=labels_pred, metric="euclidean", sample_size=None, random_state=None)
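    # For reference: each sample's silhouette is (b - a) / max(a, b), where a is
    # its mean distance to the points in its own cluster and b its mean distance
    # to the nearest other cluster; silhouette_score averages this over all
    # samples, so values close to 1 indicate well-separated clusters.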
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Silhouette Score", silhouette_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.SILHOUETTE_SCORE, silhouette_score)

    # OR

    # Third Way
    mlops.metrics \
        .silhouette_score(X=features, labels=labels_pred, metric="euclidean", sample_size=None, random_state=None)
    #################### DONE NEW WAY ####################

    ###############################################################
    #################### End: Silhouette Score ####################
    ###############################################################

    ################################################################
    #################### Start: V Measure Score ####################
    ################################################################

    v_measure_score = sklearn.metrics.v_measure_score(labels_true=labels_true,
                                                      labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: V Measure Score", v_measure_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.V_MEASURE_SCORE, v_measure_score)

    # OR

    # Third Way
    mlops.metrics \
        .v_measure_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: V Measure Score ####################
    ##############################################################

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(kmeans_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()
Example n. 23
0
def main(args):
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    features = test_data[np.random.choice(test_data.shape[0],
                                          num_samples,
                                          replace=False)]

    # Start tensorflow session
    sess = tf.InteractiveSession()
    tag_set = ["serve"]
    if args.model_dir is not None:
        try:
            print("args.model_dir = ", args.model_dir)
            tf.saved_model.loader.load(sess, tag_set, args.model_dir)
        except Exception as e:
            print("Model not found")
            print("Got exception: " + str(e))
            return 0

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and compare them
    # automatically with the ones reported during training to generate the similarity score.
    mlops.set_data_distribution_stat(data=features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features))

    graph = tf.get_default_graph()
    x = graph.get_tensor_by_name("features:0")
    y_pred = graph.get_tensor_by_name("predictions:0")
    predictions = sess.run(y_pred, {x: features})
    print('predictions', np.array(predictions))

    # Output prediction distribution as a BarGraph using MCenter
    predict_int = np.argmax(predictions, axis=1)
    unique, counts = np.unique(predict_int, return_counts=True)
    counts = list(map(int, counts))
    x_series = list(map(str, unique))
    mlt = BarGraph().name("Prediction Distribution").cols(x_series).data(
        list(counts))
    mlops.set_stat(mlt)

    # Show the average prediction probability for each predicted label.
    # Iterating over the labels actually present keeps the values aligned with
    # x_series even if some class index never appears in the predictions.
    num_labels = len(unique)
    probability = np.zeros((num_labels, ))
    for i, label in enumerate(unique):
        temp = predictions[predict_int == label, :]
        probability[i] = np.mean(temp[:, label])
    print("probability", list(np.squeeze(probability)))

    # Plot average probability in each class using MCenter
    bg = BarGraph().name("Probability of Each Label").cols(x_series).data(
        list(np.squeeze(probability)))
    mlops.set_stat(bg)
Example n. 24
0
def main():
    pm_options = parse_args()
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    classifier = pickle.load(file_obj)

    # Load the data
    test_dataset = pd.read_csv(pm_options.input_file)

    mlops.set_data_distribution_stat(test_dataset)
    # Extract numpy array
    test_features = test_dataset.values
    # Predict labels
    result = classifier.predict(test_features)
    # Predict probability
    class_probability = classifier.predict_proba(test_features)
    maximum_prob = np.max(class_probability, axis=1)

    # Tag samples whose prediction probability is below a certain confidence threshold
    confidence = 0.8
    low_prob_samples = test_features[np.where(maximum_prob < confidence)]
    low_prob_predictions = result[np.where(maximum_prob < confidence)]
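    # Illustration (values not from this dataset): with
    # maximum_prob = np.array([0.95, 0.60, 0.85, 0.40]) and confidence = 0.8,
    # np.where(maximum_prob < confidence) yields (array([1, 3]),), so rows 1 and
    # 3 of test_features and result form the low-confidence subset.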
    unique_elements_low, counts_elements_low = np.unique(low_prob_predictions,
                                                         return_counts=True)
    unique_elements_low = [str(i) for i in unique_elements_low]
    print("Low confidence predictions: \n {0} \n with frequency {1}".format(
        unique_elements_low, counts_elements_low))

    ########## Start of ParallelM instrumentation ##############
    # BarGraph showing distribution of low confidence labels
    bar = BarGraph().name("Low confidence label distribution").cols(
        unique_elements_low).data(counts_elements_low.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ################

    # Samples with high probability
    high_prob_samples = test_features[np.where(maximum_prob >= confidence)]
    high_prob_predictions = result[np.where(maximum_prob >= confidence)]
    unique_elements_high, counts_elements_high = np.unique(
        high_prob_predictions, return_counts=True)
    unique_elements_high = [str(i) for i in unique_elements_high]
    print("High confidence predictions: \n {0} \n with frequency {1}".format(
        unique_elements_high, counts_elements_high))

    ########## Start of ParallelM instrumentation ##############
    # BarGraph showing distribution of high confidence labels
    bar = BarGraph().name("High confidence label distribution").cols(
        unique_elements_high).data(counts_elements_high.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ################

    mlops.done()
Example n. 25
0
def main():
    pm_options = parse_args()
    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    regression = pickle.load(file_obj)

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    mae_threshold = float(pm_options.threshold)

    # Create synthetic data using scikit learn
    X, y = make_regression(n_samples=num_samples,
                           n_features=num_features,
                           n_informative=2,
                           random_state=42)

    # for making labels all positive
    y = y + -1 * np.min(y)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (num_samples, num_features))
    features = features + noisy_features

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and compare them
    # automatically with the ones reported during training to generate the similarity score.
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples,
                   st.TIME_SERIES)

    # Predict labels
    labels_pred = regression.predict(features)

    hist_pred, bin_edges_pred = np.histogram(labels_pred)

    # Output prediction label distribution as a BarGraph using MCenter
    pred_label_bar = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols(bin_edges_pred.astype(str).tolist()) \
        .data(hist_pred.tolist()) \
        .as_continuous()

    mlops.set_stat(pred_label_bar)

    ##########################################################################
    #################### Start: Output Sample/Conversions ####################
    ##########################################################################
    mae = np.absolute(labels_pred - labels)
    conversions = sum(i < mae_threshold for i in mae)
    samples = num_samples
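    # "conversions" counts the predictions whose absolute error stays under
    # mae_threshold, e.g. mae = [0.5, 2.0, 0.1] with mae_threshold = 1.0 gives 2.
    # "samples" is simply the total number of rows scored in this run.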

    mlops.set_stat("samples", samples)

    mlops.set_stat("conversions", conversions)

    ########################################################################
    #################### End: Output Sample/Conversions ####################
    ########################################################################

    # Terminate MLOPs
    mlops.done()
Example n. 26
0
def main():
    pm_options = parse_args()

    print("PM: Configuration:")

    print("PM: # Validation Split:          [{}]".format(pm_options.validation_split))

    print("PM: # AUC Threshold:             [{}]".format(pm_options.auc_threshold))
    print("PM: # KS Threshold:              [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(pm_options.psi_threshold))

    print("PM: # Estimators:                [{}]".format(pm_options.n_estimators))
    print("PM: # Max Depth:                 [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate:             [{}]".format(pm_options.learning_rate))
    print("PM: # Min Child Weight:          [{}]".format(pm_options.min_child_weight))
    print("PM: # Objective:                 [{}]".format(pm_options.objective))
    print("PM: # Gamma:                     [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step:            [{}]".format(pm_options.max_delta_step))
    print("PM: # Subsample:                 [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha:                 [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda:                [{}]".format(pm_options.reg_lambda))
    print("PM: # Scale Pos Weight:          [{}]".format(pm_options.scale_pos_weight))

    print("PM: # Input File:                [{}]".format(pm_options.input_file))
    print("PM: Output model:                [{}]".format(pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # mlops Init
    mlops.init()

    # Loading and cleaning the data
    # This section goes though the various stages of loading and cleaning the data:
    loan_df = pd.read_csv(pm_options.input_file)

    # Cleaning NAs
    print("dataset_size = ", loan_df.shape[0])
    mlops.set_data_distribution_stat(loan_df)
    print("number of NAs per columns = ",  loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows= ", loan_df.shape[0])

    # Marking the label field. remove it from the features set:
    y = loan_df["bad_loan"]
    X = loan_df.drop("bad_loan", axis=1)

    from sklearn_pandas import DataFrameMapper

    # Splitting the data to train and test sets:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=float(pm_options.validation_split),
                                                        random_state=42)

    All_columns = X_train.columns.tolist()
    categorical_columns = ["verification_status", "addr_state", "purpose", "home_ownership", "term"]
    mapper_list =[]
    for d in All_columns:
        if d in categorical_columns:
            mapper_list.append(([d], OneHotEncoder(handle_unknown='ignore')))
        else:
            mapper_list.append(([d], MinMaxScaler()))

    mapper = DataFrameMapper(mapper_list)
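    # The mapper one-hot encodes each listed categorical column and min-max
    # scales every other column into [0, 1]; e.g. a hypothetical "term" column
    # with two categories becomes two indicator columns, while a numeric
    # "loan_amnt" column is rescaled, before the rows reach the classifier.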

    # ## Training
    # XGBoost Training:
    import xgboost as xgb
    xgboost_model = xgb.XGBClassifier(max_depth=int(pm_options.max_depth),
                                    min_child_weight=int(pm_options.min_child_weight),
                                    learning_rate=float(pm_options.learning_rate),
                                    n_estimators=int(pm_options.n_estimators),
                                    silent=True,
                                    objective=pm_options.objective,
                                    gamma=float(pm_options.gamma),
                                    max_delta_step=int(pm_options.max_delta_step),
                                    subsample=float(pm_options.subsample),
                                    colsample_bytree=1,
                                    colsample_bylevel=1,
                                    reg_alpha=float(pm_options.reg_alpha),
                                    reg_lambda=float(pm_options.reg_lambda),
                                    scale_pos_weight=float(pm_options.scale_pos_weight),
                                    seed=1,
                                    n_jobs=1,
                                    missing=None)

    final_model = Pipeline([("mapper", mapper), ("xgboost", xgboost_model)])

    final_model.fit(X_train, y_train)
    # Random Forest Training
    from sklearn.ensemble import RandomForestClassifier
    rf_only_model = RandomForestClassifier(n_estimators=int(pm_options.n_estimators),
                                           max_depth=int(pm_options.max_depth) + 3,
                                           random_state=42,
                                           n_jobs=1,
                                           class_weight="balanced")
    rf_model = Pipeline([("mapper", mapper), ("rf", rf_only_model)])

    rf_model.fit(X_train, y_train)

    # ## Statistics on Test Dataset

    # Prediction and prediction distribution
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)
    rf_pred_labels = rf_model.predict(X_test)
    rf_pred_probs = rf_model.predict_proba(X_test)

    # Accuracy calculation
    # Accuracy for the xgboost model
    accuracy = accuracy_score(y_test, pred_labels)
    print("XGBoost Accuracy value: {0}".format(accuracy))
    #     Output accuracy of the chosen model using MCenter
    mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

    # Accuracy for the RF model
    rf_accuracy = accuracy_score(y_test, rf_pred_labels)
    print("RF Accuracy value: {0}".format(rf_accuracy))
    #     Output accuracy of the chosen model using MCenter
    mlops.set_stat("RF Accuracy", rf_accuracy, st.TIME_SERIES)

    # Label distribution:
    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    print("Validation Actual Label distributions: \n {0}".format(label_distribution))
    # Output Label distribution as a BarGraph using MCenter
    export_bar_table(label_distribution[:,0], label_distribution[:,1], "Validation - Actual Label Distribution")

    # Prediction distribution and prediction confidence distribution
    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_label_distribution[:,0], pred_label_distribution[:,1], "Validation - XGBoost Prediction Distribution")

    rf_pred_value, rf_pred_counts = np.unique(rf_pred_labels, return_counts=True)
    rf_pred_label_distribution = np.asarray((rf_pred_value, rf_pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("RF Validation Prediction Label Distributions: \n {0}".format(rf_pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(rf_pred_label_distribution[:,0], rf_pred_label_distribution[:,1], "Validation - RF Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class])/(float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".format(average_confidence))

    #  Pred confidence per label
    rf_label_number = len(rf_pred_counts)
    rf_average_confidence = np.zeros(rf_label_number)
    rf_max_pred_probs = rf_pred_probs.max(axis=1)
    for i in range(0, rf_label_number):
        rf_index_class = np.where(rf_pred_labels == i)[0]
        if rf_pred_counts[i] > 0:
            rf_average_confidence[i] = np.sum(rf_max_pred_probs[rf_index_class])/(float(rf_pred_counts[i]))
        else:
            rf_average_confidence[i] = 0
    print("RF Validation Average Prediction confidence per label: \n {0}".format(rf_average_confidence))

    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence, "Validation - XGBoost Average confidence per class")
    export_bar_table(rf_pred_value, rf_average_confidence, "Validation - RF Average confidence per class")

    # Confusion Matrix
    # XGBoost Confusion Matrix
    confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
    print("Confusion Matrix for XGBoost: \n {0}".format(confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(confmat, "XGBoost")
    # RF Confusion Matrix
    rf_confmat = confusion_matrix(y_true=y_test, y_pred=rf_pred_labels)
    print("Confusion Matrix for RF: \n {0}".format(rf_confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(rf_confmat, "RF")

    # Classification Report
    # XGBoost Classification Report
    class_rep = classification_report(y_true=y_test, y_pred=pred_labels, output_dict=True)
    print("XGBoost Classification Report: \n {0}".format(class_rep))
    # RF Classification Report
    rf_class_rep = classification_report(y_true=y_test, y_pred=rf_pred_labels, output_dict=True)
    print("RF Classification Report: \n {0}".format(rf_class_rep))
    # Output Classification Report as a Table using MCenter
    export_classification_report(class_rep, "XGBoost")
    export_classification_report(rf_class_rep, "RF")

    # AUC and ROC Curves
    # ROC for XGBoost model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("XGBoost ROC AUC value: {}".format(roc_auc))
    rf_roc_auc = roc_auc_score(y_test, rf_pred_probs[:, 1])
    print("RF ROC AUC value:  {}".format(rf_roc_auc))
    # Output ROC of the chosen model using MCenter
    mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)
    mlops.set_stat("RF ROC AUC", rf_roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert("[Training] AUC Violation From Training Node",
                           "AUC Went Below {}. Current AUC Is {}".format(min_auc_requirement, roc_auc))

    # ROC curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    rf_fpr, rf_tpr, rf_thr = roc_curve(y_test, rf_pred_probs[:, 1])
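    # roc_curve sweeps the decision threshold and returns the false positive and
    # true positive rates at each step; plotting fpr against itself below gives
    # the diagonal reference line of a random classifier.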

    cg = MultiGraph().name("Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'.format(roc_auc), x=fpr.tolist(), y=tpr.tolist())
    cg.add_series(label='RF ROC curve (area = {0:0.2f})'.format(rf_roc_auc), x=rf_fpr.tolist(), y=rf_tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    # Feature importance comparison
    # XGBoost Feature importance
    export_feature_importance(final_model, list(X_train.columns), 5, "XGBoost")
    export_feature_importance(rf_model, list(X_train.columns), 5, "RF")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(y_test == 0)[0]
    y_test1 = np.where(y_test == 1)[0]
    rf_max_pred_probs = rf_pred_probs.max(axis=1)
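    # The two-sample Kolmogorov-Smirnov test below compares the winning-class
    # probabilities of the actual label-0 samples against those of the actual
    # label-1 samples; its statistic is the largest gap between the two
    # empirical CDFs, so a large value means the two score distributions differ strongly.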

    # KS for the XGBoost model
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))
    # KS for the RF model
    rf_ks = ks_2samp(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])
    rf_ks_stat = rf_ks.statistic
    rf_ks_pvalue = rf_ks.pvalue
    print("RF KS values: \n Statistics: {} \n pValue: {}\n".format(rf_ks_stat, rf_ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for RF", rf_ks_stat, st.TIME_SERIES)

    # raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert("[Training] KS Violation From Training Node",
                           "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats for XGBoost").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test0], max_pred_probs[y_test1])
    rf_total_psi, rf_psi_table = get_psi(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])
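    # get_psi (defined elsewhere in this script) reports a Population Stability
    # Index; conventionally that is the sum over bins of
    # (curr% - base%) * ln(curr% / base%) computed on the binned score
    # populations passed in, with larger values meaning a larger shift between
    # the two populations.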
    psi_table_stat = Table().name("PSI Stats for XGBoost").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent",
         "Segment PSI"])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total XGBoost PSI values: \n {}".format(total_psi))
    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)

    if total_psi >= min_psi_requirement:
        mlops.health_alert("[Training] PSI Violation From Training Node",
                           "PSI Went Above {}. Current PSI Is {}".format(min_psi_requirement,
                                                                         total_psi))

    print("Total RF PSI values: \n {}".format(rf_total_psi))
    rf_psi_table_stat = Table().name("PSI Stats for RF").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent",
         "Segment PSI"])
    row_num = 1
    for each_value in rf_psi_table.values:
        str_values = [str(i) for i in each_value]
        rf_psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(rf_psi_table_stat)
    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total RF PSI ", rf_total_psi, st.TIME_SERIES)

    # ## Save the XGBoost Model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # ## Finish the program
    mlops.done()
Example n. 27
0
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: Kernel:                      [{}]".format(pm_options.kernel))
    print("PM: Degree:                      [{}]".format(pm_options.degree))
    print("PM: Gamma:                       [{}]".format(pm_options.gamma))
    print("PM: Tolerance:                   [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:          [{}]".format(pm_options.max_iter))

    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    # Create synthetic data using scikit learn
    X, y = make_regression(n_samples=num_samples,
                           n_features=num_features,
                           n_informative=2,
                           random_state=42)

    # for making labels all positive
    y = y + -1 * np.min(y)
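    # Shifting y so that min(y) == 0 keeps every label non-negative, which
    # mean_squared_log_error further below requires (it raises on negative targets).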

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (num_samples, num_features))
    features = features + noisy_features

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    hist, bin_edges = np.histogram(labels)
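    # np.histogram with its default of 10 bins returns 10 counts plus 11 bin
    # edges; marking the BarGraph below with as_continuous() presumably tells
    # MCenter to treat the column list as bin edges rather than category names.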

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("User Defined: Label Distribution") \
        .cols((bin_edges).astype(str).tolist()) \
        .data((hist).tolist()) \
        .as_continuous()

    mlops.set_stat(bar)

    # Create a model that should be deployed into production
    final_model = SVR(kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    labels_pred = final_model.predict(features)
    hist_pred, bin_edges_pred = np.histogram(labels_pred)

    # Output prediction label distribution as a BarGraph using MCenter
    pred_label_bar = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((bin_edges_pred).astype(str).tolist()) \
        .data((hist_pred).tolist()) \
        .as_continuous()

    mlops.set_stat(pred_label_bar)

    y_true = labels
    y_pred = labels_pred

    # Regression Metrics
    ##########################################################################
    #################### Start: Output Explained Variance ####################
    ##########################################################################

    evs = sklearn.metrics.explained_variance_score(y_true, y_pred)

    #################### OLD WAY ####################
    # First Way
    mlops.set_stat("User Defined: Explained Variance ", evs)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    # mlops.set_stat(RegressionMetrics.EXPLAINED_VARIANCE_SCORE, evs)

    # OR

    # Third Way
    # mlops.metrics.explained_variance_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ########################################################################
    #################### End: Output Explained Variance ####################
    ########################################################################

    ######################################################################
    #################### Start: Output Mean Abs Error ####################
    ######################################################################

    mae = sklearn.metrics.mean_absolute_error(y_true, y_pred)

    #################### OLD WAY ####################
    # First Way
    mlops.set_stat("User Defined: Mean Abs Error", mae)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    # mlops.set_stat(RegressionMetrics.MEAN_ABSOLUTE_ERROR, mae)

    # OR

    # Third Way
    # mlops.metrics.mean_absolute_error(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ####################################################################
    #################### End: Output Mean Abs Error ####################
    ####################################################################

    ##########################################################################
    #################### Start: Output Mean Squared Error ####################
    ##########################################################################

    mse = sklearn.metrics.mean_squared_error(y_true, y_pred)

    #################### OLD WAY ####################
    # First Way
    mlops.set_stat("User Defined: Mean Squared Error", mse)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    # mlops.set_stat(RegressionMetrics.MEAN_SQUARED_ERROR, mse)

    # OR

    # Third Way
    # mlops.metrics.mean_squared_error(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ########################################################################
    #################### End: Output Mean Squared Error ####################
    ########################################################################

    ##############################################################################
    #################### Start: Output Mean Squared Log Error ####################
    ##############################################################################

    msle = sklearn.metrics.mean_squared_log_error(y_true, y_pred)

    #################### OLD WAY ####################
    # First Way
    mlops.set_stat("User Defined: Mean Squared Log Error", msle)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    # mlops.set_stat(RegressionMetrics.MEAN_SQUARED_LOG_ERROR, msle)

    # OR

    # Third Way
    # mlops.metrics.mean_squared_log_error(y_true=labels, y_pred=labels_pred)

    #################### DONE NEW WAY ####################

    ############################################################################
    #################### End: Output Mean Squared Log Error ####################
    ############################################################################

    ########################################################################
    #################### Start: Output Median Abs Error ####################
    ########################################################################

    median_ae = sklearn.metrics.median_absolute_error(y_true, y_pred)

    #################### OLD WAY ####################
    # First Way
    mlops.set_stat("User Defined: Median Abs Error", median_ae)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    # mlops.set_stat(RegressionMetrics.MEDIAN_ABSOLUTE_ERROR, median_ae)

    # OR

    # Third Way
    # mlops.metrics.median_absolute_error(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ######################################################################
    #################### End: Output Median Abs Error ####################
    ######################################################################

    ################################################################
    #################### Start: Output R2 Score ####################
    ################################################################

    r2_s = sklearn.metrics.r2_score(y_true, y_pred)
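    # R^2 = 1 - sum((y_true - y_pred)^2) / sum((y_true - mean(y_true))^2); 1.0 is
    # a perfect fit, 0.0 matches always predicting the mean, and negative values
    # are worse than that baseline.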

    #################### OLD WAY ####################
    # First Way
    mlops.set_stat("User Defined: R2 Score", r2_s)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    # mlops.set_stat(RegressionMetrics.R2_SCORE, r2_s)

    # OR

    # Third Way
    # mlops.metrics.r2_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output R2 Score ####################
    ##############################################################

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()