def main():
    pm_options = parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    classifier = pickle.load(file_obj)

    # Create synthetic data (Gaussian, Poisson and Beta distributions)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0], num_samples,
                                               replace=False)]

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data;
    # these are compared automatically with the ones reported during training to
    # generate the similarity score.
    mlops.set_data_distribution_stat(test_features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples, st.TIME_SERIES)

    # Predict labels
    result = classifier.predict(test_features)

    # Label distribution in prediction
    value, counts = np.unique(result, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Terminate MLOps
    mlops.done()
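# A minimal sketch of the parse_args() helper assumed by main() above. The flag
# names and defaults here are illustrative assumptions, not the repo's actual
# parser; main() only requires that the returned options expose input_model,
# num_samples and num_features.
def parse_args():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-model", dest="input_model", default=None,
                        help="Path to the pickled model file")
    parser.add_argument("--num-samples", dest="num_samples", default=50,
                        help="Number of synthetic samples to score")
    parser.add_argument("--num-features", dest="num_features", default=20,
                        help="Number of features per synthetic sample")
    return parser.parse_args()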
def gen_data_dist_stats(spark_ctx):
    spark_session = SparkSession(spark_ctx)

    # Import Data
    ##################################
    K = 3  # fixed number of centers
    num_attr = 10  # fixed number of attributes
    num_rows = 60000  # number of rows in the dataset
    input_data = generate_dataset(num_attr, num_rows, K, spark_ctx)

    column_names_all = input_data.columns
    for col_index in range(0, len(column_names_all)):
        input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                  'c' + str(col_index))
    input_data = input_data.cache()
    input_train = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names,
                                       outputCol="features")
    kmeans_pipe = KMeans(k=K,
                         initMode="k-means||",
                         initSteps=5,
                         tol=1e-4,
                         maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    try:
        mlops.set_data_distribution_stat(data=input_train, model=model_kmeans)
        m = mlops.Model(model_format=ModelFormat.SPARKML)
        m.set_data_distribution_stat(data=input_train)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.stat")
        print(e)

    # Indicate that model statistics were reported
    mlops.set_stat(E2EConstants.MODEL_STATS_REPORTED_STAT_NAME, 1)
    return model_kmeans
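# A minimal sketch of the generate_dataset() helper assumed above (hypothetical):
# it scatters num_rows rows of num_attr attributes around K random Gaussian
# centers and returns them as a Spark DataFrame, which is all the caller needs.
def generate_dataset(num_attr, num_rows, K, spark_ctx):
    import numpy as np
    import pandas as pd
    from pyspark.sql import SparkSession

    centers = np.random.uniform(-10, 10, (K, num_attr))
    assignments = np.random.randint(0, K, num_rows)
    data = centers[assignments] + np.random.normal(0, 1, (num_rows, num_attr))
    pdf = pd.DataFrame(data, columns=["attr" + str(i) for i in range(num_attr)])
    return SparkSession(spark_ctx).createDataFrame(pdf)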
def _materialize(self, parent_data_objs, user_data):
    df_infer_set = self._gen_inf_dataset(parent_data_objs[0])

    # Initialize MLOps Library
    mlops.init()

    # Record the data distribution stats for the DataFrame
    mlops.set_data_distribution_stat(df_infer_set)

    # Terminate MLOps
    mlops.done()

    return [df_infer_set]
def test_data_distribution_stat_api(generate_da_with_missing_data):
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)
    pm._set_api_test_mode()

    # Basic test
    data = np.array([[1, 2], [3, 4]])
    pm.set_data_distribution_stat(data)

    # Test with a column of missing values
    blah = pd.read_csv(generate_da_with_missing_data)
    pm.set_data_distribution_stat(blah)

    pm.done()
def test_data_distribution_stat_api_spark(spark_session, generate_da_with_missing_data):
    sc = spark_session.sparkContext
    pm.init(ctx=sc, mlops_mode=MLOpsMode.STAND_ALONE)
    pm._set_api_test_mode()

    pdf = pd.read_csv(generate_da_with_missing_data)
    spark_df = spark_session.createDataFrame(pdf)
    pm.set_data_distribution_stat(data=spark_df)

    sc.stop()
    pm.done()
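# A minimal sketch of the generate_da_with_missing_data fixture used by the two
# tests above (hypothetical): it writes a small CSV containing NaN gaps to a
# temporary directory and returns its path, relying on pytest's tmp_path fixture.
import pytest


@pytest.fixture
def generate_da_with_missing_data(tmp_path):
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 2.0, 4.0]})
    csv_path = tmp_path / "missing_data.csv"
    df.to_csv(csv_path, index=False)
    return str(csv_path)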
def main():
    ## MLOps start
    # Initialize mlops
    mlops.init()
    ## MLOps end

    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    print("Loading model from: {}".format(args.model_dir))
    if os.path.isdir(args.model_dir):
        print("Found model")
    else:
        print("No model found. Exiting.")
        exit(0)

    # Load the model
    model = SavedModel(args.model_dir, args.sig_name)

    # Get the input
    input = MnistStreamInput(args.input_dir, args.total_records, args.random)

    test_data = input._samples
    mlops.set_data_distribution_stat(test_data)

    # Track confidence
    conf_tracker = ConfidenceTracker(args.track_conf, args.conf_thresh,
                                     args.conf_percent, args.output_low_conf)

    # Perform inferences on the input
    infer_loop(model, input, args.output_file, args.stats_interval, conf_tracker)

    del model
    del input

    ## MLOps start
    mlops.done()
    ## MLOps end

    print("Inference batch complete")
    exit(0)
def main():
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    mnist_data = get_input(args.input_dir)
    X = mnist_data.train.images

    ## MLOps start
    # Initialize the mlops library
    mlops.init()

    # Report the feature distribution for the training data
    mlops.set_data_distribution_stat(X)
    ## MLOps end

    train(mnist_data, args.epochs, args.model_dir, args.display_step)

    ## MLOps start
    # Release mlops resources
    mlops.done()
    ## MLOps end
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:             [{}]".format(pm_options.num_samples))
    print("PM: # Features:           [{}]".format(pm_options.num_features))
    print("PM: # Classes:            [{}]".format(pm_options.num_classes))
    print("PM: C:                    [{}]".format(pm_options.C))
    print("PM: Kernel:               [{}]".format(pm_options.kernel))
    print("PM: Degree:               [{}]".format(pm_options.degree))
    print("PM: Gamma:                [{}]".format(pm_options.gamma))
    print("PM: Tolerance:            [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:   [{}]".format(pm_options.max_iter))
    print("PM: Output model:         [{}]".format(pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    num_classes = int(pm_options.num_classes)

    # Create synthetic data using scikit-learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=num_classes,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      probability=True,
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T

    # Output actual label distribution as a BarGraph using MCenter
    bar = BarGraph().name("User Defined: Actual Label Distribution") \
        .cols((label_distribution[:, 0]).astype(str).tolist()) \
        .data((label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    pos_label = 1

    # Calculate classification predictions
    labels_pred = final_model.predict(features)
    # Calculate decision scores [n_sample, n_class]
    labels_decision_score = final_model.decision_function(features)
    # Calculate classification probabilities [n_sample, n_class]
    labels_prob = final_model.predict_proba(features)
    # Calculate classification probabilities of the positive label
    label_pos_class_prob = list(map(lambda x: x[pos_label], labels_prob))
    # List of sorted labels, i.e. [0, 1, 2, ...]
    labels_ordered = sorted(set(labels))

    value_pred, counts_pred = np.unique(labels_pred, return_counts=True)
    label_distribution_pred = np.asarray((value_pred, counts_pred)).T

    # Output prediction label distribution as a BarGraph using MCenter
    bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \
        .data((label_distribution_pred[:, 1]).tolist())
    mlops.set_stat(bar_pred)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    ################################################################
    #################### Start: Output Accuracy ####################
    ################################################################

    accuracy = final_model.score(features, labels)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output accuracy of the chosen model using MCenter
    # mlops.set_stat("User Defined: Accuracy", accuracy, st.TIME_SERIES)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.ACCURACY_SCORE, accuracy)

    # OR

    # Third Way
    mlops.metrics.accuracy_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output Accuracy ####################
    ##############################################################

    ###########################################################
    #################### Start: Output AUC ####################
    ###########################################################

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels, labels_pred,
                                                     pos_label=pos_label)
    auc = sklearn.metrics.auc(fpr, tpr)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output auc of the chosen model using MCenter
    # mlops.set_stat("User Defined: AUC", auc)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.AUC, auc)

    # OR

    # Third Way
    mlops.metrics.auc(x=fpr, y=tpr)
    #################### DONE NEW WAY ####################

    #########################################################
    #################### End: Output AUC ####################
    #########################################################

    ###############################################################################
    #################### Start: Output Average Precision Score ####################
    ###############################################################################

    # average precision is not supported for multiclass
    if len(labels_ordered) <= 2:
        aps = sklearn.metrics.average_precision_score(labels,
                                                      labels_decision_score)

        #################### OLD WAY ####################
        # First Way
        #
        # # Output aps of the chosen model using MCenter
        # mlops.set_stat("User Defined: Average Precision Score", aps)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.AVERAGE_PRECISION_SCORE, aps)

        # OR

        # Third Way
        mlops.metrics.average_precision_score(y_true=labels,
                                              y_score=labels_decision_score)
        #################### DONE NEW WAY ####################

    #############################################################################
    #################### End: Output Average Precision Score ####################
    #############################################################################
    #########################################################################
    #################### Start: Output Balanced Accuracy ####################
    #########################################################################

    bas = sklearn.metrics.balanced_accuracy_score(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output bas of the chosen model using MCenter
    # mlops.set_stat("User Defined: Balanced Accuracy Score", bas)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.BALANCED_ACCURACY_SCORE, data=bas)

    # OR

    # Third Way
    mlops.metrics.balanced_accuracy_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #######################################################################
    #################### End: Output Balanced Accuracy ####################
    #######################################################################

    ########################################################################
    #################### Start: Output Brier Score Loss ####################
    ########################################################################

    bsl = sklearn.metrics.brier_score_loss(labels, label_pos_class_prob,
                                           pos_label=pos_label)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output bsl of the chosen model using MCenter
    # mlops.set_stat("User Defined: Brier Score Loss", bsl)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.BRIER_SCORE_LOSS, data=bsl)

    # OR

    # Third Way
    mlops.metrics.brier_score_loss(y_true=labels, y_prob=label_pos_class_prob,
                                   pos_label=pos_label)
    #################### DONE NEW WAY ####################

    ######################################################################
    #################### End: Output Brier Score Loss ####################
    ######################################################################

    #############################################################################
    #################### Start: Output Classification Report ####################
    #############################################################################

    cr = sklearn.metrics.classification_report(labels, labels_pred)
    print("Classification Report\n{}".format(cr))

    #################### OLD WAY ####################
    # First Way
    # from parallelm.mlops.stats.table import Table
    #
    # arrayReport = list()
    # for row in cr.split("\n"):
    #     parsed_row = [x for x in row.split(" ") if len(x) > 0]
    #     if len(parsed_row) > 0:
    #         arrayReport.append(parsed_row)
    #
    # header = arrayReport[0]
    # cr_table = Table().name("User Defined: Classification Report").cols(header)
    #
    # for index in range(1, len(arrayReport)):
    #     row_title = arrayReport[index][0]
    #     row_value = arrayReport[index][:-1]
    #     cr_table.add_row(row_title, row_value)
    #
    # # output classification report using MCenter
    # mlops.set_stat(cr_table)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.CLASSIFICATION_REPORT, data=cr)

    # OR

    # Third Way
    mlops.metrics.classification_report(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ###########################################################################
    #################### End: Output Classification Report ####################
    ###########################################################################
    #########################################################################
    #################### Start: Output Cohen Kappa Score ####################
    #########################################################################

    cks = sklearn.metrics.cohen_kappa_score(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output cks of the chosen model using MCenter
    # mlops.set_stat("User Defined: Cohen Kappa Score", cks)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.COHEN_KAPPA_SCORE, data=cks)

    # OR

    # Third Way
    mlops.metrics.cohen_kappa_score(labels, labels_pred)
    #################### DONE NEW WAY ####################

    #######################################################################
    #################### End: Output Cohen Kappa Score ####################
    #######################################################################

    ########################################################################
    #################### Start: Output Confusion Matrix ####################
    ########################################################################

    cm = sklearn.metrics.confusion_matrix(labels, labels_pred,
                                          labels=labels_ordered)

    #################### OLD WAY ####################
    # First Way
    # from parallelm.mlops.stats.table import Table
    # labels_string = [str(i) for i in labels_ordered]
    # cm_matrix = Table().name("User Defined: Confusion Matrix").cols(labels_string)
    #
    # for index in range(len(cm)):
    #     cm_matrix.add_row(labels_string[index], list(cm[index]))
    #
    # mlops.set_stat(cm_matrix)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.CONFUSION_MATRIX, cm,
                   labels=labels_ordered)

    # OR

    # Third Way
    mlops.metrics.confusion_matrix(y_true=labels, y_pred=labels_pred,
                                   labels=labels_ordered)
    #################### DONE NEW WAY ####################

    ######################################################################
    #################### End: Output Confusion Matrix ####################
    ######################################################################

    ################################################################
    #################### Start: Output F1 Score ####################
    ################################################################

    f1 = sklearn.metrics.f1_score(labels, labels_pred, pos_label=pos_label,
                                  average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output f1 score of the chosen model using MCenter
    # mlops.set_stat("User Defined: F1 Score", f1)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.F1_SCORE, data=f1)

    # OR

    # Third Way
    mlops.metrics.f1_score(labels, labels_pred, pos_label=pos_label,
                           average=None)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output F1 Score ####################
    ##############################################################

    ###################################################################
    #################### Start: Output FBeta Score ####################
    ###################################################################

    fbeta = sklearn.metrics.fbeta_score(labels, labels_pred, beta=0.5,
                                        average=None)
    #################### OLD WAY ####################
    # First Way
    #
    # # Output fbeta score of the chosen model using MCenter
    # mlops.set_stat("User Defined: F-beta Score", fbeta)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.FBETA_SCORE, data=fbeta)

    # OR

    # Third Way
    mlops.metrics.fbeta_score(labels, labels_pred, pos_label=pos_label,
                              beta=0.5, average=None)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Output FBeta Score ####################
    #################################################################

    ####################################################################
    #################### Start: Output Hamming Loss ####################
    ####################################################################

    hamming_loss = sklearn.metrics.hamming_loss(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output hamming loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Hamming Loss", hamming_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.HAMMING_LOSS, data=hamming_loss)

    # OR

    # Third Way
    mlops.metrics.hamming_loss(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ##################################################################
    #################### End: Output Hamming Loss ####################
    ##################################################################

    ##################################################################
    #################### Start: Output Hinge Loss ####################
    ##################################################################

    hinge_loss = sklearn.metrics.hinge_loss(labels, labels_decision_score)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output hinge loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Hinge Loss", hinge_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.HINGE_LOSS, data=hinge_loss)

    # OR

    # Third Way
    mlops.metrics.hinge_loss(labels, labels_decision_score)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Output Hinge Loss ####################
    ################################################################

    #################################################################################
    #################### Start: Output Jaccard Similarity Score ####################
    #################################################################################

    jaccard_sim_score = sklearn.metrics.jaccard_similarity_score(labels,
                                                                 labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output jaccard similarity score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Jaccard Similarity Score", jaccard_sim_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.JACCARD_SIMILARITY_SCORE,
                   data=jaccard_sim_score)

    # OR

    # Third Way
    mlops.metrics.jaccard_similarity_score(labels, labels_pred)
    #################### DONE NEW WAY ####################
    ###############################################################################
    #################### End: Output Jaccard Similarity Score ####################
    ###############################################################################

    ################################################################
    #################### Start: Output Log Loss ####################
    ################################################################

    log_loss = sklearn.metrics.log_loss(labels, labels_prob)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output log loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Log Loss", log_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.LOG_LOSS, data=log_loss)

    # OR

    # Third Way
    mlops.metrics.log_loss(labels, labels_prob)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output Log Loss ####################
    ##############################################################

    ########################################################################################
    #################### Start: Output Matthews Correlation Coefficient ####################
    ########################################################################################

    mcc = sklearn.metrics.matthews_corrcoef(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output mcc of the chosen model using MCenter
    # mlops.set_stat("User Defined: Matthews Correlation Coefficient", mcc)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.MATTHEWS_CORRELATION_COEFFICIENT,
                   data=mcc)

    # OR

    # Third Way
    mlops.metrics.matthews_corrcoef(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ######################################################################################
    #################### End: Output Matthews Correlation Coefficient ####################
    ######################################################################################

    ###############################################################################
    #################### Start: Output Precision Recall Curve ####################
    ###############################################################################

    # precision_recall_curve is not supported for multiclass
    if len(labels_ordered) <= 2:
        precision, recall, thresholds = sklearn.metrics.precision_recall_curve(
            labels, labels_decision_score, pos_label=pos_label)

        classes = len(labels_ordered)
        average_precision = sklearn.metrics.average_precision_score(
            labels, labels_decision_score, average="macro")

        graph_label_str = "{}-class Precision Recall Curve -- AP: {}".format(
            classes, average_precision)

        #################### OLD WAY ####################
        # First Way
        # from parallelm.mlops.stats.graph import Graph
        #
        # p_r_curve = Graph() \
        #     .name("User Defined: Precision Recall Curve") \
        #     .set_x_series(list(recall)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(precision))
        #
        # p_r_curve.x_title("Recall")
        # p_r_curve.y_title("Precision")
        # mlops.set_stat(p_r_curve)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.PRECISION_RECALL_CURVE,
                       [precision, recall], legend=graph_label_str)

        # OR
        # Third Way
        mlops.metrics.precision_recall_curve(y_true=labels,
                                             probas_pred=labels_decision_score,
                                             pos_label=pos_label,
                                             average="macro")
        #################### DONE NEW WAY ####################

    #############################################################################
    #################### End: Output Precision Recall Curve ####################
    #############################################################################

    #######################################################################
    #################### Start: Output Precision Score ####################
    #######################################################################

    precision_score = sklearn.metrics.precision_score(labels, labels_pred,
                                                      pos_label=pos_label,
                                                      average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output precision score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Precision Score", precision_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.PRECISION_SCORE, data=precision_score)

    # OR

    # Third Way
    mlops.metrics.precision_score(labels, labels_pred, pos_label=pos_label,
                                  average=None)
    #################### DONE NEW WAY ####################

    #####################################################################
    #################### End: Output Precision Score ####################
    #####################################################################

    ####################################################################
    #################### Start: Output Recall Score ####################
    ####################################################################

    recall_score = sklearn.metrics.recall_score(labels, labels_pred,
                                                pos_label=pos_label,
                                                average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output recall score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Recall Score", recall_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.RECALL_SCORE, data=recall_score)

    # OR

    # Third Way
    mlops.metrics.recall_score(labels, labels_pred, pos_label=pos_label,
                               average=None)
    #################### DONE NEW WAY ####################

    ##################################################################
    #################### End: Output Recall Score ####################
    ##################################################################

    #####################################################################
    #################### Start: Output ROC AUC Score ####################
    #####################################################################

    # roc_auc_score is not supported for multiclass
    if len(labels_ordered) <= 2:
        roc_auc_score = sklearn.metrics.roc_auc_score(labels,
                                                      labels_decision_score)

        #################### OLD WAY ####################
        # First Way
        #
        # # Output roc auc score of the chosen model using MCenter
        # mlops.set_stat("User Defined: ROC AUC Score", roc_auc_score)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.ROC_AUC_SCORE, data=roc_auc_score)

        # OR

        # Third Way
        mlops.metrics.roc_auc_score(labels, labels_decision_score)
        #################### DONE NEW WAY ####################

    ###################################################################
    #################### End: Output ROC AUC Score ####################
    ###################################################################

    #################################################################
    #################### Start: Output ROC Curve ####################
    #################################################################

    # roc_curve is not supported for multiclass
    if len(labels_ordered) <= 2:
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(
            labels, labels_decision_score, pos_label=pos_label)

        roc_auc_score = sklearn.metrics.roc_auc_score(labels,
                                                      labels_decision_score)
        graph_label_str = "ROC Curve, AUC: {}".format(roc_auc_score)

        #################### OLD WAY ####################
        # First Way
        # from parallelm.mlops.stats.graph import Graph
        #
        # roc_curve = Graph() \
        #     .name("User Defined: ROC Curve") \
        #     .set_x_series(list(fpr)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(tpr))
        #
        # roc_curve.x_title("False Positive Rate")
        # roc_curve.y_title("True Positive Rate")
        #
        # mlops.set_stat(roc_curve)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.ROC_CURVE, [tpr, fpr],
                       legend=graph_label_str)

        # OR

        # Third Way
        mlops.metrics.roc_curve(y_true=labels, y_score=labels_decision_score,
                                pos_label=pos_label)
        #################### DONE NEW WAY ####################

    ###############################################################
    #################### End: Output ROC Curve ####################
    ###############################################################

    #####################################################################
    #################### Start: Output Zero One Loss ####################
    #####################################################################

    zol = sklearn.metrics.zero_one_loss(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output zol of the chosen model using MCenter
    # mlops.set_stat("User Defined: Zero One Loss", zol)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.ZERO_ONE_LOSS, data=zol)

    # OR

    # Third Way
    mlops.metrics.zero_one_loss(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ###################################################################
    #################### End: Output Zero One Loss ####################
    ###################################################################

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # Parse the arguments to the component
    options = parse_args()

    # Load the model; exit gracefully if the model is not found
    try:
        model_rf = \
            SparkPipelineModelHelper() \
            .set_shared_context(spark_context=spark.sparkContext) \
            .set_local_path(local_path=options.input_model) \
            .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
            .load_sparkml_model()
    except Exception as e:
        print(e)
        mlops.done()
        spark.sparkContext.stop()
        exit()

    # Generate synthetic data for inference (Gaussian, Poisson and Beta distributions)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0], num_samples,
                                               replace=False)]

    feature_names = ["".join(ascii_lowercase[a]) for a in range(num_features + 1)]

    # Create a spark dataframe from the synthetic data generated
    inferenceData = spark.createDataFrame(
        pd.DataFrame(test_features, columns=feature_names[1:num_features + 1]))

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data;
    # these are compared automatically with the ones reported during training to
    # generate the similarity score
    mlops.set_data_distribution_stat(inferenceData)

    num_samples = inferenceData.count()

    # Report the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples, st.TIME_SERIES)

    # Make inference predictions
    predicted_df = model_rf.transform(inferenceData)

    # Create a bar graph with label and confidence distributions
    histogram_predictions = predicted_df.groupby("prediction").count()
    prediction_values = np.array(histogram_predictions.select("prediction").collect())
    prediction_counts = np.array(histogram_predictions.select("count").collect())

    # Report label distribution as a BarGraph using MCenter.
    # collect() yields one row per label, so flatten the (n, 1) arrays to get all
    # labels and counts rather than only the first row.
    bar_predictions = BarGraph().name("Prediction Distribution").cols(
        prediction_values.ravel().astype(str).tolist()).data(
        prediction_counts.ravel().tolist())
    mlops.set_stat(bar_predictions)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # KS Threshold:   [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold:  [{}]".format(pm_options.psi_threshold))
    print("PM: # Input File:     [{}]".format(pm_options.input_file))
    print("PM: # Model File:     [{}]".format(pm_options.input_model))

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    # Load the data
    loan_df = pd.read_csv(pm_options.input_file)
    X = loan_df

    mlops.set_data_distribution_stat(loan_df)

    # Clean NAs
    print("dataset_size = ", loan_df.shape[0])
    print("number of NAs per column = \n", loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows = ", loan_df.shape[0])

    # Inference
    pred_labels = final_model.predict(X)
    pred_probs = final_model.predict_proba(X)

    # Prediction distribution and prediction confidence distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Inference Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))
    export_bar_table(pred_label_distribution[:, 0],
                     pred_label_distribution[:, 1],
                     "Inference - XGBoost Prediction Distribution")

    # Prediction confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".format(
        average_confidence))

    # Output average prediction confidence per label as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence,
                     "Validation - XGBoost Average confidence per class")

    # Feature importance comparison
    export_feature_importance(final_model, list(X.columns), 5, "XGBoost")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(pred_labels == 0)[0]
    y_test1 = np.where(pred_labels == 1)[0]
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)

    # Raise an alert if the KS stat goes above the required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    total_psi, psi_table = get_psi(max_pred_probs[y_test0],
                                   max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)

    print("Total XGBoost PSI values: \n {}".format(total_psi))
    print("XGBoost PSI Stats: \n {}".format(psi_table))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raise an alert if the total PSI goes above the required threshold
    if total_psi >= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Above {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Finish the program
    mlops.done()
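# A minimal sketch of the get_psi() helper assumed above (hypothetical): it bins
# the base and current score populations on shared edges, then computes the
# Population Stability Index, PSI = sum((curr% - base%) * ln(curr% / base%))
# over the bins. It returns the total PSI plus a per-segment breakdown whose
# columns match the "PSI Stats" table built in main() above.
def get_psi(base_scores, curr_scores, num_bins=10):
    eps = 1e-6  # guard against log(0) and division by zero
    edges = np.histogram_bin_edges(base_scores, bins=num_bins)
    base_counts, _ = np.histogram(base_scores, bins=edges)
    curr_counts, _ = np.histogram(curr_scores, bins=edges)
    base_pct = base_counts / max(float(len(base_scores)), eps) + eps
    curr_pct = curr_counts / max(float(len(curr_scores)), eps) + eps
    segment_psi = (curr_pct - base_pct) * np.log(curr_pct / base_pct)
    psi_table = pd.DataFrame({
        "Base Pop": base_counts, "Curr Pop": curr_counts,
        "Lower Bound": edges[:-1], "Upper Bound": edges[1:],
        "Base Percent": base_pct, "Curr Percent": curr_pct,
        "Segment PSI": segment_psi,
    })
    return float(np.sum(segment_psi)), psi_table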
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: Data file:            [{}]".format(pm_options.data_file))
    print("PM: Output model:         [{}]".format(pm_options.output_model))
    print("PM: regularization_range: [{}]".format(pm_options.regularization_range))

    mlops.init()

    # Read the Samsung datafile
    dataset = pd.read_csv(pm_options.data_file)

    # Separate into features and labels
    features = dataset.iloc[:, 1:].values
    labels = dataset.iloc[:, 0].values

    # Hyper-parameter search using k-fold cross-validation
    regularization_range = pm_options.regularization_range.split(',')
    regularization = [
        float(regularization_var) for regularization_var in regularization_range
    ]
    tune_parameters = [{'C': regularization}]

    # Initialize the logistic regression algorithm
    LR = LogisticRegression(class_weight='balanced',
                            multi_class='multinomial',
                            solver='lbfgs')
    clf = GridSearchCV(LR, tune_parameters, cv=5, scoring='accuracy')
    clf.fit(features, labels)

    print("best parameter = ", clf.best_params_)
    accuracy = clf.cv_results_['mean_test_score']
    print('Accuracy values: \n {0} \n for regularization values: \n {1}'.format(
        accuracy, regularization))

    ########## Start of ParallelM instrumentation ##############
    # Report the hyper-parameter search results as a table
    tbl = Table().name("Hyper-parameter Search Results").cols(
        ["Mean accuracy from k-fold cross-validation"])
    print("length of regularization", len(regularization))
    index_max = np.argmax(accuracy)
    for a in range(0, len(regularization)):
        print("adding row", regularization[a])
        if a == index_max:
            tbl.add_row("[Best] Regularization = " + str(regularization[a]),
                        [accuracy[a]])
        else:
            tbl.add_row("Regularization = " + str(regularization[a]),
                        [accuracy[a]])
    mlops.set_stat(tbl)
    ########## End of ParallelM instrumentation ##############

    # Label distribution in training
    label_distribution = dataset['label'].value_counts()
    column_names = np.array(label_distribution.index).astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    ########## Start of ParallelM instrumentation ##############
    # Report label distribution as a BarGraph
    bar = BarGraph().name("Label Distribution").cols(
        np.array(label_distribution.index).astype(str).tolist()).data(
        label_distribution.values.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ##############

    ########## Start of ParallelM instrumentation ##############
    # Report accuracy of the chosen model
    mlops.set_stat("K-fold cross-validation Accuracy", accuracy[index_max],
                   st.TIME_SERIES)
    ########## End of ParallelM instrumentation ##############

    # Histogram of the input data
    mlops.set_data_distribution_stat(dataset)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(clf, model_file)
    model_file.close()

    mlops.done()
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples:            [{}]".format(pm_options.num_samples))
    print("PM: # Features:           [{}]".format(pm_options.num_features))
    print("PM: C:                    [{}]".format(pm_options.C))
    print("PM: Kernel:               [{}]".format(pm_options.kernel))
    print("PM: Degree:               [{}]".format(pm_options.degree))
    print("PM: Gamma:                [{}]".format(pm_options.gamma))
    print("PM: Tolerance:            [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:   [{}]".format(pm_options.max_iter))
    print("PM: Output model:         [{}]".format(pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    # Create synthetic data using scikit-learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=3,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    # Accuracy for the chosen model
    accuracy = final_model.score(features, labels)
    print("Accuracy values: \n {0}".format(accuracy))

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # Parse the arguments to the component
    options = parse_args()

    print("PM: Configuration:")
    print("PM: Number of trees:    [{}]".format(options.num_trees))
    print("PM: Maximum depth:      [{}]".format(options.max_depth))
    print("PM: Output model:       [{}]".format(options.output_model))
    print("PM: Temp shared path:   [{}]".format(options.temp_shared_path))

    # Generate synthetic data using scikit-learn
    num_samples = 50
    num_features = 20
    num_classes = 3
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=num_classes,
                               n_clusters_per_class=1,
                               random_state=42)
    X = X + np.random.uniform(0, 5) * np.random.normal(
        0, 1, (num_samples, num_features))

    feature_names = ["".join(ascii_lowercase[a]) for a in range(num_features + 1)]
    feature_names[0] = "label"

    # Create a spark dataframe from the synthetic data generated
    trainingData = spark.createDataFrame(
        pd.DataFrame(np.concatenate((y.reshape(-1, 1), X), axis=1),
                     columns=feature_names))

    # Histogram of label distribution
    value, counts = np.unique(y, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(trainingData)

    # Fit a random forest classification model
    assembler = VectorAssembler(inputCols=feature_names[1:num_features + 1],
                                outputCol="features")
    classifier = RandomForestClassifier(numTrees=int(options.num_trees),
                                        maxDepth=int(options.max_depth))
    pipeline = Pipeline(stages=[assembler, classifier])
    model = pipeline.fit(trainingData)
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute training accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Report accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Save the spark model
    SparkPipelineModelHelper() \
        .set_shared_context(spark_context=spark.sparkContext) \
        .set_local_path(local_path=options.output_model) \
        .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
        .save_sparkml_model(model)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
def get_data_distribution_stat(df_clean):
    """Record the data distribution stats for the DataFrame."""
    mlops.set_data_distribution_stat(df_clean)
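# A minimal usage sketch for the helper above, assuming a pandas DataFrame that
# has already been cleaned of NAs; an mlops session must be open when it runs.
# The csv_path argument is illustrative.
def report_stats_example(csv_path):
    import pandas as pd
    from parallelm.mlops import mlops

    mlops.init()
    df_clean = pd.read_csv(csv_path).dropna()
    get_data_distribution_stat(df_clean)
    mlops.done()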
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function

    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names,
                                       outputCol="features")
    kmeans_pipe = KMeans(k=int(pm_options.K),
                         initMode="k-means||",
                         initSteps=2,
                         tol=1e-4,
                         maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    # Test validation and statistics collection
    ############################################################
    predicted_df = model_kmeans.transform(input_test)

    print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

    sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
    print("Sum of Errors for Kmeans = " + str(sum_errors))

    # Show the cluster centers
    kmeans_centers = model_kmeans.stages[1].clusterCenters()
    print("Kmeans Centers: ")
    for center in kmeans_centers:
        print(center)

    # Calculating stats
    ############################################################

    # Calculating the inter-cluster distances
    inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            inter_cluster_distance[centerIndex1, centerIndex2] = \
                eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])
    print("inter_cluster_distance = ", inter_cluster_distance)

    # Calculating the intra-cluster distances and the bars for the cluster distribution
    intra_cluster_distance = np.zeros(len(kmeans_centers))
    cluster_dist = np.zeros(len(kmeans_centers))
    for centerIndex1 in range(0, len(kmeans_centers)):
        filtered_df = predicted_df.filter(
            predicted_df["prediction"] == centerIndex1)
        cluster_dist[centerIndex1] = filtered_df.count()
        if cluster_dist[centerIndex1] == 0:
            intra_cluster_distance[centerIndex1] = 0
        else:
            filtered_df = \
                filtered_df.withColumn(
                    'distance',
                    udf(eq_dist, FloatType())(
                        col("features"),
                        array([lit(v) for v in kmeans_centers[centerIndex1]])))
            intra_cluster_distance[centerIndex1] = \
                filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

    # Calculating the Davies-Bouldin Index
    ############################################################
    # R[i,j] = (S[i] + S[j]) / M[i,j]
    # D[i] = max(R[i,j]) for i != j
    # DB = (1/K) * sum(D[i])
    r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            r_index[centerIndex1, centerIndex2] = 0
            if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                r_index[centerIndex1, centerIndex2] = \
                    (intra_cluster_distance[centerIndex1] +
                     intra_cluster_distance[centerIndex2]) \
                    / inter_cluster_distance[centerIndex1, centerIndex2]
    d_index = np.max(r_index, axis=0)
    db_index = np.sum(d_index, axis=0) / len(kmeans_centers)

    # PMML model generation
    ############################################################
    pmml_file = toPMMLBytes(spark, input_train, model_kmeans).decode("UTF-8")

    # PM stats
    ############################################################
    print("Sum of Errors for Kmeans = " + str(sum_errors))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index))
    pm.set_stat("Davies-Bouldin index", db_index, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(kmeans_centers)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row(str(j) + ":",
                    ["%.2f" % x for x in inter_cluster_distance[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance])
    pm.set_stat(tbl)

    tbl_col_name1 = []
    for j in range(0, len(kmeans_centers[0])):
        tbl_col_name1.append(str(j))
    tbl = Table().name("Centers (for K<6, Attr<11)").cols(tbl_col_name1)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row("center" + str(j) + ":",
                    ["%.2f" % x for x in kmeans_centers[j]])
    pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Distribution").cols(tbl_col_name).data(
        cluster_dist.tolist())
    pm.set_stat(bar)

    print("PM: generating histogram from data-frame and model")
    print("PM:" + pmml_file)
    try:
        pm.set_data_distribution_stat(data=input_train, model=pmml_file)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.set_data_distribution_stat")
        print(e)

    return pmml_file
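# A minimal sketch of the eq_dist() helper assumed by kmeans_train() above
# (hypothetical): the Euclidean distance between two equal-length vectors,
# returned as a plain float so it works both on cluster centers directly and
# wrapped in a Spark UDF with FloatType() as done above.
def eq_dist(vec1, vec2):
    import numpy as np
    return float(np.sqrt(np.sum((np.array(vec1) - np.array(vec2)) ** 2)))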
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    print("PM: Configuration:")
    print("PM: Step size:       [{}]".format(args.step_size))
    print("PM: Iterations:      [{}]".format(args.iterations))
    print("PM: Model version:   [{}]".format(args.model_version))
    print("PM: Stats interval:  [{}]".format(args.stats_interval))
    print("PM: Save dir:        [{}]".format(args.save_dir))

    # Initialize MLOps Library
    mlops.init()

    # Print the number of iterations used by the optimization algorithm
    print('Training for %i iterations' % args.iterations)

    # Create synthetic data using scikit-learn
    num_samples = 50
    num_features = 20

    features, labels = make_classification(n_samples=num_samples,
                                           n_features=num_features,
                                           n_informative=2,
                                           n_redundant=1,
                                           n_classes=3,
                                           n_clusters_per_class=1,
                                           random_state=42)

    # Add noise to the data
    noisy_features = np.random.uniform(0, 5) * np.random.normal(
        0, 1, (num_samples, num_features))
    features = features + noisy_features

    num_features = (features.shape[1])
    num_labels = len(np.unique(labels))

    # One-hot encode labels for all data
    onehot_labels = np.eye(num_labels)[labels]

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(features)

    # Algorithm parameters parsed from arguments
    learning_rate = args.step_size
    training_epochs = args.iterations
    display_step = args.stats_interval

    # tf Graph input
    x = tf.placeholder(tf.float32, [None, num_features], name="features")
    y = tf.placeholder(tf.float32, [None, num_labels], name="labels")

    # Set model weights
    W = tf.Variable(tf.zeros([num_features, num_labels]))
    b = tf.Variable(tf.zeros([num_labels]))

    # Store values for saving the model
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')

    # Construct the model
    pred = tf.nn.softmax(tf.matmul(x, W) + b, name="predictions")  # Softmax

    # Minimize error using cross entropy
    cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

    # Gradient descent
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

    # Evaluation
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(pred, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Start timer
    training_start_time = time.time()

    # Initialize the variables in a tf session
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    iteration_array = []
    cost_array = []
    accuracy_array = []

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0
        temp, c, a = sess.run([optimizer, cost, accuracy],
                              feed_dict={x: features, y: onehot_labels})

        # Compute average loss
        avg_cost += c / num_samples

        # Display logs per epoch step
        if (epoch + 1) % display_step == 0:
            iteration_array.append(epoch)
            cost_array.append(avg_cost)
            accuracy_array.append(np.float(a))
            print("accuracy", a)
            print("Epoch:", '%04d' % (epoch + 1), "cost=",
                  "{:.9f}".format(avg_cost))

    # Plot the cost function across epochs using MCenter
    gg = Graph().name("Cost function across epochs").set_x_series(
        iteration_array).add_y_series(label="Cost Function Across Iterations",
                                      data=cost_array)
    gg.x_title("Iterations")
    gg.y_title("Average Cost")
    mlops.set_stat(gg)

    # Plot the accuracy across epochs using MCenter
    gg1 = Graph().name("Accuracy across epochs").set_x_series(
        iteration_array).add_y_series(label="Accuracy Across Iterations",
                                      data=accuracy_array)
    gg1.x_title("Iterations")
    gg1.y_title("Accuracy")
    mlops.set_stat(gg1)

    # Plot accuracy and cost across epochs using MCenter
    mg = MultiGraph().name("Cost and Accuracy Progress Across Epochs")
    mg.add_series(x=iteration_array, label="Cost Function Across Iterations",
                  y=cost_array)
    mg.add_series(x=iteration_array, label="Accuracy across epochs",
                  y=accuracy_array)
    mlops.set_stat(mg)

    # Plot final cost and accuracy in this session using MCenter
    mlt = MultiLineGraph().name("Final Accuracy and Cost").labels(
        ["Cost", "Accuracy"])
    mlt.data([cost_array[-1], accuracy_array[-1]])
    mlops.set_stat(mlt)

    # Save the model
    export_path = args.save_dir
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    values, indices = tf.nn.top_k(y, num_labels)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(
        tf.constant([str(i) for i in range(num_labels)]))
    prediction_classes = table.lookup(tf.to_int64(indices))

    # Build the signature_def_map.
    classification_inputs = tf.saved_model.utils.build_tensor_info(
        serialized_tf_example)
    classification_outputs_classes = tf.saved_model.utils.build_tensor_info(
        prediction_classes)
    classification_outputs_scores = tf.saved_model.utils.build_tensor_info(values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classification_inputs
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classification_outputs_classes,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                    classification_outputs_scores
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

    tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
    tensor_info_y = tf.saved_model.utils.build_tensor_info(y)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': tensor_info_x},
            outputs={'outputs': tensor_info_y},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images':
                prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save(as_text=args.use_text)
def main(): parser = argparse.ArgumentParser() add_parameters(parser) args = parser.parse_args() if args.training_iteration <= 0: print('Please specify a positive value for training iteration.') sys.exit(-1) # Read the train and test data sets mnist = mnist_input_data.read_data_sets(args.input_cache_dir, one_hot=True) ## MLOps start # Initialize the mlops library mlops.init() # Report the feature distribution for the training data train_images = mnist.train.images mlops.set_data_distribution_stat(train_images) # Initialize a table to track training accuracy and cost train_table = Table().name("Training Stats").cols(["Accuracy", "Cost"]) ## MLOps end # Create the model sess = tf.InteractiveSession() serialized_tf_example = tf.placeholder(tf.string, name='tf_example') feature_configs = { 'x': tf.FixedLenFeature(shape=[784], dtype=tf.float32), } tf_example = tf.parse_example(serialized_tf_example, feature_configs) x = tf.identity(tf_example['x'], name='x') # use tf.identity() to assign name y_ = tf.placeholder('float', shape=[None, 10]) w = tf.Variable(tf.zeros([784, 10])) b = tf.Variable(tf.zeros([10])) sess.run(tf.global_variables_initializer()) y = tf.nn.softmax(tf.matmul(x, w) + b, name='y') # Set the cost function and optimizer cross_entropy = -tf.reduce_sum(y_ * tf.log(y)) train_step = tf.train.GradientDescentOptimizer(0.01).minimize( cross_entropy) values, indices = tf.nn.top_k(y, 10) table = tf.contrib.lookup.index_to_string_table_from_tensor( tf.constant([str(i) for i in range(10)])) prediction_classes = table.lookup(tf.to_int64(indices)) correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float')) # Train the model print('Training model...') for i in range(args.training_iteration): batch = mnist.train.next_batch(50) _, train_cost, train_acc = sess.run( [train_step, cross_entropy, accuracy], feed_dict={ x: batch[0], y_: batch[1] }) # Display stats if (i + 1 ) % args.display_step == 0 or i + 1 == args.training_iteration: # Report training accuracy and cost print("Training. step={}, accuracy={}, cost={}".format( i + 1, train_acc, train_cost)) # MLOps start # multiply by 1 to convert into double train_table.add_row("Iterations: {}".format(i + 1), [train_acc * 100, train_cost * 1]) mlops.set_stat(train_table) # MLOps end print('Done training!') # Report final cost and accuracy on test set test_cost, test_acc = sess.run([cross_entropy, accuracy], feed_dict={ x: mnist.test.images, y_: mnist.test.labels }) print("Testing. accuracy={}, cost={}".format(test_acc, test_cost)) ## MLOps start acc_table = Table().name("Test Accuracy").cols(["Accuracy"]) acc_table.add_row("Total iterations: {}".format(args.training_iteration), [test_acc]) mlops.set_stat(acc_table) # Release mlops resources mlops.done() ## MLOps end # Export the trained model so it can be used for inference # WARNING(break-tutorial-inline-code): The following code snippet is # in-lined in tutorials, please update tutorial documents accordingly # whenever code changes. export_path = args.save_dir print('Exporting trained model to', export_path) builder = tf.saved_model.builder.SavedModelBuilder(export_path) # Build the signature_def_map. 
classification_inputs = tf.saved_model.utils.build_tensor_info( serialized_tf_example) classification_outputs_classes = tf.saved_model.utils.build_tensor_info( prediction_classes) classification_outputs_scores = tf.saved_model.utils.build_tensor_info( values) classification_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={ tf.saved_model.signature_constants.CLASSIFY_INPUTS: classification_inputs }, outputs={ tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: classification_outputs_classes, tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES: classification_outputs_scores }, method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME )) tensor_info_x = tf.saved_model.utils.build_tensor_info(x) tensor_info_y = tf.saved_model.utils.build_tensor_info(y) prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={'inputs': tensor_info_x}, outputs={'outputs': tensor_info_y}, method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME) ) legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op') builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map={ 'predict_images': prediction_signature, tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: classification_signature, }, legacy_init_op=legacy_init_op) builder.save() print('Done exporting!')
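# A quick way to sanity-check the export above is to load the SavedModel back
# into a fresh session and run the prediction tensors by name. This is only an
# illustrative sketch, not part of the original script: it assumes the export
# directory produced above and relies on the tensor names 'x:0' and 'y:0' that
# the script assigned via tf.identity(..., name='x') and tf.nn.softmax(..., name='y').
import numpy as np
import tensorflow as tf

def check_exported_model(export_path, sample_images):
    """Load the exported SavedModel and return predicted digits for a batch."""
    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_path)
        x = sess.graph.get_tensor_by_name('x:0')
        y = sess.graph.get_tensor_by_name('y:0')
        scores = sess.run(y, feed_dict={x: sample_images})
        return np.argmax(scores, axis=1)

# Example usage (hypothetical path; images are flattened 28x28 floats):
# preds = check_exported_model('/tmp/mnist_model/1',
#                              np.zeros((2, 784), dtype=np.float32))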
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples: [{}]".format(pm_options.num_samples))
    print("PM: # Features: [{}]".format(pm_options.num_features))
    print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold))
    print("PM: # Input File: [{}]".format(pm_options.input_file))
    print("PM: # Model File: [{}]".format(pm_options.input_model))

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)
        X = data  # all columns are features; the inference file carries no label column
    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit-learn (binary classification only)
        X, y = make_classification(n_samples=num_samples,
                                   n_features=num_features,
                                   n_classes=2,
                                   random_state=42)

        # Add random noise to the data on roughly half of the runs
        import random
        if random.randint(1, 21) % 2 == 0:
            print("Adding Random Noise!")
            noisy_features = np.random.uniform(0, 1) * \
                             np.random.normal(0, 1, (num_samples, num_features))
            X = X + noisy_features

    # Separate into features
    features = X

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and
    # compare it automatically with the ones reported during training to generate the
    # similarity score.
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features), st.TIME_SERIES)

    # Predictions of the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    print("Pred Labels: ", pred_labels)  # Note: this printout can be huge
    print("Pred Probabilities: ", pred_probs)  # Note: this printout can be huge

    # Pred label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("Pred Label distributions: \n {0}".format(pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
        (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Pred label confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        print("np.sum(confidence[index_class])", np.sum(max_pred_probs[index_class]))
        print("counts_elements[i] ", pred_counts[i])
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / float(pred_counts[i])
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(
        pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[pred_labels == 1], max_pred_probs[pred_labels == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    if not np.isnan(ks_stat):
        print("printing KS_stat ")
        mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)
    else:
        print("not printing KS_stat ")

    # Raising alert if ks-stat goes above the required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[pred_labels == 1],
                                   max_pred_probs[pred_labels == 0])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below the required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Terminate MLOps
    mlops.done()
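# Several scripts in this section call get_psi(base, curr) and expect back a
# total PSI value plus a per-segment table whose rows match the seven columns
# reported above ("Base Pop" through "Segment PSI"), but the helper itself is
# not shown here. The following is one conventional implementation sketch of
# the Population Stability Index; the 10-bin equal-width binning and the
# epsilon guard against log(0) are assumptions, not the original helper.
import numpy as np
import pandas as pd

def get_psi(base, curr, num_bins=10):
    """Sketch: PSI between a base and a current score population."""
    eps = 1e-6
    lower = min(base.min(), curr.min())
    upper = max(base.max(), curr.max())
    edges = np.linspace(lower, upper, num_bins + 1)

    # Bin both populations on a shared grid
    base_counts, _ = np.histogram(base, bins=edges)
    curr_counts, _ = np.histogram(curr, bins=edges)

    # Convert to percentages, guarding against empty bins
    base_pct = base_counts / max(float(len(base)), 1.0) + eps
    curr_pct = curr_counts / max(float(len(curr)), 1.0) + eps

    # PSI per segment: (curr% - base%) * ln(curr% / base%)
    segment_psi = (curr_pct - base_pct) * np.log(curr_pct / base_pct)

    psi_table = pd.DataFrame(
        {"base_pop": base_counts,
         "curr_pop": curr_counts,
         "lower_bound": edges[:-1],
         "upper_bound": edges[1:],
         "base_percent": base_pct,
         "curr_percent": curr_pct,
         "segment_psi": segment_psi},
        columns=["base_pop", "curr_pop", "lower_bound", "upper_bound",
                 "base_percent", "curr_percent", "segment_psi"])
    return segment_psi.sum(), psi_table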
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples: [{}]".format(pm_options.num_samples))
    print("PM: # Features: [{}]".format(pm_options.num_features))
    print("PM: # Validation Split: [{}]".format(pm_options.validation_split))
    print("PM: # AUC Threshold: [{}]".format(pm_options.auc_threshold))
    print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold))
    print("PM: # Estimators: [{}]".format(pm_options.n_estimators))
    print("PM: # Max Depth: [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate: [{}]".format(pm_options.learning_rate))
    print("PM: # Min Child Weight: [{}]".format(pm_options.min_child_weight))
    print("PM: # Objective: [{}]".format(pm_options.objective))
    print("PM: # Gamma: [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step: [{}]".format(pm_options.max_delta_step))
    print("PM: # Subsample: [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha: [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda: [{}]".format(pm_options.reg_lambda))
    print("PM: # Scale Pos Weight: [{}]".format(pm_options.scale_pos_weight))
    print("PM: # Input File: [{}]".format(pm_options.input_file))
    print("PM: Output model: [{}]".format(pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data[:, 1:]  # select columns 1 through end as features
        y = data[:, 0]   # column 0 is the label
    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit-learn (binary classification only)
        X, y = make_classification(n_samples=num_samples,
                                   n_features=num_features,
                                   n_classes=2,
                                   random_state=42)

        print("Adding Random Noise!")
        noisy_features = np.random.uniform(0, 1) * \
                         np.random.normal(0, 1, (num_samples, num_features))
        X = X + noisy_features

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(pm_options.validation_split), random_state=42)

    import xgboost as xgb

    # Create a model that should be deployed into production
    final_model = xgb.XGBClassifier(
        max_depth=int(pm_options.max_depth),
        min_child_weight=int(pm_options.min_child_weight),
        learning_rate=float(pm_options.learning_rate),
        n_estimators=int(pm_options.n_estimators),
        silent=True,
        objective=str(pm_options.objective),
        gamma=float(pm_options.gamma),
        max_delta_step=int(pm_options.max_delta_step),
        subsample=float(pm_options.subsample),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(pm_options.reg_alpha),
        reg_lambda=float(pm_options.reg_lambda),
        scale_pos_weight=float(pm_options.scale_pos_weight),
        seed=1,
        missing=None)

    final_model.fit(X_train, y_train)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(X_train)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    print("Pred Labels: ", pred_labels)
    print("Pred Probabilities: ", pred_probs)

    accuracy = accuracy_score(y_test, pred_labels)
    print("Accuracy values: \n {0}".format(accuracy))

    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in the validation set
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    print("Validation Actual Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Validation Actual Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Pred label distribution in the validation set
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("Validation Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Validation Prediction Label Distributions").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
        (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # ROC for the chosen model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("ROC AUC values: \n {}".format(roc_auc))

    # Output ROC of the chosen model using MCenter
    mlops.set_stat("ROC AUC", roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(
                min_auc_requirement, roc_auc))

    # ROC Curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name("Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random Curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='ROC Curve (Area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(), y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    max_pred_probs = pred_probs.max(axis=1)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)

    # Raising alert if ks-stat goes above the required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test == 1],
                                   max_pred_probs[y_test == 0])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below the required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
def _prep_and_train(self, df_dataset):
    self.min_auc_requirement = self._params["auc_threshold"]
    self.max_ks_requirement = self._params["ks_threshold"]
    self.min_psi_requirement = self._params["psi_threshold"]
    train_on_col = self._params["train_on_column"]

    # mlops init
    mlops.init()

    y = df_dataset[train_on_col]
    self._logger.info("train_on_col= {}".format(train_on_col))
    self._logger.info("df_dataset {}".format(df_dataset.shape[1]))
    X = df_dataset.drop(train_on_col, axis=1)
    mlops.set_data_distribution_stat(X)
    self._logger.info("df_dataset {}".format(X.shape[1]))

    # Splitting the data into train and test sets:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self._params["validation_split"], random_state=42)

    All_columns = X_train.columns.tolist()
    categorical_columns = self._params["categorical_cols"]
    mapper_list = []
    for d in All_columns:
        if d in categorical_columns:
            mapper_list.append(([d], OneHotEncoder(handle_unknown='ignore')))
        else:
            mapper_list.append(([d], MinMaxScaler()))

    mapper = DataFrameMapper(mapper_list)

    # Training
    # XGBoost training:
    n_cpu = multiprocessing.cpu_count()
    xgboost_model = xgb.XGBClassifier(
        max_depth=int(self._params["max_depth"]),
        min_child_weight=int(self._params["min_child_weight"]),
        learning_rate=float(self._params["learning_rate"]),
        n_estimators=int(self._params["n_estimators"]),
        silent=True,
        objective=self._params["objective"],
        gamma=float(self._params["gamma"]),
        max_delta_step=int(self._params["max_delta_step"]),
        subsample=float(self._params["subsample"]),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(self._params["reg_alpha"]),
        reg_lambda=float(self._params["reg_lambda"]),
        scale_pos_weight=float(self._params["scale_pos_weight"]),
        seed=1,
        n_jobs=n_cpu,
        missing=None)

    final_model = Pipeline([("mapper", mapper), ("xgboost", xgboost_model)])
    final_model.fit(X_train, y_train)

    # Prediction and prediction distribution
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    # Accuracy for the XGBoost model
    accuracy = accuracy_score(y_test, pred_labels)
    self._logger.info("XGBoost Accuracy value: {0}".format(accuracy))

    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in the validation set
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    self._logger.info("Validation Actual Label distributions: \n {0}".format(
        label_distribution))

    # Output label distribution as a BarGraph using MCenter
    export_bar_table(label_distribution[:, 0], label_distribution[:, 1],
                     "Validation - Actual Label Distribution")

    # Prediction distribution and prediction confidence distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    self._logger.info(
        "XGBoost Validation Prediction Label Distributions: \n {0}".format(
            pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_label_distribution[:, 0], pred_label_distribution[:, 1],
                     "Validation - XGBoost Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / float(pred_counts[i])
        else:
            average_confidence[i] = 0
    self._logger.info(
        "XGBoost Validation Average Prediction confidence per label: \n {0}"
        .format(average_confidence))

    # Output Pred confidence as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence,
                     "Validation - XGBoost Average confidence per class")

    # XGBoost Confusion Matrix
    confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
    self._logger.info("Confusion Matrix for XGBoost: \n {0}".format(confmat))

    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(confmat, "XGBoost")

    # XGBoost Classification Report
    class_rep = classification_report(y_true=y_test, y_pred=pred_labels,
                                      output_dict=True)
    self._logger.info("XGBoost Classification Report: \n {0}".format(class_rep))

    # AUC and ROC Curves
    # ROC for the XGBoost model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    self._logger.info("XGBoost ROC AUC value: {}".format(roc_auc))

    # Output ROC of the chosen model using MCenter
    mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)

    if roc_auc <= self.min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(
                self.min_auc_requirement, roc_auc))

    # ROC curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name("Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(), y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    # XGBoost feature importance
    export_feature_importance(final_model, list(X_train.columns), 5, "XGBoost")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(y_test == 0)[0]
    y_test1 = np.where(y_test == 1)[0]

    # KS for the XGBoost model
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    self._logger.info(
        "KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
            ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)

    # Raising alert if ks-stat goes above the required threshold
    if ks_stat >= self.max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                self.max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats for XGBoost").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    # (this component's get_psi variant takes self as its first argument)
    total_psi, psi_table = get_psi(self, max_pred_probs[y_test0],
                                   max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats for XGBoost").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)

    self._logger.info("Total XGBoost PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes above the required threshold
    if total_psi >= self.min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Above {}. Current PSI Is {}".format(
                self.min_psi_requirement, total_psi))

    # Save the XGBoost model
    model_file = open(self._params["output-model"], 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Finish the program
    mlops.done()

    # Note: this returns the (closed) file handle of the saved model
    return model_file
def _prep_and_infer(self, df_dataset): # Get number of features self.num_features = df_dataset.shape[1] # Get number of samples self.num_samples = df_dataset.shape[0] #get input model self.input_model = self._params["input-model"] self._logger.info("PM: Configuration:") self._logger.info("PM: # Sample: [{}]".format( self.num_samples)) self._logger.info("PM: # Features: [{}]".format( self.num_features)) self._logger.info("PM: # Input-Model: [{}]".format( self.input_model)) # Initialize MLOps Library mlops.init() # Load the model if self.input_model is not None: try: filename = self._params["input-model"] model_file_obj = open(filename, 'rb') mlops.set_stat("# Model Files Used", 1) except Exception as e: #self._logger.error("Model Not Found") self._logger.error("Got Exception: {}".format(e)) mlops.set_stat("# Model Files Used", 0) mlops.done() return 0 final_model = pickle.load(model_file_obj) features = df_dataset # Output Health Statistics to MCenter # MLOps API to report the distribution statistics of each feature in the data # and compare it automatically with the ones mlops.set_data_distribution_stat(features) # Output the number of samples being processed using MCenter mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features), st.TIME_SERIES) # Accuracy for the chosen model pred_labels = final_model.predict(features) pred_probs = final_model.predict_proba(features) self._logger.info("Pred Labels: {}".format( pred_labels)) # Remove printout can be huge self._logger.info("Pred Probabilities: {}".format( pred_probs)) # Remove printout can be huge # Pred Label distribution pred_value, pred_counts = np.unique(pred_labels, return_counts=True) pred_label_distribution = np.asarray((pred_value, pred_counts)).T # pred_column_names = pred_value.astype(str).tolist() self._logger.info( "Pred Label distributions: \n {}".format(pred_label_distribution)) # Output Pred label distribution as a BarGraph using MCenter pred_bar = BarGraph().name("Pred Label Distribution").cols( (pred_label_distribution[:, 0]).astype(str).tolist()).data( (pred_label_distribution[:, 1]).tolist()) mlops.set_stat(pred_bar) # Pred Label confidence per label label_number = len(pred_counts) average_confidence = np.zeros(label_number) max_pred_probs = pred_probs.max(axis=1) for i in range(0, label_number): index_class = np.where(pred_labels == i)[0] self._logger.info("np.sum(confidence[index_class]) {}".format( np.sum(max_pred_probs[index_class]))) self._logger.info("counts_elements[i] {}".format(pred_counts[i])) if pred_counts[i] > 0: average_confidence[i] = np.sum( max_pred_probs[index_class]) / (float(pred_counts[i])) else: average_confidence[i] = 0 # BarGraph showing confidence per class pred_values1 = [str(i) for i in pred_value] bar = BarGraph().name("Average Confidence Per Class").cols( pred_values1).data(average_confidence.tolist()) mlops.set_stat(bar) # Terminate MLOPs mlops.done() df_result = pd.concat([ df_dataset, pd.DataFrame({'predict': pred_labels}), pd.DataFrame({ 'probs-0': pred_probs[:, 0], 'probs-1': pred_probs[:, 1] }) ], axis=1) df_result.insert(0, 'idx', [x for x in range(1, df_result.shape[0] + 1)], allow_duplicates=False) return df_result
def main(): pm_options = parse_args() print("PM: Configuration:") print("PM: # Sample: [{}]".format( pm_options.num_samples)) print("PM: # Features: [{}]".format( pm_options.num_features)) print("PM: # Classes: [{}]".format( pm_options.num_cluster)) print("PM: Init: [{}]".format(pm_options.init)) print("PM: N Init: [{}]".format(pm_options.n_init)) print("PM: Tolerance: [{}]".format(pm_options.tol)) print("PM: Maximum Iterations: [{}]".format(pm_options.max_iter)) print("PM: Pre-Compute Distances: [{}]".format( pm_options.precompute_distances)) print("PM: Algorithm: [{}]".format(pm_options.algorithm)) print("PM: Output model: [{}]".format( pm_options.output_model)) # Initialize MLOps Library mlops.init() n_samples = int(pm_options.num_samples) n_features = int(pm_options.num_features) n_clusters = int(pm_options.num_cluster) init = str(pm_options.init) n_init = int(pm_options.n_init) max_iter = int(pm_options.max_iter) tol = float(pm_options.tol) precompute_distances = str(pm_options.precompute_distances) algorithm = str(pm_options.algorithm) verbose = 0 n_jobs = 1 # Create synthetic data using scikit learn X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=10, n_redundant=1, n_classes=n_clusters, n_clusters_per_class=1, random_state=42) # Separate into features and labels features = X labels_true = y # Add noise to the data noisy_features = np.random.uniform(0, 10) * \ np.random.normal(0, 1, (n_samples, n_features)) features = features + noisy_features kmeans_model = KMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol, precompute_distances=precompute_distances, verbose=verbose, random_state=None, copy_x=True, n_jobs=n_jobs, algorithm=algorithm).fit(features, labels_true) mlops.set_stat("User Defined: Training Inertia", kmeans_model.inertia_) mlops.set_stat("User Defined: Training Iteration", kmeans_model.n_iter_) value, counts = np.unique(labels_true, return_counts=True) label_distribution = np.asarray((value, counts)).T # Output actual label distribution as a BarGraph using MCenter bar_true = BarGraph().name("User Defined: Actual Label Distribution") \ .cols((label_distribution[:, 0]).astype(str).tolist()) \ .data((label_distribution[:, 1]).tolist()) mlops.set_stat(bar_true) # prediction labels labels_pred = kmeans_model.predict(features) value_pred, counts_pred = np.unique(labels_pred, return_counts=True) label_distribution_pred = np.asarray((value_pred, counts_pred)).T # Output prediction label distribution as a BarGraph using MCenter bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \ .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \ .data((label_distribution_pred[:, 1]).tolist()) mlops.set_stat(bar_pred) # Output Health Statistics to MCenter # MLOps API to report the distribution statistics of each feature in the data mlops.set_data_distribution_stat(features) ########################################################################### #################### Start: Adjusted Mutual Info Score #################### ########################################################################### adjusted_mutual_info_score = sklearn.metrics \ .adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Adjusted Mutual Info Score", adjusted_mutual_info_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way 
mlops.set_stat(ClusteringMetrics.ADJUSTED_MUTUAL_INFO_SCORE, adjusted_mutual_info_score) # OR # Third Way mlops.metrics.adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred) #################### DONE NEW WAY #################### ######################################################################### #################### End: Adjusted Mutual Info Score #################### ######################################################################### #################################################################### #################### Start: Adjusted Rand Score #################### #################################################################### adjusted_rand_score = sklearn.metrics \ .adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pred) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Adjusted Rand Score", adjusted_rand_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.ADJUSTED_RAND_SCORE, adjusted_rand_score) # OR # Third Way mlops.metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pred) #################### DONE NEW WAY #################### ################################################################## #################### End: Adjusted Rand Score #################### ################################################################## ####################################################################### #################### Start: Calinski Harabaz Score #################### ####################################################################### calinski_harabaz_score = sklearn.metrics \ .calinski_harabaz_score(X=features, labels=labels_pred) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Calinski Harabaz Score", calinski_harabaz_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.CALINSKI_HARABAZ_SCORE, calinski_harabaz_score) # OR # Third Way mlops.metrics.calinski_harabaz_score(X=features, labels=labels_pred) #################### DONE NEW WAY #################### ##################################################################### #################### End: Calinski Harabaz Score #################### ##################################################################### ################################################################### #################### Start: Completeness Score #################### ################################################################### completeness_score = sklearn.metrics \ .completeness_score(labels_true=labels_true, labels_pred=labels_pred) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Completeness Score", completeness_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.COMPLETENESS_SCORE, completeness_score) # OR # Third Way mlops.metrics.completeness_score(labels_true=labels_true, labels_pred=labels_pred) #################### DONE NEW WAY #################### ################################################################# #################### End: Completeness Score #################### ################################################################# ################################################################### 
#################### Start: Contingency Matrix #################### ################################################################### contingency_matrix = sklearn.metrics.cluster \ .contingency_matrix(labels_true, labels_pred) # list of sorted labels. i.e. [0, 1, 2, ..] pred_labels_list = sorted(set(labels_pred)) true_labels_list = sorted(set(labels_true)) #################### OLD WAY #################### # First Way # from parallelm.mlops.stats.table import Table # # cm_cols_ordered_string = [str(i) for i in pred_labels_list] # cm_rows_ordered_string = [str(i) for i in true_labels_list] # cm_matrix = Table().name("User Defined: Contingency Matrix").cols(cm_cols_ordered_string) # # for index in range(len(contingency_matrix)): # cm_matrix.add_row(cm_rows_ordered_string[index], list(contingency_matrix[index])) # # mlops.set_stat(cm_matrix) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.CONTINGENCY_MATRIX, data=contingency_matrix, true_labels=true_labels_list, pred_labels=pred_labels_list) # OR # Third Way mlops.metrics.cluster.contingency_matrix(labels_true, labels_pred) #################### DONE NEW WAY #################### ################################################################# #################### End: Contingency Matrix #################### ################################################################# ###################################################################### #################### Start: Fowlkes Mallows Score #################### ###################################################################### fowlkes_mallows_score = \ sklearn.metrics.fowlkes_mallows_score(labels_true=labels_true, labels_pred=labels_pred, sparse=False) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Fowlkes Mallows Score", fowlkes_mallows_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.FOWLKES_MALLOWS_SCORE, fowlkes_mallows_score) # OR # Third Way mlops.metrics.fowlkes_mallows_score(labels_true=labels_true, labels_pred=labels_pred, sparse=False) #################### DONE NEW WAY #################### #################################################################### #################### End: Fowlkes Mallows Score #################### #################################################################### ##################################################################################### #################### Start: Homogeneity, Completeness, V Measure #################### ##################################################################################### homogeneity, completeness, v_measure = sklearn.metrics \ .homogeneity_completeness_v_measure(labels_true=labels_true, labels_pred=labels_pred) #################### OLD WAY #################### # First Way # multiline_object = MultiLineGraph() \ # .name("User Defined: Homogeneity - Completeness - V Measure") \ # .labels(["Homogeneity", "Completeness", "V Measure"]) # # multiline_object.data([homogeneity, completeness, v_measure]) # # mlops.set_stat(multiline_object) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.HOMOGENEITY_COMPLETENESS_V_MEASURE, data=[homogeneity, completeness, v_measure]) # OR # Third Way mlops.metrics \ 
.homogeneity_completeness_v_measure(labels_true=labels_true, labels_pred=labels_pred) #################### DONE NEW WAY #################### ################################################################################### #################### End: Homogeneity, Completeness, V Measure #################### ################################################################################### ################################################################## #################### Start: Homogeneity Score #################### ################################################################## homogeneity_score = sklearn.metrics \ .homogeneity_score(labels_true=labels_true, labels_pred=labels_pred) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Homogeneity Score", homogeneity_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.HOMOGENEITY_SCORE, homogeneity_score) # OR # Third Way mlops.metrics \ .homogeneity_score(labels_true=labels_true, labels_pred=labels_pred) #################### DONE NEW WAY #################### ################################################################ #################### End: Homogeneity Score #################### ################################################################ ################################################################## #################### Start: Mutual Info Score #################### ################################################################## mutual_info_score = sklearn.metrics \ .mutual_info_score(labels_true=labels_true, labels_pred=labels_pred, contingency=None) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Mutual Info Score", mutual_info_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.MUTUAL_INFO_SCORE, mutual_info_score) # OR # Third Way mlops.metrics \ .mutual_info_score(labels_true=labels_true, labels_pred=labels_pred, contingency=None) #################### DONE NEW WAY #################### ################################################################ #################### End: Mutual Info Score #################### ################################################################ ############################################################################# #################### Start: Normalized Mutual Info Score #################### ############################################################################# normalized_mutual_info_score = sklearn.metrics \ .normalized_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Normalized Mutual Info Score", normalized_mutual_info_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.NORMALIZED_MUTUAL_INFO_SCORE, normalized_mutual_info_score) # OR # Third Way mlops.metrics \ .normalized_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred) #################### DONE NEW WAY #################### ########################################################################### #################### End: Normalized Mutual Info Score #################### ########################################################################### 
################################################################# #################### Start: Silhouette Score #################### ################################################################# silhouette_score = sklearn.metrics \ .silhouette_score(X=features, labels=labels_pred, metric="euclidean", sample_size=None, random_state=None) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: Silhouette Score", silhouette_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.SILHOUETTE_SCORE, silhouette_score) # OR # Third Way mlops.metrics \ .silhouette_score(X=features, labels=labels_pred, metric="euclidean", sample_size=None, random_state=None) #################### DONE NEW WAY #################### ############################################################### #################### End: Silhouette Score #################### ############################################################### ################################################################ #################### Start: V Measure Score #################### ################################################################ v_measure_score = sklearn.metrics.v_measure_score(labels_true=labels_true, labels_pred=labels_pred) #################### OLD WAY #################### # First Way # mlops.set_stat("User Defined: V Measure Score", v_measure_score) #################### DONE OLD WAY #################### #################### NEW WAY #################### # Second Way mlops.set_stat(ClusteringMetrics.V_MEASURE_SCORE, v_measure_score) # OR # Third Way mlops.metrics \ .v_measure_score(labels_true=labels_true, labels_pred=labels_pred) #################### DONE NEW WAY #################### ############################################################## #################### End: V Measure Score #################### ############################################################## # Save the model import pickle model_file = open(pm_options.output_model, 'wb') pickle.dump(kmeans_model, model_file) model_file.close() # Terminate MLOPs mlops.done()
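# The KMeans script above obtains its options through parse_args(), which is
# not shown in this section. The sketch below is inferred purely from the
# option names the script reads and prints; every default value here is an
# assumption, not the component's actual configuration. String defaults are
# fine because the script applies int()/float()/str() casts itself.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_samples", default="5000")
    parser.add_argument("--num_features", default="20")
    parser.add_argument("--num_cluster", default="3")
    parser.add_argument("--init", default="k-means++")
    parser.add_argument("--n_init", default="10")
    parser.add_argument("--tol", default="1e-4")
    parser.add_argument("--max_iter", default="300")
    parser.add_argument("--precompute_distances", default="auto")
    parser.add_argument("--algorithm", default="auto")
    parser.add_argument("--output_model", default="/tmp/kmeans_model")  # hypothetical path
    return parser.parse_args()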
def main(args):  # args is re-parsed below from sys.argv
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Create synthetic data (Gaussian, Poisson and Beta distributions)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    features = test_data[np.random.choice(test_data.shape[0], num_samples,
                                          replace=False)]

    # Start tensorflow session
    sess = tf.InteractiveSession()

    tag_set = ["serve"]
    if args.model_dir is not None:
        try:
            print("args.model_dir = ", args.model_dir)
            tf.saved_model.loader.load(sess, tag_set, args.model_dir)
        except Exception as e:
            print("Model not found")
            print("Got exception: " + str(e))
            return 0

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and
    # compare it automatically with the ones reported during training to generate the
    # similarity score.
    mlops.set_data_distribution_stat(data=features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features))

    graph = tf.get_default_graph()
    x = graph.get_tensor_by_name("features:0")
    y_pred = graph.get_tensor_by_name("predictions:0")

    predictions = sess.run(y_pred, {x: features})
    print('predictions', np.array(predictions))

    # Output prediction distribution as a BarGraph using MCenter
    predict_int = np.argmax(predictions, axis=1)
    unique, counts = np.unique(predict_int, return_counts=True)
    counts = list(map(int, counts))
    x_series = list(map(str, unique))
    mlt = BarGraph().name("Prediction Distribution").cols(x_series).data(counts)
    mlops.set_stat(mlt)

    # Show average prediction probability value for each prediction
    num_labels = len(np.unique(predict_int))
    probability = np.zeros((num_labels,))
    for a in range(0, num_labels):
        temp = predictions[np.argmax(predictions, axis=1) == a, :]
        print(temp)
        probability[a] = np.mean(temp[:, a])

    print("probability", list(np.squeeze(probability)))

    # Plot average probability in each class using MCenter
    bg = BarGraph().name("Probability of Each Label").cols(x_series).data(
        list(np.squeeze(probability)))
    mlops.set_stat(bg)

    # Terminate MLOps
    mlops.done()
def main(): pm_options = parse_args() mlops.init() # Load the model if pm_options.input_model is not None: try: filename = pm_options.input_model file_obj = open(filename, 'rb') mlops.set_stat("model_file", 1) except Exception as e: print("Model not found") print("Got exception: {}".format(e)) mlops.set_stat("model_file", 0) mlops.done() return 0 classifier = pickle.load(file_obj) # Load the data test_dataset = pd.read_csv(pm_options.input_file) mlops.set_data_distribution_stat(test_dataset) # Extract numpy array test_features = test_dataset.values # Predict labels result = classifier.predict(test_features) # Predict probability class_probability = classifier.predict_proba(test_features) maximum_prob = np.max(class_probability, axis=1) # Tag samples that are below a certain probability and write to a file confidence = 0.8 low_prob_samples = test_features[np.where(maximum_prob < confidence)] low_prob_predictions = result[np.where(maximum_prob < confidence)] unique_elements_low, counts_elements_low = np.unique(low_prob_predictions, return_counts=True) unique_elements_low = [str(i) for i in unique_elements_low] print("Low confidence predictions: \n {0} \n with frequency {1}".format( unique_elements_low, counts_elements_low)) ########## Start of ParallelM instrumentation ############## # BarGraph showing distribution of low confidence labels bar = BarGraph().name("Low confidence label distribution").cols( unique_elements_low).data(counts_elements_low.tolist()) mlops.set_stat(bar) ########## End of ParallelM instrumentation ################ # Samples with high probability high_prob_samples = test_features[np.where(maximum_prob >= confidence)] high_prob_predictions = result[np.where(maximum_prob >= confidence)] unique_elements_high, counts_elements_high = np.unique( high_prob_predictions, return_counts=True) unique_elements_high = [str(i) for i in unique_elements_high] print("High confidence predictions: \n {0} \n with frequency {1}".format( unique_elements_high, counts_elements_high)) ########## Start of ParallelM instrumentation ############## # BarGraph showing distribution of high confidence labels bar = BarGraph().name("High confidence label distribution").cols( unique_elements_high).data(counts_elements_high.tolist()) mlops.set_stat(bar) ########## End of ParallelM instrumentation ################ mlops.done()
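# The comment in the script above says low-confidence samples are written to a
# file, but the code only reports their label distribution. A minimal way to
# persist them is sketched below; the output path and the CSV layout (features
# followed by the predicted label) are assumptions, not part of the original
# component.
import numpy as np

def save_low_confidence_samples(samples, predictions,
                                path="/tmp/low_confidence_samples.csv"):
    """Persist low-confidence rows alongside their predicted label."""
    np.savetxt(path, np.column_stack((samples, predictions)), delimiter=",")

# Example usage inside main(), right after the low-confidence split:
# save_low_confidence_samples(low_prob_samples, low_prob_predictions)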
def main():
    pm_options = parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    regression = pickle.load(file_obj)

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    mae_threshold = float(pm_options.threshold)

    # Create synthetic data using scikit-learn
    X, y = make_regression(n_samples=num_samples,
                           n_features=num_features,
                           n_informative=2,
                           random_state=42)

    # Shift labels so they are all non-negative
    y = y - np.min(y)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and
    # compare it automatically with the ones reported during training to generate the
    # similarity score.
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples, st.TIME_SERIES)

    # Predict labels
    labels_pred = regression.predict(features)
    hist_pred, bin_edges_pred = np.histogram(labels_pred)

    # Output prediction label distribution as a BarGraph using MCenter
    pred_label_bar = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols(bin_edges_pred.astype(str).tolist()) \
        .data(hist_pred.tolist()) \
        .as_continuous()
    mlops.set_stat(pred_label_bar)

    ###########################################################################
    #################### Start: Output Sample/Conversions ####################
    ###########################################################################
    # A prediction counts as a conversion when its absolute error is below
    # the MAE threshold
    mae = np.absolute(labels_pred - labels)
    conversions = sum(i < mae_threshold for i in mae)
    samples = num_samples
    mlops.set_stat("samples", samples)
    mlops.set_stat("conversions", conversions)
    ########################################################################
    #################### End: Output Sample/Conversions ####################
    ########################################################################

    # Terminate MLOps
    mlops.done()
def main(): pm_options = parse_args() print("PM: Configuration:") print("PM: # Validation Split: [{}]".format(pm_options.validation_split)) print("PM: # AUC Threshold: [{}]".format(pm_options.auc_threshold)) print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold)) print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold)) print("PM: # Estimators: [{}]".format(pm_options.n_estimators)) print("PM: # Max Depth: [{}]".format(pm_options.max_depth)) print("PM: # Learning Rate: [{}]".format(pm_options.learning_rate)) print("PM: # Min Child Weight: [{}]".format(pm_options.min_child_weight)) print("PM: # Objective: [{}]".format(pm_options.objective)) print("PM: # Gamma: [{}]".format(pm_options.gamma)) print("PM: # Max Delta Step: [{}]".format(pm_options.max_delta_step)) print("PM: # Subsample: [{}]".format(pm_options.subsample)) print("PM: # Reg Alpha: [{}]".format(pm_options.reg_alpha)) print("PM: # Reg Lambda: [{}]".format(pm_options.reg_lambda)) print("PM: # Scale Pos Weight: [{}]".format(pm_options.scale_pos_weight)) print("PM: # Input File: [{}]".format(pm_options.input_file)) print("PM: Output model: [{}]".format(pm_options.output_model)) min_auc_requirement = float(pm_options.auc_threshold) max_ks_requirement = float(pm_options.ks_threshold) min_psi_requirement = float(pm_options.psi_threshold) # mlops Init mlops.init() # Loading and cleaning the data # This section goes though the various stages of loading and cleaning the data: loan_df = pd.read_csv(pm_options.input_file) # Cleaning NAs print("dataset_size = ", loan_df.shape[0]) mlops.set_data_distribution_stat(loan_df) print("number of NAs per columns = ", loan_df.isnull().sum()) loan_df = loan_df.dropna() print("dataset_size without NA rows= ", loan_df.shape[0]) # Marking the label field. 
    # remove it from the features set:
    y = loan_df["bad_loan"]
    X = loan_df.drop("bad_loan", axis=1)

    from sklearn_pandas import DataFrameMapper

    # Splitting the data into train and test sets:
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=float(pm_options.validation_split),
                                                        random_state=42)

    # One-hot encode the categorical columns; min-max scale the numeric ones
    all_columns = X_train.columns.tolist()
    categorical_columns = ["verification_status", "addr_state", "purpose", "home_ownership", "term"]
    mapper_list = []
    for col in all_columns:
        if col in categorical_columns:
            mapper_list.append(([col], OneHotEncoder(handle_unknown='ignore')))
        else:
            mapper_list.append(([col], MinMaxScaler()))
    mapper = DataFrameMapper(mapper_list)

    # ## Training
    # XGBoost Training:
    import xgboost as xgb

    xgboost_model = xgb.XGBClassifier(max_depth=int(pm_options.max_depth),
                                      min_child_weight=int(pm_options.min_child_weight),
                                      learning_rate=float(pm_options.learning_rate),
                                      n_estimators=int(pm_options.n_estimators),
                                      silent=True,
                                      objective=pm_options.objective,
                                      gamma=float(pm_options.gamma),
                                      max_delta_step=int(pm_options.max_delta_step),
                                      subsample=float(pm_options.subsample),
                                      colsample_bytree=1,
                                      colsample_bylevel=1,
                                      reg_alpha=float(pm_options.reg_alpha),
                                      reg_lambda=float(pm_options.reg_lambda),
                                      scale_pos_weight=float(pm_options.scale_pos_weight),
                                      seed=1,
                                      n_jobs=1,
                                      missing=None)

    final_model = Pipeline([("mapper", mapper), ("xgboost", xgboost_model)])
    final_model.fit(X_train, y_train)

    # Random Forest Training
    from sklearn.ensemble import RandomForestClassifier

    rf_only_model = RandomForestClassifier(n_estimators=int(pm_options.n_estimators),
                                           max_depth=int(pm_options.max_depth) + 3,
                                           random_state=42,
                                           n_jobs=1,
                                           class_weight="balanced")
    rf_model = Pipeline([("mapper", mapper), ("rf", rf_only_model)])
    rf_model.fit(X_train, y_train)

    # ## Statistics on the Test Dataset
    # Prediction and prediction distribution
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)
    rf_pred_labels = rf_model.predict(X_test)
    rf_pred_probs = rf_model.predict_proba(X_test)

    # Accuracy calculation
    # Accuracy for the XGBoost model
    accuracy = accuracy_score(y_test, pred_labels)
    print("XGBoost Accuracy value: {0}".format(accuracy))
    # Output accuracy of the XGBoost model using MCenter
    mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

    # Accuracy for the RF model
    rf_accuracy = accuracy_score(y_test, rf_pred_labels)
    print("RF Accuracy value: {0}".format(rf_accuracy))
    # Output accuracy of the RF model using MCenter
    mlops.set_stat("RF Accuracy", rf_accuracy, st.TIME_SERIES)

    # Label distribution in the validation set
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    print("Validation Actual Label distributions: \n {0}".format(label_distribution))
    # Output label distribution as a BarGraph using MCenter
    export_bar_table(label_distribution[:, 0], label_distribution[:, 1],
                     "Validation - Actual Label Distribution")

    # Prediction distribution and prediction confidence distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution))
    # Output prediction label distribution as a BarGraph using MCenter
    export_bar_table(pred_label_distribution[:, 0], pred_label_distribution[:, 1],
                     "Validation - XGBoost Prediction Distribution")

    rf_pred_value, rf_pred_counts = np.unique(rf_pred_labels, return_counts=True)
    rf_pred_label_distribution = np.asarray((rf_pred_value, rf_pred_counts)).T
    print("RF Validation Prediction Label Distributions: \n {0}".format(rf_pred_label_distribution))
    export_bar_table(rf_pred_label_distribution[:, 0], rf_pred_label_distribution[:, 1],
                     "Validation - RF Prediction Distribution")

    # Average prediction confidence per label (XGBoost)
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class]) / float(pred_counts[i])
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".format(average_confidence))

    # Average prediction confidence per label (RF)
    rf_label_number = len(rf_pred_counts)
    rf_average_confidence = np.zeros(rf_label_number)
    rf_max_pred_probs = rf_pred_probs.max(axis=1)
    for i in range(0, rf_label_number):
        rf_index_class = np.where(rf_pred_labels == i)[0]
        if rf_pred_counts[i] > 0:
            rf_average_confidence[i] = np.sum(rf_max_pred_probs[rf_index_class]) / float(rf_pred_counts[i])
        else:
            rf_average_confidence[i] = 0
    print("RF Validation Average Prediction confidence per label: \n {0}".format(rf_average_confidence))

    # Output average confidence per class as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence, "Validation - XGBoost Average confidence per class")
    export_bar_table(rf_pred_value, rf_average_confidence, "Validation - RF Average confidence per class")

    # Confusion Matrix
    # XGBoost Confusion Matrix
    confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
    print("Confusion Matrix for XGBoost: \n {0}".format(confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(confmat, "XGBoost")

    # RF Confusion Matrix
    rf_confmat = confusion_matrix(y_true=y_test, y_pred=rf_pred_labels)
    print("Confusion Matrix for RF: \n {0}".format(rf_confmat))
    export_confusion_table(rf_confmat, "RF")

    # Classification Report
    class_rep = classification_report(y_true=y_test, y_pred=pred_labels, output_dict=True)
    print("XGBoost Classification Report: \n {0}".format(class_rep))
    rf_class_rep = classification_report(y_true=y_test, y_pred=rf_pred_labels, output_dict=True)
    print("RF Classification Report: \n {0}".format(rf_class_rep))
    # Output Classification Reports as Tables using MCenter
    export_classification_report(class_rep, "XGBoost")
    export_classification_report(rf_class_rep, "RF")

    # AUC and ROC Curves
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("XGBoost ROC AUC value: {}".format(roc_auc))
    rf_roc_auc = roc_auc_score(y_test, rf_pred_probs[:, 1])
    print("RF ROC AUC value: {}".format(rf_roc_auc))
    # Output ROC AUC of each model using MCenter
    mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)
    mlops.set_stat("RF ROC AUC", rf_roc_auc, st.TIME_SERIES)

    # Raise an alert if the AUC drops below the required threshold
    if roc_auc <= min_auc_requirement:
        mlops.health_alert("[Training] AUC Violation From Training Node",
                           "AUC Went Below {}. Current AUC Is {}".format(min_auc_requirement, roc_auc))

    # ROC curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    rf_fpr, rf_tpr, rf_thr = roc_curve(y_test, rf_pred_probs[:, 1])
    cg = MultiGraph().name("Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'.format(roc_auc), x=fpr.tolist(), y=tpr.tolist())
    cg.add_series(label='RF ROC curve (area = {0:0.2f})'.format(rf_roc_auc), x=rf_fpr.tolist(), y=rf_tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    # Feature importance comparison
    export_feature_importance(final_model, list(X_train.columns), 5, "XGBoost")
    export_feature_importance(rf_model, list(X_train.columns), 5, "RF")

    # KS Analysis: compare the score distributions of the two true classes
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(y_test == 0)[0]
    y_test1 = np.where(y_test == 1)[0]
    rf_max_pred_probs = rf_pred_probs.max(axis=1)

    # KS for the XGBoost model
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))

    # KS for the RF model
    rf_ks = ks_2samp(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])
    rf_ks_stat = rf_ks.statistic
    rf_ks_pvalue = rf_ks.pvalue
    print("RF KS values: \n Statistics: {} \n pValue: {}\n".format(rf_ks_stat, rf_ks_pvalue))

    # Output the KS statistic of each model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    mlops.set_stat("KS Stats for RF", rf_ks_stat, st.TIME_SERIES)

    # Raise an alert if the KS statistic goes above the required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert("[Training] KS Violation From Training Node",
                           "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats for XGBoost").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    total_psi, psi_table = get_psi(max_pred_probs[y_test0], max_pred_probs[y_test1])
    rf_total_psi, rf_psi_table = get_psi(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])

    psi_table_stat = Table().name("PSI Stats for XGBoost").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
         "Base Percent", "Curr Percent", "Segment PSI"])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)

    print("Total XGBoost PSI values: \n {}".format(total_psi))
    # Output the total PSI of the XGBoost model using MCenter
    mlops.set_stat("Total XGBoost PSI", total_psi, st.TIME_SERIES)

    # Raise an alert if the total PSI goes above the required threshold
    if total_psi >= min_psi_requirement:
        mlops.health_alert("[Training] PSI Violation From Training Node",
                           "PSI Went Above {}. Current PSI Is {}".format(min_psi_requirement, total_psi))

    print("Total RF PSI values: \n {}".format(rf_total_psi))
    rf_psi_table_stat = Table().name("PSI Stats for RF").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
         "Base Percent", "Curr Percent", "Segment PSI"])
    row_num = 1
    for each_value in rf_psi_table.values:
        str_values = [str(i) for i in each_value]
        rf_psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(rf_psi_table_stat)

    # Output the total PSI of the RF model using MCenter
    mlops.set_stat("Total RF PSI", rf_total_psi, st.TIME_SERIES)

    # ## Save the XGBoost Model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # ## Finish the program
    mlops.done()
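# The PSI analysis above relies on a get_psi helper defined elsewhere in this
# example. For readers who want the gist, the sketch below is a hypothetical,
# minimal PSI computation under the standard definition
# (PSI = sum over bins of (curr% - base%) * ln(curr% / base%)).
# The bin count, smoothing constant, and return layout are assumptions for
# illustration, not the repo helper's actual implementation.
def psi_sketch(base_scores, curr_scores, n_bins=10, eps=1e-6):
    import numpy as np

    # Equal-width bins spanning both score populations
    lo = min(base_scores.min(), curr_scores.min())
    hi = max(base_scores.max(), curr_scores.max())
    edges = np.linspace(lo, hi, n_bins + 1)

    # Population fraction per bin, smoothed to avoid log(0)
    base_frac = np.histogram(base_scores, bins=edges)[0] / float(len(base_scores)) + eps
    curr_frac = np.histogram(curr_scores, bins=edges)[0] / float(len(curr_scores)) + eps

    # Per-segment PSI and the total across all segments
    segment_psi = (curr_frac - base_frac) * np.log(curr_frac / base_frac)
    return segment_psi.sum(), segment_psi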
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples:              [{}]".format(pm_options.num_samples))
    print("PM: # Features:             [{}]".format(pm_options.num_features))
    print("PM: Kernel:                 [{}]".format(pm_options.kernel))
    print("PM: Degree:                 [{}]".format(pm_options.degree))
    print("PM: Gamma:                  [{}]".format(pm_options.gamma))
    print("PM: Tolerance:              [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:     [{}]".format(pm_options.max_iter))
    print("PM: Output model:           [{}]".format(pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    # Create synthetic data using scikit-learn
    X, y = make_regression(n_samples=num_samples,
                           n_features=num_features,
                           n_informative=2,
                           random_state=42)

    # Shift the labels so they are all non-negative
    # (mean_squared_log_error below requires non-negative values)
    y = y - np.min(y)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    hist, bin_edges = np.histogram(labels)

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("User Defined: Label Distribution") \
        .cols(bin_edges.astype(str).tolist()) \
        .data(hist.tolist()) \
        .as_continuous()
    mlops.set_stat(bar)

    # Create a model that should be deployed into production
    final_model = SVR(kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),  # e.g. 'auto' or 'scale'
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    labels_pred = final_model.predict(features)
    hist_pred, bin_edges_pred = np.histogram(labels_pred)

    # Output prediction label distribution as a BarGraph using MCenter
    pred_label_bar = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols(bin_edges_pred.astype(str).tolist()) \
        .data(hist_pred.tolist()) \
        .as_continuous()
    mlops.set_stat(pred_label_bar)

    y_true = labels
    y_pred = labels_pred

    # Regression Metrics
    # Each metric below can be reported in three ways:
    #   First way (older style):  mlops.set_stat with a user-defined name
    #   Second way (newer style): mlops.set_stat with a RegressionMetrics constant
    #   Third way (newer style):  the corresponding mlops.metrics wrapper

    # ## Explained Variance
    evs = sklearn.metrics.explained_variance_score(y_true, y_pred)
    # First way
    mlops.set_stat("User Defined: Explained Variance", evs)
    # Second way
    # mlops.set_stat(RegressionMetrics.EXPLAINED_VARIANCE_SCORE, evs)
    # Third way
    # mlops.metrics.explained_variance_score(y_true=labels, y_pred=labels_pred)

    # ## Mean Absolute Error
    mae = sklearn.metrics.mean_absolute_error(y_true, y_pred)
    # First way
    mlops.set_stat("User Defined: Mean Abs Error", mae)
    # Second way
    # mlops.set_stat(RegressionMetrics.MEAN_ABSOLUTE_ERROR, mae)
    # Third way
    # mlops.metrics.mean_absolute_error(y_true=labels, y_pred=labels_pred)

    # ## Mean Squared Error
    mse = sklearn.metrics.mean_squared_error(y_true, y_pred)
    # First way
    mlops.set_stat("User Defined: Mean Squared Error", mse)
    # Second way
    # mlops.set_stat(RegressionMetrics.MEAN_SQUARED_ERROR, mse)
    # Third way
    # mlops.metrics.mean_squared_error(y_true=labels, y_pred=labels_pred)

    # ## Mean Squared Log Error
    msle = sklearn.metrics.mean_squared_log_error(y_true, y_pred)
    # First way
    mlops.set_stat("User Defined: Mean Squared Log Error", msle)
    # Second way
    # mlops.set_stat(RegressionMetrics.MEAN_SQUARED_LOG_ERROR, msle)
    # Third way
    # mlops.metrics.mean_squared_log_error(y_true=labels, y_pred=labels_pred)

    # ## Median Absolute Error
    median_ae = sklearn.metrics.median_absolute_error(y_true, y_pred)
    # First way
    mlops.set_stat("User Defined: Median Abs Error", median_ae)
    # Second way
    # mlops.set_stat(RegressionMetrics.MEDIAN_ABSOLUTE_ERROR, median_ae)
    # Third way
    # mlops.metrics.median_absolute_error(y_true=labels, y_pred=labels_pred)

    # ## R2 Score
    r2_s = sklearn.metrics.r2_score(y_true, y_pred)
    # First way
    mlops.set_stat("User Defined: R2 Score", r2_s)
    # Second way
    # mlops.set_stat(RegressionMetrics.R2_SCORE, r2_s)
    # Third way
    # mlops.metrics.r2_score(y_true=labels, y_pred=labels_pred)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
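# The per-metric blocks above all repeat the same compute-then-report pattern.
# Purely as an illustration (not part of the original example), they could be
# collapsed into one table-driven loop. The stat names below reuse the
# user-defined strings from above; the RegressionMetrics constants and
# mlops.metrics wrappers remain the built-in alternatives. This sketch assumes
# mlops is already initialized, as in main().
def report_regression_metrics(y_true, y_pred):
    import sklearn.metrics as skm

    # Map each user-defined stat name to its sklearn scorer
    metric_fns = {
        "User Defined: Explained Variance": skm.explained_variance_score,
        "User Defined: Mean Abs Error": skm.mean_absolute_error,
        "User Defined: Mean Squared Error": skm.mean_squared_error,
        "User Defined: Mean Squared Log Error": skm.mean_squared_log_error,
        "User Defined: Median Abs Error": skm.median_absolute_error,
        "User Defined: R2 Score": skm.r2_score,
    }
    for stat_name, metric_fn in metric_fns.items():
        # Report each metric to MCenter as a user-defined stat
        mlops.set_stat(stat_name, metric_fn(y_true, y_pred))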