def main():
    pm_options = parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    classifier = pickle.load(file_obj)

    # Create synthetic data (Gaussian, Poisson and Beta distributions)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0], num_samples, replace=False)]

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the ones reported during training to
    # generate the similarity score.
    mlops.set_data_distribution_stat(test_features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples, st.TIME_SERIES)

    # Predict labels
    result = classifier.predict(test_features)

    # Label distribution in prediction
    value, counts = np.unique(result, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Terminate MLOps
    mlops.done()
def main():
    print("Starting example")
    mlops.init(run_in_non_pm_mode=True, mlops_mode=MLOpsMode.PYTHON)

    # Line graphs
    mlops.set_stat("myCounterDouble", 5.5)
    mlops.set_stat("myCounterDouble2", 7.3)

    # Multi-line graphs
    mlt = MultiLineGraph().name("Multi Line").labels(["l1", "l2"]).data([5, 16])
    mlops.set_stat(mlt)

    tbl = Table().name("MyTable").cols(["Date", "Some number"])
    tbl.add_row(["2001Q1", "55"])
    tbl.add_row(["2001Q2", "66"])
    tbl.add_row(["2003Q3", "33"])
    tbl.add_row(["2003Q2", "22"])
    mlops.set_stat(tbl)

    bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd", "ee"]).data([10, 15, 12, 9, 8])
    mlops.set_stat(bar)

    mlops.done()
    print("Example done")
def feature_importance(self, model_obj, feature_importance_vector=None,
                       feature_names=None, model=None, df=None,
                       num_significant_features=100):
    """
    Present feature importance, either from the provided vector or generated
    from the provided model if available. The feature importance bar graph is
    attached to the current model and can be fetched later for this model.

    This function:
    1) uses feature_importance_vector if it exists
    2) takes feature names from the model if available
    3) uses the feature_names vector if it exists
    4) extracts feature names from the pipeline model or dataframe if they
       exist (the code differs between PySpark and sklearn)
    5) sorts the vector
    6) takes the first k elements
    7) creates a bar graph of feature importance

    :param model_obj: model object
    :param feature_importance_vector: feature importance vector (optional)
    :param feature_names: feature names vector (optional)
    :param model: optional pipeline model for PySpark, or sklearn model for Python
    :param df: optional dataframe for analysis
    :param num_significant_features: number of significant features
    :raises: MLOpsException
    """
    self._validate_feature_importance_inputs(feature_importance_vector, feature_names, model, df)

    important_named_features = self._output_channel.feature_importance(
        feature_importance_vector, feature_names, model, df)

    if important_named_features:
        # Sort the feature importance vector in descending order
        important_named_features_sorted = sorted(important_named_features,
                                                 key=lambda x: x[1], reverse=True)
        self._logger.info("Full important_named_features_sorted = {}"
                          .format(important_named_features_sorted))

        # Keep only the k most significant features
        if int(num_significant_features) < len(important_named_features_sorted):
            important_named_features_sorted = \
                important_named_features_sorted[0:int(num_significant_features)]

        # Plot the results as a bar graph
        self._logger.info("Important_named_features_sorted = {}"
                          .format(important_named_features_sorted))
        col_names = [v[0] for v in important_named_features_sorted]
        col_value = [v[1] for v in important_named_features_sorted]
        bar = BarGraph().name("Feature Importance").cols(col_names).data(col_value)
        model_obj.set_stat(bar)
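# A standalone sketch (not part of the library) of the sort-and-truncate step the
# docstring above describes: rank (name, importance) pairs, keep the top k, and
# split them into the column names and values a BarGraph expects. The helper
# name is illustrative.
def top_k_feature_importance(named_importances, k=100):
    # Sort descending by importance score
    ranked = sorted(named_importances, key=lambda x: x[1], reverse=True)
    # Keep only the k most significant features
    ranked = ranked[:int(k)]
    names = [name for name, _ in ranked]
    values = [score for _, score in ranked]
    return names, values

# Example: top_k_feature_importance([("age", 0.4), ("income", 0.5)], k=1)
# returns (["income"], [0.5])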
def export_bar_table(bar_names, bar_data, title_name):
    """
    Provide a bar graph for bar-type data in the MCenter data scientist view.

    :param bar_names: Bar graph column names (numpy array)
    :param bar_data: Bar graph data (numpy array)
    :param title_name: Title of the bar graph
    :return:
    """
    bar_graph_data = BarGraph().name(title_name).cols(
        bar_names.astype(str).tolist()).data(bar_data.tolist())
    mlops.set_stat(bar_graph_data)
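# Hedged usage sketch for export_bar_table: both array arguments must be numpy
# arrays, since the function calls .astype(...) and .tolist() on them. This
# assumes mlops.init() has already been called by the surrounding pipeline.
import numpy as np

label_names = np.array([0, 1, 2])
label_counts = np.array([10, 5, 7])
export_bar_table(label_names, label_counts, "Example Label Counts")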
def _report_bar_graph_metric(self, metric_meta, metrics):
    cols = []
    data = []
    for related_m, bar_name in metric_meta.related_metric:
        cols.append(bar_name)
        data.append(metrics[related_m.metric_name])

    # Report when at least one value is non-zero, or on the first report even
    # if all values are zero (so the graph appears and keeps updating)
    if not all(v == 0 for v in data) or not metric_meta.metric_already_displayed:
        metric_meta.metric_already_displayed = True
        mlt = BarGraph().name(metric_meta.title).cols(cols).data(data)
        mlops.set_stat(mlt)
def test_bar_graph():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    # Non-numeric data should raise
    with pytest.raises(MLOpsException):
        BarGraph().name("bar").cols(["g1", "g2"]).data(["aa", "bb"])

    # Data without columns should raise
    with pytest.raises(MLOpsException):
        BarGraph().name("bar").data(["aa", "bb"])

    # Mismatched column/data lengths should raise
    with pytest.raises(MLOpsException):
        mlt = BarGraph().name("mlt").cols(["g1"]).data([55, 66])
        pm.set_stat(mlt)

    # A continuous bar graph needs one more edge than data values
    with pytest.raises(MLOpsException):
        mlt_cont = BarGraph().name("mlt").cols([1, 2]).data([55, 66]).as_continuous()
        pm.set_stat(mlt_cont)

    mlt = BarGraph().name("mlt").cols(["g1", "g2"]).data([55, 66])
    pm.set_stat(mlt)

    mlt_cont = BarGraph().name("mlt").cols([1, 2, 3]).data([55, 66]).as_continuous()
    pm.set_stat(mlt_cont)

    pm.done()
def __init__(self, print_interval, stats_type, num_categories, conf_thresh,
             conf_percent, hot_label=True):
    super(CategoricalStatistics, self).__init__(print_interval)
    self._num_categories = num_categories
    self._hot_label = hot_label
    self._stats_type = stats_type
    self._conf_thresh = conf_thresh / 100.0
    self._conf_percent = conf_percent

    # These are useful for development, but should be replaced by mlops library functions
    self._label_hist = []
    self._infer_hist = []
    for i in range(0, self._num_categories):
        self._label_hist.append(0)
        self._infer_hist.append(0)

    if self._stats_type == "python":
        mlops.init(ctx=None, connect_mlops=True, mlops_mode=MLOpsMode.AGENT)
    elif self._stats_type == "file":
        mlops.init(ctx=None, connect_mlops=False, mlops_mode=MLOpsMode.STAND_ALONE)
    else:
        self._stats_type = "none"

    if self._stats_type != "none":
        column_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
        self._infer_tbl = Table().name("categories").cols(column_names)
        self._infer_bar = BarGraph().name("categories bar").cols(column_names)
def _materialize(self, parent_data_objs, user_data):
    for param in parent_data_objs:
        parent_param = "parent param is: {param}".format(param=param)
        print(parent_param)
        self._logger.info(parent_param)

    for k, v in self._params.items():
        params_info = "key: {key} ==> value: {value}".format(key=k, value=v)
        print(params_info)
        self._logger.info(params_info)

    mlt = BarGraph().name("Kenshoo Bar graph example").cols(["bar", "bar2"]).data([1500, 2000])
    mlops.set_stat(mlt)
    return []
def _report_stats(self, file_path):
    self._logger.info(" *** generate stats .. params:{}".format(self._params))
    self._logger.info(" *** Source file {}".format(file_path))

    # Read the file
    data = pd.read_csv(file_path, sep=' |,', header=None, skiprows=1)
    data = data.rename(index=str, columns={1: "label", 2: "confidence0", 3: "confidence1"})

    prediction_distribution = data['label'].value_counts()
    column_names = np.array(prediction_distribution.index).astype(str).tolist()

    # Initialize mlops
    mlops.init()

    # Report a bar graph
    bar = BarGraph().name("Prediction Distribution").cols(
        np.array(prediction_distribution.index).astype(str).tolist()).data(
        prediction_distribution.values.tolist())
    mlops.set_stat(bar)

    # Generate an alert on low confidence if the argument is set to true
    if self._params["alert"]:
        index = data.values[:, 1].astype(int)
        confidence = data.values[:, 2:4]
        confidence_per_prediction = confidence[:, index][:, 0] * 100
        low_conf_percent = len(
            confidence_per_prediction[confidence_per_prediction < self._params["confidence"]]) / len(
            confidence_per_prediction) * 100
        if low_conf_percent > self._params["samples"]:
            msg = "Low confidence: {}% of inferences had confidence below {}%".format(
                low_conf_percent, self._params["confidence"])
            print(msg)
            mlops.health_alert("Low confidence alert", msg)

    mlops.done()
    return []
class ConfidenceTracker(object):
    def __init__(self, track_conf, conf_thresh, conf_percent, output):
        self._track_conf = track_conf
        self._conf_thresh = conf_thresh
        self._conf_percent = conf_percent
        self._output_low_confidence_predictions = output
        self._low_confidence_predictions = 0

        print("track_conf: {}".format(track_conf))
        if track_conf > 0:
            print("conf_thresh: {}".format(conf_thresh))
            print("conf_percent: {}".format(conf_percent))

            categories = ["10", "20", "30", "40", "50", "60", "70", "80", "90", "100"]
            self._conf_hist = []
            for i in range(0, 10):
                self._conf_hist.append(0)

            ## MLOps start
            self._conf_graph = BarGraph().name("Confidence Distribution Bar Graph").cols(categories)
            ## MLOps end

    def check_confidence(self, confidence, sample):
        if self._track_conf == 0:
            return

        conf_bin = int(math.floor(confidence / 10))

        # include 100% confidence in the 90-100 range
        if conf_bin == 10:
            conf_bin = 9

        self._conf_hist[conf_bin] += 1

        if confidence < self._conf_thresh:
            self._low_confidence_predictions += 1

            if self._output_low_confidence_predictions != 0:
                import tensorflow as tf
                import matplotlib
                matplotlib.use('Agg')
                import matplotlib.pyplot as plt

                image = tf.reshape(sample, [28, 28])
                plotData = sample
                plotData = plotData.reshape(28, 28)
                plt.gray()  # use this line if you don't want to see it in color
                plt.imshow(plotData)
                # NOTE: total_predictions and prediction are not defined in this
                # scope in the original snippet; they are presumably globals in
                # the surrounding module.
                plt.savefig("/opt/data-lake/image{}_conf{}_prediction{}.png".format(
                    total_predictions, int(round(confidence)), prediction))

    def report_confidence(self, total_predictions):
        if self._track_conf == 0:
            return

        ## MLOps start
        # Show the prediction distribution as a bar graph
        self._conf_graph.data(self._conf_hist)
        mlops.set_stat(self._conf_graph)
        ## MLOps end

        # Percentage of low confidence predictions in this reporting interval
        low_conf_percent = self._low_confidence_predictions * 100.0 / total_predictions

        print("low confidence predictions: {} ({})%".format(
            self._low_confidence_predictions, low_conf_percent))

        if low_conf_percent > self._conf_percent:
            msg = "Low confidence: {}% of inferences had confidence below {}%".format(
                low_conf_percent, self._conf_thresh)
            print(msg)
            ## MLOps start
            mlops.health_alert("Low confidence alert", msg)
            ## MLOps end

        # Reset counters for the next round; all 10 bins (the original snippet
        # only reset the first 9, leaving the last bin accumulating)
        for i in range(0, 10):
            self._conf_hist[i] = 0
        self._low_confidence_predictions = 0
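# Hedged usage sketch: how a surrounding inference loop might drive
# ConfidenceTracker. Confidence values are illustrative; sample is unused here
# because low-confidence output is disabled (output=0), and mlops.init() is
# assumed to have been called already.
tracker = ConfidenceTracker(track_conf=1, conf_thresh=50, conf_percent=10, output=0)
for confidence in [95.0, 42.0, 88.0, 17.0]:
    tracker.check_confidence(confidence, sample=None)
tracker.report_confidence(total_predictions=4)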
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples:            [{}]".format(pm_options.num_samples))
    print("PM: # Features:           [{}]".format(pm_options.num_features))
    print("PM: # Classes:            [{}]".format(pm_options.num_classes))
    print("PM: C:                    [{}]".format(pm_options.C))
    print("PM: Kernel:               [{}]".format(pm_options.kernel))
    print("PM: Degree:               [{}]".format(pm_options.degree))
    print("PM: Gamma:                [{}]".format(pm_options.gamma))
    print("PM: Tolerance:            [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:   [{}]".format(pm_options.max_iter))
    print("PM: Output model:         [{}]".format(pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    num_classes = int(pm_options.num_classes)

    # Create synthetic data using scikit-learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=num_classes,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      probability=True,
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T

    # Output actual label distribution as a BarGraph using MCenter
    bar = BarGraph().name("User Defined: Actual Label Distribution") \
        .cols((label_distribution[:, 0]).astype(str).tolist()) \
        .data((label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    pos_label = 1

    # Classification predictions
    labels_pred = final_model.predict(features)
    # Decision scores [n_sample, n_class]
    labels_decision_score = final_model.decision_function(features)
    # Classification probabilities [n_sample, n_class]
    labels_prob = final_model.predict_proba(features)
    # Classification probabilities of the positive label
    label_pos_class_prob = list(map(lambda x: x[pos_label], labels_prob))
    # Sorted list of labels, i.e. [0, 1, 2, ...]
    labels_ordered = sorted(set(labels))

    value_pred, counts_pred = np.unique(labels_pred, return_counts=True)
    label_distribution_pred = np.asarray((value_pred, counts_pred)).T

    # Output prediction label distribution as a BarGraph using MCenter
    bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \
        .data((label_distribution_pred[:, 1]).tolist())
    mlops.set_stat(bar_pred)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    # Each metric below is reported in up to three equivalent ways: the old way
    # (a user-defined stat, left commented out), the new way using a predefined
    # ClassificationMetrics key, and the new way using the mlops.metrics
    # sklearn-style wrapper.

    # --- Accuracy ---
    accuracy = final_model.score(features, labels)
    # Old way:
    # mlops.set_stat("User Defined: Accuracy", accuracy, st.TIME_SERIES)
    # New way:
    mlops.set_stat(ClassificationMetrics.ACCURACY_SCORE, accuracy)
    # or:
    mlops.metrics.accuracy_score(y_true=labels, y_pred=labels_pred)

    # --- AUC ---
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels, labels_pred, pos_label=pos_label)
    auc = sklearn.metrics.auc(fpr, tpr)
    # Old way:
    # mlops.set_stat("User Defined: AUC", auc)
    # New way:
    mlops.set_stat(ClassificationMetrics.AUC, auc)
    # or:
    mlops.metrics.auc(x=fpr, y=tpr)

    # --- Average Precision Score (not supported for multiclass) ---
    if len(labels_ordered) <= 2:
        aps = sklearn.metrics.average_precision_score(labels, labels_decision_score)
        # Old way:
        # mlops.set_stat("User Defined: Average Precision Score", aps)
        # New way:
        mlops.set_stat(ClassificationMetrics.AVERAGE_PRECISION_SCORE, aps)
        # or:
        mlops.metrics.average_precision_score(y_true=labels, y_score=labels_decision_score)

    # --- Balanced Accuracy ---
    bas = sklearn.metrics.balanced_accuracy_score(labels, labels_pred)
    # Old way:
    # mlops.set_stat("User Defined: Balanced Accuracy Score", bas)
    # New way:
    mlops.set_stat(ClassificationMetrics.BALANCED_ACCURACY_SCORE, data=bas)
    # or:
    mlops.metrics.balanced_accuracy_score(y_true=labels, y_pred=labels_pred)

    # --- Brier Score Loss ---
    bsl = sklearn.metrics.brier_score_loss(labels, label_pos_class_prob, pos_label=pos_label)
    # Old way:
    # mlops.set_stat("User Defined: Brier Score Loss", bsl)
    # New way:
    mlops.set_stat(ClassificationMetrics.BRIER_SCORE_LOSS, data=bsl)
    # or:
    mlops.metrics.brier_score_loss(y_true=labels, y_prob=label_pos_class_prob, pos_label=pos_label)

    # --- Classification Report ---
    cr = sklearn.metrics.classification_report(labels, labels_pred)
    print("Classification Report\n{}".format(cr))
    # Old way: build a Table manually from the report text
    # from parallelm.mlops.stats.table import Table
    #
    # arrayReport = list()
    # for row in cr.split("\n"):
    #     parsed_row = [x for x in row.split(" ") if len(x) > 0]
    #     if len(parsed_row) > 0:
    #         arrayReport.append(parsed_row)
    #
    # header = arrayReport[0]
    # cr_table = Table().name("User Defined: Classification Report").cols(header)
    #
    # for index in range(1, len(arrayReport)):
    #     row_title = arrayReport[index][0]
    #     row_value = arrayReport[index][:-1]
    #     cr_table.add_row(row_title, row_value)
    #
    # mlops.set_stat(cr_table)
    # New way:
    mlops.set_stat(ClassificationMetrics.CLASSIFICATION_REPORT, data=cr)
    # or:
    mlops.metrics.classification_report(labels, labels_pred)

    # --- Cohen Kappa Score ---
    cks = sklearn.metrics.cohen_kappa_score(labels, labels_pred)
    # Old way:
    # mlops.set_stat("User Defined: Cohen Kappa Score", cks)
    # New way:
    mlops.set_stat(ClassificationMetrics.COHEN_KAPPA_SCORE, data=cks)
    # or:
    mlops.metrics.cohen_kappa_score(labels, labels_pred)

    # --- Confusion Matrix ---
    cm = sklearn.metrics.confusion_matrix(labels, labels_pred, labels=labels_ordered)
    # Old way: build a Table manually
    # from parallelm.mlops.stats.table import Table
    # labels_string = [str(i) for i in labels_ordered]
    # cm_matrix = Table().name("User Defined: Confusion Matrix").cols(labels_string)
    #
    # for index in range(len(cm)):
    #     cm_matrix.add_row(labels_string[index], list(cm[index]))
    #
    # mlops.set_stat(cm_matrix)
    # New way:
    mlops.set_stat(ClassificationMetrics.CONFUSION_MATRIX, cm, labels=labels_ordered)
    # or:
    mlops.metrics.confusion_matrix(y_true=labels, y_pred=labels_pred, labels=labels_ordered)

    # --- F1 Score ---
    f1 = sklearn.metrics.f1_score(labels, labels_pred, pos_label=pos_label, average=None)
    # Old way:
    # mlops.set_stat("User Defined: F1 Score", f1)
    # New way:
    mlops.set_stat(ClassificationMetrics.F1_SCORE, data=f1)
    # or:
    mlops.metrics.f1_score(labels, labels_pred, pos_label=pos_label, average=None)

    # --- F-beta Score ---
    fbeta = sklearn.metrics.fbeta_score(labels, labels_pred, beta=0.5, average=None)
    # Old way:
    # mlops.set_stat("User Defined: F-beta Score", fbeta)
    # New way:
    mlops.set_stat(ClassificationMetrics.FBETA_SCORE, data=fbeta)
    # or:
    mlops.metrics.fbeta_score(labels, labels_pred, pos_label=pos_label, beta=0.5, average=None)

    # --- Hamming Loss ---
    hamming_loss = sklearn.metrics.hamming_loss(labels, labels_pred)
    # Old way:
    # mlops.set_stat("User Defined: Hamming Loss", hamming_loss)
    # New way:
    mlops.set_stat(ClassificationMetrics.HAMMING_LOSS, data=hamming_loss)
    # or:
    mlops.metrics.hamming_loss(labels, labels_pred)

    # --- Hinge Loss ---
    hinge_loss = sklearn.metrics.hinge_loss(labels, labels_decision_score)
    # Old way:
    # mlops.set_stat("User Defined: Hinge Loss", hinge_loss)
    # New way:
    mlops.set_stat(ClassificationMetrics.HINGE_LOSS, data=hinge_loss)
    # or:
    mlops.metrics.hinge_loss(labels, labels_decision_score)

    # --- Jaccard Similarity Score ---
    jaccard_sim_score = sklearn.metrics.jaccard_similarity_score(labels, labels_pred)
    # Old way:
    # mlops.set_stat("User Defined: Jaccard Similarity Score", jaccard_sim_score)
    # New way:
    mlops.set_stat(ClassificationMetrics.JACCARD_SIMILARITY_SCORE, data=jaccard_sim_score)
    # or:
    mlops.metrics.jaccard_similarity_score(labels, labels_pred)

    # --- Log Loss ---
    log_loss = sklearn.metrics.log_loss(labels, labels_prob)
    # Old way:
    # mlops.set_stat("User Defined: Log Loss", log_loss)
    # New way:
    mlops.set_stat(ClassificationMetrics.LOG_LOSS, data=log_loss)
    # or:
    mlops.metrics.log_loss(labels, labels_prob)

    # --- Matthews Correlation Coefficient ---
    mcc = sklearn.metrics.matthews_corrcoef(labels, labels_pred)
    # Old way:
    # mlops.set_stat("User Defined: Matthews Correlation Coefficient", mcc)
    # New way:
    mlops.set_stat(ClassificationMetrics.MATTHEWS_CORRELATION_COEFFICIENT, data=mcc)
    # or:
    mlops.metrics.matthews_corrcoef(labels, labels_pred)

    # --- Precision Recall Curve (not supported for multiclass) ---
    if len(labels_ordered) <= 2:
        precision, recall, thresholds = sklearn.metrics.precision_recall_curve(
            labels, labels_decision_score, pos_label=pos_label)
        classes = len(labels_ordered)
        average_precision = sklearn.metrics.average_precision_score(
            labels, labels_decision_score, average="macro")
        graph_label_str = "{}-class Precision Recall Curve -- AP: {}".format(
            classes, average_precision)
        # Old way: build a Graph manually
        # from parallelm.mlops.stats.graph import Graph
        #
        # p_r_curve = Graph() \
        #     .name("User Defined: Precision Recall Curve") \
        #     .set_x_series(list(recall)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(precision))
        #
        # p_r_curve.x_title("Recall")
        # p_r_curve.y_title("Precision")
        # mlops.set_stat(p_r_curve)
        # New way:
        mlops.set_stat(ClassificationMetrics.PRECISION_RECALL_CURVE, [precision, recall],
                       legend=graph_label_str)
        # or:
        mlops.metrics.precision_recall_curve(y_true=labels,
                                             probas_pred=labels_decision_score,
                                             pos_label=pos_label,
                                             average="macro")

    # --- Precision Score ---
    precision_score = sklearn.metrics.precision_score(labels, labels_pred,
                                                      pos_label=pos_label, average=None)
    # Old way:
    # mlops.set_stat("User Defined: Precision Score", precision_score)
    # New way:
    mlops.set_stat(ClassificationMetrics.PRECISION_SCORE, data=precision_score)
    # or:
    mlops.metrics.precision_score(labels, labels_pred, pos_label=pos_label, average=None)

    # --- Recall Score ---
    recall_score = sklearn.metrics.recall_score(labels, labels_pred,
                                                pos_label=pos_label, average=None)
    # Old way:
    # mlops.set_stat("User Defined: Recall Score", recall_score)
    # New way:
    mlops.set_stat(ClassificationMetrics.RECALL_SCORE, data=recall_score)
    # or:
    mlops.metrics.recall_score(labels, labels_pred, pos_label=pos_label, average=None)

    # --- ROC AUC Score (not supported for multiclass) ---
    if len(labels_ordered) <= 2:
        roc_auc_score = sklearn.metrics.roc_auc_score(labels, labels_decision_score)
        # Old way:
        # mlops.set_stat("User Defined: ROC AUC Score", roc_auc_score)
        # New way:
        mlops.set_stat(ClassificationMetrics.ROC_AUC_SCORE, data=roc_auc_score)
        # or:
        mlops.metrics.roc_auc_score(labels, labels_decision_score)

    # --- ROC Curve (not supported for multiclass) ---
    if len(labels_ordered) <= 2:
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels, labels_decision_score,
                                                         pos_label=pos_label)
        roc_auc_score = sklearn.metrics.roc_auc_score(labels, labels_decision_score)
        graph_label_str = "ROC Curve, AUC: {}".format(roc_auc_score)
        # Old way: build a Graph manually
        # from parallelm.mlops.stats.graph import Graph
        #
        # roc_curve = Graph() \
        #     .name("User Defined: ROC Curve") \
        #     .set_x_series(list(fpr)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(tpr))
        #
        # roc_curve.x_title("False Positive Rate")
        # roc_curve.y_title("True Positive Rate")
        # mlops.set_stat(roc_curve)
        # New way:
        mlops.set_stat(ClassificationMetrics.ROC_CURVE, [tpr, fpr], legend=graph_label_str)
        # or:
        mlops.metrics.roc_curve(y_true=labels, y_score=labels_decision_score, pos_label=pos_label)

    # --- Zero One Loss ---
    zol = sklearn.metrics.zero_one_loss(labels, labels_pred)
    # Old way:
    # mlops.set_stat("User Defined: Zero One Loss", zol)
    # New way:
    mlops.set_stat(ClassificationMetrics.ZERO_ONE_LOSS, data=zol)
    # or:
    mlops.metrics.zero_one_loss(labels, labels_pred)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
def main():
    # Initialize Spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # Parse the arguments to the component
    options = parse_args()

    # Load the model; exit gracefully if the model is not found
    try:
        model_rf = \
            SparkPipelineModelHelper() \
            .set_shared_context(spark_context=spark.sparkContext) \
            .set_local_path(local_path=options.input_model) \
            .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
            .load_sparkml_model()
    except Exception as e:
        print(e)
        mlops.done()
        spark.sparkContext.stop()
        exit()

    # Generate synthetic data for inference (Gaussian, Poisson and Beta distributions)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0], num_samples, replace=False)]

    feature_names = ["".join(ascii_lowercase[a]) for a in range(num_features + 1)]

    # Create a Spark dataframe from the synthetic data generated
    inferenceData = spark.createDataFrame(
        pd.DataFrame(test_features, columns=feature_names[1:num_features + 1]))

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the ones reported during training to
    # generate the similarity score
    mlops.set_data_distribution_stat(inferenceData)

    num_samples = inferenceData.count()

    # Report the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples, st.TIME_SERIES)

    # Make inference predictions
    predicted_df = model_rf.transform(inferenceData)

    # Create a bar graph with label and confidence distributions
    histogram_predictions = predicted_df.groupby("prediction").count()
    prediction_values = np.array(histogram_predictions.select("prediction").collect())
    prediction_counts = np.array(histogram_predictions.select("count").collect())

    # Report label distribution as a BarGraph using MCenter
    bar_predictions = BarGraph().name("Prediction Distribution").cols(
        (prediction_values[0]).astype(str).tolist()).data(
        (prediction_counts[0]).tolist())
    mlops.set_stat(bar_predictions)

    # Stop the Spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples:            [{}]".format(pm_options.num_samples))
    print("PM: # Features:           [{}]".format(pm_options.num_features))
    print("PM: C:                    [{}]".format(pm_options.C))
    print("PM: Kernel:               [{}]".format(pm_options.kernel))
    print("PM: Degree:               [{}]".format(pm_options.degree))
    print("PM: Gamma:                [{}]".format(pm_options.gamma))
    print("PM: Tolerance:            [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:   [{}]".format(pm_options.max_iter))
    print("PM: Output model:         [{}]".format(pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    # Create synthetic data using scikit-learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=3,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    # Accuracy for the chosen model
    accuracy = final_model.score(features, labels)
    print("Accuracy values: \n {0}".format(accuracy))

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
def canary_comparator(options, start_time, end_time, mode):
    sc = None
    if mode == RunModes.PYSPARK:
        from pyspark import SparkContext
        sc = SparkContext(appName="canary-comparator")
        mlops.init(sc)
    elif mode == RunModes.PYTHON:
        mlops.init()
    else:
        raise Exception("Invalid mode " + mode)

    not_enough_data = False

    # Main and canary component names
    main_prediction_component_name = options.nodeA
    canary_prediction_component_name = options.nodeB

    main_stat_name = options.predictionHistogramA
    canary_stat_name = options.predictionHistogramB

    main_agent = utils._get_agent_id(main_prediction_component_name, options.agentA)
    canary_agent = utils._get_agent_id(canary_prediction_component_name, options.agentB)
    if main_agent is None or canary_agent is None:
        print("Invalid agent provided {} or {}".format(options.agentA, options.agentB))
        mlops.system_alert("PyException",
                           "Invalid Agent {} or {}".format(options.agentA, options.agentB))
        return

    try:
        main_data_frame = mlops.get_stats(name=main_stat_name,
                                          mlapp_node=main_prediction_component_name,
                                          agent=main_agent,
                                          start_time=start_time,
                                          end_time=end_time)
        canary_data_frame = mlops.get_stats(name=canary_stat_name,
                                            mlapp_node=canary_prediction_component_name,
                                            agent=canary_agent,
                                            start_time=start_time,
                                            end_time=end_time)

        main_pdf = pd.DataFrame(main_data_frame)
        canary_pdf = pd.DataFrame(canary_data_frame)

        try:
            row1 = main_pdf.tail(1).iloc[0]
            row2 = canary_pdf.tail(1).iloc[0]
        except Exception as e:
            not_enough_data = True
            print("Not enough histograms produced in pipelines")
            raise ValueError("Not enough data to compare")

        if row1['hist_type'] != row2['hist_type']:
            raise ValueError("Canary and main pipelines don't produce histograms "
                             "of the same type: {} != {}".format(row1['hist_type'],
                                                                 row2['hist_type']))

        if row1['hist_type'] == 'continuous':
            rmse = _compare_cont_hist(row1['bin_edges'], row2['bin_edges'],
                                      row1['hist_values'], row2['hist_values'])

            gg2 = MultiGraph().name("Prediction Histograms").set_categorical()
            gg2.x_title("Predictions")
            gg2.y_title("Normalized Frequency")
            gg2.add_series(label="Main",
                           x=[float(x) for x in row1['bin_edges']][:-1],
                           y=[y for y in row1['hist_values']])
            gg2.add_series(label="Canary",
                           x=[float(x) for x in row2['bin_edges']][:-1],
                           y=[y for y in row2['hist_values']])
            mlops.set_stat(gg2)

            bar1 = BarGraph().name("Main Pipeline").cols(
                ["{} to {}".format(x, y) for (x, y) in pairwise(row1['bin_edges'])]).data(
                [x for x in row1['hist_values']])
            mlops.set_stat(bar1)

            bar2 = BarGraph().name("Canary Pipeline").cols(
                ["{} to {}".format(x, y) for (x, y) in pairwise(row2['bin_edges'])]).data(
                [x for x in row2['hist_values']])
            mlops.set_stat(bar2)

        elif row1['hist_type'] == 'categorical':
            rmse = _compare_cat_hist(row1['bin_edges'], row2['bin_edges'],
                                     row1['hist_values'], row2['hist_values'])

            gg2 = MultiGraph().name("Prediction Histograms").set_categorical()
            gg2.x_title("Predictions")
            gg2.y_title("Normalized Frequency")
            gg2.add_series(label="Main", x=row1['bin_edges'],
                           y=[y for y in row1['hist_values']])
            gg2.add_series(label="Canary", x=row2['bin_edges'],
                           y=[y for y in row2['hist_values']])
            mlops.set_stat(gg2)

            bar1 = BarGraph().name("Main Pipeline").cols(
                ["{}".format(x) for x in row1['bin_edges']]).data(
                [x for x in row1['hist_values']])
            mlops.set_stat(bar1)

            bar2 = BarGraph().name("Canary Pipeline").cols(
                ["{}".format(x) for x in row2['bin_edges']]).data(
                [x for x in row2['hist_values']])
            mlops.set_stat(bar2)

        else:
            raise ValueError("Invalid histogram type: {}".format(row1['hist_type']))

        mlops.set_stat("RMSE", rmse, st.TIME_SERIES)
        print("mlops policy {}".format(mlops.mlapp_policy))

        if mlops.mlapp_policy.canary_threshold is None:
            print("Canary health threshold not set")
            raise ValueError("Canary health threshold not set in config")

        # The following code compares the histograms.
        # You can insert your own comparison logic here.
        if rmse > mlops.mlapp_policy.canary_threshold:
            print("Canary Alert {} > {}".format(rmse, mlops.mlapp_policy.canary_threshold))
            mlops.event(CanaryAlert(label="CanaryAlert", is_healthy=False,
                                    score=rmse, threshold=mlops.mlapp_policy.canary_threshold))
        else:
            print("Data matches {}".format(rmse))
            mlops.event(CanaryAlert(label="CanaryAlert", is_healthy=True,
                                    score=rmse, threshold=mlops.mlapp_policy.canary_threshold))

    except Exception as e:
        if not_enough_data is False:
            print("Got exception while getting stats: {}".format(e))
            mlops.system_alert("PyException",
                               "Got exception {}".format(traceback.format_exc()))

    if mode == RunModes.PYSPARK:
        sc.stop()
    mlops.done()
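# The comparator above relies on _compare_cat_hist and _compare_cont_hist, which
# are not shown in this snippet. A plausible minimal sketch for the categorical
# case (an assumption, not the source implementation): align bins by name, treat
# missing bins as zero, and take the RMSE of the two histograms.
import math

def compare_cat_hist_sketch(bins_a, bins_b, values_a, values_b):
    hist_a = dict(zip(bins_a, values_a))
    hist_b = dict(zip(bins_b, values_b))
    all_bins = set(hist_a) | set(hist_b)
    sq_err = [(hist_a.get(b, 0.0) - hist_b.get(b, 0.0)) ** 2 for b in all_bins]
    return math.sqrt(sum(sq_err) / len(all_bins))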
def infer_loop(model, input, output_file, stats_interval, conf_thresh, conf_percent):
    output = open(output_file, "w")

    # Initialize statistics
    total_predictions = 0
    low_confidence_predictions = 0
    categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    prediction_hist = []
    for i in range(0, len(categories)):
        prediction_hist.append(0)

    ### MLOPS start
    # Create a bar graph and table for reporting prediction distributions
    # and set the column names
    infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols(categories)
    infer_tbl = Table().name("Prediction Distribution Table").cols(categories)
    ### MLOPS end

    while True:
        try:
            sample, label = input.get_next_input()
            sample_np = ny.array(sample).reshape(1, -1)

            # The prediction is the class with the highest probability
            prediction = model.predict(sample_np)

            # Append the prediction to the output file
            output.write("{}\n".format(prediction))

            # Calculate statistics
            total_predictions += 1
            prediction_hist[int(prediction[0])] += 1

            # Report statistics
            if total_predictions % stats_interval == 0:
                # Report the prediction distribution
                for i in range(0, len(categories)):
                    print("category: {} predictions: {}".format(categories[i],
                                                                prediction_hist[i]))

                ### MLOPS start
                # Show the prediction distribution as a table
                infer_tbl.add_row(str(total_predictions), prediction_hist)

                # Show the prediction distribution as a bar graph
                infer_bar.data(prediction_hist)

        except EOFError:
            # Stop when we hit the end of input and report the stats
            mlops.set_stat(infer_tbl)
            mlops.set_stat(infer_bar)
            ### MLOPS end

            output.close()

            ### MLOPS start
            mlops.done()
            ### MLOPS end

            break
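# The loop above assumes an input source exposing get_next_input(), returning
# (sample, label) pairs and raising EOFError when exhausted. A minimal sketch
# of such a source (an assumption, not the original component):
class ListInputSource(object):
    def __init__(self, samples_and_labels):
        self._items = list(samples_and_labels)

    def get_next_input(self):
        # Raise EOFError once all items are consumed, ending the infer loop
        if not self._items:
            raise EOFError
        return self._items.pop(0)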
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: Data file:            [{}]".format(pm_options.data_file))
    print("PM: Output model:         [{}]".format(pm_options.output_model))
    print("PM: regularization_range: [{}]".format(pm_options.regularization_range))

    mlops.init()

    # Read the Samsung datafile
    dataset = pd.read_csv(pm_options.data_file)

    # Separate into features and labels
    features = dataset.iloc[:, 1:].values
    labels = dataset.iloc[:, 0].values

    # Hyper-parameter search using k-fold cross-validation
    regularization_range = pm_options.regularization_range.split(',')
    regularization = [float(regularization_var) for regularization_var in regularization_range]
    tune_parameters = [{'C': regularization}]

    # Initialize the logistic regression algorithm
    LR = LogisticRegression(class_weight='balanced', multi_class='multinomial', solver='lbfgs')
    clf = GridSearchCV(LR, tune_parameters, cv=5, scoring='accuracy')
    clf.fit(features, labels)
    print("best parameter = ", clf.best_params_)
    accuracy = clf.cv_results_['mean_test_score']
    print('Accuracy values: \n {0} \n for Regularization values: \n{1}'.format(
        accuracy, regularization))

    ########## Start of ParallelM instrumentation ##############
    # Report the hyper-parameter search results as a Table
    tbl = Table().name("Hyper-parameter Search Results").cols(
        ["Mean accuracy from k-fold cross-validation"])
    print("length of regularization", len(regularization))
    index_max = np.argmax(accuracy)
    for a in range(0, len(regularization)):
        print("adding row", regularization[a])
        if a == index_max:
            tbl.add_row("[Best] Regularization = " + str(regularization[a]), [accuracy[a]])
        else:
            tbl.add_row("Regularization = " + str(regularization[a]), [accuracy[a]])
    mlops.set_stat(tbl)
    ########## End of ParallelM instrumentation ##############

    # Label distribution in training
    label_distribution = dataset['label'].value_counts()
    column_names = np.array(label_distribution.index).astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    ########## Start of ParallelM instrumentation ##############
    # Report label distribution as a BarGraph
    bar = BarGraph().name("Label Distribution").cols(
        np.array(label_distribution.index).astype(str).tolist()).data(
        label_distribution.values.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ##############

    ########## Start of ParallelM instrumentation ##############
    # Report accuracy of the chosen model
    mlops.set_stat("K-fold cross-validation Accuracy", accuracy[index_max], st.TIME_SERIES)
    ########## End of ParallelM instrumentation ##############

    # Histogram input
    mlops.set_data_distribution_stat(dataset)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(clf, model_file)
    model_file.close()

    mlops.done()
def kmeans_train(pm_options, spark):
    """
    KMeans training function

    :param pm_options:
    :param spark:
    :return:
    """

    # Import data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    # If the data doesn't have headers, create column names c0-cn
    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()

    # Set both train and test data to the entire dataset
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    # Create column names for the vector assembler; handle excluded columns
    exclude_cols = []  # no columns to exclude - kmeans over all columns
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    # Set hyper-parameter search parameters
    k_range = pm_options.KRange.split(',')
    db_index_max = np.finfo(np.float64).max
    k_max = k_range[0]
    db_index_array = np.zeros(len(k_range))

    for index_hs in range(0, len(k_range)):
        vector_assembler = VectorAssembler(inputCols=input_col_names, outputCol="features")
        kmeans_pipe = KMeans(k=int(k_range[index_hs]),
                             initMode="k-means||",
                             initSteps=5,
                             tol=1e-4,
                             maxIter=100,
                             featuresCol="features")
        full_pipe = [vector_assembler, kmeans_pipe]
        model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

        # Test validation and statistics collection
        ############################################################
        predicted_df = model_kmeans.transform(input_test)

        print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

        sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
        print("Sum of Errors for Kmeans = " + str(sum_errors))

        kmeans_centers = model_kmeans.stages[1].clusterCenters()
        print("Kmeans Centers: ")
        for center in kmeans_centers:
            print(center)

        # Calculating stats
        ############################################################

        # Calculating inter-cluster distance
        inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                inter_cluster_distance[centerIndex1, centerIndex2] = \
                    eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])
        print("inter_cluster_distance = ", inter_cluster_distance)

        # Calculating intra-cluster distances and the bars for the cluster distribution
        intra_cluster_distance = np.zeros(len(kmeans_centers))
        cluster_dist = np.zeros(len(kmeans_centers))
        for centerIndex1 in range(0, len(kmeans_centers)):
            filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
            cluster_dist[centerIndex1] = filtered_df.count()
            if cluster_dist[centerIndex1] == 0:
                intra_cluster_distance[centerIndex1] = 0
            else:
                filtered_df = filtered_df.withColumn(
                    'distance',
                    udf(eq_dist, FloatType())(col("features"),
                                              array([lit(v) for v in kmeans_centers[centerIndex1]])))
                intra_cluster_distance[centerIndex1] = \
                    filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

        # Calculating the Davies-Bouldin index
        ############################################################
        # R[i,j] = (S[i] + S[j]) / M[i,j]
        # D[i] = max(R[i,j]) for i != j
        # DB = (1/K) * sum(D[i])
        r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                r_index[centerIndex1, centerIndex2] = 0
                if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                    r_index[centerIndex1, centerIndex2] = \
                        (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                        / inter_cluster_distance[centerIndex1, centerIndex2]
        d_index = np.max(r_index, axis=0)
        db_index = np.sum(d_index, axis=0) / len(kmeans_centers)
        db_index_array[index_hs] = db_index

        # Check for the hyper-parameter search optimum (lowest DB index)
        if db_index < db_index_max:
            db_index_max = db_index
            k_max = k_range[index_hs]
            model_kmeans_max = model_kmeans
            sum_errors_max = sum_errors
            kmeans_centers_max = kmeans_centers
            inter_cluster_distance_max = inter_cluster_distance
            intra_cluster_distance_max = intra_cluster_distance
            cluster_dist_max = cluster_dist

    # PM stats
    ############################################################
    print("Optimal K = " + str(k_max))
    pm.set_stat("Optimal number of clusters", k_max, st.TIME_SERIES)

    print("Sum of Errors for Kmeans = " + str(sum_errors_max))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors_max, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index_max))
    pm.set_stat("Davies-Bouldin index", db_index_max, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(k_range)):
        tbl_col_name.append(str(k_range[j]))
    tbl = Table().name("Davies-Bouldin index for hyper parameter search").cols(tbl_col_name)
    tbl.add_row("Davies-Bouldin index:", ["%.2f" % x for x in db_index_array])
    pm.set_stat(tbl)

    tbl_col_name = []
    for j in range(0, len(kmeans_centers_max)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers_max)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance_max[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance_max])
    pm.set_stat(tbl)

    if (len(kmeans_centers_max) < 6) & (len(kmeans_centers_max[0]) < 12):
        tbl_col_name1 = []
        for j in range(0, len(kmeans_centers_max[0])):
            tbl_col_name1.append(str(j))
        tbl = Table().name("Centers (for K<6, Attr<12)").cols(tbl_col_name1)
        for j in range(0, len(kmeans_centers_max)):
            tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers_max[j]])
        pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Distribution").cols(tbl_col_name).data(cluster_dist_max.tolist())
    pm.set_stat(bar)

    return model_kmeans_max
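# Standalone sketch of the Davies-Bouldin computation used above, mirroring the
# commented formula: R[i,j] = (S[i] + S[j]) / M[i,j], D[i] = max_j R[i,j], and
# DB = (1/K) * sum(D[i]). Inputs: intra-cluster distances S (length K) and the
# inter-cluster distance matrix M (K x K). The function name is illustrative.
import numpy as np

def davies_bouldin_sketch(intra, inter):
    k = len(intra)
    r = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            if i != j and inter[i, j] != 0:
                r[i, j] = (intra[i] + intra[j]) / inter[i, j]
    d = np.max(r, axis=1)  # worst-case cluster similarity per cluster
    return float(np.sum(d)) / k

# Example: two well-separated clusters with small spread give a low DB index:
# davies_bouldin_sketch(np.array([0.5, 0.5]), np.array([[0.0, 5.0], [5.0, 0.0]]))
# returns 0.2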
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    print("PM: Configuration:")
    print("PM: Step size: [{}]".format(args.step_size))
    print("PM: Iterations: [{}]".format(args.iterations))
    print("PM: Model version: [{}]".format(args.model_version))
    print("PM: Stats interval: [{}]".format(args.stats_interval))
    print("PM: Save dir: [{}]".format(args.save_dir))

    # Initialize MLOps Library
    mlops.init()

    # Print the number of iterations used by the optimization algorithm
    print('Training for %i iterations' % args.iterations)

    # Create synthetic data using scikit-learn
    num_samples = 50
    num_features = 20

    features, labels = make_classification(n_samples=50,
                                           n_features=20,
                                           n_informative=2,
                                           n_redundant=1,
                                           n_classes=3,
                                           n_clusters_per_class=1,
                                           random_state=42)

    # Add noise to the data
    noisy_features = np.random.uniform(0, 5) * np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    num_features = (features.shape[1])
    num_labels = len(np.unique(labels))

    # One-hot encode labels for all data
    onehot_labels = np.eye(num_labels)[labels]

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(features)

    # Algorithm parameters parsed from arguments
    learning_rate = args.step_size
    training_epochs = args.iterations
    display_step = args.stats_interval

    # tf Graph Input
    x = tf.placeholder(tf.float32, [None, num_features], name="features")
    y = tf.placeholder(tf.float32, [None, num_labels], name="labels")

    # Set model weights
    W = tf.Variable(tf.zeros([num_features, num_labels]))
    b = tf.Variable(tf.zeros([num_labels]))

    # Store values for saving model
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')

    # Construct model
    pred = tf.nn.softmax(tf.matmul(x, W) + b, name="predictions")  # Softmax

    # Minimize error using cross entropy
    cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

    # Gradient Descent
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

    # Evaluation
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(pred, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Start timer
    training_start_time = time.time()

    # Initialize the variables in a tf session
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    iteration_array = []
    cost_array = []
    accuracy_array = []

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0
        temp, c, a = sess.run([optimizer, cost, accuracy],
                              feed_dict={x: features, y: onehot_labels})
        # Compute average loss
        avg_cost += c / num_samples

        # Display logs per epoch step
        if (epoch + 1) % display_step == 0:
            iteration_array.append(epoch)
            cost_array.append(avg_cost)
            accuracy_array.append(np.float(a))
            print("accuracy", a)
            print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))

    # Plot the cost function using MCenter
    # (the x series is the iteration count; the y series is the average cost)
    gg = Graph().name("Cost function across epochs").set_x_series(
        iteration_array).add_y_series(label="Cost Function Across Iterations",
                                      data=cost_array)
    gg.x_title('Iterations')
    gg.y_title("Average Cost")
    mlops.set_stat(gg)

    # Plot the accuracy function using MCenter
    gg1 = Graph().name("Accuracy across epochs").set_x_series(
        iteration_array).add_y_series(label="Accuracy Across Iterations",
                                      data=accuracy_array)
    gg1.x_title('Iterations')
    gg1.y_title("Accuracy")
    mlops.set_stat(gg1)

    # Plot accuracy and cost across epochs using MCenter
    mg = MultiGraph().name("Cost and Accuracy Progress Across Epochs")
    mg.add_series(x=iteration_array, label="Cost Function Across Iterations", y=cost_array)
    mg.add_series(x=iteration_array, label="Accuracy across epochs", y=accuracy_array)
    mlops.set_stat(mg)

    # Plot final cost and accuracy in this session using MCenter
    mlt = MultiLineGraph().name("Final Accuracy and Cost").labels(["Cost", "Accuracy"])
    mlt.data([cost_array[-1], accuracy_array[-1]])
    mlops.set_stat(mlt)

    # Save the model
    export_path = args.save_dir
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    values, indices = tf.nn.top_k(y, num_labels)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(
        tf.constant([str(i) for i in range(num_labels)]))
    prediction_classes = table.lookup(tf.to_int64(indices))

    # Build the signature_def_map.
    classification_inputs = tf.saved_model.utils.build_tensor_info(serialized_tf_example)
    classification_outputs_classes = tf.saved_model.utils.build_tensor_info(prediction_classes)
    classification_outputs_scores = tf.saved_model.utils.build_tensor_info(values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classification_inputs
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classification_outputs_classes,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                    classification_outputs_scores
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

    tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
    tensor_info_y = tf.saved_model.utils.build_tensor_info(y)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': tensor_info_x},
            outputs={'outputs': tensor_info_y},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images': prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save(as_text=args.use_text)
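# NOTE: add_parameters() is referenced above but not defined in this section.
# A minimal sketch is given here, assuming only the flags the example actually
# reads (step_size, iterations, model_version, stats_interval, save_dir, use_text);
# the default values are illustrative, not the original ones.
def add_parameters(parser):
    parser.add_argument("--step_size", type=float, default=0.01, help="gradient descent step size")
    parser.add_argument("--iterations", type=int, default=100, help="number of training epochs")
    parser.add_argument("--model_version", default="1", help="version tag for the exported model")
    parser.add_argument("--stats_interval", type=int, default=10, help="epochs between stat reports")
    parser.add_argument("--save_dir", default="/tmp/tf_model", help="SavedModel export directory")
    parser.add_argument("--use_text", action="store_true", help="save the SavedModel as text")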
def _prep_and_infer(self, df_dataset):
    # Get number of features
    self.num_features = df_dataset.shape[1]
    # Get number of samples
    self.num_samples = df_dataset.shape[0]
    # Get input model
    self.input_model = self._params["input-model"]

    self._logger.info("PM: Configuration:")
    self._logger.info("PM: # Samples: [{}]".format(self.num_samples))
    self._logger.info("PM: # Features: [{}]".format(self.num_features))
    self._logger.info("PM: # Input-Model: [{}]".format(self.input_model))

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if self.input_model is not None:
        try:
            filename = self._params["input-model"]
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            self._logger.error("Model not found")
            self._logger.error("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    features = df_dataset

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the ones reported during training
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features), st.TIME_SERIES)

    # Predictions and probabilities for the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    self._logger.info("Pred Labels: {}".format(pred_labels))        # Remove: printout can be huge
    self._logger.info("Pred Probabilities: {}".format(pred_probs))  # Remove: printout can be huge

    # Prediction label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    self._logger.info("Pred Label distributions: \n {}".format(pred_label_distribution))

    # Output prediction label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
        (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Prediction confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        self._logger.info("np.sum(confidence[index_class]) {}".format(
            np.sum(max_pred_probs[index_class])))
        self._logger.info("counts_elements[i] {}".format(pred_counts[i]))
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class]) / float(pred_counts[i])
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(
        pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # Terminate MLOps
    mlops.done()

    df_result = pd.concat([
        df_dataset,
        pd.DataFrame({'predict': pred_labels}),
        pd.DataFrame({
            'probs-0': pred_probs[:, 0],
            'probs-1': pred_probs[:, 1]
        })
    ], axis=1)
    df_result.insert(0, 'idx',
                     [x for x in range(1, df_result.shape[0] + 1)],
                     allow_duplicates=False)

    return df_result
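# NOTE: _prep_and_infer() above is a method on a component object that supplies
# self._params and self._logger. A minimal enclosing-class sketch is given here;
# the class name and wiring are illustrative, not the original component code.
import logging

class InferenceComponent(object):
    def __init__(self, params):
        self._params = params                      # e.g. {"input-model": "/path/to/model.pkl"}
        self._logger = logging.getLogger(__name__)

    # _prep_and_infer(self, df_dataset) as defined above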
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples: [{}]".format(pm_options.num_samples))
    print("PM: # Features: [{}]".format(pm_options.num_features))
    print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold))
    print("PM: # Input File: [{}]".format(pm_options.input_file))
    print("PM: # Model File: [{}]".format(pm_options.input_model))

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model not found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)
        X = data  # use all columns as features
    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit-learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            n_classes=2,  # binary classification only!
            random_state=42)

        # Add random noise to the data roughly half the time
        import random
        if random.randint(1, 21) % 2 == 0:
            print("Adding Random Noise!")
            noisy_features = np.random.uniform(0, 1) * \
                             np.random.normal(0, 1, (num_samples, num_features))
            X = X + noisy_features

    # Separate into features
    features = X

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the ones reported during training
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features), st.TIME_SERIES)

    # Predictions and probabilities for the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    print("Pred Labels: ", pred_labels)        # Remove: printout can be huge
    print("Pred Probabilities: ", pred_probs)  # Remove: printout can be huge

    # Prediction label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("Pred Label distributions: \n {0}".format(pred_label_distribution))

    # Output prediction label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
        (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Prediction confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        print("np.sum(confidence[index_class])", np.sum(max_pred_probs[index_class]))
        print("counts_elements[i] ", pred_counts[i])
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class]) / float(pred_counts[i])
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(
        pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[pred_labels == 1], max_pred_probs[pred_labels == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    if not np.isnan(ks_stat):
        print("printing KS_stat ")
        mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)
    else:
        print("not printing KS_stat ")

    # Raise an alert if the KS stat goes above the required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[pred_labels == 1],
                                   max_pred_probs[pred_labels == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
        "Base Percent", "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raise an alert if total PSI goes below the required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Below {}. Current PSI Is {}".format(min_psi_requirement, total_psi))

    # Terminate MLOps
    mlops.done()
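# NOTE: get_psi() is called above and in the training example below, but its
# definition is not part of this section. A minimal sketch is given here, assuming
# decile binning on the base population and a pandas DataFrame whose columns match
# the "PSI Stats" table above; the real helper may bin or smooth differently.
import numpy as np
import pandas as pd

def get_psi(base, curr, num_bins=10):
    """Population Stability Index between a base and a current sample (assumed behavior)."""
    eps = 1e-6  # guard against empty segments in the log term
    bounds = np.percentile(base, np.linspace(0, 100, num_bins + 1))
    rows = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        base_pop = np.sum((base >= lo) & (base <= hi))
        curr_pop = np.sum((curr >= lo) & (curr <= hi))
        base_pct = max(base_pop / float(len(base)), eps)
        curr_pct = max(curr_pop / float(len(curr)), eps)
        # PSI contribution of this segment: (curr% - base%) * ln(curr% / base%)
        segment_psi = (curr_pct - base_pct) * np.log(curr_pct / base_pct)
        rows.append([base_pop, curr_pop, lo, hi, base_pct, curr_pct, segment_psi])
    psi_table = pd.DataFrame(rows, columns=["Base Pop", "Curr Pop", "Lower Bound",
                                            "Upper Bound", "Base Percent",
                                            "Curr Percent", "Segment PSI"])
    return psi_table["Segment PSI"].sum(), psi_table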
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples: [{}]".format(pm_options.num_samples))
    print("PM: # Features: [{}]".format(pm_options.num_features))
    print("PM: # Validation Split: [{}]".format(pm_options.validation_split))
    print("PM: # AUC Threshold: [{}]".format(pm_options.auc_threshold))
    print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold))
    print("PM: # Estimators: [{}]".format(pm_options.n_estimators))
    print("PM: # Max Depth: [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate: [{}]".format(pm_options.learning_rate))
    print("PM: # Min Child Weight: [{}]".format(pm_options.min_child_weight))
    print("PM: # Objective: [{}]".format(pm_options.objective))
    print("PM: # Gamma: [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step: [{}]".format(pm_options.max_delta_step))
    print("PM: # Subsample: [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha: [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda: [{}]".format(pm_options.reg_lambda))
    print("PM: # Scale Pos Weight: [{}]".format(pm_options.scale_pos_weight))
    print("PM: # Input File: [{}]".format(pm_options.input_file))
    print("PM: Output model: [{}]".format(pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data[:, 1:]  # select columns 1 through end
        y = data[:, 0]   # select column 0 as the label
    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit-learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            n_classes=2,  # binary classification only!
            random_state=42)

        print("Adding Random Noise!")
        noisy_features = np.random.uniform(0, 1) * \
                         np.random.normal(0, 1, (num_samples, num_features))
        X = X + noisy_features

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(pm_options.validation_split), random_state=42)

    import xgboost as xgb

    # Create a model that should be deployed into production
    final_model = xgb.XGBClassifier(
        max_depth=int(pm_options.max_depth),
        min_child_weight=int(pm_options.min_child_weight),
        learning_rate=float(pm_options.learning_rate),
        n_estimators=int(pm_options.n_estimators),
        silent=True,
        objective=str(pm_options.objective),
        gamma=float(pm_options.gamma),
        max_delta_step=int(pm_options.max_delta_step),
        subsample=float(pm_options.subsample),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(pm_options.reg_alpha),
        reg_lambda=float(pm_options.reg_lambda),
        scale_pos_weight=float(pm_options.scale_pos_weight),
        seed=1,
        missing=None)

    final_model.fit(X_train, y_train)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(X_train)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    print("Pred Labels: ", pred_labels)
    print("Pred Probabilities: ", pred_probs)

    accuracy = accuracy_score(y_test, pred_labels)
    print("Accuracy values: \n {0}".format(accuracy))

    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in validation
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    print("Validation Actual Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Validation Actual Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Prediction label distribution in validation
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution))

    # Output prediction label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Validation Prediction Label Distributions").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
        (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # ROC for the chosen model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("ROC AUC values: \n {}".format(roc_auc))

    # Output ROC of the chosen model using MCenter
    mlops.set_stat("ROC AUC", roc_auc, st.TIME_SERIES)

    # Raise an alert if the AUC drops below the required threshold
    if roc_auc <= min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(min_auc_requirement, roc_auc))

    # ROC Curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name("Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random Curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='ROC Curve (Area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(), y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    max_pred_probs = pred_probs.max(axis=1)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)

    # Raise an alert if the KS stat goes above the required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test == 1],
                                   max_pred_probs[y_test == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
        "Base Percent", "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raise an alert if total PSI goes below the required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(min_psi_requirement, total_psi))

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
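# NOTE: parse_args() is used throughout these examples but is not part of this
# section. A minimal sketch for the XGBoost trainer above is given here, assuming
# argparse and only the options the example reads; the defaults are illustrative.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_samples", default="500")
    parser.add_argument("--num_features", default="20")
    parser.add_argument("--validation_split", default="0.25")
    parser.add_argument("--auc_threshold", default="0.5")
    parser.add_argument("--ks_threshold", default="0.5")
    parser.add_argument("--psi_threshold", default="0.2")
    parser.add_argument("--n_estimators", default="100")
    parser.add_argument("--max_depth", default="3")
    parser.add_argument("--learning_rate", default="0.1")
    parser.add_argument("--min_child_weight", default="1")
    parser.add_argument("--objective", default="binary:logistic")
    parser.add_argument("--gamma", default="0")
    parser.add_argument("--max_delta_step", default="0")
    parser.add_argument("--subsample", default="1.0")
    parser.add_argument("--reg_alpha", default="0")
    parser.add_argument("--reg_lambda", default="1")
    parser.add_argument("--scale_pos_weight", default="1.0")
    parser.add_argument("--input_file", default=None)
    parser.add_argument("--output_model", default="/tmp/xgb_model.pkl")
    return parser.parse_args()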
"l2"]).data([5, 16]) mlops.set_stat(mlt) # Example of sending a table to pm system. # Multi-line graphs mlt = MultiLineGraph().name("Multi Line").labels(["l1", "l2"]).data([5, 16]) mlops.set_stat(mlt) # Table example tbl = Table().name("MyTable").cols(["", "Date"]) tbl.add_row(["line 1", "2001Q1"]) tbl.add_row(["line 2", "2014Q3"]) mlops.set_stat(tbl) bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd", "ee"]).data([10, 15, 12, 9, 8]) mlops.set_stat(bar) partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2 n = 100000 * partitions def f(_): x = random() * 2 - 1 y = random() * 2 - 1 return 1 if x**2 + y**2 <= 1 else 0 count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add) print("Pi is roughly %f" % (4.0 * count / n)) spark.stop()
def kmeans_train(pm_options, spark):
    """
    KMeans training function
    :param pm_options: program options
    :param spark: SparkSession
    :return: the PMML string describing the fitted pipeline
    """

    # Import data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    # If the data has no headers, create column names c0-cn
    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()

    # Set both train and test data to the entire dataset
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []  # No columns to exclude - kmeans on all columns
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names, outputCol="features")
    kmeans_pipe = KMeans(k=int(pm_options.K),
                         initMode="k-means||",
                         initSteps=2,
                         tol=1e-4,
                         maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    # Test validation and statistics collection
    ############################################################
    predicted_df = model_kmeans.transform(input_test)
    print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

    sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
    print("Sum of Errors for Kmeans = " + str(sum_errors))

    # Show the result
    kmeans_centers = model_kmeans.stages[1].clusterCenters()
    print("Kmeans Centers: ")
    for center in kmeans_centers:
        print(center)

    # Calculating stats
    ############################################################
    # Calculating inter-cluster distance
    inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            inter_cluster_distance[centerIndex1, centerIndex2] = \
                eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])
    print("inter_cluster_distance = ", inter_cluster_distance)

    # Calculating intra-cluster distances and the bars for the cluster distribution
    intra_cluster_distance = np.zeros(len(kmeans_centers))
    cluster_dist = np.zeros(len(kmeans_centers))
    for centerIndex1 in range(0, len(kmeans_centers)):
        filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
        cluster_dist[centerIndex1] = filtered_df.count()
        if cluster_dist[centerIndex1] == 0:
            intra_cluster_distance[centerIndex1] = 0
        else:
            filtered_df = \
                filtered_df.withColumn('distance',
                                       udf(eq_dist, FloatType())(
                                           col("features"),
                                           array([lit(v) for v in kmeans_centers[centerIndex1]])))
            intra_cluster_distance[centerIndex1] = \
                filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

    # Calculating the Davies-Bouldin index
    ############################################################
    # R[i,j] = (S[i] + S[j]) / M[i,j]
    # D[i] = max(R[i,j]) for i != j
    # DB = (1/K) * sum(D[i])
    r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            r_index[centerIndex1, centerIndex2] = 0
            if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                r_index[centerIndex1, centerIndex2] = \
                    (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                    / inter_cluster_distance[centerIndex1, centerIndex2]
    d_index = np.max(r_index, axis=0)
    db_index = np.sum(d_index, axis=0) / len(kmeans_centers)

    # PMML model generation
    ############################################################
    pmml_file = toPMMLBytes(spark, input_train, model_kmeans).decode("UTF-8")

    # PM stats
    ############################################################
    print("Sum of Errors for Kmeans = " + str(sum_errors))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index))
    pm.set_stat("Davies-Bouldin index", db_index, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(kmeans_centers)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance])
    pm.set_stat(tbl)

    tbl_col_name1 = []
    for j in range(0, len(kmeans_centers[0])):
        tbl_col_name1.append(str(j))
    tbl = Table().name("Centers (for K<6, Attr<11)").cols(tbl_col_name1)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers[j]])
    pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Distribution").cols(tbl_col_name).data(cluster_dist.tolist())
    pm.set_stat(bar)

    print("PM: generating histogram from data-frame and model")
    print("PM:" + pmml_file)
    try:
        pm.set_data_distribution_stat(data=input_train, model=pmml_file)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.set_data_distribution_stat")
        print(e)

    return pmml_file
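# NOTE: toPMMLBytes() above is assumed to come from the JPMML-SparkML Python
# bindings (historically imported as `from jpmml_sparkml import toPMMLBytes`);
# the exact package and signature depend on the installed version. Persisting
# the returned PMML string is plain file I/O; the path below is illustrative.
pmml_file = kmeans_train(pm_options, spark)
with open("/tmp/kmeans_model.pmml", "w") as f:
    f.write(pmml_file)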
def main():
    # Initialize Spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # Parse the arguments to the component
    options = parse_args()

    print("PM: Configuration:")
    print("PM: Number of trees: [{}]".format(options.num_trees))
    print("PM: Maximum depth: [{}]".format(options.max_depth))
    print("PM: Output model: [{}]".format(options.output_model))
    print("PM: Temp shared path: [{}]".format(options.temp_shared_path))

    # Generate synthetic data using scikit-learn
    num_samples = 50
    num_features = 20
    num_classes = 3
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=num_classes,
                               n_clusters_per_class=1,
                               random_state=42)
    X = X + np.random.uniform(0, 5) * np.random.normal(0, 1, (num_samples, num_features))

    feature_names = ["".join(ascii_lowercase[a]) for a in range(num_features + 1)]
    feature_names[0] = "label"

    # Create a Spark dataframe from the generated synthetic data
    trainingData = spark.createDataFrame(
        pd.DataFrame(np.concatenate((y.reshape(-1, 1), X), axis=1), columns=feature_names))

    # Histogram of label distribution
    value, counts = np.unique(y, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(trainingData)

    # Fit a random forest classification model
    assembler = VectorAssembler(inputCols=feature_names[1:num_features + 1], outputCol="features")
    layers = [num_features, 5, 4, num_classes]  # unused
    classifier = RandomForestClassifier(numTrees=int(options.num_trees),
                                        maxDepth=int(options.max_depth))
    pipeline = Pipeline(stages=[assembler, classifier])
    model = pipeline.fit(trainingData)
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute training accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Report accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Save the Spark model
    SparkPipelineModelHelper() \
        .set_shared_context(spark_context=spark.sparkContext) \
        .set_local_path(local_path=options.output_model) \
        .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
        .save_sparkml_model(model)

    # Stop Spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
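# NOTE: a sketch of the inference-side counterpart to the save above, assuming the
# helper exposes a load_sparkml_model() method mirroring save_sparkml_model(), and
# an options.input_model path (hypothetical name); verify against the installed
# parallelm package before relying on this exact call.
model = SparkPipelineModelHelper() \
    .set_shared_context(spark_context=spark.sparkContext) \
    .set_local_path(local_path=options.input_model) \
    .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
    .load_sparkml_model()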
def main(args):
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    features = test_data[np.random.choice(test_data.shape[0], num_samples, replace=False)]

    # Start tensorflow session
    sess = tf.InteractiveSession()

    tag_set = ["serve"]
    if args.model_dir is not None:
        try:
            print("args.model_dir = ", args.model_dir)
            tf.saved_model.loader.load(sess, tag_set, args.model_dir)
        except Exception as e:
            print("Model not found")
            print("Got exception: " + str(e))
            return 0

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the ones reported during training
    # to generate the similarity score.
    mlops.set_data_distribution_stat(data=features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features))

    graph = tf.get_default_graph()
    x = graph.get_tensor_by_name("features:0")
    y_pred = graph.get_tensor_by_name("predictions:0")

    predictions = sess.run(y_pred, {x: features})
    print('predictions', np.array(predictions))

    # Output prediction distribution as a BarGraph using MCenter
    predict_int = np.argmax(predictions, axis=1)
    unique, counts = np.unique(predict_int, return_counts=True)
    counts = list(map(int, counts))
    x_series = list(map(str, unique))
    mlt = BarGraph().name("Prediction Distribution").cols(x_series).data(list(counts))
    mlops.set_stat(mlt)

    # Show average prediction probability value for each prediction
    num_labels = len(np.unique(predict_int))
    probability = np.zeros((num_labels,))
    for a in range(0, num_labels):
        temp = predictions[np.argmax(predictions, axis=1) == a, :]
        print(temp)
        probability[a] = np.mean(temp[:, a])
    print("probability", list(np.squeeze(probability)))

    # Plot average probability in each class using MCenter
    bg = BarGraph().name("Probability of Each Label").cols(x_series).data(
        list(np.squeeze(probability)))
    mlops.set_stat(bg)
def main():
    pm_options = parse_args()

    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    classifier = pickle.load(file_obj)

    # Load the data
    test_dataset = pd.read_csv(pm_options.input_file)
    mlops.set_data_distribution_stat(test_dataset)

    # Extract numpy array
    test_features = test_dataset.values

    # Predict labels
    result = classifier.predict(test_features)

    # Predict probability
    class_probability = classifier.predict_proba(test_features)
    maximum_prob = np.max(class_probability, axis=1)

    # Tag samples that are below a certain probability and write to a file
    confidence = 0.8
    low_prob_samples = test_features[np.where(maximum_prob < confidence)]
    low_prob_predictions = result[np.where(maximum_prob < confidence)]
    unique_elements_low, counts_elements_low = np.unique(low_prob_predictions, return_counts=True)
    unique_elements_low = [str(i) for i in unique_elements_low]
    print("Low confidence predictions: \n {0} \n with frequency {1}".format(
        unique_elements_low, counts_elements_low))

    ########## Start of ParallelM instrumentation ##############
    # BarGraph showing distribution of low confidence labels
    bar = BarGraph().name("Low confidence label distribution").cols(
        unique_elements_low).data(counts_elements_low.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ################

    # Samples with high probability
    high_prob_samples = test_features[np.where(maximum_prob >= confidence)]
    high_prob_predictions = result[np.where(maximum_prob >= confidence)]
    unique_elements_high, counts_elements_high = np.unique(high_prob_predictions, return_counts=True)
    unique_elements_high = [str(i) for i in unique_elements_high]
    print("High confidence predictions: \n {0} \n with frequency {1}".format(
        unique_elements_high, counts_elements_high))

    ########## Start of ParallelM instrumentation ##############
    # BarGraph showing distribution of high confidence labels
    bar = BarGraph().name("High confidence label distribution").cols(
        unique_elements_high).data(counts_elements_high.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ################

    mlops.done()
def main():
    pm_options = parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    regression = pickle.load(file_obj)

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    mae_threshold = float(pm_options.threshold)

    # Create synthetic data using scikit-learn
    X, y = make_regression(n_samples=num_samples,
                           n_features=num_features,
                           n_informative=2,
                           random_state=42)

    # Shift labels so they are all non-negative
    y = y + -1 * np.min(y)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (num_samples, num_features))
    features = features + noisy_features

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the ones reported during training
    # to generate the similarity score.
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples, st.TIME_SERIES)

    # Predict labels
    labels_pred = regression.predict(features)
    hist_pred, bin_edges_pred = np.histogram(labels_pred)

    # Output prediction label distribution as a BarGraph using MCenter
    pred_label_bar = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols(bin_edges_pred.astype(str).tolist()) \
        .data(hist_pred.tolist()) \
        .as_continuous()
    mlops.set_stat(pred_label_bar)

    ##########################################################################
    #################### Start: Output Sample/Conversions ###################
    ##########################################################################

    # A prediction counts as a "conversion" when its absolute error is below the threshold
    mae = np.absolute(labels_pred - labels)
    conversions = sum(i < mae_threshold for i in mae)
    samples = num_samples

    mlops.set_stat("samples", samples)
    mlops.set_stat("conversions", conversions)

    ########################################################################
    #################### End: Output Sample/Conversions ####################
    ########################################################################

    # Terminate MLOps
    mlops.done()
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Samples: [{}]".format(pm_options.num_samples))
    print("PM: # Features: [{}]".format(pm_options.num_features))
    print("PM: # Classes: [{}]".format(pm_options.num_cluster))
    print("PM: Init: [{}]".format(pm_options.init))
    print("PM: N Init: [{}]".format(pm_options.n_init))
    print("PM: Tolerance: [{}]".format(pm_options.tol))
    print("PM: Maximum Iterations: [{}]".format(pm_options.max_iter))
    print("PM: Pre-Compute Distances: [{}]".format(pm_options.precompute_distances))
    print("PM: Algorithm: [{}]".format(pm_options.algorithm))
    print("PM: Output model: [{}]".format(pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    n_samples = int(pm_options.num_samples)
    n_features = int(pm_options.num_features)
    n_clusters = int(pm_options.num_cluster)

    init = str(pm_options.init)
    n_init = int(pm_options.n_init)
    max_iter = int(pm_options.max_iter)
    tol = float(pm_options.tol)
    precompute_distances = str(pm_options.precompute_distances)
    algorithm = str(pm_options.algorithm)
    verbose = 0
    n_jobs = 1

    # Create synthetic data using scikit-learn
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=10,
                               n_redundant=1,
                               n_classes=n_clusters,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels_true = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1, (n_samples, n_features))
    features = features + noisy_features

    kmeans_model = KMeans(n_clusters=n_clusters,
                          init=init,
                          n_init=n_init,
                          max_iter=max_iter,
                          tol=tol,
                          precompute_distances=precompute_distances,
                          verbose=verbose,
                          random_state=None,
                          copy_x=True,
                          n_jobs=n_jobs,
                          algorithm=algorithm).fit(features, labels_true)

    mlops.set_stat("User Defined: Training Inertia", kmeans_model.inertia_)
    mlops.set_stat("User Defined: Training Iteration", kmeans_model.n_iter_)

    value, counts = np.unique(labels_true, return_counts=True)
    label_distribution = np.asarray((value, counts)).T

    # Output actual label distribution as a BarGraph using MCenter
    bar_true = BarGraph().name("User Defined: Actual Label Distribution") \
        .cols((label_distribution[:, 0]).astype(str).tolist()) \
        .data((label_distribution[:, 1]).tolist())
    mlops.set_stat(bar_true)

    # Prediction labels
    labels_pred = kmeans_model.predict(features)

    value_pred, counts_pred = np.unique(labels_pred, return_counts=True)
    label_distribution_pred = np.asarray((value_pred, counts_pred)).T

    # Output prediction label distribution as a BarGraph using MCenter
    bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \
        .data((label_distribution_pred[:, 1]).tolist())
    mlops.set_stat(bar_pred)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    ###########################################################################
    #################### Start: Adjusted Mutual Info Score ###################
    ###########################################################################

    adjusted_mutual_info_score = sklearn.metrics \
        .adjusted_mutual_info_score(labels_true=labels_true,
                                    labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Adjusted Mutual Info Score", adjusted_mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.ADJUSTED_MUTUAL_INFO_SCORE, adjusted_mutual_info_score)

    # OR

    # Third Way
    mlops.metrics.adjusted_mutual_info_score(labels_true=labels_true,
                                             labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #########################################################################
    #################### End: Adjusted Mutual Info Score ###################
    #########################################################################

    ####################################################################
    #################### Start: Adjusted Rand Score ###################
    ####################################################################

    adjusted_rand_score = sklearn.metrics \
        .adjusted_rand_score(labels_true=labels_true,
                             labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Adjusted Rand Score", adjusted_rand_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.ADJUSTED_RAND_SCORE, adjusted_rand_score)

    # OR

    # Third Way
    mlops.metrics.adjusted_rand_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##################################################################
    #################### End: Adjusted Rand Score ###################
    ##################################################################

    #######################################################################
    #################### Start: Calinski Harabaz Score ###################
    #######################################################################

    calinski_harabaz_score = sklearn.metrics \
        .calinski_harabaz_score(X=features, labels=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Calinski Harabaz Score", calinski_harabaz_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.CALINSKI_HARABAZ_SCORE, calinski_harabaz_score)

    # OR

    # Third Way
    mlops.metrics.calinski_harabaz_score(X=features, labels=labels_pred)
    #################### DONE NEW WAY ####################

    #####################################################################
    #################### End: Calinski Harabaz Score ###################
    #####################################################################

    ###################################################################
    #################### Start: Completeness Score ###################
    ###################################################################

    completeness_score = sklearn.metrics \
        .completeness_score(labels_true=labels_true,
                            labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Completeness Score", completeness_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.COMPLETENESS_SCORE, completeness_score)

    # OR

    # Third Way
    mlops.metrics.completeness_score(labels_true=labels_true,
                                     labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Completeness Score ###################
    #################################################################

    ###################################################################
    #################### Start: Contingency Matrix ###################
    ###################################################################

    contingency_matrix = sklearn.metrics.cluster \
        .contingency_matrix(labels_true, labels_pred)

    # List of sorted labels, i.e. [0, 1, 2, ...]
    pred_labels_list = sorted(set(labels_pred))
    true_labels_list = sorted(set(labels_true))

    #################### OLD WAY ####################
    # First Way
    # from parallelm.mlops.stats.table import Table
    #
    # cm_cols_ordered_string = [str(i) for i in pred_labels_list]
    # cm_rows_ordered_string = [str(i) for i in true_labels_list]
    # cm_matrix = Table().name("User Defined: Contingency Matrix").cols(cm_cols_ordered_string)
    #
    # for index in range(len(contingency_matrix)):
    #     cm_matrix.add_row(cm_rows_ordered_string[index], list(contingency_matrix[index]))
    #
    # mlops.set_stat(cm_matrix)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.CONTINGENCY_MATRIX,
                   data=contingency_matrix,
                   true_labels=true_labels_list,
                   pred_labels=pred_labels_list)

    # OR

    # Third Way
    mlops.metrics.cluster.contingency_matrix(labels_true, labels_pred)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Contingency Matrix ###################
    #################################################################

    ######################################################################
    #################### Start: Fowlkes Mallows Score ###################
    ######################################################################

    fowlkes_mallows_score = \
        sklearn.metrics.fowlkes_mallows_score(labels_true=labels_true,
                                              labels_pred=labels_pred,
                                              sparse=False)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Fowlkes Mallows Score", fowlkes_mallows_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.FOWLKES_MALLOWS_SCORE, fowlkes_mallows_score)

    # OR

    # Third Way
    mlops.metrics.fowlkes_mallows_score(labels_true=labels_true,
                                        labels_pred=labels_pred,
                                        sparse=False)
    #################### DONE NEW WAY ####################

    ####################################################################
    #################### End: Fowlkes Mallows Score ###################
    ####################################################################

    ######################################################################################
    #################### Start: Homogeneity, Completeness, V Measure ###################
    ######################################################################################

    homogeneity, completeness, v_measure = sklearn.metrics \
        .homogeneity_completeness_v_measure(labels_true=labels_true,
                                            labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # multiline_object = MultiLineGraph() \
    #     .name("User Defined: Homogeneity - Completeness - V Measure") \
    #     .labels(["Homogeneity", "Completeness", "V Measure"])
    #
    # multiline_object.data([homogeneity, completeness, v_measure])
    #
    # mlops.set_stat(multiline_object)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.HOMOGENEITY_COMPLETENESS_V_MEASURE,
                   data=[homogeneity, completeness, v_measure])

    # OR

    # Third Way
    mlops.metrics \
        .homogeneity_completeness_v_measure(labels_true=labels_true,
                                            labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ####################################################################################
    #################### End: Homogeneity, Completeness, V Measure ###################
    ####################################################################################

    ##################################################################
    #################### Start: Homogeneity Score ###################
    ##################################################################

    homogeneity_score = sklearn.metrics \
        .homogeneity_score(labels_true=labels_true,
                           labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Homogeneity Score", homogeneity_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.HOMOGENEITY_SCORE, homogeneity_score)

    # OR

    # Third Way
    mlops.metrics \
        .homogeneity_score(labels_true=labels_true,
                           labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Homogeneity Score ###################
    ################################################################

    ##################################################################
    #################### Start: Mutual Info Score ###################
    ##################################################################

    mutual_info_score = sklearn.metrics \
        .mutual_info_score(labels_true=labels_true,
                           labels_pred=labels_pred,
                           contingency=None)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Mutual Info Score", mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.MUTUAL_INFO_SCORE, mutual_info_score)

    # OR

    # Third Way
    mlops.metrics \
        .mutual_info_score(labels_true=labels_true,
                           labels_pred=labels_pred,
                           contingency=None)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Mutual Info Score ###################
    ################################################################

    #############################################################################
    #################### Start: Normalized Mutual Info Score ###################
    #############################################################################

    normalized_mutual_info_score = sklearn.metrics \
        .normalized_mutual_info_score(labels_true=labels_true,
                                      labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Normalized Mutual Info Score", normalized_mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.NORMALIZED_MUTUAL_INFO_SCORE, normalized_mutual_info_score)

    # OR

    # Third Way
    mlops.metrics \
        .normalized_mutual_info_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ###########################################################################
    #################### End: Normalized Mutual Info Score ###################
    ###########################################################################

    #################################################################
    #################### Start: Silhouette Score ###################
    #################################################################

    silhouette_score = sklearn.metrics \
        .silhouette_score(X=features,
                          labels=labels_pred,
                          metric="euclidean",
                          sample_size=None,
                          random_state=None)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Silhouette Score", silhouette_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.SILHOUETTE_SCORE, silhouette_score)

    # OR

    # Third Way
    mlops.metrics \
        .silhouette_score(X=features,
                          labels=labels_pred,
                          metric="euclidean",
                          sample_size=None,
                          random_state=None)
    #################### DONE NEW WAY ####################

    ###############################################################
    #################### End: Silhouette Score ###################
    ###############################################################

    ################################################################
    #################### Start: V Measure Score ###################
    ################################################################

    v_measure_score = sklearn.metrics.v_measure_score(labels_true=labels_true,
                                                      labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: V Measure Score", v_measure_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.V_MEASURE_SCORE, v_measure_score)

    # OR

    # Third Way
    mlops.metrics \
        .v_measure_score(labels_true=labels_true,
                         labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: V Measure Score ###################
    ##############################################################

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(kmeans_model, model_file)
    model_file.close()

    # Terminate MLOps
    mlops.done()
def infer_loop(model, input, output_file, stats_interval, conf_tracker):
    output = open(output_file, "w")

    # Initialize statistics
    total_predictions = 0
    low_confidence_predictions = 0
    categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    prediction_hist = []
    for i in range(0, model.get_num_categories()):
        prediction_hist.append(0)

    ### MLOPS start
    # Create a bar graph and table for reporting prediction distributions and set the column names
    infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols(categories)
    infer_tbl = Table().name("Prediction Distribution Table").cols(categories)
    ### MLOPS end

    while True:
        try:
            sample, label = input.get_next_input()

            # Get the inference. This is an array of probabilities for each output value.
            inference = model.infer(sample)

            # The prediction is the class with the highest probability
            prediction = ny.argmax(inference)

            # The confidence for that prediction
            confidence = inference[prediction] * 100

            # Append the prediction to the output file
            output.write("{}\n".format(prediction))

            # Calculate statistics
            total_predictions += 1
            prediction_hist[prediction] += 1

            conf_tracker.check_confidence(confidence, sample)

            # Report statistics
            if total_predictions % stats_interval == 0:

                # Report the prediction distribution
                for i in range(0, model.get_num_categories()):
                    print("category: {} predictions: {}".format(categories[i], prediction_hist[i]))

                ### MLOPS start
                # Update the total prediction count with all the new predictions since the last report
                mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, stats_interval)

                # Show the prediction distribution as a table
                infer_tbl.add_row(str(total_predictions), prediction_hist)

                # Show the prediction distribution as a bar graph
                infer_bar.data(prediction_hist)

                # Report the stats
                mlops.set_stat(infer_tbl)
                mlops.set_stat(infer_bar)
                ### MLOPS end

                conf_tracker.report_confidence(stats_interval)

        except EOFError:
            # Stop when we hit the end of input
            print("Reached end of input")
            output.close()
            break
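# NOTE: the conf_tracker passed to infer_loop() is not defined in this section; only
# its check_confidence() and report_confidence() methods are called. A minimal sketch
# is given here, assuming it counts low-confidence predictions and reports them as an
# MCenter stat; the real tracker may also retain the offending samples.
class ConfidenceTracker(object):
    def __init__(self, threshold=80.0):
        self._threshold = threshold          # confidence is a percentage in infer_loop()
        self._low_confidence_count = 0

    def check_confidence(self, confidence, sample):
        # Count predictions whose confidence falls below the threshold
        if confidence < self._threshold:
            self._low_confidence_count += 1

    def report_confidence(self, stats_interval):
        # Report and reset the low-confidence count for this reporting window
        mlops.set_stat("Low Confidence Predictions", self._low_confidence_count)
        self._low_confidence_count = 0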