def test_mlops_log_loss_apis():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred_prob = [[0.9, 0.1], [0.6, 0.4], [0.6, 0.4], [0.1, 0.9], [0.1, 0.8], [0.1, 0.9]]
    labels_actual = [0, 1, 0, 0, 0, 1]

    log_loss = sklearn.metrics.log_loss(labels_actual, labels_pred_prob)

    # first way
    pm.set_stat(ClassificationMetrics.LOG_LOSS, log_loss)

    # second way
    pm.metrics.log_loss(labels_actual, labels_pred_prob)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        pm.set_stat(ClassificationMetrics.LOG_LOSS, [1, 2, 3])

    # should raise an error if the predicted labels differ in length from the actuals
    with pytest.raises(ValueError):
        labels_prob_missing_values = [[0.9, 0.1], [0.6, 0.4], [0.6, 0.4]]
        pm.metrics.log_loss(y_true=labels_actual, y_pred=labels_prob_missing_values)

    # testing with sample weights as well
    sample_weight = [0.9, 0.1, 0.5, 0.9, 1.0, 0]
    pm.metrics.log_loss(y_true=labels_actual, y_pred=labels_pred_prob, sample_weight=sample_weight)

    pm.done()
def main(): print("args: {}".format(sys.argv)) options = parse_args() print("- inside test-python-aux Running main.py") print("arg1: {}".format(options.arg1)) print("input_model: {}".format(options.input_model)) print("use-mlops: {}".format(options.use_mlops)) print("iter: {}".format(options.iter)) print("exit_value: {}".format(options.exit_value)) print("Calling mlops.init()") if options.use_mlops: mlops.init() # Some output - to test logs for idx in range(options.iter): print("stdout - Idx {}".format(idx)) print("stderr - Idx {}".format(idx), file=sys.stderr) if options.use_mlops: mlops.set_stat("aux_stat", 1) time.sleep(1) if options.use_mlops: mlops.done() # Exit status if options.exit_value >= 0: print("About to exit with value: {}".format(options.exit_value)) sys.exit(options.exit_value) else: print("About to raise exception: {}".format(options.exit_value)) raise Exception("Exiting main using exception")
def test_mlops_v_measure_score_apis():
    mlops.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred = [1, 0, 1, 2, 3, 0]
    labels_actual = [0, 1, 0, 1, 3, 1]

    vms = metrics.v_measure_score(labels_actual, labels_pred)

    # first way
    mlops.set_stat(ClusteringMetrics.V_MEASURE_SCORE, vms)

    # second way
    mlops.metrics.v_measure_score(labels_true=labels_actual, labels_pred=labels_pred)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        mlops.set_stat(ClusteringMetrics.V_MEASURE_SCORE, [1, 2, 3])

    # should raise an error if the predicted labels differ in length from the actuals
    with pytest.raises(ValueError):
        labels_pred_missing_values = [0, 0, 0, 1]
        mlops.metrics.v_measure_score(labels_true=labels_actual, labels_pred=labels_pred_missing_values)

    mlops.done()
def test_mlops_mean_squared_error_apis():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred = [1.0, 0.5, 2.5, 4.75, 7.0, 0.75]
    labels_actual = [1.5, 0.75, 2.75, 4.5, 7.50, 0.25]

    mse = sklearn.metrics.mean_squared_error(labels_actual, labels_pred)

    # first way
    pm.set_stat(RegressionMetrics.MEAN_SQUARED_ERROR, mse)

    # second way
    pm.metrics.mean_squared_error(y_true=labels_actual, y_pred=labels_pred)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        pm.set_stat(RegressionMetrics.MEAN_SQUARED_ERROR, [1, 2, 3])

    # should raise an error if the predicted labels differ in length from the actuals
    with pytest.raises(ValueError):
        labels_pred_missing_values = [1.0, 0.5, 7.0, 0.75]
        pm.metrics.mean_squared_error(y_true=labels_actual, y_pred=labels_pred_missing_values)

    # testing with sample weights as well
    sample_weight = [0.9, 0.1, 0.5, 0.9, 1.0, 0]
    pm.metrics.mean_squared_error(y_true=labels_actual, y_pred=labels_pred, sample_weight=sample_weight)

    pm.done()
def test_mlops_explained_variance_score_apis():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred = [1.0, 0.5, 2.5, 4.75, 7.0, 0.75]
    labels_actual = [1.5, 0.75, 2.75, 4.5, 7.50, 0.25]

    evs = sklearn.metrics.explained_variance_score(labels_actual, labels_pred)

    # first way
    pm.set_stat(RegressionMetrics.EXPLAINED_VARIANCE_SCORE, evs)

    # second way
    pm.metrics.explained_variance_score(y_true=labels_actual, y_pred=labels_pred)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        pm.set_stat(RegressionMetrics.EXPLAINED_VARIANCE_SCORE, [1, 2, 3])

    # should raise an error if the predicted labels differ in length from the actuals
    with pytest.raises(ValueError):
        labels_pred_missing_values = [1.0, 0.5, 7.0, 0.75]
        pm.metrics.explained_variance_score(y_true=labels_actual, y_pred=labels_pred_missing_values)

    # testing with sample weights as well
    sample_weight = [0.9, 0.1, 0.5, 0.9, 1.0, 0]
    pm.metrics.explained_variance_score(y_true=labels_actual, y_pred=labels_pred, sample_weight=sample_weight)

    pm.done()
def main(): print("Starting example") mlops.init(run_in_non_pm_mode=True, mlops_mode=MLOpsMode.PYTHON) # Line graphs mlops.set_stat("myCounterDouble", 5.5) mlops.set_stat("myCounterDouble2", 7.3) # Multi-line graphs mlt = MultiLineGraph().name("Multi Line").labels(["l1", "l2"]).data([5, 16]) mlops.set_stat(mlt) tbl = Table().name("MyTable").cols(["Date", "Some number"]) tbl.add_row(["2001Q1", "55"]) tbl.add_row(["2001Q2", "66"]) tbl.add_row(["2003Q3", "33"]) tbl.add_row(["2003Q2", "22"]) mlops.set_stat(tbl) bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd", "ee"]).data([10, 15, 12, 9, 8]) mlops.set_stat(bar) mlops.done() print("Example done")
def test_mlops_median_absolute_error_apis():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred = [1.0, 0.5, 2.5, 4.75, 7.0, 0.75]
    labels_actual = [1.5, 0.75, 2.75, 4.5, 7.50, 0.25]

    mae = sklearn.metrics.median_absolute_error(labels_actual, labels_pred)

    # first way
    pm.set_stat(RegressionMetrics.MEDIAN_ABSOLUTE_ERROR, mae)

    # second way
    pm.metrics.median_absolute_error(y_true=labels_actual, y_pred=labels_pred)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        pm.set_stat(RegressionMetrics.MEDIAN_ABSOLUTE_ERROR, [1, 2, 3])

    # should raise an error if the predicted labels differ in length from the actuals
    with pytest.raises(ValueError):
        labels_pred_missing_values = [1.0, 0.5, 7.0, 0.75]
        pm.metrics.median_absolute_error(y_true=labels_actual, y_pred=labels_pred_missing_values)

    pm.done()
def test_mlops_bas_apis():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred = [1, 0, 1, 1, 1, 0]
    labels_actual = [0, 1, 0, 0, 0, 1]

    bas = sklearn.metrics.balanced_accuracy_score(labels_actual, labels_pred)

    # first way
    pm.set_stat(ClassificationMetrics.BALANCED_ACCURACY_SCORE, bas)

    # second way
    pm.metrics.balanced_accuracy_score(y_true=labels_actual, y_pred=labels_pred)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        pm.set_stat(ClassificationMetrics.BALANCED_ACCURACY_SCORE, [1, 2, 3])

    # should raise an error if the predicted labels differ in length from the actuals
    with pytest.raises(ValueError):
        labels_pred_missing_values = [0, 0, 0, 1]
        pm.metrics.balanced_accuracy_score(y_true=labels_actual, y_pred=labels_pred_missing_values)

    # testing with sample weights as well
    sample_weight = [0.9, 0.1, 0.5, 0.9, 1.0, 0]
    pm.metrics.balanced_accuracy_score(y_true=labels_actual, y_pred=labels_pred, sample_weight=sample_weight)

    pm.done()
def test_mlops_roc_auc_apis():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred_prob = [0.9, 0.4, 0.6, 0.9, 0.1, 0.9]
    labels_actual = [0, 1, 0, 0, 0, 1]

    roc_auc_score = sklearn.metrics.roc_auc_score(labels_actual, labels_pred_prob)

    # first way
    pm.set_stat(ClassificationMetrics.ROC_AUC_SCORE, roc_auc_score)

    # second way
    pm.metrics.roc_auc_score(labels_actual, labels_pred_prob)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        pm.set_stat(ClassificationMetrics.ROC_AUC_SCORE, [1, 2, 3])

    # should raise an error if the predicted scores differ in length from the actuals
    with pytest.raises(ValueError):
        labels_prob_missing_values = [0.0, 0.9, 1.0, 0.85]
        pm.metrics.roc_auc_score(y_true=labels_actual, y_score=labels_prob_missing_values)

    # testing with sample weights as well
    sample_weight = [0.9, 0.1, 0.5, 0.9, 1.0, 0]
    pm.metrics.roc_auc_score(y_true=labels_actual, y_score=labels_pred_prob, sample_weight=sample_weight)

    pm.done()
def test_mlops_matthews_corrcoef_apis():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    labels_pred = [1, 0, 1, 1, 1, 0]
    labels_actual = [0, 1, 0, 0, 0, 1]

    mcc = sklearn.metrics.matthews_corrcoef(labels_actual, labels_pred)

    # first way
    pm.set_stat(ClassificationMetrics.MATTHEWS_CORRELATION_COEFFICIENT, mcc)

    # second way
    pm.metrics.matthews_corrcoef(labels_actual, labels_pred)

    # should raise an error if a non-numeric value is provided
    with pytest.raises(MLOpsStatisticsException):
        pm.set_stat(ClassificationMetrics.MATTHEWS_CORRELATION_COEFFICIENT, [1, 2, 3])

    # should raise an error if the predicted labels differ in length from the actuals
    with pytest.raises(ValueError):
        labels_pred_missing_values = [1, 0, 1, 1]
        pm.metrics.matthews_corrcoef(y_true=labels_actual, y_pred=labels_pred_missing_values)

    # testing with sample weights as well
    sample_weight = [0.9, 0.1, 0.5, 0.9, 1.0, 0]
    pm.metrics.matthews_corrcoef(y_true=labels_actual, y_pred=labels_pred, sample_weight=sample_weight)

    pm.done()
def job_secondary_transitions(rows):
    tbl = Table().name("SageMaker Job Transitions") \
        .cols(["Start Time", "End Time", "Time Span", "Status", "Description"])
    for row in rows:
        tbl.add_row(row)
    mlops.set_stat(tbl)
def job_host_metrics(job_name, metrics_data):
    tbl = Table().name("Job Host Metrics").cols(["Metric", "Value"])
    for metric_data in metrics_data:
        tbl.add_row([
            metric_data['Label'],
            metric_data['Values'][0] if metric_data['Values'] else 0
        ])
    mlops.set_stat(tbl)
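# Usage sketch (illustrative, not from the source): metrics_data mirrors the
# CloudWatch-style records the function above indexes; only the 'Label' and
# 'Values' keys are assumed, and the job/metric names are placeholders.
sample_metrics = [
    {"Label": "CPUUtilization", "Values": [42.5]},
    {"Label": "MemoryUtilization", "Values": []},  # an empty list is reported as 0
]
job_host_metrics("training-job-1", sample_metrics)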
def gen_data_dist_stats(spark_ctx):
    spark_session = SparkSession(spark_ctx)

    # Import Data
    ##################################
    K = 3             # fixed number of centers
    num_attr = 10     # fixed number of attributes
    num_rows = 60000  # number of rows in the dataset

    input_data = generate_dataset(num_attr, num_rows, K, spark_ctx)

    column_names_all = input_data.columns
    for col_index in range(0, len(column_names_all)):
        input_data = input_data.withColumnRenamed(column_names_all[col_index], 'c' + str(col_index))
    input_data = input_data.cache()
    input_train = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names, outputCol="features")
    kmeans_pipe = KMeans(k=K, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    try:
        mlops.set_data_distribution_stat(data=input_train, model=model_kmeans)
        m = mlops.Model(model_format=ModelFormat.SPARKML)
        m.set_data_distribution_stat(data=input_train)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.stat")
        print(e)

    # Indicating that model statistics were reported
    mlops.set_stat(E2EConstants.MODEL_STATS_REPORTED_STAT_NAME, 1)

    return model_kmeans
def db_to_df(engine, table):
    """ Read a database table into a DataFrame """
    mlops.init()
    df_sink = pandas.read_sql("{} {}".format(SELECT_STATEMENT, table), con=engine)
    mlops.set_stat(table, df_sink.shape[0])
    mlops.done()
    return df_sink, df_sink.shape[0]
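# Usage sketch (illustrative, not from the source): assumes SELECT_STATEMENT is a
# module-level constant such as "SELECT * FROM" and that SQLAlchemy is available;
# the connection URL and table name below are placeholders.
from sqlalchemy import create_engine

engine = create_engine("sqlite:///example.db")
df, row_count = db_to_df(engine, "predictions")
print("Loaded {} rows".format(row_count))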
def _materialize(self, parent_data_objs, user_data):
    for k, v in self._params.items():
        params_info = "key: {key} ==> value: {value}".format(key=k, value=v)
        print(params_info)
        self._logger.info(params_info)

    for x in range(100):
        mlops.set_stat("k graph", x)

    return ["s3://kenshoo/this is your report/report.txt"]
def _df_to_db(self, engine, df_sink, table, database):
    """ Save a DataFrame to a database table """
    mlops.init()
    df_sink.to_sql(con=engine, name=table, if_exists='replace', index=False)
    # report the row count under a "<database>.<table>" stat name
    mlops.set_stat("{}.{}".format(database, table), df_sink.shape[0])
    mlops.done()
    return df_sink.shape[0]
def _report_metrics_collection(self, metrics):
    for name, value in metrics.items():
        metric_meta = Metric.metric_by_name(name)
        self._logger.debug("Reporting metrics ... {}".format(metric_meta))
        if not metric_meta.hidden:
            if metric_meta.metric_relation == MetricRelation.BAR_GRAPH:
                self._report_bar_graph_metric(metric_meta, metrics)
            else:
                mlops.set_stat(metric_meta.title, value)
def job_status(job_name, running_time_sec, billing_time_sec, status=""):
    Report._last_metric_values[job_name] = status

    tbl = Table().name("SageMaker Job Status").cols(
        ["Job Name", "Total Running Time", "Time for Billing", "Status"])
    tbl.add_row([
        job_name,
        Report.seconds_fmt(running_time_sec),
        Report.seconds_fmt(billing_time_sec),
        status
    ])
    mlops.set_stat(tbl)
def export_bar_table(bar_names, bar_data, title_name):
    """
    Export bar-type data as a bar graph to the MCenter data scientist view
    :param bar_names: Bar graph column names
    :param bar_data: Bar graph data
    :param title_name: Title of the bar graph
    :return:
    """
    bar_graph_data = BarGraph().name(title_name).cols(
        bar_names.astype(str).tolist()).data(bar_data.tolist())
    mlops.set_stat(bar_graph_data)
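# Usage sketch (illustrative, not from the source): bar_names and bar_data are
# expected to be numpy arrays, since the function calls .astype(str) and .tolist()
# on them; the values below are placeholders.
import numpy as np

names = np.array([0, 1, 2])        # e.g. class labels
counts = np.array([120, 45, 80])   # e.g. per-class prediction counts
export_bar_table(names, counts, "Prediction Distribution")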
def _materialize(self, parent_data_objs, user_data):
    for param in parent_data_objs:
        parent_param = "parent param is: {param}".format(param=param)
        print(parent_param)
        self._logger.info(parent_param)

    tbl = Table().name("Table example").cols(["Worker", "Requests"])
    for index in range(0, 10):
        tbl.add_row(["kenshoo-worker-{}".format(index), index + 3])
    mlops.set_stat(tbl)

    return ["s3://Kenshoo/this is the logistic model path/model.pmml"]
def _report_event(self, tb_parse_event, time_stamp_start):
    """
    Process a TensorBoard event. Only `summary` events are scanned and reported
    via the mlops stats API; scalar values are the only supported entries.
    """
    if tb_parse_event.HasField('summary') and (time_stamp_start < tb_parse_event.wall_time):
        for tf_value in tb_parse_event.summary.value:
            self._print_verbose("calling mlops.set_stat {}".format(tf_value.tag))
            mlops.set_stat(tf_value.tag, data=tf_value.simple_value)
def _report_bar_graph_metric(self, metric_meta, metrics):
    cols = []
    data = []
    for related_m, bar_name in metric_meta.related_metric:
        cols.append(bar_name)
        data.append(metrics[related_m.metric_name])

    if not all(v == 0 for v in data) or not metric_meta.metric_already_displayed:
        metric_meta.metric_already_displayed = True
        mlt = BarGraph().name(metric_meta.title).cols(cols).data(data)
        mlops.set_stat(mlt)
def test_init_done():
    """
    Verify that the stats APIs raise before init() is called, and that
    init()/done() complete cleanly.
    """
    with pytest.raises(MLOpsException):
        pm.set_stat("st1", 5.5)

    with pytest.raises(MLOpsException):
        pm.done()

    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)
    pm.done()
def _report_acc_requests_and_status(self):
    self._logger.debug("Reporting about workers requests & status ...")

    tbl = Table().name(StatsConstants.ACC_REQS_TABLE_NAME).cols([
        StatsConstants.ACC_REQS_NUM_REQS_COL_NAME,
        StatsConstants.ACC_REQS_STATUS_COL_NAME
    ])
    for col, value, status in self._curr_stats_snapshot.sorted_worker_stats:
        tbl.add_row(col, [value, status])
    tbl.add_row(StatsConstants.ACC_REQS_LAST_ROW_NAME,
                [self._curr_stats_snapshot.total_requests, "---"])
    mlops.set_stat(tbl)

    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT,
                   self._curr_stats_snapshot.total_requests_diff)
def _materialize(self, parent_data_objs, user_data):
    for param in parent_data_objs:
        parent_param = "parent param is: {param}".format(param=param)
        print(parent_param)
        self._logger.info(parent_param)

    mlt = MultiLineGraph().name("Multi-line graph example").labels(
        ["label-1", "label-2", "label-3"])
    for x in range(100):
        mlt.data([x, x + 1, 0.5 * x])
        mlops.set_stat(mlt)

    return ["s3://Kenshoo/this is the linear model path/model.pmml"]
def export_confusion_table(confmat, algo):
    """
    Export the confusion matrix as a table to the MCenter data scientist view
    :param confmat: Confusion matrix
    :param algo: Text describing the algorithm type
    :return:
    """
    tbl = Table() \
        .name("Confusion Matrix for " + str(algo)) \
        .cols(["Predicted label: " + str(i) for i in range(0, confmat.shape[0])])
    for i in range(confmat.shape[1]):
        tbl.add_row("True Label: " + str(i),
                    [str(confmat[i, j]) for j in range(0, confmat.shape[0])])
    mlops.set_stat(tbl)
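# Usage sketch (illustrative, not from the source): builds a confusion matrix with
# scikit-learn and exports it; the labels and algorithm name are placeholders.
from sklearn.metrics import confusion_matrix

y_true = [0, 1, 0, 1, 1, 0]
y_pred = [0, 1, 1, 1, 0, 0]
export_confusion_table(confusion_matrix(y_true, y_pred), "LogisticRegression")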
def test_bar_graph():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    # non-numeric data should raise
    with pytest.raises(MLOpsException):
        BarGraph().name("bar").cols(["g1", "g2"]).data(["aa", "bb"])

    # missing columns should raise
    with pytest.raises(MLOpsException):
        BarGraph().name("bar").data(["aa", "bb"])

    # mismatched cols/data lengths should raise
    with pytest.raises(MLOpsException):
        mlt = BarGraph().name("mlt").cols(["g1"]).data([55, 66])
        pm.set_stat(mlt)

    # a continuous bar graph needs one more column edge than data points
    with pytest.raises(MLOpsException):
        mlt_cont = BarGraph().name("mlt").cols([1, 2]).data([55, 66]).as_continuous()
        pm.set_stat(mlt_cont)

    mlt = BarGraph().name("mlt").cols(["g1", "g2"]).data([55, 66])
    pm.set_stat(mlt)

    mlt_cont = BarGraph().name("mlt").cols([1, 2, 3]).data([55, 66]).as_continuous()
    pm.set_stat(mlt_cont)

    pm.done()
def test_table():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    # rows of inconsistent length should raise
    with pytest.raises(MLOpsException):
        Table().name("mytable").cols(["a", "b", "c"]).add_row([1, 2, 3]).add_row([1, 2])

    # a table without any rows should raise
    with pytest.raises(MLOpsException):
        tbl = Table().name("mytable").cols(["a", "b"])
        pm.set_stat(tbl)

    tbl = Table().name("good-1").cols(["a", "b", "c"]).add_rows([[1, 2, 3], [1, 2, 3]])
    pm.set_stat(tbl)

    tbl = Table().name("good-2").cols(["a", "b", "c"])
    tbl.add_row("r1", [1, 2, 3])
    tbl.add_row("r2", [3, 4, 5])
    pm.set_stat(tbl)

    tbl = Table().name("good-3").cols(["a", "b", "c"])
    tbl.add_row([6, 7, 8])
    tbl.add_row([9, 0, 1])
    pm.set_stat(tbl)

    pm.done()
def main():
    pm_options = parse_args()

    # Initialize the MLOps library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    classifier = pickle.load(file_obj)

    # Create synthetic data (Gaussian, Poisson and Beta distributions)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0], num_samples, replace=False)]

    # Output health statistics to MCenter: report the distribution statistics of each
    # feature in the data, which MCenter compares automatically with those reported
    # during training to generate the similarity score.
    mlops.set_data_distribution_stat(test_features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples, st.TIME_SERIES)

    # Predict labels
    result = classifier.predict(test_features)

    # Label distribution in prediction
    value, counts = np.unique(result, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Terminate MLOps
    mlops.done()
def count_words(sc, words_file):
    lines = sc.textFile(words_file)
    words = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
    counts = words.reduceByKey(operator.add)
    sorted_counts = counts.sortBy(lambda x: x[1], False)

    total_words = sorted_counts.count()
    mlops.set_stat("total_words_1_push_7", total_words)

    total_words = 0
    for word, count in sorted_counts.toLocalIterator():
        print(u"{} --> {}".format(word, count))
        total_words += 1
    mlops.set_stat("total_words_2_push_7", total_words)
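# Driver sketch (illustrative, not from the source): runs count_words against a
# local SparkContext. The app name and input path are placeholders, and passing
# the context to mlops.init() is an assumption about the Spark flavor of the API.
if __name__ == "__main__":
    from pyspark import SparkContext

    sc = SparkContext(appName="count-words-example")
    mlops.init(sc)
    count_words(sc, "file:///tmp/words.txt")
    mlops.done()
    sc.stop()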