Example 1
    def _report_avg_response_time_metrics(self):
        self._logger.debug("Reporting workers' average response time ...")
        tbl = Table().name(StatsConstants.AVG_RESP_TIME_TABLE_NAME).cols(
            [StatsConstants.AVG_RESP_TIME_COL_NAME])
        for col, rt in self._curr_stats_snapshot.avg_workers_response_time:
            tbl.add_row(col, [rt])  # row values must be a list, one per column
        mlops.set_stat(tbl)
Example 2
    def get_table_value_stat_object(name, list_2d, match_header_pattern=None):
        """
        Create Table Value stat object from list of list. Where first element of 2d list is header. And from remaining lists, list's first index is Row's header.
        :param name: Name of stat
        :param list_2d: 2d representation of table to output
        :param match_header_pattern: If not none, then header of table should match the pattern provided
        :return: MLOps Table Value object, general stat category
        """
        category = StatCategory.GENERAL
        try:
            header = list(map(lambda x: str(x).strip(), list_2d[0]))

            if match_header_pattern is not None:
                assert header == match_header_pattern, \
                    "headers {} do not match expected header pattern {}" \
                        .format(header, match_header_pattern)

            len_of_header = len(header)
            table_object = Table().name(name).cols(header)

            for index in range(1, len(list_2d)):
                assert len(list_2d[index]) - 1 == len_of_header, \
                    "length of row value does not match with headers length"

                row_title = str(list_2d[index][0]).strip()
                row_value = list(
                    map(lambda x: str(x).strip(), list_2d[index][1:]))
                table_object.add_row(row_title, row_value)

            return table_object, category
        except Exception as e:
            raise MLOpsStatisticsException(
                "error happened while outputting table object from list_2d: {}. error: {}".format(list_2d, e))
Example 4
    def job_secondary_transitions(rows):
        tbl = Table().name("SageMaker Job Transitions")\
                     .cols(["Start Time", "End Time", "Time Span", "Status", "Description"])
        for row in rows:
            tbl.add_row(row)

        mlops.set_stat(tbl)
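
A hypothetical call for the snippet above, assuming each row carries one value per declared column:

    # Made-up SageMaker-style transition rows; one entry per table column.
    rows = [
        ["10:01:02", "10:03:30", "2m 28s", "Starting", "Preparing the instances for training"],
        ["10:03:30", "10:14:05", "10m 35s", "Training", "Training image download completed"],
    ]
    job_secondary_transitions(rows)
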
Example 5
def export_classification_report(class_rep, algo):
    """
    This function presents the classification report as a table in the MCenter data scientist view
    :param class_rep: Classification report data
    :param algo: text for the algorithm type
    :return:
    """
    col_keys = []
    row_keys = []
    class_tbl = []
    add_col_keys = True
    for row_key in class_rep.keys():
        row_keys.append(str(row_key))
        class_tbl_row = []
        class_row = class_rep[row_key]
        for col_key in class_row.keys():
            if add_col_keys:
                col_keys.append(str(col_key))
            class_tbl_row.append(str(class_row[col_key]))
        add_col_keys = False
        class_tbl.append(class_tbl_row)

    tbl = Table().name("Classification Report " + str(algo)).cols(col_keys)
    for i in range(len(row_keys)):
        tbl.add_row(row_keys[i], class_tbl[i])
    mlops.set_stat(tbl)
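
In practice class_rep is the dict produced by scikit-learn, as in the training examples further below (y_test and pred_labels are assumed to hold true and predicted labels):

    from sklearn.metrics import classification_report

    class_rep = classification_report(y_true=y_test, y_pred=pred_labels,
                                      output_dict=True)
    export_classification_report(class_rep, "XGBoost")
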
Example 6
    def job_host_metrics(job_name, metrics_data):
        tbl = Table().name("Job Host Metrics").cols(["Metric", "Value"])
        for metric_data in metrics_data:
            tbl.add_row([
                metric_data['Label'],
                metric_data['Values'][0] if metric_data['Values'] else 0
            ])
        mlops.set_stat(tbl)
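
The expected shape of metrics_data can be read off the code: a list of dicts with 'Label' and 'Values' keys. The concrete entries below are made up:

    # Hypothetical CloudWatch-style input; only the 'Label'/'Values' keys are
    # implied by the snippet above.
    metrics_data = [
        {'Label': 'CPUUtilization', 'Values': [73.2]},
        {'Label': 'MemoryUtilization', 'Values': []},  # empty -> reported as 0
    ]
    job_host_metrics("my-training-job", metrics_data)
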
Example 7
    def job_status(job_name, running_time_sec, billing_time_sec, status=""):
        Report._last_metric_values[job_name] = status
        tbl = Table().name("SageMaker Job Status").cols(
            ["Job Name", "Total Running Time", "Time for Billing", "Status"])
        tbl.add_row([
            job_name,
            Report.seconds_fmt(running_time_sec),
            Report.seconds_fmt(billing_time_sec), status
        ])
        mlops.set_stat(tbl)
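
A hypothetical call; Report.seconds_fmt is assumed to render a seconds count as a duration string, and job_status to be a static method on the same Report class:

    Report.job_status("my-training-job", running_time_sec=3600,
                      billing_time_sec=3720, status="Completed")
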
Example 8
    def _materialize(self, parent_data_objs, user_data):
        for param in parent_data_objs:
            parent_param = "parent param is: {param}".format(param=param)
            print(parent_param)
            self._logger.info(parent_param)

        tbl = Table().name("Table example").cols(["Worker", "Requests"])
        for index in range(0, 10):
            tbl.add_row(["kenshoo-worker-{}".format(index), index + 3])
        mlops.set_stat(tbl)

        return ["s3://Kenshoo/this is the logistic model path/model.pmml"]
Example 9
def export_confusion_table(confmat, algo):
    """
    This function presents the confusion matrix as a table in the MCenter data scientist view
    :param confmat: Confusion matrix
    :param algo: text for the algorithm type
    :return:
    """

    tbl = Table()\
        .name("Confusion Matrix for " + str(algo))\
        .cols(["Predicted label: " + str(i) for i in range(0, confmat.shape[0])])
    for i in range(confmat.shape[1]):
        tbl.add_row("True Label: " + str(i), [str(confmat[i, j]) for j in range(0, confmat.shape[0])])
    mlops.set_stat(tbl)
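
Typical usage pairs this with scikit-learn's confusion_matrix, as the training examples below do (y_test and pred_labels are assumed to hold true and predicted labels):

    from sklearn.metrics import confusion_matrix

    confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
    export_confusion_table(confmat, "XGBoost")
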
Example 10
def main():
    print("Starting example")
    mlops.init(run_in_non_pm_mode=True, mlops_mode=MLOpsMode.PYTHON)

    # Line graphs
    mlops.set_stat("myCounterDouble", 5.5)
    mlops.set_stat("myCounterDouble2", 7.3)

    # Multi-line graphs
    mlt = MultiLineGraph().name("Multi Line").labels(["l1",
                                                      "l2"]).data([5, 16])
    mlops.set_stat(mlt)

    tbl = Table().name("MyTable").cols(["Date", "Some number"])
    tbl.add_row(["2001Q1", "55"])
    tbl.add_row(["2001Q2", "66"])
    tbl.add_row(["2003Q3", "33"])
    tbl.add_row(["2003Q2", "22"])
    mlops.set_stat(tbl)

    bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd",
                                         "ee"]).data([10, 15, 12, 9, 8])
    mlops.set_stat(bar)

    mlops.done()
    print("Example done")
Example 11
    def _report_acc_requests_and_status(self):
        self._logger.debug("Reporting workers' requests & status ...")
        tbl = Table().name(StatsConstants.ACC_REQS_TABLE_NAME).cols([
            StatsConstants.ACC_REQS_NUM_REQS_COL_NAME,
            StatsConstants.ACC_REQS_STATUS_COL_NAME
        ])
        for col, value, status in self._curr_stats_snapshot.sorted_worker_stats:
            tbl.add_row(col, [value, status])
        tbl.add_row(StatsConstants.ACC_REQS_LAST_ROW_NAME,
                    [self._curr_stats_snapshot.total_requests, "---"])
        mlops.set_stat(tbl)
        mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT,
                       self._curr_stats_snapshot.total_requests_diff)
Example 12
    def _report_acc_requests_and_status(self):
        self._logger.debug("Reporting workers' requests & status ...")
        try:
            predict_reqs = self._curr_stats_snapshot.total_requests - \
                           self._curr_stats_snapshot.uwsgi_pm_metric_by_name(PredefinedStats.PM_STAT_REQUESTS)
            mlops.set_stat("Number of Predict Requests", predict_reqs)
        except Exception:
            self._logger.error("Failed to retrieve pm stat requests")
            predict_reqs = self._curr_stats_snapshot.total_requests

        tbl = Table().name(StatsConstants.ACC_REQS_TABLE_NAME).cols([
            StatsConstants.ACC_REQS_NUM_REQS_COL_NAME,
            StatsConstants.ACC_REQS_STATUS_COL_NAME
        ])
        for col, value, status in self._curr_stats_snapshot.sorted_worker_stats:
            tbl.add_row(col, [value, status])

        tbl.add_row(StatsConstants.ACC_REQS_LAST_ROW_NAME,
                    [self._curr_stats_snapshot.total_requests, "---"])
        mlops.set_stat(tbl)

        mlops.set_stat(PredefinedStats.WORKER_STATS,
                       len(self._curr_stats_snapshot.worker_ids))
Example 13
class CategoricalStatistics(InferenceStatistics):
    def __init__(self,
                 print_interval,
                 stats_type,
                 num_categories,
                 conf_thresh,
                 hot_label=True):
        super(CategoricalStatistics, self).__init__(print_interval)
        self._num_categories = num_categories
        self._hot_label = hot_label
        self._stats_type = stats_type
        self._conf_thresh = conf_thresh / 100.0

        # These are useful for development, but should be replaced by mlops library functions
        self._label_hist = []
        self._infer_hist = []
        for i in range(0, self._num_categories):
            self._label_hist.append(0)
            self._infer_hist.append(0)

        if self._stats_type == "python":
            mlops.init(ctx=None,
                       connect_mlops=True,
                       mlops_mode=MLOpsMode.AGENT)
        elif self._stats_type == "file":
            mlops.init(ctx=None,
                       connect_mlops=False,
                       mlops_mode=MLOpsMode.STAND_ALONE)
        else:
            self._stats_type = "none"

        if self._stats_type != "none":
            self._infer_tbl = Table().name("inferences").cols(
                ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])

    def infer_stats(self, sample, label, inference):

        # for now, we only process 1 inference at a time
        inference = inference[0]
        prediction = ny.argmax(inference)
        confidence = inference[prediction]
        if confidence < self._conf_thresh:
            self.increment_low_conf()

        self._infer_hist[prediction] += 1

        if label is not None:
            if self._hot_label:
                label = ny.argmax(label)
            self._label_hist[label] += 1

            if prediction == label:
                self.increment_correct()

        self.increment_total()
        if self.is_time_to_report():
            self.report_stats()

        return prediction

    def report_stats(self):

        if self.get_low_conf() > 0:
            mlops.health_alert(
                "Low confidence alert",
                "{}% of inferences had confidence below {}%".format(
                    self.get_low_conf() * 100.0 / self.get_total(),
                    self._conf_thresh * 100))

        for i in range(0, self._num_categories):
            print(i, "label_total =", self._label_hist[i], "infer_total = ",
                  self._infer_hist[i])

        print("total = ", self.get_total(), "total_correct = ",
              self.get_correct())

        self._infer_tbl.add_row(str(self.get_total()), [
            self._infer_hist[0], self._infer_hist[1], self._infer_hist[2],
            self._infer_hist[3], self._infer_hist[4], self._infer_hist[5],
            self._infer_hist[6], self._infer_hist[7], self._infer_hist[8],
            self._infer_hist[9]
        ])

        if self._stats_type != "none":
            mlops.set_stat("correct_percent",
                           self.get_correct() * 100.0 / self.get_total())
            mlops.set_stat(self._infer_tbl)

    def __del__(self):
        mlops.done()
        super(CategoricalStatistics, self).__del__()
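
A minimal driver sketch for the class above; the model object, the batch iterable, and the (1, 10) inference shape are assumptions, and stats_type="file" avoids needing a live MCenter agent:

    # Hypothetical driver; `batches` and `model` are placeholders.
    stats = CategoricalStatistics(print_interval=100, stats_type="file",
                                  num_categories=10, conf_thresh=80)
    for sample, label in batches:          # (input, one-hot label) pairs
        inference = model.predict(sample)  # assumed shape: (1, 10) scores
        stats.infer_stats(sample, label, inference)
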
Example 14
    def _prep_and_train(self, df_dataset):
        self.min_auc_requirement = self._params["auc_threshold"]
        self.max_ks_requirement = self._params["ks_threshold"]
        self.min_psi_requirement = self._params["psi_threshold"]
        train_on_col = self._params["train_on_column"]

        # MLOps init
        mlops.init()

        y = df_dataset[train_on_col]
        self._logger.info("train_on_col= {}".format(train_on_col))
        self._logger.info("df_dataset {}".format(df_dataset.shape[1]))
        X = df_dataset.drop(train_on_col, axis=1)
        mlops.set_data_distribution_stat(X)
        self._logger.info("df_dataset {}".format(X.shape[1]))

        # Splitting the data to train and test sets:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self._params["validation_split"], random_state=42)
        All_columns = X_train.columns.tolist()
        categorical_columns = self._params["categorical_cols"]
        mapper_list = []
        for d in All_columns:
            if d in categorical_columns:
                mapper_list.append(
                    ([d], OneHotEncoder(handle_unknown='ignore')))
            else:
                mapper_list.append(([d], MinMaxScaler()))

        mapper = DataFrameMapper(mapper_list)

        ## Training
        # XGBoost Training:
        n_cpu = multiprocessing.cpu_count()

        xgboost_model = xgb.XGBClassifier(
            max_depth=int(self._params["max_depth"]),
            min_child_weight=int(self._params["min_child_weight"]),
            learning_rate=float(self._params["learning_rate"]),
            n_estimators=int(self._params["n_estimators"]),
            silent=True,
            objective=self._params["objective"],
            gamma=float(self._params["gamma"]),
            max_delta_step=int(self._params["max_delta_step"]),
            subsample=float(self._params["subsample"]),
            colsample_bytree=1,
            colsample_bylevel=1,
            reg_alpha=float(self._params["reg_alpha"]),
            reg_lambda=float(self._params["reg_lambda"]),
            scale_pos_weight=float(self._params["scale_pos_weight"]),
            seed=1,
            n_jobs=n_cpu,
            missing=None)

        final_model = Pipeline([("mapper", mapper),
                                ("xgboost", xgboost_model)])
        final_model.fit(X_train, y_train)

        # Prediction and prediction distribution
        pred_labels = final_model.predict(X_test)
        pred_probs = final_model.predict_proba(X_test)

        # Accuracy calculation
        # Accuracy for the xgboost model
        accuracy = accuracy_score(y_test, pred_labels)
        self._logger.info("XGBoost Accuracy value: {0}".format(accuracy))
        #     Output accuracy of the chosen model using MCenter
        mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

        # Label distribution:
        # Label distribution in training
        value, counts = np.unique(y_test, return_counts=True)
        label_distribution = np.asarray((value, counts)).T
        self._logger.info(
            "Validation Actual Label distributions: \n {0}".format(
                label_distribution))
        # Output Label distribution as a BarGraph using MCenter
        export_bar_table(label_distribution[:, 0], label_distribution[:, 1],
                         "Validation - Actual Label Distribution")

        # Prediction distribution and prediction confidence distribution
        # Pred Label distribution in training
        pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
        pred_label_distribution = np.asarray((pred_value, pred_counts)).T
        self._logger.info(
            "XGBoost Validation Prediction Label Distributions: \n {0}".format(
                pred_label_distribution))
        # Output Pred label distribution as a BarGraph using MCenter
        export_bar_table(pred_label_distribution[:, 0],
                         pred_label_distribution[:, 1],
                         "Validation - XGBoost Prediction Distribution")

        # Pred confidence per label
        label_number = len(pred_counts)
        average_confidence = np.zeros(label_number)
        max_pred_probs = pred_probs.max(axis=1)
        for i in range(0, label_number):
            index_class = np.where(pred_labels == i)[0]
            if pred_counts[i] > 0:
                average_confidence[i] = np.sum(
                    max_pred_probs[index_class]) / (float(pred_counts[i]))
            else:
                average_confidence[i] = 0
        self._logger.info(
            "XGBoost Validation Average Prediction confidence per label: \n {0}"
            .format(average_confidence))

        # Output average confidence per class as a BarGraph using MCenter
        export_bar_table(pred_value, average_confidence,
                         "Validation - XGBoost Average confidence per class")

        # Confusion Matrix
        # XGBoost Confusion Matrix
        confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
        self._logger.info(
            "Confusion Matrix for XGBoost: \n {0}".format(confmat))
        # Output Confusion Matrix as a Table using MCenter
        export_confusion_table(confmat, "XGBoost")

        # Classification Report
        # XGBoost Classification Report
        class_rep = classification_report(y_true=y_test,
                                          y_pred=pred_labels,
                                          output_dict=True)
        self._logger.info(
            "XGBoost Classification Report: \n {0}".format(class_rep))

        # AUC and ROC Curves
        # ROC for XGBoost model
        roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
        self._logger.info("XGBoost ROC AUC value: {}".format(roc_auc))

        # Output ROC of the chosen model using MCenter
        mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)

        if roc_auc <= self.min_auc_requirement:
            mlops.health_alert(
                "[Training] AUC Violation From Training Node",
                "AUC Went Below {}. Current AUC Is {}".format(
                    self.min_auc_requirement, roc_auc))

        # ROC curve
        fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])

        cg = MultiGraph().name(
            "Receiver Operating Characteristic").set_continuous()
        cg.add_series(label='Random curve', x=fpr.tolist(), y=fpr.tolist())
        cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'.format(roc_auc),
                      x=fpr.tolist(),
                      y=tpr.tolist())
        cg.x_title('False Positive Rate')
        cg.y_title('True Positive Rate')
        mlops.set_stat(cg)

        # Feature importance comparison
        # XGBoost Feature importance
        export_feature_importance(final_model, list(X_train.columns), 5,
                                  "XGBoost")

        # KS Analysis
        max_pred_probs = pred_probs.max(axis=1)
        y_test0 = np.where(y_test == 0)[0]
        y_test1 = np.where(y_test == 1)[0]

        # KS for the XGBoost model
        ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
        ks_stat = ks.statistic
        ks_pvalue = ks.pvalue
        self._logger.info(
            "KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
                ks_stat, ks_pvalue))

        # Output KS Stat of the chosen model using MCenter
        mlops.set_stat("KS Stats for CGBoost", ks_stat, st.TIME_SERIES)

        # raising alert if ks-stat goes above required threshold
        if ks_stat >= self.max_ks_requirement:
            mlops.health_alert(
                "[Training] KS Violation From Training Node",
                "KS Stat Went Above {}. Current KS Stat Is {}".format(
                    self.max_ks_requirement, ks_stat))

        ks_table = Table().name("KS Stats for XGBoost").cols(
            ["Statistic", "pValue"])
        ks_table.add_row([ks_stat, ks_pvalue])
        mlops.set_stat(ks_table)

        # PSI Analysis
        # Calculating PSI
        total_psi, psi_table = get_psi(self, max_pred_probs[y_test0],
                                       max_pred_probs[y_test1])
        psi_table_stat = Table().name("PSI Stats for XGBoost").cols([
            "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
            "Base Percent", "Curr Percent", "Segment PSI"
        ])
        row_num = 1
        for each_value in psi_table.values:
            str_values = [str(i) for i in each_value]
            psi_table_stat.add_row(str(row_num), str_values)
            row_num += 1
        mlops.set_stat(psi_table_stat)
        self._logger.info("Total XGBoost PSI values: \n {}".format(total_psi))
        #     Output Total PSI of the chosen model using MCenter
        mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)

        if total_psi >= self.min_psi_requirement:
            mlops.health_alert(
                "[Training] PSI Violation From Training Node",
                "PSI Went Above {}. Current PSI Is {}".format(
                    self.min_psi_requirement, total_psi))

        # ## Save the XGBoost Model
        with open(self._params["output-model"], 'wb') as model_file:
            pickle.dump(final_model, model_file)

        # ## Finish the program
        mlops.done()

        return model_file
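
Examples 14, 16 and 20 call an export_bar_table helper that is not shown. A plausible sketch, inferred from the call sites and the BarGraph usage in Examples 10 and 17; the signature is a guess, not the original source:

    # Plausible reconstruction of the undefined export_bar_table helper.
    def export_bar_table(bar_names, bar_data, title_name):
        bar_graph_data = BarGraph().name(title_name).cols(
            [str(name) for name in bar_names]).data(
                [float(value) for value in bar_data])
        mlops.set_stat(bar_graph_data)
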
Example 15
def main():
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    if args.training_iteration <= 0:
        print('Please specify a positive value for training iteration.')
        sys.exit(-1)

    # Read the train and test data sets
    mnist = mnist_input_data.read_data_sets(args.input_cache_dir, one_hot=True)

    ## MLOps start
    # Initialize the mlops library
    mlops.init()

    # Report the feature distribution for the training data
    train_images = mnist.train.images
    mlops.set_data_distribution_stat(train_images)

    # Initialize a table to track training accuracy and cost
    train_table = Table().name("Training Stats").cols(["Accuracy", "Cost"])
    ## MLOps end

    # Create the model
    sess = tf.InteractiveSession()
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
    feature_configs = {
        'x': tf.FixedLenFeature(shape=[784], dtype=tf.float32),
    }
    tf_example = tf.parse_example(serialized_tf_example, feature_configs)
    x = tf.identity(tf_example['x'],
                    name='x')  # use tf.identity() to assign name
    y_ = tf.placeholder('float', shape=[None, 10])
    w = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    sess.run(tf.global_variables_initializer())
    y = tf.nn.softmax(tf.matmul(x, w) + b, name='y')

    # Set the cost function and optimizer
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(
        cross_entropy)
    values, indices = tf.nn.top_k(y, 10)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(
        tf.constant([str(i) for i in range(10)]))
    prediction_classes = table.lookup(tf.to_int64(indices))

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Train the model
    print('Training model...')
    for i in range(args.training_iteration):
        batch = mnist.train.next_batch(50)
        _, train_cost, train_acc = sess.run(
            [train_step, cross_entropy, accuracy],
            feed_dict={
                x: batch[0],
                y_: batch[1]
            })

        # Display stats
        if (i + 1) % args.display_step == 0 or i + 1 == args.training_iteration:
            # Report training accuracy and cost

            print("Training. step={}, accuracy={}, cost={}".format(
                i + 1, train_acc, train_cost))

            # MLOps start
            # multiply by 1 to convert into double
            train_table.add_row("Iterations: {}".format(i + 1),
                                [train_acc * 100, train_cost * 1])
            mlops.set_stat(train_table)
            # MLOps end

    print('Done training!')

    # Report final cost and accuracy on test set
    test_cost, test_acc = sess.run([cross_entropy, accuracy],
                                   feed_dict={
                                       x: mnist.test.images,
                                       y_: mnist.test.labels
                                   })
    print("Testing. accuracy={}, cost={}".format(test_acc, test_cost))

    ## MLOps start
    acc_table = Table().name("Test Accuracy").cols(["Accuracy"])
    acc_table.add_row("Total iterations: {}".format(args.training_iteration),
                      [test_acc])
    mlops.set_stat(acc_table)

    # Release mlops resources
    mlops.done()
    ## MLOps end

    # Export the trained model so it can be used for inference
    # WARNING(break-tutorial-inline-code): The following code snippet is
    # in-lined in tutorials, please update tutorial documents accordingly
    # whenever code changes.
    export_path = args.save_dir
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    # Build the signature_def_map.
    classification_inputs = tf.saved_model.utils.build_tensor_info(
        serialized_tf_example)
    classification_outputs_classes = tf.saved_model.utils.build_tensor_info(
        prediction_classes)
    classification_outputs_scores = tf.saved_model.utils.build_tensor_info(
        values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                classification_inputs
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                classification_outputs_classes,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                classification_outputs_scores
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME
        ))

    tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
    tensor_info_y = tf.saved_model.utils.build_tensor_info(y)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': tensor_info_x},
            outputs={'outputs': tensor_info_y},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    )

    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images':
            prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save()

    print('Done exporting!')
Example 16
def main():
    pm_options = parse_args()
    print("PM: Configuration:")

    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: # Model File:                [{}]".format(
        pm_options.input_model))

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()
    # Load the model; bail out cleanly if it cannot be opened
    try:
        filename = pm_options.input_model
        model_file_obj = open(filename, 'rb')
        mlops.set_stat("# Model Files Used", 1)
    except Exception as e:
        print("Model Not Found")
        print("Got Exception: {}".format(e))
        mlops.set_stat("# Model Files Used", 0)
        mlops.done()
        return 0

    final_model = pickle.load(model_file_obj)

    # Loading the data
    loan_df = pd.read_csv(pm_options.input_file)

    # Cleaning NAs
    mlops.set_data_distribution_stat(loan_df)
    print("dataset_size = ", loan_df.shape[0])
    print("number of NAs per column = \n", loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows= ", loan_df.shape[0])
    X = loan_df

    # ## Inference
    pred_labels = final_model.predict(X)
    pred_probs = final_model.predict_proba(X)

    # Prediction distribution and prediction confidence distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Inference Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))
    export_bar_table(pred_label_distribution[:, 0],
                     pred_label_distribution[:, 1],
                     "Inference - XGBoost Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".
          format(average_confidence))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence,
                     "Validation - XGBoost Average confidence per class")

    # Feature importance comparison
    export_feature_importance(final_model, list(X.columns), 5, "XGBoost")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(pred_labels == 0)[0]
    y_test1 = np.where(pred_labels == 1)[0]
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))
    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    total_psi, psi_table = get_psi(max_pred_probs[y_test0],
                                   max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total XGBoost PSI values: \n {}".format(total_psi))
    print("XGBoost PSI Stats: \n {}".format(psi_table))
    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    if total_psi >= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Above {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # ## Finish the program
    mlops.done()
Example 17
def main():
    pm_options = parse_args()
    print("PM: Configuration:")
    print("PM: Data file:            [{}]".format(pm_options.data_file))
    print("PM: Output model:         [{}]".format(pm_options.output_model))
    print("PM: regularization_range:         [{}]".format(
        pm_options.regularization_range))

    mlops.init()

    # Read the Samsung datafile
    dataset = pd.read_csv(pm_options.data_file)

    # Separate into features and labels
    features = dataset.iloc[:, 1:].values
    labels = dataset.iloc[:, 0].values

    # Hyper-parameter search using k-fold cross-validation
    # Applying k_fold cross validation
    regularization_range = pm_options.regularization_range.split(',')
    regularization = [
        float(regularization_var)
        for regularization_var in regularization_range
    ]
    tune_parameters = [{'C': regularization}]

    # Initialize logistic regression algorithm
    LR = LogisticRegression(class_weight='balanced',
                            multi_class='multinomial',
                            solver='lbfgs')
    clf = GridSearchCV(LR, tune_parameters, cv=5, scoring='accuracy')
    clf.fit(features, labels)
    print("best parameter = ", clf.best_params_)
    accuracy = clf.cv_results_['mean_test_score']
    print(
        'Accuracy values: \n {0} \n for Regularization values: \n{1}'.format(
            accuracy, regularization))

    ########## Start of ParallelM instrumentation ##############
    # Report Hyper-parameter Table
    tbl = Table().name("Hyper-parameter Search Results").cols(
        ["Mean accuracy from k-fold cross-validation"])
    print("length of regularization", len(regularization))
    index_max = np.argmax(accuracy)
    for a in range(0, len(regularization)):
        print("adding row", regularization[a])
        if a == index_max:
            tbl.add_row("[Best] Regularization = " + str(regularization[a]),
                        [accuracy[a]])
        else:
            tbl.add_row("Regularization = " + str(regularization[a]),
                        [accuracy[a]])
    mlops.set_stat(tbl)
    ########## End of ParallelM instrumentation ##############

    # Label distribution in training
    label_distribution = dataset['label'].value_counts()
    column_names = np.array(label_distribution.index).astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    ########## Start of ParallelM instrumentation ##############
    # Report label distribution as a BarGraph
    bar = BarGraph().name("Label Distribution").cols(
        np.array(label_distribution.index).astype(str).tolist()).data(
            label_distribution.values.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ##############

    #################### Start of ParallelM instrumentation ################
    # Report accuracy of the chosen model
    mlops.set_stat("K-fold cross-validation Accuracy", accuracy[index_max],
                   st.TIME_SERIES)
    #################### End of ParallelM instrumentation ################

    # Histogram input
    mlops.set_data_distribution_stat(dataset)

    # Save the model
    import pickle
    with open(pm_options.output_model, 'wb') as model_file:
        pickle.dump(clf, model_file)
    mlops.done()
Example 18
def test_table():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    with pytest.raises(MLOpsException):
        Table().name("mytable").cols(["a", "b",
                                      "c"]).add_row([1, 2, 3]).add_row([1, 2])

    with pytest.raises(MLOpsException):
        tbl = Table().name("mytable").cols(["a", "b"])
        pm.set_stat(tbl)

    tbl = Table().name("good-1").cols(["a", "b", "c"]).add_rows([[1, 2, 3],
                                                                 [1, 2, 3]])
    pm.set_stat(tbl)

    tbl = Table().name("good-2").cols(["a", "b", "c"])
    tbl.add_row("r1", [1, 2, 3])
    tbl.add_row("r2", [3, 4, 5])
    pm.set_stat(tbl)

    tbl = Table().name("good-3").cols(["a", "b", "c"])
    tbl.add_row([6, 7, 8])
    tbl.add_row([9, 0, 1])
    pm.set_stat(tbl)

    pm.done()
Example 19
def main():
    pm_options = parse_args()
    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: # Model File:                [{}]".format(
        pm_options.input_model))

    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data  # use all loaded columns as features

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            n_classes=2,  # binary classification only!
            random_state=42)

        # Randomly decide whether to add noise to the data
        import random
        if random.randint(1, 21) % 2 == 0:
            print("Adding Random Noise!")

            noisy_features = np.random.uniform(0, 1) * \
                             np.random.normal(0, 1,
                                              (num_samples, num_features))
            X = X + noisy_features

    # Use the loaded columns as features
    features = X

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data,
    # so MCenter can compare them with the distributions seen at training time
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features),
                   st.TIME_SERIES)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    print("Pred Labels: ", pred_labels)  # Remove printout can be huge
    print("Pred Probabilities: ", pred_probs)  # Remove printout can be huge

    # Pred Label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Pred Label distributions: \n {0}".format(pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
            (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Pred Label confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        print(" np.sum(confidence[index_class])",
              np.sum(max_pred_probs[index_class]))
        print("counts_elements[i] ", pred_counts[i])
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(
        pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[pred_labels == 1],
                  max_pred_probs[pred_labels == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    if not np.isnan(ks_stat):
        print("printing KS_stat ")
        mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)
    else:
        print("not printing KS_stat ")

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[pred_labels == 1],
                                   max_pred_probs[pred_labels == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Terminate MLOPs
    mlops.done()
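
Examples 14, 16, 19 and 20 all rely on a get_psi helper that is never shown. A rough sketch of what it might look like, assuming equal-width bins over the base population; the column names match the tables built above, but the binning scheme and return types are guesses, and Example 14 passes self as an extra first argument while this sketch follows the two-argument call sites:

    # Rough reconstruction of the undefined get_psi helper; not the original code.
    import numpy as np
    import pandas as pd

    def get_psi(base_pop, curr_pop, num_bins=10):
        eps = 1e-6
        edges = np.linspace(base_pop.min(), base_pop.max(), num_bins + 1)
        rows = []
        for lo, hi in zip(edges[:-1], edges[1:]):
            base_count = np.sum((base_pop >= lo) & (base_pop < hi))
            curr_count = np.sum((curr_pop >= lo) & (curr_pop < hi))
            base_pct = max(base_count / float(len(base_pop)), eps)
            curr_pct = max(curr_count / float(len(curr_pop)), eps)
            segment_psi = (curr_pct - base_pct) * np.log(curr_pct / base_pct)
            rows.append([base_count, curr_count, lo, hi,
                         base_pct, curr_pct, segment_psi])
        psi_table = pd.DataFrame(rows, columns=[
            "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
            "Base Percent", "Curr Percent", "Segment PSI"])
        return psi_table["Segment PSI"].sum(), psi_table
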
Example 20
def main():
    pm_options = parse_args()

    print("PM: Configuration:")

    print("PM: # Validation Split:          [{}]".format(pm_options.validation_split))

    print("PM: # AUC Threshold:             [{}]".format(pm_options.auc_threshold))
    print("PM: # KS Threshold:              [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(pm_options.psi_threshold))

    print("PM: # Estimators:                [{}]".format(pm_options.n_estimators))
    print("PM: # Max Depth:                 [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate:             [{}]".format(pm_options.learning_rate))
    print("PM: # Min Child Weight:          [{}]".format(pm_options.min_child_weight))
    print("PM: # Objective:                 [{}]".format(pm_options.objective))
    print("PM: # Gamma:                     [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step:            [{}]".format(pm_options.max_delta_step))
    print("PM: # Subsample:                 [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha:                 [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda:                [{}]".format(pm_options.reg_lambda))
    print("PM: # Scale Pos Weight:          [{}]".format(pm_options.scale_pos_weight))

    print("PM: # Input File:                [{}]".format(pm_options.input_file))
    print("PM: Output model:                [{}]".format(pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # mlops Init
    mlops.init()

    # Loading and cleaning the data
    # This section goes though the various stages of loading and cleaning the data:
    loan_df = pd.read_csv(pm_options.input_file)

    # Cleaning NAs
    print("dataset_size = ", loan_df.shape[0])
    mlops.set_data_distribution_stat(loan_df)
    print("number of NAs per columns = ",  loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows= ", loan_df.shape[0])

    # Marking the label field. remove it from the features set:
    y = loan_df["bad_loan"]
    X = loan_df.drop("bad_loan", axis=1)

    from sklearn_pandas import DataFrameMapper

    # Splitting the data to train and test sets:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=float(pm_options.validation_split),
                                                        random_state=42)

    All_columns = X_train.columns.tolist()
    categorical_columns = ["verification_status", "addr_state", "purpose", "home_ownership", "term"]
    mapper_list = []
    for d in All_columns:
        if d in categorical_columns:
            mapper_list.append(([d], OneHotEncoder(handle_unknown='ignore')))
        else:
            mapper_list.append(([d], MinMaxScaler()))

    mapper = DataFrameMapper(mapper_list)

    # ## Training
    # XGBoost Training:
    import xgboost as xgb
    xgboost_model = xgb.XGBClassifier(max_depth=int(pm_options.max_depth),
                                    min_child_weight=int(pm_options.min_child_weight),
                                    learning_rate=float(pm_options.learning_rate),
                                    n_estimators=int(pm_options.n_estimators),
                                    silent=True,
                                    objective=pm_options.objective,
                                    gamma=float(pm_options.gamma),
                                    max_delta_step=int(pm_options.max_delta_step),
                                    subsample=float(pm_options.subsample),
                                    colsample_bytree=1,
                                    colsample_bylevel=1,
                                    reg_alpha=float(pm_options.reg_alpha),
                                    reg_lambda=float(pm_options.reg_lambda),
                                    scale_pos_weight=float(pm_options.scale_pos_weight),
                                    seed=1,
                                    n_jobs=1,
                                    missing=None)

    final_model = Pipeline([("mapper", mapper), ("xgboost", xgboost_model)])

    final_model.fit(X_train, y_train)
    # Random Forest Training
    from sklearn.ensemble import RandomForestClassifier
    rf_only_model = RandomForestClassifier(n_estimators=int(pm_options.n_estimators),
                                           max_depth=int(pm_options.max_depth) + 3,
                                           random_state=42, n_jobs=1,
                                           class_weight="balanced")
    rf_model = Pipeline([("mapper", mapper), ("rf", rf_only_model)])

    rf_model.fit(X_train, y_train)

    # ## Statistics on Test Dataset

    # Prediction and prediction distribution
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)
    rf_pred_labels = rf_model.predict(X_test)
    rf_pred_probs = rf_model.predict_proba(X_test)

    # Accuracy calculation
    # Accuracy for the xgboost model
    accuracy = accuracy_score(y_test, pred_labels)
    print("XGBoost Accuracy value: {0}".format(accuracy))
    #     Output accuracy of the chosen model using MCenter
    mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

    # Accuracy for the RF model
    rf_accuracy = accuracy_score(y_test, rf_pred_labels)
    print("RF Accuracy value: {0}".format(rf_accuracy))
    #     Output accuracy of the chosen model using MCenter
    mlops.set_stat("RF Accuracy", rf_accuracy, st.TIME_SERIES)

    # Label distribution:
    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    print("Validation Actual Label distributions: \n {0}".format(label_distribution))
    # Output Label distribution as a BarGraph using MCenter
    export_bar_table(label_distribution[:,0], label_distribution[:,1], "Validation - Actual Label Distribution")

    # Prediction distribution and prediction confidence distribution
    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_label_distribution[:,0], pred_label_distribution[:,1], "Validation - XGBoost Prediction Distribution")

    rf_pred_value, rf_pred_counts = np.unique(rf_pred_labels, return_counts=True)
    rf_pred_label_distribution = np.asarray((rf_pred_value, rf_pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("RF Validation Prediction Label Distributions: \n {0}".format(rf_pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(rf_pred_label_distribution[:,0], rf_pred_label_distribution[:,1], "Validation - RF Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class])/(float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".format(average_confidence))

    #  Pred confidence per label
    rf_label_number = len(rf_pred_counts)
    rf_average_confidence = np.zeros(rf_label_number)
    rf_max_pred_probs = rf_pred_probs.max(axis=1)
    for i in range(0, rf_label_number):
        rf_index_class = np.where(rf_pred_labels == i)[0]
        if rf_pred_counts[i] > 0:
            rf_average_confidence[i] = np.sum(rf_max_pred_probs[rf_index_class])/(float(rf_pred_counts[i]))
        else:
            rf_average_confidence[i] = 0
    print("RF Validation Average Prediction confidence per label: \n {0}".format(rf_average_confidence))

    # Output average confidence per class as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence, "Validation - XGBoost Average confidence per class")
    export_bar_table(rf_pred_value, rf_average_confidence, "Validation - RF Average confidence per class")

    # Confusion Matrix
    # XGBoost Confusion Matrix
    confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
    print("Confusion Matrix for XGBoost: \n {0}".format(confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(confmat, "XGBoost")
    # RF Confusion Matrix
    rf_confmat = confusion_matrix(y_true=y_test, y_pred=rf_pred_labels)
    print("Confusion Matrix for RF: \n {0}".format(rf_confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(rf_confmat, "RF")

    # Classification Report
    # XGBoost Classification Report
    class_rep = classification_report(y_true=y_test, y_pred=pred_labels, output_dict=True)
    print("XGBoost Classification Report: \n {0}".format(class_rep))
    # RF Classification Report
    rf_class_rep = classification_report(y_true=y_test, y_pred=rf_pred_labels, output_dict=True)
    print("RF Classification Report: \n {0}".format(rf_class_rep))
    # Output Classification Report as a Table using MCenter
    export_classification_report(class_rep, "XGBoost")
    export_classification_report(rf_class_rep, "RF")

    # AUC and ROC Curves
    # ROC for XGBoost model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("XGBoost ROC AUC value: {}".format(roc_auc))
    rf_roc_auc = roc_auc_score(y_test, rf_pred_probs[:, 1])
    print("RF ROC AUC value:  {}".format(rf_roc_auc))
    # Output ROC of the chosen model using MCenter
    mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)
    mlops.set_stat("RF ROC AUC", rf_roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert("[Training] AUC Violation From Training Node",
                           "AUC Went Below {}. Current AUC Is {}".format(min_auc_requirement, roc_auc))

    # ROC curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    rf_fpr, rf_tpr, rf_thr = roc_curve(y_test, rf_pred_probs[:, 1])

    cg = MultiGraph().name("Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(), y=tpr.tolist())
    cg.add_series(label='RF ROC curve (area = {0:0.2f})'.format(rf_roc_auc),
                  x=rf_fpr.tolist(), y=rf_tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    # Feature importance comparison
    # XGBoost Feature importance
    export_feature_importance(final_model, list(X_train.columns), 5, "XGBoost")
    export_feature_importance(rf_model, list(X_train.columns), 5, "RF")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(y_test == 0)[0]
    y_test1 = np.where(y_test == 1)[0]
    rf_max_pred_probs = rf_pred_probs.max(axis=1)

    # KS for the XGBoost model
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))
    # KS for the RF model
    rf_ks = ks_2samp(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])
    rf_ks_stat = rf_ks.statistic
    rf_ks_pvalue = rf_ks.pvalue
    print("RF KS values: \n Statistics: {} \n pValue: {}\n".format(rf_ks_stat, rf_ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for RF", rf_ks_stat, st.TIME_SERIES)

    # raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert("[Training] KS Violation From Training Node",
                           "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats for XGBoost").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test0], max_pred_probs[y_test1])
    rf_total_psi, rf_psi_table = get_psi(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats for XGBoost").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent",
         "Segment PSI"])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total XGBoost PSI values: \n {}".format(total_psi))
    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)

    if total_psi >= min_psi_requirement:
        mlops.health_alert("[Training] PSI Violation From Training Node",
                           "PSI Went Above {}. Current PSI Is {}".format(min_psi_requirement,
                                                                         total_psi))

    print("Total RF PSI values: \n {}".format(rf_total_psi))
    rf_psi_table_stat = Table().name("PSI Stats for RF").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent",
         "Segment PSI"])
    row_num = 1
    for each_value in rf_psi_table.values:
        str_values = [str(i) for i in each_value]
        rf_psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(rf_psi_table_stat)
    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total RF PSI", rf_total_psi, st.TIME_SERIES)

    # ## Save the XGBoost Model
    import pickle
    with open(pm_options.output_model, 'wb') as model_file:
        pickle.dump(final_model, model_file)

    # ## Finish the program
    mlops.done()
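Note: the get_psi helper called above is project code that isn't shown in this snippet. A minimal sketch of what such a helper could look like, assuming equal-width bins over the base population and the same column layout the PSI tables above expect:

import numpy as np
import pandas as pd

def get_psi(base, curr, num_bins=10):
    # Hypothetical PSI helper: bin the base population, score the current
    # population against the same bin edges, and return (total PSI, table).
    edges = np.histogram_bin_edges(np.asarray(base, dtype=float), bins=num_bins)
    base_counts, _ = np.histogram(base, bins=edges)
    curr_counts, _ = np.histogram(curr, bins=edges)

    # Clip proportions away from zero so the log term stays finite
    base_pct = np.clip(base_counts / float(max(len(base), 1)), 1e-6, None)
    curr_pct = np.clip(curr_counts / float(max(len(curr), 1)), 1e-6, None)

    segment_psi = (curr_pct - base_pct) * np.log(curr_pct / base_pct)
    psi_table = pd.DataFrame({
        "Base Pop": base_counts, "Curr Pop": curr_counts,
        "Lower Bound": edges[:-1], "Upper Bound": edges[1:],
        "Base Percent": base_pct, "Curr Percent": curr_pct,
        "Segment PSI": segment_psi})
    return float(segment_psi.sum()), psi_table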
Esempio n. 21
0
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv").option(
        "header", pm_options.with_headers).option(
            "ignoreLeadingWhiteSpace",
            "true").option("ignoreTrailingWhiteSpace", "true").option(
                "inferschema",
                "true").load(pm_options.data_file)).repartition(10)

    column_names_all = input_data.columns
    if pm_options.with_headers != "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(
                column_names_all[col_index], 'c' + str(col_index))

    input_data = input_data.cache()

    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    input_col_names = [c for c in column_names if c not in exclude_cols]
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names,
                                       outputCol="features")
    kmeans_pipe = KMeans(k=int(pm_options.K),
                         initMode="k-means||",
                         initSteps=2,
                         tol=1e-4,
                         maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    # Test validation and statistics collection
    ############################################################
    predicted_df = model_kmeans.transform(input_test)

    print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

    sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
    print("Sum of Errors for Kmeans = " + str(sum_errors))

    # Shows the result.
    kmeans_centers = model_kmeans.stages[1].clusterCenters()
    print("Kmeans Centers: ")
    for center in kmeans_centers:
        print(center)

    # calculating stats
    ############################################################

    # Calculating Inter cluster distance
    inter_cluster_distance = np.zeros(
        (len(kmeans_centers), len(kmeans_centers)))

    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            inter_cluster_distance[centerIndex1, centerIndex2] =\
                eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])

    print("inter_cluster_distance = ", inter_cluster_distance)
    # Calculating Intra cluster distances and the bars for the cluster distribution
    intra_cluster_distance = np.zeros(len(kmeans_centers))
    cluster_dist = np.zeros(len(kmeans_centers))

    for centerIndex1 in range(0, len(kmeans_centers)):
        filtered_df = predicted_df.filter(
            predicted_df["prediction"] == centerIndex1)
        cluster_dist[centerIndex1] = filtered_df.count()
        if cluster_dist[centerIndex1] == 0:
            intra_cluster_distance[centerIndex1] = 0
        else:
            filtered_df =\
                filtered_df.withColumn('distance',
                                       udf(eq_dist, FloatType())(col("features"),
                                            array([lit(v) for v in kmeans_centers[centerIndex1]])))
            intra_cluster_distance[centerIndex1] =\
                filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

    # calculating Davies-Bouldin Index
    ############################################################
    # R[i,j] = (S[i] + S[j])/M[i,j]
    # D[i] = max(R[i,j]) for i !=j
    # DB = (1/K) * sum(D[i])
    r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            r_index[centerIndex1, centerIndex2] = 0
            if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                r_index[centerIndex1, centerIndex2] =\
                    (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2])\
                    / inter_cluster_distance[centerIndex1, centerIndex2]
    d_index = np.max(r_index, axis=0)
    db_index = np.sum(d_index, axis=0) / len(kmeans_centers)

    # pmml model generation
    ############################################################
    pmml_file = toPMMLBytes(spark, input_train, model_kmeans).decode("UTF-8")

    # PM stats
    ############################################################
    print("Sum of Errors for Kmeans = " + str(sum_errors))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index))
    pm.set_stat("Davies-Bouldin index", db_index, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(kmeans_centers)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row(
            str(j) + ":", ["%.2f" % x for x in inter_cluster_distance[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance])
    pm.set_stat(tbl)

    tbl_col_name1 = []
    for j in range(0, len(kmeans_centers[0])):
        tbl_col_name1.append(str(j))
    tbl = Table().name("Centers (for K<6, Attr<11)").cols(tbl_col_name1)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row("center" + str(j) + ":",
                    ["%.2f" % x for x in kmeans_centers[j]])
    pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Destribution").cols(tbl_col_name).data(
        cluster_dist.tolist())
    pm.stat(bar)

    print("PM: generating histogram from data-frame and model")
    print("PM:" + pmml_file)
    try:
        pm.set_data_distribution_stat(data=input_train, model=pmml_file)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.stat")
        print(e)

    return pmml_file
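The eq_dist helper referenced in the k-means examples isn't shown here; a plausible sketch is a plain Euclidean distance that works both on cluster centers and on the (features, center) pairs passed through the Spark UDF:

import math

def eq_dist(x1, x2):
    # Hypothetical Euclidean-distance helper matching the eq_dist calls
    # above; accepts Spark vectors, numpy arrays, or plain sequences.
    return math.sqrt(sum((float(a) - float(b)) ** 2 for a, b in zip(x1, x2)))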
Esempio n. 22
0
    def generate_health_and_heatmap_stat(stat_object_method,
                                         logger,
                                         features_values,
                                         features_names,
                                         model_stat,
                                         model_id,
                                         num_bins=13,
                                         # TODO: Have ability to get this argument from user!
                                         data_analysis=True):
        """
        Method is highly responsible and creates continuous/categorical histograms. Also creates heatmap and compare two histogram if program is running on inference.

        :param stat_object_method: stat object method to output stat
        :param logger: logger to log
        :param features_values: feature array
        :param features_names: feature names
        :param model_stat: model stat
        :param num_bins: max number of bins for features.
        :return:
        """
        # generating general stats like categorical/continuous features and contender histograms.
        general_hist_stat = GeneralHistogramStat()
        general_hist_stat \
            .create_and_set_general_stat(set_of_features_values=features_values,
                                         set_of_features_names=features_names,
                                         model_stat=model_stat)

        # For Continuous Values
        # continuous feature names
        continuous_features_names = general_hist_stat.set_of_continuous_features
        # predefined bins of contender continuous hist
        pred_bins_continuous_hist = general_hist_stat.contender_continuous_hist_bins
        contender_continuous_histogram_representation = general_hist_stat.contender_continuous_histogram

        continuous_features_values = PythonChannelHealth. \
            _create_feature_subset(features_values=features_values,
                                   features_names=features_names,
                                   selection_features_subset=continuous_features_names)
        current_continuous_histogram_representation = \
            PythonChannelHealth._create_current_hist_rep(
                features_values=continuous_features_values,
                features_names=continuous_features_names,
                num_bins=num_bins,
                pred_bins_hist=pred_bins_continuous_hist,
                stat_object_method=stat_object_method,
                name_of_stat=PyHealth.CONTINUOUS_HISTOGRAM_KEY,
                model_id=model_id)

        # running data analysis for continuous dataset
        if data_analysis:
            continuous_data_analyst_result = ContinuousDataAnalyst \
                .analyze(set_of_continuous_feature_names=continuous_features_names,
                         set_of_continuous_feature_values=continuous_features_values)

            # outputting stat only if analysis result is there
            if len(continuous_data_analyst_result) > 0:
                cont_da = Table() \
                    .name("Continuous Data Analysis") \
                    .cols(["Count",
                           "Missing",
                           "Zeros",
                           "Standard Deviation",
                           "Min",
                           "Mean",
                           "Median",
                           "Max"])

                for f_n in continuous_data_analyst_result.keys():
                    f_v = continuous_data_analyst_result[f_n]
                    cont_da.add_row(str(f_v.feature_name),
                                    [f_v.count,
                                     f_v.NAs,
                                     f_v.zeros,
                                     f_v.std,
                                     f_v.min,
                                     f_v.mean,
                                     f_v.median,
                                     f_v.max])

                # outputting stat using stat object as stat message type
                stat_object_method(mlops_stat=cont_da.get_mlops_stat(model_id=model_id),
                                   reflex_event_message_type=ReflexEvent.StatsMessage)

        logger.debug("continuous features values: {}".format(continuous_features_values))
        logger.debug("continuous features names: {}".format(continuous_features_names))
        logger.debug(
            "current histogram representation: {}".format(current_continuous_histogram_representation))
        logger.debug(
            "contender histogram representation: {}".format(contender_continuous_histogram_representation))

        # For Categorical Values
        # categorical feature names
        categorical_features_names = general_hist_stat.set_of_categorical_features

        # predefined bins of contender categorical hist
        pred_bins_categorical_hist = general_hist_stat.contender_categorical_hist_bins
        contender_categorical_histogram_representation = general_hist_stat.contender_categorical_histogram

        categorical_features_values = PythonChannelHealth._create_feature_subset(features_values=features_values,
                                                                                 features_names=features_names,
                                                                                 selection_features_subset=categorical_features_names)
        current_categorical_histogram_representation = \
            PythonChannelHealth._create_current_hist_rep(
                categorical_features_values,
                categorical_features_names,
                num_bins,
                pred_bins_categorical_hist,
                stat_object_method,
                name_of_stat=PyHealth.CATEGORICAL_HISTOGRAM_KEY,
                model_id=model_id)

        # running data analysis for categorical dataset
        if data_analysis:
            categorical_data_analyst_result = CategoricalDataAnalyst \
                .analyze(set_of_categorical_feature_names=categorical_features_names,
                         set_of_categorical_feature_values=categorical_features_values)

            # outputting stat only if analysis result is there
            if len(categorical_data_analyst_result) > 0:

                categ_da = Table() \
                    .name("Categorical Data Analysis") \
                    .cols(["Count",
                           "Missing",
                           "Uniques",
                           "Top Frequently Occurring Category",
                           "Top Frequency",
                           "Average String Length"])

                for f_n in categorical_data_analyst_result.keys():
                    f_v = categorical_data_analyst_result[f_n]
                    categ_da. \
                        add_row(str(f_v.feature_name),
                                [f_v.count,
                                 f_v.NAs,
                                 f_v.unique,
                                 f_v.top,
                                 f_v.freq_top,
                                 f_v.avg_str_len])

                # outputting stat using stat object as stat message type
                stat_object_method(mlops_stat=categ_da.get_mlops_stat(model_id=model_id),
                                   reflex_event_message_type=ReflexEvent.StatsMessage)

        logger.debug("categorical features values: {}".format(categorical_features_values))
        logger.debug("categorical features names: {}".format(categorical_features_names))
        logger.debug(
            "current histogram representation: {}".format(current_categorical_histogram_representation))
        logger.debug(
            "contender histogram representation: {}".format(contender_categorical_histogram_representation))

        # If model_stat is given, this is an inference program,
        # so create the heatmap and overlap score too.
        if model_stat is not None:
            if continuous_features_values.shape[0] > 0:
                continuous_features_names, heat_map_values = PythonChannelHealth. \
                    _create_current_continuous_heatmap_rep(continuous_features_values=continuous_features_values,
                                                           continuous_features_names=continuous_features_names,
                                                           stat_object_method=stat_object_method,
                                                           model_id=model_id)
                logger.debug("features: {}, heatmap values: {}".format(continuous_features_names,
                                                                       heat_map_values))

                compared_continuous_feature_names, compared_continuous_feature_score = PythonChannelHealth. \
                    _compare_health(
                    current_histogram_representation=current_continuous_histogram_representation,
                    contender_histogram_representation=contender_continuous_histogram_representation,
                    stat_object_method=stat_object_method,
                    name_of_stat=PyHealth.CONTINUOUS_HISTOGRAM_OVERLAP_SCORE_KEY,
                    model_id=model_id)
                logger.debug(
                    "continuous features: {}, overlap scores: {}".format(compared_continuous_feature_names,
                                                                         compared_continuous_feature_score))

            if categorical_features_values.shape[0] > 0:
                compared_categorical_feature_names, compared_categorical_feature_score = PythonChannelHealth. \
                    _compare_health(
                    current_histogram_representation=current_categorical_histogram_representation,
                    contender_histogram_representation=contender_categorical_histogram_representation,
                    stat_object_method=stat_object_method,
                    name_of_stat=PyHealth.CATEGORICAL_HISTOGRAM_OVERLAP_SCORE_KEY,
                    model_id=model_id)
                logger.debug(
                    "categorical features: {}, overlap scores: {}".format(
                        compared_categorical_feature_names, compared_categorical_feature_score))
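PythonChannelHealth._compare_health is internal library code, but the overlap score it reports can be approximated by the summed per-bin minimum of two normalized histograms. A rough, hypothetical sketch under that assumption:

import numpy as np

def histogram_overlap_score(current_bins, contender_bins):
    # Hypothetical overlap score between two histograms with identical bin
    # edges: normalize each to a probability vector and sum the per-bin
    # minimum. 1.0 means identical distributions, 0.0 means disjoint.
    cur = np.asarray(current_bins, dtype=float)
    con = np.asarray(contender_bins, dtype=float)
    if cur.sum() > 0:
        cur = cur / cur.sum()
    if con.sum() > 0:
        con = con / con.sum()
    return float(np.minimum(cur, con).sum())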
Esempio n. 23
0
class CategoricalStatistics(InferenceStatistics):
    def __init__(self,
                 print_interval,
                 stats_type,
                 num_categories,
                 conf_thresh,
                 conf_percent,
                 hot_label=True):
        super(CategoricalStatistics, self).__init__(print_interval)
        self._num_categories = num_categories
        self._hot_label = hot_label
        self._stats_type = stats_type
        self._conf_thresh = conf_thresh / 100.0
        self._conf_percent = conf_percent

        # These are useful for development, but should be replaced by mlops library functions
        self._label_hist = []
        self._infer_hist = []
        for i in range(0, self._num_categories):
            self._label_hist.append(0)
            self._infer_hist.append(0)

        if self._stats_type == "python":
            mlops.init(ctx=None,
                       connect_mlops=True,
                       mlops_mode=MLOpsMode.AGENT)
        elif self._stats_type == "file":
            mlops.init(ctx=None,
                       connect_mlops=False,
                       mlops_mode=MLOpsMode.STAND_ALONE)
        else:
            self._stats_type = "none"

        if self._stats_type != "none":
            column_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
            self._infer_tbl = Table().name("categories").cols(column_names)
            self._infer_bar = BarGraph().name("categories bar").cols(
                column_names)

    def infer_stats(self, sample, label, inference):

        # for now, we only process 1 inference at a time
        inference = inference[0]
        prediction = ny.argmax(inference)
        confidence = inference[prediction]
        if confidence < self._conf_thresh:
            self.increment_low_conf()

        self._infer_hist[prediction] += 1

        if label is not None:
            if (self._hot_label):
                label = ny.argmax(label)
            self._label_hist[label] += 1

            if prediction == label:
                self.increment_correct()

        self.increment_total()
        if self.is_time_to_report():
            self.report_stats()

        return prediction

    def report_stats(self):

        # what percentage of the predictions had confidences less than the threshold
        low_conf_percent = self.get_low_conf(
        ) * 100.0 / self.get_report_interval()

        if low_conf_percent > self._conf_percent:
            mlops.health_alert(
                "Low confidence alert",
                "{}% of inferences had confidence below {}%".format(
                    low_conf_percent, self._conf_thresh * 100))

        for i in range(0, self._num_categories):
            print(i, "label_total =", self._label_hist[i], "infer_total = ",
                  self._infer_hist[i])

        print("total = ", self.get_total(), "total_correct = ",
              self.get_correct())

        category_data = [
            self._infer_hist[0], self._infer_hist[1], self._infer_hist[2],
            self._infer_hist[3], self._infer_hist[4], self._infer_hist[5],
            self._infer_hist[6], self._infer_hist[7], self._infer_hist[8],
            self._infer_hist[9]
        ]

        self._infer_tbl.add_row(str(self.get_cum_total()), category_data)
        self._infer_bar.data(category_data)

        if self._stats_type != "none":
            mlops.set_stat("correct_percent",
                           self.get_correct() * 100.0 / self.get_total())
            mlops.set_stat(self._infer_tbl)
            mlops.set_stat(self._infer_bar)
            # Update total prediction count with the all new predictions since we last reported.
            mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT,
                           self.get_report_interval())
            print("Completed {} predictions".format(
                self.get_report_interval()))

        self.reset()

    def __del__(self):
        mlops.done()
        super(CategoricalStatistics, self).__del__()
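A quick usage sketch for the class above; the constructor arguments and the one-hot label are illustrative values, not taken from the original code:

# Illustrative values only
stats = CategoricalStatistics(print_interval=100,
                              stats_type="file",   # stand-alone MLOps mode
                              num_categories=10,
                              conf_thresh=80,      # percent
                              conf_percent=20,
                              hot_label=True)

# One (sample, label, inference) triple at a time; inference is a batch of
# probability vectors and the label here is one-hot for class 9.
prediction = stats.infer_stats(sample=None,
                               label=[0] * 9 + [1],
                               inference=[[0.01] * 9 + [0.91]])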
Esempio n. 24
0
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    # If the data doesn't have headers, create column names c0-cn
    column_names_all = input_data.columns
    if pm_options.with_headers != "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()

    # Set both train and test data to the entire dataset
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    # Create the column-name list for the vector assembler, skipping any excluded columns
    exclude_cols = []  # No columns to exclude - kmeans on all columns
    column_names = input_train.columns
    input_col_names = [c for c in column_names if c not in exclude_cols]
    print(input_col_names)

    # Set hyperparameter search parameters
    k_range = pm_options.KRange.split(',')
    db_index_max = np.finfo(np.float64).max
    k_max = k_range[0]
    db_index_array = np.zeros(len(k_range))

    for index_hs in range(0, len(k_range)):
        vector_assembler = VectorAssembler(
                inputCols=input_col_names,
                outputCol="features")
        kmeans_pipe = KMeans(
            k=int(k_range[index_hs]),
            initMode="k-means||",
            initSteps=5,
            tol=1e-4,
            maxIter=100,
            featuresCol="features")
        full_pipe = [vector_assembler, kmeans_pipe]
        model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

        # Test validation and statistics collection
        ############################################################
        predicted_df = model_kmeans.transform(input_test)

        print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

        sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
        print("Sum of Errors for Kmeans = " + str(sum_errors))

        kmeans_centers = model_kmeans.stages[1].clusterCenters()
        print("Kmeans Centers: ")
        for center in kmeans_centers:
            print(center)

        # calculating stats
        ############################################################

        # Calculating Inter cluster distance
        inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))

        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                inter_cluster_distance[centerIndex1, centerIndex2] = \
                    eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])

        print("inter_cluster_distance = ", inter_cluster_distance)
        
        # Calculating Intra cluster distances and the bars for the cluster distribution
        intra_cluster_distance = np.zeros(len(kmeans_centers))
        cluster_dist = np.zeros(len(kmeans_centers))

        for centerIndex1 in range(0, len(kmeans_centers)):
            filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
            cluster_dist[centerIndex1] = filtered_df.count()
            if cluster_dist[centerIndex1] == 0:
                intra_cluster_distance[centerIndex1] = 0
            else:
                filtered_df = \
                    filtered_df.withColumn('distance',
                                           udf(eq_dist, FloatType())(col("features"),
                                                                     array([lit(v) for v in kmeans_centers[centerIndex1]])))
                intra_cluster_distance[centerIndex1] = \
                    filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

        # calculating Davies-Bouldin Index
        ############################################################
        # R[i,j] = (S[i] + S[j])/M[i,j]
        # D[i] = max(R[i,j]) for i !=j
        # DB = (1/K) * sum(D[i])
        r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                r_index[centerIndex1, centerIndex2] = 0
                if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                    r_index[centerIndex1, centerIndex2] = \
                        (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                        / inter_cluster_distance[centerIndex1, centerIndex2]
        d_index = np.max(r_index, axis=0)
        db_index = np.sum(d_index, axis=0) / len(kmeans_centers)
        db_index_array[index_hs] = db_index

        # Keep the best (lowest) Davies-Bouldin index found so far
        if db_index < db_index_max:
            db_index_max = db_index
            k_max = k_range[index_hs]
            model_kmeans_max = model_kmeans
            sum_errors_max = sum_errors
            kmeans_centers_max = kmeans_centers
            inter_cluster_distance_max = inter_cluster_distance
            intra_cluster_distance_max = intra_cluster_distance
            cluster_dist_max = cluster_dist

    # PM stats
    ############################################################
    print("Optimal K = " + str(k_max))
    pm.set_stat("Optimal number of clusters", k_max, st.TIME_SERIES)

    print("Sum of Errors for Kmeans = " + str(sum_errors_max))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors_max, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index_max))
    pm.set_stat("Davies-Bouldin index", db_index_max, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(k_range)):
        tbl_col_name.append(str(k_range[j]))
    tbl = Table().name("Davies-Bouldin index for hyper parameter Search").cols(tbl_col_name)
    tbl.add_row("Davies-Bouldin index:", ["%.2f" % x for x in db_index_array])
    pm.set_stat(tbl)

    tbl_col_name = []
    for j in range(0, len(kmeans_centers_max)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers_max)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance_max[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance_max])
    pm.set_stat(tbl)

    if len(kmeans_centers_max) < 6 and len(kmeans_centers_max[0]) < 12:
        tbl_col_name1 = []
        for j in range(0, len(kmeans_centers_max[0])):
            tbl_col_name1.append(str(j))
        tbl = Table().name("Centers (for K<6, Attr<12)").cols(tbl_col_name1)
        for j in range(0, len(kmeans_centers_max)):
            tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers_max[j]])
        pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Destribution").cols(tbl_col_name).data(cluster_dist_max.tolist())
    pm.set_stat(bar)


    return model_kmeans_max
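As a side note, the hand-rolled Davies-Bouldin computation above can be sanity-checked against scikit-learn's implementation on a small local sample (illustrative data below):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

X = np.random.rand(500, 4)  # illustrative data
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
print("Davies-Bouldin index:", davies_bouldin_score(X, labels))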
Esempio n. 25
0
def infer_loop(model, input, output_file, stats_interval, conf_thresh, conf_percent):

    output = open(output_file, "w")

    # Initialize statistics
    total_predictions = 0
    low_confidence_predictions = 0
    categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    prediction_hist = []
    for i in range(0, len(categories)):
        prediction_hist.append(0)

    ### MLOPS start
    # Create a bar graph and table for reporting prediction distributions and set the column names
    infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols(categories)
    infer_tbl = Table().name("Prediction Distribution Table").cols(categories)
    ### MLOPS end
    
    while True:
        try:
            sample, label = input.get_next_input()
            sample_np = ny.array(sample).reshape(1, -1)

            # The prediction is the class with the highest probability
            prediction = model.predict(sample_np)

            # Append the prediction to the output file
            output.write("{}\n".format(prediction))

            # Calculate statistics
            total_predictions += 1
            prediction_hist[int(prediction[0])] += 1

            # Report statistics
            if total_predictions % stats_interval == 0:

                # Report the prediction distribution
                for i in range(0, len(categories)):
                    print("category: {} predictions: {}".format(categories[i], prediction_hist[i]))


                ### MLOPS start

                # Show the prediction distribution as a table
                infer_tbl.add_row(str(total_predictions), prediction_hist)

                # Show the prediction distribution as a bar graph
                infer_bar.data(prediction_hist)

                # Report the stats each interval (assumed from the parallel
                # infer_loop example below; this listing had a gap here)
                mlops.set_stat(infer_tbl)
                mlops.set_stat(infer_bar)

                ### MLOPS end
        except EOFError:
            # stop when we hit end of input
            # Report the stats
            mlops.set_stat(infer_tbl)
            mlops.set_stat(infer_bar)

            ### MLOPS end
            output.close()

            ### MLOPS start
            mlops.done()
            ### MLOPS end

            break
Esempio n. 26
0
def run_mlops_tests(package_to_scan, test_to_run=None):
    """
    Given a package, scan it for modules whose names start with test_ and, in
    each module, run all the functions whose names start with test_.

    TODO: find a way to use pytest here if possible.

    :param package_to_scan: package to scan for test modules
    :param test_to_run: If provided run only a specific test "module.func"
    :raise Exception: In case of error in the tests an Exception will be raised
    """

    modules = detect_modules_in_package(package_to_scan)
    print("Detected modules: {}".format(modules))
    print("Loading and running test_XXX methods inside")

    results = []
    failed_tests = 0
    total_tests = 0

    for mod_name in modules:
        mod = importlib.import_module(package_to_scan.__name__ + "." +
                                      mod_name)

        mod_funcs = detect_module_methods(mod)

        module_results = dict()
        module_results["name"] = mod_name
        module_results["per_func"] = []
        module_results["pass"] = True
        print("Module {} funcs {}".format(mod, mod_funcs))

        for func_name in mod_funcs:
            if test_to_run is not None:
                full_test_name = "{}.{}".format(mod_name, func_name)
                if full_test_name != test_to_run:
                    continue

            total_tests += 1
            func_results = dict()
            func_results["name"] = func_name
            print("\n\nrunning test: {}.{}".format(mod_name, func_name))
            try:
                method_to_call = getattr(mod, func_name)
                method_to_call()
                func_results["pass"] = True
            except Exception as e:
                func_results["pass"] = False
                func_results["traceback"] = "".join(
                    traceback.format_exception(*sys.exc_info()))
                failed_tests += 1
                module_results["pass"] = False
            module_results["per_func"].append(func_results)
        results.append(module_results)

    # Table
    tbl = Table().name("Test Results").cols(["Module", "Test", "Status"])

    print("\n\n\n")
    print("Test Summary: total: {} ok: {} failed: {}".format(
        total_tests, total_tests - failed_tests, failed_tests))
    print("=======================================================")
    idx = 0
    for mod_res in results:
        print("Module: {}".format(mod_res["name"]))
        for func_res in mod_res["per_func"]:

            if test_to_run is not None:
                full_test_name = "{}.{}".format(mod_res["name"],
                                                func_res["name"])
                if full_test_name != test_to_run:
                    continue

            print("|    {:<20} {}".format(func_res["name"], func_res["pass"]))
            if func_res["pass"] is False:
                print("\n{}\n".format(func_res["traceback"]))

            tbl.add_row(str(idx), [
                mod_res["name"], func_res["name"],
                "pass" if func_res["pass"] else "fail"
            ])
            idx += 1

    pm.set_stat(tbl)
    pm.set_stat(E2EConstants.E2E_RUN_STAT, 1, st.TIME_SERIES)
    if failed_tests > 0:
        print("=======================================================\n")
        print("Aborting unit test due to errors")
        raise Exception(
            "Failed running unit tests! failed: {}\n".format(failed_tests))
Esempio n. 27
0
    mlops.set_stat("myCounterDouble", 5.5)
    mlops.set_stat("myCounterDouble2", 7.3)

    # Multi-line graph
    mlt = MultiLineGraph().name("Multi Line").labels(["l1",
                                                      "l2"]).data([5, 16])
    mlops.set_stat(mlt)

    # Example of sending a table to pm system.
    tbl = Table().name("MyTable").cols(["", "Date"])
    tbl.add_row(["line 1", "2001Q1"])
    tbl.add_row(["line 2", "2014Q3"])
    mlops.set_stat(tbl)

    bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd",
                                         "ee"]).data([10, 15, 12, 9, 8])
    mlops.set_stat(bar)

    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x**2 + y**2 <= 1 else 0
Esempio n. 28
0
def infer_loop(model, input, output_file, stats_interval, conf_tracker):

    output = open(output_file, "w")

    # Initialize statistics
    total_predictions = 0
    low_confidence_predictions = 0
    categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    prediction_hist = []
    for i in range(0, model.get_num_categories()):
        prediction_hist.append(0)

    ### MLOPS start
    # Create a bar graph and table for reporting prediction distributions and set the column names
    infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols(
        categories)
    infer_tbl = Table().name("Prediction Distribution Table").cols(categories)
    ### MLOPS end

    while True:
        try:
            sample, label = input.get_next_input()

            # Get the inference. This is an array of probabilities for each output value.
            inference = model.infer(sample)

            # The prediction is the class with the highest probability
            prediction = ny.argmax(inference)

            # The confidence for that prediction
            confidence = inference[prediction] * 100

            # Append the prediction to the output file
            output.write("{}\n".format(prediction))

            # Calculate statistics
            total_predictions += 1
            prediction_hist[prediction] += 1

            conf_tracker.check_confidence(confidence, sample)

            # Report statistics
            if total_predictions % stats_interval == 0:

                # Report the prediction distribution
                for i in range(0, model.get_num_categories()):
                    print("category: {} predictions: {}".format(
                        categories[i], prediction_hist[i]))

                ### MLOPS start

                # Update total prediction count with the all new predictions since we last reported
                mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT,
                               stats_interval)

                # Show the prediction distribution as a table
                infer_tbl.add_row(str(total_predictions), prediction_hist)

                # Show the prediction distribution as a bar graph
                infer_bar.data(prediction_hist)

                # Report the stats
                mlops.set_stat(infer_tbl)
                mlops.set_stat(infer_bar)

                ### MLOPS end

                conf_tracker.report_confidence(stats_interval)

        except EOFError:
            # stop when we hit end of input
            print("Reached end of input")
            output.close()

            break
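The conf_tracker passed into this loop isn't defined in the snippet; a minimal sketch of a tracker with the same two methods, assuming percent-scale confidences and an mlops health alert like the one in CategoricalStatistics above:

class ConfidenceTracker(object):
    # Hypothetical tracker matching the conf_tracker calls above.
    def __init__(self, conf_thresh, conf_percent):
        self._conf_thresh = conf_thresh    # e.g. 80 (percent)
        self._conf_percent = conf_percent  # alert if more than this % are low
        self._low = 0

    def check_confidence(self, confidence, sample):
        # Count low-confidence predictions; 'sample' could be kept for debugging
        if confidence < self._conf_thresh:
            self._low += 1

    def report_confidence(self, interval):
        low_pct = self._low * 100.0 / interval
        if low_pct > self._conf_percent:
            mlops.health_alert(
                "Low confidence alert",
                "{}% of inferences had confidence below {}%".format(
                    low_pct, self._conf_thresh))
        self._low = 0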
Esempio n. 29
0
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # Validation Split:          [{}]".format(
        pm_options.validation_split))

    print("PM: # AUC Threshold:             [{}]".format(
        pm_options.auc_threshold))
    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Estimators:                [{}]".format(
        pm_options.n_estimators))
    print("PM: # Max Depth:                 [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate:             [{}]".format(
        pm_options.learning_rate))
    print("PM: # Min Child Weight:          [{}]".format(
        pm_options.min_child_weight))
    print("PM: # Objective:                 [{}]".format(pm_options.objective))
    print("PM: # Gamma:                     [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step:            [{}]".format(
        pm_options.max_delta_step))
    print("PM: # Subsample:                 [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha:                 [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda:                [{}]".format(
        pm_options.reg_lambda))
    print("PM: # Scale Pos Weight:          [{}]".format(
        pm_options.scale_pos_weight))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data[:, 1:]  # select columns 1 through end
        y = data[:, 0]

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic classification data using scikit-learn
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        print("Adding Random Noise!")

        noisy_features = np.random.uniform(0, 1) * \
                         np.random.normal(0, 1,
                                          (num_samples, num_features))
        X = X + noisy_features

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(pm_options.validation_split), random_state=42)

    import xgboost as xgb

    # Create a model that should be deployed into production
    final_model = xgb.XGBClassifier(
        max_depth=int(pm_options.max_depth),
        min_child_weight=int(pm_options.min_child_weight),
        learning_rate=float(pm_options.learning_rate),
        n_estimators=int(pm_options.n_estimators),
        silent=True,
        objective=str(pm_options.objective),
        gamma=float(pm_options.gamma),
        max_delta_step=int(pm_options.max_delta_step),
        subsample=float(pm_options.subsample),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(pm_options.reg_alpha),
        reg_lambda=float(pm_options.reg_lambda),
        scale_pos_weight=float(pm_options.scale_pos_weight),
        seed=1,
        missing=None)

    final_model.fit(X_train, y_train)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(X_train)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    print("Pred Labels: ", pred_labels)
    print("Pred Probabilities: ", pred_probs)

    accuracy = accuracy_score(y_test, pred_labels)
    print("Accuracy values: \n {0}".format(accuracy))
    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    # column_names = value.astype(str).tolist()
    print("Validation Actual Label distributions: \n {0}".format(
        label_distribution))

    # Output Label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Validation Actual Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Validation Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name(
        "Validation Prediction Label Distributions").cols(
            (pred_label_distribution[:, 0]).astype(str).tolist()).data(
                (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # ROC for the chosen model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("ROC AUC values: \n {}".format(roc_auc))

    # Output ROC of the chosen model using MCenter
    mlops.set_stat("ROC AUC", roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(
                min_auc_requirement, roc_auc))

    # ROC Curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name(
        "Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random Curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='ROC Curve (Area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(),
                  y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    max_pred_probs = pred_probs.max(axis=1)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test == 1],
                                   max_pred_probs[y_test == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Save the model
    import pickle
    with open(pm_options.output_model, 'wb') as model_file:
        pickle.dump(final_model, model_file)
    # Terminate MLOps
    mlops.done()
Esempio n. 30
0
def ab_test(options, start_time, end_time, mode):
    sc = None
    if mode == RunModes.PYSPARK:
        from pyspark import SparkContext
        sc = SparkContext(appName="pm-ab-testing")
        pm.init(sc)
    elif mode == RunModes.PYTHON:
        pm.init()
    else:
        raise Exception("Invalid mode " + mode)

    not_enough_data = False

    # Following are a and b component names
    a_prediction_component_name = options.nodeA
    b_prediction_component_name = options.nodeB

    conv_a_stat_name = options.conversionsA
    conv_b_stat_name = options.conversionsB

    samples_a_stat_name = options.samplesA
    samples_b_stat_name = options.samplesB

    a_agent = utils._get_agent_id(a_prediction_component_name, options.agentA)
    b_agent = utils._get_agent_id(b_prediction_component_name, options.agentB)

    if a_agent is None or b_agent is None:
        print("Invalid agent provided {} or {}".format(options.agentA, options.agentB))
        pm.system_alert("PyException",
                        "Invalid Agent {} or {}".format(options.agentA, options.agentB))
        return

    try:
        a_samples = pm.get_stats(name=samples_a_stat_name, mlapp_node=a_prediction_component_name,
                                 agent=a_agent, start_time=start_time,
                                 end_time=end_time)

        b_samples = pm.get_stats(name=samples_b_stat_name, mlapp_node=b_prediction_component_name,
                                 agent=b_agent, start_time=start_time,
                                 end_time=end_time)

        a_samples_pdf = pd.DataFrame(a_samples)
        b_samples_pdf = pd.DataFrame(b_samples)

        try:
            rowa1 = int(a_samples_pdf.tail(1)['value'])
            rowb1 = int(b_samples_pdf.tail(1)['value'])
        except Exception as e:
            not_enough_data = True
            print("Not enough samples stats produced in pipelines")
            raise ValueError("Not enough data to compare")

        a_conv = pm.get_stats(name=conv_a_stat_name, mlapp_node=a_prediction_component_name,
                              agent=a_agent, start_time=start_time,
                              end_time=end_time)
        b_conv = pm.get_stats(name=conv_b_stat_name, mlapp_node=b_prediction_component_name,
                              agent=b_agent, start_time=start_time,
                              end_time=end_time)

        a_conv_pdf = pd.DataFrame(a_conv)
        b_conv_pdf = pd.DataFrame(b_conv)

        try:
            rowa2 = int(a_conv_pdf.tail(1)['value'])
            rowb2 = int(b_conv_pdf.tail(1)['value'])
        except Exception as e:
            not_enough_data = True
            print("Not enough conversion stats produced in pipelines")
            raise ValueError("Not enough data to compare")

        abHealth = statsCalculator()
        abHealth.exptOutcome(float(rowa1), float(rowa2), float(rowb1), float(rowb2),
                             options.confidence)
        confidence = abHealth.calConfidence()
        out = abHealth.calSuccess(options.confidence)

        # calculate conversion rate
        convA = float(rowa2) / float(rowa1)
        convB = float(rowb2) / float(rowb1)
        if convA != 0.0:
            relUplift = (convB - convA) / (convA)
        else:
            relUplift = convB
        relUplift = relUplift * 100

        # AB Graphs
        ab = MultiGraph().name("AB").set_continuous()

        ab.x_title("Conversion Rate (%)")
        ab.y_title(" ")

        # normalizing x and y axis for A for display
        dist_a_norm_x = [a_x * 100.0 / rowa1 for a_x in abHealth._distControl[0].tolist()]
        dist_a_norm_y = [a_y * rowa1 / 100.0 for a_y in abHealth._distControl[1].tolist()]
        ab.add_series(label="A", x=dist_a_norm_x, y=dist_a_norm_y)

        # normalizing x and y axis for B for display
        dist_b_norm_x = [b_x * 100.0 / rowb1 for b_x in abHealth._distB[0].tolist()]
        dist_b_norm_y = [b_y * rowb1 / 100.0 for b_y in abHealth._distB[1].tolist()]
        ab.add_series(label="B", x=dist_b_norm_x, y=dist_b_norm_y)

        # annotate confidence line on normalized x-axis
        ab.annotate(label="{} %".format(options.confidence),
                    x=abHealth._verticalLine * 100.0 / rowa1)

        # Annotate the CR lines on the normalized x-axis; merge the labels when
        # the rates coincide so one label doesn't overwrite the other in the display
        if convA != convB:
            ab.annotate(label="CR A {}".format(convA * 100.0), x=convA * 100.0)
            ab.annotate(label="CR B {}".format(convB * 100.0), x=convB * 100.0)
        else:
            ab.annotate(label="CR A & B {}".format(convA * 100.0), x=convA * 100.0)

        pm.set_stat(ab)

        # conversion rate
        cols = ["A", "B"]
        mlt = MultiLineGraph().name("ConversionRate").labels(cols).data(
            [convA * 100.0, convB * 100.0])
        pm.set_stat(mlt)

        # emit table with all stats
        tbl2 = Table().name("AB Stats").cols(
            ["Samples Processed", "Conversions", "Conversion Rate (%)",
             "Improvement (%)", "Chance to beat baseline (%)"])
        tbl2.add_row(options.champion,
                     [str(rowa1), str(rowa2), "{0:.2f}".format(convA * 100), "-", "-"])
        tbl2.add_row(options.challenger, [str(rowb1), str(rowb2), "{0:.2f}".format(convB * 100),
                                          "{0:.2f}".format(relUplift),
                                          "{0:.2f}".format(confidence)])
        pm.set_stat(tbl2)

        # set cookie
        tbl = Table().name("cookie").cols(["uplift", "champion", "challenger",
                                           "conversionA", "conversionB", "realUplift", "success",
                                           "confidence", "realConfidence"])
        tbl.add_row("1", [str(options.uplift), options.champion, options.challenger,
                          "{0:.2f}".format(convA * 100), "{0:.2f}".format(convB * 100),
                          "{0:.2f}".format(abHealth._uplift), str(out), str(options.confidence),
                          "{0:.2f}".format(abHealth.calConfidence())])
        pm.set_stat(tbl)

        if out:
            pm.data_alert("DataAlert", "AB Test Success zScore {}".format(abHealth._zScore))
            pm.set_stat("Success", 1, st.TIME_SERIES)
        else:
            pm.set_stat("Success", 0, st.TIME_SERIES)

    except Exception as e:
        if not_enough_data is False:
            print("Got exception while getting stats: {}".format(e))
            pm.system_alert("PyException", "Got exception {}".format(e))

    if mode == RunModes.PYSPARK:
        sc.stop()
    pm.done()
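The statsCalculator used above is project code, but a "chance to beat baseline" number like the one it reports is conventionally derived from a two-proportion z-test; a bare-bones sketch of that calculation (all names are assumptions):

import math

def two_proportion_z(samples_a, conv_a, samples_b, conv_b):
    # Hypothetical z-score for "B beats A", using a pooled standard error
    p_a = conv_a / float(samples_a)
    p_b = conv_b / float(samples_b)
    p_pool = (conv_a + conv_b) / float(samples_a + samples_b)
    se = math.sqrt(p_pool * (1 - p_pool) * (1.0 / samples_a + 1.0 / samples_b))
    return (p_b - p_a) / se if se else 0.0

def chance_to_beat(z):
    # Confidence that B beats A = standard normal CDF of the z-score
    return 0.5 * (1.0 + math.erf(z / math.sqrt(2.0)))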