Exemple #1
0
def test_ensemble_prediction():
    """Compare two ANN submodels against an ensemble that contains both.

    Two 2-class processes are built from the same pair of Gaussians, with the
    class order swapped in the second process. One submodel is trained per
    process, both are added to the ensemble, and all three predictors are
    evaluated on a fresh dataset drawn from the second process.
    """
    ann_submodel_1 = ann_sm.ANN_Submodel(weight=1, pdf=None)
    ann_submodel_2 = ann_sm.ANN_Submodel(weight=1, pdf=None)
    ensemble = ens.ModelEnsmeble()
    print("ann_submodel_1={}".format(ann_submodel_1))
    print("ann_submodel_2={}".format(ann_submodel_2))
    print("ensemble={}".format(ensemble))

    # Gaussian (mean, covariance) pairs shared by both processes
    origin_gauss = ([0, 0], [[1, 0], [0, 1]])
    shifted_gauss = ([3, 3], [[1, 0], [0, 1]])

    # Same two Gaussians in both processes; only the class order differs
    process_1 = prc.Process(
        num_dimensions=2,
        num_classes=2,
        class_distribution_parameters=[origin_gauss, shifted_gauss])
    process_2 = prc.Process(
        num_dimensions=2,
        num_classes=2,
        class_distribution_parameters=[shifted_gauss, origin_gauss])

    print(process_1)
    print(process_2)

    # One training set per process, then a test set from process_2
    training_data_1 = process_1.generate_data_points_from_all_labels(
        total_count=1000)
    training_data_2 = process_2.generate_data_points_from_all_labels(
        total_count=1000)
    test_data = process_2.generate_data_points_from_all_labels(
        total_count=1000)

    # Train both submodels before registering them with the ensemble
    trained_pairs = ((ann_submodel_1, training_data_1),
                     (ann_submodel_2, training_data_2))
    for submodel, training_data in trained_pairs:
        submodel.train(training_data)
    for submodel, _ in trained_pairs:
        ensemble.add_submodel(submodel)

    # Evaluate each submodel and the ensemble separately on the same test set
    predict_and_print_results(ann_submodel_1, test_data,
                              "test_ensemble_prediction: ann_submodel_1")
    predict_and_print_results(ann_submodel_2, test_data,
                              "test_ensemble_prediction: ann_submodel_2")
    predict_and_print_results(ensemble, test_data,
                              "test_ensemble_prediction: ensemble")
def test_generate_data_points_from_all_labels():
    """Scatter-plot 1000 points drawn from a 2-class, 2-dimensional process."""
    # (mean, covariance) for class 1 and class 2
    class_params = [
        ([0, 0], [[1, 0], [0, 1]]),
        ([3, 3], [[1, 0], [0, 1]]),
    ]

    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=class_params)
    print(process)

    # Draw points across all labels
    samples = process.generate_data_points_from_all_labels(total_count=1000)

    # Split the feature vectors into their two coordinates for plotting
    first_coords = []
    second_coords = []
    for sample in samples:
        first_coords.append(sample.X[0])
        second_coords.append(sample.X[1])

    plt.scatter(first_coords, second_coords)
    plt.show()
Exemple #3
0
def test_total_variation_distance_single_gaussians():
    """Plot total-variation distance for same-label and different-label datasets.

    Three 500-point datasets are drawn (two from label 0, one from label 1),
    a KDE is fitted to each, and the distance between the first dataset and
    each of the other two is evaluated 50 times and plotted.
    """
    # (mean, covariance) for the two classes
    class_params = [
        ([0, 0], [[1, 0], [0, 1]]),
        ([4, 4], [[1, 0], [0, 1]]),
    ]

    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=class_params)
    print(process)

    # Two datasets from label 0 and one from label 1
    points_a = process.generate_data_points(label=0, count=500)
    points_b = process.generate_data_points(label=0, count=500)
    points_c = process.generate_data_points(label=1, count=500)

    features_a = [point.X for point in points_a]
    features_b = [point.X for point in points_b]
    features_c = [point.X for point in points_c]

    # Estimate the probability distribution of each dataset
    kde_a = ddif.estimate_pdf_kde(features_a)
    kde_b = ddif.estimate_pdf_kde(features_b)
    kde_c = ddif.estimate_pdf_kde(features_c)

    # Only the third element (bounds) of each result is used below
    _, _, bounds_same = dd.prepare_sample_windows(points_a, points_b)
    _, _, bounds_cross = dd.prepare_sample_windows(points_a, points_c)

    # Evaluate the difference metric repeatedly
    same_label_diffs = []
    cross_label_diffs = []
    for iteration in range(50):
        same_label_diffs.append(
            ddif.total_variation_distance(kde_a, kde_b, bounds_same))
        cross_label_diffs.append(
            ddif.total_variation_distance(kde_a, kde_c, bounds_cross))

        # Progress indicator every fifth iteration
        if iteration % 5 == 0:
            print("index={}".format(iteration))

    # Plot both series on one figure
    plt.plot(same_label_diffs, label='diff_1_2 - same label')
    plt.plot(cross_label_diffs, label='diff_1_3 - different labels')
    plt.legend(loc='upper right')
    plt.ylabel('diff')
    plt.show()
Exemple #4
0
def test_estimate_2d_pdf_kde():
    """Visualise KDE density estimates for 2-D data in a 3-D scatter plot.

    Two datasets of 1000 points are drawn from a 2-class process: one from
    label 0 only and one across all labels. A KDE is fitted to each dataset
    and the estimated density at every sample is plotted as the z-coordinate.
    """
    # (mean, covariance) for the two classes
    gauss_params = [
        ([0, 0], [[1, 0], [0, 1]]),
        ([3, 3], [[1, 0], [0, 1]]),
    ]

    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=gauss_params)
    print(process)

    # Generate data
    data_points_1 = process.generate_data_points(label=0, count=1000)
    X_dataset_1 = [point.X for point in data_points_1]

    data_points_2 = process.generate_data_points_from_all_labels(
        total_count=1000)
    X_dataset_2 = [point.X for point in data_points_2]

    # Estimate pdf
    kde_estimator_1 = ddif.estimate_pdf_kde(X_dataset_1)
    kde_estimator_2 = ddif.estimate_pdf_kde(X_dataset_2)

    # score_samples() output is exponentiated to obtain density values
    pdf_values_1 = np.exp(kde_estimator_1.score_samples(X_dataset_1))
    pdf_values_2 = np.exp(kde_estimator_2.score_samples(X_dataset_2))

    # Plot. Figure.gca() no longer accepts keyword arguments in current
    # Matplotlib releases, so the 3-D axes are created via add_subplot().
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    x = [sample[0] for sample in X_dataset_1]
    y = [sample[1] for sample in X_dataset_1]
    ax.scatter(x, y, pdf_values_1, label='single-gaussian')

    x = [sample[0] for sample in X_dataset_2]
    y = [sample[1] for sample in X_dataset_2]
    ax.scatter(x, y, pdf_values_2, label='two-gaussians')

    plt.legend(loc='upper right')
    plt.show()
Exemple #5
0
def test_estimate_1d_pdf_kde():
    """Plot KDE density estimates for 1-D data from one label and from all labels.

    Each dataset is sorted together with its density values so that the line
    plot runs through the samples in ascending order.
    """
    # (mean, covariance) for the two 1-D classes
    class_params = [
        ([0], [[1]]),
        ([4], [[1]]),
    ]

    process = prc.Process(num_dimensions=1,
                          num_classes=2,
                          class_distribution_parameters=class_params)
    print(process)

    # Generate data: label 0 only, then all labels
    single_label_points = process.generate_data_points(label=0, count=1000)
    single_label_X = [point.X for point in single_label_points]

    all_label_points = process.generate_data_points_from_all_labels(
        total_count=1000)
    all_label_X = [point.X for point in all_label_points]

    # Estimate pdf
    single_label_kde = ddif.estimate_pdf_kde(single_label_X)
    all_label_kde = ddif.estimate_pdf_kde(all_label_X)

    # Density values at the dataset samples (exponentiated score_samples output)
    single_label_pdf = np.exp(single_label_kde.score_samples(single_label_X))
    all_label_pdf = np.exp(all_label_kde.score_samples(all_label_X))

    # Sort (sample, density) pairs, then unzip them back into two lists
    single_label_pairs = sorted(zip(single_label_X, single_label_pdf))
    single_label_X, single_label_pdf = map(list, zip(*single_label_pairs))

    all_label_pairs = sorted(zip(all_label_X, all_label_pdf))
    all_label_X, all_label_pdf = map(list, zip(*all_label_pairs))

    # Plot
    plt.scatter(single_label_X, single_label_pdf, label='single-gaussian')
    plt.plot(single_label_X, single_label_pdf)

    plt.scatter(all_label_X, all_label_pdf, color='r', label='two-gaussians')
    plt.plot(all_label_X, all_label_pdf, color='r')

    plt.legend(loc='upper right')
    plt.show()
def test_results_manager():
    """Feed synthetic predictions and detection info into a ResultsManager.

    100 points are drawn from a single-class process. Every third point gets
    a wrong prediction (-100), every fifteenth index is flagged as a drift
    detection, and results are reported to the manager whenever the index is
    a multiple of the batch size.
    """
    process = prc.Process(
        num_dimensions=2,
        num_classes=1,
        class_distribution_parameters=[([0, 0], [[1, 0], [0, 1]])])

    data_points = process.generate_data_points(label=0, count=100)

    results_manager = rman.ResultsManager(avg_error_window_size=10,
                                          title_suffix="Test_scenario")

    batch_size = 5
    pending_batch = []
    running_diff_sum = 0

    for index, data_point in enumerate(data_points):
        pending_batch.append(data_point)

        # Synthetic diff value that shrinks as the index grows
        diff = 1 / (index + 1)
        running_diff_sum += diff

        # Predict correctly by default; every third point is mispredicted
        data_point.predicted_y = data_point.true_y
        if index % 3 == 0:
            data_point.predicted_y = -100

        is_drift_detected = (index % 15 == 0)

        # Report whenever the index is a multiple of the batch size, then
        # start a fresh list for the next batch
        if index % batch_size == 0:
            results_manager.add_prediction_result(index, pending_batch)
            results_manager.add_detection_info(index, diff, running_diff_sum,
                                               is_drift_detected)
            pending_batch = []

        if index % 20 == 0:
            results_manager.print_results()

    for marker_position, marker_name in ((73, "marker_1"), (83, "marker_2")):
        results_manager.add_special_marker(marker_position, marker_name)
    results_manager.plot_results()
Exemple #7
0
def test_tree_submodel_training():
    """Train a DecisionTreeSubmodel on a 2-class process and report test results."""
    tree_submodel = tree_sm.DecisionTreeSubmodel(weight=1,
                                                 pdf=None,
                                                 classifer_type="Artificial")
    print(tree_submodel)

    # Two-class stochastic process used for both training and testing:
    # (mean, covariance) for class 1 and class 2
    class_params = [
        ([0, 0], [[1, 0], [0, 1]]),
        ([3, 3], [[1, 0], [0, 1]]),
    ]

    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=class_params)
    print(process)

    # Generate the training set first, then the test set
    training_data = process.generate_data_points_from_all_labels(
        total_count=1000)
    test_data = process.generate_data_points_from_all_labels(total_count=1000)

    # Train the tree submodel
    tree_submodel.train(training_data)

    # Evaluate the tree submodel and report results
    predict_and_print_results(tree_submodel, test_data,
                              "test_tree_submodel_training")
Exemple #8
0
def test_adaptation():
    """Adapt an ensemble to two different processes and compare with plain submodels.

    A DriftAdaptor is asked to adapt its ensemble first to data from
    process_1 and then to data from process_2. The resulting ensemble is
    evaluated on fresh process_2 data. For comparison, two standalone
    ANN submodels are trained on the two training sets and evaluated on the
    same test data.
    """
    ensemble = ens.ModelEnsmeble()
    # The classifier type must be spelled "Artificial", matching the value
    # used by the other callers in this file; it was misspelled before.
    adaptor = da.DriftAdaptor(ensemble, "ANN_Submodel", "Artificial")

    print(adaptor)

    # Setup 2 processes

    # Process 1: class 1 and class 2 Gaussian (mean, covariance) params
    gauss_params_1 = [
        ([0, 0], [[1, 0], [0, 1]]),
        ([3, 3], [[1, 0], [0, 1]]),
    ]
    process_1 = prc.Process(num_dimensions=2,
                            num_classes=2,
                            class_distribution_parameters=gauss_params_1)

    # Process 2: class 1 and class 2 Gaussian (mean, covariance) params
    gauss_params_2 = [
        ([0, 5], [[1, 0], [0, 1]]),
        ([5, 0], [[1, 0], [0, 1]]),
    ]
    process_2 = prc.Process(num_dimensions=2,
                            num_classes=2,
                            class_distribution_parameters=gauss_params_2)

    print(process_1)
    print(process_2)

    # Generate 2 training datasets from the 2 processes
    training_data_1 = process_1.generate_data_points_from_all_labels(
        total_count=1000)
    training_data_2 = process_2.generate_data_points_from_all_labels(
        total_count=1000)

    # Generate a test data from process_2
    test_data = process_2.generate_data_points_from_all_labels(total_count=500)

    # Call adapt_ensemble() on the first dataset, then on the second
    adaptor.adapt_ensemble(training_data_1)
    print("adaptor after first adapt_ensemble() call = {}".format(adaptor))

    adaptor.adapt_ensemble(training_data_2)
    print("adaptor after second adapt_ensemble() call = {}".format(adaptor))

    predict_and_print_results(adaptor.ensemble, test_data,
                              "test_adaptation: ensemble")

    # For comparison, train 2 ANN_Submodels on the 2 datasets and check results
    ann_submodel_1 = ann_sm.ANN_Submodel(weight=1, pdf=None)
    ann_submodel_2 = ann_sm.ANN_Submodel(weight=1, pdf=None)

    ann_submodel_1.train(training_data_1)
    ann_submodel_2.train(training_data_2)

    predict_and_print_results(ann_submodel_1, test_data,
                              "test_adaptation: ann_submodel_1")
    predict_and_print_results(ann_submodel_2, test_data,
                              "test_adaptation: ann_submodel_2")
Exemple #9
0
    def __init__(self, sys_params):
        """Build the ensemble, detector, results manager, baselines and adaptor.

        The drift scenario named by
        ``sys_params.system_coordinator_drift_scenario`` selects which
        ``set_process_parameters`` / ``generate_data_batch`` methods are
        bound on the instance and which scenario-specific attributes are
        initialised.

        Args:
            sys_params: Parameter object. Attributes read here are the
                ``detector_*``, ``results_manager_*``,
                ``system_coordinator_*``, ``adaptor_submodel_type`` and
                ``process_*`` settings; which of them are read depends on
                the scenario.

        Raises:
            AssertionError: If the scenario name is not one of
                "Abrupt_Drift", "Gradual_Drift", "Recurring_Context" or
                "Real_World_Dataset", or if a "Recurring_Context" run has
                fewer than four detector windows between switches.
        """
        self.ensemble = ens.ModelEnsmeble()

        self.detector = dd.DriftDetector(
            sys_params.detector_window_size,
            sys_params.detector_diff_threshold_to_sum,
            sys_params.detector_diff_sum_threshold_to_detect)

        self.results_manager = rman.ResultsManager(
            sys_params.results_manager_avg_error_window_size,
            sys_params.system_coordinator_drift_scenario)

        # Baselines are numbered in the order they are registered
        baseline_names = ("no_adaptation", "all_data", "latest_window")
        for baseline_num, baseline_name in enumerate(baseline_names):
            self.results_manager.init_baseline(baseline_name=baseline_name,
                                               baseline_num=baseline_num)

        self.initial_dataset_size = sys_params.system_coordinator_initial_dataset_size
        self.total_sequence_size = sys_params.system_coordinator_total_sequence_size
        # How many DataPoints are generated per main loop iteration
        self.batch_size = sys_params.system_coordinator_batch_size
        self.submodel_type = sys_params.adaptor_submodel_type
        self.drift_scenario = sys_params.system_coordinator_drift_scenario

        # Scenario-specific setup. The defaults below apply to the
        # artificial datasets; the real-world branch overrides them.
        scenario = self.drift_scenario
        needs_process = True
        classifier_kind = "Artificial"

        if scenario == "Abrupt_Drift":
            self.process_class_distribution_parameters_2 = sys_params.process_class_distribution_parameters_2
            # To ensure the second set of process parameters is set only once
            self.was_drift_occured = False
            self.midpoint = (self.total_sequence_size +
                             self.initial_dataset_size) / 2

            self.set_process_parameters = self.set_abrupt_drift_process_params
            self.generate_data_batch = self.generate_artificial_data_batch

        elif scenario == "Gradual_Drift":
            self.midpoint = (self.total_sequence_size +
                             self.initial_dataset_size) / 2

            # The drift period is centred on the midpoint
            drift_period_size = sys_params.system_coordinator_drift_period_size
            half_period = drift_period_size / 2
            self.drift_start_seq = self.midpoint - half_period
            self.drift_end_seq = self.midpoint + half_period

            self.increment = sys_params.system_coordinator_mean_dim1_shift * self.batch_size / drift_period_size

            # To print drift start and end only once
            self.is_left_printed = False
            self.is_right_printed = False

            self.set_process_parameters = self.set_gradual_drift_process_params
            self.generate_data_batch = self.generate_artificial_data_batch

        elif scenario == "Recurring_Context":
            self.process_class_distribution_parameters = sys_params.process_class_distribution_parameters
            self.process_class_distribution_parameters_2 = sys_params.process_class_distribution_parameters_2

            self.recurrence_count = sys_params.system_coordinator_recurrence_count

            sequence_span = (self.total_sequence_size +
                             self.initial_dataset_size)
            self.between_switch_size = sequence_span / (
                self.recurrence_count + 1)
            # To have enough diff-stable periods between switches
            assert self.between_switch_size >= 4 * self.detector.window_size
            self.next_switch_point = self.between_switch_size

            self.set_process_parameters = self.set_recurring_context_process_params
            self.generate_data_batch = self.generate_artificial_data_batch

        elif scenario == "Real_World_Dataset":
            # These two attributes are initialised before the dataset is loaded
            self.real_data_points = []
            self.curr_real_dataset_pos = 0
            self.load_real_dataset(
                sys_params.system_coordinator_real_dataset_filename)

            self.set_process_parameters = self.set_real_dataset_params
            self.generate_data_batch = self.generate_real_data_batch

            needs_process = False
            classifier_kind = "Real"

        else:
            assert False

        # A generating process exists only for the artificial scenarios
        self.process = None
        if needs_process:
            self.process = prc.Process(
                sys_params.process_num_dimensions,
                sys_params.process_num_classes,
                sys_params.process_class_distribution_parameters)

        # Baseline models
        self.original_model = da.create_submodel(
            sys_params.adaptor_submodel_type, classifier_kind)
        self.all_data_model = da.create_submodel(
            sys_params.adaptor_submodel_type, classifier_kind)
        self.latest_window_model = da.create_submodel(
            sys_params.adaptor_submodel_type, classifier_kind)
        # Store all the data points, to be used by "all_data" baseline
        self.all_data = []

        self.adaptor = da.DriftAdaptor(self.ensemble,
                                       sys_params.adaptor_submodel_type,
                                       classifier_kind)
Exemple #10
0
def test_abrupt_drift_detection():
    """Run the drift detector over a generated sequence and plot actual vs expected drift.

    Points are fed to the detector one at a time; detection runs whenever the
    index is a multiple of the detection batch size. An expected-drift curve
    that peaks one window after the midpoint is scaled to the detector's peak
    diff value and plotted alongside the detector's own sequences.
    """
    window_size = 500
    drift_detector = dd.DriftDetector(window_size=window_size,
                                      diff_threshold_to_sum=0.005,
                                      diff_sum_threshold_to_detect=0.05)
    print(drift_detector)

    # Two-class stochastic process: (mean, covariance) for class 1 and class 2
    class_params = [
        ([0, 0], [[1, 0], [0, 1]]),
        ([4, 4], [[1, 0], [0, 1]]),
    ]

    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=class_params)
    print(process)

    # Generate data
    count = 4000
    midpoint = count // 2
    # NOTE(review): the expected-drift curve assumes the first half comes
    # from label=0 and the second half from label=1 -- confirm against
    # generate_data_points_from_all_labels().
    data_points = process.generate_data_points_from_all_labels(
        total_count=count)

    # Emulate a data point sequence
    detection_indices = []
    expected_drift_seq = []
    detection_batch_size = 50  # 10 is a good value
    for index, point in enumerate(data_points):
        drift_detector.add_data_points([point])

        # Detection only runs after a batch of samples has been added
        if index % detection_batch_size != 0:
            continue

        drift_detector.run_detection(index)
        detection_indices.append(index)

        # Expected drift is non-zero only while the two windows straddle
        # the midpoint
        if midpoint < index < midpoint + 2 * window_size:
            past_midpoint = index - midpoint
            before_window_end = 2 * window_size - past_midpoint
            expected_drift = min(before_window_end,
                                 past_midpoint) / window_size
        else:
            expected_drift = 0

        expected_drift_seq.append(expected_drift)

        print("index={}".format(index))

    print("detector={}".format(drift_detector))

    # Scale expected drift values to compare with actual drift values
    peak_actual_drift = max(drift_detector.diff_sequence)
    scaled_expected_drift = [
        expected * peak_actual_drift for expected in expected_drift_seq
    ]

    plt.plot(detection_indices,
             drift_detector.diff_sequence,
             label='actual_drift')
    plt.plot(detection_indices,
             drift_detector.diff_sum_sequence,
             label='actual_diff_sum')
    plt.plot(detection_indices, scaled_expected_drift, label='expected_drift')
    plt.legend(loc='upper right')
    plt.ylabel('diff')
    plt.show()