def test_ensemble_prediction():
    ann_submodel_1 = ann_sm.ANN_Submodel(weight=1, pdf=None)
    ann_submodel_2 = ann_sm.ANN_Submodel(weight=1, pdf=None)
    ensemble = ens.ModelEnsmeble()
    print("ann_submodel_1={}".format(ann_submodel_1))
    print("ann_submodel_2={}".format(ann_submodel_2))
    print("ensemble={}".format(ensemble))

    # Set up 2 processes (class distributions are flipped)
    # class 1 Gaussian distribution params
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    # class 2 Gaussian distribution params
    mean_2 = [3, 3]
    cov_2 = [[1, 0], [0, 1]]

    gauss_params_1 = []
    gauss_params_1.append((mean_1, cov_1))
    gauss_params_1.append((mean_2, cov_2))
    process_1 = prc.Process(num_dimensions=2,
                            num_classes=2,
                            class_distribution_parameters=gauss_params_1)

    gauss_params_2 = []
    gauss_params_2.append((mean_2, cov_2))
    gauss_params_2.append((mean_1, cov_1))
    process_2 = prc.Process(num_dimensions=2,
                            num_classes=2,
                            class_distribution_parameters=gauss_params_2)
    print(process_1)
    print(process_2)

    # Generate 2 training datasets from the 2 processes
    training_data_1 = process_1.generate_data_points_from_all_labels(
        total_count=1000)
    training_data_2 = process_2.generate_data_points_from_all_labels(
        total_count=1000)

    # Generate test data from process_2
    test_data = process_2.generate_data_points_from_all_labels(
        total_count=1000)

    # Train the ANN_Submodels and add them to the ensemble
    ann_submodel_1.train(training_data_1)
    ann_submodel_2.train(training_data_2)
    ensemble.add_submodel(ann_submodel_1)
    ensemble.add_submodel(ann_submodel_2)

    # Test the ANN_Submodels and the ensemble separately and report results
    predict_and_print_results(ann_submodel_1, test_data,
                              "test_ensemble_prediction: ann_submodel_1")
    predict_and_print_results(ann_submodel_2, test_data,
                              "test_ensemble_prediction: ann_submodel_2")
    predict_and_print_results(ensemble, test_data,
                              "test_ensemble_prediction: ensemble")
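# NOTE: predict_and_print_results() is used throughout these tests but defined
# elsewhere in the repo. Below is a minimal sketch of such a helper, assuming
# the model exposes predict(), that predict() fills in point.predicted_y, and
# that each DataPoint carries true_y (assumptions inferred from how the tests
# use these objects; the *_sketch suffix marks it as illustrative, not the
# actual helper):
def predict_and_print_results_sketch(model, test_data, title):
    model.predict(test_data)  # Assumed to set point.predicted_y on each point
    correct = sum(1 for point in test_data
                  if point.predicted_y == point.true_y)
    accuracy = correct / len(test_data)
    print("{}: accuracy={:.4f} ({}/{})".format(title, accuracy, correct,
                                               len(test_data)))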
def test_generate_data_points_from_all_labels():
    gauss_params = []
    # class 1 Gaussian distribution params
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    # class 2 Gaussian distribution params
    mean_2 = [3, 3]
    cov_2 = [[1, 0], [0, 1]]
    gauss_params.append((mean_1, cov_1))
    gauss_params.append((mean_2, cov_2))
    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=gauss_params)
    print(process)

    # Generate data from all labels and plot
    data_points_all_labels = process.generate_data_points_from_all_labels(
        total_count=1000)
    x = [point.X[0] for point in data_points_all_labels]
    y = [point.X[1] for point in data_points_all_labels]
    plt.scatter(x, y)
    plt.show()
def test_total_variation_distance_single_gaussians():
    # Generate some data from a class
    gauss_params = []
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    mean_2 = [4, 4]
    cov_2 = [[1, 0], [0, 1]]
    gauss_params.append((mean_1, cov_1))
    gauss_params.append((mean_2, cov_2))
    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=gauss_params)
    print(process)

    # Generate data
    data_points1 = process.generate_data_points(label=0, count=500)
    data_points2 = process.generate_data_points(label=0, count=500)
    data_points3 = process.generate_data_points(
        label=1, count=500)  # From a different label
    X_dataset1 = [point.X for point in data_points1]
    X_dataset2 = [point.X for point in data_points2]
    X_dataset3 = [point.X for point in data_points3]

    # Estimate probability distributions of the datasets
    kde_estimator1 = ddif.estimate_pdf_kde(X_dataset1)
    kde_estimator2 = ddif.estimate_pdf_kde(X_dataset2)
    kde_estimator3 = ddif.estimate_pdf_kde(X_dataset3)
    (w1, w2, bounds1) = dd.prepare_sample_windows(data_points1, data_points2)
    (w1, w2, bounds2) = dd.prepare_sample_windows(data_points1, data_points3)

    # Compute the difference metric between distributions multiple times
    diff_list_1 = []
    diff_list_2 = []
    for i in range(50):
        diff1 = ddif.total_variation_distance(kde_estimator1, kde_estimator2,
                                              bounds1)
        diff2 = ddif.total_variation_distance(kde_estimator1, kde_estimator3,
                                              bounds2)
        diff_list_1.append(diff1)
        diff_list_2.append(diff2)
        if (i % 5 == 0):
            print("index={}".format(i))

    # Plot
    plt.plot(diff_list_1, label='diff_1_2 - same label')
    plt.plot(diff_list_2, label='diff_1_3 - different labels')
    plt.legend(loc='upper right')
    plt.ylabel('diff')
    plt.show()
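# The loop above evaluates ddif.total_variation_distance() 50 times because a
# Monte Carlo estimate of the distance varies from call to call. Below is a
# sketch of such an estimator, assuming the KDE objects follow sklearn's
# KernelDensity API (score_samples() returns log-densities) and that bounds is
# a per-dimension sequence of (low, high) pairs -- both inferred from how this
# file uses them, not confirmed against the repo:
def total_variation_distance_sketch(kde_p, kde_q, bounds, num_samples=1000):
    # TV(p, q) = 0.5 * integral of |p(x) - q(x)| dx, approximated by uniform
    # Monte Carlo sampling over the hyper-rectangle defined by bounds.
    lows = np.array([b[0] for b in bounds])
    highs = np.array([b[1] for b in bounds])
    points = np.random.uniform(lows, highs, size=(num_samples, len(bounds)))
    p = np.exp(kde_p.score_samples(points))
    q = np.exp(kde_q.score_samples(points))
    volume = np.prod(highs - lows)  # Volume of the integration region
    return 0.5 * volume * np.mean(np.abs(p - q))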
def test_estimate_2d_pdf_kde():
    # Generate some data from a class
    gauss_params = []
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    mean_2 = [3, 3]
    cov_2 = [[1, 0], [0, 1]]
    gauss_params.append((mean_1, cov_1))
    gauss_params.append((mean_2, cov_2))
    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=gauss_params)
    print(process)

    # Generate data
    data_points_1 = process.generate_data_points(label=0, count=1000)
    X_dataset_1 = [point.X for point in data_points_1]
    data_points_2 = process.generate_data_points_from_all_labels(
        total_count=1000)
    X_dataset_2 = [point.X for point in data_points_2]

    # Estimate pdfs
    kde_estimator_1 = ddif.estimate_pdf_kde(X_dataset_1)
    kde_estimator_2 = ddif.estimate_pdf_kde(X_dataset_2)

    # Evaluate the pdfs at the dataset samples
    log_values_1 = kde_estimator_1.score_samples(X_dataset_1)
    pdf_values_1 = np.exp(log_values_1)
    log_values_2 = kde_estimator_2.score_samples(X_dataset_2)
    pdf_values_2 = np.exp(log_values_2)

    x = [sample[0] for sample in X_dataset_1]
    y = [sample[1] for sample in X_dataset_1]

    # Plot
    fig = plt.figure()
    # fig.gca(projection='3d') no longer accepts arguments in modern matplotlib
    ax = fig.add_subplot(projection='3d')
    ax.scatter(x, y, pdf_values_1, label='single-gaussian')
    x = [sample[0] for sample in X_dataset_2]
    y = [sample[1] for sample in X_dataset_2]
    ax.scatter(x, y, pdf_values_2, label='two-gaussians')
    plt.legend(loc='upper right')
    plt.show()
def test_estimate_1d_pdf_kde():
    # Generate some data from a class
    gauss_params = []
    mean_1 = [0]
    cov_1 = [[1]]
    mean_2 = [4]
    cov_2 = [[1]]
    gauss_params.append((mean_1, cov_1))
    gauss_params.append((mean_2, cov_2))
    process = prc.Process(num_dimensions=1,
                          num_classes=2,
                          class_distribution_parameters=gauss_params)
    print(process)

    # Generate data
    data_points_1 = process.generate_data_points(label=0, count=1000)
    X_dataset_1 = [point.X for point in data_points_1]
    data_points_2 = process.generate_data_points_from_all_labels(
        total_count=1000)
    X_dataset_2 = [point.X for point in data_points_2]

    # Estimate pdfs
    kde_estimator_1 = ddif.estimate_pdf_kde(X_dataset_1)
    kde_estimator_2 = ddif.estimate_pdf_kde(X_dataset_2)

    # Get pdf values at the dataset samples
    log_values_1 = kde_estimator_1.score_samples(X_dataset_1)
    pdf_values_1 = np.exp(log_values_1)
    log_values_2 = kde_estimator_2.score_samples(X_dataset_2)
    pdf_values_2 = np.exp(log_values_2)

    # Sort samples and pdf values together so the line plots run left to right
    X_dataset_1, pdf_values_1 = (list(x) for x in zip(
        *sorted(zip(X_dataset_1, pdf_values_1))))
    X_dataset_2, pdf_values_2 = (list(x) for x in zip(
        *sorted(zip(X_dataset_2, pdf_values_2))))

    # Plot
    plt.scatter(X_dataset_1, pdf_values_1, label='single-gaussian')
    plt.plot(X_dataset_1, pdf_values_1)
    plt.scatter(X_dataset_2, pdf_values_2, color='r', label='two-gaussians')
    plt.plot(X_dataset_2, pdf_values_2, color='r')
    plt.legend(loc='upper right')
    plt.show()
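# Both KDE tests above call ddif.estimate_pdf_kde(). Since the returned object
# exposes score_samples(), it is presumably a fitted
# sklearn.neighbors.KernelDensity. Below is a minimal sketch under that
# assumption (the bandwidth is illustrative, not necessarily the repo's
# actual setting):
def estimate_pdf_kde_sketch(X_dataset, bandwidth=0.5):
    from sklearn.neighbors import KernelDensity

    # Fit a Gaussian-kernel density estimate; callers recover pdf values via
    # np.exp(estimator.score_samples(X)), as the tests above do.
    return KernelDensity(kernel='gaussian',
                         bandwidth=bandwidth).fit(X_dataset)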
def test_results_manager():
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    process = prc.Process(num_dimensions=2,
                          num_classes=1,
                          class_distribution_parameters=[(mean_1, cov_1)])
    data_points = process.generate_data_points(label=0, count=100)

    results_manager = rman.ResultsManager(avg_error_window_size=10,
                                          title_suffix="Test_scenario")
    batch_size = 5
    batch = []
    diff_sum = 0
    for index, data_point in enumerate(data_points):
        batch.append(data_point)
        diff = 1 / (index + 1)
        diff_sum += diff
        is_drift_detected = False

        # Set predicted y values (every third prediction is made incorrect)
        data_point.predicted_y = data_point.true_y
        if (index % 3 == 0):
            data_point.predicted_y = -100

        if (index % 15 == 0):
            is_drift_detected = True

        if (index % batch_size == 0):
            results_manager.add_prediction_result(index, batch)
            results_manager.add_detection_info(index, diff, diff_sum,
                                               is_drift_detected)
            batch = []

        if (index % 20 == 0):
            results_manager.print_results()

    results_manager.add_special_marker(73, "marker_1")
    results_manager.add_special_marker(83, "marker_2")
    results_manager.plot_results()
def test_tree_submodel_training():
    tree_submodel = tree_sm.DecisionTreeSubmodel(weight=1,
                                                 pdf=None,
                                                 classifer_type="Artificial")
    print(tree_submodel)

    # Generate some data from a 2-class stochastic process (for training and
    # testing)
    # Set up the process
    gauss_params = []
    # class 1 Gaussian distribution params
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    # class 2 Gaussian distribution params
    mean_2 = [3, 3]
    cov_2 = [[1, 0], [0, 1]]
    gauss_params.append((mean_1, cov_1))
    gauss_params.append((mean_2, cov_2))
    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=gauss_params)
    print(process)

    # Generate training and test data
    training_data = process.generate_data_points_from_all_labels(
        total_count=1000)
    test_data = process.generate_data_points_from_all_labels(total_count=1000)

    # Train the DecisionTreeSubmodel
    tree_submodel.train(training_data)

    # Test the DecisionTreeSubmodel and report results
    predict_and_print_results(tree_submodel, test_data,
                              "test_tree_submodel_training")
def test_adaptation():
    ensemble = ens.ModelEnsmeble()
    adaptor = da.DriftAdaptor(ensemble, "ANN_Submodel", "Artificial")
    print(adaptor)

    # Set up 2 processes
    # class 1 Gaussian distribution params
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    # class 2 Gaussian distribution params
    mean_2 = [3, 3]
    cov_2 = [[1, 0], [0, 1]]
    gauss_params_1 = []
    gauss_params_1.append((mean_1, cov_1))
    gauss_params_1.append((mean_2, cov_2))
    process_1 = prc.Process(num_dimensions=2,
                            num_classes=2,
                            class_distribution_parameters=gauss_params_1)

    # class 1 Gaussian distribution params
    mean_1 = [0, 5]
    cov_1 = [[1, 0], [0, 1]]
    # class 2 Gaussian distribution params
    mean_2 = [5, 0]
    cov_2 = [[1, 0], [0, 1]]
    gauss_params_2 = []
    gauss_params_2.append((mean_1, cov_1))
    gauss_params_2.append((mean_2, cov_2))
    process_2 = prc.Process(num_dimensions=2,
                            num_classes=2,
                            class_distribution_parameters=gauss_params_2)
    print(process_1)
    print(process_2)

    # Generate 2 training datasets from the 2 processes
    training_data_1 = process_1.generate_data_points_from_all_labels(
        total_count=1000)
    training_data_2 = process_2.generate_data_points_from_all_labels(
        total_count=1000)

    # Generate test data from process_2
    test_data = process_2.generate_data_points_from_all_labels(total_count=500)

    # Call adapt_ensemble() on the two datasets in sequence
    adaptor.adapt_ensemble(training_data_1)
    print("adaptor after first adapt_ensemble() call = {}".format(adaptor))
    adaptor.adapt_ensemble(training_data_2)
    print("adaptor after second adapt_ensemble() call = {}".format(adaptor))
    predict_and_print_results(adaptor.ensemble, test_data,
                              "test_adaptation: ensemble")

    # For comparison, train 2 ANN_Submodels on the 2 datasets and check results
    ann_submodel_1 = ann_sm.ANN_Submodel(weight=1, pdf=None)
    ann_submodel_2 = ann_sm.ANN_Submodel(weight=1, pdf=None)
    ann_submodel_1.train(training_data_1)
    ann_submodel_2.train(training_data_2)
    predict_and_print_results(ann_submodel_1, test_data,
                              "test_adaptation: ann_submodel_1")
    predict_and_print_results(ann_submodel_2, test_data,
                              "test_adaptation: ann_submodel_2")
def __init__(self, sys_params):
    self.ensemble = ens.ModelEnsmeble()
    self.detector = dd.DriftDetector(
        sys_params.detector_window_size,
        sys_params.detector_diff_threshold_to_sum,
        sys_params.detector_diff_sum_threshold_to_detect)
    self.results_manager = rman.ResultsManager(
        sys_params.results_manager_avg_error_window_size,
        sys_params.system_coordinator_drift_scenario)
    self.results_manager.init_baseline(baseline_name="no_adaptation",
                                       baseline_num=0)
    self.results_manager.init_baseline(baseline_name="all_data",
                                       baseline_num=1)
    self.results_manager.init_baseline(baseline_name="latest_window",
                                       baseline_num=2)

    self.initial_dataset_size = sys_params.system_coordinator_initial_dataset_size
    self.total_sequence_size = sys_params.system_coordinator_total_sequence_size
    # How many DataPoints are generated per main loop iteration
    self.batch_size = sys_params.system_coordinator_batch_size
    self.submodel_type = sys_params.adaptor_submodel_type
    self.drift_scenario = sys_params.system_coordinator_drift_scenario

    # Parameters specific to scenarios
    create_process = True  # For artificial datasets
    classifier_type = "Artificial"  # For artificial datasets
    if (self.drift_scenario == "Abrupt_Drift"):
        self.process_class_distribution_parameters_2 = \
            sys_params.process_class_distribution_parameters_2
        # To ensure we set the second set of process parameters only once
        self.was_drift_occured = False
        self.midpoint = (self.total_sequence_size +
                         self.initial_dataset_size) / 2
        self.set_process_parameters = self.set_abrupt_drift_process_params
        self.generate_data_batch = self.generate_artificial_data_batch
    elif (self.drift_scenario == "Gradual_Drift"):
        self.midpoint = (self.total_sequence_size +
                         self.initial_dataset_size) / 2
        drift_period_size = sys_params.system_coordinator_drift_period_size
        self.drift_start_seq = self.midpoint - drift_period_size / 2
        self.drift_end_seq = self.midpoint + drift_period_size / 2
        self.increment = (sys_params.system_coordinator_mean_dim1_shift *
                          self.batch_size / drift_period_size)
        self.is_left_printed = False  # To print drift start and end only once
        self.is_right_printed = False
        self.set_process_parameters = self.set_gradual_drift_process_params
        self.generate_data_batch = self.generate_artificial_data_batch
    elif (self.drift_scenario == "Recurring_Context"):
        self.process_class_distribution_parameters = \
            sys_params.process_class_distribution_parameters
        self.process_class_distribution_parameters_2 = \
            sys_params.process_class_distribution_parameters_2
        self.recurrence_count = sys_params.system_coordinator_recurrence_count
        self.between_switch_size = (
            self.total_sequence_size + self.initial_dataset_size) / (
                self.recurrence_count + 1)
        # To have enough diff-stable periods between switches
        assert self.between_switch_size >= 4 * self.detector.window_size
        self.next_switch_point = self.between_switch_size
        self.set_process_parameters = self.set_recurring_context_process_params
        self.generate_data_batch = self.generate_artificial_data_batch
    elif (self.drift_scenario == "Real_World_Dataset"):
        self.real_data_points = []
        self.curr_real_dataset_pos = 0
        self.load_real_dataset(
            sys_params.system_coordinator_real_dataset_filename)
        self.set_process_parameters = self.set_real_dataset_params
        self.generate_data_batch = self.generate_real_data_batch
        create_process = False
        classifier_type = "Real"
    else:
        assert False  # Unknown drift scenario

    self.process = None
    if (create_process):
        self.process = prc.Process(
            sys_params.process_num_dimensions,
            sys_params.process_num_classes,
            sys_params.process_class_distribution_parameters)

    # Baseline models
    self.original_model = da.create_submodel(
        sys_params.adaptor_submodel_type, classifier_type)
    self.all_data_model = da.create_submodel(
        sys_params.adaptor_submodel_type, classifier_type)
    self.latest_window_model = da.create_submodel(
        sys_params.adaptor_submodel_type, classifier_type)
    # Store all the data points, to be used by the "all_data" baseline
    self.all_data = []

    self.adaptor = da.DriftAdaptor(self.ensemble,
                                   sys_params.adaptor_submodel_type,
                                   classifier_type)
def test_abrupt_drift_detection():
    window_size = 500
    drift_detector = dd.DriftDetector(window_size=window_size,
                                      diff_threshold_to_sum=0.005,
                                      diff_sum_threshold_to_detect=0.05)
    print(drift_detector)

    # Generate some data from a 2-class stochastic process
    # Set up the process
    gauss_params = []
    # class 1 Gaussian distribution params
    mean_1 = [0, 0]
    cov_1 = [[1, 0], [0, 1]]
    # class 2 Gaussian distribution params
    mean_2 = [4, 4]
    cov_2 = [[1, 0], [0, 1]]
    gauss_params.append((mean_1, cov_1))
    gauss_params.append((mean_2, cov_2))
    process = prc.Process(num_dimensions=2,
                          num_classes=2,
                          class_distribution_parameters=gauss_params)
    print(process)

    # Generate data
    count = 4000
    midpoint = int(count / 2)
    data_points = process.generate_data_points_from_all_labels(
        total_count=count)  # First half from label=0, second half from label=1

    # Emulate a data point sequence
    seq_no = []
    expected_drift_seq = []
    detection_batch_size = 50  # 10 is a good value
    for index, point in enumerate(data_points):
        drift_detector.add_data_points([point])
        # Run detection after a batch of samples has been added
        if (index % detection_batch_size == 0):
            drift_detector.run_detection(index)
            seq_no.append(index)

            # Expected drift: nonzero while parts of the two windows fall on
            # either side of 'midpoint'
            if (index > midpoint and index < midpoint + 2 * window_size):
                to_right = index - midpoint
                to_left = 2 * window_size - to_right
                expected_drift = min(to_left, to_right) / window_size
            else:
                expected_drift = 0
            expected_drift_seq.append(expected_drift)

            print("index={}".format(index))
            print("detector={}".format(drift_detector))

    # Scale expected drift values to compare with actual drift values
    peak_actual_drift = max(drift_detector.diff_sequence)
    expected_drift_seq = [
        val * peak_actual_drift for val in expected_drift_seq
    ]

    plt.plot(seq_no, drift_detector.diff_sequence, label='actual_drift')
    plt.plot(seq_no, drift_detector.diff_sum_sequence, label='actual_diff_sum')
    plt.plot(seq_no, expected_drift_seq, label='expected_drift')
    plt.legend(loc='upper right')
    plt.ylabel('diff')
    plt.show()
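# A convenience entry point for running the tests above individually when this
# file is executed as a script (an assumption about how the file is invoked;
# plots open one after another, so comment tests in or out as needed):
if __name__ == "__main__":
    test_generate_data_points_from_all_labels()
    test_estimate_1d_pdf_kde()
    test_estimate_2d_pdf_kde()
    test_total_variation_distance_single_gaussians()
    test_abrupt_drift_detection()
    test_ensemble_prediction()
    test_tree_submodel_training()
    test_adaptation()
    test_results_manager()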