def test_page_hinkley(test_path):
    """Page-Hinkley drift detection test.

    (Docstring previously said "ADWIN" — the detector under test is
    PageHinkley.)

    The first half of the stream contains a sequence corresponding to a
    normal distribution of integers from 0 to 1. From index 999 to 1999
    the sequence is a normal distribution of integers from 0 to 7.
    """
    ph = PageHinkley()
    test_file = os.path.join(test_path, 'drift_stream.npy')
    data_stream = np.load(test_file)
    expected_indices = [1013, 1335, 1505, 1758]
    detected_indices = []

    # Feed the stream one sample at a time and record the index of every
    # detected change point.
    for i, sample in enumerate(data_stream):
        ph.update(sample)
        if ph.change_detected:
            detected_indices.append(i)

    assert detected_indices == expected_indices
def test_drift_pca_pagehinkley(self):
    """Run a drift check with Page-Hinkley over PCA-reduced data.

    NOTE(review): this test only prints the outcome and makes no
    assertions, so it can never fail on a wrong drift decision — consider
    asserting on `in_drift` / `drift_index` once the expected values are
    known.
    """
    drift_algorithm = PageHinkley(min_instances=30, delta=0.005,
                                  threshold=80, alpha=1 - 0.01)
    # Fixed local-variable typo: "dimesionality" -> "dimensionality".
    dimensionality_reduction = PCA()
    detector = DriftDetector(drift_algorithm, dimensionality_reduction)
    detector.update_base_data(self.training_data)
    in_drift, drift_index = detector.drift_check(self.test_data)
    print("in_drift", str(in_drift))
    print("drift_index", drift_index)
def test_page_hinkley():
    """Page-Hinkley on data_stream_1 must flag exactly the known indices."""
    expected_indices = [1020, 1991]
    assert perform_test(PageHinkley(), data_stream_1) == expected_indices
def setUpClass(self):
    """Configure topics, Gaussian models, drift detection, and the handler.

    When models are not preloaded, one ADOC training video is streamed
    into the Kafka training topic first.
    """
    # Topic / bootstrap configuration.
    self.is_initial_training_from_topic = False
    self.initially_load_models = True
    self.initial_training_data = None
    self.inference_data_topic = 'inference'
    self.prediction_result_topic = 'prediction'
    self.training_data_topic = 'training'

    # Send training data only when models are not loaded from disk.
    if not self.initially_load_models:
        dataset_dir = ADOC_DATASET_LOCATION
        train_videos = sorted(
            f for f in os.listdir(dataset_dir) if f[0:5] == 'train')
        # Restrict to a single video to keep the test fast.
        for video_name in train_videos[1:2]:
            producer = VideoProducer(
                "localhost:29092",
                self.training_data_topic,
                os.path.join(dataset_dir, video_name),
                debug=True,
                resize_to_dimension=(256, 256))
            producer.send_video(extra_fields={"sequence_name": video_name})

    self.user_constraints = {
        "is_real_time": False,
        "minimum_efectiveness": None,
    }
    # Two Gaussian models with different PCA variance-retention targets.
    self.models = [
        {
            "name": "gaussian_1",
            "training_rate": 200,
            "efectiveness": 30,
            "inference_rate": 10,
            "model": Gaussian(model_name='gaussian_1', pca=True,
                              pca_n_components=.95),
        },
        {
            "name": "gaussian_2",
            "training_rate": 250,
            "efectiveness": 25,
            "inference_rate": 10,
            "model": Gaussian(model_name='gaussian_2', pca=True,
                              pca_n_components=.90),
        },
    ]
    self.drift_algorithm = PageHinkley(min_instances=10, delta=0.005,
                                       threshold=10, alpha=1 - 0.01)
    self.dimensionality_reduction = PCA()
    self.number_training_frames_after_drift = 10
    self.handler = MainHandler(
        models=self.models,
        user_constraints=self.user_constraints,
        number_training_frames_after_drift=self.number_training_frames_after_drift,
        drift_algorithm=self.drift_algorithm,
        dimensionality_reduction=self.dimensionality_reduction,
        training_data_topic=self.training_data_topic,
        is_initial_training_from_topic=self.is_initial_training_from_topic,
        initial_training_data=self.initial_training_data,
        prediction_result_topic=self.prediction_result_topic,
        inference_data_topic=self.inference_data_topic,
        initially_load_models=self.initially_load_models)
# pageHinkley
import numpy as np
from river.drift import PageHinkley

np.random.seed(12345)

ph = PageHinkley()

# Simulate a data stream composed by two data distributions:
# 1000 ints drawn from [0, 3) followed by 1000 ints drawn from [4, 16).
data_stream = np.concatenate((
    np.random.randint(3, size=1000),
    np.random.randint(4, high=16, size=1000),
))

# Update drift detector and verify if change is detected.
for i, val in enumerate(data_stream):
    in_drift, in_warning = ph.update(val)
    if in_drift:
        print(f"Change detected at index {i}, input value: {val}")
# Change detected at index 1009, input value: 5
def setUpClass(self):
    """Build mock training data and wire a MainHandler with three mock models.

    Training frames after a drift are provided explicitly
    (provide_training_data_after_drift=True) rather than read from a topic.
    """
    self.is_initial_training_from_topic = False
    self.inference_data_topic = 'inference'
    self.prediction_result_topic = 'prediction'

    # Mock training data: the two sklearn sample images, 20 frames each,
    # resized to the handler's expected 256x256 resolution.
    self.training_data_topic = None
    dataset = load_sample_images()
    frames = [dataset.images[0]] * 20 + [dataset.images[1]] * 20
    self.initial_training_data = [
        cv2.resize(frame, (256, 256)) for frame in frames]

    # (The ADOC VideoProducer-based training path is disabled here; only
    # the topic name is kept.)
    self.training_data_topic = 'training'

    self.user_constraints = {
        "is_real_time": False,
        "minimum_efectiveness": None,
    }
    self.models = [
        {
            "name": "model_1",
            "training_rate": 200,
            "efectiveness": 30,
            "inference_rate": 10,
            "model": MockModel(40, model_name="model_1"),
        },
        {
            "name": "model_2",
            "training_rate": 300,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(30, model_name="model_2"),
        },
        {
            "name": "model_3",
            "training_rate": 400,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(10, model_name="model_3"),
        },
    ]
    self.drift_algorithm = PageHinkley(min_instances=20, delta=0.005,
                                       threshold=10, alpha=1 - 0.01)
    self.dimensionality_reduction = PCA()
    self.number_training_frames_after_drift = 10
    self.handler = MainHandler(
        models=self.models,
        user_constraints=self.user_constraints,
        number_training_frames_after_drift=self.number_training_frames_after_drift,
        drift_algorithm=self.drift_algorithm,
        dimensionality_reduction=self.dimensionality_reduction,
        training_data_topic=self.training_data_topic,
        is_initial_training_from_topic=self.is_initial_training_from_topic,
        initial_training_data=self.initial_training_data,
        prediction_result_topic=self.prediction_result_topic,
        inference_data_topic=self.inference_data_topic,
        provide_training_data_after_drift=True)
def setUpClass(self):
    """Stream one ADOC training video into Kafka and construct the handler."""
    self.inference_data_topic = 'inference'
    self.prediction_result_topic = 'prediction'

    # Send training data to the Kafka training topic.
    self.training_data_topic = 'training'
    dataset_dir = ADOC_DATASET_LOCATION
    train_videos = sorted(
        f for f in os.listdir(dataset_dir) if f[0:5] == 'train')
    # not all videos for test — only the second one.
    for video_name in train_videos[1:2]:
        producer = VideoProducer(
            KAFKA_BROKER_LIST,
            self.training_data_topic,
            os.path.join(dataset_dir, video_name),
            debug=True,
            resize_to_dimension=(256, 256))
        producer.send_video(extra_fields={"sequence_name": video_name})

    self.user_constraints = {
        "is_real_time": False,
        "minimum_efectiveness": None,
    }
    self.models = [
        {
            "name": "model_1",
            "training_rate": 200,
            "efectiveness": 30,
            "inference_rate": 10,
            "model": MockModel(40, model_name="model_1"),
        },
        {
            "name": "model_2",
            "training_rate": 300,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(30, model_name="model_2"),
        },
        {
            "name": "model_3",
            "training_rate": 400,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(10, model_name="model_3"),
        },
    ]
    self.drift_algorithm = PageHinkley(min_instances=20, delta=0.005,
                                       threshold=10, alpha=1 - 0.01)
    self.dimensionality_reduction = PCA()
    self.number_training_frames_after_drift = 10
    self.handler = MainHandler(
        models=self.models,
        user_constraints=self.user_constraints,
        number_training_frames_after_drift=self.number_training_frames_after_drift,
        drift_algorithm=self.drift_algorithm,
        dimensionality_reduction=self.dimensionality_reduction,
        training_data_topic=self.training_data_topic,
        prediction_result_topic=self.prediction_result_topic,
        inference_data_topic=self.inference_data_topic)
def setUpClass(self):
    """Build a single-sequence mock training set and construct the handler.

    Uses a small drift window (min_instances=5) and only 5 post-drift
    training frames.
    """
    self.is_initial_training_from_topic = False
    self.inference_data_topic = 'inference'
    self.prediction_result_topic = 'prediction'

    # Mock training data: one sklearn sample image repeated 20 times,
    # resized to 256x256.
    self.training_data_topic = None
    dataset = load_sample_images()
    frames = [dataset.images[0]] * 20
    self.initial_training_data = [
        cv2.resize(frame, (256, 256)) for frame in frames]

    # Topic name kept even though initial training comes from the mock data.
    self.training_data_topic = 'training'

    self.user_constraints = {
        "is_real_time": False,
        "minimum_efectiveness": None,
    }
    self.models = [
        {
            "name": "model_1",
            "training_rate": 200,
            "efectiveness": 30,
            "inference_rate": 10,
            "model": MockModel(50, model_name="model_1"),
        },
        {
            "name": "model_2",
            "training_rate": 300,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(30, model_name="model_2"),
        },
        {
            "name": "model_3",
            "training_rate": 400,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(10, model_name="model_3"),
        },
    ]
    self.drift_algorithm = PageHinkley(min_instances=5, delta=0.005,
                                       threshold=10, alpha=1 - 0.01)
    self.dimensionality_reduction = PCA()
    # What happens if there are less infered examples than this number?
    self.number_training_frames_after_drift = 5
    self.handler = MainHandler(
        models=self.models,
        user_constraints=self.user_constraints,
        number_training_frames_after_drift=self.number_training_frames_after_drift,
        drift_algorithm=self.drift_algorithm,
        dimensionality_reduction=self.dimensionality_reduction,
        training_data_topic=self.training_data_topic,
        is_initial_training_from_topic=self.is_initial_training_from_topic,
        initial_training_data=self.initial_training_data,
        prediction_result_topic=self.prediction_result_topic,
        inference_data_topic=self.inference_data_topic)