Example #1
def test_page_hinkley(test_path):
    """
    Page-Hinkley drift detection test.
    The first half of the stream is drawn from a distribution of integers between 0 and 1.
    From index 1000 to 1999 the stream is drawn from a distribution of integers between 0 and 7.
    """
    ph = PageHinkley()
    test_file = os.path.join(test_path, 'drift_stream.npy')
    data_stream = np.load(test_file)
    expected_indices = [1013, 1335, 1505, 1758]
    detected_indices = []

    for i in range(data_stream.size):
        ph.update(data_stream[i])
        if ph.change_detected:
            detected_indices.append(i)

    assert detected_indices == expected_indices
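Example #1 reads its stream from drift_stream.npy rather than generating it inline. For reference, a stream matching the docstring (a narrow integer range for the first half, a wider range for the second) could be produced like this; the seed, exact ranges, and file location are assumptions, not the project's actual fixture:

import numpy as np

# Hypothetical sketch of how the test fixture might be built: 1000 samples
# from a narrow integer range followed by 1000 samples from a wider one.
rng = np.random.default_rng(42)                 # assumed seed
first_half = rng.integers(0, 2, size=1000)      # integers in [0, 1]
second_half = rng.integers(0, 8, size=1000)     # integers in [0, 7]
np.save('drift_stream.npy', np.concatenate((first_half, second_half)))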
Example #2
    def test_drift_pca_pagehinkley(self):
        drift_algorithm = PageHinkley(min_instances=30,
                                      delta=0.005,
                                      threshold=80,
                                      alpha=1 - 0.01)
        dimensionality_reduction = PCA()
        detector = DriftDetector(drift_algorithm, dimensionality_reduction)
        detector.update_base_data(self.training_data)

        in_drift, drift_index = detector.drift_check(self.test_data)
        print("in_drift", str(in_drift))
        print("drift_index", drift_index)
Example #3
def test_page_hinkley():
    expected_indices = [1020, 1991]
    detected_indices = perform_test(PageHinkley(), data_stream_1)

    assert detected_indices == expected_indices
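perform_test and data_stream_1 are defined elsewhere in that test module. Judging from Example #1, the helper presumably just runs the detector over the stream and collects the indices where a change is flagged; a hypothetical version might look like this:

def perform_test(detector, data_stream):
    # Hypothetical helper: feed each value to the detector and record the
    # indices at which it reports a change.
    detected_indices = []
    for i, value in enumerate(data_stream):
        detector.update(value)
        if detector.change_detected:
            detected_indices.append(i)
    return detected_indices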
Example #4
    def setUpClass(self):

        self.is_initial_training_from_topic = False
        self.initially_load_models = True
        self.initial_training_data = None

        self.inference_data_topic = 'inference'
        self.prediction_result_topic = 'prediction'
        self.training_data_topic = 'training'

        # # Send training data
        if not self.initially_load_models:

            adoc_dataset_location = ADOC_DATASET_LOCATION
            video_files = os.listdir(adoc_dataset_location)
            train_video_files = [x for x in video_files if x[0:5] == 'train']
            train_video_files.sort()
            train_video_files = train_video_files[1:2]  # not all videos for test
            for video in train_video_files:
                video_producer = VideoProducer("localhost:29092",
                                               self.training_data_topic,
                                               os.path.join(
                                                   adoc_dataset_location,
                                                   video),
                                               debug=True,
                                               resize_to_dimension=(256, 256))
                video_producer.send_video(
                    extra_fields={"sequence_name": video})

        self.user_constraints = {
            "is_real_time": False,
            "minimum_efectiveness": None
        }

        self.models = [{
            "name": "gaussian_1",
            "training_rate": 200,
            "efectiveness": 30,
            "inference_rate": 10,
            "model": Gaussian(model_name='gaussian_1', pca=True, pca_n_components=.95)
        }, {
            "name": "gaussian_2",
            "training_rate": 250,
            "efectiveness": 25,
            "inference_rate": 10,
            "model": Gaussian(model_name='gaussian_2', pca=True, pca_n_components=.90)
        }]
        self.drift_algorithm = PageHinkley(min_instances=10,
                                           delta=0.005,
                                           threshold=10,
                                           alpha=1 - 0.01)
        self.dimensionality_reduction = PCA()
        self.number_training_frames_after_drift = 10

        self.handler = MainHandler(
            models=self.models,
            user_constraints=self.user_constraints,
            number_training_frames_after_drift=self.number_training_frames_after_drift,
            drift_algorithm=self.drift_algorithm,
            dimensionality_reduction=self.dimensionality_reduction,
            training_data_topic=self.training_data_topic,
            is_initial_training_from_topic=self.is_initial_training_from_topic,
            initial_training_data=self.initial_training_data,
            prediction_result_topic=self.prediction_result_topic,
            inference_data_topic=self.inference_data_topic,
            initially_load_models=self.initially_load_models)
Example #5
# Page-Hinkley

import numpy as np
from river.drift import PageHinkley
np.random.seed(12345)

ph = PageHinkley()

# Simulate a data stream composed of two data distributions
data_stream = np.concatenate((np.random.randint(3, size=1000),
                               np.random.randint(4, high=16, size=1000)))

# Update drift detector and verify if change is detected
for i, val in enumerate(data_stream):
    in_drift, in_warning = ph.update(val)
    if in_drift:
        print(f"Change detected at index {i}, input value: {val}")
# Change detected at index 1009, input value: 5
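The tuple returned by update() matches older river releases. In more recent river versions update() returns the detector itself and drift is exposed through the drift_detected attribute; an equivalent loop, assuming a current river release, would be:

# Same stream, newer river API: update() no longer returns (in_drift, in_warning).
ph = PageHinkley()
for i, val in enumerate(data_stream):
    ph.update(val)
    if ph.drift_detected:
        print(f"Change detected at index {i}, input value: {val}")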
Example #6
    def setUpClass(self):

        self.is_initial_training_from_topic = False

        self.inference_data_topic = 'inference'
        self.prediction_result_topic = 'prediction'

        # Mock training data
        self.training_data_topic = None
        dataset = load_sample_images()
        sequence_1 = [dataset.images[0] for x in range(20)]
        sequence_2 = [dataset.images[1] for x in range(20)]
        self.initial_training_data = sequence_1 + sequence_2

        for i in range(0, len(self.initial_training_data)):
            self.initial_training_data[i] = cv2.resize(
                self.initial_training_data[i], (256, 256))

        # # Send training data
        self.training_data_topic = 'training'

        # adoc_dataset_location = ADOC_DATASET_LOCATION
        # video_files = os.listdir(adoc_dataset_location)
        # train_video_files = [x for x in video_files if x[0:5] == 'train']
        # train_video_files.sort()
        # train_video_files = train_video_files[1:2] # not all videos for test
        # for video in train_video_files:
        #     video_producer = VideoProducer("localhost:29092", self.training_data_topic, os.path.join(adoc_dataset_location, video), debug=True, resize_to_dimension=(256,256))
        #     video_producer.send_video(extra_fields={"sequence_name": video})

        self.user_constraints = {
            "is_real_time": False,
            "minimum_efectiveness": None
        }

        self.models = [{
            "name": "model_1",
            "training_rate": 200,
            "efectiveness": 30,
            "inference_rate": 10,
            "model": MockModel(40, model_name="model_1")
        }, {
            "name": "model_2",
            "training_rate": 300,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(30, model_name="model_2")
        }, {
            "name": "model_3",
            "training_rate": 400,
            "efectiveness": 20,
            "inference_rate": 20,
            "model": MockModel(10, model_name="model_3")
        }]
        self.drift_algorithm = PageHinkley(min_instances=20,
                                           delta=0.005,
                                           threshold=10,
                                           alpha=1 - 0.01)
        self.dimensionality_reduction = PCA()
        self.number_training_frames_after_drift = 10

        self.handler = MainHandler(
            models=self.models,
            user_constraints=self.user_constraints,
            number_training_frames_after_drift=self.number_training_frames_after_drift,
            drift_algorithm=self.drift_algorithm,
            dimensionality_reduction=self.dimensionality_reduction,
            training_data_topic=self.training_data_topic,
            is_initial_training_from_topic=self.is_initial_training_from_topic,
            initial_training_data=self.initial_training_data,
            prediction_result_topic=self.prediction_result_topic,
            inference_data_topic=self.inference_data_topic,
            provide_training_data_after_drift=True)
Example #7
    def setUpClass(self):


        self.inference_data_topic = 'inference'
        self.prediction_result_topic = 'prediction'

        # # Send training data
        self.training_data_topic = 'training'

        adoc_dataset_location = ADOC_DATASET_LOCATION
        video_files = os.listdir(adoc_dataset_location)
        train_video_files = [x for x in video_files if x[0:5] == 'train']
        train_video_files.sort()
        train_video_files = train_video_files[1:2] # not all videos for test
        for video in train_video_files:
            video_producer = VideoProducer(KAFKA_BROKER_LIST, self.training_data_topic, os.path.join(adoc_dataset_location, video), debug=True, resize_to_dimension=(256,256))
            video_producer.send_video(extra_fields={"sequence_name": video})

        self.user_constraints = {
            "is_real_time": False,
            "minimum_efectiveness": None
        }
        
        self.models = [
            {
                "name": "model_1",
                "training_rate": 200,
                "efectiveness": 30,
                "inference_rate": 10,
                "model":  MockModel(40, model_name= "model_1")
            },
            {
                "name": "model_2",
                "training_rate": 300,
                "efectiveness": 20,
                "inference_rate": 20,
                "model":  MockModel(30, model_name= "model_2")
            },
            {
                "name": "model_3",
                "training_rate": 400,
                "efectiveness": 20,
                "inference_rate": 20,
                "model":  MockModel(10, model_name= "model_3")
            }
        ]
        self.drift_algorithm = PageHinkley(min_instances=20, delta=0.005, threshold=10, alpha=1 - 0.01)
        self.dimensionality_reduction = PCA()
        self.number_training_frames_after_drift = 10

        self.handler = MainHandler(
            models=self.models,
            user_constraints=self.user_constraints,
            number_training_frames_after_drift=self.number_training_frames_after_drift,
            drift_algorithm=self.drift_algorithm,
            dimensionality_reduction=self.dimensionality_reduction,
            training_data_topic=self.training_data_topic,
            prediction_result_topic=self.prediction_result_topic,
            inference_data_topic=self.inference_data_topic
            )
Example #8
    def setUpClass(self):

        self.is_initial_training_from_topic = False    

        self.inference_data_topic = 'inference'
        self.prediction_result_topic = 'prediction'


        # Mock training data
        self.training_data_topic = None
        dataset = load_sample_images() 
        sequence_1 = [dataset.images[0] for x in range(20)]
        self.initial_training_data = sequence_1

        for i in range(0, len(self.initial_training_data)):
            self.initial_training_data[i] = cv2.resize(self.initial_training_data[i], (256,256))

        # # Send training data
        self.training_data_topic = 'training'

        self.user_constraints = {
            "is_real_time": False,
            "minimum_efectiveness": None
        }
        
        self.models = [
            {
                "name": "model_1",
                "training_rate": 200,
                "efectiveness": 30,
                "inference_rate": 10,
                "model":  MockModel(50, model_name= "model_1")
            },
            {
                "name": "model_2",
                "training_rate": 300,
                "efectiveness": 20,
                "inference_rate": 20,
                "model":  MockModel(30, model_name= "model_2")
            },
            {
                "name": "model_3",
                "training_rate": 400,
                "efectiveness": 20,
                "inference_rate": 20,
                "model":  MockModel(10, model_name= "model_3")
            }
        ]
        self.drift_algorithm = PageHinkley(min_instances=5, delta=0.005, threshold=10, alpha=1 - 0.01)
        self.dimensionality_reduction = PCA()

        self.number_training_frames_after_drift = 5  # What happens if there are fewer inferred examples than this number?

        self.handler = MainHandler(
            models=self.models,
            user_constraints=self.user_constraints,
            number_training_frames_after_drift=self.number_training_frames_after_drift,
            drift_algorithm=self.drift_algorithm,
            dimensionality_reduction=self.dimensionality_reduction,
            training_data_topic=self.training_data_topic,
            is_initial_training_from_topic=self.is_initial_training_from_topic,
            initial_training_data=self.initial_training_data,
            prediction_result_topic=self.prediction_result_topic,
            inference_data_topic=self.inference_data_topic
            )