Exemple #1
0
def test_ddm(test_path):
    """
    Check that DDM reports the expected drift point on a stored stream.

    The stream is read from ``drift_stream.npy`` inside ``test_path`` and fed
    to the detector one element at a time. Exactly one change is expected,
    at index 1009.

    NOTE(review): per the original description, the first half of the stream
    holds integers from 0 to 1 and indices 999 to 1999 hold integers from
    0 to 7 -- this depends on the fixture file and is not checked here.
    """
    detector = DDM()
    stream_file = os.path.join(test_path, 'drift_stream.npy')
    data_stream = np.load(stream_file)

    flagged = []
    for position in range(data_stream.size):
        detector.add_element(data_stream[position])
        if detector.detected_change():
            flagged.append(position)

    assert flagged == [1009]
Exemple #2
0
    def run(self):
        '''
        Main method to simulate a new experiment.

        Walks over the rows of ``self.X`` / ``self.y`` one record at a time:

        * records before ``self.window_size`` are skipped;
        * at ``record == self.window_size`` the models are initialised through
          ``self.init_ofs_ol``; if that fails, the window is enlarged by 50
          records and initialisation is attempted again once the loop reaches
          the new window size;
        * every later record is predicted, scored, optionally used for a
          partial fit (lazy models) and checked for concept drift.

        Side effects: appends to ``self.prequential_accuracy`` and
        ``self.memory_usage``, may append to ``self.selected_features`` and
        may grow ``self.window_size``.

        Any exception raised while running is logged and not propagated.
        '''
        print(f"Starting Experiment:{self}")
        try:
            start_window_size = self.window_size
            num_of_correct_predictions, predictions_counter = 0, 0
            ddm = DDM()
            for record in range(self.X.shape[0]):
                x_record, y_record = np.array([self.X[record, :]]), np.ravel(np.array([self.y[record]]))
                if record < self.window_size:  # aggregate records till window size
                    continue
                elif record == self.window_size:  # first initialization
                    try:
                        self.init_ofs_ol(record)
                    except Exception as e:
                        # case where ofs failed to find features - try to add more records and replay process
                        if self.window_size > start_window_size * 4:
                            # Give up once the window has grown past 4x its initial size;
                            # keep the original failure attached as the cause.
                            raise Exception("OFS could not find features.") from e
                        self.window_size += 50
                        logging.info(f"Changed window size from {self.window_size - 50} to {self.window_size}")
                    continue

                # predict (on the selected feature subset when an OFS method is configured)
                my_pred = self.ol.created_model.predict(
                    x_record) if self.ofs is None else self.ol.created_model.predict(
                    x_record[:, self.current_selected_features])
                predictions_counter += 1
                if y_record[0] == my_pred[0]: num_of_correct_predictions += 1

                running_accuracy = num_of_correct_predictions / predictions_counter
                # NOTE(review): the detector is fed the running accuracy rather than a
                # per-record 0/1 outcome -- confirm this is the intended DDM input.
                ddm.add_element(running_accuracy)  # add result to concept drift model
                self.prequential_accuracy.append(running_accuracy)  # add accuracy
                self.memory_usage.append(psutil.Process(os.getpid()).memory_info().rss)  # add memory usage

                if self.ol.lazy:  # partial fit for lazy models
                    self.fit_lazy(x_record, y_record)
                if ddm.detected_change():  # check for concept drift
                    self.concept_drift_detection(start_window_size, record)
                elif record != self.X.shape[0] - 1 and self.ofs:
                    # No drift: carry the previous feature selection forward for this record.
                    self.selected_features.append(self.selected_features[-1])
        except Exception as e:
            # Catch real exceptions here: an ``except`` clause naming a class that does
            # not derive from BaseException raises TypeError as soon as an error occurs.
            logging.error(f"Error: {str(e)}")
Exemple #3
0
def test_ddm():
    """
    DDM drift detection test on a synthetic binary stream.

    Two blocks of 1000 samples are drawn (seed 1) from normal distributions
    with sigma 0.1 and means 0 and 0.5 respectively; each sample is turned
    into 1 when it is positive and 0 otherwise. The detector is expected to
    flag changes at indices 103 and 1060, and to describe itself with its
    default parameters.
    """
    detector = DDM()

    # Build the stream; the order of the two draws matters for reproducibility.
    np.random.seed(1)
    first_block = np.random.normal(0, 0.1, 1000) > 0
    second_block = np.random.normal(0.5, 0.1, 1000) > 0
    data_stream = np.concatenate((first_block.astype(int), second_block.astype(int)))

    flagged = []
    for position in range(data_stream.size):
        detector.add_element(data_stream[position])
        if detector.detected_change():
            flagged.append(position)

    assert flagged == [103, 1060]

    assert detector.get_info() == "DDM(min_num_instances=None, out_control_level=3.0, warning_level=2.0)"
Exemple #4
0
def skmultiflow_detector(drift_detector_type: str) -> BaseDriftDetector:
    """Return a new drift detector for the given type identifier.

    Recognised identifiers are ``"SKMULTIFLOW_EDDM"``,
    ``"SKMULTIFLOW_PageHinkley"``, ``"SKMULTIFLOW_DDM"`` and
    ``"SKMULTIFLOW_ADWIN"``; each detector is built with its default arguments.

    Raises:
        Exception: if ``drift_detector_type`` is not one of the identifiers above.
    """
    if drift_detector_type == "SKMULTIFLOW_EDDM":
        return EDDM()
    if drift_detector_type == "SKMULTIFLOW_PageHinkley":
        return PageHinkley()
    if drift_detector_type == "SKMULTIFLOW_DDM":
        return DDM()
    if drift_detector_type == "SKMULTIFLOW_ADWIN":
        return ADWIN()
    raise Exception("Drift detector %s not implemented" %
                    drift_detector_type)
Exemple #5
0
def sim_ddm(input_stream, start_point=0):
    """Run a fresh DDM detector over ``input_stream``.

    Args:
        input_stream: indexable sequence of values fed to the detector in order.
        start_point: offset added to every reported position.

    Returns:
        A tuple ``(warnings, changes)``: the positions (index + ``start_point``)
        at which the detector reported a warning zone and a change, respectively.
    """
    detector = DDM()
    warning_positions = []
    change_positions = []
    for offset in range(len(input_stream)):
        detector.add_element(input_stream[offset])
        if detector.detected_warning_zone():
            warning_positions.append(offset + start_point)
        if detector.detected_change():
            change_positions.append(offset + start_point)

    return warning_positions, change_positions
Exemple #6
0
def ddm_test():
    """Feed the stream stored in ``data/stream_acc.npy`` to a DDM detector.

    Prints one line for every element at which the detector reports a warning
    zone and one line for every element at which it reports a change.
    """
    detector = DDM()
    true_occur_position = 4443  # not used below; kept from the original
    data_stream = np.load("data/stream_acc.npy")
    for idx in tqdm(range(data_stream.shape[0])):
        sample = data_stream[idx]
        detector.add_element(sample)
        if detector.detected_warning_zone():
            print('Warning zone has been detected in data: ' +
                  str(sample) + ' - of index: ' + str(idx))
        if detector.detected_change():
            print('Change has been detected in data: ' + str(sample) +
                  ' - of index: ' + str(idx))
# Parameter grids: entry i of each list is used in iteration i of the loop below.
ddm_param = [3, 5, 7]  # DDM out_control_level
ks_param1 = [100, 150, 200]  # KSWIN window_size
ks_param2 = [30, 50, 100]  # only referenced by the commented-out KSWIN stat_size variant
ph_param1 = [25, 50, 75]  # PageHinkley threshold
ph_param2 = [0.005, 0.01, 0.02]  # PageHinkley delta

# A single classifier instance is shared by all iterations and trained incrementally.
knn = KNNClassifier()

# NOTE(review): driftStreams is not defined in this excerpt -- confirm its origin.
stream = driftStreams[0]

for i in range(0, 3):
    # Train on the next 2000 samples of the stream.
    trainX, trainY = stream.next_sample(2000)
    knn.partial_fit(trainX, trainY)

    # Fresh detectors for this parameter setting.
    # NOTE(review): adwin_param is not defined in this excerpt -- confirm its origin.
    adwin = ADWIN(delta=adwin_param[i])
    ddm = DDM(out_control_level=ddm_param[i])
    kswin1 = KSWIN(window_size=ks_param1[i])
    # kswin2 = KSWIN(stat_size=ks_param2[i])
    ph1 = PageHinkley(threshold=ph_param1[i])
    ph2 = PageHinkley(delta=ph_param2[i])

    # Per-detector result lists (kswin2_results is created although kswin2 is disabled).
    adwin_results = []
    ddm_results = []
    kswin1_results = []
    kswin2_results = []
    ph1_results = []
    ph2_results = []

    # Counters reset for every parameter setting; their use is not visible in this excerpt.
    n_samples = 0
    corrects = 0
    return stream


def drift_flow(stream, method, name, beginning_stream, end_tables):
    """Run one drift detector over ``stream``, print what it finds and plot it.

    Args:
        stream: indexable sequence of values fed to the detector in order.
        method: detector exposing ``add_element``, ``detected_warning_zone``
            and ``detected_change``.
        name: label used in the printed summary and passed on to ``plots``.
        beginning_stream: forwarded unchanged to ``plots``.
        end_tables: forwarded unchanged to ``plots``.

    The list handed to ``plots`` has one entry per stream element: the element
    itself where a change was reported, ``None`` elsewhere.
    """
    change_markers = []
    warning_values = []
    change_count = 0
    for idx in range(len(stream)):
        value = stream[idx]
        method.add_element(value)
        if method.detected_warning_zone():
            print(f'Warning zone has been detected in data: {value} - of index: {idx}')
            warning_values.append(value)
        if not method.detected_change():
            change_markers.append(None)
            continue
        change_markers.append(value)
        print(f'Change has been detected in data: {value} - of index: {idx}')
        change_count += 1
    print(f'{name} Detected changes: {change_count}')
    print(f'{name} Detected warning zones: {str(len(warning_values))}')
    plots(stream, change_markers, name, beginning_stream, end_tables)


stream = make_stream(PATH)

# Run each detector, freshly constructed with default arguments, over the same
# stream; 0 and 500 are passed as beginning_stream and end_tables.
for detector_cls, detector_name in (
    (EDDM, 'EDDM'),
    (HDDM_A, 'HDDM_A'),
    (HDDM_W, 'HDDM_W'),
    (PageHinkley, 'PH'),
    (DDM, 'DDM'),
):
    drift_flow(stream, detector_cls(), detector_name, 0, 500)
Exemple #9
0
# Log file name for this run, derived from the dataset under test.
file_name = "CMGMM-"+test_dataset+".log"

# Detector name taken from the parsed arguments.
DETECTOR=args.detector#""
# Model label: "CMGMM", then "+ " or " " depending on prune_comp, then the
# detector name appended by the matching branch below.
nama_model = "CMGMM"
if (prune_comp):
    nama_model = nama_model+"+ "
else:
    nama_model = nama_model+" "
# Each branch announces the choice, extends the model label and builds the
# detector with default arguments. An unrecognised DETECTOR matches no branch.
if DETECTOR == "ADWIN":
    print ("adwin")
    nama_model = nama_model+DETECTOR
    detector = ADWIN()
elif DETECTOR == "DDM":
    print ("DDM")
    nama_model = nama_model+DETECTOR
    detector = DDM()
elif DETECTOR == "EDDM":
    print ("EDDM")
    nama_model = nama_model+DETECTOR
    detector = EDDM()
elif DETECTOR == "HDDM_A":
    print ("HDDM_A")
    nama_model = nama_model+DETECTOR
    detector = HDDM_A()
elif DETECTOR == "HDDM_W":
    print ("HDDM_W")
    nama_model = nama_model+DETECTOR
    detector = HDDM_W()
elif DETECTOR == "KSWIN":
    print ("KSWIN")
    # NOTE(review): no `detector` is assigned in this branch within the visible
    # lines -- the excerpt appears to be cut off here; confirm against the full file.
    nama_model = nama_model+DETECTOR
Exemple #10
0
def _append_to_drift_log(text, path='drifts.txt'):
    """Append ``text`` to the drift log; the handle is closed even if the write fails."""
    with open(path, 'a+') as log_file:
        log_file.write(text)


def _widen_sample(sample_x, copies):
    """Concatenate ``copies`` copies of ``sample_x`` along axis 1, starting from an empty (1, 0) array."""
    widened = np.array([[]])
    for _ in range(copies):
        widened = np.concatenate((widened, sample_x), axis=1)
    return widened


def _drifts_from_labeled(detector, data, labels):
    """Feed (sample, label) pairs to ``detector``; return the indices where ``change_detected`` is True."""
    drifts = []
    for i, (value, label) in enumerate(zip(data, labels)):
        detector.add_element(value=value, label=label)
        if detector.change_detected is True:
            drifts.append(i)
    return drifts


def _drifts_per_feature(detectors, data):
    """Feed feature j of every sample to ``detectors[j]``; return the indices where any detector fires."""
    drifts = []
    for i, sample in enumerate(data):
        fired = False
        # No short-circuit: every detector has to see its value for this sample.
        for j in range(sample.size):
            detectors[j].add_element(sample[j])
            if detectors[j].detected_change():
                fired = True
        if fired:
            drifts.append(i)
    return drifts


def _drifts_from_errors(detector, errors):
    """Feed the 0/1 prediction-error stream to ``detector``; return the indices where it reports a change."""
    drifts = []
    for i, error in enumerate(errors):
        detector.add_element(error)
        if detector.detected_change():
            drifts.append(i)
    return drifts


def _report_run(drifts, elapsed, n_samples, totals, log_name, console_name, log_joiner):
    """Score one detector run, add it to ``totals`` and write the console and log-file report."""
    f1, tp, fp, tn, fn = confusion_matrix_stats(drifts, n_samples)
    totals['tp'] += tp
    totals['tn'] += tn
    totals['fp'] += fp
    totals['fn'] += fn

    print(f'F1-Score: {f1}')
    print(f'{tp} true positives, {fp} false positives')
    print(f'{tn} true negatives, {fn} false negatives')

    # The logged duration is the same detection time that is printed below.
    _append_to_drift_log(
        f'{log_name} detected {len(drifts)} drifts in {elapsed} {log_joiner}{drifts}\n\n')
    print(f'{console_name} took {elapsed} sec and detected {len(drifts)} drifts\n')


def main():
    """Benchmark several drift detectors on every stream in ``streams``.

    For each stream a NaiveBayes classifier is pre-trained on the first sample,
    then ``n_samples`` samples are collected (features widened by repetition,
    labels, and the classifier's 0/1 prediction error). Kernel-SWMEBWIN and
    SWMEBWIN consume samples with labels, ADWIN and KSWIN run one detector per
    feature, and EDDM and DDM consume the prediction-error stream. Every run
    is timed, scored with ``confusion_matrix_stats``, printed and appended to
    ``drifts.txt``. Overall confusion counts and F1 per detector are printed
    at the end.
    """
    summary_order = ('K-SWMEBWIN', 'SWMEBWIN', 'KSWIN', 'ADWIN', 'DDM', 'EDDM')
    totals = {name: {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0} for name in summary_order}

    n_samples = 1000000
    n_dims = 50

    for stream in streams:
        print(stream.name)
        _append_to_drift_log(f'**{stream.name}**\n\n')

        stream.prepare_for_use()
        stream.next_sample()

        ddm = DDM(min_num_instances=30)
        eddm = EDDM()
        swmebwin = SWMEBWIN(classes=stream.target_values, w_size=80, epsilon=0.05)
        k_swmebwin = Kernel_SWMEBWIN(classes=stream.target_values, w_size=80, epsilon=0.05)

        # Number of times the raw feature vector is repeated to widen each sample.
        n_rand_dims = n_dims - stream.current_sample_x.size
        multiply = n_rand_dims // stream.current_sample_x.size

        # partial fit -> pretrain, on a sample widened exactly like the ones
        # the classifier is evaluated on below.
        bayes = NaiveBayes()
        bayes.partial_fit(_widen_sample(stream.current_sample_x, multiply),
                          list(stream.current_sample_y.ravel()))

        # One univariate detector per feature dimension.
        adwin = [ADWIN(delta=0.002) for _ in range(n_dims)]
        kswin = [KSWIN(w_size=300, stat_size=30, alpha=0.0001) for _ in range(n_dims)]

        # Collect the widened samples, their labels and the test-then-train error stream.
        data, labels, predictions = [], [], []
        for i in range(n_samples):
            current_sample_x = _widen_sample(stream.current_sample_x, multiply)
            data.append(current_sample_x.ravel())
            labels.append(stream.current_sample_y.ravel()[0])
            predictions.append(0 if bayes.predict(current_sample_x) == labels[i] else 1)
            bayes.partial_fit(current_sample_x, list(stream.current_sample_y.ravel()))
            stream.next_sample()

        # (totals key, log-file name, console name, log joiner, detection function, arguments)
        runs = (
            ('K-SWMEBWIN', 'K-SWMEB', 'K-SW-MEBWIN', '', _drifts_from_labeled, (k_swmebwin, data, labels)),
            ('SWMEBWIN', 'SWMEB', 'SW-MEBWIN', '', _drifts_from_labeled, (swmebwin, data, labels)),
            ('ADWIN', 'ADWIN', 'ADWIN', 'at ', _drifts_per_feature, (adwin, data)),
            ('KSWIN', 'KSWIN', 'KSWIN', 'at ', _drifts_per_feature, (kswin, data)),
            ('EDDM', 'EDDM', 'EDDM', 'at ', _drifts_from_errors, (eddm, predictions)),
            ('DDM', 'DDM', 'DDM', 'at ', _drifts_from_errors, (ddm, predictions)),
        )
        for key, log_name, console_name, log_joiner, detect, detect_args in runs:
            start = time.time()
            drifts = detect(*detect_args)
            elapsed = time.time() - start
            _report_run(drifts, elapsed, n_samples, totals[key],
                        log_name, console_name, log_joiner)

    # OVERALL STATISTICS
    for name in summary_order:
        counts = totals[name]
        tp, fp, tn, fn = counts['tp'], counts['fp'], counts['tn'], counts['fn']
        print(50 * '-')
        print(f'{name}\n')
        print(f'Overall F1: {calc_f1(tp, fp, tn, fn)}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        print(50 * '-')
Exemple #11
0
def mapping_experiment(
    save_name,
    lstm_model_idx=0,
    transformer_model_trained=TransformerModel.BERT,
    transformer_model_untrained=TransformerModel.SCIBERT,
    linear=False,
    method="average",
    batch_size=1,
    transform=True,
    print_every=1,
    device="cpu",
):
    """
    Run an adaptation experiment that maps one embedding space onto another.

    The LSTM model is run on the stream it was trained on, on a stream built
    with different embeddings, and once more on that second stream (after a
    restart) with a mapping applied. The three accuracy results are pickled
    to ``<PATH_RESULTS>/<save_name>_<method>.pkl`` and returned.

    Args:
        save_name (str): base name of the result file; ``"_" + method`` is appended
        lstm_model_idx (int): the index of the LSTM model (from the available ones)
        transformer_model_trained (TransformerModel): the embeddings on which the model was trained
        transformer_model_untrained (TransformerModel): the embeddings against which the model is compared
        linear (bool): use the ``Procrustes`` mapping when true, ``MLPMapping`` otherwise
        method (str): method parameter used for picking the adaptation dataset
        batch_size (int): the batch size for the stream
        transform (bool): whether to transform the text or use a pre-transformed one
        print_every (int): how often to print
        device (str): cpu or cuda

    Returns:
        a dictionary with keys ``trained_accuracies``, ``untrained_accuracies``
        and ``mapping_accuracies``

    """

    def _open_stream(transformer_model):
        # Build a full (no test split) stream for the given embeddings and prepare it.
        stream = WOSStream(
            transformer_model=transformer_model,
            transform=transform,
            test_split=False,
            device=device,
        )
        stream.prepare_for_use()
        return stream

    # The method is part of the result file name.
    save_name = save_name + "_" + method

    # Stream the model was trained on, then the stream with other embeddings (adds drift).
    stream_trained = _open_stream(transformer_model_trained)
    stream_untrained = _open_stream(transformer_model_untrained)

    # Adaptation mapping.
    mapping_cls = Procrustes if linear else MLPMapping
    mapping = mapping_cls(method=method, x_most_common=10000)

    # LSTM model with stored weights, in evaluation mode.
    model = LSTM(
        embedding_dim=utils.EMBEDDING_DIM, no_classes=stream_trained.n_classes
    ).to(device)
    state = torch.load(LSTM_MODELS[lstm_model_idx], map_location=device)
    model.load_state_dict(state, strict=False)
    model.eval()

    # The same detector instance is handed to both plain runs.
    drift_detector = DDM()
    runner_options = dict(
        batch_size=batch_size,
        print_every=print_every,
        warm_start=sys.maxsize,
        device=device,
    )

    print("Running trained stream...")
    trained_accuracies = run_stream_lstm(
        stream_trained, model, drift_detector, **runner_options
    )
    print("Running untrained stream...")
    untrained_accuracies = run_stream_lstm(
        stream_untrained, model, drift_detector, **runner_options
    )

    # Replay the untrained stream from the start, this time through the mapping.
    stream_untrained.restart()
    print("Running mapping stream...")
    mapping_accuracies = run_stream_with_mapping(
        stream_untrained,
        model,
        mapping,
        batch_size=batch_size,
        print_every=print_every,
    )

    to_save = {
        "trained_accuracies": trained_accuracies,
        "untrained_accuracies": untrained_accuracies,
        "mapping_accuracies": mapping_accuracies,
    }
    result_path = os.path.join(PATH_RESULTS, save_name + ".pkl")
    with open(result_path, "wb") as f:
        pickle.dump(to_save, f)

    return to_save
Exemple #12
0
def drift_detection_different_embeddings(
    save_name,
    lstm_model_idx=None,
    nb_model_idx=None,
    transformer_model_trained=None,
    transformer_model_untrained=None,
    batch_size=1,
    transform=True,
    print_every=1,
    device="cpu",
):
    """Run one model on two streams that differ only in their embeddings.

    The first stream uses the embeddings the model was trained on, the second
    uses different ones. The aim is to see whether the new embeddings can
    replace the old ones (no drift expected) or not (drift expected). The
    accuracy results of both runs are pickled to
    ``<PATH_RESULTS>/<save_name>.pkl`` and returned.

    Args:
        save_name (str): name of the file where the function saves the result
        lstm_model_idx (int): the index of the LSTM model (from the available ones);
            takes precedence when both indices are given
        nb_model_idx (int): the index of the Naive Bayes model (from the available ones)
        transformer_model_trained (TransformerModel): the embeddings on which the model was trained
        transformer_model_untrained (TransformerModel): the embeddings against which the model is compared
        batch_size (int): the batch size for the stream
        transform (bool): whether to transform the text or use a pre-transformed one
        print_every (int): how often to print
        device (str): cpu or cuda

    Returns:
        a dictionary with keys ``trained_accuracies`` and ``untrained_accuracies``

    Raises:
        ValueError: if neither ``lstm_model_idx`` nor ``nb_model_idx`` is given
            (raised after both streams have been prepared)
    """

    def _open_stream(transformer_model):
        # Build a full (no test split) stream for the given embeddings and prepare it.
        stream = WOSStream(
            transformer_model=transformer_model,
            transform=transform,
            test_split=False,
            device=device,
        )
        stream.prepare_for_use()
        return stream

    # Stream the model was trained on, then the stream with other embeddings (adds drift).
    stream_trained = _open_stream(transformer_model_trained)
    stream_untrained = _open_stream(transformer_model_untrained)

    # Pick the model and the matching stream runner.
    if lstm_model_idx is None and nb_model_idx is None:
        raise ValueError("No index provided for either the LSTM or the NB model.")
    if lstm_model_idx is not None:
        model = LSTM(
            embedding_dim=utils.EMBEDDING_DIM, no_classes=stream_trained.n_classes
        ).to(device)
        state = torch.load(LSTM_MODELS[lstm_model_idx], map_location=device)
        model.load_state_dict(state, strict=False)
        model.eval()
        stream_runner = run_stream_lstm
    else:
        # Only reachable with a Naive Bayes index, thanks to the guard above.
        model = load(NB_MODELS[nb_model_idx])
        stream_runner = run_stream_nb

    # The same detector instance is handed to both runs.
    drift_detector = DDM()
    runner_options = dict(
        batch_size=batch_size,
        print_every=print_every,
        warm_start=sys.maxsize,
        device=device,
    )

    print("Running trained stream...")
    trained_accuracies = stream_runner(
        stream_trained, model, drift_detector, **runner_options
    )
    print("Running untrained stream...")
    untrained_accuracies = stream_runner(
        stream_untrained, model, drift_detector, **runner_options
    )

    to_save = {
        "trained_accuracies": trained_accuracies,
        "untrained_accuracies": untrained_accuracies,
    }
    result_path = os.path.join(PATH_RESULTS, save_name + ".pkl")
    with open(result_path, "wb") as f:
        pickle.dump(to_save, f)

    return to_save
Exemple #13
0
def drift_detection_gradual_noise(
    save_name,
    lstm_model_idx=None,
    nb_model_idx=None,
    transformer_model=TransformerModel.BERT,
    batch_size=1,
    max_std=0.1,
    warm_start=30,
    transform=True,
    print_every=1,
    device="cpu",
):
    """
    Run a model on a stream that is gradually perturbed with noise.

    The increasing noise simulates gradual concept drift. The accuracy result
    is pickled to ``<PATH_RESULTS>/<save_name>.pkl`` and returned.

    Args:
        save_name (str): name of the file where the function saves the result
        lstm_model_idx (int): the index of the LSTM model (from the available ones);
            takes precedence when both indices are given
        nb_model_idx (int): the index of the Naive Bayes model (from the available ones)
        transformer_model (TransformerModel): the embeddings on which the model was trained
        batch_size (int): the batch size for the stream
        max_std (float): the maximum standard deviation for the Gaussian noise
        warm_start (int): number of examples run before adding noise
        transform (bool): whether to transform the text or use a pre-transformed one
        print_every (int): how often to print
        device (str): cpu or cuda

    Returns:
        a dictionary with the single key ``accuracies``

    Raises:
        ValueError: if neither ``lstm_model_idx`` nor ``nb_model_idx`` is given
            (raised after the stream has been prepared)
    """
    stream = WOSStream(
        transformer_model=transformer_model,
        transform=transform,
        test_split=False,
        device=device,
    )
    stream.prepare_for_use()

    # Pick the model and the matching stream runner.
    if lstm_model_idx is None and nb_model_idx is None:
        raise ValueError("No index provided for either the LSTM or the NB model.")
    if lstm_model_idx is not None:
        model = LSTM(embedding_dim=utils.EMBEDDING_DIM, no_classes=stream.n_classes).to(
            device
        )
        state = torch.load(LSTM_MODELS[lstm_model_idx], map_location=device)
        model.load_state_dict(state, strict=False)
        model.eval()
        stream_runner = run_stream_lstm
    else:
        # Only reachable with a Naive Bayes index, thanks to the guard above.
        model = load(NB_MODELS[nb_model_idx])
        stream_runner = run_stream_nb

    drift_detector = DDM()

    # Noise schedule: standard deviations from 0 up to (not including) max_std,
    # in as many equal steps as there are iterations after the warm start.
    # NOTE(review): the step is undefined when warm_start equals the iteration
    # count -- confirm callers never pass such a value.
    n_iterations = stream.n_samples // batch_size + 1
    noise_step = max_std / (n_iterations - warm_start)
    standard_devs = torch.arange(start=0, end=max_std, step=noise_step)

    accuracies = stream_runner(
        stream,
        model,
        drift_detector,
        batch_size=batch_size,
        print_every=print_every,
        noise_stds=standard_devs,
        warm_start=warm_start,
        device=device,
    )

    to_save = {"accuracies": accuracies}
    result_path = os.path.join(PATH_RESULTS, save_name + ".pkl")
    with open(result_path, "wb") as f:
        pickle.dump(to_save, f)

    return to_save
Exemple #14
0
# Snapshot of the global sample counter at this point; the loop further down
# compares n_global against it.
detect_end = n_global
# Accumulators, all starting empty.
# NOTE(review): how each list is filled is not visible in this excerpt.
mine_pr = []
mine_std = []
mine_alpha = []
pr_min = []
std_min = []
pi = []
mine_x_mean = []
mine_sum = []
mine_threshold = []
pred_grace_ht = []
pred_grace_ht_p = []
# No secondary model yet; the loop below tests `ht_p is not None`.
ht_p = None
# Running count of correct predictions (incremented in the loop below).
ML_accuracy = 0

# Drift detector with default parameters.
ddm = DDM()
# NOTE(review): presumably a guppy/heapy heap-profiler handle -- confirm against the imports.
h = hpy()
while elec_stream.has_more_samples():
    n_global += 1

    X_test, y_test = elec_stream.next_sample()
    y_predict = ht.predict(X_test)

    ddm_start_time = time.time()
    ddm.add_element(y_test != y_predict)
    ML_accuracy += 1 if y_test == y_predict else 0
    ddm_running_time = time.time() - ddm_start_time
    RT_ddm.append(ddm_running_time)
    if (n_global > grace_end):
        if (n_global > detect_end):
            if ht_p is not None:
Exemple #15
0
# Imports
import numpy as np
import sim_adwin as sim
from skmultiflow.drift_detection import ADWIN, EDDM,DDM
import matplotlib.pyplot as plt

# Detectors; only DDM is fed below, ADWIN and EDDM are created but not used here.
ddm = DDM()
adwin = ADWIN()
eddm = EDDM()

# Synthetic stream: 200 random integers, each either 0 or 1.
data_stream = np.random.randint(2, size=200)
# Inject a concept change: indices 100 to 149 are overwritten with random
# integers from 4 to 7.
for i in range(100, 150):
    data_stream[i] = np.random.randint(4, high=8)

# Plot the raw stream on a 10 x 5.5 inch figure.
plt.plot(data_stream)
fig = plt.gcf()
fig.set_size_inches(10, 5.5)
plt.xlabel('Time')
plt.ylabel('value')

# Feed the stream to DDM; mark warning zones in green and detected changes in red.
for i, sample in enumerate(data_stream):
    ddm.add_element(sample)
    if ddm.detected_warning_zone():
        plt.axvline(i, linestyle='--', linewidth=0.7, color='g')
    if ddm.detected_change():
        plt.axvline(i, linestyle='--', linewidth=0.7, color='r')
plt.show()
    if A.detected_change():
        print('Concept Drift detected in data: ' + str(stream[j]) +
              ' - at index: ' + str(j))
### Output:
#Concept Drift detected in data: 8.0 - at index: 607
#Concept Drift detected in data: 5.0 - at index: 639
#Concept Drift detected in data: 6.0 - at index: 671

########

### DDM code
import numpy as np
from skmultiflow.drift_detection import DDM

# Instantiate the DDM drift detector
d2m = DDM()

# Seed NumPy's global RNG so the simulated stream is reproducible
np.random.seed(123)

# Simulate a data stream of 1000 samples drawn from a standard normal distribution
stream = np.random.randn(1000)

# Peek at the first ten samples (a bare expression: it only displays a value
# in an interactive session and has no effect when run as a script)
stream[:10]
## Output:
#array([-1.0856306 ,  0.99734545,  0.2829785 , -1.50629471, -0.57860025,
#        1.65143654, -2.42667924, -0.42891263,  1.26593626, -0.8667404 ])

# Change the data concept: indices 299 through 599 are overwritten with
# random integers from 5 to 8 (the `high` bound of randint is exclusive)
for j in range(299, 600):
    stream[j] = np.random.randint(5, high=9)
Exemple #17
0
        def test_on_data_set(data_desc, D):
            """Run every drift detector on one data set and collect its scores.

            Args:
                data_desc: Key under which this data set's scores are stored.
                D: Mapping with a ``"drifts"`` entry (handed to ``evaluate`` as
                    the reference) and a ``"data"`` entry holding the ``(X, Y)``
                    pair. Both arrays are sliced two-dimensionally here.

            Returns:
                ``{data_desc: {detector_name: [scores]}}`` where each list holds
                the single ``evaluate`` result obtained for that detector.
            """
            detector_names = ("HDDDM", "SWIDD", "EDDM", "DDM", "ADWIN", "PageHinkley")
            r = {data_desc: {detector: [] for detector in detector_names}}
            results = r[data_desc]

            training_buffer_size = 100  # Size of training buffer of the drift detector
            n_train = 200   # Initial training set size

            concept_drifts = D["drifts"]
            X, Y = D["data"]
            # Features and labels side by side: the unsupervised detectors see both.
            data_stream = np.concatenate((X, Y.reshape(-1, 1)), axis=1)

            # First n_train rows form the training set, the remainder is streamed.
            X0, X_next = X[:n_train, :], X[n_train:, :]
            Y0, Y_next = Y[:n_train, :], Y[n_train:, :]
            data0, data_next = data_stream[:n_train, :], data_stream[n_train:, :]

            # Unsupervised detectors: (result key, detector factory, batch size).
            # Factories keep construction lazy, i.e. right before the run.
            unsupervised_runs = (
                ("HDDDM", lambda: HDDDM(data0, gamma=None, alpha=0.005), 50),
                ("SWIDD", lambda: SWIDD(max_window_size=300, min_window_size=100), 1),
            )
            for name, make_detector, batch_size in unsupervised_runs:
                runner = DriftDetectorUnsupervised(make_detector(), batch_size=batch_size)
                detected = runner.apply_to_stream(data_next)
                results[name].append(evaluate(concept_drifts, detected))

            # Supervised detectors share one GaussianNB instance that is refitted
            # on the training set before every run.
            model = GaussianNB()

            # (result key, detector factory, whether flip_score is set on the classifier).
            # NOTE(review): the ADWIN run leaves flip_score at the Classifier
            # default, unlike the other three -- confirm this is intended.
            supervised_runs = (
                ("EDDM", lambda: EDDM(), True),
                ("DDM", lambda: DDM(min_num_instances=30, warning_level=2.0, out_control_level=3.0), True),
                ("ADWIN", lambda: ADWIN(delta=2.), False),
                ("PageHinkley", lambda: PageHinkley(), True),
            )
            for name, make_detector, flip in supervised_runs:
                drift_detector = make_detector()

                clf = Classifier(model)
                if flip:
                    clf.flip_score = True
                clf.fit(X0, Y0.ravel())

                runner = DriftDetectorSupervised(
                    clf=clf,
                    drift_detector=drift_detector,
                    training_buffer_size=training_buffer_size,
                )
                detected = runner.apply_to_stream(X_next, Y_next)
                results[name].append(evaluate(concept_drifts, detected))

            return r