import os
from array import array

import numpy as np

from skmultiflow.data import ConceptDriftStream, SEAGenerator
from skmultiflow.lazy import KNNADWINClassifier
from skmultiflow.trees import HAT, HoeffdingAdaptiveTreeClassifier  # HAT is the older name


def test_hat_mc(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1, noise_percentage=0.05),
                                drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                                          noise_percentage=0.05),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='mc')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HAT(binary_split=False, grace_period=200, leaf_prediction='mc',\n" \
                    "    max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0,\n" \
                    "    no_preprune=False, nominal_attributes=None, remove_poor_atts=False,\n" \
                    "    split_confidence=1e-07, split_criterion='info_gain',\n" \
                    "    stop_mem_management=False, tie_threshold=0.05)"
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 398.0, 1.0: 1000.0}\n'
    assert learner.get_model_description() == expected_model_1

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray

    stream.restart()
    X, y = stream.next_sample(5000)
    learner = HAT(max_byte_size=30, leaf_prediction='mc', grace_period=10)
    learner.partial_fit(X, y)
# Variant of the same test against an earlier skmultiflow API, where
# get_info() returned the single-line description asserted below.
def test_hat_mc(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1, noise_percentage=0.05),
                                drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                                          noise_percentage=0.05),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='mc')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
                                       0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
                                       1, 1, 1, 0, 1, 0, 1, 0, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: mc - nb_threshold: 0' \
                    ' - nominal_attributes: [] - '
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 0.005295278636481529, 1.0: 1.9947047213635185}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {0.0: 0.0052952786364815294, 1.0: 1.9947047213635185}\n'
    expected_model_3 = 'Leaf = Class 1.0 | {1.0: 1.9947047213635185, 0.0: 0.0052952786364815294}\n'
    assert (learner.get_model_description() == expected_model_1) \
           or (learner.get_model_description() == expected_model_2) \
           or (learner.get_model_description() == expected_model_3)

    stream.restart()
    X, y = stream.next_sample(5000)
    learner = HAT(max_byte_size=30, leaf_prediction='mc', grace_period=10)
    learner.partial_fit(X, y)
def test_concept_drift_stream(test_path):
    stream = ConceptDriftStream(random_state=1, position=20, width=5)
    stream.prepare_for_use()

    assert stream.n_remaining_samples() == -1

    expected_names = ["salary", "commission", "age", "elevel", "car",
                      "zipcode", "hvalue", "hyears", "loan"]
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['target']

    assert stream.n_features == 9
    assert stream.n_cat_features == 3
    assert stream.n_num_features == 6
    assert stream.n_targets == 1

    assert stream.get_info() == 'ConceptDriftStream: ' \
                                'First Stream: AGRAWALGenerator - ' \
                                'Drift Stream: AGRAWALGenerator - ' \
                                'alpha: 0.0 - position: 20 - width: 5'

    assert stream.has_more_samples() is True
    assert stream.is_restartable() is True

    # Load test data corresponding to the first 30 instances
    test_file = os.path.join(test_path, 'concept_drift_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(30)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim
    assert stream.n_features == X.shape[1]

    assert 'stream' == stream.get_class_type()
def test_knn_adwin():
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1),
                                drift_stream=SEAGenerator(random_state=2, classification_function=2),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = KNNADWINClassifier(n_neighbors=8, leaf_size=40, max_window_size=200)

    cnt = 0
    max_samples = 1000
    predictions = array('i')
    correct_predictions = 0
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
                                       0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
                                       1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       0, 1, 1, 1, 0, 1, 0, 1, 1])
    assert np.alltrue(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.window.n_samples == 0

    expected_info = 'KNNADWINClassifier(leaf_size=40, max_window_size=200, n_neighbors=8, nominal_attributes=None)'
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    stream.restart()
    X, y = stream.next_sample(max_samples)
    learner.fit(X[:950], y[:950])
    predictions = learner.predict(X[951:])
    correct_predictions = sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
def test_hat_nb(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1, noise_percentage=0.05),
                                drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                                          noise_percentage=0.05),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='nb')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
                                       1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
                                       1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nb.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: nb - nb_threshold: 0' \
                    ' - nominal_attributes: [] - '
    assert learner.get_info() == expected_info

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
# Same test as test_hat_nb above, written against the renamed
# HoeffdingAdaptiveTreeClassifier API.
def test_hoeffding_adaptive_tree_nb(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1, noise_percentage=0.05),
                                drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                                          noise_percentage=0.05),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HoeffdingAdaptiveTreeClassifier(leaf_prediction='nb')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
                                       1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
                                       1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nb.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HoeffdingAdaptiveTreeClassifier(binary_split=False, bootstrap_sampling=True, grace_period=200, " \
                    "leaf_prediction='nb', max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, " \
                    "no_preprune=False, nominal_attributes=None, remove_poor_atts=False, split_confidence=1e-07, " \
                    "split_criterion='info_gain', stop_mem_management=False, tie_threshold=0.05)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
from skmultiflow.data import ConceptDriftStream

# RecurringConceptStream (the parent class), RCStreamType and conceptOccurence
# are assumed to be defined earlier in this module.


class RecurringConceptGradualStream(RecurringConceptStream):
    """ A stream featuring gradual drift between given concepts.

    Uses the scikit-multiflow concept drift stream to blend concepts over
    a window.

    Parameters
    ----------
    rctype: RCStreamType
        An enum describing the type of stream.

    num_samples: int
        The number of samples in the stream.

    noise: float
        The probability that noise will happen in the generation. At each
        new sample generated, the sample will be perturbed by the amount
        of perturbation. Values go from 0.0 to 1.0.

    concept_chain: list<int> or dict
        A dict with key observation number and value the concept beginning
        at that observation, or a list of concept ids. A dict will be
        generated with each concept lasting its length given in desc, or
        uniform length.

    window_size: int
        The number of observations each gradual drift is spread over.

    seed: int
        Random seed.

    desc: dict<int><conceptOccurence>
        A map of concept ID to options.

    boost_first_occurance: bool
        If True, double the observations drawn from the first occurrence
        of a concept. Allows a better model to be built and stored.

    Examples
    --------
    >>> # An example stream using the STAGGER Generator.
    >>> # Starts using generating function 0, then at
    >>> # observation 5000 transitions to generating function
    >>> # 1 then at 10000 transitions back to 0.
    >>> from skika.data.synthetic.reccurring_concept_stream import RCStreamType, RecurringConceptGradualStream, conceptOccurence
    >>> concept_chain = {0: 0, 5000: 1, 10000: 0}
    >>> num_samples = 15000
    >>> # init concept
    >>> concept_0 = conceptOccurence(id=0, difficulty=2, noise=0, appearences=2, examples_per_appearence=5000)
    >>> concept_1 = conceptOccurence(id=1, difficulty=3, noise=0, appearences=1, examples_per_appearence=5000)
    >>> desc = {0: concept_0, 1: concept_1}
    >>> datastream = RecurringConceptGradualStream(rctype=RCStreamType.STAGGER, num_samples=num_samples, noise=0, concept_chain=concept_chain, window_size=1000, seed=42, desc=desc, boost_first_occurance=False)
    >>> datastream.has_more_samples()
    True
    >>> datastream.get_drift_info()
    {0: 0, 5000: 1, 10000: 0}
    >>> datastream.n_remaining_samples()
    15000
    >>> datastream.get_stream_info()
    {0: 0, 5000: 1, 10000: 0}
    0 - 5000: STAGGERGenerator(balance_classes=False, classification_function=0, random_state=42)
    5000 - 10000: STAGGERGenerator(balance_classes=False, classification_function=1, random_state=43)
    10000 - 15000: STAGGERGenerator(balance_classes=False, classification_function=0, random_state=42)
    >>> datastream.get_moa_stream_info()
    {0: 0, 5000: 1, 10000: 0}
    '(ConceptDriftStream -s (generators.STAGGERGenerator -f 1 -i 42) -d (ConceptDriftStream -s (generators.STAGGERGenerator -f 2 -i 43) -d (generators.STAGGERGenerator -f 1 -i 42) -p 5000 -w 1) -p 5000 -w 1)'
    >>> datastream.get_supplementary_info()
    >>> datastream.next_sample()
    (array([[2., 0., 2.]]), array([0]))
    >>> datastream.n_remaining_samples()
    14999
    >>> datastream.next_sample()
    (array([[2., 0., 0.]]), array([0]))
    >>> datastream.n_remaining_samples()
    14998
    """

    def __init__(self, rctype, num_samples, noise, concept_chain,
                 window_size=1000, seed=None, desc=None,
                 boost_first_occurance=True):
        self.in_drift = False
        self.drift_switch = False
        self.window_size = window_size
        self.transition_stream = None
        super().__init__(rctype, num_samples, noise, concept_chain,
                         seed=seed, desc=desc,
                         boost_first_occurance=boost_first_occurance)

    def next_sample(self, batch_size=1):
        if batch_size > 1:
            print("Only batch size of 1 for now")
            return None

        # Draw from the current concept, or from the blended transition
        # stream while a gradual drift is in progress.
        if not self.in_drift:
            samples = self.concepts[self.current_concept].next_sample(batch_size)
        else:
            samples = self.transition_stream.next_sample(batch_size)

        # Locate the most recent and the upcoming concept switch points.
        last_switch_point = 0 - self.window_size // 2
        next_switch_point = self.num_samples + self.window_size
        self.example_count += batch_size
        for concept_switch_index in sorted(self.concept_chain.keys()):
            if concept_switch_index <= self.example_count:
                last_switch_point = concept_switch_index
            if concept_switch_index >= self.example_count:
                next_switch_point = concept_switch_index
                break

        self.drifted = False
        if not self.in_drift:
            # Half a window before the switch point, start blending the two
            # concepts with a scikit-multiflow ConceptDriftStream.
            if self.example_count >= next_switch_point - self.window_size // 2:
                self.in_drift = True
                self.drift_switch = True
                self.transition_stream = ConceptDriftStream(
                    stream=self.concepts[
                        self.concept_chain[last_switch_point]].get_datastream(),
                    drift_stream=self.concepts[
                        self.concept_chain[next_switch_point]].get_datastream(),
                    position=self.window_size // 2,
                    width=self.window_size)
                self.transition_stream.prepare_for_use()
        else:
            if self.example_count == next_switch_point:
                self.current_concept = self.concept_chain[next_switch_point]
                self.drifted = True
                self.drift_switch = False
            # Half a window after the switch point, the gradual drift ends.
            if self.example_count >= (last_switch_point + self.window_size // 2) \
                    and not self.drift_switch:
                self.in_drift = False

        return samples
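Since next_sample only supports a batch size of 1, a consumer has to pull observations one at a time and can watch the drifted flag to see when a switch point is crossed. Below is a minimal driver sketch, reusing the STAGGER configuration from the docstring above; the import path and the conceptOccurence settings are taken from that example rather than verified here.

from skika.data.synthetic.reccurring_concept_stream import (
    RCStreamType, RecurringConceptGradualStream, conceptOccurence)

# Same configuration as the docstring example above.
concept_chain = {0: 0, 5000: 1, 10000: 0}
desc = {0: conceptOccurence(id=0, difficulty=2, noise=0,
                            appearences=2, examples_per_appearence=5000),
        1: conceptOccurence(id=1, difficulty=3, noise=0,
                            appearences=1, examples_per_appearence=5000)}
datastream = RecurringConceptGradualStream(
    rctype=RCStreamType.STAGGER, num_samples=15000, noise=0,
    concept_chain=concept_chain, window_size=1000, seed=42,
    desc=desc, boost_first_occurance=False)

drift_points = []
while datastream.has_more_samples():
    X, y = datastream.next_sample()   # batch size must be 1
    if datastream.drifted:            # set when a switch point is crossed
        drift_points.append(datastream.example_count)
print(drift_points)                   # expected near the switch points 5000 and 10000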
                              random_state=None, alpha=0.0)
stream_3 = ConceptDriftStream(
    stream=AGRAWALGenerator(balance_classes=False,
                            classification_function=1,
                            perturbation=0.0,
                            random_state=11),
    drift_stream=AGRAWALGenerator(balance_classes=False,
                                  classification_function=2,
                                  perturbation=0.0,
                                  random_state=12),
    position=6000,
    width=500,
    random_state=None,
    alpha=0.0)

stream_1.prepare_for_use()
stream_2.prepare_for_use()
stream_3.prepare_for_use()

instances_num = 10000
instances_counter = 0

ENSEMBLE_TYPE = 'av'

### Arrays for storing accuracy values for Streams
accuracies_1 = []
accuracies_2 = []
accuracies_3_mv = []
accuracies_3_av = []
accuracies_3_goowe = []

num_features = stream_1.n_features
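The excerpt ends before the evaluation loop, so the code that fills these accuracy arrays is not shown. As a minimal sketch of the prequential (test-then-train) pattern such experiments typically follow, the loop below runs a single Hoeffding tree over stream_1 and records a windowed accuracy; the learner choice, the WINDOW length, and the bookkeeping are illustrative assumptions, not the original script (the class is named HoeffdingTree in older skmultiflow releases).

from skmultiflow.trees import HoeffdingTreeClassifier  # 'HoeffdingTree' in older releases

learner = HoeffdingTreeClassifier()  # stand-in for the MV/AV/GOOWE ensembles
window_hits = []                     # per-sample hit/miss over a sliding window
WINDOW = 200                         # assumed window length for the accuracy curve

while instances_counter < instances_num and stream_1.has_more_samples():
    X, y = stream_1.next_sample()
    y_hat = learner.predict(X)       # test on the sample first...
    learner.partial_fit(X, y)        # ...then train on it
    window_hits.append(int(y_hat[0] == y[0]))
    if len(window_hits) > WINDOW:
        window_hits.pop(0)
    accuracies_1.append(sum(window_hits) / len(window_hits))
    instances_counter += 1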