import numpy as np

from skmultiflow.data import RandomTreeGenerator, SEAGenerator
from skmultiflow.trees import HoeffdingTree


def test_hoeffding_tree_coverage():
    # Cover memory management: a tiny byte budget with frequent memory
    # estimates forces the tree's leaf-deactivation code paths to run.
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)
    learner = HoeffdingTree(max_byte_size=30, memory_estimate_period=100,
                            grace_period=10, leaf_prediction='mc')
    learner.partial_fit(X, y, classes=stream.target_values)
    learner.reset()

    # Cover the nominal attribute observer: a stream with only
    # categorical features, all declared as nominal.
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
                                 n_num_features=0,
                                 n_categories_per_cat_feature=2)
    stream.prepare_for_use()
    X, y = stream.next_sample(1000)
    learner = HoeffdingTree(leaf_prediction='mc',
                            nominal_attributes=[i for i in range(10)])
    learner.partial_fit(X, y, classes=stream.target_values)
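
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the test above): a minimal prequential
# (test-then-train) loop for HoeffdingTree, using the same skmultiflow 0.x
# API the test exercises (SEAGenerator, prepare_for_use, next_sample,
# predict, partial_fit). The function name and sample count are
# hypothetical, chosen only to show how a tree trained this way is
# typically evaluated on a stream.
def demo_prequential_hoeffding_tree(n_samples=2000):
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()
    learner = HoeffdingTree(leaf_prediction='mc')
    n_correct = 0
    for _ in range(n_samples):
        X, y = stream.next_sample()          # one sample at a time
        if learner.predict(X)[0] == y[0]:    # test on the sample first ...
            n_correct += 1
        learner.partial_fit(X, y, classes=stream.target_values)  # ... then train
    print("Prequential accuracy: {:.3f}".format(n_correct / n_samples))
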
def _process_chunk(self):
    """A subroutine that runs at the end of each chunk, training the
    components and adjusting the ensemble weights.

    Until the first _process_chunk call, the ensemble is not yet ready.
    On the first call, the first component is learned. On subsequent
    calls, a new component is added and the existing ones are trained
    on the given chunk. If the maximum ensemble size has been reached,
    the lowest-weighted component is replaced by the new one.
    """
    new_clf = HoeffdingTree()  # with default parameters for now
    new_clf.reset()

    # Save records of the previous chunk.
    if self._record and self._num_of_current_classifiers > 0:
        self._record_truths_this_chunk()
        self._record_comp_preds_this_chunk()
        self._record_weights_this_chunk()

    # Case 1: No classifier in the ensemble yet (first chunk).
    if self._num_of_current_classifiers == 0:
        self._classifiers[0] = new_clf
        self._weights[0] = 1.0  # weight is 1 for the first classifier
        self._num_of_current_classifiers += 1
    else:
        # First, adjust the weights of the old component classifiers
        # according to what happened in this chunk.
        self._adjust_weights()
        if self._num_of_current_classifiers < self._num_of_max_classifiers:
            # Case 2: There are classifiers in the ensemble but the
            # ensemble size is not yet capped. Add the new classifier
            # with a weight of 1.
            self._classifiers[self._num_of_current_classifiers] = new_clf
            self._weights[self._num_of_current_classifiers] = 1.0
            self._num_of_current_classifiers += 1
        else:
            # Case 3: Ensemble size is capped. Replace the component
            # with the lowest weight.
            assert self._num_of_current_classifiers == \
                self._num_of_max_classifiers, \
                "Ensemble size does not match its cap."
            index_of_lowest_weight = np.argmin(self._weights)
            self._classifiers[index_of_lowest_weight] = new_clf
            self._weights[index_of_lowest_weight] = 1.0
        # Normalize the weights to keep the numbers well-scaled.
        self._normalize_weights_softmax()
        if self._Logging:
            print("After normalization weights: ")
            print(self._weights)

    # Ensemble maintenance is done. Now train all classifiers in the
    # ensemble on the current chunk. This loop can be parallelized.
    data_features = self._chunk_data.get_attributes_matrix()
    data_truths = self._chunk_data.get_targets_matrix()
    data_truths = data_truths.astype(int).flatten()

    if self._Logging:
        print("Starting to train the components with the current chunk...")
        for k in range(self._num_of_current_classifiers):
            print("Training classifier {}".format(k))
            self._classifiers[k].partial_fit(data_features, data_truths,
                                             classes=self._target_values)
        print("Training the components with the current chunk completed.")
    else:
        for k in range(self._num_of_current_classifiers):
            self._classifiers[k].partial_fit(data_features, data_truths,
                                             classes=self._target_values)
    return
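
# --------------------------------------------------------------------------
# Illustrative sketch: _normalize_weights_softmax is called above but not
# shown in this excerpt. A plausible minimal implementation, assuming
# self._weights is a NumPy array of length _num_of_max_classifiers whose
# first _num_of_current_classifiers entries belong to live components,
# could look like this (hypothetical; the actual method may differ):
def _normalize_weights_softmax(self):
    # Hypothetical sketch: rescale the live component weights with a
    # softmax so they are positive and sum to 1; unused slots stay as-is.
    n = self._num_of_current_classifiers
    live = np.asarray(self._weights[:n], dtype=float)
    live -= live.max()                       # shift for numerical stability
    exp_w = np.exp(live)
    self._weights[:n] = exp_w / exp_w.sum()  # positive weights summing to 1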