def test_naive_bayes(test_path):
    """Regression-test NaiveBayes on a seeded SEAGenerator stream.

    The learner is trained one sample at a time on 5000 samples. Every
    100th sample (except the very first) is predicted *before* the learner
    is trained on it, and the collected predictions and probabilities are
    compared with pinned reference values. The learner is then reset,
    re-fitted in batch mode and scored.

    :param test_path: directory that contains the reference file
        'data_naive_bayes_proba.npy'
    """
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()

    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples, predicting before the sample is learnt
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
        1, 1, 0, 1, 0, 0, 1, 1, 1,
    ])
    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    # Batch mode: fit on the first 4500 samples, score on the tail.
    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    # NOTE: the evaluation slice starts at 4501, so sample 4500 is used for
    # neither fitting nor scoring. The pinned score depends on this split,
    # so it is kept as is.
    expected_score = 0.9378757515030061
    assert np.isclose(
        expected_score,
        learner.score(X=np.array(X_batch[4501:]),
                      y=np.array(y_batch[4501:])))

    assert 'estimator' == learner.get_class_type()

    # X still holds the last sample drawn from the stream.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_clone():
    """Check that cloning a trained NaiveBayes yields an untrained copy.

    A learner is trained sample by sample on a seeded SEAGenerator stream
    (predicting every 100th sample before training on it), then cloned.
    The original must hold a non-empty observed class distribution while
    the clone's must be empty.
    """
    max_samples = 5000
    wait_samples = 100

    stream = SEAGenerator(random_state=1)
    learner = NaiveBayes()

    y_pred = array('i')
    X_batch, y_batch, y_proba = [], [], []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])

        # Test every n samples (never on the very first one)
        is_checkpoint = cnt != 0 and cnt % wait_samples == 0
        if is_checkpoint:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])

        learner.partial_fit(X, y, classes=[0, 1])

    cloned = clone(learner)

    # The trained learner keeps its statistics, the clone starts empty.
    assert learner._observed_class_distribution != {}
    assert cloned._observed_class_distribution == {}
def partial_fit(self, X, y=None, classes=None, weight=None):
    """ Fit the ensemble to a data chunk

    Implement the basic Algorithm 1 as described in the paper:
    train a new classifier on the chunk, weight it and every existing
    ensemble member by its benefit over a random classifier on this
    chunk, then keep the top K weighted classifiers.

    :param X: the training data (a data chunk S), a 2-D array
    :param y: the training labels
    :param classes: array-like, contains all possible labels,
        if not provided, it will be derived from y
    :param weight: array-like, instance weight; currently not used by
        this method (uniform weights are assumed)
    :return: self
    """
    # Unpacking into two names keeps the requirement that X is 2-D.
    N, _ = X.shape

    # if the classes are not provided, we derive them from y
    if classes is None:
        classes = np.unique(y)

    # (1) train classifier C' from X
    # allows a wider variety of classifiers
    if self.base_learner == "bayes":  # Naive Bayes
        C_new = NaiveBayes()
    else:  # by default, set to Hoeffding Tree
        C_new = HoeffdingTree()
    C_new.partial_fit(X, y, classes=classes)

    # (2) compute error rate/benefit of C_new via cross-validation on S
    # MSE_r: compute the baseline error rate given by a random classifier
    # a. class distribution learnt from the data
    #    The frequencies are always counted from y. Counting them from
    #    `classes` (a list of distinct labels) would give 1/N per class
    #    whenever the caller supplies `classes` explicitly.
    y_arr = np.asarray(y)
    class_dist = [np.count_nonzero(y_arr == c) / N for c in classes]
    MSE_r = np.sum([p * ((1 - p) ** 2) for p in class_dist])
    # b. alternative assumption: uniform distribution over L classes
    #    p_c = 1/L
    #    MSE_r = L * (p_c * ((1 - p_c) ** 2))

    # MSE_i: compute the error rate of C_new via cross-validation on X
    # f_ic = the probability given by C_new that x is an instance of class c
    MSE_i = self.compute_MSE(y, C_new.predict_proba(X), classes)

    # (3) derive weight w_new for C_new using (8) or (9)
    w_new = MSE_r - MSE_i

    # create a new classifier with its associated weight,
    # the unique labels of the data chunk it is trained on
    clf_new = self.WeightedClassifier(clf=C_new, weight=w_new,
                                      chunk_labels=classes)

    # (4) update the weights of each classifier in the ensemble
    for clf in self.models:
        # apply Ci on S to derive MSE_i
        MSE_i = self.compute_MSE(y, clf.clf.predict_proba(X),
                                 clf.chunk_labels)
        # update wi based on (8) or (9); the attribute is `weight`, the
        # same one read by the top-K selection below
        clf.weight = MSE_r - MSE_i

    # The weights were changed in place, so the heap invariant must be
    # restored before self.models[0] can be treated as the worst model.
    # (Presumably WeightedClassifier orders by weight; heappush below
    # already requires it to be comparable.)
    hq.heapify(self.models)

    # (5) C <- top K weighted classifiers in C U { C' }
    # selecting top K models by dropping the worst model,
    # i.e. the classifier with the smallest weight in C U { C' }
    if len(self.models) < self.K:
        # just push the new model in if there are still free slots
        hq.heappush(self.models, clf_new)
    elif clf_new.weight > self.models[0].weight:
        # the new model beats the bottom (worst) classifier:
        # push the new classifier and remove the bottom one
        hq.heappushpop(self.models, clf_new)
    # otherwise do nothing: the new model is no better than the worst one

    return self