def test_regression_hoeffding_tree_categorical_features(test_path):
    """Fit a regression Hoeffding tree on all-nominal data and compare its description.

    The stored test case is loaded from ``test_path``, trimmed to the columns
    used for single-target regression, and fed to the learner in one
    ``partial_fit`` call. The textual model description must be at least 90%
    similar (``SequenceMatcher`` ratio) to the reference below.
    """
    stream = np.load(os.path.join(test_path, 'ht_categorical_features_testcase.npy'))

    # Remove class value: only column indices 0..6 are kept.
    kept_columns = np.delete(np.arange(8), 7)
    stream = stream[:, kept_columns]
    # Removes the last column (used only in the multi-target regression case)
    stream = stream[:, :-1]
    X = stream[:, :-1]
    y = stream[:, -1]

    learner = RegressionHoeffdingTree(nominal_attributes=np.arange(7).tolist())
    learner.partial_fit(X, y)

    expected_description = (
        "if Attribute 4 = 0.0:\n"
        " Leaf = Statistics {0: 606.0000, 1: 1212.0000, 2: 3626.0000}\n"
        "if Attribute 4 = 1.0:\n"
        " Leaf = Statistics {0: 551.0000, 1: 1128.0000, 2: 3400.0000}\n"
        "if Attribute 4 = 2.0:\n"
        " Leaf = Statistics {0: 566.0000, 1: 1139.0000, 2: 3423.0000}\n"
        "if Attribute 4 = 3.0:\n"
        " Leaf = Statistics {0: 577.0000, 1: 1138.0000, 2: 3374.0000}\n"
        "if Attribute 4 = 4.0:\n"
        " Leaf = Statistics {0: 620.0000, 1: 1233.0000, 2: 3725.0000}\n"
        "if Attribute 4 = -3.0:\n"
        " Leaf = Statistics {0: 80.0000, 1: 163.0000, 2: 483.0000}\n"
    )

    # Fuzzy comparison: small formatting differences are tolerated.
    matcher = SequenceMatcher(None, expected_description, learner.get_model_description())
    assert matcher.ratio() > 0.9
def test_hoeffding_tree_coverage(test_path):
    """Coverage test for the nominal attribute observer.

    Loads ``regression_data.npz`` from ``test_path`` and fits a
    ``RegressionHoeffdingTree`` with mean leaf prediction, declaring the first
    three attributes as nominal. The test passes if ``partial_fit`` completes
    without raising.
    """
    test_file = os.path.join(test_path, 'regression_data.npz')
    # For .npz archives np.load returns an NpzFile that keeps the file open;
    # the context manager closes it as soon as the arrays have been read.
    with np.load(test_file) as data:
        X = data['X']
        y = data['y']

    learner = RegressionHoeffdingTree(leaf_prediction='mean',
                                      nominal_attributes=list(range(3)))
    learner.partial_fit(X, y)
def InnerCycle_Train(X, y, inject_drift, perc_train):
    """Train the teacher and student models on the first part of a stream.

    Parameters
    ----------
    X, y
        Features and targets; passed to ``Driftpoints``/``Swapcols`` (when
        drift is injected) and then wrapped in a ``DataStream``.
    inject_drift
        When truthy, drift points are drawn with ``Driftpoints`` and ``X`` is
        modified by ``Swapcols`` from the drift row onwards.
    perc_train
        Fraction of the rows of ``X`` used for training.

    Returns
    -------
    dict
        Keys ``"Teacher"``, ``"Student"``, ``"StudentRegression"``,
        ``"Driftpoints"``, ``"n"``, ``"Stream"`` and ``"Xtrain"``.
    """
    # Number of training samples.
    n_rows = X.shape[0]
    ntrain = int(perc_train * n_rows)

    if not inject_drift:
        # No drift: the "drift row" is placed at the end of the data.
        dpoints = dict({"row": n_rows, "cols": 0})
    else:
        # Original note: pick a point between 0.7 and 0.9 of the stream.
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain
        # Contaminate X after that point.
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])

    # Cast the data as a DataStream.
    stream = DataStream(X, y)
    stream.prepare_for_use()

    # Incremental model (main classifier, teacher model). The variant passing
    # drift_detection_method=None / warning_detection_method=None is disabled
    # here, so the ARF defaults apply.
    teacher = ARF(n_estimators=25)

    # Training data: the first ntrain rows of the stream.
    Xtrain, ytrain = stream.next_sample(ntrain)
    target_values = stream.target_values

    # Fit the teacher, then collect its outputs on the training data.
    teacher.fit(Xtrain, ytrain, classes=target_values)
    yhat_train = teacher.predict(Xtrain)
    # Original note on this call: "needs warnings".
    yhat_train_prob = teacher.predict_proba(Xtrain)
    yhat_tr_max_prob = np.array([np.max(row) for row in yhat_train_prob])

    # Student classifier: learns to reproduce the teacher's predicted labels.
    student = ARF(n_estimators=25)
    student.fit(Xtrain, yhat_train, classes=target_values)

    # Student regressor: learns the teacher's highest class probability.
    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)

    return {
        "Teacher": teacher,
        "Student": student,
        "StudentRegression": student_regr,
        "Driftpoints": dpoints,
        "n": ntrain,
        "Stream": stream,
        "Xtrain": Xtrain,
    }
def test_hoeffding_tree_perceptron():
    """Regression test for the tree with perceptron leaves.

    Runs 500 test-then-train steps on a seeded ``RegressionGenerator`` stream,
    recording a prediction every 10th sample (skipping the first), and checks
    the predictions, the mean absolute error, the ``get_info`` string and the
    types returned by ``get_model_description`` and ``predict``.
    """
    stream = RegressionGenerator(n_samples=500, n_features=20,
                                 n_informative=15, random_state=1)
    stream.prepare_for_use()

    learner = RegressionHoeffdingTree(leaf_prediction='perceptron', random_state=1)

    max_samples = 500
    wait_samples = 10
    y_pred = array('d')
    y_true = array('d')

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples, but never on the very first one.
        if cnt > 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_true.append(y[0])
        learner.partial_fit(X, y)

    expected_predictions = array('d', [
        1198.4326121743168, 456.36607750881586, 927.9912160545144,
        1160.4797981899128, 506.50541829176535, -687.8187227095925,
        -677.8120094065415, 231.14888704761225, -284.46324039942937,
        -255.69195985557175, 47.58787439365423, -135.22494016284043,
        -10.351457437330152, 164.95903200643997, 360.72854984472383,
        193.30633911830088, -64.23638301570358, 587.9771578214296,
        649.8395655757931, 481.01214222804026, 305.4402728117724,
        266.2096493865043, -445.11447171009775, -567.5748694154349,
        -68.70070048021438, -446.79910655850153, -115.892348067663,
        -98.26862866231015, 71.04707905920286, -10.239274802165584,
        18.748731569441812, 4.971217265129857, 172.2223575990573,
        -655.2864976783711, -129.69921313686626, -114.01187375876822,
        -405.66166686550963, -215.1264381928009, -345.91020370426247,
        -80.49330468453074, 108.78958382083302, 134.95267043280126,
        -398.5273538477553, -157.1784910649728, 219.72541225645654,
        -100.91598162899217, 80.9768574308987, -296.8856956382453,
        251.9332271253148
    ])
    assert np.allclose(y_pred, expected_predictions)

    expected_error = 362.98595964244623
    error = mean_absolute_error(y_true, y_pred)
    assert np.isclose(error, expected_error)

    expected_info = (
        'RegressionHoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 '
        '- grace_period: 200 - split_criterion: variance reduction - split_confidence: 1e-07 '
        '- tie_threshold: 0.05 - binary_split: False - stop_mem_management: False '
        '- remove_poor_atts: False - no_pre_prune: False - leaf_prediction: perceptron - nb_threshold: 0 '
        '- nominal_attributes: [] - '
    )
    assert learner.get_info() == expected_info

    # The description must be a string; predict must return exactly an ndarray.
    assert isinstance(learner.get_model_description(), str)
    assert type(learner.predict(X)) is np.ndarray
def test_hoeffding_tree():
    """Regression test for the tree with mean leaves.

    Runs 500 test-then-train steps on a seeded ``RegressionGenerator`` stream,
    recording a prediction every 10th sample (skipping the first), and checks
    the predictions, the mean absolute error, the ``get_info`` string and the
    types returned by ``get_model_description`` and ``predict``.
    """
    stream = RegressionGenerator(n_samples=500, n_features=20,
                                 n_informative=15, random_state=1)
    stream.prepare_for_use()

    learner = RegressionHoeffdingTree(leaf_prediction='mean')

    max_samples = 500
    wait_samples = 10
    y_pred = array('d')
    y_true = array('d')

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples, but never on the very first one.
        if cnt > 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_true.append(y[0])
        learner.partial_fit(X, y)

    expected_predictions = array('d', [
        102.38946041769101, 55.6584574987656, 5.746076599168373,
        17.11797209372667, 2.566888222752787, 9.188247802192826,
        17.87894804676911, 15.940629626883966, 8.981172175448485,
        13.152624115190092, 11.106058099429399, 6.473195313058236,
        4.723621479590173, 13.825568609556493, 8.698873073880696,
        1.6452441811010252, 5.123496188584294, 6.34387187194982,
        5.9977733790395105, 6.874251577667707, 4.605348088338317,
        8.20112636572672, 9.032631648758098, 4.428189978974459,
        4.249801041367518, 9.983272668044492, 12.859518508979734,
        11.741395774380285, 11.230028410261868, 9.126921979081521,
        9.132146661688296, 7.750655625124709, 6.445145118245414,
        5.760928671876355, 4.041291302080659, 3.591837600560529,
        0.7640424010500604, 0.1738639840537784, 2.2068337802212286,
        -81.05302946841077, 96.17757415335177, -77.35894903819677,
        95.85568683733698, 99.1981674250886, 99.89327888035015,
        101.66673013734784, -79.1904234513751, -80.42952143783687,
        100.63954789983896
    ])
    assert np.allclose(y_pred, expected_predictions)

    expected_error = 143.11351404083086
    error = mean_absolute_error(y_true, y_pred)
    assert np.isclose(error, expected_error)

    expected_info = (
        'RegressionHoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 '
        '- grace_period: 200 - split_criterion: variance reduction - split_confidence: 1e-07 '
        '- tie_threshold: 0.05 - binary_split: False - stop_mem_management: False '
        '- remove_poor_atts: False - no_pre_prune: False - leaf_prediction: mean - nb_threshold: 0 '
        '- nominal_attributes: [] - '
    )
    assert learner.get_info() == expected_info

    # The description must be a string; predict must return exactly an ndarray.
    assert isinstance(learner.get_model_description(), str)
    assert type(learner.predict(X)) is np.ndarray
def test_hoeffding_tree_coverage(test_path):
    """Coverage test for the nominal attribute observer and invalid settings.

    Loads ``regression_data.npz`` from ``test_path`` and builds a
    ``RegressionHoeffdingTree`` with a deliberately misspelled
    ``leaf_prediction`` and, afterwards, an invalid ``split_criterion``. The
    test requires that ``partial_fit`` still completes and that the learner
    reports itself as a regressor.
    """
    test_file = os.path.join(test_path, 'regression_data.npz')
    # For .npz archives np.load returns an NpzFile that keeps the file open;
    # the context manager closes it as soon as the arrays have been read.
    with np.load(test_file) as data:
        X = data['X']
        y = data['y']

    # Typo in leaf prediction (intentional: 'percptron').
    learner = RegressionHoeffdingTree(
        leaf_prediction='percptron',
        nominal_attributes=list(range(3))
    )
    # Reads the split_criterion attribute before it is overwritten below.
    print(learner.split_criterion)
    # Invalid split_criterion (intentional).
    learner.split_criterion = 'VR'
    learner.partial_fit(X, y)

    assert learner._estimator_type == 'regressor'
def test_regression_hoeffding_tree_model_description():
    """Check the textual description of a tree fitted on 500 generated samples.

    A seeded ``RegressionGenerator`` stream is consumed in a single batch and
    the resulting model description must be at least 90% similar
    (``SequenceMatcher`` ratio) to the reference text.
    """
    max_samples = 500

    stream = RegressionGenerator(n_samples=500,
                                 n_features=20,
                                 n_informative=15,
                                 random_state=1)
    stream.prepare_for_use()

    learner = RegressionHoeffdingTree(leaf_prediction='mean')

    # Single batch fit on the whole stream.
    X, y = stream.next_sample(max_samples)
    learner.partial_fit(X, y)

    expected_description = (
        "if Attribute 6 <= 0.1394515530995348:\n"
        " Leaf = Statistics {0: 276.0000, 1: -21537.4157, 2: 11399392.2187}\n"
        "if Attribute 6 > 0.1394515530995348:\n"
        " Leaf = Statistics {0: 224.0000, 1: 22964.8868, 2: 10433581.2534}\n"
    )

    # Fuzzy comparison: small formatting differences are tolerated.
    matcher = SequenceMatcher(None, expected_description, learner.get_model_description())
    assert matcher.ratio() > 0.9
def test_evaluate_regression_coverage(tmpdir):
    """Simple coverage test: prequential evaluation of a regression tree.

    Only checks that the evaluation runs end to end and can write its summary
    to ``tmpdir``; tests for the metrics themselves are placed in the
    corresponding test module.
    """
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import RegressionHoeffdingTree

    max_samples = 1000

    # Stream
    stream = RegressionGenerator(n_samples=max_samples)
    stream.prepare_for_use()

    # Learner
    htr = RegressionHoeffdingTree()

    # Evaluator writing its summary into the temporary directory.
    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(
        max_samples=max_samples,
        metrics=['mean_square_error', 'mean_absolute_error'],
        output_file=summary_path,
    )

    evaluator.evaluate(stream=stream, model=htr, model_names=['HTR'])
def InnerCycle(X, y, inject_drift, perc_train, window, delta, pval,
               prob_instance, inst_delay):
    """Run one drift-detection experiment over a stream and collect alarms.

    A teacher classifier and two student models are trained on the first
    ``ntrain`` samples. The remaining samples are then processed one at a
    time; on each step several copies of the teacher are updated under
    different label-availability regimes and 18 drift detectors are fed with
    the corresponding signals.

    Parameters
    ----------
    X, y
        Features and targets; passed to ``Driftpoints``/``Swapcols`` (when
        drift is injected) and then wrapped in a ``DataStream``.
    inject_drift
        When truthy, drift points are drawn with ``Driftpoints`` and ``X`` is
        modified by ``Swapcols`` from the drift row onwards.
    perc_train
        Fraction of the rows of ``X`` used for training.
    window
        Window size handed to every ``HypothesisTestDetector``.
    delta
        Not used by the active code; it only appears in disabled detector
        arguments.
    pval
        Threshold (``thr``) handed to every ``HypothesisTestDetector``.
    prob_instance
        Probability that a label is made available on a given step (one
        Bernoulli draw per step).
    inst_delay
        Number of steps by which labels are delayed for the delayed models.

    Returns
    -------
    list
        ``[Driftmodels_alarms, dpoints]``: one list of alarm positions per
        detector (same order as ``Driftmodels``), and the drift-point dict.
    """
    # get number of training samples
    ntrain = int(perc_train * X.shape[0])
    if inject_drift:
        # pick a point between 0.7 and 0.9 of the stream
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain
        # contaminate X after that point
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        # no drift: the "drift row" is placed at the end of the data
        dpoints = dict({"row": X.shape[0], "cols": 0})
    # cast data as DataStream class
    stream = DataStream(X, y)
    stream.prepare_for_use()
    # call incr model (main classifier, teacher model)
    stream_clf = ARF(n_estimators=25,
                     drift_detection_method=None,
                     warning_detection_method=None)
    # get training data... first ntrain rows
    Xtrain, ytrain = stream.next_sample(ntrain)
    # fit the incremental model using the training data
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)
    yhat_train = stream_clf.predict(Xtrain)
    yhat_train_prob = stream_clf.predict_proba(
        Xtrain)  # original note: needs warnings
    # highest class probability per training sample
    yhat_tr_max_prob = np.array([np.max(x) for x in yhat_train_prob])
    # fit student model: learns to reproduce the teacher's predicted labels
    student_clf = ARF(n_estimators=25,
                      drift_detection_method=None,
                      warning_detection_method=None)
    student_clf.fit(Xtrain, yhat_train, classes=stream.target_values)
    # student regressor: learns the teacher's highest class probability
    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)
    ####### Call drift detectors
    # NOTE: all ADWIN/PHT detectors use their default parameters; the
    # delta/window arguments are disabled.
    ## Supervised
    # Supervised with ADWIN
    S_ADWIN = ADWIN()  #(delta=delta)
    S_ADWIN_alarms = []
    # Supervised with PHT
    S_PHT = PHT()  #(min_instances=window,delta=delta)
    S_PHT_alarms = []
    # Delayed Supervised with ADWIN
    DS_ADWIN = ADWIN()  #(delta=delta)
    DS_ADWIN_alarms = []
    # Delayed Supervised with PHT
    DS_PHT = PHT()  #(min_instances=window,delta=delta)
    DS_PHT_alarms = []
    ## Semi-supervised
    # Semi-Supervised with ADWIN
    WS_ADWIN = ADWIN()  #(delta=delta)
    WS_ADWIN_alarms = []
    # Semi-Supervised with PHT
    WS_PHT = PHT()  #(min_instances=window,delta=delta)
    WS_PHT_alarms = []
    # Delayed Semi-Supervised with ADWIN
    DWS_ADWIN = ADWIN()  #(delta=delta)
    DWS_ADWIN_alarms = []
    # Delayed Semi-Supervised with PHT
    DWS_PHT = PHT()  #(min_instances=window,delta=delta)
    DWS_PHT_alarms = []
    ##### Unsupervised
    # Student classifier error with ADWIN
    U_ADWIN = ADWIN()  #(delta=delta)
    U_ADWIN_alarms = []
    # Student classifier error with PHT
    U_PHT = PHT()  #(min_instances=window,delta=delta)
    U_PHT_alarms = []
    # Student regressor error with ADWIN
    UR_ADWIN = ADWIN()  #(delta=delta)
    UR_ADWIN_alarms = []
    # Student regressor error with PHT
    UR_PHT = PHT()  #(min_instances=window,delta=delta)
    UR_PHT_alarms = []
    # WRS with output
    WRS_Output = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Output_alarms = []
    # WRS with class prob
    WRS_Prob = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Prob_alarms = []
    # TT with output
    TT_Output = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Output_alarms = []
    # TT with class prob
    TT_Prob = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Prob_alarms = []
    # KS with output
    KS_Output = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Output_alarms = []
    # KS with class prob
    KS_Prob = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Prob_alarms = []
    # Detectors and their alarm lists are kept in two parallel lists; the
    # slices below and the alarm bookkeeping rely on this exact order.
    Driftmodels = [
        S_ADWIN, S_PHT, DS_ADWIN, DS_PHT, WS_ADWIN, WS_PHT, DWS_ADWIN,
        DWS_PHT, U_ADWIN, U_PHT, UR_ADWIN, UR_PHT, WRS_Output, TT_Output,
        KS_Output, WRS_Prob, TT_Prob, KS_Prob
    ]
    Driftmodels_alarms = [
        S_ADWIN_alarms, S_PHT_alarms, DS_ADWIN_alarms, DS_PHT_alarms,
        WS_ADWIN_alarms, WS_PHT_alarms, DWS_ADWIN_alarms, DWS_PHT_alarms,
        U_ADWIN_alarms, U_PHT_alarms, UR_ADWIN_alarms, UR_PHT_alarms,
        WRS_Output_alarms, TT_Output_alarms, KS_Output_alarms,
        WRS_Prob_alarms, TT_Prob_alarms, KS_Prob_alarms
    ]
    # Detector groups, one per input signal.
    S_driftmodels = Driftmodels[0:2]
    DS_driftmodels = Driftmodels[2:4]
    WS_driftmodels = Driftmodels[4:6]
    DWS_driftmodels = Driftmodels[6:8]
    Ustd_driftmodels = Driftmodels[8:10]
    Ustdreg_driftmodels = Driftmodels[10:12]
    Uoutput_driftmodels = Driftmodels[12:15]
    Uprob_driftmodels = Driftmodels[15:18]
    # Independent copies of the trained teacher, one per update regime.
    # always updated
    S_clf = copy.deepcopy(stream_clf)
    # always updated with delay
    DS_clf = copy.deepcopy(stream_clf)
    # updated immediately with some prob
    WS_clf = copy.deepcopy(stream_clf)
    # updated with delay with some prob
    DWS_clf = copy.deepcopy(stream_clf)
    # never updated
    U_clf = copy.deepcopy(stream_clf)
    # i: position in the whole stream (training rows included);
    # k: number of samples processed by the loop so far.
    i = ntrain
    k = 0
    # Histories used to replay samples, labels and predictions with a delay.
    DWS_yhat_hist = []
    DS_yhat_hist = []
    X_hist = []
    y_hist = []
    while (stream.has_more_samples()):
        print(i)  # progress output, one line per processed sample
        Xi, yi = stream.next_sample()
        y_hist.append(yi[0])
        X_hist.append(Xi)
        # The current sample is appended to the last 10 training rows and only
        # the last prediction is used.
        # NOTE(review): presumably a workaround for single-sample prediction —
        # confirm why the padding is needed.
        ext_Xi = np.concatenate([Xtrain[-10:], Xi])
        U_prob = U_clf.predict_proba(ext_Xi)[-1]
        U_yhat = U_clf.predict(ext_Xi)[-1]
        S_yhat = S_clf.predict(ext_Xi)[-1]
        WS_yhat = WS_clf.predict(ext_Xi)[-1]
        DS_yhat = DS_clf.predict(ext_Xi)[-1]
        DWS_yhat = DWS_clf.predict(ext_Xi)[-1]
        # Predictions are stored now so delayed errors compare against what
        # the model predicted when the sample arrived.
        DWS_yhat_hist.append(DWS_yhat)
        DS_yhat_hist.append(DS_yhat)
        # Reduce the probability vector to one scalar: the only entry, the
        # second entry for two classes, or the maximum for more classes.
        if len(U_prob) < 2:
            U_yhat_prob_i = U_prob[0]
        elif len(U_prob) == 2:
            U_yhat_prob_i = U_prob[1]
        else:
            U_yhat_prob_i = np.max(U_prob)
        y_meta_hat_i = student_clf.predict(ext_Xi)[-1]
        y_meta_prob = student_regr.predict(ext_Xi)[-1]
        # Updating student model with the frozen teacher's prediction
        # (no true label is used)
        student_clf.partial_fit(Xi, [U_yhat])
        # Updating supervised model
        S_clf.partial_fit(Xi, yi)
        # Computing loss
        S_err_i = int(yi[0] != S_yhat)
        student_err_i = int(y_meta_hat_i != U_yhat)
        student_prob_err_i = U_yhat_prob_i - y_meta_prob
        for model in S_driftmodels:
            model.add_element(S_err_i)
        for model in Ustd_driftmodels:
            model.add_element(student_err_i)
        for model in Ustdreg_driftmodels:
            model.add_element(student_prob_err_i)
        for model in Uoutput_driftmodels:
            model.add_element(U_yhat)
        for model in Uprob_driftmodels:
            model.add_element(U_yhat_prob_i)
        # One Bernoulli draw per step decides whether a label is available;
        # the same draw is shared by the WS and DWS updates below.
        put_i_available = np.random.binomial(1, prob_instance)
        # Delayed regimes: once inst_delay samples have been seen, the sample
        # from inst_delay steps ago is replayed with its label.
        if k >= inst_delay:
            DS_err_i = int(
                y_hist[k - inst_delay] != DS_yhat_hist[k - inst_delay])
            DS_clf.partial_fit(X_hist[k - inst_delay],
                               [y_hist[k - inst_delay]])
            for model in DS_driftmodels:
                model.add_element(DS_err_i)
            if put_i_available > 0:
                DWS_err_i = int(
                    y_hist[k - inst_delay] != DWS_yhat_hist[k - inst_delay])
                DWS_clf.partial_fit(X_hist[k - inst_delay],
                                    [y_hist[k - inst_delay]])
                for model in DWS_driftmodels:
                    model.add_element(DWS_err_i)
        # Immediate semi-supervised regime: update only when a label is
        # available.
        if put_i_available > 0:
            WS_err_i = int(yi[0] != WS_yhat)
            WS_clf.partial_fit(Xi, yi)
            for model in WS_driftmodels:
                model.add_element(WS_err_i)
        # detect changes: record the stream position for each detector that
        # reports a change
        for j, model in enumerate(Driftmodels):
            has_change = model.detected_change()
            if has_change:
                Driftmodels_alarms[j].append(i)
        i += 1
        k += 1
    return ([Driftmodels_alarms, dpoints])
X = tdf[["Pressure (millibars)", "Humidity", "Wind Speed (km/h)"]].resample("6H").mean() y = tdf[["Temperature (C)"]].resample("6H").max() X.plot(subplots=True, layout=(1, 3)) y.plot() #%% reload(samknnreg) from samknnreg import SAMKNNRegressor sam = SAMKNNRegressor() hat = RegressionHAT() rht = RegressionHoeffdingTree() ds = DataStream(X, y=y) ds.prepare_for_use() evaluator = EvaluatePrequential( show_plot=True, n_wait=730, batch_size=28, metrics=['mean_square_error', 'true_vs_predicted']) #%% evaluator.evaluate(stream=ds, model=[sam, rht, hat], model_names=[ "SAM", "Hoeffding Tree Regressor", "Hoeffding Tree Regressor (Adaptive)"