def __init__(self, n_estimators=10, max_features='auto', grace_period=50):
    """Initialise the predictor and build its adaptive random forest.

    Args:
        n_estimators: Stored on ``self._n_estimators`` and forwarded to the
            forest.
        max_features: Stored on ``self._max_features`` and forwarded to the
            forest.
        grace_period: Forwarded to the forest only; it is not kept on the
            instance.
    """
    super(ARFPredictor, self).__init__()

    self._n_estimators = n_estimators
    self._max_features = max_features

    # The forest is always seeded with 42.
    forest_options = dict(
        n_estimators=self._n_estimators,
        max_features=self._max_features,
        grace_period=grace_period,
        random_state=42,
    )
    self._classifier = AdaptiveRandomForest(**forest_options)

    # Counter starts at zero; nothing has been fed to the forest yet.
    self._trained_samples = 0
def test_adaptive_random_forests_nb():
    """Regression test for ``AdaptiveRandomForest`` with ``leaf_prediction='nb'``.

    The learner is pre-trained on 150 samples, then fed 5000 single samples.
    Every 100th sample (skipping the very first) is predicted *before* it is
    learned, giving 49 predictions that are compared with a frozen list.
    The string returned by ``get_info()`` is checked as well.
    """
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112,
                                 n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3,
                                   random_state=112,
                                   leaf_prediction='nb')

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train: predict before fitting).
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    last_version_predictions = [
        1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
        0, 0, 1
    ]

    # Performance below does not need to be guaranteed. This check is set up
    # so that anything that changes the predictions is caught in the unit
    # test. This helps prevent accidental changes.
    assert isinstance(learner.predict(X), np.ndarray)
    # np.alltrue was removed in NumPy 2.0; array_equal also treats a length
    # mismatch as a failure, exactly like the former list == list comparison.
    assert np.array_equal(predictions, last_version_predictions)

    # NOTE(review): this must match get_info() character for character,
    # including the leading whitespace of every continuation line -- confirm
    # against the library's actual output.
    expected_info = (
        "AdaptiveRandomForest(binary_split=False, disable_weighted_vote=False,\n"
        " drift_detection_method=ADWIN(delta=0.001), grace_period=50,\n"
        " lambda_value=6, leaf_prediction='nb',\n"
        " max_byte_size=33554432, max_features=5,\n"
        " memory_estimate_period=2000000, n_estimators=3,\n"
        " nb_threshold=0, no_preprune=False, nominal_attributes=None,\n"
        " performance_metric='acc', random_state=112,\n"
        " remove_poor_atts=False, split_confidence=0.01,\n"
        " split_criterion='info_gain', stop_mem_management=False,\n"
        " tie_threshold=0.05,\n"
        " warning_detection_method=ADWIN(delta=0.01))"
    )
    assert learner.get_info() == expected_info
def test_grid():
    """Cross-validate four stream classifiers on two LED concept-drift streams.

    Both streams drift from an LED generator with 3 drift features to one
    with 7 at position 250000; they differ only in the drift width
    (1 versus 50000).
    """

    def _led_drift_stream(width):
        # One drifting LED stream; only the transition width varies.
        return ConceptDriftStream(
            stream=LEDGeneratorDrift(has_noise=False,
                                     noise_percentage=0.0,
                                     n_drift_features=3),
            drift_stream=LEDGeneratorDrift(has_noise=False,
                                           noise_percentage=0.0,
                                           n_drift_features=7),
            random_state=None,
            alpha=90.0,  # angle of change grade 0 - 90
            position=250000,
            width=width)

    classifiers = [
        OzaBagging(base_estimator=KNN()),
        OzaBaggingAdwin(base_estimator=KNN()),
        AdaptiveRandomForest(),
        SAMKNN(),
    ]

    cv = CrossValidation(clfs=classifiers, max_samples=1000000, test_size=1)
    cv.streams = [_led_drift_stream(width) for width in (1, 50000)]

    cv.test()
    cv.save_summary()
def __call__(self):
    """Pick a learner according to ``self.learner_type``.

    A ``TS_HoeffdingTree`` bounded by ``self.max_size`` is always built first
    and is kept when ``learner_type`` matches none of the known codes.

    NOTE(review): the visible code never returns or stores the chosen
    learner -- confirm whether a trailing ``return`` is missing.
    """
    learner = TS_HoeffdingTree(max_byte_size=self.max_size,
                               memory_estimate_period=1000)

    kind = self.learner_type
    # n_estimators, options and w are resolved from the surrounding scope,
    # and only when their branch is actually taken.
    if kind == 'NBN':
        learner = NaiveBayes()
    elif kind == 'ARFN':
        learner = AdaptiveRandomForest(n_estimators=n_estimators)
    elif kind == 'HATN':
        learner = get_TS_HAT_learner()
    elif kind == 'ARF_HATN':
        learner = get_ARF_HAT_learner()
    elif kind == 'HC':
        learner = get_HC_learner(options.max_size, w)
def test_adaptive_random_forests_batch_predict_proba():
    """Regression test for ``predict_proba`` on batches of five samples.

    After pre-training on 150 samples with the classes given explicitly, the
    learner sees 500 batches of 5 samples. Every 100th batch (skipping the
    first) is scored before it is learned, giving 4 x 5 probability rows.
    Each row must sum to 1 and its argmax must match a frozen prediction list.
    """
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112,
                                 n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    max_samples = 500
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample(5)
        # Test every n samples (predict before fitting on the same batch).
        if cnt != 0 and cnt % wait_samples == 0:
            p = learner.predict_proba(X)
            assert p.shape == (5, 2)
            predictions.append(p)
        learner.partial_fit(X, y)

    all_predictions = np.concatenate(predictions)

    # np.alltrue was removed in NumPy 2.0; allclose applies the same default
    # tolerances as isclose to every row sum at once.
    assert np.allclose(all_predictions.sum(axis=1), 1), \
        "Probabilities should sum to 1."
    assert all_predictions.shape == (4 * 5, 2)

    last_version_predictions = [
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1
    ]
    assert isinstance(learner.predict_proba(X), np.ndarray)
    assert np.array_equal(all_predictions.argmax(axis=1),
                          last_version_predictions)
def test_adaptive_random_forests():
    """Regression test for ``AdaptiveRandomForest`` with default leaf settings.

    The learner is pre-trained on 150 samples, then fed 5000 single samples.
    Every 100th sample (skipping the very first) is predicted before it is
    learned, and the 49 resulting predictions are compared with a frozen list
    on Python 3.6+ only.
    """
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train: predict before fitting).
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    last_version_predictions = [
        1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0
    ]

    # Performance below does not need to be guaranteed. This check is set up
    # so that anything that changes the predictions is caught in the unit
    # test. This helps prevent accidental changes.
    # If these tests fail, make sure that what is worked on *should* change
    # the predictions of ARF.
    if sys.version_info >= (3, 6):
        # Temporarily disabled elsewhere: pre-3.6 interpreters give different
        # predictions than 3.6+.
        # np.alltrue was removed in NumPy 2.0; array_equal also fails on a
        # length mismatch, like the former list == list comparison.
        assert np.array_equal(predictions, last_version_predictions)

    assert isinstance(learner.predict(X), np.ndarray)
def init_classifiers():
    """Build the candidate classifiers and return the active selection.

    Several classifiers are instantiated, but only the SAMKNN instance is
    handed back (an RSLVQ-only selection is the obvious alternative to swap
    in).

    Returns:
        A ``(classifiers, names)`` pair of parallel lists.
    """
    n_prototypes_per_class = 4
    sigma = 4

    # Instantiated in this order; everything except samknn is currently unused.
    rslvq = RSLVQ(prototypes_per_class=n_prototypes_per_class, sigma=sigma)
    arslvq = ARSLVQ(prototypes_per_class=n_prototypes_per_class,
                    sigma=sigma,
                    confidence=0.0001,
                    window_size=300)
    oza = OzaBaggingAdwin(base_estimator=KNN())
    adf = AdaptiveRandomForest()
    samknn = SAMKNN()
    hat = HAT()

    selected = [(samknn, "SamKnn")]
    clfs = [clf for clf, _ in selected]
    names = [label for _, label in selected]
    return clfs, names
def test_adaptive_random_forests_labels_given():
    """Regression test for ``predict_proba`` when classes are given up front.

    The learner is pre-trained on 150 samples with ``classes=[0, 1]``, then
    fed 5000 single samples. Every 100th sample (skipping the very first) is
    scored before it is learned. The 49 probability rows must each sum to 1
    and their argmax must match a frozen prediction list.
    """
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112,
                                 n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train: score before fitting).
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    # np.alltrue was removed in NumPy 2.0; the builtin all() does the same
    # job over the per-row checks.
    assert all(np.isclose(y_proba.sum(), 1) for y_proba in predictions), \
        "Probabilities should sum to 1."

    class_probabilities = np.asarray(predictions).squeeze()
    assert class_probabilities.shape == (49, 2)

    predictions = class_probabilities.argmax(axis=1)
    last_version_predictions = [
        1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
        0, 0, 0
    ]
    assert np.array_equal(predictions, last_version_predictions)
print("Class {}: {}".format(i, recall)) hatc = HoeffdingAdaptiveTreeClassifier() evaluator.evaluate(stream = stream_, model = [hatc], model_names = ['Adaptive Tree']) cm = evaluator.get_mean_measurements(0).confusion_matrix print("Recall per class") for i in range(cm.n_classes): recall = cm.data[(i,i)]/cm.sum_col[i] \ if cm.sum_col[i] != 0 else 'Ill-defined' print("Class {}: {}".format(i, recall)) ooza = OzaBaggingAdwin() evaluator.evaluate(stream = stream_, model = [ooza], model_names = ['Oza Bagging Adwin']) cm = evaluator.get_mean_measurements(0).confusion_matrix print("Recall per class") for i in range(cm.n_classes): recall = cm.data[(i,i)]/cm.sum_col[i] \ if cm.sum_col[i] != 0 else 'Ill-defined' print("Class {}: {}".format(i, recall)) arf = AdaptiveRandomForest() evaluator.evaluate(stream = stream_, model = [arf], model_names = ['Adaptive Random Forest']) cm = evaluator.get_mean_measurements(0).confusion_matrix print("Recall per class") for i in range(cm.n_classes): recall = cm.data[(i,i)]/cm.sum_col[i] \ if cm.sum_col[i] != 0 else 'Ill-defined' print("Class {}: {}".format(i, recall))
def start_run(options):
    """Run one experiment on the first unprocessed ``*.ARFF`` stream.

    The experiment directory is scanned for an ``.ARFF`` file that has a
    sibling ``<stem>_concept_chain.pickle`` and no sizeable (> 2000 bytes)
    result CSV yet. For that file a MOA launch script is (re)generated. Then
    either the script is executed (any learner other than ``'arf'``) or an
    ``AdaptiveRandomForest`` is evaluated test-then-train in-process, with
    running accuracy logged every 200 examples to ``<name>.csv``.

    Args:
        options: Object exposing ``experiment_directory``, ``moa_learner``,
            ``concept_limit``, ``using_linux`` and ``moa_location``.

    Returns:
        None. Returns early (after printing a message) when the directory
        does not exist or no eligible stream file is found.
    """
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return

    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)

    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        # glob may match case-insensitively on some platforms; insist on the
        # exact upper-case extension.
        if fn.split('.')[-1] != 'ARFF':
            continue
        actual_fn = fn.split(os.sep)[-1]
        fn_path = os.sep.join(fn.split(os.sep)[:-1])
        print(actual_fn)
        print(fn_path)
        pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
        pickle_full_fn = os.sep.join([fn_path, pickle_fn])
        csv_fn = f"{name}.csv"
        csv_full_fn = os.sep.join([fn_path, csv_fn])
        print(csv_full_fn)

        if not os.path.exists(pickle_full_fn):
            continue
        # A result CSV larger than 2000 bytes counts as "already processed".
        already_done = (os.path.exists(csv_full_fn)
                        and os.path.getsize(csv_full_fn) > 2000)
        if already_done:
            print('csv exists')
            continue
        datastream_filename = fn
        datastream_pickle_filename = pickle_full_fn
        break

    if datastream_filename is None:
        print('Not datastream file')
        return
    print(datastream_filename)

    script_ext = 'sh' if options.using_linux else 'bat'
    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{script_ext}"

    # The launch script is regenerated on every run (the former existence
    # check was short-circuited with `or True`).
    # NOTE: pickle.load executes arbitrary code from the file; only point this
    # at concept-chain pickles you produced yourself.
    with open(datastream_pickle_filename, 'rb') as f:
        concept_chain = pickle.load(f)
    print(concept_chain)
    concepts = sorted(concept_chain)
    # Extrapolate one more segment of the same length past the last key.
    num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
    stream_string = moaLink.get_moa_stream_from_filename(
        os.sep.join(datastream_filename.split(os.sep)[:-1]),
        datastream_filename.split(os.sep)[-1])
    moa_string = moaLink.make_moa_command(stream_string,
                                          options.moa_learner,
                                          options.concept_limit,
                                          'int',
                                          num_examples,
                                          config.report_window_length,
                                          options.experiment_directory,
                                          is_bat=not options.using_linux)
    moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)

    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)

    if options.moa_learner != 'arf':
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        datastream_filename = (
            f"{os.sep.join(datastream_filename.split(os.sep)[:-1])}"
            f"{os.sep}{datastream_filename.split(os.sep)[-1]}")
        data = arff.loadarff(datastream_filename)
        df = pd.DataFrame(data[0], dtype='float64')
        df['y0'] = df['y0'].astype('int64')
        print(df.info())
        datastream = DataStream(df)
        datastream.prepare_for_use()
        print(datastream.target_values)

        learner = AdaptiveRandomForest(
            n_estimators=int(options.concept_limit))
        right = 0
        wrong = 0
        overall_log = []
        while datastream.has_more_samples():
            X, y = datastream.next_sample()
            # Test-then-train: score the sample before learning from it.
            prediction = learner.predict(X)
            if prediction[0] == y[0]:
                right += 1
            else:
                wrong += 1
            learner.partial_fit(X, y)

            seen = right + wrong
            if seen % 200 == 0:
                overall_log.append((seen, right / seen))
                print(f'ex: {seen}, Acc: {right / seen}\r', end="")

        overall = pd.DataFrame(overall_log,
                               columns=['ex', 'overall_accuracy'])
        overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
        print("")
        # An empty stream used to raise ZeroDivisionError here.
        seen = right + wrong
        accuracy = right / seen if seen else float('nan')
        print(f'Accuracy: {accuracy}')

    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
def start_run(options):
    """Run one seeded experiment on the first unprocessed ``*.ARFF`` stream.

    The experiment directory is scanned for an ``.ARFF`` file with no
    sizeable (> 2000 bytes) result CSV yet. The file is loaded, its
    string-typed columns are integer-encoded, and a MOA launch script is
    (re)generated. Then either the script is executed (when the learner is
    not ``'arf'`` or ``options.use_moa`` is set) or an
    ``AdaptiveRandomForest`` is evaluated through ``evaluate_prequential``.
    The elapsed CPU time is printed and written to ``<name>_timer.txt``.

    Args:
        options: Object exposing ``experiment_directory``, ``moa_learner``,
            ``concept_limit``, ``seed``, ``using_linux``, ``use_moa`` and
            ``moa_location``.

    Returns:
        None. Returns early (after printing a message) when the directory
        does not exist or no eligible stream file is found.
    """
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return

    name = '-'.join([
        options.moa_learner,
        str(options.concept_limit), 'pyn',
        str(options.seed)
    ])
    print(name)

    datastream_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        # glob may match case-insensitively on some platforms; insist on the
        # exact upper-case extension.
        if fn.split('.')[-1] != 'ARFF':
            continue
        actual_fn = fn.split(os.sep)[-1]
        fn_path = os.sep.join(fn.split(os.sep)[:-1])
        print(actual_fn)
        print(fn_path)
        csv_fn = f"{name}.csv"
        csv_full_fn = os.sep.join([fn_path, csv_fn])
        print(csv_full_fn)

        # A result CSV larger than 2000 bytes counts as "already processed".
        already_done = (os.path.exists(csv_full_fn)
                        and os.path.getsize(csv_full_fn) > 2000)
        if already_done:
            print('csv exists')
            continue
        datastream_filename = fn
        break

    if datastream_filename is None:
        print('Not datastream file')
        return
    print(datastream_filename)

    datastream_filename = (
        f"{os.sep.join(datastream_filename.split(os.sep)[:-1])}"
        f"{os.sep}{datastream_filename.split(os.sep)[-1]}")
    data = arff.loadarff(datastream_filename)
    df = pd.DataFrame(data[0])
    print(df.tail())
    for c in df.columns:
        print(f"Factoizing {c}")
        if pd.api.types.is_string_dtype(df[c]):
            # Encode once and reuse the codes for both the log and the column.
            codes = pd.factorize(df[c])[0]
            print(codes.shape)
            df[c] = codes
    print(df.tail())

    script_ext = 'sh' if options.using_linux else 'bat'
    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{script_ext}"

    # The launch script is regenerated on every run (the former existence
    # check was short-circuited with `or True`).
    num_examples = df.shape[0]
    stream_string = moaLink.get_moa_stream_from_filename(
        os.sep.join(datastream_filename.split(os.sep)[:-1]),
        datastream_filename.split(os.sep)[-1])
    moa_string = moaLink.make_moa_command(stream_string,
                                          options.moa_learner,
                                          options.concept_limit,
                                          'int',
                                          num_examples,
                                          config.report_window_length,
                                          options.experiment_directory,
                                          is_bat=not options.using_linux,
                                          name=name,
                                          num_features=len(df.columns) - 1)
    moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)

    t_start = process_time()
    command = f'{bat_filename} "{options.moa_location}"'
    print(command)
    print(options.moa_learner)

    if options.moa_learner != 'arf' or options.use_moa:
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        print(df.info())
        datastream = DataStream(df)
        datastream.prepare_for_use()
        print(datastream.target_values)
        learner = AdaptiveRandomForest(
            n_estimators=int(options.concept_limit))
        # The returned memory figures are not used further in this function.
        avg_memory, max_memory = evaluate_prequential(
            datastream=datastream,
            classifier=learner,
            directory=options.experiment_directory,
            name=name)

    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
    with open(f"{options.experiment_directory}{os.sep}{name}_timer.txt",
              "w") as f:
        f.write(
            f"Elapsed time during the whole program in seconds: {t_stop-t_start}"
        )
def test_grid():
    """Cross-validate six stream classifiers on the reoccurring streams.

    The streams come from ``CrossValidation.init_reoccuring_streams``; after
    the run the summary is saved and a marker line is printed.
    """
    # Both LVQ variants share the same prototype count and sigma.
    lvq_settings = dict(prototypes_per_class=4, sigma=8)
    classifiers = [
        RRSLVQ(**lvq_settings),
        RSLVQ(**lvq_settings),
        HAT(),
        OzaBaggingAdwin(base_estimator=KNN()),
        AdaptiveRandomForest(),
        SAMKNN(),
    ]

    cv = CrossValidation(clfs=classifiers, max_samples=1000000, test_size=1)
    cv.streams = cv.init_reoccuring_streams()

    cv.test()
    cv.save_summary()
    print("here")
def tune(self, stock_data, symbols, num_features, measure='f1', trading_frequency=10, training_years=3, trading_days_per_year=246):
    """Grid-search the forest's hyper-parameters over a set of stocks.

    For every combination of grace period (10, 15, 20), ensemble size
    (50, 100) and ``max_features`` (rounded square root of ``num_features``)
    a fresh ``AdaptiveRandomForest`` is built per stock. ``self.predict`` is
    then called once per trading day, and accuracy and F1 are averaged over
    all stocks.

    Args:
        stock_data: Mapping from symbol to a table with feature columns
            ``I1`` .. ``I<num_features>`` and a ``target`` column.
        symbols: Symbols to evaluate.
        num_features: Number of ``I*`` feature columns.
        measure: Selection criterion, ``'accuracy'`` or ``'f1'``.
        trading_frequency: Offset added to the training window to get the
            first evaluated day.
        training_years: Length of the initial training window in years.
        trading_days_per_year: Trading days assumed per year.

    Returns:
        ``[n_estimators, max_features]`` of the best configuration. The best
        grace period is printed but not returned.

    Raises:
        NotImplementedError: If ``measure`` is not ``'accuracy'`` or ``'f1'``.
    """
    # Fail fast: previously an unsupported measure was only rejected after
    # the entire (expensive) grid search had already run.
    if measure not in ('accuracy', 'f1'):
        raise NotImplementedError

    gp_space = [10, 15, 20]
    n_est_space = [50, 100]
    sqrt_feat = sqrt(num_features)
    max_f_space = [round(sqrt_feat)]

    # Loop-invariant: the feature column names depend only on num_features.
    feature_columns = ['I{}'.format(x) for x in range(1, num_features + 1)]
    first_trading_day = (training_years * trading_days_per_year
                         + trading_frequency)

    accuracies = []
    f_scores = []
    params = []
    for gp in gp_space:
        for n_estimators in n_est_space:
            for max_features in max_f_space:
                accs = []
                fs = []
                for stock in symbols:
                    print(stock)
                    # Reset the predictor state for this configuration/stock.
                    self._n_estimators = n_estimators
                    self._max_features = max_features
                    self._trained_samples = 0
                    self._classifier = AdaptiveRandomForest(
                        n_estimators=self._n_estimators,
                        max_features=self._max_features,
                        grace_period=gp,
                        random_state=42)

                    preds = []
                    X = stock_data[stock][feature_columns]
                    y = stock_data[stock]['target']
                    last_trading_day = first_trading_day - trading_frequency
                    for trading_day in range(first_trading_day, len(y)):
                        sys.stdout.write('\r%i/%i' % (trading_day, len(y)))
                        sys.stdout.flush()
                        # History up to the previously evaluated day is passed
                        # alongside the row of the day being predicted.
                        preds.append(
                            self.predict(X.iloc[:last_trading_day].values,
                                         y.iloc[:last_trading_day].values,
                                         X.iloc[trading_day],
                                         training_years,
                                         trading_days_per_year))
                        last_trading_day = trading_day
                    print('')

                    preds = pd.Series(preds, name='Predictions')
                    actual = y.iloc[first_trading_day:]
                    accs.append(metrics.accuracy_score(actual, preds))
                    fs.append(metrics.f1_score(actual, preds))

                mean_acc = np.array(accs).mean()
                accuracies.append(mean_acc)
                mean_f_score = np.array(fs).mean()
                f_scores.append(mean_f_score)
                params.append((n_estimators, max_features, gp))
                print('\nARF n_estimators=%i max_features=%i gp=%i' %
                      (n_estimators, max_features, gp))
                print('ARF Accuracy: %.3f' % mean_acc)
                print('ARF F1 Score: %.3f' % mean_f_score)

    accuracies = np.array(accuracies)
    f_scores = np.array(f_scores)

    print('\nARF FINAL RESULTS')
    if measure == 'accuracy':
        (n_estimators, max_features, gp) = params[accuracies.argmax()]
        print(
            'ARF Best result: accuracy %.3f (n_estimators=%i max_features=%i gp=%i)'
            % (accuracies.max(), n_estimators, max_features, gp))
    else:  # measure == 'f1' (validated on entry)
        (n_estimators, max_features, gp) = params[f_scores.argmax()]
        print(
            'ARF Best result: f score %.3f (n_estimators=%i max_features=%i gp=%i)'
            % (f_scores.max(), n_estimators, max_features, gp))

    return [n_estimators, max_features]
def l():
    """Return a new ``AdaptiveRandomForest``.

    The ensemble size is taken from ``n_estimators``, a name resolved from
    the surrounding scope at call time.
    """
    forest = AdaptiveRandomForest(n_estimators=n_estimators)
    return forest