# Prequential (test-then-train) evaluation of a Hoeffding Tree on a numpy
# array whose last column holds the integer class label.
from skmultiflow.data import DataStream
from skmultiflow.trees import HoeffdingTreeClassifier


def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    orig_X = data[:, :-1]
    orig_y = data[:, -1].astype(int)
    stream = DataStream(orig_X, orig_y)
    hf = HoeffdingTreeClassifier(**hf_kwargs)

    # Pre-train
    pretrainX, pretrainy = stream.next_sample(pre_train_size)
    hf.partial_fit(pretrainX, pretrainy, classes=stream.target_values)

    evaluations = []
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Evaluation: predict before the model sees the label
        y_hat = hf.predict(X)
        evaluations.append(int(y_hat[0] == y[0]))
        # Train on the same sample afterwards
        hf.partial_fit(X, y, classes=stream.target_values)
    return evaluations
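# Usage sketch for get_error_hoeffdingtree. The synthetic dataset below is
# an assumption for illustration, not part of the original code; grace_period
# is just one example of a HoeffdingTreeClassifier kwarg forwarded via
# **hf_kwargs. Since each entry of `evaluations` is 1 for a correct
# prediction, its mean is the prequential accuracy.
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 5))
y = (X[:, 0] + X[:, 1] > 0).astype(int)
data = np.column_stack([X, y])  # last column = label, as the function expects

evaluations = get_error_hoeffdingtree(data, pre_train_size=200, grace_period=50)
print(f"Prequential accuracy: {np.mean(evaluations):.3f}")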
# Fragment: timing DDM drift detection over the Electricity stream.
# `elec_stream`, `ht` (a trained Hoeffding Tree), `n_global`, `grace`,
# `grace_end`, `detect_end` and `RT_ddm` are assumed to be defined earlier.
import time
import numpy as np
from guppy import hpy
from skmultiflow.drift_detection import DDM

mine_std = []
mine_alpha = []
pr_min = []
std_min = []
pi = []
mine_x_mean = []
mine_sum = []
mine_threshold = []
pred_grace_ht = []
pred_grace_ht_p = []
ht_p = None
ML_accuracy = 0
ddm = DDM()
h = hpy()  # guppy heap profiler

while elec_stream.has_more_samples():
    n_global += 1
    X_test, y_test = elec_stream.next_sample()
    y_predict = ht.predict(X_test)

    # Feed the 0/1 prediction error to DDM; as in the original, the timed
    # window also covers the accuracy update.
    ddm_start_time = time.time()
    ddm.add_element(y_test[0] != y_predict[0])
    ML_accuracy += 1 if y_test[0] == y_predict[0] else 0
    ddm_running_time = time.time() - ddm_start_time
    RT_ddm.append(ddm_running_time)

    if n_global > grace_end:
        if n_global > detect_end:
            if ht_p is not None:
                drift_point = detect_end - 2 * grace
                print("Accuracy of ht: " + str(np.mean(pred_grace_ht)))
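# A minimal, self-contained sketch of the DDM pattern used above (the
# synthetic error stream is an assumption): feed each sample's 0/1 error
# to the detector and poll its change flag.
import numpy as np
from skmultiflow.drift_detection import DDM

rng = np.random.default_rng(0)
# Error rate jumps from 10% to 40% halfway through, simulating a drift.
errors = np.concatenate([rng.random(1000) < 0.1, rng.random(1000) < 0.4])

detector = DDM()
for i, err in enumerate(errors):
    detector.add_element(int(err))
    if detector.detected_change():
        print(f"Change detected at sample {i}")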
import glob
import os
import pickle
import subprocess
from time import process_time

import pandas as pd
from scipy.io import arff
from skmultiflow.data import DataStream
from skmultiflow.meta import AdaptiveRandomForest

# `moaLink`, `config` and the `options` namespace are project-local.


def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)

    # Find an ARFF datastream that has a concept-chain pickle but no
    # (sufficiently large) results CSV yet.
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        if fn.split('.')[-1] == 'ARFF':
            actual_fn = fn.split(os.sep)[-1]
            fn_path = os.sep.join(fn.split(os.sep)[:-1])
            print(actual_fn)
            print(fn_path)
            pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
            pickle_full_fn = os.sep.join([fn_path, pickle_fn])
            csv_fn = f"{name}.csv"
            csv_full_fn = os.sep.join([fn_path, csv_fn])
            print(csv_full_fn)
            if os.path.exists(pickle_full_fn):
                skip_file = False
                if os.path.exists(csv_full_fn):
                    if os.path.getsize(csv_full_fn) > 2000:
                        skip_file = True
                if not skip_file:
                    datastream_filename = fn
                    datastream_pickle_filename = pickle_full_fn
                    break
                else:
                    print('csv exists')
    if datastream_filename is None:
        print('No datastream file')
        return
    print(datastream_filename)

    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{'bat' if not options.using_linux else 'sh'}"
    if not os.path.exists(bat_filename) or True:  # `or True`: always regenerate the script
        with open(datastream_pickle_filename, 'rb') as f:
            concept_chain = pickle.load(f)
        print(concept_chain)
        concepts = sorted(list(concept_chain.keys()))
        # Extrapolate one concept interval past the last recorded concept.
        num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
        stream_string = moaLink.get_moa_stream_from_filename(
            os.sep.join(datastream_filename.split(os.sep)[:-1]),
            datastream_filename.split(os.sep)[-1])
        moa_string = moaLink.make_moa_command(
            stream_string, options.moa_learner, options.concept_limit, 'int',
            num_examples, config.report_window_length,
            options.experiment_directory, is_bat=not options.using_linux)
        moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)
        # datastream = None

    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf':
        # Run the generated MOA script externally.
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        # Run an Adaptive Random Forest in-process via scikit-multiflow.
        data = arff.loadarff(datastream_filename)
        df = pd.DataFrame(data[0], dtype='float64')
        df['y0'] = df['y0'].astype('int64')
        # df["y0"] = df["y0"].astype('category')
        print(df.info())
        datastream = DataStream(df)
        datastream.prepare_for_use()
        print(datastream.target_values)
        learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
        right = 0
        wrong = 0
        overall_log = []
        while datastream.has_more_samples():
            X, y = datastream.next_sample()
            prediction = learner.predict(X)
            is_correct = prediction[0] == y[0]
            if is_correct:
                right += 1
            else:
                wrong += 1
            learner.partial_fit(X, y)
            if (right + wrong) > 0 and (right + wrong) % 200 == 0:
                overall_log.append((right + wrong, right / (right + wrong)))
                print(f'ex: {right + wrong}, Acc: {right / (right + wrong)}\r', end="")
        overall = pd.DataFrame(overall_log, columns=['ex', 'overall_accuracy'])
        overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
        print("")
        print(f'Accuracy: {right / (right + wrong)}')
    # fsm, system_stats, concept_chain, ds, stream_examples = fsmsys.run_fsm(datastream, options, suppress=True, name=name, save_checkpoint=True)
    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:", t_stop - t_start)
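# A minimal sketch of the ARFF -> DataStream step from the 'arf' branch
# above. The file name "weather.ARFF" and the label column 'y0' are
# assumptions for illustration; older scikit-multiflow releases also
# require stream.prepare_for_use() after construction.
import pandas as pd
from scipy.io import arff
from skmultiflow.data import DataStream

raw, meta = arff.loadarff('weather.ARFF')   # assumed file
df = pd.DataFrame(raw, dtype='float64')
df['y0'] = df['y0'].astype('int64')         # assumed label column
stream = DataStream(df)                     # last column is the target by default
X, y = stream.next_sample()
print(X.shape, y)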
# Fragment: the same DDM timing loop as the Electricity snippet above,
# applied to the Weather stream. `weather_stream`, `ht`, `n_global`,
# `grace`, `grace_end`, `detect_end` and `RT_ddm` are assumed to be
# defined earlier.
import time
import numpy as np
from guppy import hpy
from skmultiflow.drift_detection import DDM

mine_std = []
mine_alpha = []
pr_min = []
std_min = []
pi = []
mine_x_mean = []
mine_sum = []
mine_threshold = []
pred_grace_ht = []
pred_grace_ht_p = []
ht_p = None
ML_accuracy = 0
ddm = DDM()
h = hpy()  # guppy heap profiler

while weather_stream.has_more_samples():
    n_global += 1
    X_test, y_test = weather_stream.next_sample()
    y_predict = ht.predict(X_test)

    # Feed the 0/1 prediction error to DDM; as in the original, the timed
    # window also covers the accuracy update.
    ddm_start_time = time.time()
    ddm.add_element(y_test[0] != y_predict[0])
    ML_accuracy += 1 if y_test[0] == y_predict[0] else 0
    ddm_running_time = time.time() - ddm_start_time
    RT_ddm.append(ddm_running_time)

    if n_global > grace_end:
        if n_global > detect_end:
            if ht_p is not None:
                drift_point = detect_end - 2 * grace
                print("Accuracy of ht: " + str(np.mean(pred_grace_ht)))
import time
import logging

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from skmultiflow.data import DataStream

# `session` (SQLAlchemy session) and the `Consumption` model are assumed
# to be defined by the surrounding application.


def increment_model(ht_regressor):
    try:
        start_time = time.time()
        # val_df = pd.read_sql(engine.execute("select * from consumption where integrated = 0 limit 0,10").statement, session.bind)
        logging.info("[ML - modIncrement] Loading data... Time: " +
                     str(round(time.time() - start_time, 2)))
        val_df = pd.read_sql(
            session.query(Consumption).filter(
                Consumption.integrated == False).limit(2000000).statement,
            session.bind)
        logging.info("[ML - modIncrement] Data loaded... Time: " +
                     str(round(time.time() - start_time, 2)))
        n_samples = 0
        cnter = 0
        client_ids = []
        logging.info(
            "[ML - modIncrement] Starting model incremental fitting... Time: " +
            str(round(time.time() - start_time, 2)))
        client_id_max = max(val_df.client_id.unique())
        client_id_min = min(val_df.client_id.unique())
        df = val_df.drop(
            columns=['id', 'client_id', 'year', 'month', 'integrated'])
        stream = DataStream(data=df, target_idx=0)
        plr = []
        plprev_ht = []
        while stream.has_more_samples():
            X, y = stream.next_sample()
            # Every 7000th sample, record the prediction next to the truth
            if cnter % 7000 == 0:
                y_prev = ht_regressor.predict(X)
                plr.append(y)
                plprev_ht.append(y_prev)
            ht_regressor.partial_fit(X, y)
            if cnter % 10000 == 0:
                logging.info("[ML - modIncrement] Extracting element #" +
                             str(cnter) + " Time: " +
                             str(round(time.time() - start_time, 2)))
            n_samples += 1
            cnter += 1

        # Plot real vs. predicted values and save the figure
        fig, ax = plt.subplots(figsize=(15, 6))
        plt.plot(range(len(plr)), plr, 'b-', label='Real')
        plt.plot(range(len(plprev_ht)), plprev_ht, 'g--',
                 label='HoeffdingTreeRegressor')
        plt.legend()
        mse = mean_squared_error(plr, plprev_ht)
        r2 = r2_score(plr, plprev_ht)
        plt.suptitle(client_id_max, fontsize=12)
        plt.title("R2: " + str(r2) + " MSE: " + str(mse))
        filename = "images/predictionHT12F" + str(r2) + ".png"
        plt.savefig(filename)
        plt.close()
        # Updating
        logging.info("[ML - modIncrement] Execution %d --- %s seconds ---" %
                     (cnter, round(time.time() - start_time, 2)))
        return ht_regressor, client_id_min, client_id_max
    except Exception:
        logging.exception("[ML - modIncrement] Stopping...")
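# A minimal usage sketch for the incremental-regression pattern above.
# The RegressionGenerator stream is an assumption standing in for the
# Consumption table; it pre-trains a HoeffdingTreeRegressor of the kind
# increment_model expects and reports the prequential MSE.
import numpy as np
from skmultiflow.data import RegressionGenerator
from skmultiflow.trees import HoeffdingTreeRegressor

stream = RegressionGenerator(n_samples=5000, n_features=6, random_state=1)
ht_regressor = HoeffdingTreeRegressor()

sq_errors = []
while stream.has_more_samples():
    X, y = stream.next_sample()
    y_hat = ht_regressor.predict(X)   # test first ...
    sq_errors.append((y_hat[0] - y[0]) ** 2)
    ht_regressor.partial_fit(X, y)    # ... then train
print(f"Prequential MSE: {np.mean(sq_errors):.3f}")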