# Per-dataset driver loop: load each dataset, work out its time budget and
# check whether loading alone already exhausted that budget.
time_left_over = 0
for basename in datanames:
    vprint(verbose, "************************************************")
    vprint(verbose, "******** Processing dataset " + basename.capitalize() + " ********")
    vprint(verbose, "************************************************")

    # Start the clock for this dataset; everything up to the budget check
    # below (mainly data loading) is charged against the budget.
    start = time.time()

    # Build the data object holding the data and its metadata.
    vprint(verbose, "========= Reading and converting data ==========")
    D = DataManager(
        basename,
        input_dir,
        replace_missing=True,
        filter_features=True,
        max_samples=max_samples,
        verbose=verbose,
    )
    print(D)  # printed unconditionally, regardless of `verbose`
    vprint(verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D))

    # The dataset's own budget applies unless a debug mode is active, in
    # which case max_time is used instead.
    time_budget = D.info['time_budget'] if debug_mode < 1 else max_time
    overall_time_budget += time_budget
    vprint(verbose, "[+] Cumulated time budget (all tasks so far) %5.2f sec" % (overall_time_budget))

    # Add time_left_over (initialised to 0 before the loop) to this task.
    # NOTE(review): the original comment claimed the left-over time is NOT
    # added, which contradicts this statement -- confirm the intent.
    time_budget += time_left_over
    vprint(verbose, "[+] Time budget for this task %5.2f sec" % time_budget)

    time_spent = time.time() - start
    vprint(verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget - time_spent))
    if time_spent >= time_budget:
        # Loading alone used up the budget: flag the run as failed.
        # (The remainder of this branch lies outside this fragment.)
        vprint(verbose, "[-] Sorry, time budget exceeded, skipping this task")
        execution_success = False
pass try: os.remove(filename_test) except: pass logger.info("************************************************") logger.info("******** Processing dataset " + basename.capitalize() + "********") logger.info("************************************************") # ======== Creating a data object with data, informations about it logger.info("========= Reading and converting data ==========") D = DataManager(basename, data_dir, max_samples=max_samples) logger.info(str(D)) logger.info("[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D)) overall_time_budget = min(max_time, D.info['time_budget']) # ======== Create auto-sklearn model new_info_object = {} new_info_object['is_sparse'] = D.info['is_sparse'] new_info_object['task'] = STRING_TO_TASK_TYPES[D.info['task']] new_info_object['metric'] = STRING_TO_METRIC[D.info['metric']] configuration_space = get_configuration_space(new_info_object) try: config = ConfigSpace.Configuration(configuration_space, configuration) except Exception as inst: execution_success = False logger.critical(inst) continue
# ======== Learning on a time budget:
# Start the clock; the time spent loading the data below is measured against
# the budget.  Time spent on the dataset inventory is neglected.
start = time.time()

# ======== Creating a data object with data and information about it
vprint(verbose, "========= Reading and converting data ==========")
D = DataManager(basename, input_dir, replace_missing=True, filter_features=True,
                max_samples=max_samples, verbose=verbose)
# Function-call form of print: the former `print D` statement is a
# SyntaxError on Python 3; with a single argument this form prints the same
# text on Python 2 as well.
print(D)
vprint(verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D))

# ======== Keeping track of time
if debug_mode < 1:
    time_budget = D.info['time_budget']  # <== HERE IS THE TIME BUDGET!
else:
    time_budget = max_time
overall_time_budget = overall_time_budget + time_budget
vprint(verbose, "[+] Cumulated time budget (all tasks so far) %5.2f sec" % (overall_time_budget))

# Add time_left_over to this task's budget.
# NOTE(review): the original comment claimed the left-over time is NOT added,
# which contradicts this statement -- confirm the intent.
time_budget += time_left_over
vprint(verbose, "[+] Time budget for this task %5.2f sec" % time_budget)
time_spent = time.time() - start
#### DEBUG MODE: show version and I/O directory information, then STOP
if debug_mode >= 3:
    data_io.show_version()
    data_io.show_io(input_dir, output_dir)
    exit(0)

vprint(verbose, "****************************************************")
vprint(verbose, "******** Processing spatio-temporal dataset ********")
vprint(verbose, "****************************************************")

#### Instantiate the input data manager and load the data
vprint(verbose, "========= Reading and converting data ==========")
Din = DataManager(datatype="input", verbose=verbose)
Din.loadData(input_dir)
vprint(verbose, Din)
vprint(verbose, "[+] Size of uploaded data {:5.2f} bytes".format(data_io.total_size(Din)))

#### Instantiate the output data manager and configure it from the input one
Dout = DataManager(datatype="output", verbose=verbose)
Dout.col_names = Din.col_names[Din.ycol0:]  # columns from index ycol0 onwards
Dout.horizon = Din.horizon
# NOTE(review): the stride is set from Din.horizon, not Din.stride -- confirm
# that this is intended.
Dout.stride = Din.horizon

#### In debug mode, cheat and get the solution too
# NOTE(review): Dsol is built with datatype="input" and loaded from input_dir,
# exactly like Din -- verify that this really yields the solution.
if debug_mode > 1:
    Dsol = DataManager(datatype="input", verbose=verbose)
    Dsol.loadData(input_dir)

#### Instantiate the predictive model
vprint(verbose, "======== Creating model ==========")
M = Model()
# Per-dataset driver loop: load each dataset, work out its time budget and
# check whether loading alone already exhausted that budget.
for basename in datanames:
    vprint(verbose, "\n========== Ingestion program version " + str(version) + " ==========\n")
    vprint(verbose, "************************************************")
    vprint(verbose, "******** Processing dataset " + basename.capitalize() + " ********")
    vprint(verbose, "************************************************")

    # Start the clock for this dataset; everything up to the budget check
    # below (mainly data loading) is charged against the budget.
    start = time.time()

    # Build the data object holding the data and its metadata.
    vprint(verbose, "========= Reading and converting data ==========")
    D = DataManager(
        basename,
        input_dir,
        replace_missing=True,
        filter_features=True,
        max_samples=max_samples,
        verbose=verbose,
    )
    print(D)  # printed unconditionally, regardless of `verbose`
    vprint(verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D))

    # The dataset's own budget applies unless a debug mode is active, in
    # which case max_time is used instead.
    time_budget = D.info['time_budget'] if debug_mode < 1 else max_time
    overall_time_budget += time_budget
    vprint(verbose, "[+] Cumulated time budget (all tasks so far) %5.2f sec" % (overall_time_budget))

    # Add time_left_over to this task's budget.
    # NOTE(review): the original comment claimed the left-over time is NOT
    # added, which contradicts this statement -- confirm the intent.
    time_budget += time_left_over
    vprint(verbose, "[+] Time budget for this task %5.2f sec" % time_budget)

    time_spent = time.time() - start
    vprint(verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget - time_spent))
    if time_spent >= time_budget:
        # Loading alone used up the budget: flag the run as failed.
        # (The remainder of this branch lies outside this fragment.)
        vprint(verbose, "[-] Sorry, time budget exceeded, skipping this task")
        execution_success = False
continue time_predict_value = time_to_predict(D) time_budget = time_budget - time_spent # Remove time spent so far start = time.time() # Reset the counter time_spent = 0 # Initialize time spent learning M.time_limit = time_budget * time_predict_value * 0.9 vprint(verbose, "[+] Time budget to train the model %5.2f sec" % M._time_limit) Xtest = None if D.info['test_num'] < 1000: Xtest = np.array([x.hy.full_array() for x in read_data(test_fname)]).T M.fit(X, y, test_set=Xtest) # log_reg = LogisticRegression(random_state=0, class_weight='balanced') # log_reg.fit(M.raw_decision_function(X), y) vprint(verbose, "=========== " + basename.capitalize() + " Training cycle " + " ================") vprint(verbose, "[+] Fitting success, time spent so far %5.2f sec" % (time.time() - start)) vprint(verbose, "[+] Size of trained model %5.2f bytes" % data_io.total_size(M)) # Make predictions # ----------------- if os.path.isfile(valid_fname): Y_valid = M.predict_proba(read_data(valid_fname))[:, 1] # Y_valid = log_reg.predict_proba(M.raw_decision_function(read_data(valid_fname)))[:, 1] else: Y_valid = None if Xtest is None: Xtest = read_data(test_fname) Y_test = M.predict_proba(Xtest)[:, 1] # Y_test = log_reg.predict_proba(M.raw_decision_function(read_data(test_fname)))[:, 1] vprint(verbose, "[+] Prediction success, time spent so far %5.2f sec" % (time.time() - start)) # Write results # ------------- filename_valid = basename + '_valid.predict'
# Per-dataset driver loop: load each dataset, work out its time budget and
# check whether loading alone already exhausted that budget.
time_left_over = 0
for basename in datanames:  # Loop over datasets
    vprint(verbose, "************************************************")
    vprint(verbose, "******** Processing dataset " + basename.capitalize() + " ********")
    vprint(verbose, "************************************************")

    # ======== Learning on a time budget:
    # Start the clock; the time spent loading the data below is measured
    # against the budget.  Time spent on the dataset inventory is neglected.
    start = time.time()

    # ======== Creating a data object with data and information about it
    vprint(verbose, "========= Reading and converting data ==========")
    D = DataManager(basename, input_dir, replace_missing=True, filter_features=True,
                    max_samples=max_samples, verbose=verbose)
    # Function-call form of print: the former `print D` statement is a
    # SyntaxError on Python 3; with a single argument this form prints the
    # same text on Python 2 as well.
    print(D)
    vprint(verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D))

    # ======== Keeping track of time
    if debug_mode < 1:
        time_budget = D.info['time_budget']  # <== HERE IS THE TIME BUDGET!
    else:
        time_budget = max_time
    overall_time_budget = overall_time_budget + time_budget
    vprint(verbose, "[+] Cumulated time budget (all tasks so far) %5.2f sec" % (overall_time_budget))

    # Add time_left_over (initialised to 0 before the loop) to this task.
    # NOTE(review): the original comment claimed the left-over time is NOT
    # added, which contradicts this statement -- confirm the intent.
    time_budget += time_left_over
    vprint(verbose, "[+] Time budget for this task %5.2f sec" % time_budget)

    time_spent = time.time() - start
    vprint(verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget - time_spent))
    if time_spent >= time_budget:
        # Loading alone used up the budget: flag the run as failed.
        # (The remainder of this branch lies outside this fragment.)
        vprint(verbose, "[-] Sorry, time budget exceeded, skipping this task")
        execution_success = False
# ======== Learning on a time budget:
# Start the clock; the time spent loading the data below is measured against
# the budget.  Time spent on the dataset inventory is neglected.
start = time.time()

# ======== Creating a data object with data and information about it
vprint(verbose, "========= Reading and converting data ==========")
D = DataManager(
    basename,
    input_dir,
    replace_missing=True,
    filter_features=True,
    max_samples=max_samples,
    verbose=verbose,
)
print(D)  # printed unconditionally, regardless of `verbose`
vprint(verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D))

# ======== Keeping track of time
# The dataset's own budget applies unless a debug mode is active, in which
# case max_time is used instead.  The value is coerced to float before any
# arithmetic is done with it.
time_budget = float(D.info['time_budget'] if debug_mode < 1 else max_time)
overall_time_budget += time_budget
vprint(verbose, "[+] Cumulated time budget (all tasks so far) %5.2f sec" % (overall_time_budget))

# Add time_left_over to this task's budget.
# NOTE(review): the original comment claimed the left-over time is NOT added,
# which contradicts this statement -- confirm the intent.
time_budget += time_left_over