logger.debug("Using output_dir: " + output_dir) logger.debug("Using ingestion_program_dir: " + ingestion_program_dir) logger.debug("Using code_dir: " + code_dir) # Our libraries path.append(ingestion_program_dir) path.append(code_dir) #IG: to allow submitting the starting kit as sample submission path.append(code_dir + '/sample_code_submission') import data_io from dataset import AutoSpeechDataset # THE class of AutoNLP datasets data_io.mkdir(output_dir) #### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(dataset_dir) #### Delete zip files and metadata file datanames = [x for x in datanames if x.endswith('.data')] if len(datanames) != 1: raise ValueError("{} datasets found in dataset_dir={}!\n"\ .format(len(datanames), dataset_dir) + "Please put only ONE dataset under dataset_dir.") basename = datanames[0] D = AutoSpeechDataset(os.path.join(dataset_dir, basename)) metadata = D.get_metadata() time_budget = metadata.get("time_budget", time_budget) logger.info("Time budget: {}".format(time_budget)) write_start_file(output_dir, start_time=start, time_budget=time_budget,
input_dir = default_input_dir output_dir = default_output_dir else: input_dir = argv[1] output_dir = os.path.abspath(argv[2]); vprint( verbose, "Using input_dir: " + input_dir) vprint( verbose, "Using output_dir: " + output_dir) # Move old results and create a new output directory if not(running_on_codalab) and save_previous_results: data_io.mvdir(output_dir, output_dir+'_'+the_date) data_io.mkdir(output_dir) #### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) # Overwrite the "natural" order #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # ==================== @RESULT SUBMISSION (KEEP THIS) ===================== # Always keep this code to enable result submission of pre-calculated results # deposited in the res/ subdirectory. if len(datanames)>0: vprint( verbose, "************************************************************************") vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******")
from data_io import vprint # print only in verbose mode from data_manager import DataManager # load/save data and get info about them sys.path.append("libs") default_input_dir="C:\\Users\\vmkocheg\\Documents\\MLContest\\Phase2\\input" default_output_dir="C:\\Users\\vmkocheg\\Documents\\MLContest\\Phase2\\output" if len(argv)==1: # Use the default input and output directories if no arguments are provided input_dir = default_input_dir output_dir = default_output_dir else: input_dir = argv[1] output_dir = os.path.abspath(argv[2]); #### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing for basename in datanames: # Loop over datasets if basename not in ["robert"]: continue vprint( verbose, "************************************************") vprint( verbose, "******** Processing dataset " + basename.capitalize() + " ********") vprint( verbose, "************************************************")
def ingestion_fn(dataset_dir,
                 code_dir,
                 time_budget,
                 time_budget_approx,
                 output_dir,
                 score_dir,
                 model_config_name=None,
                 model_config=None):
    """Run the train/predict ingestion loop on the single dataset in `dataset_dir`.

    The participant's `Model` (imported from `code_dir`) is trained and asked
    for predictions repeatedly until it reports `done_training`, returns
    `None` from `test`, or the approximate time budget is exhausted. Each
    accepted prediction is written to `output_dir`, and `end.txt` is written
    when the loop ends. Finally, the contents of `output_dir` are copied to
    `score_dir`.

    Args:
        dataset_dir: directory expected to contain exactly one entry whose
            name ends with `.data`.
        code_dir: directory holding the participant's `model.py`.
        time_budget: budget in seconds; written to the start file and used to
            compute the `remaining_time_budget` handed to `train`/`test`.
        time_budget_approx: budget in seconds used to decide whether a
            prediction is still accepted; a prediction finishing after
            `start + time_budget_approx` is not written and ends the loop.
        output_dir: directory receiving predictions, timestamps and `end.txt`.
        score_dir: directory receiving a copy of everything in `output_dir`.
        model_config_name: forwarded unchanged to `Model(...)`.
        model_config: forwarded unchanged to `Model(...)`.

    Raises:
        ValueError: if `dataset_dir` does not contain exactly one `.data`
            entry.

    Exceptions raised inside the train/predict loop are logged and recorded
    as `ingestion_success: 0` in `end.txt`; they are not re-raised. Errors
    while loading the datasets or constructing the model happen before that
    guard and propagate to the caller.
    """
    #### Check whether everything went well
    ingestion_success = True

    # Parse directories
    root_dir = _HERE(os.pardir)
    ingestion_program_dir = join(root_dir, "ingestion_program")

    if dataset_dir.endswith("run/input") and code_dir.endswith("run/program"):
        logger.debug(
            "Since dataset_dir ends with 'run/input' and code_dir "
            "ends with 'run/program', suppose running on "
            "CodaLab platform. Modify dataset_dir to 'run/input_data' "
            "and code_dir to 'run/submission'. "
            "Directory parsing should be more flexible in the code of "
            "compute worker: we need explicit directories for "
            "dataset_dir and code_dir.")
        dataset_dir = dataset_dir.replace("run/input", "run/input_data")
        code_dir = code_dir.replace("run/program", "run/submission")

    # Show directories for debugging
    logger.debug("sys.argv = " + str(sys.argv))
    logger.debug("Using dataset_dir: " + dataset_dir)
    logger.debug("Using output_dir: " + output_dir)
    logger.debug("Using ingestion_program_dir: " + ingestion_program_dir)
    logger.debug("Using code_dir: " + code_dir)

    # Our libraries: extend the import path so that `data_io`, `dataset` and
    # the participant's `model` can be imported below.
    path.append(ingestion_program_dir)
    path.append(code_dir)
    # IG: to allow submitting the starting kit as sample submission
    path.append(code_dir + "/sample_code_submission")
    import data_io
    from dataset import AutoDLDataset  # THE class of AutoDL datasets

    data_io.mkdir(output_dir)

    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(dataset_dir)
    #### Delete zip files and metadata file
    datanames = [x for x in datanames if x.endswith(".data")]

    if len(datanames) != 1:
        raise ValueError(
            "{} datasets found in dataset_dir={}!\n".format(
                len(datanames), dataset_dir) +
            "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]
    # `basename[:-5]` strips the trailing ".data" suffix.
    task_label = basename[:-5]

    logger.info("************************************************")
    logger.info("******** Processing dataset " + task_label.capitalize() +
                " ********")
    logger.info("************************************************")
    logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION))

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train"))
    D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test"))
    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D_test.get_metadata().size()
    output_dim = D_test.get_metadata().get_output_size()
    correct_prediction_shape = (num_examples_test, output_dim)

    # NOTE: model creation is not wrapped in a timer here, so the 20 min
    # mentioned in the log message below is not enforced by this function.
    ##### Begin creating model #####
    logger.info("Creating model...this process should not exceed 20min.")
    from model import Model  # in participants' model.py

    # The metadata of D_train and D_test only differ in sample_count
    M = Model(D_train.get_metadata(),
              model_config_name=model_config_name,
              model_config=model_config)
    ###### End creating model ######

    # Mark starting time of ingestion
    start = time.time()
    logger.info("=" * 5 + " Start core part of ingestion program. " +
                "Version: {} ".format(VERSION) + "=" * 5)

    write_start_file(output_dir,
                     start_time=start,
                     time_budget=time_budget,
                     task_name=basename.split(".")[0])

    try:
        # Check if the model has methods `train` and `test`.
        for attr in ["train", "test"]:
            if not hasattr(M, attr):
                # The message names the missing method (the placeholder was
                # previously left unformatted).
                raise ModelApiError(
                    "Your model object doesn't have the method "
                    "`{}`. Please implement it in model.py.".format(attr))

        # Check if model.py uses new done_training API instead of marking
        # stopping by returning None
        use_done_training_api = hasattr(M, "done_training")
        if not use_done_training_api:
            logger.warning(
                "Your model object doesn't have an attribute "
                "`done_training`. But this is necessary for ingestion "
                "program to know whether the model has done training "
                "and to decide whether to proceed more training. "
                "Please add this attribute to your model.")

        # Keeping track of how many predictions are made
        prediction_order_number = 0

        # Start the CORE PART: train/predict process
        while not (use_done_training_api and M.done_training):
            remaining_time_budget = start + time_budget - time.time()
            # Train the model
            logger.info("Begin training the model...")
            M.train(D_train.get_dataset(),
                    remaining_time_budget=remaining_time_budget)
            logger.info("Finished training the model.")

            # Make predictions using the trained model
            logger.info("Begin testing the model by making predictions " +
                        "on test set...")
            remaining_time_budget = start + time_budget - time.time()
            Y_pred = M.test(D_test.get_dataset(),
                            remaining_time_budget=remaining_time_budget)
            logger.info("Finished making predictions.")

            if Y_pred is None:  # Stop train/predict process if Y_pred is None
                logger.info("The method model.test returned `None`. " +
                            "Stop train/predict process.")
                break

            # Check if the prediction has good shape
            prediction_shape = tuple(Y_pred.shape)
            if prediction_shape != correct_prediction_shape:
                raise BadPredictionShapeError(
                    "Bad prediction shape! Expected {} but got {}.".format(
                        correct_prediction_shape, prediction_shape))

            # A prediction that arrives after the approximate budget is
            # dropped: nothing is written and the loop ends.
            remaining_time_budget = start + time_budget_approx - time.time()
            if remaining_time_budget < 0:
                break

            # Write timestamp to 'start.txt'
            write_timestamp(output_dir,
                            predict_idx=prediction_order_number,
                            timestamp=time.time())

            # Prediction files: adult.predict_0, adult.predict_1, ...
            filename_test = task_label + ".predict_" + str(
                prediction_order_number)
            # Write predictions to output_dir
            data_io.write(os.path.join(output_dir, filename_test), Y_pred)
            prediction_order_number += 1
            logger.info(
                "[+] {0:d} predictions made, time spent so far {1:.2f} sec".
                format(prediction_order_number,
                       time.time() - start))
            remaining_time_budget = start + time_budget_approx - time.time()
            logger.info(
                "[+] Time left {0:.2f} sec".format(remaining_time_budget))
    except Exception as e:
        # Deliberately not re-raised: the failure is recorded in end.txt and
        # the output is still copied to score_dir below.
        ingestion_success = False
        logger.info("Failed to run ingestion.")
        logger.error("Encountered exception:\n" + str(e), exc_info=True)

    # Finishing ingestion program
    end_time = time.time()
    overall_time_spent = end_time - start

    # Write overall_time_spent to a end.txt file
    end_filename = "end.txt"
    with open(os.path.join(output_dir, end_filename), "w") as f:
        f.write("ingestion_duration: " + str(overall_time_spent) + "\n")
        f.write("ingestion_success: " + str(int(ingestion_success)) + "\n")
        f.write("end_time: " + str(end_time) + "\n")
    logger.info("Wrote the file {} marking the end of ingestion.".format(
        end_filename))

    if ingestion_success:
        logger.info("[+] Done. Ingestion program successfully terminated.")
        logger.info("[+] Overall time spent %5.2f sec " % overall_time_spent)
    else:
        logger.info(
            "[-] Done, but encountered some errors during ingestion.")
        logger.info("[-] Overall time spent %5.2f sec " % overall_time_spent)

    # Copy all files in output_dir to score_dir
    # NOTE(review): the paths are interpolated unquoted into a shell command,
    # so directories containing spaces or shell metacharacters will not be
    # copied correctly.
    os.system("cp -R {} {}".format(os.path.join(output_dir, "*"), score_dir))
    logger.debug("Copied all ingestion output to scoring output directory.")

    logger.info("[Ingestion terminated]")
def _main(args):
    """Run the ingestion program for the single dataset in `args.dataset_dir`.

    Loads the dataset, builds the participant's `Model` under a 20-minute
    timer, then alternates `train` and `test` under a timer set to the time
    budget. Each prediction is reduced with `np.argmax(..., axis=1)` and
    written to `args.output_dir`. `end.txt` is always written, and the
    contents of `output_dir` are always copied to `args.score_dir`, even when
    an error is re-raised.

    Args:
        args: object exposing the attributes `dataset_dir`, `output_dir`,
            `ingestion_program_dir`, `code_dir`, `score_dir` and
            `time_budget`. The `time_budget` value is overridden by the
            dataset metadata entry `"time_budget"` when that entry exists.

    Raises:
        ValueError: if `dataset_dir` does not contain exactly one entry whose
            name ends with `.data`.
        Exception: any non-timeout error raised while creating the model or
            during the train/predict loop is logged and re-raised.
    """
    # Mark starting time of ingestion
    start = time.time()
    logger.info("=" * 5 + " Start ingestion program. ")

    #### Check whether everything went well
    ingestion_success = True

    dataset_dir = args.dataset_dir
    output_dir = args.output_dir
    ingestion_program_dir = args.ingestion_program_dir
    code_dir = args.code_dir
    score_dir = args.score_dir
    time_budget = args.time_budget

    if dataset_dir.endswith('run/input') and \
            code_dir.endswith('run/program'):
        logger.debug(
            "Since dataset_dir ends with 'run/input' and code_dir "
            "ends with 'run/program', suppose running on "
            "CodaLab platform. Modify dataset_dir to 'run/input_data' "
            "and code_dir to 'run/submission'. "
            "Directory parsing should be more flexible in the code of "
            "compute worker: we need explicit directories for "
            "dataset_dir and code_dir.")
        dataset_dir = dataset_dir.replace('run/input', 'run/input_data')
        code_dir = code_dir.replace('run/program', 'run/submission')

    # Show directories for debugging
    logger.debug("sys.argv = " + str(sys.argv))
    logger.debug("Using dataset_dir: " + dataset_dir)
    logger.debug("Using output_dir: " + output_dir)
    logger.debug("Using ingestion_program_dir: " + ingestion_program_dir)
    logger.debug("Using code_dir: " + code_dir)

    # Our libraries: extend the import path so that `data_io`, `dataset` and
    # the participant's `model` can be imported below.
    path.append(ingestion_program_dir)
    path.append(code_dir)
    # IG: to allow submitting the starting kit as sample submission
    path.append(code_dir + '/sample_code_submission')
    import data_io
    from dataset import AutoSpeechDataset  # THE class of AutoNLP datasets

    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(dataset_dir)
    #### Delete zip files and metadata file
    datanames = [x for x in datanames if x.endswith('.data')]

    if len(datanames) != 1:
        raise ValueError(
            "{} datasets found in dataset_dir={}!\n".format(
                len(datanames), dataset_dir) +
            "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]
    # `basename[:-5]` strips the trailing ".data" suffix.
    task_label = basename[:-5]

    D = AutoSpeechDataset(os.path.join(dataset_dir, basename))
    metadata = D.get_metadata()
    # The dataset's own budget, when present, wins over the argument.
    time_budget = metadata.get("time_budget", time_budget)
    logger.info("Time budget: {}".format(time_budget))

    write_start_file(output_dir,
                     start_time=start,
                     time_budget=time_budget,
                     task_name=basename.split('.')[0])

    logger.info("************************************************")
    logger.info("******** Processing dataset " + task_label.capitalize() +
                " ********")
    logger.info("************************************************")

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D.read_dataset()
    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D.get_test_num()
    output_dim = D.get_class_num()
    correct_prediction_shape = (num_examples_test, output_dim)

    try:
        # ========= Creating a model
        timer = Timer()
        # 20 min for participants to initializing and install other packages
        timer.set(20 * 60)
        with timer.time_limit("Importing model"):
            from model import Model  # in participants' model.py

        ##### Begin creating model #####
        logger.info("Creating model...")
        with timer.time_limit('Initialization'):
            M = Model(metadata)
        ###### End creating model ######
    except TimeoutException:
        # NOTE(review): if the timeout fires before `M` is assigned, the
        # train/predict phase below fails with a NameError on `M`, which is
        # then logged and re-raised by its own `except Exception` handler.
        logger.info(
            "[-] Initialization phase exceeded time budget. Move to train/predict phase"
        )
    except Exception as e:
        logger.info("Failed to initializing model.")
        logger.error("Encountered exception:\n" + str(e), exc_info=True)
        raise
    finally:
        # The train/predict phase lives in `finally`, so it (and the
        # end-of-run bookkeeping nested inside it) runs whatever the outcome
        # of model creation was.
        try:
            timer = Timer()
            timer.set(time_budget)

            # Check if the model has methods `train` and `test`.
            for attr in ['train', 'test']:
                if not hasattr(M, attr):
                    # The message names the missing method (the placeholder
                    # was previously left unformatted).
                    raise ModelApiError(
                        "Your model object doesn't have the method "
                        "`{}`. Please implement it in model.py.".format(attr))

            # Check if model.py uses new done_training API instead of marking
            # stopping by returning None
            use_done_training_api = hasattr(M, 'done_training')
            if not use_done_training_api:
                logger.warning(
                    "Your model object doesn't have an attribute "
                    "`done_training`. But this is necessary for ingestion "
                    "program to know whether the model has done training "
                    "and to decide whether to proceed more training. "
                    "Please add this attribute to your model.")

            # Keeping track of how many predictions are made
            prediction_order_number = 0

            # Start the CORE PART: train/predict process
            while not (use_done_training_api and M.done_training):
                # Train the model
                logger.info("Begin training the model...")
                remaining_time_budget = timer.remain
                with timer.time_limit('training'):
                    M.train(D.get_train(), remaining_time_budget=timer.remain)
                logger.info("Finished training the model.")

                # Make predictions using the trained model
                logger.info("Begin testing the model by making predictions " +
                            "on test set...")
                remaining_time_budget = timer.remain
                with timer.time_limit('predicting'):
                    Y_pred = M.test(
                        D.get_test(),
                        remaining_time_budget=remaining_time_budget)
                logger.info("Finished making predictions.")

                if Y_pred is None:
                    # Stop train/predict process if Y_pred is None
                    logger.info("The method model.test returned `None`. " +
                                "Stop train/predict process.")
                    break

                # Check if the prediction has good shape
                prediction_shape = tuple(Y_pred.shape)
                if prediction_shape != correct_prediction_shape:
                    raise BadPredictionShapeError(
                        "Bad prediction shape! Expected {} but got {}.".format(
                            correct_prediction_shape, prediction_shape))

                # Write timestamp to 'start.txt'
                write_timestamp(output_dir,
                                predict_idx=prediction_order_number,
                                timestamp=timer.exec)

                # Prediction files: adult.predict_0, adult.predict_1, ...
                filename_test = task_label + '.predict_' + \
                    str(prediction_order_number)

                # Write predictions to output_dir. Only the argmax over
                # axis 1 is stored, not the full Y_pred array.
                tmp_pred = np.argmax(Y_pred, axis=1)
                data_io.write(os.path.join(output_dir, filename_test),
                              tmp_pred)
                prediction_order_number += 1
                logger.info(
                    "[+] {0:d} predictions made, time spent so far {1:.2f} sec".
                    format(prediction_order_number,
                           time.time() - start))
                logger.info("[+] Time left {0:.2f} sec".format(timer.remain))
        except TimeoutException:
            # Running out of budget is not a failure: `ingestion_success`
            # stays True and earlier predictions remain on disk.
            logger.info(
                "[-] Ingestion program exceeded time budget. Predictions "
                "made so far will be used for evaluation.")
        except Exception as e:
            ingestion_success = False
            logger.info("Failed to run ingestion.")
            logger.error("Encountered exception:\n" + str(e), exc_info=True)
            raise
        finally:
            # Finishing ingestion program
            end_time = time.time()
            overall_time_spent = end_time - start

            # Write overall_time_spent to a end.txt file
            end_filename = 'end.txt'
            with open(os.path.join(output_dir, end_filename), 'w') as f:
                f.write('ingestion_duration: ' + str(overall_time_spent) +
                        '\n')
                f.write('ingestion_success: ' + str(int(ingestion_success)) +
                        '\n')
                f.write('end_time: ' + str(end_time) + '\n')
            logger.info("Wrote the file {} marking the end of ingestion.".
                        format(end_filename))

            if ingestion_success:
                logger.info(
                    "[+] Done. Ingestion program successfully terminated.")
                logger.info("[+] Overall time spent %5.2f sec " %
                            overall_time_spent)
            else:
                logger.info(
                    "[-] Done, but encountered some errors during ingestion."
                )
                logger.info("[-] Overall time spent %5.2f sec " %
                            overall_time_spent)

            # Copy all files in output_dir to score_dir
            # NOTE(review): the paths are interpolated unquoted into a shell
            # command, so directories containing spaces or shell
            # metacharacters will not be copied correctly.
            os.system("cp -R {} {}".format(os.path.join(output_dir, '*'),
                                           score_dir))
            logger.debug(
                "Copied all ingestion output to scoring output directory.")

            logger.info("[Ingestion terminated]")
codalab_run_dir = os.path.join(run_dir, "program") if os.path.isdir(codalab_run_dir): run_dir=codalab_run_dir running_on_codalab = True print "Running on Codalab!" lib_dir = os.path.join(run_dir, "lib") res_dir = os.path.join(run_dir, "res") # Our libraries path.append (run_dir) path.append (lib_dir) import data_io # general purpose input/output functions from data_io import vprint # print only in verbose mode from data_manager import DataManager # load/save data and get info about them datanames = data_io.inventory_data(default_input_dir) verbose = True debug_mode = 0 zipme = True max_time = 90 max_cycle = 1 execution_success = True the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M") submission_filename = '../automl_sample_submission_' + the_date overall_start = time.time() if len(datanames)>0: vprint( verbose, "************************************************************************") vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******") vprint( verbose, "************************************************************************")
codalab_run_dir = os.path.join(run_dir, "program") if os.path.isdir(codalab_run_dir): run_dir = codalab_run_dir running_on_codalab = True print "Running on Codalab!" lib_dir = os.path.join(run_dir, "lib") res_dir = os.path.join(run_dir, "res") # Our libraries path.append(run_dir) path.append(lib_dir) import data_io # general purpose input/output functions from data_io import vprint # print only in verbose mode from data_manager import DataManager # load/save data and get info about them datanames = data_io.inventory_data(default_input_dir) verbose = True debug_mode = 0 zipme = True max_time = 90 max_cycle = 1 execution_success = True the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M") submission_filename = '../automl_sample_submission_' + the_date overall_start = time.time() if len(datanames) > 0: vprint( verbose, "************************************************************************" )