def read_data(self):
    """Read and return training/test set data as `AutoDLDataset` objects,
    along with other information.

    The dataset directory is taken from `self.dataset_dir`, a string
    indicating the absolute or relative path of a formatted AutoDL dataset.

    Returns:
      d_train, d_test: two `AutoDLDataset` objects containing training/test
        data.
      other_info: a dict containing additional info on the dataset, e.g.
        the metadata on the column names and class names (contained in
        `label_to_index_map`).
    """
    dataset_dir = self.dataset_dir
    files = os.listdir(dataset_dir)
    data_files = [x for x in files if x.endswith('.data')]
    assert len(data_files) == 1
    dataset_name = data_files[0][:-5]
    solution_files = [x for x in files if x.endswith('.solution')]
    with_solution = None  # With or without solution (i.e. training or test)
    if len(solution_files) == 1:
        solution_dataset_name = solution_files[0][:-9]
        if solution_dataset_name == dataset_name:
            with_solution = True
        else:
            raise ValueError("Wrong dataset name. Should be {} but got {}."
                             .format(dataset_name, solution_dataset_name))
    elif not solution_files:
        with_solution = False
    else:
        raise ValueError(
            "Multiple solution files found: {}".format(solution_files))
    d_train = AutoDLDataset(
        os.path.join(dataset_dir, dataset_name + '.data', "train"))
    d_test = AutoDLDataset(
        os.path.join(dataset_dir, dataset_name + '.data', "test"))
    other_info = {}
    other_info['dataset_name'] = dataset_name
    other_info['with_solution'] = with_solution
    label_to_index_map = d_train.get_metadata().get_label_to_index_map()
    if label_to_index_map:
        classes_list = [None] * len(label_to_index_map)
        for label in label_to_index_map:
            index = label_to_index_map[label]
            classes_list[index] = label
        other_info['classes_list'] = classes_list
    else:
        tf.logging.info("No label_to_index_map found in metadata. Labels "
                        "will only be represented by integers.")
    self.d_train, self.d_test, self.other_info = d_train, d_test, other_info
    if with_solution:
        solution_path = os.path.join(dataset_dir, solution_files[0])
        self.other_info['Y_test'] = np.loadtxt(solution_path)
    return d_train, d_test, other_info
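# --- Illustrative usage sketch (not from the original source). Assuming
# `read_data` is a method of a dataset-manager class exposing a
# `dataset_dir` attribute (the class name `DataManager` below is
# hypothetical), it could be used like this:
#
#     dm = DataManager()                     # hypothetical owner class
#     dm.dataset_dir = 'path/to/dataset'     # formatted AutoDL dataset dir
#     d_train, d_test, other_info = dm.read_data()
#     print(other_info['dataset_name'], other_info['with_solution'])
#     if 'classes_list' in other_info:       # present iff the metadata has
#         print(other_info['classes_list'])  # a label_to_index_map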
basename = datanames[0]

logger.info("************************************************")
logger.info("******** Processing dataset " + basename[:-5].capitalize() +
            " ********")
logger.info("************************************************")
logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION))

##### Begin creating training set and test set #####
logger.info("Reading training set and test set...")
D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train"))
D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test"))
##### End creating training set and test set #####

## Get correct prediction shape
num_examples_test = D_test.get_metadata().size()
output_dim = D_test.get_metadata().get_output_size()
correct_prediction_shape = (num_examples_test, output_dim)

# 20 min for participants to initialize and install other packages
try:
    init_time_budget = 20 * 60  # time budget for initialization
    timer = Timer()
    timer.set(init_time_budget)
    with timer.time_limit("Initialization"):
        ##### Begin creating model #####
        logger.info("Creating model...this process should not exceed 20min.")
        from model import Model  # in participants' model.py
        # The metadata of D_train and D_test only differ in sample_count
        M = Model(D_train.get_metadata())
        ###### End creating model ######
except TimeoutException as e:
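# --- Illustrative sketch (not part of the original source): one way the
# `Timer`/`time_limit`/`TimeoutException` trio used above could be
# implemented, assuming a Unix SIGALRM-based timeout. The actual helper
# classes shipped with the ingestion program may differ.
import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    """Raised when a timed code block exceeds its budget."""

class Timer:
    def __init__(self):
        self.duration = 0  # budget in seconds

    def set(self, duration):
        self.duration = duration

    @contextmanager
    def time_limit(self, name):
        def handler(signum, frame):
            raise TimeoutException("Timed out during: " + name)
        old_handler = signal.signal(signal.SIGALRM, handler)
        signal.alarm(int(self.duration))  # schedule SIGALRM after `duration` sec
        try:
            yield
        finally:
            signal.alarm(0)  # cancel the pending alarm
            signal.signal(signal.SIGALRM, old_handler)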
logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION)) # Write solution files in output_dir copy_solution(dataset_dir, output_dir) ##### Begin creating training set and test set ##### logger.info("Reading training set and test set...") D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train")) D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test")) D_predict_data = None if os.path.exists(os.path.join(dataset_dir, basename, "unlabelled")): logger.info("Found some data to predict...") D_predict = AutoDLDataset( os.path.join(dataset_dir, basename, "unlabelled")) num_examples_predict = D_predict.get_metadata().size() D_predict_data = D_predict.get_dataset() ##### End creating training set and test set ##### ## Get correct prediction shape num_examples_test = D_test.get_metadata().size() output_dim = D_test.get_metadata().get_output_size() correct_prediction_shape = (num_examples_test, output_dim) # 20 min for participants to initializing and install other packages try: init_time_budget = 20 * 60 # time budget for initilization. timer = Timer() timer.set(init_time_budget) with timer.time_limit("Initialization"):
def ingestion_fn(dataset_dir,
                 code_dir,
                 time_budget,
                 time_budget_approx,
                 output_dir,
                 score_dir,
                 model_config_name=None,
                 model_config=None):
    #### Check whether everything went well
    ingestion_success = True

    # Parse directories
    root_dir = _HERE(os.pardir)
    ingestion_program_dir = join(root_dir, "ingestion_program")

    if dataset_dir.endswith("run/input") and code_dir.endswith("run/program"):
        logger.debug(
            "Since dataset_dir ends with 'run/input' and code_dir "
            "ends with 'run/program', suppose running on "
            "CodaLab platform. Modify dataset_dir to 'run/input_data' "
            "and code_dir to 'run/submission'. "
            "Directory parsing should be more flexible in the code of "
            "compute worker: we need explicit directories for "
            "dataset_dir and code_dir.")
        dataset_dir = dataset_dir.replace("run/input", "run/input_data")
        code_dir = code_dir.replace("run/program", "run/submission")

    # Show directories for debugging
    logger.debug("sys.argv = " + str(sys.argv))
    logger.debug("Using dataset_dir: " + dataset_dir)
    logger.debug("Using output_dir: " + output_dir)
    logger.debug("Using ingestion_program_dir: " + ingestion_program_dir)
    logger.debug("Using code_dir: " + code_dir)

    # Our libraries
    path.append(ingestion_program_dir)
    path.append(code_dir)
    # IG: to allow submitting the starting kit as sample submission
    path.append(code_dir + "/sample_code_submission")
    import data_io
    from dataset import AutoDLDataset  # THE class of AutoDL datasets

    data_io.mkdir(output_dir)

    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(dataset_dir)
    #### Delete zip files and metadata file
    datanames = [x for x in datanames if x.endswith(".data")]

    if len(datanames) != 1:
        raise ValueError("{} datasets found in dataset_dir={}!\n".format(
            len(datanames), dataset_dir) +
            "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]

    logger.info("************************************************")
    logger.info("******** Processing dataset " + basename[:-5].capitalize() +
                " ********")
    logger.info("************************************************")
    logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION))

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train"))
    D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test"))
    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D_test.get_metadata().size()
    output_dim = D_test.get_metadata().get_output_size()
    correct_prediction_shape = (num_examples_test, output_dim)

    # 20 min for participants to initialize and install other packages
    # try:
    #     init_time_budget = 20 * 60  # time budget for initialization
    #     timer = Timer()
    #     timer.set(init_time_budget)
    #     with timer.time_limit("Initialization"):

    ##### Begin creating model #####
    logger.info("Creating model...this process should not exceed 20min.")
    from model import Model  # in participants' model.py
    # The metadata of D_train and D_test only differ in sample_count
    M = Model(D_train.get_metadata(),
              model_config_name=model_config_name,
              model_config=model_config)
    ###### End creating model ######

    # except TimeoutException as e:
    #     logger.info("[-] Initialization phase exceeded time budget. "
    #                 "Move to train/predict phase")
    # except Exception as e:
    #     logger.error("Failed to initialize model.")
    #     logger.error("Encountered exception:\n" + str(e), exc_info=True)

    # Mark starting time of ingestion
    start = time.time()
    logger.info("=" * 5 + " Start core part of ingestion program. " +
                "Version: {} ".format(VERSION) + "=" * 5)

    write_start_file(output_dir,
                     start_time=start,
                     time_budget=time_budget,
                     task_name=basename.split(".")[0])

    try:
        # Check if the model has methods `train` and `test`.
        for attr in ["train", "test"]:
            if not hasattr(M, attr):
                raise ModelApiError("Your model object doesn't have the "
                                    "method `{}`. Please implement it in "
                                    "model.py.".format(attr))

        # Check if model.py uses the new done_training API instead of marking
        # stopping by returning None
        use_done_training_api = hasattr(M, "done_training")
        if not use_done_training_api:
            logger.warning("Your model object doesn't have an attribute "
                           "`done_training`. But this is necessary for the "
                           "ingestion program to know whether the model has "
                           "done training and to decide whether to proceed "
                           "with more training. Please add this attribute "
                           "to your model.")

        # Keeping track of how many predictions are made
        prediction_order_number = 0

        # Start the CORE PART: train/predict process
        while not (use_done_training_api and M.done_training):
            remaining_time_budget = start + time_budget - time.time()
            # Train the model
            logger.info("Begin training the model...")
            M.train(D_train.get_dataset(),
                    remaining_time_budget=remaining_time_budget)
            logger.info("Finished training the model.")
            # Make predictions using the trained model
            logger.info("Begin testing the model by making predictions "
                        "on test set...")
            remaining_time_budget = start + time_budget - time.time()
            Y_pred = M.test(D_test.get_dataset(),
                            remaining_time_budget=remaining_time_budget)
            logger.info("Finished making predictions.")

            if Y_pred is None:  # Stop train/predict process if Y_pred is None
                logger.info("The method model.test returned `None`. "
                            "Stop train/predict process.")
                break
            else:  # Check if the prediction has the correct shape
                prediction_shape = tuple(Y_pred.shape)
                if prediction_shape != correct_prediction_shape:
                    raise BadPredictionShapeError(
                        "Bad prediction shape! Expected {} but got {}."
                        .format(correct_prediction_shape, prediction_shape))

            remaining_time_budget = start + time_budget_approx - time.time()
            if remaining_time_budget < 0:
                break

            # Write timestamp to 'start.txt'
            write_timestamp(output_dir,
                            predict_idx=prediction_order_number,
                            timestamp=time.time())
            # Prediction files: adult.predict_0, adult.predict_1, ...
            filename_test = basename[:-5] + ".predict_" + \
                str(prediction_order_number)
            # Write predictions to output_dir
            data_io.write(os.path.join(output_dir, filename_test), Y_pred)
            prediction_order_number += 1
            logger.info("[+] {0:d} predictions made, time spent so far "
                        "{1:.2f} sec".format(prediction_order_number,
                                             time.time() - start))
            remaining_time_budget = start + time_budget_approx - time.time()
            logger.info(
                "[+] Time left {0:.2f} sec".format(remaining_time_budget))
    except Exception as e:
        ingestion_success = False
        logger.info("Failed to run ingestion.")
        logger.error("Encountered exception:\n" + str(e), exc_info=True)

    # Finishing ingestion program
    end_time = time.time()
    overall_time_spent = end_time - start

    # Write overall_time_spent to an end.txt file
    end_filename = "end.txt"
    with open(os.path.join(output_dir, end_filename), "w") as f:
        f.write("ingestion_duration: " + str(overall_time_spent) + "\n")
        f.write("ingestion_success: " + str(int(ingestion_success)) + "\n")
        f.write("end_time: " + str(end_time) + "\n")
    logger.info("Wrote the file {} marking the end of ingestion."
                .format(end_filename))

    if ingestion_success:
        logger.info("[+] Done. Ingestion program successfully terminated.")
        logger.info("[+] Overall time spent %5.2f sec " % overall_time_spent)
    else:
        logger.info("[-] Done, but encountered some errors during ingestion.")
        logger.info("[-] Overall time spent %5.2f sec " % overall_time_spent)

    # Copy all files in output_dir to score_dir
    os.system("cp -R {} {}".format(os.path.join(output_dir, "*"), score_dir))
    logger.debug("Copied all ingestion output to scoring output directory.")

    logger.info("[Ingestion terminated]")
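# --- Illustrative sketch (not from the original source): the custom
# exceptions raised in `ingestion_fn` above are assumed to be plain
# Exception subclasses along these lines:
class ModelApiError(Exception):
    """Raised when the participant's model lacks a required method."""

class BadPredictionShapeError(Exception):
    """Raised when Y_pred's shape differs from (num_examples_test, output_dim)."""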
##### Begin creating training set and test set #####
D_train = AutoDLDataset(os.path.join(input_dir, basename, "train"))
D_test = AutoDLDataset(os.path.join(input_dir, basename, "test"))
##### End creating training set and test set #####

# ======== Keep track of time
if debug_mode < 1:
    time_budget = get_time_budget(D_train)  # <== HERE IS THE TIME BUDGET!
else:
    time_budget = max_time

# ========= Creating a model
print_log("Creating model...")
##### Begin creating model #####
# The metadata of D_train and D_test only differ in sample_count
M = Model(D_train.get_metadata())
###### End creating model ######

# Keeping track of how many predictions are made
prediction_order_number = 0

# Start the CORE PART: train/predict process
start = time.time()
while True:
    remaining_time_budget = start + time_budget - time.time()
    print_log("Training the model...")
    # Train the model
    M.train(D_train.get_dataset(),
            remaining_time_budget=remaining_time_budget)
    remaining_time_budget = start + time_budget - time.time()
    # Make predictions using the trained model
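# --- Illustrative sketch (not from the original source): the minimal
# surface a participant's model.py must expose to satisfy the API checks
# in `ingestion_fn` above, i.e. a `train` method, a `test` method, and a
# `done_training` attribute. The constant-prediction logic is a placeholder,
# not a real baseline; counting examples by iterating over the dataset
# assumes an eager-mode (TF2-style) tf.data.Dataset.
import numpy as np

class Model:
    def __init__(self, metadata, model_config_name=None, model_config=None):
        self.metadata = metadata
        self.output_dim = metadata.get_output_size()
        self.done_training = False  # ingestion stops looping once this is True

    def train(self, dataset, remaining_time_budget=None):
        # A real model would iterate over `dataset` here and set
        # self.done_training = True once it decides to stop training.
        self.done_training = True

    def test(self, dataset, remaining_time_budget=None):
        # Must return an array of shape (num_examples_test, output_dim),
        # or None to stop the train/predict loop.
        num_examples = sum(1 for _ in dataset)  # count test examples
        return np.zeros((num_examples, self.output_dim))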