def news_baseline_test(): dataset = AutoDLDataset('./tweet.data/train/') dataset.init() iterator = dataset.get_dataset().make_one_shot_iterator() next_element = iterator.get_next() # features, labels = next_element # # features = features.eval() # labels = labels.eval() # # print next_element data = [] sess = tf.Session() for idx in range(10): print("Example " + str(idx)) data.append(sess.run(next_element)) for each_data in data: print each_data
if not use_done_training_api: logger.warning("Your model object doesn't have an attribute " + "`done_training`. But this is necessary for ingestion " + "program to know whether the model has done training " + "and to decide whether to proceed more training. " + "Please add this attribute to your model.") # Keeping track of how many predictions are made prediction_order_number = 0 # Start the CORE PART: train/predict process while(not (use_done_training_api and M.done_training)): remaining_time_budget = start + time_budget - time.time() # Train the model logger.info("Begin training the model...") M.train(D_train.get_dataset(), remaining_time_budget=remaining_time_budget) logger.info("Finished training the model.") remaining_time_budget = start + time_budget - time.time() # Make predictions using the trained model logger.info("Begin testing the model by making predictions " + "on test set...") Y_pred = M.test(D_test.get_dataset(), remaining_time_budget=remaining_time_budget) logger.info("Finished making predictions.") if Y_pred is None: # Stop train/predict process if Y_pred is None logger.info("The method model.test returned `None`. " + "Stop train/predict process.") break else: # Check if the prediction has good shape prediction_shape = tuple(Y_pred.shape)
# Write solution files in output_dir copy_solution(dataset_dir, output_dir) ##### Begin creating training set and test set ##### logger.info("Reading training set and test set...") D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train")) D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test")) D_predict_data = None if os.path.exists(os.path.join(dataset_dir, basename, "unlabelled")): logger.info("Found some data to predict...") D_predict = AutoDLDataset( os.path.join(dataset_dir, basename, "unlabelled")) num_examples_predict = D_predict.get_metadata().size() D_predict_data = D_predict.get_dataset() ##### End creating training set and test set ##### ## Get correct prediction shape num_examples_test = D_test.get_metadata().size() output_dim = D_test.get_metadata().get_output_size() correct_prediction_shape = (num_examples_test, output_dim) # 20 min for participants to initializing and install other packages try: init_time_budget = 20 * 60 # time budget for initilization. timer = Timer() timer.set(init_time_budget) with timer.time_limit("Initialization"): ##### Begin creating model #####
def ingestion_fn(dataset_dir, code_dir, time_budget, time_budget_approx, output_dir, score_dir, model_config_name=None, model_config=None): #### Check whether everything went well ingestion_success = True # Parse directories root_dir = _HERE(os.pardir) ingestion_program_dir = join(root_dir, "ingestion_program") if dataset_dir.endswith("run/input") and code_dir.endswith("run/program"): logger.debug( "Since dataset_dir ends with 'run/input' and code_dir " "ends with 'run/program', suppose running on " + "CodaLab platform. Modify dataset_dir to 'run/input_data' " "and code_dir to 'run/submission'. " + "Directory parsing should be more flexible in the code of " + "compute worker: we need explicit directories for " + "dataset_dir and code_dir.") dataset_dir = dataset_dir.replace("run/input", "run/input_data") code_dir = code_dir.replace("run/program", "run/submission") # Show directories for debugging logger.debug("sys.argv = " + str(sys.argv)) logger.debug("Using dataset_dir: " + dataset_dir) logger.debug("Using output_dir: " + output_dir) logger.debug("Using ingestion_program_dir: " + ingestion_program_dir) logger.debug("Using code_dir: " + code_dir) # Our libraries path.append(ingestion_program_dir) path.append(code_dir) # IG: to allow submitting the starting kit as sample submission path.append(code_dir + "/sample_code_submission") import data_io from dataset import AutoDLDataset # THE class of AutoDL datasets data_io.mkdir(output_dir) #### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(dataset_dir) #### Delete zip files and metadata file datanames = [x for x in datanames if x.endswith(".data")] if len(datanames) != 1: raise ValueError("{} datasets found in dataset_dir={}!\n".format( len(datanames), dataset_dir) + "Please put only ONE dataset under dataset_dir.") basename = datanames[0] logger.info("************************************************") logger.info("******** Processing dataset " + basename[:-5].capitalize() + " ********") logger.info("************************************************") logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION)) ##### Begin creating training set and test set ##### logger.info("Reading training set and test set...") D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train")) D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test")) ##### End creating training set and test set ##### ## Get correct prediction shape num_examples_test = D_test.get_metadata().size() output_dim = D_test.get_metadata().get_output_size() correct_prediction_shape = (num_examples_test, output_dim) # 20 min for participants to initializing and install other packages # try: # init_time_budget = 20 * 60 # time budget for initilization. # timer = Timer() # timer.set(init_time_budget) # with timer.time_limit("Initialization"): ##### Begin creating model ##### logger.info("Creating model...this process should not exceed 20min.") from model import Model # in participants' model.py # The metadata of D_train and D_test only differ in sample_count M = Model(D_train.get_metadata(), model_config_name=model_config_name, model_config=model_config) ###### End creating model ###### # except TimeoutException as e: # logger.info("[-] Initialization phase exceeded time budget. Move to train/predict phase") # except Exception as e: # logger.error("Failed to initializing model.") # logger.error("Encountered exception:\n" + str(e), exc_info=True) # # Mark starting time of ingestion start = time.time() logger.info("=" * 5 + " Start core part of ingestion program. " + "Version: {} ".format(VERSION) + "=" * 5) write_start_file(output_dir, start_time=start, time_budget=time_budget, task_name=basename.split(".")[0]) try: # Check if the model has methods `train` and `test`. for attr in ["train", "test"]: if not hasattr(M, attr): raise ModelApiError( "Your model object doesn't have the method " + "`{}`. Please implement it in model.py.") # Check if model.py uses new done_training API instead of marking # stopping by returning None use_done_training_api = hasattr(M, "done_training") if not use_done_training_api: logger.warning( "Your model object doesn't have an attribute " + "`done_training`. But this is necessary for ingestion " + "program to know whether the model has done training " + "and to decide whether to proceed more training. " + "Please add this attribute to your model.") # Keeping track of how many predictions are made prediction_order_number = 0 # Start the CORE PART: train/predict process while not (use_done_training_api and M.done_training): remaining_time_budget = start + time_budget - time.time() # Train the model logger.info("Begin training the model...") M.train(D_train.get_dataset(), remaining_time_budget=remaining_time_budget) logger.info("Finished training the model.") # Make predictions using the trained model logger.info("Begin testing the model by making predictions " + "on test set...") remaining_time_budget = start + time_budget - time.time() Y_pred = M.test(D_test.get_dataset(), remaining_time_budget=remaining_time_budget) logger.info("Finished making predictions.") if Y_pred is None: # Stop train/predict process if Y_pred is None logger.info("The method model.test returned `None`. " + "Stop train/predict process.") break else: # Check if the prediction has good shape prediction_shape = tuple(Y_pred.shape) if prediction_shape != correct_prediction_shape: raise BadPredictionShapeError( "Bad prediction shape! Expected {} but got {}.".format( correct_prediction_shape, prediction_shape)) remaining_time_budget = start + time_budget_approx - time.time() if remaining_time_budget < 0: break # Write timestamp to 'start.txt' write_timestamp(output_dir, predict_idx=prediction_order_number, timestamp=time.time()) # Prediction files: adult.predict_0, adult.predict_1, ... filename_test = basename[:-5] + ".predict_" + str( prediction_order_number) # Write predictions to output_dir data_io.write(os.path.join(output_dir, filename_test), Y_pred) prediction_order_number += 1 logger.info( "[+] {0:d} predictions made, time spent so far {1:.2f} sec". format(prediction_order_number, time.time() - start)) remaining_time_budget = start + time_budget_approx - time.time() logger.info( "[+] Time left {0:.2f} sec".format(remaining_time_budget)) except Exception as e: ingestion_success = False logger.info("Failed to run ingestion.") logger.error("Encountered exception:\n" + str(e), exc_info=True) # Finishing ingestion program end_time = time.time() overall_time_spent = end_time - start # Write overall_time_spent to a end.txt file end_filename = "end.txt" with open(os.path.join(output_dir, end_filename), "w") as f: f.write("ingestion_duration: " + str(overall_time_spent) + "\n") f.write("ingestion_success: " + str(int(ingestion_success)) + "\n") f.write("end_time: " + str(end_time) + "\n") logger.info("Wrote the file {} marking the end of ingestion.".format( end_filename)) if ingestion_success: logger.info("[+] Done. Ingestion program successfully terminated.") logger.info("[+] Overall time spent %5.2f sec " % overall_time_spent) else: logger.info( "[-] Done, but encountered some errors during ingestion.") logger.info("[-] Overall time spent %5.2f sec " % overall_time_spent) # Copy all files in output_dir to score_dir os.system("cp -R {} {}".format(os.path.join(output_dir, "*"), score_dir)) logger.debug("Copied all ingestion output to scoring output directory.") logger.info("[Ingestion terminated]")