def execute_validation(gpus, exp_folder, exp_alias, validation_datasets,
                       erase_bad_validations, restart_validations, suppress_output=True):
    """
    Args:
        gpus: The gpu being used for this execution.
        exp_folder: The folder where this experiment's configuration lives.
        exp_alias: The experiment alias, file name, to be executed.
        validation_datasets: Comma-separated validation dataset names to be validated,
                             deleted or restarted.
        erase_bad_validations: If True, erase wrong validation plotting summaries.
        restart_validations: If True, erase previous validation results so they are recomputed.
        suppress_output: If True, redirect the process output to a log file.

    Returns:
    """
    # Turn the comma-separated string into a list of dataset names.
    validation_datasets = validation_datasets.split(',')
    create_log_folder(exp_folder)
    create_exp_path(exp_folder, exp_alias)
    if erase_bad_validations:
        erase_wrong_plotting_summaries(exp_folder, validation_datasets)
    if restart_validations:
        erase_validations(exp_folder, validation_datasets)
    # Like training, validation runs in its own process; only the first dataset is validated here.
    p = multiprocessing.Process(target=validate.execute,
                                args=(gpus, exp_folder, exp_alias,
                                      validation_datasets[0], suppress_output))
    p.start()
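A minimal usage sketch for execute_validation, assuming it is exported from coil_core the same way execute_train is; the GPU, folder, alias and dataset names below are placeholders:

from coil_core import execute_validation  # assumed export, mirroring execute_train

if __name__ == '__main__':
    # validation_datasets is a comma-separated string of dataset names;
    # note that only the first dataset in the string is actually validated.
    execute_validation('0', 'sample', 'coil_icra', 'CoILVal1,CoILVal2',
                       erase_bad_validations=False, restart_validations=False)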
def test_basic_data(self):
    # Try to load the town01/town02 training data.
    g_conf.immutable(False)
    g_conf.EXPERIMENT_NAME = 'coil_icra'
    create_log_folder('sample')
    create_exp_path('sample', 'coil_icra')
    merge_with_yaml('configs/sample/coil_icra.yaml')
    set_type_of_process('train')

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], 'CoILTrain')
    dataset = CoILDataset(full_dataset, transform=None,
                          preload_name=str(g_conf.NUMBER_OF_HOURS)
                                       + 'hours_' + g_conf.TRAIN_DATASET_NAME)
def test_town3_data(self):
    # The town03 data has different names and does not have pedestrian or
    # vehicle-stop indications.
    g_conf.immutable(False)
    g_conf.EXPERIMENT_NAME = 'resnet34imnet'
    create_log_folder('town03')
    create_exp_path('town03', 'resnet34imnet')
    merge_with_yaml('configs/town03/resnet34imnet.yaml')
    set_type_of_process('train')

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], 'CoILTrainTown03')
    dataset = CoILDataset(full_dataset, transform=None,
                          preload_name=str(g_conf.NUMBER_OF_HOURS)
                                       + 'hours_' + g_conf.TRAIN_DATASET_NAME)
def execute_train(gpus, exp_folder, exp_alias, suppress_output=True, number_of_workers=12):
    """
    Args:
        gpus: The gpu being used for this execution.
        exp_folder: Folder name in configs.
        exp_alias: The experiment alias (yaml file).
        suppress_output: If True, redirect the process output to a log file.
        number_of_workers: Number of dataloader workers used for training.

    Returns:
    """
    create_log_folder(exp_folder)
    create_exp_path(exp_folder, exp_alias)

    p = multiprocessing.Process(target=train.execute,
                                args=(gpus, exp_folder, exp_alias,
                                      suppress_output, number_of_workers))
    p.start()
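A short launch sketch, reusing the 'sample'/'coil_icra' configuration from the tests above; passing number_of_workers as a keyword keeps it from being bound to suppress_output by mistake:

from coil_core import execute_train

if __name__ == '__main__':
    # Train 'coil_icra' from configs/sample on GPU 0 with 12 dataloader workers.
    execute_train('0', 'sample', 'coil_icra', number_of_workers=12)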
def execute_drive(gpus, exp_folder, exp_alias, exp_set_name, suppress_output, docker,
                  record_collisions, no_screen):
    """
    Args:
        gpus: The gpu being used for this execution.
        exp_folder: The folder where this driving experiment's configuration lives.
        exp_alias: The experiment alias, file name, to be executed.
        exp_set_name: The driving environment (Suite_Town) to drive on.
        suppress_output: If True, redirect the process output to a log file.
        docker: The docker image used to run the CARLA server.
        record_collisions: If True, record the collisions that happen while driving.
        no_screen: If True, run CARLA without rendering to a screen.

    Returns:
    """
    create_log_folder(exp_folder)
    create_exp_path(exp_folder, exp_alias)

    p = multiprocessing.Process(target=run_drive.execute,
                                args=(gpus, exp_folder, exp_alias, exp_set_name,
                                      suppress_output, docker, record_collisions, no_screen))
    p.start()
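A hedged driving sketch; the Suite_Town environment name and docker image below are placeholders, chosen only to match the Suite_Town format checked by the entry script:

from coil_core import execute_drive  # assumed export, mirroring execute_train

if __name__ == '__main__':
    # Drive the 'coil_icra' model from configs/sample on GPU 0.
    # 'NocrashTraining_Town01' and 'carlasim/carla:0.8.4' are illustrative values.
    execute_drive('0', 'sample', 'coil_icra', 'NocrashTraining_Town01',
                  suppress_output=True, docker='carlasim/carla:0.8.4',
                  record_collisions=True, no_screen=True)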
# Check if the mandatory folder argument is passed
if args.folder is None:
    raise ValueError("You should set a folder name where the experiments are placed")

# Check if the driving parameters are passed in a correct way
if args.driving_environments is not None:
    for de in list(args.driving_environments):
        if len(de.split('_')) < 2:
            raise ValueError("Invalid format for the driving environments; it should be Suite_Town")

# This is the folder creation of the logs
create_log_folder(args.folder)
erase_logs(args.folder)

if args.erase_bad_validations:
    erase_wrong_plotting_summaries(args.folder, list(args.validation_datasets))
if args.restart_validations:
    erase_validations(args.folder, list(args.validation_datasets))

# The definition of parameters for driving
drive_params = {
    "suppress_output": True,
    "no_screen": args.no_screen,
    "docker": args.docker,
    "record_collisions": args.record_collisions
}
for gpu in args.gpus:
    try:
        int(gpu)
    except ValueError:
        # Reraise a meaningful error.
        raise ValueError("GPU is not a valid int number")

# There are two modes of execution
if args.single_process is not None:
    if args.single_process in ['train', 'validation']:
        # Check if the mandatory folder argument is passed
        if args.folder is None:
            raise ValueError("You should set a folder name where the experiments are placed")
        # This is the folder creation of the logs
        create_log_folder(args.folder)
        if args.exp is None:
            raise ValueError("You should set the exp alias")

        # The definition of the pre-trained encoder model used for training affordances:
        # either all three encoder arguments are set, or none of them.
        if args.encoder_checkpoint and args.encoder_folder and args.encoder_exp:
            encoder_params = {'encoder_checkpoint': args.encoder_checkpoint,
                              'encoder_folder': args.encoder_folder,
                              'encoder_exp': args.encoder_exp}
        elif all(v is None for v in [args.encoder_checkpoint, args.encoder_folder,
                                     args.encoder_exp]):
            encoder_params = None
        else:
            # Mixed settings are not allowed.
            raise ValueError("You should set all of the encoder parameters "
                             "(checkpoint, folder and exp) or none of them")
def folder_execute(exp_folder, exp_set_name, gpus, validation_datasets, driving_environments,
                   is_training, number_of_workers, suppress_output, docker, record_collisions,
                   no_screen, erase_bad_validations, restart_validations):
    """
    Execute a folder of experiments. It runs the training and all the selected
    evaluations for each of the models present in the folder.

    Args:
        exp_folder: the folder where all the experiment configuration files are
        exp_set_name: the name of this experiment set
        gpus: the gpu numbers that are going to be allocated for the experiments
        validation_datasets: the validation datasets that are going to be validated
            per experiment
        driving_environments: the driving environments where the models are going
            to be tested
        is_training: whether training processes should be scheduled
        number_of_workers: number of dataloader workers used per training process
        suppress_output: if True, redirect each process' output to a log file
        docker: the docker image used to run the CARLA server
        record_collisions: if True, record the collisions that happen while driving
        no_screen: if True, run CARLA without rendering to a screen
        erase_bad_validations: if True, erase wrong validation plotting summaries
        restart_validations: if True, erase previous validation results so they are recomputed
    """
    # We set by default that each gpu has a value of 3.5, allowing one training and one
    # driving/validation; depending on the gpu value, more or fewer experiments will be
    # allocated per GPU.
    allocation_parameters = {'gpu_value': 3.5,
                             'train_cost': 1.5,
                             'validation_cost': 1.0,
                             'drive_cost': 1.5}

    create_log_folder(exp_folder)

    experiments_list = os.listdir(os.path.join('configs', exp_folder))
    experiments_list = [experiment.split('.')[-2] for experiment in experiments_list]

    allocated_gpus = {gpu: allocation_parameters['gpu_value'] for gpu in gpus}

    executing_processes = []

    free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
        allocated_gpus, executing_processes, allocation_parameters)

    # A queue of tasks to be executed. The priority is always train, then test (drive),
    # then validation.
    tasks_queue = mount_experiment_heap(exp_folder, experiments_list, is_training,
                                        [], [], validation_datasets, driving_environments)

    # No process is executing right now.
    while True:
        # Tasks that are neither done nor executing stay in the queue; execute them
        # while the most free GPU still has enough resources for the cheapest task.
        while resources_on_most_free_gpu >= min([allocation_parameters['train_cost'],
                                                 allocation_parameters['validation_cost'],
                                                 allocation_parameters['drive_cost']]) \
                and tasks_queue != []:
            # Pop the highest-priority task and try to allocate a gpu for it.
            popped_thing = heapq.heappop(tasks_queue)
            process_specs = popped_thing[2]  # To get directly the dict

            # Get the train status, which affects whether a validation or drive
            # process can be scheduled.
            train_status = monitorer.get_status(exp_folder, process_specs['experiment'],
                                                'train')[0]

            # ADD TRAIN TO EXECUTE
            if process_specs['type'] == 'train' and \
                    resources_on_most_free_gpu >= allocation_parameters['train_cost']:
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['train_cost'])
                execute_train(gpu_number, process_specs['folder'], process_specs['experiment'],
                              number_of_workers=number_of_workers)
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

            # ADD DRIVE TO EXECUTE
            elif process_specs['type'] == 'drive' and \
                    resources_on_most_free_gpu >= allocation_parameters['drive_cost'] and \
                    train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['drive_cost'])
                execute_drive(gpu_number, process_specs['folder'], process_specs['experiment'],
                              process_specs['environment'], suppress_output, docker,
                              record_collisions, no_screen)
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

            # ADD VALIDATION TO EXECUTE
            elif process_specs['type'] == 'validation' and \
                    resources_on_most_free_gpu >= allocation_parameters['validation_cost'] and \
                    train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['validation_cost'])
                execute_validation(gpu_number, process_specs['folder'],
                                   process_specs['experiment'],
                                   process_specs['dataset'],  # dataset assumed to be stored in the process specs
                                   erase_bad_validations, restart_validations, suppress_output)
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

        # Rebuild the task queue, excluding tasks that are done or currently executing.
        tasks_queue = mount_experiment_heap(exp_folder, experiments_list, is_training,
                                            executing_processes, tasks_queue,
                                            validation_datasets, driving_environments, False)

        printer.plot_folder_summaries(exp_folder, is_training, validation_datasets,
                                      driving_environments)

        # Check the allocated processes and see which ones have finished.
        if len(tasks_queue) == 0 and len(executing_processes) == 0:
            break

        free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
            allocated_gpus, executing_processes, allocation_parameters)
        time.sleep(10)

    print("ALL EXPERIMENTS EXECUTED")
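A small, self-contained sketch of the resource accounting used by folder_execute: with the default gpu_value of 3.5, one GPU can hold a training (1.5) plus one drive (1.5) or one validation (1.0), but not all three. The fits helper below is illustrative and not part of the repo:

# Illustrative bookkeeping only; this is not the repo's allocate_gpu_resources().
costs = {'train': 1.5, 'drive': 1.5, 'validation': 1.0}
gpu_capacity = 3.5  # same default as allocation_parameters['gpu_value']

def fits(running, task):
    """Return True if `task` still fits on a GPU already running `running` tasks."""
    used = sum(costs[t] for t in running)
    return gpu_capacity - used >= costs[task]

print(fits(['train'], 'drive'))                # True:  1.5 + 1.5 <= 3.5
print(fits(['train'], 'validation'))           # True:  1.5 + 1.0 <= 3.5
print(fits(['train', 'drive'], 'validation'))  # False: only 0.5 capacity left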
from coil_core import execute_train
from coilutils.general import create_log_folder, create_exp_path, erase_logs

if __name__ == '__main__':
    folder = 'cvpr'
    exp = 'img_gtseg_camv_control'

    create_log_folder(folder)
    erase_logs(folder)
    create_exp_path(folder, exp)

    execute_train('0', folder, exp)
    print("SUCCESSFULLY RAN TRAINING")