import argparse
import heapq
import os
import time

# Project-specific helpers (printer, monitorer, execute_train, execute_validation,
# execute_drive, mount_experiment_heap, get_gpu_resources, allocate_gpu_resources,
# get_validation_datasets, get_driving_environments) are assumed to be imported
# elsewhere in this module.


if __name__ == '__main__':

    argparser = argparse.ArgumentParser(description=__doc__)
    argparser.add_argument(
        '--check-status',
        action='store_true',
        dest='check_status',
    )
    argparser.add_argument('--folder', default='eccv', type=str)
    argparser.add_argument('--erase-experiments', nargs='+', dest='erase_experiments', type=str)

    args = argparser.parse_args()

    # Note: the allocation costs are fixed parameters describing how much GPU
    # capacity a validation, a training, and a driving process each occupy.

    if args.check_status:
        validation_datasets = get_validation_datasets(args.folder)
        drive_environments = get_driving_environments(args.folder)
        printer.plot_folder_summaries(args.folder, True, validation_datasets,
                                      drive_environments, verbose=False)

    if args.erase_experiments:
        # Placeholder: erasing the given experiments is not implemented in this snippet.
        pass
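# A minimal sketch (not part of the original script) of how the parsed arguments
# could be assembled into the `params` dictionary that folder_execute() below expects.
# The GPU ids, worker count, driving parameters, and the concrete gpu_value /
# per-task costs are illustrative assumptions, not values taken from this file.
def _build_folder_execute_params(args):
    # Fixed allocation costs: the capacity a training, validation, or driving
    # process is assumed to take up on a single GPU.
    allocation_parameters = {'gpu_value': 3.5,
                             'train_cost': 1.5,
                             'validation_cost': 1.0,
                             'drive_cost': 1.5}
    return {
        'folder': args.folder,
        'gpus': ['0'],                      # hypothetical GPU ids; this argparser does not expose them
        'is_training': True,
        'number_of_workers': 12,            # hypothetical dataloader worker count
        'driving_parameters': {},           # hypothetical; consumed by execute_drive
        'validation_datasets': get_validation_datasets(args.folder),
        'driving_environments': get_driving_environments(args.folder),
        'allocation_parameters': allocation_parameters,
    }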
def folder_execute(params=None):
    """
    Execute a folder of experiments. For every model configuration present in the
    folder, this runs the training and all the selected evaluations.

    Args:
        params: a dictionary containing:
            gpus: the GPU ids that are going to be allocated for the experiments
            gpu_value: the capacity of each GPU; the higher the value, the more
                experiments are allocated per GPU
            folder: the folder where all the experiment configuration files are
            validation_datasets: the validation datasets to be evaluated per experiment
            driving_environments: the driving environments where the models are going
                to be tested
    """
    folder = params['folder']
    allocated_gpus = params['gpus']
    validation_datasets = params['validation_datasets']
    driving_environments = params['driving_environments']
    allocation_parameters = params['allocation_parameters']

    experiments_list = os.listdir(os.path.join('configs', folder))
    experiments_list = [experiment.split('.')[-2] for experiment in experiments_list]

    allocated_gpus = {gpu: allocation_parameters['gpu_value'] for gpu in allocated_gpus}

    executing_processes = []

    free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
        allocated_gpus, executing_processes, allocation_parameters)

    # The tasks queue is a heap of tasks to be executed. Training has the highest
    # priority, then testing (driving), then validation.
    tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'],
                                        [], [], validation_datasets, driving_environments)

    # No process is executing right now.
    while True:
        # While the most free GPU still has enough resources for the cheapest task
        # and there are tasks left, keep allocating.
        while resources_on_most_free_gpu >= min([allocation_parameters['train_cost'],
                                                 allocation_parameters['validation_cost'],
                                                 allocation_parameters['drive_cost']]) \
                and tasks_queue != []:
            # Pop the highest-priority task; the third element of the heap entry
            # is the process-specification dictionary.
            popped_thing = heapq.heappop(tasks_queue)
            process_specs = popped_thing[2]

            # The training status determines whether a validation or drive process
            # for this experiment can be scheduled.
            train_status = monitorer.get_status(folder, process_specs['experiment'],
                                                'train')[0]

            # ADD TRAIN TO EXECUTE
            if process_specs['type'] == 'train' and resources_on_most_free_gpu >= \
                    allocation_parameters['train_cost']:
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['train_cost'])
                execute_train(gpu_number, process_specs['folder'],
                              process_specs['experiment'],
                              params['number_of_workers'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

            # ADD DRIVE TO EXECUTE
            elif process_specs['type'] == 'drive' and resources_on_most_free_gpu >= \
                    allocation_parameters['drive_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                print(process_specs['type'])
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['drive_cost'])
                execute_drive(gpu_number, process_specs['folder'],
                              process_specs['experiment'],
                              process_specs['environment'],
                              params['driving_parameters'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

            # ADD VALIDATION TO EXECUTE
            elif process_specs['type'] == 'validation' and resources_on_most_free_gpu >= \
                    allocation_parameters['validation_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['validation_cost'])
                execute_validation(gpu_number, process_specs['folder'],
                                   process_specs['experiment'],
                                   process_specs['dataset'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

        tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'],
                                            executing_processes, tasks_queue,
                                            validation_datasets, driving_environments, False)

        printer.plot_folder_summaries(folder, params['is_training'],
                                      validation_datasets, driving_environments)

        # Check the allocated processes and release resources from the ones that finished.
        if len(tasks_queue) == 0 and len(executing_processes) == 0:
            break

        free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
            allocated_gpus, executing_processes, allocation_parameters)

        time.sleep(10)

    print("ALL EXPERIMENTS EXECUTED")
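# The scheduler above relies on get_gpu_resources() and allocate_gpu_resources(),
# which are defined elsewhere in the project. A minimal sketch of the allocation
# step, under the assumption that `free_gpus` maps a GPU id to its remaining
# capacity, could look like this (names and behaviour are inferred from the call
# sites, not copied from the original implementation):
def _allocate_gpu_resources_sketch(free_gpus, cost):
    # Pick the GPU with the most remaining capacity and charge the task cost to it.
    gpu_number = max(free_gpus, key=free_gpus.get)
    free_gpus[gpu_number] -= cost
    # Report how much room is left on the most free GPU after this allocation.
    resources_on_most_free_gpu = max(free_gpus.values())
    return free_gpus, resources_on_most_free_gpu, gpu_number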
def folder_execute(params=None):
    """
    In this mode the software keeps scheduling every experiment in the folder,
    allocating training, validation and driving processes on the available GPUs
    and printing a monitoring summary of the training logs while they run.

    Args:
        params: a dictionary with the execution folder, the allocated GPUs, the
            validation datasets, the driving environments, the allocation
            parameters and the flags controlling training and screen usage.
    """
    folder = params['folder']
    allocated_gpus = params['gpus']
    validation_datasets = params['validation_datasets']
    driving_environments = params['driving_environments']
    allocation_parameters = params['allocation_parameters']

    experiments_list = os.listdir(os.path.join('configs', folder))
    experiments_list = [experiment.split('.')[-2] for experiment in experiments_list]

    # Each GPU has a maximum of two slots worth of capacity (its gpu_value).
    allocated_gpus = {gpu: allocation_parameters['gpu_value'] for gpu in allocated_gpus}

    executing_processes = []

    free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
        allocated_gpus, executing_processes, allocation_parameters)

    # The tasks queue is a heap of tasks to be executed. Training has the highest
    # priority, then testing (driving), then validation.
    # TODO: change the priority to test the experiments that have already been trained.
    tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'],
                                        [], [], validation_datasets, driving_environments)

    # No process is executing right now.
    print(tasks_queue)
    # TODO: the while should go outside, so the monitorer process is independent
    # of the type of execution.
    while True:
        # While the most free GPU still has enough resources for the cheapest task
        # and there are tasks left, keep allocating.
        while resources_on_most_free_gpu >= min([allocation_parameters['train_cost'],
                                                 allocation_parameters['validation_cost'],
                                                 allocation_parameters['drive_cost']]) \
                and tasks_queue != []:
            # Pop the highest-priority task; the third element of the heap entry
            # is the process-specification dictionary.
            popped_thing = heapq.heappop(tasks_queue)
            process_specs = popped_thing[2]

            # The training status determines whether a validation or drive process
            # for this experiment can be scheduled.
            train_status = monitorer.get_status(folder, process_specs['experiment'],
                                                'train')[0]

            if process_specs['type'] == 'train' and resources_on_most_free_gpu >= \
                    allocation_parameters['train_cost']:
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['train_cost'])
                execute_train(gpu_number, process_specs['folder'],
                              process_specs['experiment'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

            elif process_specs['type'] == 'validation' and resources_on_most_free_gpu >= \
                    allocation_parameters['validation_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['validation_cost'])
                execute_validation(gpu_number, process_specs['folder'],
                                   process_specs['experiment'],
                                   process_specs['dataset'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

            elif process_specs['type'] == 'drive' and resources_on_most_free_gpu >= \
                    allocation_parameters['drive_cost'] \
                    and train_status in ('Iterating', 'Loading', 'Finished'):
                free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources(
                    free_gpus, allocation_parameters['drive_cost'])
                execute_drive(gpu_number, process_specs['folder'],
                              process_specs['experiment'],
                              process_specs['environment'],
                              no_screen=params['no_screen'])
                process_specs.update({'gpu': gpu_number})
                executing_processes.append(process_specs)

        tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'],
                                            executing_processes, tasks_queue,
                                            validation_datasets, driving_environments, False)

        printer.plot_folder_summaries(folder, params['is_training'],
                                      validation_datasets, driving_environments)

        # Check the allocated processes and release resources from the ones that finished.
        free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(
            allocated_gpus, executing_processes, allocation_parameters)

        if len(tasks_queue) == 0 and len(executing_processes) == 0:
            break

        print("Task queue", tasks_queue)
        print("")
        print("exec proc", executing_processes)
        print("resources", free_gpus)

        time.sleep(10)

    print("ALL EXPERIMENTS EXECUTED")
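# Both versions of folder_execute() pop entries from the heap built by
# mount_experiment_heap() and read the process specification from index 2.
# A plausible shape for such an entry, inferred from the call sites and the
# priority comment above (the numeric priorities and the tie-breaking counter
# are assumptions, not taken from the original implementation):
def _push_task_sketch(tasks_queue, process_specs, counter):
    # Lower values are popped first: training before driving, driving before validation.
    priority = {'train': 0, 'drive': 1, 'validation': 2}[process_specs['type']]
    # The counter breaks ties so heapq never has to compare the spec dictionaries.
    heapq.heappush(tasks_queue, (priority, counter, process_specs))
    return tasks_queue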