def test_merge_yaml_line_globaldict(self):
    """Merge the ``experiment_1`` config and walk the training set once.

    Smoke test: it passes as long as the merged configuration can be
    printed and every batch yielded by the loader unpacks into two items.
    """
    g_conf.NAME = 'experiment_1'
    merge_with_yaml('configs/eccv/experiment_1.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')
    print(g_conf)

    dataset_root = os.environ["COIL_DATASET_PATH"]
    dataset_path = os.path.join(dataset_root, g_conf.TRAIN_DATASET_NAME)

    # Images are only converted to tensors; no augmentation is applied.
    to_tensor = transforms.Compose([transforms.ToTensor()])
    train_set = CoILDataset(dataset_path, transform=to_tensor)

    # TODO: batch size and number of workers should come from a configuration file.
    loader_options = {
        'batch_size': 120,
        'shuffle': False,
        'num_workers': 12,
        'pin_memory': True,
    }
    loader = torch.utils.data.DataLoader(train_set, **loader_options)

    # Each batch must be a pair; the unpacking fails otherwise.
    for batch in loader:
        a, b = batch
def setup(self, path_to_config_file):
    """Load the configured checkpoint and build the model for evaluation.

    Args:
        path_to_config_file: forwarded to
            ``checkpoint_parse_configuration_file``, which returns the yaml
            configuration path and the checkpoint number to load.

    Side effects:
        Merges the yaml file into the global ``g_conf`` and sets
        ``self.checkpoint``, ``self._model`` (on GPU, in eval mode),
        ``self.first_iter``, ``self.latest_image``,
        ``self.latest_image_tensor``, the command-expansion counters and
        ``self.track``.
    """
    yaml_conf, checkpoint_number = checkpoint_parse_configuration_file(path_to_config_file)

    # Directory two levels above this file. os.path.dirname replaces the
    # previous manual split/join of the path, which was duplicated, raised
    # when the file lived directly under the filesystem root and produced a
    # drive-relative path on Windows.
    root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

    # NOTE(review): the batch folder is split on os.sep while the file name
    # is split on '/'; the two only agree on POSIX systems.
    exp_batch = yaml_conf.split(os.sep)[-2]
    exp_alias = yaml_conf.split('/')[-1].split('.')[-2]

    # Take the checkpoint name and load it
    checkpoint = torch.load(
        os.path.join(root_dir, '_logs', exp_batch, exp_alias, 'checkpoints',
                     str(checkpoint_number) + '.pth'))

    # do the merge here
    merge_with_yaml(os.path.join(root_dir, yaml_conf))

    self.checkpoint = checkpoint  # We save the checkpoint for some interesting future use.
    self._model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
    self.first_iter = True
    logging.info("Setup Model")

    # Load the model and prepare set it for evaluation
    self._model.load_state_dict(checkpoint['state_dict'])
    self._model.cuda()
    self._model.eval()

    self.latest_image = None
    self.latest_image_tensor = None

    # We add more time to the curve commands
    self._expand_command_front = 5
    self._expand_command_back = 3

    self.track = 2  # Track.CAMERAS
def setup(self, path_to_config_file):
    """Load the configured checkpoint and build the model for evaluation.

    Args:
        path_to_config_file: forwarded to
            ``checkpoint_parse_configuration_file``, which returns the yaml
            configuration path and the checkpoint number to load.

    Side effects:
        Merges the agent-specific yaml file into the global ``g_conf`` and
        sets ``self.checkpoint``, ``self._model`` (on GPU, in eval mode),
        ``self.first_iter``, ``self.latest_image``,
        ``self.latest_image_tensor``, the command-expansion counters and
        ``self.track``.
    """
    yaml_conf, checkpoint_number = checkpoint_parse_configuration_file(path_to_config_file)

    # Directory two levels above this file. os.path.dirname replaces the
    # previous manual '/'-split and re-join, which was duplicated and raised
    # when the file lived directly under the filesystem root.
    root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

    exp_batch = yaml_conf.split('/')[-2]
    exp_alias = yaml_conf.split('/')[-1].split('.')[-2]

    # Take the checkpoint name and load it
    checkpoint = torch.load(
        os.path.join(root_dir, '_logs', exp_batch, exp_alias, 'checkpoints',
                     str(checkpoint_number) + '.pth'))

    # merge the specific agent config with global config _g_conf
    merge_with_yaml(os.path.join(root_dir, yaml_conf))

    self.checkpoint = checkpoint  # We save the checkpoint for some interesting future use.

    # TODO: retrain the model with MPSC
    self._model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
    self.first_iter = True
    logging.info("Setup Model")

    # Load the model and prepare set it for evaluation
    self._model.load_state_dict(checkpoint['state_dict'])
    self._model.cuda()
    self._model.eval()

    self.latest_image = None
    self.latest_image_tensor = None

    # We add more time to the curve commands
    self._expand_command_front = 5
    self._expand_command_back = 3

    # check map waypoint format => carla_data_provider & http://carla.org/2018/11/16/release-0.9.1/
    # e.g. from map.get_waypoint Waypoint(Transform(Location(x=338.763, y=226.453, z=0), Rotation(pitch=360, yaw=270.035, roll=0)))
    self.track = Track.ALL_SENSORS_HDMAP_WAYPOINTS  # specify available track info, see autonomous_agent.py
def test_name_generation(self):
    """Merge the ``experiment_1`` test config and switch to the train process.

    The test makes no assertions; it succeeds when the merge and the
    process-type switch complete without raising.
    """
    config_path = 'configs/test_exps/experiment_1.yaml'

    g_conf.NAME = 'experiment_1'
    merge_with_yaml(config_path)
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')
def print_folder_process_names(exp_batch):
    """Print the generated experiment name for every yaml config of a batch.

    Args:
        exp_batch: name of the folder under ``configs`` that holds the
            experiment yaml files.

    For each ``*.yaml`` file (in ``sort_nicely`` order) the file is merged
    into the global ``g_conf`` and one line
    ``<file stem>: <g_conf.EXPERIMENT_GENERATED_NAME>`` is printed.
    """
    experiments_list = os.listdir(os.path.join('configs', exp_batch))
    sort_nicely(experiments_list)

    for experiment in experiments_list:
        # Match on the extension rather than a substring, so editor backups
        # such as 'exp.yaml.swp' or 'exp.yaml~' are not parsed as configs.
        if not experiment.endswith('.yaml'):
            continue

        # The configuration must be writable before each merge.
        g_conf.immutable(False)
        merge_with_yaml(os.path.join('configs', exp_batch, experiment))
        print(experiment.split('.')[-2] + ': ' + g_conf.EXPERIMENT_GENERATED_NAME)
def execute(gpu, exp_batch, exp_alias, ckpt, model, city_name='Town01', memory_use=0.2,
            host='127.0.0.1'):
    """Start a CARLA simulator and run the driving benchmark for one checkpoint.

    Args:
        gpu: value written to ``CUDA_VISIBLE_DEVICES`` and passed to the
            simulator launcher.
        exp_batch: experiment folder; joined with ``exp_alias + '.yaml'`` to
            locate the configuration.
        exp_alias: experiment name.
        ckpt: path of the checkpoint file given to ``torch.load``.
        model: architecture name handed to ``CoILAgent``.
        city_name: town passed to the simulator and the benchmark.
        memory_use: unused; kept for interface compatibility.
        host: address of the CARLA server.

    Side effects:
        Redirects ``sys.stdout`` to ``<pid>.out`` for the rest of the
        process and always kills the simulator process before returning.
    """
    # TODO Eliminate drive config.
    print("Running ", __file__, " On GPU ", gpu, "of experiment name ", exp_alias)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # Deliberately left open: the redirection lasts for the process lifetime.
    sys.stdout = open(str(os.getpid()) + ".out", "a", buffering=1)

    carla_process, port = start_carla_simulator(gpu, exp_batch, exp_alias, city_name)

    # try/finally guarantees the simulator is killed even if configuration
    # or benchmark setup raises before the client connects.
    try:
        merge_with_yaml(os.path.join(exp_batch, exp_alias + '.yaml'))
        set_type_of_process('test')
        experiment_suite = TestSuite()

        # coil_icra, coil_unit, wgangp_lsd, unit_task_only
        architecture_name = model

        # The previous implementation wrapped this in `while True` and killed
        # the simulator inside every exception handler without leaving the
        # loop, so any failure (including Ctrl-C) retried forever against a
        # dead simulator. A single attempt is the only one that can succeed.
        try:
            with make_carla_client(host, port) as client:
                checkpoint = torch.load(ckpt)
                coil_agent = CoILAgent(checkpoint, architecture_name)
                run_driving_benchmark(
                    coil_agent, experiment_suite, city_name,
                    exp_batch + '_' + exp_alias + 'iteration', False, host, port)
        except TCPConnectionError as error:
            logging.error(error)
        except KeyboardInterrupt:
            # User abort: stop quietly, the simulator is killed below.
            pass
        except Exception:
            traceback.print_exc()
    finally:
        carla_process.kill()
def setup(self, path_to_config_file):
    """Load the driving checkpoint and the ERFNet segmentation network.

    Args:
        path_to_config_file: forwarded to
            ``checkpoint_parse_configuration_file``, which returns the yaml
            configuration path and the checkpoint number to load.

    Side effects:
        Merges the yaml file into the global ``g_conf`` and sets
        ``self.checkpoint``, ``self._model`` (on GPU, in eval mode),
        ``self.model_erf`` (DataParallel ERFNet on GPU, in eval mode),
        ``self.first_iter``, ``self.latest_image``,
        ``self.latest_image_tensor``, the command-expansion counters and
        ``self.track``.
    """
    yaml_conf, checkpoint_number = checkpoint_parse_configuration_file(path_to_config_file)

    # Directory two levels above this file. os.path.dirname replaces the
    # previous manual '/'-split and re-join, which was duplicated and raised
    # when the file lived directly under the filesystem root.
    root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

    exp_batch = yaml_conf.split('/')[-2]
    exp_alias = yaml_conf.split('/')[-1].split('.')[-2]

    # Take the checkpoint name and load it
    checkpoint = torch.load(
        os.path.join(root_dir, '_logs', exp_batch, exp_alias, 'checkpoints',
                     str(checkpoint_number) + '.pth'))

    # do the merge here
    merge_with_yaml(os.path.join(root_dir, yaml_conf))

    self.checkpoint = checkpoint  # We save the checkpoint for some interesting future use.
    self._model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
    self.first_iter = True
    logging.info("Setup Model")

    # Load the model and prepare set it for evaluation
    self._model.load_state_dict(checkpoint['state_dict'])
    self._model.cuda()
    self._model.eval()

    # Set ERFnet for segmentation
    self.model_erf = ERFNet(20)
    self.model_erf = torch.nn.DataParallel(self.model_erf)
    self.model_erf = self.model_erf.cuda()

    print("LOAD ERFNet - drive")

    def load_my_state_dict(model, state_dict):
        """Copy only the entries of ``state_dict`` that exist in ``model``."""
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                continue
            own_state[name].copy_(param)
        return model

    # The weights file is resolved relative to the current working directory.
    self.model_erf = load_my_state_dict(
        self.model_erf, torch.load('trained_models/erfnet_pretrained.pth'))
    self.model_erf.eval()
    print ("ERFNet and weights LOADED successfully")

    self.latest_image = None
    self.latest_image_tensor = None

    # We add more time to the curve commands
    self._expand_command_front = 5
    self._expand_command_back = 3

    self.track = Track.CAMERAS
def test_basic_data(self):
    """Build the ``CoILTrain`` dataset using the ``sample/coil_icra`` config.

    The test makes no assertions; it succeeds when the log folders, the
    configuration merge and the dataset construction all complete.
    """
    # the town2-town01 data, try to load.
    g_conf.immutable(False)
    g_conf.EXPERIMENT_NAME = 'coil_icra'

    create_log_folder('sample')
    create_exp_path('sample', 'coil_icra')
    merge_with_yaml('configs/sample/coil_icra.yaml')
    set_type_of_process('train')

    dataset_root = os.environ["COIL_DATASET_PATH"]
    dataset_path = os.path.join(dataset_root, 'CoILTrain')

    # Cache name: "<hours>hours_<training dataset name>".
    cache_name = str(g_conf.NUMBER_OF_HOURS) + 'hours_' + g_conf.TRAIN_DATASET_NAME
    dataset = CoILDataset(dataset_path, transform=None, preload_name=cache_name)
def test_town3_data(self):
    """Build the ``CoILTrainTown03`` dataset using the ``town03/resnet34imnet`` config.

    The test makes no assertions; it succeeds when the log folders, the
    configuration merge and the dataset construction all complete.
    """
    # The town3 data has different names and does not have pedestrian or
    # vehicle stop indications.
    g_conf.immutable(False)
    g_conf.EXPERIMENT_NAME = 'resnet34imnet'

    create_log_folder('town03')
    create_exp_path('town03', 'resnet34imnet')
    merge_with_yaml('configs/town03/resnet34imnet.yaml')
    set_type_of_process('train')

    dataset_root = os.environ["COIL_DATASET_PATH"]
    dataset_path = os.path.join(dataset_root, 'CoILTrainTown03')

    # Cache name: "<hours>hours_<training dataset name>".
    cache_name = str(g_conf.NUMBER_OF_HOURS) + 'hours_' + g_conf.TRAIN_DATASET_NAME
    dataset = CoILDataset(dataset_path, transform=None, preload_name=cache_name)
def test_check_status_error(self):
    """Log a run that ends with an 'Error' message and expect status "Error".

    Two 'Loading' messages, ten iterations of paired 'Iterating' messages
    and one final 'Error' message are written before the monitor is queried.
    """
    g_conf.immutable(False)
    # TODO: The error? How do we nicely merge with the other parts?
    g_conf.NAME = 'experiment_running_error'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_running_error.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    key_division = {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]}
    coil_logger.add_message('Loading', key_division)

    loaded_models = {"Models_loaded": ' VUALA ', "Checkpoint": "988765"}
    coil_logger.add_message('Loading', loaded_models)

    for step in range(10):
        # Fresh payloads on every step, exactly as separate literals would be.
        read_keys = {"Iteration": step, "ReadKeys": [1, 123, 5, 1, 34, 1, 23]}
        coil_logger.add_message('Iterating', read_keys)
        produced = {"Iteration": step, "Output": ["output"]}
        coil_logger.add_message('Iterating', produced)

    failure = {"Iteration": 10, "Message": " Some data integrity problems ! "}
    coil_logger.add_message('Error', failure)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_running_error.yaml',
                                  g_conf.PROCESS_NAME)
    state = status[0]
    self.assertEqual(state, "Error")
    print(status[1])
def test_check_status_finished(self):
    """Log 21 iterations against a 20-iteration budget and expect "Finished".

    ``NUMBER_ITERATIONS`` is overridden to 20 after the merge, then messages
    for iterations 0..20 are logged before the monitor is queried.
    """
    g_conf.immutable(False)
    g_conf.NAME = 'experiment_finished'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_finished.yaml')
    g_conf.NUMBER_ITERATIONS = 20
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    key_division = {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]}
    coil_logger.add_message('Loading', key_division)

    loaded_models = {"Models_loaded": ' VUALA ', "Checkpoint": "988765"}
    coil_logger.add_message('Loading', loaded_models)

    for step in range(21):
        # The step is passed both inside the payload and as the third argument.
        read_keys = {"Iteration": step, "ReadKeys": [1, 123, 5, 1, 34, 1, 23]}
        coil_logger.add_message('Iterating', read_keys, step)
        produced = {"Iteration": step, "Output": ["output"]}
        coil_logger.add_message('Iterating', produced, step)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_finished.yaml',
                                  g_conf.PROCESS_NAME)
    state = status[0]
    self.assertEqual(state, "Finished")
def test_check_status_running_loading(self):
    """Log only 'Loading' messages and expect the monitor to report "Loading"."""
    g_conf.immutable(False)
    g_conf.NAME = 'experiment_running_loading'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_running_loading.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    key_division = {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]}
    coil_logger.add_message('Loading', key_division)

    loaded_models = {"Models_loaded": ' VUALA ', "Checkpoint": "988765"}
    coil_logger.add_message('Loading', loaded_models)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_running_loading.yaml',
                                  g_conf.PROCESS_NAME)
    state = status[0]
    self.assertEqual(state, "Loading")
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12,
            encoder_params=None):
    """
    The main training function. It loads either a pre-trained ("preload")
    checkpoint or the latest checkpoint saved for the given exp_batch (folder)
    and exp_alias (experiment configuration), and then starts or continues
    training.

    Args:
        gpu: The GPU number (written to CUDA_VISIBLE_DEVICES)
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: if True, stdout and stderr are redirected to files
            under ``_output_logs``
        number_of_workers: the number of workers used for data loading
        encoder_params: optional dict with the keys 'encoder_folder',
            'encoder_exp' and 'encoder_checkpoint' locating the pre-trained
            encoder checkpoint; also forwarded to ``merge_with_yaml``

    Returns:
        None. Failures are not propagated: they are reported through
        ``coil_logger`` as 'Error' messages.
    """
    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}

        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'),
                        encoder_params)
        set_type_of_process('train')

        # Set the process into loading status.
        coil_logger.add_message('Loading', {'GPU': os.environ["CUDA_VISIBLE_DEVICES"]})

        seed_everything(seed=g_conf.MAGICAL_SEED)

        # Put the output to a separate file if it is the case.
        # The files stay open on purpose: the redirection lasts for the
        # lifetime of the process.
        if suppress_output:
            if not os.path.exists('_output_logs'):
                os.mkdir('_output_logs')
            sys.stdout = open(
                os.path.join('_output_logs',
                             exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                             + str(os.getpid()) + ".out"),
                "a", buffering=1)
            sys.stderr = open(
                os.path.join('_output_logs',
                             exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                             + str(os.getpid()) + ".out"),
                "a", buffering=1)

        if coil_logger.check_finish('train'):
            coil_logger.add_message('Finished', {})
            return

        # Defaults for a fresh run. They are bound up-front because the
        # preload path previously left `checkpoint_file`, `iteration`,
        # `best_loss` and `best_loss_iter` undefined, which raised a
        # NameError as soon as any of them was read.
        # NOTE(review): a preloaded model therefore starts counting from
        # iteration 0 - confirm this is the intended behaviour.
        checkpoint_file = None
        iteration = 0
        best_loss = 100000000.0
        best_loss_iter = 0

        # Preload option
        print(" GOING TO LOAD")
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            print(" LOADING A PRELOAD")
            checkpoint = torch.load(
                os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                             g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints',
                             str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth'))
        else:
            # Get the latest checkpoint to be loaded
            # returns none if there are no checkpoints saved for this model
            checkpoint_file = get_latest_saved_checkpoint()
            if checkpoint_file is not None:
                print('loading previous checkpoint ', checkpoint_file)
                # Reuse the name found above instead of querying a second time.
                checkpoint = torch.load(
                    os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME,
                                 g_conf.EXPERIMENT_NAME, 'checkpoints',
                                 str(checkpoint_file)))
                iteration = checkpoint['iteration']
                best_loss = checkpoint['best_loss']
                best_loss_iter = checkpoint['best_loss_iter']

        # By instantiating the augmenter we get a callable that augment images and transform them
        # into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # The preload name depends on the json file name(s), so the dataset does not have to
        # be rebuilt every time the same experience files are used.
        if len(g_conf.EXPERIENCE_FILE) == 1:
            json_file_name = str(g_conf.EXPERIENCE_FILE[0]).split('/')[-1].split('.')[-2]
        else:
            json_file_name = (
                str(g_conf.EXPERIENCE_FILE[0]).split('/')[-1].split('.')[-2] + '_'
                + str(g_conf.EXPERIENCE_FILE[1]).split('/')[-1].split('.')[-2])
        dataset = CoILDataset(
            transform=augmenter,
            preload_name=g_conf.PROCESS_NAME + '_' + json_file_name + '_' + g_conf.DATA_USED)
        print("Loaded Training dataset")

        data_loader = select_balancing_strategy(dataset, iteration, number_of_workers)

        # Only this model type is implemented. Failing here gives a clear
        # RuntimeError instead of a NameError on the undefined `model`.
        if g_conf.MODEL_TYPE not in ['separate-affordances']:
            raise RuntimeError(
                'Not implement yet, this branch is only work for g_conf.MODEL_TYPE in [separate-affordances]'
            )

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION,
                          g_conf.ENCODER_MODEL_CONFIGURATION)
        model.cuda()
        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)
        print(model)

        # we use the pre-trained encoder model to extract bottleneck Z and train the E-t-E model
        encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                     g_conf.ENCODER_MODEL_CONFIGURATION)
        encoder_model.cuda()
        encoder_model.eval()

        # To freeze the pre-trained encoder model
        if g_conf.FREEZE_ENCODER:
            for param_ in encoder_model.parameters():
                param_.requires_grad = False

        if encoder_params is not None:
            encoder_checkpoint = torch.load(
                os.path.join('_logs', encoder_params['encoder_folder'],
                             encoder_params['encoder_exp'], 'checkpoints',
                             str(encoder_params['encoder_checkpoint']) + '.pth'))
            print("Encoder model ", str(encoder_params['encoder_checkpoint']),
                  "loaded from ",
                  os.path.join('_logs', encoder_params['encoder_folder'],
                               encoder_params['encoder_exp'], 'checkpoints'))
            encoder_model.load_state_dict(encoder_checkpoint['state_dict'])

            if g_conf.FREEZE_ENCODER:
                encoder_model.eval()
                # To freeze the pre-trained encoder model
                for param_ in encoder_model.parameters():
                    param_.requires_grad = False
            else:
                # The encoder is trained jointly: optimize both parameter sets.
                optimizer = optim.Adam(
                    list(model.parameters()) + list(encoder_model.parameters()),
                    lr=g_conf.LEARNING_RATE)

            for name_encoder, param_encoder in encoder_model.named_parameters():
                if param_encoder.requires_grad:
                    print(' Unfrozen layers', name_encoder)
                else:
                    print(' Frozen layers', name_encoder)

        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        for name, param in model.named_parameters():
            if param.requires_grad:
                print(' Unfrozen layers', name)
            else:
                print(' Frozen layers', name)

        print("Before the loss")

        # Loss time series window
        for data in data_loader:

            # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times,
            # add a stop on the _logs folder that is going to be read by this process
            if g_conf.FINISH_ON_VALIDATION_STALE is not None and \
                    check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE):
                break

            ####################################
            #     Main optimization loop
            ####################################

            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            model.zero_grad()
            if not g_conf.FREEZE_ENCODER:
                encoder_model.zero_grad()

            if g_conf.LABELS_SUPERVISED:
                # Append an all-zero extra channel to the RGB input.
                inputs_data = torch.cat(
                    (data['rgb'], torch.zeros(g_conf.BATCH_SIZE, 1, 88, 200)),
                    dim=1).cuda()
            else:
                inputs_data = torch.squeeze(data['rgb'].cuda())

            # TODO: for this two encoder models training, we haven't put speed as input to train yet
            # All supported encoder types are called identically; an
            # unsupported type is reported instead of failing on an
            # undefined `e` further down.
            if g_conf.ENCODER_MODEL_TYPE in ['action_prediction', 'stdim', 'forward',
                                             'one-step-affordances', 'ETE']:
                e, _ = encoder_model.forward_encoder(
                    inputs_data,
                    dataset.extract_inputs(data).cuda(),
                    # We also add measurements and commands
                    torch.squeeze(dataset.extract_commands(data).cuda()))
            else:
                raise RuntimeError(
                    'Unsupported g_conf.ENCODER_MODEL_TYPE: ' + str(g_conf.ENCODER_MODEL_TYPE))

            loss_function_params = {
                # harzard stop, red_light....
                'classification_gt': dataset.extract_affordances_targets(
                    data, 'classification').cuda(),
                'class_weights': g_conf.AFFORDANCES_CLASS_WEIGHT,
                'regression_gt': dataset.extract_affordances_targets(
                    data, 'regression').cuda(),
                'variable_weights': g_conf.AFFORDANCES_VARIABLE_WEIGHT
            }
            loss = model(e, loss_function_params)
            loss.backward()
            optimizer.step()

            ####################################
            #     Saving the model if necessary
            ####################################

            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(
                    state,
                    os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME,
                                 g_conf.EXPERIMENT_NAME, 'checkpoints',
                                 str(iteration) + '.pth'))

                # A trainable encoder gets its own checkpoint next to the model's.
                if not g_conf.FREEZE_ENCODER:
                    encoder_state = {
                        'iteration': iteration,
                        'state_dict': encoder_model.state_dict(),
                        'best_loss': best_loss,
                        'total_time': accumulated_time,
                        'optimizer': optimizer.state_dict(),
                        'best_loss_iter': best_loss_iter
                    }
                    torch.save(
                        encoder_state,
                        os.path.join('_logs', g_conf.EXPERIMENT_BATCH_NAME,
                                     g_conf.EXPERIMENT_NAME, 'checkpoints',
                                     str(iteration) + '_encoder.pth'))

            iteration += 1

            ################################################
            #     Adding tensorboard logs.
            #     Making calculations for logging purposes.
            #     These logs are monitored by the printer module.
            ################################################

            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration)
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            if iteration % 100 == 0:
                print('Train Iteration: {} [{}/{} ({:.0f}%)] \t Loss: {:.6f}'.format(
                    iteration, iteration, g_conf.NUMBER_ITERATIONS,
                    100. * iteration / g_conf.NUMBER_ITERATIONS, loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:
        coil_logger.add_message('Error', {'Message': str(e)})

    except Exception:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
def execute(gpu, exp_batch, exp_alias, dataset_name):
    """Evaluate every saved checkpoint of an experiment on a validation set.

    Args:
        gpu: value written to ``CUDA_VISIBLE_DEVICES``.
        exp_batch: the folder with the experiments.
        exp_alias: the experiment name; ``configs/<exp_batch>/<exp_alias>.yaml``
            is merged into the global configuration.
        dataset_name: folder under ``$COIL_DATASET_PATH`` to validate on.

    For each ``*.pth`` file under ``_logs/<exp_batch>/<exp_alias>/checkpoints``
    the weights are loaded, the whole dataset is run through
    ``model.forward_branch``, the outputs are written to csv, and the mean
    squared / absolute errors are logged. The state with the lowest average
    loss is saved as ``best_model_l2.pth`` and the one with the lowest
    average error as ``best_model_l1.pth``.

    Side effects:
        Redirects ``sys.stdout`` to a file under ``_output_logs`` for the
        rest of the process.
    """
    # We set the visible cuda devices. Uses the caller's `gpu`, which was
    # previously ignored in favour of a hard-coded '0'.
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('validation', dataset_name)

    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')

    # Deliberately left open: the redirection lasts for the process lifetime.
    sys.stdout = open(
        os.path.join('_output_logs',
                     g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
        "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not ?
        return

    # Define the dataset. This structure is has the __get_item__ redefined in a way
    # that you can access the HDFILES positions from the root directory as a in a vector.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
    print(full_dataset)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))

    # The data loader is the multi threaded module from pytorch that release a number of
    # workers to get all the data.
    # TODO: batch size an number of workers go to some configuration file
    batch_size = 120
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=False, num_workers=12,
                                              pin_memory=True)

    # TODO: here there is clearly a posibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()
    model.eval()

    criterion = Loss()  # currently unused: the loss is computed inline below

    # Overwritten with each checkpoint's number inside the loop below.
    latest = get_latest_evaluated_checkpoint()

    best_loss = 1000.0
    best_error = 1000.0
    best_loss_iter = 0
    best_error_iter = 0

    # The first column of the metadata holds bytes; decode it so it can be
    # compared with plain strings.
    print(dataset.meta_data[0][0])
    for k in dataset.meta_data:
        k[0] = str(k[0], 'utf-8')
    print(dataset.meta_data[0][0])

    # Row indexes of the measurements fed to the network. They do not change
    # between batches, so they are looked up once.
    control_position = np.where(dataset.meta_data[:, 0] == 'control')[0][0]
    speed_position = np.where(dataset.meta_data[:, 0] == 'speed_module')[0][0]

    # Checkpoints of *this* experiment (previously a machine-specific
    # absolute path that ignored exp_batch / exp_alias).
    cpts = glob.glob(
        os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', '*.pth'))

    for ckpt in cpts:
        # The checkpoint number is the file stem; parsing the stem works for
        # any number of digits, unlike a fixed-width slice.
        latest = int(os.path.splitext(os.path.basename(ckpt))[0])

        checkpoint = torch.load(ckpt)
        checkpoint_iteration = checkpoint['iteration']
        print("Validation loaded ", checkpoint_iteration)

        # Apply the checkpoint weights. Without this every checkpoint was
        # evaluated with the same freshly initialised model.
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()

        accumulated_loss = 0.0
        accumulated_error = 0.0
        iteration_on_checkpoint = 0

        for data in data_loader:
            input_data, float_data = data

            # Obs : Maybe we could also check for other branches ??
            output = model.forward_branch(
                torch.squeeze(input_data['rgb']).cuda(),
                float_data[:, speed_position, :].cuda(),
                float_data[:, control_position, :].cuda())

            for i in range(input_data['rgb'].shape[0]):
                coil_logger.write_on_csv(
                    checkpoint_iteration,
                    [output[i][0], output[i][1], output[i][2]])

            # TODO: Change this a functional standard using the loss functions.
            targets = dataset.extract_targets(float_data)
            targets_gpu = targets.cuda()
            error = torch.abs(output - targets_gpu)
            loss = torch.mean((output - targets_gpu) ** 2).data.tolist()
            mean_error = torch.mean(error).data.tolist()

            accumulated_error += mean_error
            accumulated_loss += loss

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            coil_logger.add_message(
                'Iterating in Validation', {
                    'Checkpoint': latest,
                    'Iteration': (str(iteration_on_checkpoint * batch_size) + '/'
                                  + str(len(dataset))),
                    'MeanError': mean_error,
                    'Loss': loss,
                    'Output': output[position].data.tolist(),
                    'GroundTruth': targets[position].data.tolist(),
                    'Error': error[position].data.tolist(),
                    'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()
                }, latest)
            iteration_on_checkpoint += 1

        # The accumulators hold one mean per batch, so the average is taken
        # over the number of batches, not the number of samples.
        checkpoint_average_loss = accumulated_loss / len(data_loader)
        checkpoint_average_error = accumulated_error / len(data_loader)
        coil_logger.add_scalar('Loss', checkpoint_average_loss, latest)
        coil_logger.add_scalar('Error', checkpoint_average_error, latest)
        print('Loss: ', checkpoint_average_loss, "----Error: ", checkpoint_average_error)

        if checkpoint_average_loss < best_loss:
            best_loss = checkpoint_average_loss
            best_loss_iter = latest

            state = {
                'state_dict': model.state_dict(),
                'best_loss': best_loss,
                'best_loss_iter': best_loss_iter
            }
            # TODO : maybe already summarize the best model ???
            torch.save(
                state,
                os.path.join('_logs', exp_batch, exp_alias, 'best_model_l2' + '.pth'))

        if checkpoint_average_error < best_error:
            best_error = checkpoint_average_error
            best_error_iter = latest

            state = {
                'state_dict': model.state_dict(),
                'best_error': best_error,
                'best_error_iter': best_error_iter
            }
            # TODO : maybe already summarize the best model ???
            torch.save(
                state,
                os.path.join('_logs', exp_batch, exp_alias, 'best_model_l1' + '.pth'))

        print('Best Loss: ', best_loss, "Checkpoint", best_loss_iter)
        print('Best Error: ', best_error, "Checkpoint", best_error_iter)

        coil_logger.add_message(
            'Iterating in Validation', {
                'Summary': {
                    'Error': checkpoint_average_error,
                    'Loss': checkpoint_average_loss,
                    'BestError': best_error,
                    'BestLoss': best_loss,
                    'BestLossCheckpoint': best_loss_iter,
                    'BestErrorCheckpoint': best_error_iter
                },
                'Checkpoint': latest
            })
def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    """Validate checkpoints as they appear, using ERFNet road segmentation as input.

    Args:
        gpu: value written to ``CUDA_VISIBLE_DEVICES``.
        exp_batch: the folder with the experiments.
        exp_alias: the experiment name; ``configs/<exp_batch>/<exp_alias>.yaml``
            is merged into the global configuration.
        dataset_name: folder under ``$COIL_DATASET_PATH`` to validate on;
            also used as the dataset preload name.
        suppress_output: if True, stdout and stderr are redirected to files
            under ``_output_logs``.

    The function loops until ``maximun_checkpoint_reach`` reports that the
    test schedule is exhausted (or validation is detected as stale). For each
    ready checkpoint, every batch is segmented by ERFNet into a two-channel
    road / not-road mask which is fed to ``model.forward_branch``; outputs
    and errors are logged through ``coil_logger``.

    Returns:
        None. Failures are not propagated: they are reported through
        ``coil_logger`` as 'Error' messages, and the csv of the checkpoint
        that was being processed is erased.
    """
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        # The files stay open on purpose: the redirection lasts for the
        # lifetime of the process.
        if suppress_output:
            sys.stdout = open(
                os.path.join('_output_logs',
                             exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                             + str(os.getpid()) + ".out"),
                "a", buffering=1)
            sys.stderr = open(
                os.path.join('_output_logs',
                             exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                             + str(os.getpid()) + ".out"),
                "a", buffering=1)

        # Define the dataset. This structure is has the __get_item__ redefined in a way
        # that you can access the HDFILES positions from the root directory as a in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. Preload name is just the validation data name
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_name=dataset_name)

        # The data loader is the multi threaded module from pytorch that release a number of
        # workers to get all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset, batch_size=g_conf.BATCH_SIZE, shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS, pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)

        # Set ERFnet for segmentation
        model_erf = ERFNet(20)
        model_erf = torch.nn.DataParallel(model_erf)
        model_erf = model_erf.cuda()

        print("LOAD ERFNet - validate")

        def load_my_state_dict(model, state_dict):
            """Copy only the entries of ``state_dict`` that exist in ``model``."""
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name not in own_state:
                    continue
                own_state[name].copy_(param)
            return model

        # The weights file is resolved relative to the current working directory.
        model_erf = load_my_state_dict(
            model_erf, torch.load('trained_models/erfnet_pretrained.pth'))
        model_erf.eval()
        print("ERFNet and weights LOADED successfully")

        # The window used to keep track of the validation errors
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:
            # Some checkpoints were already evaluated: recover their errors.
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):

            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)
                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                print("Validation loaded ", checkpoint_iteration)

                model.load_state_dict(checkpoint['state_dict'])
                model.eval()

                accumulated_mse = 0
                accumulated_error = 0
                iteration_on_checkpoint = 0

                for data in data_loader:
                    # Compute the forward pass on a batch from the validation dataset
                    controls = data['directions']

                    # Seg batch
                    rgbs = data['rgb']

                    # Validation never back-propagates, so both networks run
                    # without building an autograd graph.
                    with torch.no_grad():
                        outputs = model_erf(rgbs)
                        # Class 0 of the segmentation is treated as road; the
                        # mask pair (road, not road) is the driving model's input.
                        labels = outputs.max(1)[1].byte().cpu().data
                        seg_road = (labels == 0)
                        seg_not_road = (labels != 0)
                        seg = torch.stack((seg_road, seg_not_road), 1).float()

                        output = model.forward_branch(
                            torch.squeeze(seg).cuda(),
                            dataset.extract_inputs(data).cuda(),
                            controls)

                    # It could be either waypoints or direct control
                    if 'waypoint1_angle' in g_conf.TARGETS:
                        write_waypoints_output(checkpoint_iteration, output)
                    else:
                        write_regular_output(checkpoint_iteration, output)

                    # Targets are extracted once per batch and reused.
                    targets = dataset.extract_targets(data)
                    targets_gpu = targets.cuda()
                    error = torch.abs(output - targets_gpu)
                    mse = torch.mean((output - targets_gpu) ** 2).data.tolist()
                    mean_error = torch.mean(error).data.tolist()

                    accumulated_error += mean_error
                    accumulated_mse += mse

                    # Log a random position
                    position = random.randint(0, len(output) - 1)

                    coil_logger.add_message(
                        'Iterating', {
                            'Checkpoint': latest,
                            # Uses the loader's real batch size; this was a
                            # hard-coded 120 that disagreed with the loader.
                            'Iteration': (str(iteration_on_checkpoint * g_conf.BATCH_SIZE)
                                          + '/' + str(len(dataset))),
                            'MeanError': mean_error,
                            'MSE': mse,
                            'Output': output[position].data.tolist(),
                            'GroundTruth': targets[position].data.tolist(),
                            'Error': error[position].data.tolist(),
                            'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                        }, latest)
                    iteration_on_checkpoint += 1
                    print("Iteration %d on Checkpoint %d : Error %f" %
                          (iteration_on_checkpoint, checkpoint_iteration, mean_error))

                ########
                # Finish a round of validation, write results, wait for the next
                ########

                checkpoint_average_mse = accumulated_mse / (len(data_loader))
                checkpoint_average_error = accumulated_error / (len(data_loader))
                coil_logger.add_scalar('Loss', checkpoint_average_mse, latest, True)
                coil_logger.add_scalar('Error', checkpoint_average_error, latest, True)

                if checkpoint_average_mse < best_mse:
                    best_mse = checkpoint_average_mse
                    best_mse_iter = latest

                if checkpoint_average_error < best_error:
                    best_error = checkpoint_average_error
                    best_error_iter = latest

                coil_logger.add_message(
                    'Iterating', {
                        'Summary': {
                            'Error': checkpoint_average_error,
                            'Loss': checkpoint_average_mse,
                            'BestError': best_error,
                            'BestMSE': best_mse,
                            'BestMSECheckpoint': best_mse_iter,
                            'BestErrorCheckpoint': best_error_iter
                        },
                        'Checkpoint': latest
                    }, latest)

                l1_window.append(checkpoint_average_error)
                coil_logger.write_on_error_csv(dataset_name, checkpoint_average_error)

                # If we are using the finish when validation stops, we check the current
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(dataset_name, latest)
                        break

            else:
                # Nothing new to validate yet: poll again in a second.
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except Exception:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
def execute(gpu, exp_batch, exp_alias):
    """Train the task network netF jointly with a decoder and two critics.

    Per batch the loop below
      1. encodes source images (with measurements) and target images with
         netF, and decodes both embeddings back to images with netG;
      2. updates netD_bin, a critic on the embeddings (source vs. target);
      3. updates netD_img, a critic on images (real vs. decoded);
      4. every ``n_critic`` iterations updates netG on an L1 reconstruction
         loss and netF on the task loss plus an adversarial term.

    Args:
        gpu: value written to the CUDA_VISIBLE_DEVICES environment variable.
        exp_batch: experiment folder name under ``configs``.
        exp_alias: experiment name; ``configs/<exp_batch>/<exp_alias>.yaml``
            is merged into the global configuration.

    Returns:
        None. Returns early when the monitorer reports the process as
        "Finished".
    """
    from time import gmtime, strftime

    def _copy_weights_by_position(dst_state, src_state):
        # Pairs the keys of both state dicts in iteration order (not by
        # name) and overwrites the destination entry with the source tensor.
        for newkey, oldkey in zip(dst_state.keys(), src_state.keys()):
            print("Transferring ", newkey, " to ", oldkey)
            dst_state[newkey] = src_state[oldkey]

    def st(aug):
        return iag.Sometimes(aug, 0.4)

    def oc(aug):
        return iag.Sometimes(aug, 0.3)

    def rl(aug):
        return iag.Sometimes(aug, 0.09)

    # NOTE(review): the seed is read from g_conf before the experiment yaml
    # is merged, so a SEED set in the yaml is not the one used here.
    manualSeed = g_conf.SEED
    torch.cuda.manual_seed(manualSeed)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on, everything printed goes to a per-process log file.
    sys.stdout = open(os.path.join(
        '_output_logs',
        g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
        "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    real_dataset = g_conf.TARGET_DOMAIN_PATH

    # main data loader
    dataset = CoILDataset(full_dataset, real_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))
    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE,
        g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                              shuffle=False, num_workers=6,
                                              pin_memory=True)

    augmenter = iag.Augmenter([iag.ToGPU()] + [
        # blur images with a sigma between 0 and 1.5
        rl(iag.GaussianBlur((0, 1.5))),
        # add gaussian noise to images
        rl(iag.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05),
                                     per_channel=0.5)),
        # randomly remove up to X% of the pixels
        oc(iag.Dropout((0.0, 0.10), per_channel=0.5)),
        # randomly remove up to X% of the pixels
        oc(iag.CoarseDropout((0.0, 0.10), size_percent=(0.08, 0.2),
                             per_channel=0.5)),
        # change brightness of images (by -X to Y of original value)
        oc(iag.Add((-40, 40), per_channel=0.5)),
        # change brightness of images (X-Y% of original value)
        st(iag.Multiply((0.10, 2), per_channel=0.2)),
        # improve or worsen the contrast
        rl(iag.ContrastNormalization((0.5, 1.5), per_channel=0.5)),
        # put grayscale
        rl(iag.Grayscale((0.0, 1))),
    ])

    task_adv_weight = g_conf.TASK_ADV_WEIGHT

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    print("GPU", gpu)
    print("Configurations of ", exp_alias)
    print("GANMODEL_NAME", g_conf.GANMODEL_NAME)
    print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION)
    print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE)
    print("SKIP", g_conf.SKIP)
    print("TYPE", g_conf.TYPE)
    print("L1 WEIGHT", g_conf.L1_WEIGHT)
    print("TASK ADV WEIGHT", g_conf.TASK_ADV_WEIGHT)
    print("LAB SMOOTH", g_conf.LABSMOOTH)

    # NOTE(review): only the 'LSDcontrol_task_2d' branch defines netD_bin,
    # netD_img, netG and netF, all of which the code after this chain uses.
    # The other branches would fail with NameError further down — confirm
    # whether they are still meant to be supported.
    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION,
                               skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch':
        netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller':
        netD = ganmodels_nopatch_smaller._netD(
            loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch_smaller._netG(
            loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task':
        netD_task = ganmodels_task._netD_task(loss=g_conf.LOSS_FUNCTION).cuda()
        netD_img = ganmodels_task._netD_img(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda()
        netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda()

        if g_conf.PRETRAINED == 'RECON':
            netF_statedict = torch.load('netF_GAN_Pretrained.wts')
            netF.load_state_dict(netF_statedict)
        elif g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth')
            model_IL_state_dict = model_IL['state_dict']
            netF_state_dict = netF.state_dict()
            print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys()))
            _copy_weights_by_position(netF_state_dict, model_IL_state_dict)
            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        netD_bin = ganmodels_task._netD_task().cuda()
        netD_img = ganmodels_task._netD_img().cuda()
        netG = ganmodels_task._netG().cuda()
        netF = ganmodels_task._netF().cuda()

        if g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load(g_conf.IL_AGENT_PATH)
            model_IL_state_dict = model_IL['state_dict']
            netF_state_dict = netF.state_dict()
            print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys()))
            _copy_weights_by_position(netF_state_dict, model_IL_state_dict)
            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")

        # NOTE(review): the source layout does not show whether the decoder
        # preload belongs to this branch or runs for every model name; it is
        # kept here because this is the only branch that reaches the
        # training loop.
        # Both settings of IF_AUG load the same file; only the message differs.
        if g_conf.IF_AUG:
            print("Loading Aug Decoder")
        else:
            print("Loading Decoder")
        model_dec = torch.load(g_conf.DECODER_RECON_PATH)
        model_dec_state_dict = model_dec['stateG_dict']
        netG_state_dict = netG.state_dict()
        print(len(netG_state_dict.keys()), len(model_dec_state_dict.keys()))
        _copy_weights_by_position(netG_state_dict, model_dec_state_dict)
        netG.load_state_dict(netG_state_dict)
        print("Decoder Model Loaded!")

    init_weights(netD_bin)
    init_weights(netD_img)
    # init_weights(netG)

    print(netD_bin)
    print(netF)

    optimD_bin = torch.optim.Adam(netD_bin.parameters(), lr=g_conf.LR_D,
                                  betas=(0.5, 0.999))
    optimD_img = torch.optim.Adam(netD_img.parameters(), lr=g_conf.LR_D,
                                  betas=(0.5, 0.999))
    # NOTE(review): the decoder optimizer uses LR_D although LR_G is
    # configured and printed above — confirm this is intended.
    optimG = torch.optim.Adam(netG.parameters(), lr=g_conf.LR_D,
                              betas=(0.5, 0.999))
    # NOTE(review): optimF and Task_Loss are used unconditionally in the
    # loop but only created when TYPE == 'task'.
    if g_conf.TYPE == 'task':
        optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE)
        Task_Loss = TaskLoss()

    if g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        print("Using cross entropy!")
        Loss = torch.nn.CrossEntropyLoss().cuda()

    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter_F = 0
    best_lossF = 1000000.0
    best_lossD = 1000000.0  # never updated below; stored in checkpoints as is
    accumulated_time = 0
    n_critic = g_conf.N_CRITIC

    netD_bin.train()
    netD_img.train()
    netG.train()
    netF.train()

    capture_time = time.time()
    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    # TODO check how C network is optimized in LSDSEG
    # TODO put family for losses
    # IMPORTANT WHILE RUNNING THIS, CONV.PY MUST HAVE BATCHNORMS

    for data in data_loader:
        set_requires_grad(netD_bin, True)
        set_requires_grad(netD_img, True)
        set_requires_grad(netG, True)
        set_requires_grad(netF, True)

        # Offset subtracted from the images before the networks see them and
        # added back before saving; currently zero.
        val = 0.0
        input_data, float_data, tgt_imgs = data
        if g_conf.IF_AUG:
            inputs = augmenter(0, input_data['rgb'])
            tgt_imgs = augmenter(0, tgt_imgs)
        else:
            inputs = input_data['rgb'].cuda()
            tgt_imgs = tgt_imgs.cuda()

        inputs = inputs.squeeze(1)
        inputs = inputs - val
        tgt_imgs = tgt_imgs - val
        controls = float_data[:, dataset.controls_position(), :]

        src_embed_inputs, src_branches = netF(
            inputs, dataset.extract_inputs(float_data).cuda())
        tgt_embed_inputs = netF(tgt_imgs, None)

        src_img_fake = netG(src_embed_inputs)
        tgt_img_fake = netG(tgt_embed_inputs)

        if iteration % 100 == 0:
            imgs_to_save = torch.cat(
                (inputs[:1] + val, src_img_fake[:1] + val,
                 tgt_imgs[:1] + val, tgt_img_fake[:1] + val), 0).cpu().data
            coil_logger.add_image("Images", imgs_to_save, iteration)
            imgs_to_save = imgs_to_save.clamp(0.0, 1.0)
            vutils.save_image(imgs_to_save,
                              './imgs_' + exp_alias + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=False)

        ##--------------------Discriminator part!!!!!!!!!!-------------------##
        set_requires_grad(netD_bin, True)
        set_requires_grad(netD_img, False)
        set_requires_grad(netG, False)
        set_requires_grad(netF, False)
        optimD_bin.zero_grad()

        outputsD_real_src_bin = netD_bin(src_embed_inputs)
        outputsD_real_tgt_bin = netD_bin(tgt_embed_inputs)

        gradient_penalty = calc_gradient_penalty(netD_bin, src_embed_inputs,
                                                 tgt_embed_inputs)
        lossD_bin = torch.mean(
            outputsD_real_tgt_bin - outputsD_real_src_bin) + gradient_penalty
        # retain_graph: the same forward graph is reused by later backward
        # passes in this iteration.
        lossD_bin.backward(retain_graph=True)
        optimD_bin.step()

        coil_logger.add_scalar('Total LossD Bin', lossD_bin.data, iteration)

        #### Discriminator img update ####
        set_requires_grad(netD_bin, False)
        set_requires_grad(netD_img, True)
        set_requires_grad(netG, False)
        set_requires_grad(netF, False)
        optimD_img.zero_grad()

        outputsD_fake_src_img = netD_img(src_img_fake.detach())
        outputsD_fake_tgt_img = netD_img(tgt_img_fake.detach())
        outputsD_real_src_img = netD_img(inputs)
        outputsD_real_tgt_img = netD_img(tgt_imgs)

        gradient_penalty_src = calc_gradient_penalty(netD_img, inputs,
                                                     src_img_fake)
        lossD_img_src = torch.mean(
            outputsD_fake_src_img - outputsD_real_src_img) + gradient_penalty_src

        gradient_penalty_tgt = calc_gradient_penalty(netD_img, tgt_imgs,
                                                     tgt_img_fake)
        lossD_img_tgt = torch.mean(
            outputsD_fake_tgt_img - outputsD_real_tgt_img) + gradient_penalty_tgt

        lossD_img = (lossD_img_src + lossD_img_tgt) * 0.5
        lossD_img.backward(retain_graph=True)
        optimD_img.step()

        coil_logger.add_scalar('Total LossD img', lossD_img.data, iteration)

        # NOTE(review): both the generator and the task update are assumed to
        # sit inside this n_critic gate — confirm against the original layout.
        if ((iteration + 1) % n_critic) == 0:
            ##### Generator updates #######
            set_requires_grad(netD_bin, False)
            set_requires_grad(netD_img, False)
            set_requires_grad(netG, True)
            set_requires_grad(netF, False)
            # Fix: the decoder gradients were never cleared, so every
            # optimG.step() acted on the sum of all previous generator
            # gradients instead of the current one.
            optimG.zero_grad()

            outputsD_fake_src_img = netD_img(src_img_fake)
            outputsD_real_tgt_img = netD_img(tgt_imgs)
            outputsD_fake_tgt_img = netD_img(tgt_img_fake)

            lossG_src_smooth = L1_loss(src_img_fake, inputs)
            lossG_tgt_smooth = L1_loss(tgt_img_fake, tgt_imgs)
            lossG_smooth = (lossG_src_smooth + lossG_tgt_smooth) * 0.5

            lossG_adv = 0.5 * (-1.0 * outputsD_fake_src_img.mean()
                               - 1.0 * outputsD_fake_tgt_img.mean())
            # The adversarial term is weighted by zero: the decoder is
            # trained on reconstruction only.
            lossG = (lossG_smooth + 0.0 * lossG_adv)
            lossG.backward(retain_graph=True)
            optimG.step()

            coil_logger.add_scalar('Total LossG', lossG.data, iteration)

            ##### Task network updates #####
            set_requires_grad(netD_bin, False)
            set_requires_grad(netD_img, False)
            set_requires_grad(netG, False)
            set_requires_grad(netF, True)
            optimF.zero_grad()

            # Fresh forward pass so the task gradients are computed on a new
            # graph.
            src_embed_inputs, src_branches = netF(
                inputs, dataset.extract_inputs(float_data).cuda())
            tgt_embed_inputs = netF(tgt_imgs, None)
            src_img_fake = netG(src_embed_inputs)
            tgt_img_fake = netG(tgt_embed_inputs)

            outputsD_fake_src_img = netD_img(src_img_fake)
            outputsD_real_tgt_img = netD_img(tgt_imgs)

            lossF_task = Task_Loss.MSELoss(
                src_branches,
                dataset.extract_targets(float_data).cuda(),
                controls.cuda(),
                dataset.extract_inputs(float_data).cuda())

            lossF_adv_bin = netD_bin(src_embed_inputs).mean() - netD_bin(
                tgt_embed_inputs).mean()
            lossF_adv_img = (outputsD_fake_src_img.mean()
                             - outputsD_real_tgt_img.mean())
            lossF_adv = 0.5 * (lossF_adv_bin + 0.1 * lossF_adv_img)
            lossF = (lossF_task + task_adv_weight * lossF_adv)

            coil_logger.add_scalar('Total Task Loss', lossF.data, iteration)
            coil_logger.add_scalar('Adv Task Loss', lossF_adv.data, iteration)
            coil_logger.add_scalar('Only Task Loss', lossF_task.data, iteration)

            lossF.backward(retain_graph=True)
            optimF.step()

            if lossF.data < best_lossF:
                best_lossF = lossF.data.tolist()
                best_loss_iter_F = iteration

        # optimization for one iter done!
        accumulated_time += time.time() - capture_time
        capture_time = time.time()

        if is_ready_to_save(iteration):
            state = {
                'iteration': iteration,
                'stateD_bin_dict': netD_bin.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'checkpoints',
                             str(iteration) + '.pth'))

        if iteration == best_loss_iter_F and iteration > 10000:
            state = {
                'iteration': iteration,
                'stateD_bin_dict': netD_bin.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossF': best_lossF,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelF' + '.pth'))

        iteration += 1
def execute(gpu, exp_batch, exp_alias):
    """Train an image generator netG against a discriminator netD.

    netD returns two outputs per call: an adversarial score and a steering
    prediction. Each iteration updates netD on generated images, then on
    real images, then updates netG on the adversarial term, the steering
    term and an L1 reconstruction term weighted by ``g_conf.L1``.

    Args:
        gpu: value written to the CUDA_VISIBLE_DEVICES environment variable.
        exp_batch: experiment folder name under ``configs``.
        exp_alias: experiment name; ``configs/<exp_batch>/<exp_alias>.yaml``
            is merged into the global configuration.

    Returns:
        None. Returns early when the monitorer reports the process as
        "Finished".

    Raises:
        ValueError: if ``g_conf.GANMODEL_NAME`` is not one of the two model
            names handled here.
    """
    manualSeed = 123
    torch.cuda.manual_seed(manualSeed)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on, everything printed goes to a per-process log file.
    sys.stdout = open(os.path.join('_output_logs',
                                   g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))
    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE,
        g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                              shuffle=False, num_workers=6,
                                              pin_memory=True)

    l1weight = g_conf.L1

    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD().cuda()
        netG = ganmodels._netG(skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_acgan_nopatch':
        netD = acganmodels_nopatch._netD().cuda()
        netG = acganmodels_nopatch._netG(skip=g_conf.SKIP).cuda()
    else:
        # Previously an unknown name fell through and failed with a
        # NameError on netD a few lines below.
        raise ValueError("Unsupported GANMODEL_NAME: %s" % g_conf.GANMODEL_NAME)

    init_weights(netD)
    init_weights(netG)
    print(netD)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
    MSE_loss = torch.nn.MSELoss().cuda()
    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter = 0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0

    netG.train()
    netD.train()
    capture_time = time.time()

    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    for data in data_loader:
        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        fake_inputs = netG(inputs)

        if iteration % 200 == 0:
            imgs_to_save = torch.cat((inputs[:2], fake_inputs[:2]), 0).cpu().data
            vutils.save_image(imgs_to_save,
                              './imgs_' + exp_alias + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=True)
            coil_logger.add_image("Images", imgs_to_save, iteration)

        controls = float_data[:, dataset.controls_position(), :]
        steer = controls[:, 0].cuda()

        ##--------------------Discriminator part!!!!!!!!!!-----------------------
        ## fake
        set_requires_grad(netD, True)
        optimD.zero_grad()

        # detach(): the discriminator update must not reach the generator.
        outputsD_fake_forD, fakeD_steer = netD(fake_inputs.detach())

        labsize = outputsD_fake_forD.size()
        labels_fake = torch.zeros(labsize)  # Fake labels
        label_fake_noise = torch.rand(labels_fake.size()) * 0.1  # Label smoothing
        labels_fake = labels_fake + label_fake_noise
        labels_fake = Variable(labels_fake).cuda()

        # Fix: lossD_fake was used below without ever being assigned, which
        # raised NameError on the first iteration. The smoothed fake labels
        # built just above had no other consumer, so they are used as the
        # regression target here.
        # NOTE(review): confirm this is the intended adversarial objective.
        lossD_fake = MSE_loss(outputsD_fake_forD, labels_fake)
        lossD_fake_aux = MSE_loss(fakeD_steer, steer)

        lossD_fake_total = lossD_fake + lossD_fake_aux
        lossD_fake_total.backward()
        optimD.step()

        ## real
        set_requires_grad(netD, True)
        optimD.zero_grad()

        outputsD_real_forD, realD_steer = netD(inputs)

        labsize = outputsD_real_forD.size()
        labels_real = torch.ones(labsize)  # Real labels
        label_real_noise = torch.rand(labels_real.size()) * 0.1  # Label smoothing
        labels_real = labels_real - label_real_noise
        labels_real = Variable(labels_real).cuda()

        # NOTE(review): the real-sample term is the raw mean of the
        # discriminator output, not a regression against labels_real —
        # left as found; confirm it matches the fake-sample term above.
        lossD_real = torch.mean(outputsD_real_forD)
        lossD_real_aux = MSE_loss(realD_steer, steer)

        # Discriminator updates
        lossD_real_total = lossD_real + lossD_real_aux
        lossD_real_total.backward()
        optimD.step()

        lossD = lossD_real_total + lossD_fake_total

        coil_logger.add_scalar('Aux Real LossD', lossD_real_aux.data, iteration)
        coil_logger.add_scalar('Aux Fake LossD', lossD_fake_aux.data, iteration)
        coil_logger.add_scalar('Total Real LossD', lossD_real_total.data, iteration)
        coil_logger.add_scalar('Total Fake LossD', lossD_fake_total.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data, iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data, iteration)

        ##--------------------Generator part!!!!!!!!!!-----------------------
        set_requires_grad(netD, False)
        optimG.zero_grad()
        outputsD_fake_forG, G_steer = netD(fake_inputs)

        # Generator updates
        # Fix: lossG_adv was used below without ever being assigned
        # (NameError). The generator is scored against the smoothed real
        # labels. NOTE(review): confirm the intended objective.
        lossG_adv = MSE_loss(outputsD_fake_forG, labels_real)
        lossG_smooth = L1_loss(fake_inputs, inputs)
        lossG_aux = MSE_loss(G_steer, steer)
        lossG = lossG_adv + lossG_aux + l1weight * lossG_smooth

        lossG.backward()
        optimG.step()

        coil_logger.add_scalar('Total LossG', lossG.data / len(inputs), iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data, iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data, iteration)
        coil_logger.add_scalar('Aux LossG', lossG_aux.data, iteration)

        # optimization for one iter done!
        # Random sample of the batch whose ground truth/inputs get logged.
        position = random.randint(0, len(float_data) - 1)

        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()

        if lossG.data < best_lossG:
            best_lossG = lossG.data.tolist()
            best_loss_iter = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()

        print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(),
              "BestLossD", best_lossD, "BestLossG", best_lossG,
              "Iteration", iteration, "Best Loss Iteration", best_loss_iter)

        coil_logger.add_message(
            'Iterating',
            {'Iteration': iteration,
             'LossD': lossD.data.tolist(),
             'LossG': lossG.data.tolist(),
             'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
             'BestLossD': best_lossD,
             'BestLossIteration': best_loss_iter,
             'BestLossG': best_lossG,
             'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(),
             'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()},
            iteration)

        if is_ready_to_save(iteration):
            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            torch.save(state,
                       os.path.join('/datatmp/Experiments/rohitgan/_logs',
                                    exp_batch, exp_alias, 'checkpoints',
                                    str(iteration) + '.pth'))

        # The generator loss improved on this very iteration: keep a copy
        # of the best model.
        if iteration == best_loss_iter:
            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            torch.save(state,
                       os.path.join('/datatmp/Experiments/rohitgan/_logs',
                                    exp_batch, exp_alias,
                                    'best_modelG' + '.pth'))

        iteration += 1
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12):
    """
    Run the encoder training loop for one experiment.

    Args:
        gpu: value written to the CUDA_VISIBLE_DEVICES environment variable
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: when true, stdout and stderr are redirected to
            files under ``_output_logs``
        number_of_workers: forwarded to ``select_balancing_strategy`` for
            data loading

    Returns:
        None. KeyboardInterrupt, RuntimeError and any other exception are
        caught and reported through ``coil_logger`` instead of propagating.
    """

    def _stem(path):
        # File name without directory and without its last extension.
        return str(path).split('/')[-1].split('.')[-2]

    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # Merging the yaml into the global configuration; at this point the
        # log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train_encoder')
        # Set the process into loading status.
        coil_logger.add_message('Loading',
                                {'GPU': os.environ["CUDA_VISIBLE_DEVICES"]})

        # we set a seed for this exp
        seed_everything(seed=g_conf.MAGICAL_SEED)

        # Put the output to a separate file if it is the case
        if suppress_output:
            if not os.path.exists('_output_logs'):
                os.mkdir('_output_logs')
            out_name = (exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                        + str(os.getpid()) + ".out")
            err_name = (exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                        + str(os.getpid()) + ".out")
            sys.stdout = open(os.path.join('_output_logs', out_name),
                              "a", buffering=1)
            sys.stderr = open(os.path.join('_output_logs', err_name),
                              "a", buffering=1)

        # Preload option
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            preload_path = os.path.join(
                '_logs', g_conf.PRELOAD_MODEL_BATCH,
                g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints',
                str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth')
            checkpoint = torch.load(preload_path)

        # Get the latest checkpoint to be loaded;
        # None if there are no checkpoints saved for this model.
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is None:
            iteration = 0
            best_loss = 1000000000.0
            best_loss_iter = 0
        else:
            # A checkpoint of this experiment takes precedence over the
            # preloaded one.
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(get_latest_saved_checkpoint())))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']

        # By instantiating the augmenter we get a callable that augments
        # images and transforms them into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # The preload name is built from the first experience file and,
        # when there is more than one, the second as well.
        json_file_name = _stem(g_conf.EXPERIENCE_FILE[0])
        if len(g_conf.EXPERIENCE_FILE) != 1:
            json_file_name = (json_file_name + '_'
                              + _stem(g_conf.EXPERIENCE_FILE[1]))

        dataset = CoILDataset(
            transform=augmenter,
            preload_name=(g_conf.PROCESS_NAME + '_' + json_file_name + '_'
                          + g_conf.DATA_USED))
        print ("Loaded dataset")

        data_loader = select_balancing_strategy(dataset, iteration,
                                                number_of_workers)

        encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                     g_conf.ENCODER_MODEL_CONFIGURATION)
        encoder_model.cuda()
        encoder_model.train()
        print(encoder_model)

        optimizer = optim.Adam(encoder_model.parameters(),
                               lr=g_conf.LEARNING_RATE)

        resuming = (checkpoint_file is not None
                    or g_conf.PRELOAD_MODEL_ALIAS is not None)
        if resuming:
            encoder_model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print ("Before the loss")

        if g_conf.ENCODER_MODEL_TYPE in ['ETE']:
            criterion = Loss(g_conf.LOSS_FUNCTION)

        def _checkpoint_state():
            # Snapshot of everything needed to resume training.
            return {
                'iteration': iteration,
                'state_dict': encoder_model.state_dict(),
                'best_loss': best_loss,
                'total_time': accumulated_time,
                'optimizer': optimizer.state_dict(),
                'best_loss_iter': best_loss_iter
            }

        checkpoints_dir = os.path.join('_logs', exp_batch, exp_alias,
                                       'checkpoints')

        for data in data_loader:
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            capture_time = time.time()
            encoder_model.zero_grad()

            # ENCODER_MODEL_TYPE can be:
            #  - one-step-affordances: input RGB images, compute affordances loss.
            #  - ETE: input RGB images and speed, compute action loss
            #    (steering, throttle, brake)
            #  - stdim: input two consecutive RGB images, compute the feature loss
            #  - action_prediction: input two consecutive RGB images, compute
            #    action classification loss
            #  - forward: input two consecutive RGB images, compute
            #    action loss + feature loss
            model_type = g_conf.ENCODER_MODEL_TYPE

            if model_type in ['one-step-affordances']:
                loss_function_params = {
                    # hazard stop, red_light....
                    'classification_gt': dataset.extract_affordances_targets(
                        data, 'classification').cuda(),
                    'class_weights': g_conf.AFFORDANCES_CLASS_WEIGHT,
                    'regression_gt': dataset.extract_affordances_targets(
                        data, 'regression').cuda(),
                    'variable_weights': g_conf.AFFORDANCES_VARIABLE_WEIGHT
                }
                # we input RGB images, speed and command to train affordances
                loss = encoder_model(
                    torch.squeeze(data['rgb'].cuda()),
                    dataset.extract_inputs(data).cuda(),
                    torch.squeeze(dataset.extract_commands(data).cuda()),
                    loss_function_params)

                # Keep a snapshot of the model before its first update.
                if iteration == 0:
                    torch.save(_checkpoint_state(),
                               os.path.join(checkpoints_dir, 'inital.pth'))

            elif model_type in ['forward']:
                # We sample another batch to avoid the superposition
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                # We also add measurements and commands
                loss, loss_other, loss_ete = encoder_model(
                    inputs_data,
                    dataset.extract_inputs(data),
                    dataset.extract_commands(data),
                    dataset.extract_targets(data)[0].cuda())

            elif model_type in ['ETE']:
                branches = encoder_model(
                    torch.squeeze(data['rgb'].cuda()),
                    dataset.extract_inputs(data).cuda(),
                    torch.squeeze(dataset.extract_commands(data).cuda()))
                loss_function_params = {
                    'branches': branches,
                    # steer, throttle, brake
                    'targets': dataset.extract_targets(data).cuda(),
                    # speed
                    'inputs': dataset.extract_inputs(data).cuda(),
                    'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                    'variable_weights': g_conf.VARIABLE_WEIGHT
                }
                loss, _ = criterion(loss_function_params)

            elif model_type in ['stdim']:
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                # We also add measurements and commands
                loss, _, _ = encoder_model(
                    inputs_data,
                    dataset.extract_inputs(data),
                    dataset.extract_commands(data))

            elif model_type in ['action_prediction']:
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                # We also add measurements and commands
                loss, _, _ = encoder_model(
                    inputs_data,
                    dataset.extract_inputs(data),
                    dataset.extract_commands(data),
                    dataset.extract_targets(data)[0].cuda())

            else:
                raise ValueError("The encoder model type is not know")

            # Every supported model type finishes with the same update.
            loss.backward()
            optimizer.step()

            # Saving the model if necessary
            if is_ready_to_save(iteration):
                torch.save(_checkpoint_state(),
                           os.path.join(checkpoints_dir,
                                        str(iteration) + '.pth'))

            iteration += 1

            # Adding tensorboard logs and making calculations for logging
            # purposes. These logs are monitored by the printer module.
            if model_type in ['stdim', 'action_prediction', 'forward']:
                coil_logger.add_scalar('Loss', loss.data, iteration)
                coil_logger.add_image('f_t', torch.squeeze(data['rgb'][0]),
                                      iteration)
                coil_logger.add_image('f_ti', torch.squeeze(data['rgb'][1]),
                                      iteration)
            elif model_type in ['one-step-affordances', 'ETE']:
                coil_logger.add_scalar('Loss', loss.data, iteration)
                coil_logger.add_image('Image', torch.squeeze(data['rgb']),
                                      iteration)

            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            accumulated_time += time.time() - capture_time

            coil_logger.add_message(
                'Iterating',
                {'Iteration': iteration,
                 'Loss': loss.data.tolist(),
                 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                 'BestLoss': best_loss,
                 'BestLossIteration': best_loss_iter},
                iteration)
            loss_window.append(loss.data.tolist())
            coil_logger.write_on_error_csv('train', loss.data)

            if iteration % 100 == 0:
                print('Train Iteration: {} [{}/{} ({:.0f}%)] \t Loss: {:.6f}'.format(
                    iteration, iteration, g_conf.NUMBER_ITERATIONS,
                    100. * iteration / g_conf.NUMBER_ITERATIONS, loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
metavar='WIDTHxHEIGHT', default='1280x720', help='window resolution (default: 1280x720)') argparser.add_argument( '-o', '--output_folder', metavar='P', default=None, type=str, help= 'The folder to store images received by the network and its activations' ) args = argparser.parse_args() args.width, args.height = [int(x) for x in args.res.split('x')] merge_with_yaml(os.path.join('configs', args.folder, args.exp + '.yaml')) checkpoint = torch.load( os.path.join('_logs', args.folder, args.exp, 'checkpoints', str(args.checkpoint) + '.pth')) agent = CoILAgent(checkpoint, '_', args.carla_version) # Decide the version if args.carla_version == '0.9': try: sys.path.append( glob.glob( '**/carla-*%d.%d-%s.egg' % (sys.version_info.major, sys.version_info.minor, 'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0]) except IndexError:
def execute(gpu, exp_batch, exp_alias):
    """Train the 'LSDcontrol' generator/discriminator pair on the CoIL dataset.

    Merges the experiment yaml into the global configuration, redirects
    stdout to a per-process file under ``_output_logs`` and then performs,
    for every batch, one discriminator update followed by one generator
    update. Losses are logged through ``coil_logger`` and checkpoints are
    written whenever ``is_ready_to_save`` says so or a new best generator
    loss is reached.

    Args:
        gpu: Value assigned to the CUDA_VISIBLE_DEVICES environment variable.
        exp_batch: Folder under ``configs`` that contains the experiment yaml.
        exp_alias: Experiment name, i.e. the yaml file name without extension.

    Returns:
        None. Returns early when the monitorer reports the process as
        "Finished".

    Raises:
        ValueError: If ``g_conf.GANMODEL_NAME`` is not 'LSDcontrol', the only
            architecture this function knows how to build.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on everything printed goes to a per-process log file.
    sys.stdout = open(os.path.join('_output_logs',
                                   g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)

    # Resize takes the target size as ONE argument; the second positional
    # argument is the interpolation mode, so the size must be a tuple.
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([
                              transforms.ToPILImage(),
                              transforms.Resize((128, 128)),
                              transforms.ToTensor(),
                              transforms.Normalize([0.5, 0.5, 0.5], [1.0, 1.0, 1.0])]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)

    data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                              shuffle=False, num_workers=6, pin_memory=True)

    # Weight of the L1 reconstruction term in the generator loss.
    l1weight = 1.0

    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD().cuda()
        netG = ganmodels._netG(skip=g_conf.SKIP).cuda()
    else:
        # Fail with an explicit message instead of a NameError further down.
        raise ValueError("Unsupported GANMODEL_NAME: " + str(g_conf.GANMODEL_NAME))

    init_weights(netD)
    init_weights(netG)
    print(netD)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
    MSE_loss = torch.nn.MSELoss().cuda()
    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter = 0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0

    # NOTE(review): both networks are switched to eval mode before being
    # optimised; kept as in the original, confirm this is intended.
    netG.eval()
    netD.eval()

    # NOTE(review): checkpoints go to a hard-coded absolute location.
    logs_root = '/datatmp/Experiments/rohitgan/_logs'

    capture_time = time.time()
    for data in data_loader:
        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)

        # Forward pass of the generator.
        fake_inputs = netG(inputs)

        if iteration % 1000 == 0:
            coil_logger.add_image("Images", torch.cat((inputs[:3], fake_inputs[:3]), 0),
                                  iteration)

        # -------------------- Discriminator update --------------------
        set_requires_grad(netD, True)
        optimD.zero_grad()

        # Fake samples: detached so no gradient reaches the generator.
        outputsD_fake_forD = netD(fake_inputs.detach())
        labsize = outputsD_fake_forD.size()
        # Patch-style labels: zeros perturbed by uniform noise in [-0.25, 0.25).
        labels_fake = torch.zeros(labsize[0], labsize[1], labsize[2], labsize[3])
        label_fake_noise = torch.rand(labels_fake.size()) * 0.5 - 0.25
        labels_fake = labels_fake + label_fake_noise
        labels_fake = Variable(labels_fake).cuda()
        lossD_fake = MSE_loss(outputsD_fake_forD, labels_fake)

        # Real samples: ones perturbed by the same kind of noise.
        outputsD_real = netD(inputs)
        labsize = outputsD_real.size()
        labels_real = torch.ones(labsize[0], labsize[1], labsize[2], labsize[3])
        label_real_noise = torch.rand(labels_real.size()) * 0.5 - 0.25
        labels_real = labels_real + label_real_noise
        labels_real = Variable(labels_real).cuda()
        lossD_real = MSE_loss(outputsD_real, labels_real)

        lossD = (lossD_real + lossD_fake) * 0.5
        lossD /= len(inputs)
        lossD.backward()
        optimD.step()

        coil_logger.add_scalar('Total LossD', lossD.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data / len(inputs), iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data / len(inputs), iteration)

        # -------------------- Generator update --------------------
        # TODO change decoder architecture
        # TODO check norms of gradients later
        # TODO add auxiliary regression loss for steering
        set_requires_grad(netD, False)
        optimG.zero_grad()
        outputsD_fake_forG = netD(fake_inputs)

        # The generator is scored against the (noisy) real labels.
        lossG_adv = MSE_loss(outputsD_fake_forG, labels_real)
        lossG_smooth = L1_loss(fake_inputs, inputs)
        lossG = lossG_adv + l1weight * lossG_smooth
        lossG /= len(inputs)
        lossG.backward()
        optimG.step()

        coil_logger.add_scalar('Total LossG', lossG.data, iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data / len(inputs), iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data / len(inputs), iteration)

        # Index of the sample whose targets/inputs are logged.
        position = random.randint(0, len(float_data) - 1)

        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()

        if lossG.data < best_lossG:
            best_lossG = lossG.data.tolist()
            best_loss_iter = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()

        print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(),
              "BestLossD", best_lossD, "BestLossG", best_lossG,
              "Iteration", iteration, "Best Loss Iteration", best_loss_iter)

        coil_logger.add_message(
            'Iterating',
            {'Iteration': iteration,
             'LossD': lossD.data.tolist(),
             'LossG': lossG.data.tolist(),
             'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
             'BestLossD': best_lossD,
             'BestLossG': best_lossG,
             'BestLossIteration': best_loss_iter,
             'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(),
             'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()},
            iteration)

        save_periodic = is_ready_to_save(iteration)
        save_best = iteration == best_loss_iter
        if save_periodic or save_best:
            # The same snapshot is used for both kinds of checkpoint.
            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            if save_periodic:
                torch.save(state, os.path.join(logs_root, exp_batch, exp_alias,
                                               'checkpoints', str(iteration) + '.pth'))
            if save_best:
                torch.save(state, os.path.join(logs_root, exp_batch, exp_alias,
                                               'best_modelG' + '.pth'))

        iteration += 1
def execute(gpu, exp_batch, exp_alias, drive_conditions, params):
    """
    Entry point of the drive process: benchmark checkpoints of one experiment.

    The experiment suite class is imported dynamically from
    ``drive.suites.<snake_case_name>_suite``. Depending on
    ``g_conf.FINISH_ON_VALIDATION_STALE`` either a single checkpoint (the one
    reported by ``validation_stale_point``) or every checkpoint of
    ``g_conf.TEST_SCHEDULE`` is benchmarked.

    Args:
        gpu: value written to CUDA_VISIBLE_DEVICES
        exp_batch: folder under 'configs' / '_logs' holding the experiment
        exp_alias: experiment name (yaml file name without extension)
        drive_conditions: '<SuiteName>_<TownName>' string
        params: dict; 'suppress_output' is read here, the whole dict is
            forwarded to driving_benchmark

    Returns:
        None. Errors are reported through coil_logger instead of propagating.
    """
    try:
        print("Running ", __file__, " On GPU ", gpu, "of experiment name ", exp_alias)
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        config_path = os.path.join('configs', exp_batch, exp_alias + '.yaml')
        merge_with_yaml(config_path)

        exp_set_name, town_name = drive_conditions.split('_')

        # Resolve the suite class by name and instantiate it.
        suite_module_name = 'drive.suites.' + camelcase_to_snakecase(exp_set_name) + '_suite'
        suite_module = __import__(suite_module_name, fromlist=[exp_set_name])
        suite_class = getattr(suite_module, exp_set_name)
        experiment_set = suite_class()

        set_type_of_process('drive', drive_conditions)

        if params['suppress_output']:
            # stdout and stderr each go to their own per-process file.
            pid_suffix = '_' + str(os.getpid()) + ".out"
            sys.stdout = open(
                os.path.join('_output_logs', g_conf.PROCESS_NAME + pid_suffix),
                "a", buffering=1)
            sys.stderr = open(
                os.path.join('_output_logs',
                             exp_alias + '_err_' + g_conf.PROCESS_NAME + pid_suffix),
                "a", buffering=1)

        coil_logger.add_message(
            'Loading', {'Poses': experiment_set.build_experiments()[0].poses})

        control_filename = 'control_output_auto' if g_conf.USE_ORACLE else 'control_output'

        # ----- Preparing the output files that will contain the driving summary -----
        experiment_list = experiment_set.build_experiments()
        # Uniquely named tasks of the suite.
        task_list = unique([experiment.task_name for experiment in experiment_list])

        # All tasks become ready together, so the first one is used as probe.
        probe_name = control_filename + '_' + task_list[0]

        latest = get_latest_evaluated_checkpoint(probe_name)
        if latest is None:
            # Nothing evaluated yet.
            latest = 0

        file_base = os.path.join('_logs', exp_batch, exp_alias,
                                 g_conf.PROCESS_NAME + '_csv', control_filename)

        # One summary header per task.
        for task_name in task_list:
            write_header_control_summary(file_base, task_name)

        stale_setting = g_conf.FINISH_ON_VALIDATION_STALE
        if stale_setting is not None:
            # ----- Single benchmark on the checkpoint where validation went stale -----
            while validation_stale_point(g_conf.FINISH_ON_VALIDATION_STALE) is None:
                time.sleep(0.1)

            validation_state_iteration = validation_stale_point(
                g_conf.FINISH_ON_VALIDATION_STALE)
            driving_benchmark(validation_state_iteration, gpu, town_name, experiment_set,
                              exp_batch, exp_alias, params, control_filename, task_list)
        else:
            # ----- Main loop: one benchmark per checkpoint of the test schedule -----
            while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
                if not is_next_checkpoint_ready(g_conf.TEST_SCHEDULE, probe_name):
                    time.sleep(0.1)
                    continue

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE, probe_name)
                driving_benchmark(latest, gpu, town_name, experiment_set, exp_batch,
                                  exp_alias, params, control_filename, task_list)

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something happened'})
def execute(gpu, exp_batch, exp_alias):
    """Validate every checkpoint of the test schedule and write outputs to csv.

    For each checkpoint that becomes available the weights are loaded into
    the model, the whole dataset is pushed through ``forward_branch`` and the
    three first outputs of every sample are appended to the csv kept by
    ``coil_logger``.

    Args:
        gpu: Value assigned to the CUDA_VISIBLE_DEVICES environment variable.
        exp_batch: Experiment folder (also the sub-folder of ``_logs``).
        exp_alias: Experiment name.

    Returns:
        None. Returns early when the monitorer reports "Finished".
    """
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join(exp_batch, exp_alias + '.yaml'))
    set_type_of_process('validation')

    sys.stdout = open(str(os.getpid()) + ".out", "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias, g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not ?
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.DATASET_NAME)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))

    # TODO: batch size an number of workers go to some configuration file
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=120,
                                              shuffle=False, num_workers=12,
                                              pin_memory=True)

    # TODO: here there is clearly a posibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()

    # TODO: The checkpoint will continue, so the logs should restart ??? OR continue were it was
    latest = get_latest_evaluated_checkpoint()
    if latest is None:
        # When nothing was tested, get latest returns none, we fix that.
        latest = 0

    print(dataset.meta_data)

    # Rows of the label tensor holding the control command and the speed.
    # They depend only on the dataset, so they are looked up once.
    control_position = np.where(dataset.meta_data[:, 0] == 'control')[0][0]
    speed_position = np.where(dataset.meta_data[:, 0] == 'speed_module')[0][0]
    print(control_position)
    print(speed_position)

    while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):

        if not is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
            time.sleep(1)
            print("Waiting for the next Validation")
            continue

        latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

        checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                             'checkpoints', str(latest) + '.pth'))
        checkpoint_iteration = checkpoint['iteration']

        # Without this the model would be evaluated with its initial weights.
        # NOTE(review): assumes the training process stores the weights under
        # 'state_dict' - confirm against the code that writes the checkpoint.
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()
        print("Validation loaded ", checkpoint_iteration)

        for data in data_loader:
            input_data, labels = data
            images = torch.squeeze(input_data['rgb'])
            print(images.shape)

            # Obs : Maybe we could also check for other branches ??
            output = model.forward_branch(images.cuda(),
                                          labels[:, speed_position, :].cuda(),
                                          labels[:, control_position, :].cuda())

            # TODO: clean this squeeze and dimension things
            for i in range(input_data['rgb'].shape[0]):
                coil_logger.write_on_csv(checkpoint_iteration,
                                         [output[i][0], output[i][1], output[i][2]])
def plot_folder_summaries(exp_batch, train, validation_datasets, drive_environments,
                          verbose=False):
    """
    Main plotting function for the folder mode.

    Clears the terminal and, for every experiment yaml found in
    ``configs/<exp_batch>``, prints the status of each monitored process
    (train, one validation per dataset, one drive per environment) followed
    by a summary when the process is iterating. A process whose status can
    not be read is reported with a traceback and skipped, so one broken log
    never stops the listing.

    Args:
        exp_batch: The exp batch (folder) being plotted on the screen.
        train: If train process is being printed
        validation_datasets: The validation datasets being computed
        drive_environments: The driving environments/ Benchmarks
        verbose: forwarded to print_validation_summary

    Returns:
        None

    Raises:
        ValueError: if a file in the folder yields an empty experiment name.
    """
    os.system('clear')

    process_names = []
    if train:
        process_names.append('train')

    for val in validation_datasets:
        process_names.append('validation' + '_' + val)

    # We save the drive files to be used later
    drive_files = {}
    for drive in drive_environments:
        drive_process = 'drive' + '_' + drive.split('/')[-1].split('.')[0]
        drive_files[drive_process] = drive
        process_names.append(drive_process)

    experiments_list = os.listdir(os.path.join('configs', exp_batch))
    experiments_list = [experiment.split('.')[-2] for experiment in experiments_list]

    # Experiments are shown ordered by their generated names.
    names_list = get_names(exp_batch)
    sorted_keys = sorted(range(len(names_list)),
                         key=lambda k: names_list[experiments_list[k] + '.yaml'])

    for key in sorted_keys:
        experiment = experiments_list[key]
        generated_name = names_list[experiment + '.yaml']

        if experiment == '':
            raise ValueError("Empty Experiment on List")

        g_conf.immutable(False)
        merge_with_yaml(os.path.join('configs', exp_batch, experiment + '.yaml'))

        print(BOLD + experiment + ' : ' + generated_name + END)

        for process in process_names:
            try:
                output = get_status(exp_batch, experiment, process)
            except Exception:
                # The printing should not stop for any error on reading, but
                # there is nothing to show for this process either: going on
                # would use an unbound (or the previous process') output.
                import traceback
                traceback.print_exc()
                continue

            status = output[0]
            summary = output[1]
            print(' ', process)

            if status == 'Not Started':
                print(' STATUS: ', BOLD + status + END)
            elif status == 'Loading':
                print(' STATUS: ', YELLOW + status + END, ' - ', YELLOW + summary + END)
            elif status == 'Iterating':
                print(' STATUS: ', YELLOW + status + END)
            elif status == 'Finished':
                print(' STATUS: ', GREEN + status + END)
            elif status == 'Error':
                print(' STATUS: ', RED + status + END, ' - ', RED + summary + END)

            if status != 'Iterating':
                continue

            if 'train' == process:
                print_train_summary(summary[status])

            if 'validation' in process:
                if summary[1] != '':  # If it has no summary we don't plot
                    print_validation_summary(summary[0][status],
                                             summary[1][status]['Summary'], verbose)
                else:
                    print_validation_summary(summary[0][status], '', verbose)

            if 'drive' in process:
                if 'Agent' not in summary[status]:
                    continue
                checkpoint = summary[status]['Checkpoint']
                # The results from completed iterations are read from the
                # json file identified by this name.
                agent_checkpoint_name = str(exp_batch) + '_' + str(experiment) + '_' \
                    + str(checkpoint)
                print_drive_summary(drive_files[process], agent_checkpoint_name, checkpoint)
def execute(gpu, exp_batch, exp_alias, state_dict, suppress_output=True,
            number_of_workers=12):
    """
    The main training function.

    Loads the latest checkpoint of the experiment (or the configured preload
    checkpoint) and continues training from there, or starts from scratch
    when none exists. When ``state_dict`` is given, every batch is first
    passed through a frozen ERFNet_Fast segmentation model and the CoIL model
    is trained on its output instead of on the raw images.

    Args:
        gpu: The GPU number
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        state_dict: path to the segmentation model weights; '' disables the
            segmentation step
        suppress_output: if the output are going to be saved on a file
        number_of_workers: the number of threads used for data loading

    Returns:
        None. Errors are reported through coil_logger instead of propagating.
    """
    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}

        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train')

        # Set the process into loading status.
        coil_logger.add_message('Loading', {'GPU': gpu})

        # Put the output to a separate file if it is the case
        if suppress_output:
            if not os.path.exists('_output_logs'):
                os.mkdir('_output_logs')
            sys.stdout = open(
                os.path.join('_output_logs',
                             exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                             + str(os.getpid()) + ".out"),
                "a", buffering=1)
            sys.stderr = open(
                os.path.join('_output_logs',
                             exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                             + str(os.getpid()) + ".out"),
                "a", buffering=1)

        if coil_logger.check_finish('train'):
            coil_logger.add_message('Finished', {})
            return

        # Preload option
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            checkpoint = torch.load(
                os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                             g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints',
                             str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth'))

        # Get the latest checkpoint to be loaded
        # returns none if there are no checkpoints saved for this model
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            # Reuse the name already fetched rather than asking again.
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            best_loss_iter = 0

        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                    g_conf.TRAIN_DATASET_NAME)

        # By instantiating the augmenter we get a callable that augment images
        # and transform them into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_name=str(g_conf.NUMBER_OF_HOURS) + 'hours_'
                              + g_conf.TRAIN_DATASET_NAME)
        print("Loaded dataset")

        data_loader = select_balancing_strategy(dataset, iteration, number_of_workers)
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        model.cuda()

        use_segmentation = state_dict != ''
        if use_segmentation:
            seg_model = ERFNet_Fast(2)
            seg_model = load_my_state_dict(seg_model, torch.load(state_dict))
            seg_model.cuda()

        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)

        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print("Before the loss")

        criterion = Loss(g_conf.LOSS_FUNCTION)
        color_transforms = Colorizes(2)
        board = Dashboard(8097)

        for data in data_loader:

            # In this mode of execution the validation process writes a stop
            # marker that ends the training.
            if g_conf.FINISH_ON_VALIDATION_STALE is not None and \
                    check_loss_validation_stopped(iteration,
                                                  g_conf.FINISH_ON_VALIDATION_STALE):
                break

            # -------------------- Main optimization loop --------------------
            iteration += 1
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            capture_time = time.time()
            controls = data['directions']
            model.zero_grad()

            if use_segmentation:
                with torch.no_grad():
                    repre = seg_model(torch.squeeze(data['rgb'].cuda()), only_encode=False)
                    inputs = repre
                    imgs = color_transforms(inputs)
                inputs = inputs.float().cuda()
            else:
                inputs = torch.squeeze(data['rgb'].cuda())
                # No segmentation output exists on this path.
                imgs = None

            # vis
            board.image(torch.squeeze(data['rgb'])[0].cpu().data,
                        '(train) input iter: ' + str(iteration))
            if imgs is not None:
                # Only drawn when the segmentation model produced something;
                # unconditional access raised NameError without state_dict.
                board.image(imgs[0].cpu().data,
                            '(train) output iter: ' + str(iteration))

            branches = model(inputs, dataset.extract_inputs(data).cuda())
            loss_function_params = {
                'branches': branches,
                'targets': dataset.extract_targets(data).cuda(),
                'controls': controls.cuda(),
                'inputs': dataset.extract_inputs(data).cuda(),
                'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                'variable_weights': g_conf.VARIABLE_WEIGHT
            }
            loss, _ = criterion(loss_function_params)
            loss.backward()
            optimizer.step()

            # -------------------- Saving the model if necessary --------------------
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(state,
                           os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                        str(iteration) + '.pth'))

            # -------------------- Logging --------------------
            # These logs are monitored by the printer module.
            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration)
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            # Log a random position
            # NOTE(review): len(data) is the length of the batch container,
            # not necessarily the batch size - confirm.
            position = random.randint(0, len(data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(data).cuda())

            accumulated_time += time.time() - capture_time

            coil_logger.add_message(
                'Iterating',
                {'Iteration': iteration,
                 'Loss': loss.data.tolist(),
                 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                 'BestLoss': best_loss,
                 'BestLossIteration': best_loss_iter,
                 'Output': output[position].data.tolist(),
                 'GroundTruth': dataset.extract_targets(data)[position].data.tolist(),
                 'Error': error[position].data.tolist(),
                 'Inputs': dataset.extract_inputs(data)[position].data.tolist()},
                iteration)
            loss_window.append(loss.data.tolist())
            coil_logger.write_on_error_csv('train', loss.data)
            print("Iteration: %d  Loss: %f" % (iteration, loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
# Remaining command line options; all of them are mandatory strings.
for _flag, _description in (('--checkpoint', 'saved model checkpoint'),
                            ('--gradcam_path', 'path to save gradcam heatmap'),
                            ('--type', 'type of evaluation')):
    parser.add_argument(_flag, type=str, required=True, help=_description)

args = parser.parse_args()

# The global configuration is filled from the yaml given on the command line.
merge_with_yaml(args.config)

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

# The augmenter is built with None as its only argument.
augmenter = Augmenter(None)
dataset = CoILDataset(args.dataset_path,
                      transform=augmenter,
                      preload_name=args.preload_name)

# Batches are read in dataset order.
_loader_options = {
    'batch_size': g_conf.BATCH_SIZE,
    'shuffle': False,
    'num_workers': g_conf.NUMBER_OF_LOADING_WORKERS,
    'pin_memory': True,
}
dataloader = torch.utils.data.DataLoader(dataset, **_loader_options)
def setup(self, path_to_config_file):
    """
    Load the checkpoint(s) named by the agent configuration file and build
    the driving model.

    The configuration file is parsed into the experiment yaml, the checkpoint
    number, the agent name and optional encoder parameters. When encoder
    parameters are present, a pre-trained encoder model is loaded as well and
    the main checkpoint is taken from the folder suffixed with the encoder
    checkpoint number. All paths are resolved relative to the directory two
    levels above this source file.

    Args:
        path_to_config_file: path handed to checkpoint_parse_configuration_file
    """
    yaml_conf, checkpoint_number, agent_name, encoder_params = \
        checkpoint_parse_configuration_file(path_to_config_file)

    # Directory two levels above this file; '_logs' and the yaml live there.
    source_parts = os.path.realpath(__file__).split('/')[:-2]
    project_root = os.path.join('/', os.path.join(*source_parts))

    exp_folder = yaml_conf.split('/')[-2]
    exp_name = yaml_conf.split('/')[-1].split('.')[-2]
    checkpoint_name = str(checkpoint_number) + '.pth'

    # Take the checkpoint name and load it
    if encoder_params is None:
        self.checkpoint = torch.load(
            os.path.join(project_root, '_logs', exp_folder, exp_name,
                         'checkpoints', checkpoint_name))
    else:
        encoder_checkpoint_id = str(encoder_params['encoder_checkpoint'])
        self.checkpoint = torch.load(
            os.path.join(project_root, '_logs', exp_folder,
                         exp_name + '_' + encoder_checkpoint_id,
                         'checkpoints', checkpoint_name))

        # The pre-trained encoder is used to extract the bottleneck that
        # drives the E-t-E agent.
        self.encoder_checkpoint = torch.load(
            os.path.join(project_root, '_logs',
                         encoder_params['encoder_folder'],
                         encoder_params['encoder_exp'],
                         'checkpoints',
                         str(encoder_params['encoder_checkpoint']) + '.pth'))

        encoder = CoILModel(g_conf.ENCODER_MODEL_TYPE,
                            g_conf.ENCODER_MODEL_CONFIGURATION)
        self.encoder_model = encoder
        encoder.load_state_dict(self.encoder_checkpoint['state_dict'])
        encoder.cuda()
        encoder.eval()

    # do the merge here
    # TODO THE MERGE IS REQUIRED DEPENDING ON THE SITUATION
    g_conf.immutable(False)
    merge_with_yaml(os.path.join(project_root, yaml_conf), encoder_params)

    self._model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION,
                            g_conf.ENCODER_MODEL_CONFIGURATION)
    self.first_iter = True
    logging.info("Setup Model")

    # Load the model and prepare set it for evaluation
    self._model.load_state_dict(self.checkpoint['state_dict'])
    self._model.cuda()
    self._model.eval()

    self.latest_image = None
    self.latest_image_tensor = None

    # We add more time to the curve commands
    self._expand_command_front = 5
    self._expand_command_back = 3

    # TODO: Merge with Felipe's code
    self._msn = None
    self._lat_ref = 0
    self._lon_ref = 0

    # Check the agent name
    self._name = agent_name
    self.count = 0
def execute(gpu, exp_batch, exp_alias, validation_dataset, suppress_output):
    """
    Validation process: score every checkpoint of the test schedule.

    For each checkpoint that becomes ready, the model weights are loaded and
    the whole validation dataset is evaluated with the training loss. The
    running average loss is logged, appended to the loss window and written
    to the error csv. With ``g_conf.FINISH_ON_VALIDATION_STALE`` set, a stop
    marker is written (and the loop left) once the loss window stopped
    decreasing according to both dlib counters.

    Args:
        gpu: value written to CUDA_VISIBLE_DEVICES
        exp_batch: folder under 'configs' / '_logs' holding the experiment
        exp_alias: experiment name (yaml file name without extension)
        validation_dataset: dataset folder name inside COIL_DATASET_PATH
        suppress_output: when true, output is redirected by save_output

    Returns:
        None. Errors are reported through coil_logger; the csv of an
        unfinished checkpoint is erased.
    """
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        config_path = os.path.join('configs', exp_batch, f'{exp_alias}.yaml')
        merge_with_yaml(config_path)

        # The validation dataset is always fully loaded, so a very high number
        # of hours is fixed.
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process(process_type='validation', param=validation_dataset)

        if suppress_output:
            save_output(exp_alias)

        dataset_path = os.path.join(os.environ["COIL_DATASET_PATH"], validation_dataset)
        augmenter = Augmenter(None)
        # Preload name is just the validation data name
        dataset = CoILDataset(dataset_path,
                              transform=augmenter,
                              preload_name=validation_dataset,
                              process_type='validation')

        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION,
                          g_conf.SENSORS).cuda()

        # The window used to keep track of the validation losses
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:
            # Something was evaluated before: recover its losses.
            l1_window = coil_logger.recover_loss_window(validation_dataset, None)

        # Best loss so far and the checkpoint where it happened
        best_loss = 1000
        best_loss_iter = 0

        banner = 20 * '#'
        print(banner)
        print('Starting validation!')
        print(banner)

        while not maximum_checkpoint_reached(latest):

            if not is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
                # Nothing new yet (training may still be running): wait.
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)
                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")
                continue

            latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             f'{latest}.pth'))
            checkpoint_iteration = checkpoint['iteration']

            model.load_state_dict(checkpoint['state_dict'])
            model.eval()  # Turn off dropout and batchnorm (if any)
            print(f"Validation loaded, checkpoint {checkpoint_iteration}")

            # Main metric will be the used loss for training the network
            criterion = Loss(g_conf.LOSS_FUNCTION)
            checkpoint_average_loss = 0

            # Number of batches processed for this checkpoint
            iteration_on_checkpoint = 0
            with torch.no_grad():
                for data in data_loader:
                    controls = data['directions'].cuda()
                    img = torch.squeeze(data['rgb']).cuda()
                    # this might not always be speed
                    speed = dataset.extract_inputs(data).cuda()

                    # For auxiliary metrics
                    output = model.forward_branch(img, speed, controls)

                    # For the loss function
                    branches = model(img, speed)
                    loss_function_params = {
                        'branches': branches,
                        'targets': dataset.extract_targets(data).cuda(),
                        'controls': controls,
                        'inputs': speed,
                        'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                        'variable_weights': g_conf.VARIABLE_WEIGHT
                    }

                    # It could be either waypoints or direct control
                    if 'waypoint1_angle' in g_conf.TARGETS:
                        write_waypoints_output(checkpoint_iteration, output)
                    else:
                        write_regular_output(checkpoint_iteration, output)

                    loss, _ = criterion(loss_function_params)
                    loss = loss.data.tolist()

                    # Log a random position
                    position = random.randint(0, len(output.data.tolist()) - 1)

                    batch_report = {
                        'Checkpoint': latest,
                        'Iteration': f'{iteration_on_checkpoint * g_conf.BATCH_SIZE}/{len(dataset)}',
                        f'Validation Loss ({g_conf.LOSS_FUNCTION})': loss,
                        'Output': output[position].data.tolist(),
                        'GroundTruth': dataset.extract_targets(data)[position].data.tolist(),
                        'Inputs': dataset.extract_inputs(data)[position].data.tolist()
                    }
                    coil_logger.add_message('Iterating', batch_report, latest)

                    # Incremental mean over a growing list of values
                    # (John D. Cook: http://www.johndcook.com/blog/standard_deviation/)
                    iteration_on_checkpoint += 1
                    checkpoint_average_loss += \
                        (loss - checkpoint_average_loss) / iteration_on_checkpoint

                    print(
                        f"\rProgress: {100 * iteration_on_checkpoint * g_conf.BATCH_SIZE / len(dataset):3.4f}% - "
                        f"Average Loss ({g_conf.LOSS_FUNCTION}): {checkpoint_average_loss:.16f}",
                        end='')

            # ----- Finish a round of validation, write results, wait for the next -----
            coil_logger.add_scalar(f'Validation Loss ({g_conf.LOSS_FUNCTION})',
                                   checkpoint_average_loss, latest, True)

            # Let's visualize the distribution of the loss
            coil_logger.add_histogram(
                f'Validation Checkpoint Loss ({g_conf.LOSS_FUNCTION})',
                checkpoint_average_loss, latest)

            if checkpoint_average_loss < best_loss:
                best_loss = checkpoint_average_loss
                best_loss_iter = latest

            round_summary = {
                'Summary': {
                    'Loss': checkpoint_average_loss,
                    'BestLoss': best_loss,
                    'BestLossCheckpoint': best_loss_iter
                },
                'Checkpoint': latest
            }
            coil_logger.add_message('Iterating', round_summary, latest)

            l1_window.append(checkpoint_average_loss)
            coil_logger.write_on_error_csv(validation_dataset, checkpoint_average_loss,
                                           latest)

            # When finishing on stale validation is enabled, check the window
            if g_conf.FINISH_ON_VALIDATION_STALE is not None \
                    and dlib.count_steps_without_decrease(l1_window) > 3 \
                    and dlib.count_steps_without_decrease_robust(l1_window) > 3:
                coil_logger.write_stop(validation_dataset, latest)
                break

        print('\n' + 20 * '#')
        print('Finished validation!')
        print(20 * '#')
        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
def setup(self, path_to_config_file):
    """Prepare the affordances network(s) described by an agent config file.

    Resets the agent bookkeeping attributes, merges the experiment YAML named
    in the configuration file into ``g_conf`` and loads the model weights from
    the ``checkpoints`` folder located next to the configuration file.  For
    the ``'separate-affordances'`` model type an encoder is loaded as well,
    switched to eval mode and has ``requires_grad`` disabled on every
    parameter.  Any other model type leaves the model attributes untouched.

    Args:
        path_to_config_file: '/'-separated path of the agent configuration
            file.  The path is split on '/' and re-rooted at '/': its parent
            is used as the experiment directory and the directory four
            components above the file is used as the root for the YAML file
            and the ``_logs`` folder.

    Raises:
        RuntimeError: if the model type is ``'separate-affordances'`` and the
            configuration file carries no encoder parameters.
    """
    self._agent = None
    self.route_assigned = False
    self.count = 0

    path_parts = path_to_config_file.split('/')
    exp_dir = os.path.join('/', os.path.join(*path_parts[:-1]))
    checkpoints_dir = os.path.join(exp_dir, 'checkpoints')

    parsed = checkpoint_parse_configuration_file(path_to_config_file)
    yaml_conf, checkpoint_number, agent_name, encoder_params = parsed

    # The parser hands back the literal string "None" when no encoder is set.
    if encoder_params == "None":
        encoder_params = None

    g_conf.immutable(False)
    project_root = os.path.join('/', os.path.join(*path_parts[:-4]))
    merge_with_yaml(os.path.join(project_root, yaml_conf), encoder_params)

    model_type = g_conf.MODEL_TYPE
    if model_type == 'one-step-affordances':
        # One-step training: no FC layers to retrain, the encoder output is
        # used directly as the prediction.
        self._model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                   g_conf.ENCODER_MODEL_CONFIGURATION)
    elif model_type == 'separate-affordances':
        if encoder_params is None:
            raise RuntimeError(
                'encoder_params can not be None in MODEL_TYPE --> separate-affordances'
            )

        self.encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                          g_conf.ENCODER_MODEL_CONFIGURATION)
        self.encoder_model.cuda()

        if g_conf.FREEZE_ENCODER:
            # Pre-trained encoder (not fine-tuned), stored under _logs.
            encoder_source = os.path.join('_logs',
                                          encoder_params['encoder_folder'],
                                          encoder_params['encoder_exp'],
                                          'checkpoints')
            encoder_label = str(encoder_params['encoder_checkpoint'])
            encoder_file = os.path.join(project_root, encoder_source,
                                        encoder_label + '.pth')
            encoder_title = "Encoder model "
        else:
            # Fine-tuned encoder, saved alongside the affordances checkpoint.
            encoder_source = checkpoints_dir
            encoder_label = str(checkpoint_number) + '_encoder.pth'
            encoder_file = os.path.join(encoder_source, encoder_label)
            encoder_title = "FINE TUNNED encoder model "

        encoder_checkpoint = torch.load(encoder_file)
        print(encoder_title, encoder_label, "loaded from ", encoder_source)
        self.encoder_model.load_state_dict(encoder_checkpoint['state_dict'])
        self.encoder_model.eval()
        for frozen_param in self.encoder_model.parameters():
            frozen_param.requires_grad = False

        self._model = CoILModel(g_conf.MODEL_TYPE,
                                g_conf.MODEL_CONFIGURATION,
                                g_conf.ENCODER_MODEL_CONFIGURATION)
    else:
        # Other model types are not handled here.
        return

    # Both supported model types restore the main model in the same way.
    weights_name = str(checkpoint_number) + '.pth'
    self.checkpoint = torch.load(os.path.join(checkpoints_dir, weights_name))
    print("Affordances Model ", weights_name, "loaded from ", checkpoints_dir)
    self._model.load_state_dict(self.checkpoint['state_dict'])
    self._model.cuda()
    self._model.eval()
def execute(gpu, exp_batch, exp_alias):
    """Train a CoIL model for one experiment, resuming from the last checkpoint.

    Merges ``<exp_batch>/<exp_alias>.yaml`` into the global configuration,
    redirects ``sys.stdout`` to ``<pid>.out`` and then iterates over the
    training data, optimizing the model with SGD and writing a checkpoint to
    ``_logs/<exp_batch>/<exp_alias>/checkpoints`` whenever
    ``is_ready_to_save(iteration)`` says so.  Returns immediately when the
    monitorer already reports the process as "Finished".

    When a previous checkpoint exists, both the iteration counter and the
    model weights are restored from it.

    Args:
        gpu: value assigned to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: experiment batch (folder) name.
        exp_alias: experiment name; ``.yaml`` is appended to find its config.
    """
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join(exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    # From here on everything printed goes to a per-process file. The handle
    # is deliberately left open for the lifetime of the process.
    sys.stdout = open(str(os.getpid()) + ".out", "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias, g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not ?
        return

    # Define the dataset. This structure has __getitem__ redefined in a way
    # that you can access the HDFILES positions from the root directory as in
    # a vector.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.DATASET_NAME)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))

    # Creates the sampler, this part is responsible for managing the keys. It
    # divides all keys depending on the measurements and produces a set of
    # keys for each batch.
    sampler = CoILSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data))

    # The data loader is the multi threaded module from pytorch that releases
    # a number of workers to get all the data.
    # TODO: batch size and number of workers go to some configuration file
    data_loader = torch.utils.data.DataLoader(dataset, sampler=sampler,
                                              batch_size=120, shuffle=False,
                                              num_workers=12, pin_memory=True)

    # By instantiating the augmenter we get a callable that augments images
    # and transforms them into tensors.
    augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE)

    # TODO: here there is clearly a possibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()
    print(model)

    criterion = Loss()

    # TODO: DATASET SIZE SEEMS WEIRD
    optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

    checkpoint_file = get_latest_saved_checkpoint()
    if checkpoint_file is not None:
        # Reuse the name we just looked up instead of querying a second time.
        checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                             'checkpoints', str(checkpoint_file)))
        iteration = checkpoint['iteration']
        # Restore the weights too; otherwise training would restart from a
        # freshly initialised model while continuing the iteration count.
        model.load_state_dict(checkpoint['state_dict'])
    else:
        iteration = 0

    # TODO: The checkpoint will continue, so the logs should restart ??? OR continue were it was

    print(dataset.meta_data)
    print(model)

    for data in data_loader:
        input_data, labels = data

        # TODO we have to divide the input with other data.
        # TODO, ADD ITERATION SCHEDULE
        input_rgb_data = augmenter(0, input_data['rgb'])

        # get the control commands from labels, size = [120,1]
        controls = labels[:, 24, :]

        # The output(branches) is a list of 5 branches results, each branch is
        # with size [120,3]
        model.zero_grad()
        branches = model(input_rgb_data, labels[:, 10, :].cuda())

        # get the steer, gas and brake ground truth from labels
        steer_gt = labels[:, 0, :]
        gas_gt = labels[:, 1, :]
        brake_gt = labels[:, 2, :]
        speed_gt = labels[:, 10, :]

        targets = torch.cat([steer_gt, gas_gt, brake_gt], 1)

        loss = criterion.MSELoss(branches, targets.cuda(), controls.cuda(),
                                 speed_gt.cuda())
        loss.backward()
        optimizer.step()

        # TODO: save also the optimizer state dictionary
        if is_ready_to_save(iteration):
            state = {
                'iteration': iteration,
                'state_dict': model.state_dict(),
            }
            # TODO : maybe already summarize the best model ???
            torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                           'checkpoints', str(iteration) + '.pth'))

        iteration += 1
def execute(gpu, exp_batch, exp_alias):
    """Train only the generator network with an MSE reconstruction loss.

    Merges ``configs/<exp_batch>/<exp_alias>.yaml`` into the global
    configuration, redirects ``sys.stdout`` to a file under ``_output_logs``
    and then, for every batch, feeds the RGB images (shifted by -0.5) through
    the generator and optimizes it with Adam on the MSE between its output
    and the unshifted images.  Every 200 iterations a small grid of input and
    generated images is written to ``./noganimgs/`` and sent to the logger.
    Returns immediately when the monitorer already reports the process as
    "Finished".

    NOTE: the discriminator, the 'Iterating' log message and checkpoint
    saving are disabled in this variant; only the generator is trained and
    nothing is written to the checkpoints folder.

    Args:
        gpu: value assigned to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: experiment batch (folder) name under ``configs``.
        exp_alias: experiment name; ``.yaml`` is appended to find its config.

    Raises:
        ValueError: if ``g_conf.GANMODEL_NAME`` is not ``'LSDcontrol'``, the
            only generator this routine knows how to build.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})

    # exist_ok avoids the check-then-create race between parallel processes.
    os.makedirs('_output_logs', exist_ok=True)

    # From here on everything printed goes to a per-process file. The handle
    # is deliberately left open for the lifetime of the process.
    sys.stdout = open(os.path.join('_output_logs',
                                   g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)

    data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                              shuffle=False, num_workers=6,
                                              pin_memory=True)

    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netG = ganmodels._netG(skip=g_conf.SKIP).cuda()
    else:
        # Fail with a clear message instead of a NameError on netG below.
        raise ValueError(
            "Unsupported GANMODEL_NAME: " + str(g_conf.GANMODEL_NAME))

    init_weights(netG)
    print(netG)

    optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.7, 0.999))
    MSE_loss = torch.nn.MSELoss().cuda()

    iteration = 0
    best_loss_iter = 0
    best_lossG = 1000000.0
    accumulated_time = 0

    # NOTE(review): the generator is switched to eval mode although it is
    # optimized below -- confirm this is intended.
    netG.eval()

    # save_image does not create missing folders, so make sure it exists.
    sample_dir = './noganimgs/'
    os.makedirs(sample_dir, exist_ok=True)

    # Offset subtracted from the images before they enter the generator.
    val = 0.5

    capture_time = time.time()
    for data in data_loader:
        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        inputs_in = inputs - val

        # forward pass
        fake_inputs = netG(inputs_in)
        fake_inputs_in = fake_inputs

        if iteration % 200 == 0:
            imgs_to_save = torch.cat((inputs_in[:2] + val, fake_inputs_in[:2]), 0).cpu().data
            vutils.save_image(imgs_to_save,
                              sample_dir + str(iteration) + 'noganreal_samples.png',
                              normalize=True)
            coil_logger.add_image("Images", imgs_to_save, iteration)

        optimG.zero_grad()

        print("~~~~~~~~~__________")
        print(inputs_in[0][0][0][0])
        print(fake_inputs[0][0][0][0])
        lossG_mse = MSE_loss(fake_inputs, inputs)
        print(lossG_mse)

        lossG_mse /= len(inputs_in)
        print("~~~~~~~~~__________--------------")

        lossG_mse.backward()
        optimG.step()

        # NOTE(review): the loss was already divided by the batch length
        # above, so the logged scalar is divided twice -- confirm intended.
        coil_logger.add_scalar('MSE LossG', lossG_mse.data / len(inputs_in), iteration)

        # Only consumed by the disabled 'Iterating' message; kept because it
        # draws from the global random generator.
        position = random.randint(0, len(float_data) - 1)

        if lossG_mse.data < best_lossG:
            best_lossG = lossG_mse.data.tolist()
            best_loss_iter = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()

        # Advance the step counter so the 200-iteration image dump and the
        # logged scalars are not all attributed to step 0.
        iteration += 1