def test_check_status_running_iter(self):
    g_conf = GlobalConfig()
    g_conf.param.NAME = 'experiment_running_iter'
    # TODO: this merge is weird.
    g_conf.merge_with_yaml('configs/monitor_test/experiment_running_iter.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    g_conf.set_type_of_process('train')

    coil_logger.add_message('Loading',
                            {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]})
    coil_logger.add_message('Loading',
                            {"Models_loaded": ' VUALA ', "Checkpoint": "988765"})

    for i in range(0, 10):
        coil_logger.add_message('Reading',
                                {"Iteration": i, "ReadKeys": [1, 123, 5, 1, 34, 1, 23]})
        coil_logger.add_message('Model',
                                {"Iteration": i, "Output": ["output"]})

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_running_iter',
                                  g_conf.param.PROCESS_NAME)
    self.assertEqual(status[0], "Iterating")
def test_check_status_not_existent(self):
    # Check that status can be queried for a nonexistent experiment.
    g_conf = GlobalConfig()
    status = monitorer.get_status('monitor_test', 'experiment_25',
                                  g_conf.param.PROCESS_NAME)
    self.assertEqual(status[0], "Does Not Exist")
def test_check_status_not_existent(self):
    # Check that status can be queried for a nonexistent experiment.
    g_conf.immutable(False)
    status = monitorer.get_status('monitor_test', 'experiment_25.yaml',
                                  g_conf.PROCESS_NAME)
    self.assertEqual(status[0], "Does Not Exist")
def test_check_status_to_run(self):
    # Check for an experiment that exists in the config files but has not been started.
    g_conf = GlobalConfig()
    g_conf.param.NAME = 'experiment_to_run'
    status = monitorer.get_status('monitor_test', 'experiment_to_run',
                                  g_conf.param.PROCESS_NAME)
    self.assertEqual(status[0], "Not Started")
def test_check_status_to_run(self):
    # Check for an experiment that exists in the config files but has not been started.
    g_conf.immutable(False)
    g_conf.NAME = 'experiment_to_run'
    status = monitorer.get_status('monitor_test', 'experiment_to_run.yaml',
                                  g_conf.PROCESS_NAME)
    self.assertEqual(status[0], "Not Started")
def mount_experiment_heap(folder, experiments_list, is_training,
                          validation_datasets, drive_environments):
    tasks_queue = []
    is_training = True
    for experiment in experiments_list:
        # Train is always the priority.
        # TODO: some system to check priority depending on iterations
        # TODO: An error is one thing, a stop is another. As a first step we can
        #       simply try to restart everything that errored.
        if is_training:
            # if monitorer.get_status(folder, experiment, 'train')[0] == "Not Started" or \
            #         monitorer.get_status(folder, experiment, 'train')[0] == "Error":
            heapq.heappush(tasks_queue,
                           (0, experiment + '_train',
                            {'type': 'train', 'folder': folder, 'experiment': experiment}))

        for val_data in validation_datasets:
            if monitorer.get_status(folder, experiment, 'validation_' + val_data)[0] == "Not Started" or \
                    monitorer.get_status(folder, experiment, 'validation_' + val_data)[0] == "Error":
                heapq.heappush(tasks_queue,
                               (2, experiment + '_validation_' + val_data,
                                {'type': 'validation', 'folder': folder,
                                 'experiment': experiment, 'dataset': val_data}))

        for drive_env in drive_environments:
            if monitorer.get_status(folder, experiment, 'drive_' + drive_env)[0] == "Not Started" or \
                    monitorer.get_status(folder, experiment, 'drive_' + drive_env)[0] == "Error":
                heapq.heappush(tasks_queue,
                               (1, experiment + '_drive_' + drive_env,
                                {'type': 'drive', 'folder': folder,
                                 'experiment': experiment, 'environment': drive_env}))

    return tasks_queue
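# A minimal sketch (plain stdlib, no project imports) of how the priority tuples
# above behave once pushed onto the heap: train tasks (priority 0) pop before
# drive (1) and validation (2), and ties break alphabetically on the task name.
import heapq

demo_queue = []
heapq.heappush(demo_queue, (2, 'exp1_validation_TownVal', {'type': 'validation'}))
heapq.heappush(demo_queue, (0, 'exp1_train', {'type': 'train'}))
heapq.heappush(demo_queue, (1, 'exp1_drive_Town02', {'type': 'drive'}))

while demo_queue:
    priority, name, specs = heapq.heappop(demo_queue)
    print(priority, name, specs['type'])
# 0 exp1_train train
# 1 exp1_drive_Town02 drive
# 2 exp1_validation_TownVal validation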
def test_check_status_error(self):
    g_conf.immutable(False)
    # TODO: The error case? How do we merge this nicely with the other parts?
    g_conf.NAME = 'experiment_running_error'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_running_error.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    coil_logger.add_message('Loading',
                            {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]})
    coil_logger.add_message('Loading',
                            {"Models_loaded": ' VUALA ', "Checkpoint": "988765"})

    for i in range(0, 10):
        coil_logger.add_message('Iterating',
                                {"Iteration": i, "ReadKeys": [1, 123, 5, 1, 34, 1, 23]})
        coil_logger.add_message('Iterating',
                                {"Iteration": i, "Output": ["output"]})

    coil_logger.add_message('Error',
                            {"Iteration": 10, "Message": " Some data integrity problems ! "})

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_running_error.yaml',
                                  g_conf.PROCESS_NAME)
    self.assertEqual(status[0], "Error")
    print(status[1])
def test_check_status_finished(self):
    g_conf.immutable(False)
    g_conf.NAME = 'experiment_finished'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_finished.yaml')
    g_conf.NUMBER_ITERATIONS = 20
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    # We set the number of iterations to 20 above so the fake log below reaches the end.
    coil_logger.add_message('Loading',
                            {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]})
    coil_logger.add_message('Loading',
                            {"Models_loaded": ' VUALA ', "Checkpoint": "988765"})

    for i in range(0, 21):
        coil_logger.add_message('Iterating',
                                {"Iteration": i, "ReadKeys": [1, 123, 5, 1, 34, 1, 23]}, i)
        coil_logger.add_message('Iterating',
                                {"Iteration": i, "Output": ["output"]}, i)

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_finished.yaml',
                                  g_conf.PROCESS_NAME)
    self.assertEqual(status[0], "Finished")
def get_gpu_resources(gpu_resources, executing_processes, allocation_params):
    """
    Args:
        gpu_resources: Dictionary mapping each GPU to the resources it still has free.
        executing_processes: The processes currently executing.
        allocation_params: The cost per process type, returned to the pool when a
            process finishes or errors out.

    Returns:
        The updated gpu_resources, the maximum free resource over all GPUs, and the
        processes that are still executing.
    """
    still_executing_processes = []
    for process_specs in executing_processes:
        # Build the process name used by the monitorer.
        if process_specs['type'] == 'drive':
            name = 'drive_' + process_specs['environment']
        elif process_specs['type'] == 'validation':
            name = 'validation_' + process_specs['dataset']
        else:
            name = process_specs['type']

        status = monitorer.get_status(process_specs['folder'],
                                      process_specs['experiment'], name)[0]
        if status == "Finished" or status == 'Error':
            if isinstance(process_specs['gpu'], list):
                for gpu in process_specs['gpu']:
                    gpu_resources[gpu] += allocation_params[process_specs['type'] + '_cost']
            else:
                gpu_resources[process_specs['gpu']] += allocation_params[process_specs['type'] + '_cost']
        else:
            still_executing_processes.append(process_specs)

    return gpu_resources, max(gpu_resources.values()), still_executing_processes
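# A minimal, self-contained illustration of the bookkeeping above, with the
# monitorer status hard-coded instead of read from the experiment logs (GPU ids
# and costs here are made up for the example).
gpu_resources_demo = {'0': 0.5, '1': 1.0}
allocation_params_demo = {'train_cost': 0.5, 'validation_cost': 0.25}
finished_demo = [{'type': 'train', 'gpu': '0'},
                 {'type': 'validation', 'gpu': '1'}]

for process_specs in finished_demo:  # pretend get_status() returned "Finished" for both
    gpu_resources_demo[process_specs['gpu']] += \
        allocation_params_demo[process_specs['type'] + '_cost']

print(gpu_resources_demo, max(gpu_resources_demo.values()))
# {'0': 1.0, '1': 1.25} 1.25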
def get_gpu_resources(gpu_resources, executing_processes, allocation_params):
    """
    Args:
        gpu_resources: Dictionary mapping each GPU to the resources it still has free.
        executing_processes: The processes currently executing.
        allocation_params: The cost per process type, returned to the pool when a
            process finishes or errors out.

    Returns:
        The updated gpu_resources and the maximum free resource over all GPUs.
    """
    for process_specs in executing_processes:
        status = monitorer.get_status(process_specs['folder'],
                                      process_specs['experiment'],
                                      process_specs['type'])[0]
        if status == "Finished" or status == 'Error':
            gpu_resources[process_specs['gpu']] += allocation_params[process_specs['type'] + '_cost']

    return gpu_resources, max(gpu_resources.values())
def test_check_status_running_loading(self):
    g_conf.immutable(False)
    g_conf.NAME = 'experiment_running_loading'
    # TODO: this merge is weird.
    merge_with_yaml('configs/monitor_test/experiment_running_loading.yaml')
    # JUST A TRICK TO CONTAIN THE CURRENT LIMITATIONS
    set_type_of_process('train')

    coil_logger.add_message('Loading',
                            {"Keys_Division": [1, 123, 1, 1, 2, 12, 3, 12, 31, 2, 1, 1]})
    coil_logger.add_message('Loading',
                            {"Models_loaded": ' VUALA ', "Checkpoint": "988765"})

    # TODO: Check how the alias will work.
    status = monitorer.get_status('monitor_test', 'experiment_running_loading.yaml',
                                  g_conf.PROCESS_NAME)
    self.assertEqual(status[0], "Loading")
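# The monitorer tests above only assert on status[0]. A rough mental model of
# what monitorer.get_status appears to report, inferred from these tests; this
# is an illustrative sketch, not the project's actual implementation:
def sketch_of_get_status(last_log_entries, configured_iterations):
    # last_log_entries: list of (phase, payload) tuples, oldest first.
    if not last_log_entries:
        return ["Not Started"]
    phase, payload = last_log_entries[-1]
    if phase == 'Error':
        return ["Error", payload.get("Message")]
    if phase == 'Loading':
        return ["Loading"]
    if payload.get("Iteration", 0) >= configured_iterations:
        return ["Finished"]
    return ["Iterating"]

print(sketch_of_get_status([('Loading', {})], 20))                    # ['Loading']
print(sketch_of_get_status([('Iterating', {"Iteration": 20})], 20))   # ['Finished']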
def execute(gpu, exp_batch, exp_alias): os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor() ])) sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) l1weight = 1.0 image_size = tuple([88, 200]) testmode = 1 # print("helllooooo", g_conf.MODEL_NAME) if g_conf.GANMODEL_NAME == 'LSDcontrol': # netD = ganmodels._netD().cuda() netG = ganmodels._netG(skip=g_conf.SKIP).cuda() # else: # netD = ganmodels._oldnetD().cuda() # netG = ganmodels._oldnetG().cuda() # init_weights(netD) init_weights(netG) # print(netD) print(netG) # optimD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999)) optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.7, 0.999)) MSE_loss = torch.nn.MSELoss().cuda() L1_loss = torch.nn.L1Loss().cuda() iteration = 0 best_loss_iter = 0 best_lossD = 1000000.0 best_lossG = 1000000.0 accumulated_time = 0 netG.eval() # netD.eval() capture_time = time.time() for data in data_loader: val = 0.5 input_data, float_data = data inputs = input_data['rgb'].cuda() inputs = inputs.squeeze(1) inputs_in = inputs - val #forward pass # print(inputs[0][0][0][0], inputs_in[0][0][0][0]) fake_inputs = netG(inputs_in) #subtracted by 0.5 fake_inputs_in = fake_inputs # print(fake_inputs[0][0][0][0], fake_inputs_in[0][0][0][0]) if iteration % 200 == 0: imgs_to_save = torch.cat((inputs_in[:2] + val, fake_inputs_in[:2]), 0).cpu().data vutils.save_image(imgs_to_save, './noganimgs/' + str(iteration) + 'noganreal_samples.png', normalize=True) coil_logger.add_image("Images", imgs_to_save, iteration) optimG.zero_grad() print("~~~~~~~~~__________") print(inputs_in[0][0][0][0]) print(fake_inputs[0][0][0][0]) lossG_mse = MSE_loss(fake_inputs, inputs) print(lossG_mse) lossG_mse /= len(inputs_in) print("~~~~~~~~~__________--------------") lossG_mse.backward() #retain_graph=True needed? optimG.step() coil_logger.add_scalar('MSE LossG', lossG_mse.data / len(inputs_in), iteration) #optimization for one iter done! 
position = random.randint(0, len(float_data) - 1) # if lossD.data < best_lossD: # best_lossD = lossD.data.tolist() if lossG_mse.data < best_lossG: best_lossG = lossG_mse.data.tolist() best_loss_iter = iteration accumulated_time += time.time() - capture_time capture_time = time.time() # print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(), "BestLossD", best_lossD, "BestLossG", best_lossG, "Iteration", iteration, "Best Loss Iteration", best_loss_iter) # coil_logger.add_message('Iterating', # {'Iteration': iteration, # 'LossD': lossD.data.tolist(), # 'LossG': lossG.data.tolist(), # 'Images/s': (iteration*g_conf.BATCH_SIZE)/accumulated_time, # 'BestLossD': best_lossD, 'BestLossIteration': best_loss_iter, # 'BestLossG': best_lossG, 'BestLossIteration': best_loss_iter, # 'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(), # 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()}, # iteration) # if is_ready_to_save(iteration): # # state = { # 'iteration': iteration, # 'stateD_dict': netD.state_dict(), # 'stateG_dict': netG.state_dict(), # 'best_lossD': best_lossD, # 'best_lossG': best_lossG, # 'total_time': accumulated_time, # 'best_loss_iter': best_loss_iter # # } # torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias # , 'checkpoints', str(iteration) + '.pth')) # if iteration == best_loss_iter: # # state = { # 'iteration': iteration, # 'stateD_dict': netD.state_dict(), # 'stateG_dict': netG.state_dict(), # 'best_lossD': best_lossD, # 'best_lossG': best_lossG, # 'total_time': accumulated_time, # 'best_loss_iter': best_loss_iter # # } # torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias # , 'best_modelG' + '.pth')) # iteration += 1
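# The commented-out block above saves a checkpoint dictionary with torch.save and
# later restores it. A minimal, self-contained sketch of that save/load round
# trip (the path and the tiny model here are made up for the example):
import os
import torch

demo_net = torch.nn.Linear(4, 2)
demo_state = {'iteration': 0, 'stateG_dict': demo_net.state_dict(), 'total_time': 0.0}
os.makedirs('/tmp/_checkpoint_demo', exist_ok=True)
torch.save(demo_state, '/tmp/_checkpoint_demo/0.pth')

restored = torch.load('/tmp/_checkpoint_demo/0.pth')
demo_net.load_state_dict(restored['stateG_dict'])
print(restored['iteration'])  # 0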
def mount_experiment_heap(folder, experiments_list, is_training, executing_processes, old_tasks_queue, validation_datasets, drive_environments, restart_error=True): """ Function that will add all the experiments to a heap. These experiments will be consumed when there is enough resources Args: folder: The folder with the experiments experiments_list: The list of all experiments to be executed is_training: If Training is being add also ( NOT IMPLEMENTED) executing_processes: The current processes being executed old_tasks_queue: Current process on the task queue validation_datasets: The validation datasets to be evaluated drive_environments: All the driving environments where the models are going to be tested. restart_error: If you are going to restart experiments that are not working (NOT implemented) Returns: """ tasks_queue = [] exec_name_vec = execvec_to_names(executing_processes) for experiment in experiments_list: # Train is always priority. task_to_add = None if is_training: if monitorer.get_status(folder, experiment, 'train')[0] == "Not Started": task_to_add = (0, experiment + '_train', { 'type': 'train', 'folder': folder, 'experiment': experiment }) elif restart_error and monitorer.get_status(folder, experiment, 'train')[0] \ == "Error": task_to_add = (0, experiment + '_train', { 'type': 'train', 'folder': folder, 'experiment': experiment }) if task_to_add is not None: task_name_vec = dict_to_namevec(task_to_add[2]) if task_name_vec in exec_name_vec: continue if task_to_add is not None and task_to_add not in old_tasks_queue: heapq.heappush(tasks_queue, task_to_add) for val_data in validation_datasets: task_to_add = None if monitorer.get_status(folder, experiment, 'validation_' + val_data)[0] == "Not Started": task_to_add = (1, experiment + '_validation_' + val_data, { 'type': 'validation', 'folder': folder, 'experiment': experiment, 'dataset': val_data }) elif restart_error and monitorer.get_status( folder, experiment, 'validation_' + val_data)[0] == "Error": task_to_add = (1, experiment + '_validation_' + val_data, { 'type': 'validation', 'folder': folder, 'experiment': experiment, 'dataset': val_data }) if task_to_add is not None: task_name_vec = dict_to_namevec(task_to_add[2]) if task_name_vec in exec_name_vec: continue if task_to_add is not None and task_to_add not in old_tasks_queue: heapq.heappush(tasks_queue, task_to_add) for drive_env in drive_environments: task_to_add = None if monitorer.get_status(folder, experiment, 'drive_' + drive_env)[0] == "Not Started": task_to_add = (2, experiment + '_drive_' + drive_env, { 'type': 'drive', 'folder': folder, 'experiment': experiment, 'environment': drive_env }) elif restart_error and monitorer.get_status(folder, experiment, 'drive_' + drive_env)\ [0] == "Error": task_to_add = (2, experiment + '_drive_' + drive_env, { 'type': 'drive', 'folder': folder, 'experiment': experiment, 'environment': drive_env }) if task_to_add is not None: task_name_vec = dict_to_namevec(task_to_add[2]) if task_name_vec in exec_name_vec: continue if task_to_add is not None and task_to_add not in old_tasks_queue: heapq.heappush(tasks_queue, task_to_add) return tasks_queue
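# mount_experiment_heap above relies on dict_to_namevec and execvec_to_names,
# whose definitions are not shown in this file. A hypothetical sketch of what
# such helpers could look like, assuming they simply flatten a task dict into a
# comparable name string (names and behavior are assumptions, not the project's code):
def dict_to_namevec_sketch(process_dict):
    # Concatenate every value of the task dict into one string key.
    return ''.join(str(v) for v in process_dict.values())

def execvec_to_names_sketch(executing_processes):
    return [dict_to_namevec_sketch(p) for p in executing_processes]

running_demo = [{'type': 'train', 'folder': 'batch1', 'experiment': 'exp1'}]
candidate_demo = {'type': 'train', 'folder': 'batch1', 'experiment': 'exp1'}
print(dict_to_namevec_sketch(candidate_demo) in execvec_to_names_sketch(running_demo))  # True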
def execute(gpu, exp_batch, exp_alias): os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join('_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToPILImage(), transforms.Resize(128, 128), transforms.ToTensor(), transforms.Normalize([ 0.5, 0.5, 0.5], [ 1.0, 1.0, 1.0])])) sampler = BatchSequenceSampler(splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) transform = transforms.Compose([transforms.Resize((88, 200))]) l1weight = 1.0 image_size = tuple([88, 200]) testmode = 1 # print("helllooooo", g_conf.MODEL_NAME) if g_conf.GANMODEL_NAME == 'LSDcontrol': netD = ganmodels._netD().cuda() netG = ganmodels._netG(skip=g_conf.SKIP).cuda() # else: # netD = ganmodels._oldnetD().cuda() # netG = ganmodels._oldnetG().cuda() init_weights(netD) init_weights(netG) print(netD) print(netG) optimD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999)) optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999)) MSE_loss = torch.nn.MSELoss().cuda() L1_loss = torch.nn.L1Loss().cuda() iteration = 0 best_loss_iter = 0 best_lossD = 1000000.0 best_lossG = 1000000.0 accumulated_time = 0 netG.eval() netD.eval() capture_time = time.time() for data in data_loader: input_data, float_data = data inputs = input_data['rgb'].cuda() inputs = inputs.squeeze(1) print ("Inputs", i) #forward pass fake_inputs = netG(inputs) if iteration % 1000 == 0: coil_logger.add_image("Images", torch.cat((inputs[:3], fake_inputs[:3]), 0), iteration) ##--------------------Discriminator part!!!!!!!!!!----------------------- set_requires_grad(netD, True) optimD.zero_grad() ##fake outputsD_fake_forD = netD(fake_inputs.detach()) labsize = outputsD_fake_forD.size() #Create labels of patchgan style with label smoothing labels_fake = torch.zeros(labsize[0], labsize[1], labsize[2], labsize[3]) #Fake labels label_fake_noise = torch.rand(labels_fake.size()) * 0.5 - 0.25 #Label smoothing labels_fake = labels_fake + label_fake_noise labels_fake = Variable(labels_fake).cuda() lossD_fake = MSE_loss(outputsD_fake_forD, labels_fake) ##real outputsD_real = netD(inputs) labsize = outputsD_real.size() #Create labels of patchgan style with label smoothing labels_real = torch.ones(labsize[0], labsize[1], labsize[2], labsize[3]) #Real labels label_real_noise = torch.rand(labels_real.size()) * 0.5 - 0.25 #Label smoothing labels_real = labels_real + label_real_noise labels_real = Variable(labels_real).cuda() lossD_real = MSE_loss(outputsD_real, labels_real) #Discriminator updates lossD = (lossD_real + lossD_fake) * 0.5 lossD /= len(inputs) lossD.backward() #retain_graph=True needed? 
optimD.step() coil_logger.add_scalar('Total LossD', lossD.data, iteration) coil_logger.add_scalar('Real LossD', lossD_real.data / len(inputs), iteration) coil_logger.add_scalar('Fake LossD', lossD_fake.data / len(inputs), iteration) ##--------------------Generator part!!!!!!!!!!----------------------- #TODO change decoder architecture #TODO check norms of gradients later #TODO add auxiliary regression loss for steering set_requires_grad(netD, False) optimG.zero_grad() outputsD_fake_forG = netD(fake_inputs) #Generator updates lossG_adv = MSE_loss(outputsD_fake_forG, labels_real) lossG_smooth = L1_loss(fake_inputs, inputs) lossG = lossG_adv + l1weight * lossG_smooth lossG /= len(inputs) lossG.backward() #retain_graph=True needed? optimG.step() coil_logger.add_scalar('Total LossG', lossG.data, iteration) coil_logger.add_scalar('Adv LossG', lossG_adv.data / len(inputs), iteration) coil_logger.add_scalar('Smooth LossG', lossG_smooth.data / len(inputs), iteration) #optimization for one iter done! position = random.randint(0, len(float_data)-1) if lossD.data < best_lossD: best_lossD = lossD.data.tolist() if lossG.data < best_lossG: best_lossG = lossG.data.tolist() best_loss_iter = iteration accumulated_time += time.time() - capture_time capture_time = time.time() print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(), "BestLossD", best_lossD, "BestLossG", best_lossG, "Iteration", iteration, "Best Loss Iteration", best_loss_iter) coil_logger.add_message('Iterating', {'Iteration': iteration, 'LossD': lossD.data.tolist(), 'LossG': lossG.data.tolist(), 'Images/s': (iteration*g_conf.BATCH_SIZE)/accumulated_time, 'BestLossD': best_lossD, 'BestLossIteration': best_loss_iter, 'BestLossG': best_lossG, 'BestLossIteration': best_loss_iter, 'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(), 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()}, iteration) if is_ready_to_save(iteration): state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter': best_loss_iter } torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias , 'checkpoints', str(iteration) + '.pth')) if iteration == best_loss_iter: state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter': best_loss_iter } torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias , 'best_modelG' + '.pth')) iteration += 1
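# The discriminator update above builds patch-style real/fake targets and
# perturbs them with uniform noise ("label smoothing") before the MSE (LSGAN)
# loss. A standalone sketch of just that step, with a made-up patch output shape:
import torch

outputsD_real_demo = torch.randn(4, 1, 10, 25)        # pretend patch-GAN output
labels_real_demo = torch.ones(outputsD_real_demo.size())  # real targets start at 1.0
label_noise_demo = torch.rand(labels_real_demo.size()) * 0.5 - 0.25  # noise in [-0.25, 0.25)
labels_real_demo = labels_real_demo + label_noise_demo  # targets now lie roughly in [0.75, 1.25)

criterion_demo = torch.nn.MSELoss()
print(criterion_demo(outputsD_real_demo, labels_real_demo).item())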
def execute(gpu, exp_batch, exp_alias): # We set the visible cuda devices os.environ["CUDA_VISIBLE_DEVICES"] = gpu # At this point the log file with the correct naming is created. merge_with_yaml(os.path.join(exp_batch, exp_alias + '.yaml')) set_type_of_process('validation') sys.stdout = open(str(os.getpid()) + ".out", "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias, g_conf.PROCESS_NAME)[0] == "Finished": # TODO: print some cool summary or not ? return #Define the dataset. This structure is has the __get_item__ redefined in a way #that you can access the HDFILES positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor() ])) # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. # The data loader is the multi threaded module from pytorch that release a number of # workers to get all the data. # TODO: batch size an number of workers go to some configuration file data_loader = torch.utils.data.DataLoader(dataset, batch_size=120, shuffle=False, num_workers=12, pin_memory=True) # TODO: here there is clearly a posibility to make a cool "conditioning" system. model = CoILModel(g_conf.MODEL_NAME) model.cuda() # TODO: The checkpoint will continue, so the logs should restart ??? OR continue were it was latest = get_latest_evaluated_checkpoint() if latest is None: # When nothing was tested, get latest returns none, we fix that. latest = 0 print(dataset.meta_data) while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE): if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE): latest = get_next_checkpoint(g_conf.TEST_SCHEDULE) checkpoint = torch.load( os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(latest) + '.pth')) checkpoint_iteration = checkpoint['iteration'] print("Validation loaded ", checkpoint_iteration) for data in data_loader: input_data, labels = data control_position = np.where( dataset.meta_data[:, 0] == 'control')[0][0] speed_position = np.where( dataset.meta_data[:, 0] == 'speed_module')[0][0] print(torch.squeeze(input_data['rgb']).shape) print(control_position) print(speed_position) # Obs : Maybe we could also check for other branches ?? output = model.forward_branch( torch.squeeze(input_data['rgb']).cuda(), labels[:, speed_position, :].cuda(), labels[:, control_position, :].cuda()) # TODO: clean this squeeze and dimension things for i in range(input_data['rgb'].shape[0]): coil_logger.write_on_csv( checkpoint_iteration, [output[i][0], output[i][1], output[i][2]]) #loss = criterion(output, labels) #loss.backward() #optimizer.step() #shutil.copyfile(filename, 'model_best.pth.tar') else: time.sleep(1) print("Waiting for the next Validation")
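# The validation loop above polls until the training process has written the
# next checkpoint in g_conf.TEST_SCHEDULE. A stripped-down sketch of that
# polling pattern, with the project's helper calls replaced by a plain
# file-existence check (the function name, paths, and schedule are made up):
import os
import time

def wait_for_checkpoint(checkpoint_dir, schedule, poll_seconds=1):
    for iteration in schedule:
        path = os.path.join(checkpoint_dir, str(iteration) + '.pth')
        while not os.path.exists(path):
            print("Waiting for the next Validation")
            time.sleep(poll_seconds)
        yield iteration, path

# Usage sketch:
# for iteration, path in wait_for_checkpoint('_logs/exp_batch/exp_alias/checkpoints',
#                                            [2000, 4000]):
#     checkpoint = torch.load(path)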
def execute(gpu, exp_batch, exp_alias): manualSeed = 123 torch.cuda.manual_seed(manualSeed) os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join('_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor()])) sampler = BatchSequenceSampler(splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) l1weight = g_conf.L1 image_size = tuple([88, 200]) testmode = 1 if g_conf.GANMODEL_NAME == 'LSDcontrol': netD = ganmodels._netD().cuda() netG = ganmodels._netG(skip=g_conf.SKIP).cuda() if g_conf.GANMODEL_NAME == 'LSDcontrol_acgan_nopatch': netD = acganmodels_nopatch._netD().cuda() netG = acganmodels_nopatch._netG(skip=g_conf.SKIP).cuda() init_weights(netD) init_weights(netG) print(netD) print(netG) optimD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999)) optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999)) MSE_loss = torch.nn.MSELoss().cuda() L1_loss = torch.nn.L1Loss().cuda() iteration = 0 best_loss_iter = 0 best_lossD = 1000000.0 best_lossG = 1000000.0 accumulated_time = 0 netG.train() netD.train() capture_time = time.time() if not os.path.exists('./imgs_' + exp_alias): os.mkdir('./imgs_' + exp_alias) for data in data_loader: input_data, float_data = data inputs = input_data['rgb'].cuda() inputs = inputs.squeeze(1) fake_inputs = netG(inputs) if iteration % 200 == 0: imgs_to_save = torch.cat((inputs[:2], fake_inputs[:2]), 0).cpu().data vutils.save_image(imgs_to_save, './imgs_' + exp_alias + '/' + str(iteration) + '_real_and_fake.png', normalize=True) coil_logger.add_image("Images", imgs_to_save, iteration) controls = float_data[:, dataset.controls_position(), :] steer = controls[:, 0].cuda() ##--------------------Discriminator part!!!!!!!!!!----------------------- ##fake set_requires_grad(netD, True) optimD.zero_grad() outputsD_fake_forD, fakeD_steer = netD(fake_inputs.detach()) labsize = outputsD_fake_forD.size() labels_fake = torch.zeros(labsize) #Fake labels label_fake_noise = torch.rand(labels_fake.size()) * 0.1 #Label smoothing labels_fake = labels_fake + label_fake_noise labels_fake = Variable(labels_fake).cuda() lossD_fake_aux = MSE_loss(fakeD_steer, steer) lossD_fake_total = lossD_fake + lossD_fake_aux lossD_fake_total.backward() optimD.step() ##real set_requires_grad(netD, True) optimD.zero_grad() outputsD_real_forD, realD_steer = netD(inputs) labsize = outputsD_real_forD.size() labels_real = torch.ones(labsize) #Real labels label_real_noise = torch.rand(labels_real.size()) * 0.1 #Label smoothing labels_real = labels_real - label_real_noise labels_real = Variable(labels_real).cuda() lossD_real = torch.mean(outputsD_real_forD) lossD_real_aux = MSE_loss(realD_steer, steer) #Discriminator updates lossD_real_total = lossD_real + lossD_real_aux lossD_real_total.backward() optimD.step() lossD = lossD_real_total + 
lossD_fake_total coil_logger.add_scalar('Aux Real LossD', lossD_real_aux.data, iteration) coil_logger.add_scalar('Aux Fake LossD', lossD_fake_aux.data, iteration) coil_logger.add_scalar('Total Real LossD', lossD_real_total.data , iteration) coil_logger.add_scalar('Total Fake LossD', lossD_fake_total.data , iteration) coil_logger.add_scalar('Real LossD', lossD_real.data , iteration) coil_logger.add_scalar('Fake LossD', lossD_fake.data , iteration) ##--------------------Generator part!!!!!!!!!!----------------------- set_requires_grad(netD, False) optimG.zero_grad() outputsD_fake_forG, G_steer = netD(fake_inputs) #Generator updates lossG_smooth = L1_loss(fake_inputs, inputs) lossG_aux = MSE_loss(G_steer, steer) lossG = lossG_adv + lossG_aux + l1weight * lossG_smooth lossG.backward() optimG.step() coil_logger.add_scalar('Total LossG', lossG.data / len(inputs), iteration) coil_logger.add_scalar('Adv LossG', lossG_adv.data , iteration) coil_logger.add_scalar('Smooth LossG', lossG_smooth.data , iteration) coil_logger.add_scalar('Aux LossG', lossG_aux.data , iteration) #optimization for one iter done! position = random.randint(0, len(float_data)-1) if lossD.data < best_lossD: best_lossD = lossD.data.tolist() if lossG.data < best_lossG: best_lossG = lossG.data.tolist() best_loss_iter = iteration accumulated_time += time.time() - capture_time capture_time = time.time() print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(), "BestLossD", best_lossD, "BestLossG", best_lossG, "Iteration", iteration, "Best Loss Iteration", best_loss_iter) coil_logger.add_message('Iterating', {'Iteration': iteration, 'LossD': lossD.data.tolist(), 'LossG': lossG.data.tolist(), 'Images/s': (iteration*g_conf.BATCH_SIZE)/accumulated_time, 'BestLossD': best_lossD, 'BestLossIteration': best_loss_iter, 'BestLossG': best_lossG, 'BestLossIteration': best_loss_iter, 'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(), 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()}, iteration) if is_ready_to_save(iteration): state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter': best_loss_iter } torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias , 'checkpoints', str(iteration) + '.pth')) if iteration == best_loss_iter: state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter': best_loss_iter } torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias , 'best_modelG' + '.pth')) iteration += 1
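# The ACGAN-style discriminator used above returns both a real/fake score and a
# predicted steering value (netD(x) -> (score, steer)). A minimal hypothetical
# module with that two-head interface; the real ganmodels architectures are not
# shown here, so this is only an illustration of the pattern:
import torch
import torch.nn as nn

class TwoHeadDiscriminatorSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(nn.Conv2d(3, 8, 4, stride=2), nn.LeakyReLU(0.2),
                                      nn.AdaptiveAvgPool2d(1), nn.Flatten())
        self.adv_head = nn.Linear(8, 1)    # real/fake score
        self.steer_head = nn.Linear(8, 1)  # auxiliary steering regression

    def forward(self, x):
        h = self.features(x)
        return self.adv_head(h), self.steer_head(h)

netD_sketch = TwoHeadDiscriminatorSketch()
score, steer = netD_sketch(torch.randn(2, 3, 88, 200))
print(score.shape, steer.shape)  # torch.Size([2, 1]) torch.Size([2, 1])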
def execute(gpu, exp_batch, exp_alias): os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor() ])) sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) l1weight = g_conf.L1_WEIGHT image_size = tuple([88, 200]) if g_conf.TRAIN_TYPE == 'WGAN': clamp_value = g_conf.CLAMP n_critic = g_conf.N_CRITIC print("Configurations of ", exp_alias) print("GANMODEL_NAME", g_conf.GANMODEL_NAME) print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION) print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE) print("SKIP", g_conf.SKIP) print("TYPE", g_conf.TYPE) print("L1 WEIGHT", g_conf.L1_WEIGHT) print("LAB SMOOTH", g_conf.LABSMOOTH) if g_conf.GANMODEL_NAME == 'LSDcontrol': netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION, skip=g_conf.SKIP).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch': netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller': netD = ganmodels_nopatch_smaller._netD( loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch_smaller._netG( loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_task': netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda() netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda() if g_conf.PRETRAINED == 'RECON': netF_statedict = torch.load('netF_GAN_Pretrained.wts') netF.load_state_dict(netF_statedict) elif g_conf.PRETRAINED == 'IL': model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth') model_IL_state_dict = model_IL['state_dict'] netF_state_dict = netF.state_dict() for i, keys in enumerate( zip(netF_state_dict.keys(), model_IL_state_dict.keys())): newkey, oldkey = keys if newkey.split('.')[0] == "branch" and oldkey.split( '.')[0] == "branches": print("No Transfer of ", newkey, " to ", oldkey) else: print("Transferring ", newkey, " to ", oldkey) netF_state_dict[newkey] = model_IL_state_dict[oldkey] netF.load_state_dict(netF_state_dict) init_weights(netD) init_weights(netG) #do init for netF also later but now it is in the model code itself print(netD) print(netF) print(netG) optimD = torch.optim.Adam(netD.parameters(), lr=g_conf.LR_D, betas=(0.5, 0.999)) optimG = torch.optim.Adam(netG.parameters(), lr=g_conf.LR_G, betas=(0.5, 0.999)) if g_conf.TYPE == 'task': optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE) Task_Loss = TaskLoss() if g_conf.LOSS_FUNCTION == 'LSGAN': Loss = torch.nn.MSELoss().cuda() elif g_conf.LOSS_FUNCTION == 'NORMAL': Loss = torch.nn.BCEWithLogitsLoss().cuda() L1_loss = torch.nn.L1Loss().cuda() 
iteration = 0 best_loss_iter_F = 0 best_loss_iter_G = 0 best_lossF = 1000000.0 best_lossD = 1000000.0 best_lossG = 1000000.0 accumulated_time = 0 lossF = Variable(torch.Tensor([100.0])) lossG_adv = Variable(torch.Tensor([100.0])) lossG_smooth = Variable(torch.Tensor([100.0])) lossG = Variable(torch.Tensor([100.0])) netG.train() netD.train() netF.train() capture_time = time.time() if not os.path.exists('./imgs_' + exp_alias): os.mkdir('./imgs_' + exp_alias) #TODO put family for losses fake_img_pool = ImagePool(50) for data in data_loader: set_requires_grad(netD, True) set_requires_grad(netF, True) set_requires_grad(netG, True) # print("ITERATION:", iteration) val = 0.0 input_data, float_data = data inputs = input_data['rgb'].cuda() inputs = inputs.squeeze(1) inputs_in = inputs - val #subtracted by 0.5 #TODO: make sure the F network does not get optimized by G optim controls = float_data[:, dataset.controls_position(), :] embed, branches = netF(inputs_in, dataset.extract_inputs(float_data).cuda()) print("Branch Outputs:::", branches[0][0]) embed_inputs = embed fake_inputs = netG(embed_inputs) fake_inputs_in = fake_inputs if iteration % 500 == 0: imgs_to_save = torch.cat( (inputs_in[:2] + val, fake_inputs_in[:2] + val), 0).cpu().data vutils.save_image(imgs_to_save, './imgs_' + exp_alias + '/' + str(iteration) + '_real_and_fake.png', normalize=True) coil_logger.add_image("Images", imgs_to_save, iteration) ##--------------------Discriminator part!!!!!!!!!!-------------------## set_requires_grad(netD, True) set_requires_grad(netF, False) set_requires_grad(netG, False) optimD.zero_grad() ##fake # fake_inputs_forD = fake_img_pool.query(fake_inputs) outputsD_fake_forD = netD(fake_inputs) labsize = outputsD_fake_forD.size() labels_fake = torch.zeros(labsize) #Fake labels label_fake_noise = torch.rand( labels_fake.size()) * 0.1 #Label smoothing if g_conf.LABSMOOTH == 1: labels_fake = labels_fake + label_fake_noise labels_fake = Variable(labels_fake).cuda() lossD_fake = torch.mean( outputsD_fake_forD) #Loss(outputsD_fake_forD, labels_fake) ##real outputsD_real = netD(inputs_in) labsize = outputsD_real.size() labels_real = torch.ones(labsize) #Real labels label_real_noise = torch.rand( labels_real.size()) * 0.1 #Label smoothing if g_conf.LABSMOOTH == 1: labels_real = labels_real - label_real_noise labels_real = Variable(labels_real).cuda() lossD_real = -1.0 * torch.mean( outputsD_real) #Loss(outputsD_real, labels_real) ### Gradient Penalty ### gradient_penalty = calc_gradient_penalty(netD, inputs, fake_inputs) # alpha = torch.rand((g_conf.BATCH_SIZE, 1, 1, 1)) # alpha = alpha.cuda() # # x_hat = alpha * inputs.data + (1 - alpha) * fake_inputs.data # x_hat.requires_grad = True # # pred_hat = netD(x_hat) # gradients = grad(outputs=pred_hat, inputs=x_hat, grad_outputs=torch.ones(pred_hat.size()).cuda(), # create_graph=True, retain_graph=True, only_inputs=True)[0] # # gradient_penalty = 10 * ((gradients.view(gradients.size()[0], -1).norm(2, 1) - 1) ** 2).mean() #Discriminator updates lossD = torch.mean( outputsD_fake_forD - outputsD_real) + gradient_penalty #(lossD_real + lossD_fake) * 0.5 # lossD /= len(inputs) print("Loss d", lossD) lossD.backward(retain_graph=True) optimD.step() # if g_conf.TRAIN_TYPE == 'WGAN': # for p in netD.parameters(): # p.data.clamp_(-clamp_value, clamp_value) coil_logger.add_scalar('Total LossD', lossD.data, iteration) coil_logger.add_scalar('Real LossD', lossD_real.data / len(inputs), iteration) coil_logger.add_scalar('Fake LossD', lossD_fake.data / len(inputs), iteration) 
##--------------------Generator part!!!!!!!!!!-----------------------## set_requires_grad(netD, False) set_requires_grad(netF, False) set_requires_grad(netG, True) if ((iteration + 1) % n_critic) == 0: optimG.zero_grad() outputsD_fake_forG = netD(fake_inputs) #Generator updates lossG_adv = -1.0 * torch.mean( outputsD_fake_forG) #Loss(outputsD_fake_forG, labels_real) lossG_smooth = L1_loss(fake_inputs, inputs) lossG = (lossG_adv + l1weight * lossG_smooth) / (1.0 + l1weight) # lossG /= len(inputs) print(lossG) lossG.backward(retain_graph=True) optimG.step() #####Task network updates########################## set_requires_grad(netD, False) set_requires_grad(netF, True) set_requires_grad(netG, False) optimF.zero_grad() lossF = Variable(torch.Tensor()) lossF = Task_Loss.MSELoss( branches, dataset.extract_targets(float_data).cuda(), controls.cuda(), dataset.extract_inputs(float_data).cuda()) coil_logger.add_scalar('Task Loss', lossF.data, iteration) lossF.backward() optimF.step() coil_logger.add_scalar('Total LossG', lossG.data, iteration) coil_logger.add_scalar('Adv LossG', lossG_adv.data / len(inputs), iteration) coil_logger.add_scalar('Smooth LossG', lossG_smooth.data / len(inputs), iteration) #optimization for one iter done! position = random.randint(0, len(float_data) - 1) if lossD.data < best_lossD: best_lossD = lossD.data.tolist() # print (lossG.item(), best_lossG) if lossG.item() < best_lossG: best_lossG = lossG.item() best_loss_iter_G = iteration if lossF.item() < best_lossF: best_lossF = lossF.item() best_loss_iter_F = iteration accumulated_time += time.time() - capture_time capture_time = time.time() print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(), "BestLossD", best_lossD, "BestLossG", best_lossG, "LossF", lossF, "BestLossF", best_lossF, "Iteration", iteration, "Best Loss Iteration G", best_loss_iter_G, "Best Loss Iteration F", best_loss_iter_F) coil_logger.add_message( 'Iterating', { 'Iteration': iteration, 'LossD': lossD.data.tolist(), 'LossG': lossG.data.tolist(), 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time, 'BestLossD': best_lossD, 'BestLossG': best_lossG, 'BestLossIterationG': best_loss_iter_G, 'BestLossF': best_lossF, 'BestLossIterationF': best_loss_iter_F, 'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(), 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist() }, iteration) if is_ready_to_save(iteration): state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'stateF_dict': netF.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter_G': best_loss_iter_G, 'best_loss_iter_F': best_loss_iter_F } torch.save( state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) if iteration == best_loss_iter_G and iteration > 10000: state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'stateF_dict': netF.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter_G': best_loss_iter_G } torch.save( state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias, 'best_modelG' + '.pth')) if iteration == best_loss_iter_F and iteration > 10000: state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'stateF_dict': netF.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 
'best_lossF': best_lossF, 'total_time': accumulated_time, 'best_loss_iter_F': best_loss_iter_F } torch.save( state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias, 'best_modelF' + '.pth')) iteration += 1
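# calc_gradient_penalty is called in the WGAN training loop above, but its body
# is only visible as the commented-out block inside that loop. A self-contained
# sketch of that WGAN-GP penalty, following the commented-out code (random
# interpolation weight alpha, gradient of the critic output at the interpolate,
# and the (||grad|| - 1)^2 term scaled by 10):
import torch
from torch.autograd import grad

def gradient_penalty_sketch(netD, real, fake, weight=10.0):
    alpha = torch.rand(real.size(0), 1, 1, 1, device=real.device)
    x_hat = alpha * real.data + (1 - alpha) * fake.data
    x_hat.requires_grad = True
    pred_hat = netD(x_hat)
    gradients = grad(outputs=pred_hat, inputs=x_hat,
                     grad_outputs=torch.ones_like(pred_hat),
                     create_graph=True, retain_graph=True, only_inputs=True)[0]
    return weight * ((gradients.view(gradients.size(0), -1).norm(2, dim=1) - 1) ** 2).mean()

# Example with a toy critic (made up for the demo):
netD_toy = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 8 * 8, 1))
print(gradient_penalty_sketch(netD_toy, torch.randn(4, 3, 8, 8), torch.randn(4, 3, 8, 8)))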
def mount_experiment_heap(folder, experiments_list, is_training, executing_processes, old_tasks_queue, validation_datasets, drive_environments, restart_error=True): tasks_queue = [] for experiment in experiments_list: if experiment in executing_processes: continue # Train is always priority. # TODO: some system to check priority depending on iterations task_to_add = None # TODO: One thing is error other thing is stop. However at a first step we can try to restart all error things if is_training: if monitorer.get_status(folder, experiment, 'train')[0] == "Not Started": task_to_add = (0, experiment + '_train', { 'type': 'train', 'folder': folder, 'experiment': experiment }) elif restart_error and monitorer.get_status(folder, experiment, 'train')[0] \ == "Error": task_to_add = (0, experiment + '_train', { 'type': 'train', 'folder': folder, 'experiment': experiment }) if task_to_add is not None and task_to_add not in old_tasks_queue: heapq.heappush(tasks_queue, task_to_add) task_to_add = None for val_data in validation_datasets: if monitorer.get_status(folder, experiment, 'validation_' + val_data)[0] == "Not Started": task_to_add = (2, experiment + '_validation_' + val_data, { 'type': 'validation', 'folder': folder, 'experiment': experiment, 'dataset': val_data }) elif restart_error and monitorer.get_status( folder, experiment, 'validation_' + val_data)[0] == "Error": task_to_add = (2, experiment + '_validation_' + val_data, { 'type': 'validation', 'folder': folder, 'experiment': experiment, 'dataset': val_data }) if task_to_add is not None and task_to_add not in old_tasks_queue: heapq.heappush(tasks_queue, task_to_add) task_to_add = None for drive_env in drive_environments: if monitorer.get_status(folder, experiment, 'drive_' + drive_env)[0] == "Not Started": task_to_add = (1, experiment + '_drive_' + drive_env, { 'type': 'drive', 'folder': folder, 'experiment': experiment, 'environment': drive_env }) elif restart_error and monitorer.get_status(folder, experiment, 'drive_' + drive_env)\ [0] == "Error": task_to_add = (1, experiment + '_drive_' + drive_env, { 'type': 'drive', 'folder': folder, 'experiment': experiment, 'environment': drive_env }) if task_to_add is not None and task_to_add not in old_tasks_queue: heapq.heappush(tasks_queue, task_to_add) return tasks_queue
def execute(gpu, exp_batch, exp_alias): from time import gmtime, strftime manualSeed = g_conf.SEED torch.cuda.manual_seed(manualSeed) os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) real_dataset = g_conf.TARGET_DOMAIN_PATH # real_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], "FinalRealWorldDataset") #main data loader dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor() ])) sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) real_dl = real_dataloader.RealDataset(real_dataset, g_conf.BATCH_SIZE) st = lambda aug: iag.Sometimes(aug, 0.4) oc = lambda aug: iag.Sometimes(aug, 0.3) rl = lambda aug: iag.Sometimes(aug, 0.09) augmenter = iag.Augmenter([iag.ToGPU()] + [ rl(iag.GaussianBlur( (0, 1.5))), # blur images with a sigma between 0 and 1.5 rl(iag.AdditiveGaussianNoise(loc=0, scale=( 0.0, 0.05), per_channel=0.5)), # add gaussian noise to images oc(iag.Dropout((0.0, 0.10), per_channel=0.5) ), # randomly remove up to X% of the pixels oc( iag.CoarseDropout( (0.0, 0.10), size_percent=(0.08, 0.2), per_channel=0.5)), # randomly remove up to X% of the pixels oc(iag.Add((-40, 40), per_channel=0.5) ), # change brightness of images (by -X to Y of original value) st(iag.Multiply((0.10, 2), per_channel=0.2) ), # change brightness of images (X-Y% of original value) rl(iag.ContrastNormalization( (0.5, 1.5), per_channel=0.5)), # improve or worsen the contrast rl(iag.Grayscale((0.0, 1))), # put grayscale ] # do all of the above in random order ) l1weight = g_conf.L1_WEIGHT task_adv_weight = g_conf.TASK_ADV_WEIGHT image_size = tuple([88, 200]) print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) print("Configurations of ", exp_alias) print("GANMODEL_NAME", g_conf.GANMODEL_NAME) print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION) print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE) print("SKIP", g_conf.SKIP) print("TYPE", g_conf.TYPE) print("L1 WEIGHT", g_conf.L1_WEIGHT) print("TASK ADV WEIGHT", g_conf.TASK_ADV_WEIGHT) print("LAB SMOOTH", g_conf.LABSMOOTH) if g_conf.GANMODEL_NAME == 'LSDcontrol': netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION, skip=g_conf.SKIP).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch': netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller': netD = ganmodels_nopatch_smaller._netD( loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch_smaller._netG( loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_task': netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda() netF = 
ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda() if g_conf.PRETRAINED == 'RECON': netF_statedict = torch.load('netF_GAN_Pretrained.wts') netF.load_state_dict(netF_statedict) elif g_conf.PRETRAINED == 'IL': print("Loading IL") model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth') model_IL_state_dict = model_IL['state_dict'] netF_state_dict = netF.state_dict() print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys())) for i, keys in enumerate( zip(netF_state_dict.keys(), model_IL_state_dict.keys())): newkey, oldkey = keys # if newkey.split('.')[0] == "branch" and oldkey.split('.')[0] == "branches": # print("No Transfer of ", newkey, " to ", oldkey) # else: print("Transferring ", newkey, " to ", oldkey) netF_state_dict[newkey] = model_IL_state_dict[oldkey] netF.load_state_dict(netF_state_dict) print("IL Model Loaded!") elif g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d': netD = ganmodels_taskAC_shared._netD().cuda() netG = ganmodels_taskAC_shared._netG().cuda() netF = ganmodels_taskAC_shared._netF().cuda() if g_conf.PRETRAINED == 'IL': print("Loading IL") model_IL = torch.load(g_conf.IL_AGENT_PATH) model_IL_state_dict = model_IL['state_dict'] netF_state_dict = netF.state_dict() print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys())) for i, keys in enumerate( zip(netF_state_dict.keys(), model_IL_state_dict.keys())): newkey, oldkey = keys print("Transferring ", newkey, " to ", oldkey) netF_state_dict[newkey] = model_IL_state_dict[oldkey] netF.load_state_dict(netF_state_dict) print("IL Model Loaded!") ##### if g_conf.IF_AUG: print("Loading Aug Decoder") model_dec = torch.load(g_conf.DECODER_RECON_PATH) else: print("Loading Decoder") model_dec = torch.load(g_conf.DECODER_RECON_PATH) model_dec_state_dict = model_dec['stateG_dict'] netG_state_dict = netG.state_dict() print(len(netG_state_dict.keys()), len(model_dec_state_dict.keys())) for i, keys in enumerate( zip(netG_state_dict.keys(), model_dec_state_dict.keys())): newkey, oldkey = keys print("Transferring ", newkey, " to ", oldkey) netG_state_dict[newkey] = model_dec_state_dict[oldkey] netG.load_state_dict(netG_state_dict) print("Decoder Model Loaded!") init_weights(netD) print(netD) print(netF) print(netG) optimD = torch.optim.Adam(netD.parameters(), lr=g_conf.LR_D, betas=(0.5, 0.999)) optimG = torch.optim.Adam(netG.parameters(), lr=g_conf.LR_G, betas=(0.5, 0.999)) if g_conf.TYPE == 'task': optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE) Task_Loss = TaskLoss() if g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d': print("Using cross entropy!") Loss = torch.nn.CrossEntropyLoss().cuda() L1_loss = torch.nn.L1Loss().cuda() iteration = 0 best_loss_iter_F = 0 best_loss_iter_G = 0 best_lossF = 1000000.0 best_lossD = 1000000.0 best_lossG = 1000000.0 accumulated_time = 0 n_critic = g_conf.N_CRITIC lossF = Variable(torch.Tensor([100.0])) lossG_adv = Variable(torch.Tensor([100.0])) lossG_smooth = Variable(torch.Tensor([100.0])) lossG = Variable(torch.Tensor([100.0])) netG.train() netD.train() netF.train() capture_time = time.time() if not os.path.exists('./imgs_' + exp_alias): os.mkdir('./imgs_' + exp_alias) fake_img_pool_src = ImagePool(50) fake_img_pool_tgt = ImagePool(50) for data in data_loader: set_requires_grad(netD, True) set_requires_grad(netF, True) set_requires_grad(netG, True) input_data, float_data = data tgt_imgs = real_dl.get_imgs() if g_conf.IF_AUG: inputs = augmenter(0, input_data['rgb']) else: inputs = input_data['rgb'].cuda() tgt_imgs = tgt_imgs.cuda() inputs = inputs.squeeze(1) inputs = 
inputs tgt_imgs = tgt_imgs controls = float_data[:, dataset.controls_position(), :] camera_angle = float_data[:, 26, :] camera_angle = camera_angle.cuda() steer = float_data[:, 0, :] steer = steer.cuda() speed = float_data[:, 10, :] speed = speed.cuda() time_use = 1.0 car_length = 3.0 extra_factor = 2.5 threshold = 1.0 pos = camera_angle > 0.0 pos = pos.type(torch.FloatTensor) neg = camera_angle <= 0.0 neg = neg.type(torch.FloatTensor) pos = pos.cuda() neg = neg.cuda() rad_camera_angle = math.pi * (torch.abs(camera_angle)) / 180.0 val = extra_factor * (torch.atan((rad_camera_angle * car_length) / (time_use * speed + 0.05))) / 3.1415 steer -= pos * torch.min(val, torch.Tensor([0.6]).cuda()) steer += neg * torch.min(val, torch.Tensor([0.6]).cuda()) steer = steer.cpu() float_data[:, 0, :] = steer float_data[:, 0, :][float_data[:, 0, :] > 1.0] = 1.0 float_data[:, 0, :][float_data[:, 0, :] < -1.0] = -1.0 src_embed_inputs, src_branches = netF( inputs, dataset.extract_inputs(float_data).cuda()) tgt_embed_inputs = netF(tgt_imgs, None) src_fake_inputs = netG(src_embed_inputs.detach()) tgt_fake_inputs = netG(tgt_embed_inputs.detach()) if iteration % 100 == 0: imgs_to_save = torch.cat((inputs[:1], src_fake_inputs[:1], tgt_imgs[:1], tgt_fake_inputs[:1]), 0).cpu().data coil_logger.add_image("Images", imgs_to_save, iteration) imgs_to_save = imgs_to_save.clamp(0.0, 1.0) vutils.save_image(imgs_to_save, './imgs_' + exp_alias + '/' + str(iteration) + '_real_and_fake.png', normalize=False) ##--------------------Discriminator part!!!!!!!!!!-------------------## ##source fake if g_conf.IF_POOL: src_fake_inputs_forD = fake_img_pool_src.query(src_fake_inputs) tgt_fake_inputs_forD = fake_img_pool_tgt.query(tgt_fake_inputs) else: src_fake_inputs_forD = src_fake_inputs tgt_fake_inputs_forD = tgt_fake_inputs set_requires_grad(netD, True) set_requires_grad(netF, False) set_requires_grad(netG, False) optimD.zero_grad() outputsD_fake_src_bin, __ = netD(src_fake_inputs_forD.detach()) outputsD_fake_tgt_bin, __ = netD(tgt_fake_inputs_forD.detach()) outputsD_real_src_bin, __ = netD(inputs) outputsD_real_tgt_bin, __ = netD(tgt_imgs) gradient_penalty_src = calc_gradient_penalty(netD, inputs, src_fake_inputs_forD, "recon") lossD_bin_src = torch.mean( outputsD_fake_src_bin - outputsD_real_src_bin) + gradient_penalty_src gradient_penalty_tgt = calc_gradient_penalty(netD, tgt_imgs, tgt_fake_inputs_forD, "recon") lossD_bin_tgt = torch.mean( outputsD_fake_tgt_bin - outputsD_real_tgt_bin) + gradient_penalty_tgt lossD = (lossD_bin_src + lossD_bin_tgt) * 0.5 lossD.backward(retain_graph=True) optimD.step() coil_logger.add_scalar('Total LossD Bin', lossD.data, iteration) coil_logger.add_scalar('Src LossD Bin', lossD_bin_src.data, iteration) coil_logger.add_scalar('Tgt LossD Bin', lossD_bin_tgt.data, iteration) ##--------------------Generator part!!!!!!!!!!-----------------------## set_requires_grad(netD, False) set_requires_grad(netF, False) set_requires_grad(netG, True) optimG.zero_grad() #fake outputs for bin outputsD_bin_src_fake_forG, __ = netD(src_fake_inputs) outputsD_bin_tgt_fake_forG, __ = netD(tgt_fake_inputs) #Generator updates if ((iteration + 1) % n_critic) == 0: #for netD_bin optimG.zero_grad() outputsD_bin_fake_forG = netD(tgt_imgs) #Generator updates lossG_src_smooth = L1_loss( src_fake_inputs, inputs) # L1 loss with real domain image lossG_tgt_smooth = L1_loss( tgt_fake_inputs, tgt_imgs) # L1 loss with real domain image lossG_src_adv_bin = -1.0 * torch.mean(outputsD_bin_src_fake_forG) lossG_tgt_adv_bin = -1.0 * 
torch.mean(outputsD_bin_tgt_fake_forG) lossG_adv_bin = 0.5 * (lossG_src_adv_bin + lossG_tgt_adv_bin) lossG_Adv = lossG_adv_bin lossG_L1 = 0.5 * (lossG_src_smooth + lossG_tgt_smooth) lossG = (lossG_Adv + l1weight * lossG_L1) / (1.0 + l1weight) lossG.backward(retain_graph=True) optimG.step() coil_logger.add_scalar('Total LossG', lossG.data, iteration) coil_logger.add_scalar('LossG Adv', lossG_Adv.data, iteration) coil_logger.add_scalar('Adv Bin LossG', lossG_adv_bin.data, iteration) coil_logger.add_scalar('Smooth LossG', lossG_L1.data, iteration) #####Task network updates########################## set_requires_grad(netD, False) set_requires_grad(netF, True) set_requires_grad(netG, False) optimF.zero_grad() lossF_task = Task_Loss.MSELoss( src_branches, dataset.extract_targets(float_data).cuda(), controls.cuda(), dataset.extract_inputs(float_data).cuda()) __, outputsD_fake_src_da = netD(src_fake_inputs_forD.detach()) __, outputsD_real_tgt_da = netD(tgt_imgs) __, outputsD_fake_tgt_da = netD(tgt_fake_inputs_forD.detach()) __, outputsD_real_src_da = netD(inputs) gradient_penalty_da_1 = calc_gradient_penalty( netD, tgt_imgs, src_fake_inputs_forD, "da") lossF_da_1 = torch.mean(outputsD_fake_src_da - outputsD_real_tgt_da ) + gradient_penalty_da_1 gradient_penalty_da_2 = calc_gradient_penalty( netD, inputs, tgt_fake_inputs_forD, "da") lossF_da_2 = torch.mean(outputsD_fake_tgt_da - outputsD_real_src_da ) + gradient_penalty_da_2 lossF_da = 0.5 * (lossF_da_1 + lossF_da_2) lossF = (lossF_task + task_adv_weight * lossF_da) / (1.0 + task_adv_weight) coil_logger.add_scalar('Total Task Loss', lossF.data, iteration) coil_logger.add_scalar('Adv Task Loss', lossF_da.data, iteration) coil_logger.add_scalar('Only Task Loss', lossF_task.data, iteration) lossF.backward(retain_graph=True) optimF.step() if lossG.data < best_lossG: best_lossG = lossG.data.tolist() best_loss_iter_G = iteration if lossF.data < best_lossF: best_lossF = lossF.data.tolist() best_loss_iter_F = iteration #optimization for one iter done! position = random.randint(0, len(float_data) - 1) if lossD.data < best_lossD: best_lossD = lossD.data.tolist() accumulated_time += time.time() - capture_time capture_time = time.time() if is_ready_to_save(iteration): state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'stateF_dict': netF.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter_G': best_loss_iter_G, 'best_loss_iter_F': best_loss_iter_F } torch.save( state, os.path.join('/datatmp/Datasets/rohitgan/_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) if iteration == best_loss_iter_F and iteration > 10000: state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'stateF_dict': netF.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'best_lossF': best_lossF, 'total_time': accumulated_time, 'best_loss_iter_F': best_loss_iter_F } torch.save( state, os.path.join('/datatmp/Datasets/rohitgan/_logs', exp_batch, exp_alias, 'best_modelF' + '.pth')) iteration += 1
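# fake_img_pool_src / fake_img_pool_tgt above use an ImagePool whose
# implementation is not shown in this file. A from-scratch sketch in the spirit
# of the CycleGAN-style history buffer (with probability 0.5, return a previously
# generated image and store the new one in its place), written only as an
# illustration of the idea:
import random
import torch

class ImagePoolSketch:
    def __init__(self, pool_size=50):
        self.pool_size = pool_size
        self.images = []

    def query(self, images):
        out = []
        for img in images:
            img = img.unsqueeze(0)
            if len(self.images) < self.pool_size:
                self.images.append(img)   # buffer not full yet: keep and return the new image
                out.append(img)
            elif random.random() < 0.5:
                idx = random.randrange(len(self.images))
                out.append(self.images[idx])  # return an old image, store the new one
                self.images[idx] = img
            else:
                out.append(img)
        return torch.cat(out, 0)

pool_demo = ImagePoolSketch(pool_size=2)
print(pool_demo.query(torch.randn(4, 3, 8, 8)).shape)  # torch.Size([4, 3, 8, 8])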
def execute(gpu, exp_batch, exp_alias):
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})

    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')

    sys.stdout = open(os.path.join(
        '_output_logs',
        g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
        "a", buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE,
        g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    l1weight = 1.0
    image_size = tuple([88, 200])
    testmode = 1

    # The models must live on the GPU: the inputs fed below are CUDA tensors.
    modelG = coil_ganmodules_task._netG().cuda()
    modelF = coil_ganmodules_task._netF().cuda()

    fstd = torch.load('netF_GAN_Pretrained.wts')
    gstd = torch.load('netG_GAN_Pretrained.wts')
    print(fstd.keys())
    print("+++++++++")
    print(gstd.keys())
    print(modelG)

    modelF.load_state_dict(fstd)
    modelG.load_state_dict(gstd)
    print(modelF)
    print(modelG)
    # modelG.eval()

    # Make sure the image dump directory exists before saving into it.
    if not os.path.exists('./imgs'):
        os.mkdir('./imgs')

    iteration = 0
    capture_time = time.time()
    for data in data_loader:
        val = 0.5
        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        inputs_in = inputs - val  # shifted by 0.5

        # Forward pass: encode with the task network, reconstruct with the generator.
        embeds, branches = modelF(inputs)
        fake_inputs = modelG(embeds)

        imgs_to_save = torch.cat((inputs_in[:2] + val, fake_inputs[:2]),
                                 0).cpu().data
        vutils.save_image(imgs_to_save,
                          './imgs' + '/' + str(iteration) + '_real_and_fake.png',
                          normalize=True)
        coil_logger.add_image("Images", imgs_to_save, iteration)
        iteration += 1
def execute(gpu, exp_batch, exp_alias): os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor() ])) sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) l1weight = g_conf.L1_WEIGHT image_size = tuple([88, 200]) print("Configurations of ", exp_alias) print("GANMODEL_NAME", g_conf.GANMODEL_NAME) print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION) print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE) print("SKIP", g_conf.SKIP) print("TYPE", g_conf.TYPE) print("L1 WEIGHT", g_conf.L1_WEIGHT) print("LAB SMOOTH", g_conf.LABSMOOTH) if g_conf.GANMODEL_NAME == 'LSDcontrol': netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION, skip=g_conf.SKIP).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch': netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller': netD = ganmodels_nopatch_smaller._netD( loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch_smaller._netG( loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_task': netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda() netF = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda() init_weights(netD) init_weights(netG) print(netD) print(netG) optimD = torch.optim.Adam(netD.parameters(), lr=g_conf.LR_D, betas=(0.7, 0.999)) optimG = torch.optim.Adam(netG.parameters(), lr=g_conf.LR_G, betas=(0.7, 0.999)) if g_conf.TYPE == 'task': optimF = torch.optim.Adam(netG.parameters(), lr=g_conf.LEARNING_RATE, betas=(0.7, 0.999)) Task_Loss = TaskLoss() if g_conf.LOSS_FUNCTION == 'LSGAN': Loss = torch.nn.MSELoss().cuda() elif g_conf.LOSS_FUNCTION == 'NORMAL': Loss = torch.nn.BCELoss().cuda() L1_loss = torch.nn.L1Loss().cuda() iteration = 0 best_loss_iter = 0 best_lossD = 1000000.0 best_lossG = 1000000.0 accumulated_time = 0 netG.train() netD.train() capture_time = time.time() if not os.path.exists('./imgs_' + exp_alias): os.mkdir('./imgs_' + exp_alias) #TODO add image queue #TODO add auxiliary regression loss for steering #TODO put family for losses fake_img_pool = ImagePool(50) for data in data_loader: val = 0.5 input_data, float_data = data inputs = input_data['rgb'].cuda() inputs = inputs.squeeze(1) inputs_in = inputs - val fake_inputs = netG(inputs_in) #subtracted by 0.5 fake_inputs_in = fake_inputs if iteration % 200 == 0: imgs_to_save = torch.cat((inputs_in[:2] + val, fake_inputs_in[:2]), 0).cpu().data vutils.save_image(imgs_to_save, './imgs_' + exp_alias + '/' + str(iteration) + '_real_and_fake.png', normalize=True) 
            coil_logger.add_image("Images", imgs_to_save, iteration)

        ##-------------------- Discriminator part --------------------##
        set_requires_grad(netD, True)
        optimD.zero_grad()

        ## fake
        fake_inputs_forD = fake_img_pool.query(fake_inputs)
        outputsD_fake_forD = netD(fake_inputs_forD.detach())
        labsize = outputsD_fake_forD.size()
        labels_fake = torch.zeros(labsize)  # Fake labels
        label_fake_noise = torch.rand(
            labels_fake.size()) * 0.05 - 0.025  # Label smoothing

        if g_conf.LABSMOOTH == 1:
            labels_fake = labels_fake + label_fake_noise

        labels_fake = Variable(labels_fake).cuda()
        lossD_fake = Loss(outputsD_fake_forD, labels_fake)

        ## real
        outputsD_real = netD(inputs)
        print("some d outputs", outputsD_real[0])
        labsize = outputsD_real.size()
        labels_real = torch.ones(labsize)  # Real labels
        label_real_noise = torch.rand(
            labels_real.size()) * 0.05 - 0.025  # Label smoothing

        if g_conf.LABSMOOTH == 1:
            labels_real = labels_real + label_real_noise

        labels_real = Variable(labels_real).cuda()
        lossD_real = Loss(outputsD_real, labels_real)

        # Discriminator updates
        lossD = (lossD_real + lossD_fake) * 0.5
        # lossD /= len(inputs)
        lossD.backward()
        optimD.step()

        coil_logger.add_scalar('Total LossD', lossD.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data, iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data, iteration)

        ##-------------------- Generator part --------------------##
        set_requires_grad(netD, False)
        optimG.zero_grad()
        outputsD_fake_forG = netD(fake_inputs)

        # Generator updates
        lossG_adv = Loss(outputsD_fake_forG, labels_real)
        lossG_smooth = L1_loss(fake_inputs, inputs)
        lossG = (lossG_adv + l1weight * lossG_smooth) / (1.0 + l1weight)
        lossG.backward()
        optimG.step()

        coil_logger.add_scalar('Total LossG', lossG.data, iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data, iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data, iteration)

        # optimization for one iter done!
position = random.randint(0, len(float_data) - 1) if lossD.data < best_lossD: best_lossD = lossD.data.tolist() if lossG.data < best_lossG: best_lossG = lossG.data.tolist() best_loss_iter = iteration accumulated_time += time.time() - capture_time capture_time = time.time() print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(), "BestLossD", best_lossD, "BestLossG", best_lossG, "Iteration", iteration, "Best Loss Iteration", best_loss_iter) coil_logger.add_message( 'Iterating', { 'Iteration': iteration, 'LossD': lossD.data.tolist(), 'LossG': lossG.data.tolist(), 'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time, 'BestLossD': best_lossD, 'BestLossIteration': best_loss_iter, 'BestLossG': best_lossG, 'BestLossIteration': best_loss_iter, 'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(), 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist() }, iteration) if is_ready_to_save(iteration): state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter': best_loss_iter } torch.save( state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) if iteration == best_loss_iter: state = { 'iteration': iteration, 'stateD_dict': netD.state_dict(), 'stateG_dict': netG.state_dict(), 'best_lossD': best_lossD, 'best_lossG': best_lossG, 'total_time': accumulated_time, 'best_loss_iter': best_loss_iter } torch.save( state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias, 'best_modelG' + '.pth')) iteration += 1
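# --- Illustration: what an ImagePool-style buffer typically does ----------
# fake_img_pool = ImagePool(50) and its .query(...) call in the loop above
# come from elsewhere in the repo. The usual behaviour (as popularized by
# CycleGAN) is a history buffer: the discriminator is sometimes shown older
# generated images instead of only the freshest ones, which tends to
# stabilise training. A minimal sketch of that idea, not the project's class:
import random
import torch


class ImagePoolSketch(object):
    def __init__(self, pool_size=50):
        self.pool_size = pool_size
        self.images = []

    def query(self, images):
        if self.pool_size == 0:
            return images
        out = []
        for img in images:
            img = img.detach().unsqueeze(0)
            if len(self.images) < self.pool_size:
                self.images.append(img)
                out.append(img)
            elif random.random() > 0.5:
                idx = random.randint(0, self.pool_size - 1)
                out.append(self.images[idx].clone())
                self.images[idx] = img  # swap a stored image for the new one
            else:
                out.append(img)
        return torch.cat(out, 0)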
def folder_execute(params=None): """ Execute a folder of experiments. It will execute trainings and all the selected evaluations for each of the models present on the folder. Args params: a dictionary containing: gpus: the gpu numbers that are going to be allocated for the experiment gpu_value: the "value" of each gpu, depending on the value more or less experiments will be allocated per GPU folder: the folder where all the experiment configuration files are validation_datasets: the validation datasets that are going to be validated per experiment driving_environments: The driving environments where the models are going to be tested. """ folder = params['folder'] allocated_gpus = params['gpus'] validation_datasets = params['validation_datasets'] driving_environments = params['driving_environments'] allocation_parameters = params['allocation_parameters'] experiments_list = os.listdir(os.path.join('configs', folder)) experiments_list = [ experiment.split('.')[-2] for experiment in experiments_list ] allocated_gpus = { gpu: allocation_parameters['gpu_value'] for gpu in allocated_gpus } executing_processes = [] free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources( allocated_gpus, executing_processes, allocation_parameters) # Is a queue of tasks to be executed. The priority is always train. # then test then val. tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'], [], [], validation_datasets, driving_environments) # No process is executing right now. while True: # if not done or executing get to the list # If amount of resources is smaller than a threshold. while resources_on_most_free_gpu >= min([allocation_parameters['train_cost'], allocation_parameters['validation_cost'], allocation_parameters['drive_cost']]) \ and tasks_queue != []: # Allocate all the gpus popped_thing = heapq.heappop(tasks_queue) process_specs = popped_thing[2] # To get directly the dict # Get the train status, that will affect in scheduling a validation or drive process train_status = monitorer.get_status(folder, process_specs['experiment'], 'train')[0] # ADD TRAIN TO EXECUTE if process_specs['type'] == 'train' and resources_on_most_free_gpu >= \ allocation_parameters['train_cost']: free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources( free_gpus, allocation_parameters['train_cost']) execute_train(gpu_number, process_specs['folder'], process_specs['experiment'], params['number_of_workers']) process_specs.update({'gpu': gpu_number}) executing_processes.append(process_specs) # ADD DRIVE TO EXECUTE elif process_specs['type'] == 'drive' and resources_on_most_free_gpu >= \ allocation_parameters['drive_cost'] \ and (train_status == 'Iterating' or train_status == 'Loading' or train_status == 'Finished'): print(process_specs['type']) free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources( free_gpus, allocation_parameters['drive_cost']) execute_drive(gpu_number, process_specs['folder'], process_specs['experiment'], process_specs['environment'], params['driving_parameters']) process_specs.update({'gpu': gpu_number}) executing_processes.append(process_specs) # ADD VALIDATION TO EXECUTE elif process_specs['type'] == 'validation' and resources_on_most_free_gpu >= \ allocation_parameters['validation_cost'] \ and (train_status == 'Iterating' or train_status == 'Loading' or train_status == 'Finished'): free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources( free_gpus, allocation_parameters['validation_cost']) 
execute_validation(gpu_number, process_specs['folder'], process_specs['experiment'], process_specs['dataset']) process_specs.update({'gpu': gpu_number}) executing_processes.append(process_specs) tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'], executing_processes, tasks_queue, validation_datasets, driving_environments, False) printer.plot_folder_summaries(folder, params['is_training'], validation_datasets, driving_environments) # Check allocated process, and look which ones finished. if len(tasks_queue) == 0 and len(executing_processes) == 0: break free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources( allocated_gpus, executing_processes, allocation_parameters) time.sleep(10) print("ALL EXPERIMENTS EXECUTED")
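# --- Illustration: how the task heap above orders work --------------------
# mount_experiment_heap() pushes (priority, name, spec) tuples onto a heapq,
# and heapq always pops the smallest tuple first, so a lower priority number
# is scheduled earlier (the comments above say train first, then the other
# process types). The numbers below are only an example of the scheme, not
# the project's actual priority values.
import heapq

tasks_queue_example = []
heapq.heappush(tasks_queue_example, (2, 'exp1_validation_Town02', {'type': 'validation'}))
heapq.heappush(tasks_queue_example, (0, 'exp1_train', {'type': 'train'}))
heapq.heappush(tasks_queue_example, (1, 'exp1_drive_Town01', {'type': 'drive'}))

while tasks_queue_example:
    priority, name, spec = heapq.heappop(tasks_queue_example)
    print(priority, name)  # the train task pops first, validation last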
def execute(gpu, exp_batch, exp_alias): from time import gmtime, strftime manualSeed = g_conf.SEED torch.cuda.manual_seed(manualSeed) os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) real_dataset = g_conf.TARGET_DOMAIN_PATH #main data loader dataset = CoILDataset(full_dataset, real_dataset, transform=transforms.Compose([transforms.ToTensor() ])) sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) st = lambda aug: iag.Sometimes(aug, 0.4) oc = lambda aug: iag.Sometimes(aug, 0.3) rl = lambda aug: iag.Sometimes(aug, 0.09) augmenter = iag.Augmenter([iag.ToGPU()] + [ rl(iag.GaussianBlur( (0, 1.5))), # blur images with a sigma between 0 and 1.5 rl(iag.AdditiveGaussianNoise(loc=0, scale=( 0.0, 0.05), per_channel=0.5)), # add gaussian noise to images oc(iag.Dropout((0.0, 0.10), per_channel=0.5) ), # randomly remove up to X% of the pixels oc( iag.CoarseDropout( (0.0, 0.10), size_percent=(0.08, 0.2), per_channel=0.5)), # randomly remove up to X% of the pixels oc(iag.Add((-40, 40), per_channel=0.5) ), # change brightness of images (by -X to Y of original value) st(iag.Multiply((0.10, 2), per_channel=0.2) ), # change brightness of images (X-Y% of original value) rl(iag.ContrastNormalization( (0.5, 1.5), per_channel=0.5)), # improve or worsen the contrast rl(iag.Grayscale((0.0, 1))), # put grayscale ] # do all of the above in random order ) l1weight = g_conf.L1_WEIGHT task_adv_weight = g_conf.TASK_ADV_WEIGHT image_size = tuple([88, 200]) print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) print("GPU", gpu) print("Configurations of ", exp_alias) print("GANMODEL_NAME", g_conf.GANMODEL_NAME) print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION) print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE) print("SKIP", g_conf.SKIP) print("TYPE", g_conf.TYPE) print("L1 WEIGHT", g_conf.L1_WEIGHT) print("TASK ADV WEIGHT", g_conf.TASK_ADV_WEIGHT) print("LAB SMOOTH", g_conf.LABSMOOTH) if g_conf.GANMODEL_NAME == 'LSDcontrol': netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION, skip=g_conf.SKIP).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch': netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller': netD = ganmodels_nopatch_smaller._netD( loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_nopatch_smaller._netG( loss=g_conf.LOSS_FUNCTION).cuda() elif g_conf.GANMODEL_NAME == 'LSDcontrol_task': netD_task = ganmodels_task._netD_task(loss=g_conf.LOSS_FUNCTION).cuda() netD_img = ganmodels_task._netD_img(loss=g_conf.LOSS_FUNCTION).cuda() netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda() netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda() if g_conf.PRETRAINED 
== 'RECON': netF_statedict = torch.load('netF_GAN_Pretrained.wts') netF.load_state_dict(netF_statedict) elif g_conf.PRETRAINED == 'IL': print("Loading IL") model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth') model_IL_state_dict = model_IL['state_dict'] netF_state_dict = netF.state_dict() print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys())) for i, keys in enumerate( zip(netF_state_dict.keys(), model_IL_state_dict.keys())): newkey, oldkey = keys # if newkey.split('.')[0] == "branch" and oldkey.split('.')[0] == "branches": # print("No Transfer of ", newkey, " to ", oldkey) # else: print("Transferring ", newkey, " to ", oldkey) netF_state_dict[newkey] = model_IL_state_dict[oldkey] netF.load_state_dict(netF_state_dict) print("IL Model Loaded!") elif g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d': netD_bin = ganmodels_task._netD_task().cuda() netD_img = ganmodels_task._netD_img().cuda() netG = ganmodels_task._netG().cuda() netF = ganmodels_task._netF().cuda() if g_conf.PRETRAINED == 'IL': print("Loading IL") model_IL = torch.load(g_conf.IL_AGENT_PATH) model_IL_state_dict = model_IL['state_dict'] netF_state_dict = netF.state_dict() print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys())) for i, keys in enumerate( zip(netF_state_dict.keys(), model_IL_state_dict.keys())): newkey, oldkey = keys print("Transferring ", newkey, " to ", oldkey) netF_state_dict[newkey] = model_IL_state_dict[oldkey] netF.load_state_dict(netF_state_dict) print("IL Model Loaded!") #### if g_conf.IF_AUG: print("Loading Aug Decoder") model_dec = torch.load(g_conf.DECODER_RECON_PATH) else: print("Loading Decoder") model_dec = torch.load(g_conf.DECODER_RECON_PATH) model_dec_state_dict = model_dec['stateG_dict'] netG_state_dict = netG.state_dict() print(len(netG_state_dict.keys()), len(model_dec_state_dict.keys())) for i, keys in enumerate( zip(netG_state_dict.keys(), model_dec_state_dict.keys())): newkey, oldkey = keys print("Transferring ", newkey, " to ", oldkey) netG_state_dict[newkey] = model_dec_state_dict[oldkey] netG.load_state_dict(netG_state_dict) print("Decoder Model Loaded!") init_weights(netD_bin) init_weights(netD_img) # init_weights(netG) print(netD_bin) print(netF) optimD_bin = torch.optim.Adam(netD_bin.parameters(), lr=g_conf.LR_D, betas=(0.5, 0.999)) optimD_img = torch.optim.Adam(netD_img.parameters(), lr=g_conf.LR_D, betas=(0.5, 0.999)) optimG = torch.optim.Adam(netG.parameters(), lr=g_conf.LR_D, betas=(0.5, 0.999)) if g_conf.TYPE == 'task': optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE) Task_Loss = TaskLoss() if g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d': print("Using cross entropy!") Loss = torch.nn.CrossEntropyLoss().cuda() L1_loss = torch.nn.L1Loss().cuda() iteration = 0 best_loss_iter_F = 0 best_loss_iter_G = 0 best_lossF = 1000000.0 best_lossD = 1000000.0 best_lossG = 1000000.0 accumulated_time = 0 gen_iterations = 0 n_critic = g_conf.N_CRITIC lossF = Variable(torch.Tensor([100.0])) lossG_adv = Variable(torch.Tensor([100.0])) lossG_smooth = Variable(torch.Tensor([100.0])) lossG = Variable(torch.Tensor([100.0])) netD_bin.train() netD_img.train() netG.train() netF.train() capture_time = time.time() if not os.path.exists('./imgs_' + exp_alias): os.mkdir('./imgs_' + exp_alias) #TODO check how C network is optimized in LSDSEG #TODO put family for losses #IMPORTANT WHILE RUNNING THIS, CONV.PY MUST HAVE BATCHNORMS fake_img_pool_src = ImagePool(50) fake_img_pool_tgt = ImagePool(50) for data in data_loader: set_requires_grad(netD_bin, True) 
set_requires_grad(netD_img, True) set_requires_grad(netG, True) set_requires_grad(netF, True) # print("ITERATION:", iteration) val = 0.0 input_data, float_data, tgt_imgs = data if g_conf.IF_AUG: inputs = augmenter(0, input_data['rgb']) tgt_imgs = augmenter(0, tgt_imgs) else: inputs = input_data['rgb'].cuda() tgt_imgs = tgt_imgs.cuda() inputs = inputs.squeeze(1) inputs = inputs - val #subtracted by 0.5 tgt_imgs = tgt_imgs - val #subtracted by 0.5 controls = float_data[:, dataset.controls_position(), :] src_embed_inputs, src_branches = netF( inputs, dataset.extract_inputs(float_data).cuda()) tgt_embed_inputs = netF(tgt_imgs, None) src_img_fake = netG(src_embed_inputs) tgt_img_fake = netG(tgt_embed_inputs) if iteration % 100 == 0: imgs_to_save = torch.cat( (inputs[:1] + val, src_img_fake[:1] + val, tgt_imgs[:1] + val, tgt_img_fake[:1] + val), 0).cpu().data coil_logger.add_image("Images", imgs_to_save, iteration) imgs_to_save = imgs_to_save.clamp(0.0, 1.0) vutils.save_image(imgs_to_save, './imgs_' + exp_alias + '/' + str(iteration) + '_real_and_fake.png', normalize=False) ##--------------------Discriminator part!!!!!!!!!!-------------------## set_requires_grad(netD_bin, True) set_requires_grad(netD_img, False) set_requires_grad(netG, False) set_requires_grad(netF, False) optimD_bin.zero_grad() outputsD_real_src_bin = netD_bin(src_embed_inputs) outputsD_real_tgt_bin = netD_bin(tgt_embed_inputs) gradient_penalty = calc_gradient_penalty(netD_bin, src_embed_inputs, tgt_embed_inputs) lossD_bin = torch.mean(outputsD_real_tgt_bin - outputsD_real_src_bin) + gradient_penalty lossD_bin.backward(retain_graph=True) optimD_bin.step() coil_logger.add_scalar('Total LossD Bin', lossD_bin.data, iteration) #### Discriminator img update #### set_requires_grad(netD_bin, False) set_requires_grad(netD_img, True) set_requires_grad(netG, False) set_requires_grad(netF, False) optimD_img.zero_grad() outputsD_fake_src_img = netD_img(src_img_fake.detach()) outputsD_fake_tgt_img = netD_img(tgt_img_fake.detach()) outputsD_real_src_img = netD_img(inputs) outputsD_real_tgt_img = netD_img(tgt_imgs) gradient_penalty_src = calc_gradient_penalty(netD_img, inputs, src_img_fake) lossD_img_src = torch.mean( outputsD_fake_src_img - outputsD_real_src_img) + gradient_penalty_src gradient_penalty_tgt = calc_gradient_penalty(netD_img, tgt_imgs, tgt_img_fake) lossD_img_tgt = torch.mean( outputsD_fake_tgt_img - outputsD_real_tgt_img) + gradient_penalty_tgt lossD_img = (lossD_img_src + lossD_img_tgt) * 0.5 lossD_img.backward(retain_graph=True) optimD_img.step() coil_logger.add_scalar('Total LossD img', lossD_img.data, iteration) if ((iteration + 1) % n_critic) == 0: #####Generator updates####### set_requires_grad(netD_bin, False) set_requires_grad(netD_img, False) set_requires_grad(netG, True) set_requires_grad(netF, False) outputsD_fake_src_img = netD_img(src_img_fake) outputsD_real_tgt_img = netD_img(tgt_imgs) outputsD_fake_tgt_img = netD_img(tgt_img_fake) lossG_src_smooth = L1_loss(src_img_fake, inputs) lossG_tgt_smooth = L1_loss(tgt_img_fake, tgt_imgs) lossG_smooth = (lossG_src_smooth + lossG_tgt_smooth) * 0.5 lossG_adv = 0.5 * (-1.0 * outputsD_fake_src_img.mean() - 1.0 * outputsD_fake_tgt_img.mean()) lossG = (lossG_smooth + 0.0 * lossG_adv) lossG.backward(retain_graph=True) optimG.step() coil_logger.add_scalar('Total LossG', lossG.data, iteration) #####Task network updates########################## set_requires_grad(netD_bin, False) set_requires_grad(netD_img, False) set_requires_grad(netG, False) set_requires_grad(netF, True) 
optimF.zero_grad() src_embed_inputs, src_branches = netF( inputs, dataset.extract_inputs(float_data).cuda()) tgt_embed_inputs = netF(tgt_imgs, None) src_img_fake = netG(src_embed_inputs) tgt_img_fake = netG(tgt_embed_inputs) outputsD_fake_src_img = netD_img(src_img_fake) outputsD_real_tgt_img = netD_img(tgt_imgs) lossF_task = Task_Loss.MSELoss( src_branches, dataset.extract_targets(float_data).cuda(), controls.cuda(), dataset.extract_inputs(float_data).cuda()) lossF_adv_bin = netD_bin(src_embed_inputs).mean() - netD_bin( tgt_embed_inputs).mean() lossF_adv_img = outputsD_fake_src_img.mean( ) - outputsD_real_tgt_img.mean() lossF_adv = 0.5 * (lossF_adv_bin + 0.1 * lossF_adv_img) lossF = (lossF_task + task_adv_weight * lossF_adv) coil_logger.add_scalar('Total Task Loss', lossF.data, iteration) coil_logger.add_scalar('Adv Task Loss', lossF_adv.data, iteration) coil_logger.add_scalar('Only Task Loss', lossF_task.data, iteration) lossF.backward(retain_graph=True) optimF.step() if lossF.data < best_lossF: best_lossF = lossF.data.tolist() best_loss_iter_F = iteration #optimization for one iter done! position = random.randint(0, len(float_data) - 1) accumulated_time += time.time() - capture_time capture_time = time.time() if is_ready_to_save(iteration): state = { 'iteration': iteration, 'stateD_bin_dict': netD_bin.state_dict(), 'stateF_dict': netF.state_dict(), 'best_lossD': best_lossD, 'total_time': accumulated_time, 'best_loss_iter_F': best_loss_iter_F } torch.save( state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias, 'checkpoints', str(iteration) + '.pth')) if iteration == best_loss_iter_F and iteration > 10000: state = { 'iteration': iteration, 'stateD_bin_dict': netD_bin.state_dict(), 'stateF_dict': netF.state_dict(), 'best_lossD': best_lossD, 'best_lossF': best_lossF, 'total_time': accumulated_time, 'best_loss_iter_F': best_loss_iter_F } torch.save( state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias, 'best_modelF' + '.pth')) iteration += 1
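# --- Illustration: name/shape-matched weight transfer ---------------------
# The pretrained-weight loading above copies tensors by zipping the two
# state_dict key lists in order, which silently assumes both networks
# enumerate their layers identically. The helper below sketches a more
# defensive variant that copies only parameters whose names and shapes
# agree. It is an illustrative alternative, not a function from the repo.
def transfer_matching_weights(target_net, source_state_dict):
    target_state = target_net.state_dict()
    for name, tensor in source_state_dict.items():
        if name in target_state and target_state[name].shape == tensor.shape:
            target_state[name] = tensor
        else:
            print("Skipping", name)
    target_net.load_state_dict(target_state)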
def execute(gpu, exp_batch, exp_alias): # We set the visible cuda devices try: os.environ["CUDA_VISIBLE_DEVICES"] = gpu # At this point the log file with the correct naming is created. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": # TODO: print some cool summary or not ? return #Define the dataset. This structure is has the __get_item__ redefined in a way #that you can access the HDFILES positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) #augmenter_cpu = iag.AugmenterCPU(g_conf.AUGMENTATION_SUITE_CPU) dataset = CoILDataset(full_dataset, transform=transforms.Compose( [transforms.ToTensor()])) # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) # The data loader is the multi threaded module from pytorch that release a number of # workers to get all the data. # TODO: batch size an number of workers go to some configuration file data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=12, pin_memory=True) # By instanciating the augmenter we get a callable that augment images and transform them # into tensors. augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE) # TODO: here there is clearly a posibility to make a cool "conditioning" system. 
        model = CoILModel(g_conf.MODEL_NAME)
        model.cuda()
        print(model)

        criterion = Loss()

        # TODO: DATASET SIZE SEEMS WEIRD
        optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(get_latest_saved_checkpoint())))
            iteration = checkpoint['iteration']
            accumulated_time = checkpoint['total_time']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            best_loss_iter = 0

        # TODO: The checkpoint will continue, so it should erase everything up to the iteration

        print(dataset.meta_data)
        print(model)
        capture_time = time.time()

        for data in data_loader:
            input_data, float_data = data

            # TODO, ADD ITERATION SCHEDULE
            input_rgb_data = augmenter(0, input_data['rgb'])
            # coil_logger.add_images(input_rgb_data)

            # Get the control commands from float_data, size = [120, 1]
            controls = float_data[:, dataset.controls_position(), :]
            print(" CONTROLS ", controls.shape)

            # The output (branches) is a list of 5 branch results, each of size [120, 3]
            model.zero_grad()
            print('INPUTS', dataset.extract_inputs(float_data).shape)
            branches = model(input_rgb_data,
                             dataset.extract_inputs(float_data).cuda())
            # print("len ", len(branches))

            # targets = torch.cat([steer_gt, gas_gt, brake_gt], 1)
            print("Extracted targets ",
                  dataset.extract_targets(float_data).shape[0])
            loss = criterion.MSELoss(
                branches,
                dataset.extract_targets(float_data).cuda(), controls.cuda(),
                dataset.extract_inputs(float_data).cuda())

            # TODO: All these logging things could go out to clean up the main
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(float_data).cuda())

            # TODO: For now we are computing the error for just the correct branch,
            # it could be multi-branch.
            coil_logger.add_scalar('Loss', loss.data, iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the float_data that are actually generating output
            # TODO: iteration is repeating, and that is dumb
            coil_logger.add_message(
                'Iterating', {
                    'Iteration': iteration,
                    'Loss': loss.data.tolist(),
                    'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                    'BestLoss': best_loss,
                    'BestLossIteration': best_loss_iter,
                    'Output': output[position].data.tolist(),
                    'GroundTruth':
                    dataset.extract_targets(float_data)[position].data.tolist(),
                    'Error': error[position].data.tolist(),
                    'Inputs':
                    dataset.extract_inputs(float_data)[position].data.tolist()
                }, iteration)

            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO : maybe already summarize the best model ???
                torch.save(
                    state,
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(iteration) + '.pth'))

            iteration += 1

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
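# --- Illustration: resuming from the newest checkpoint --------------------
# get_latest_saved_checkpoint() above drives resuming. Checkpoints are saved
# as '<iteration>.pth' under _logs/<exp_batch>/<exp_alias>/checkpoints (see
# the torch.save calls), so a lookup only needs the largest numeric stem.
# The real helper takes no arguments (it presumably reads the experiment
# from the global config); this explicit-argument version is only a sketch.
import os


def latest_saved_checkpoint_sketch(exp_batch, exp_alias):
    ckpt_dir = os.path.join('_logs', exp_batch, exp_alias, 'checkpoints')
    if not os.path.isdir(ckpt_dir):
        return None
    iterations = [int(f.split('.')[0]) for f in os.listdir(ckpt_dir)
                  if f.endswith('.pth') and f.split('.')[0].isdigit()]
    if not iterations:
        return None
    return str(max(iterations)) + '.pth'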
def execute(gpu, exp_batch, exp_alias): os.environ["CUDA_VISIBLE_DEVICES"] = gpu merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": return full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize([0.5, 0.5, 0.5], [1.0, 1.0, 1.0]) ])) sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=6, pin_memory=True) l1weight = 1.0 image_size = tuple([88, 200]) testmode = 1 if g_conf.GANMODEL_NAME == 'LSDcontrol': netD = ganmodels._netD().cuda() netG = ganmodels._netG(skip=g_conf.SKIP).cuda() checkpoint = torch.load( os.path.join( '/datatmp/Experiments/rohitgan/_logs/eccv/experiment_1/checkpoints/1010000.pth' )) netG.load_state_dict(checkpoint['stateG_dict']) netD.load_state_dict(checkpoint['stateD_dict']) print(netD) print(netG) MSE_loss = torch.nn.MSELoss().cuda() L1_loss = torch.nn.L1Loss().cuda() iteration = 0 # netG.eval() # netD.eval() capture_time = time.time() for data in data_loader: input_data, float_data = data inputs = input_data['rgb'].cuda() inputs = inputs.squeeze(1) #forward pass fake_inputs = netG(inputs) imgs_to_save = torch.cat((fake_inputs, inputs), 0).cpu().data imgs_to_save = (imgs_to_save + 1.0) / 2.0 imgs = [img for img in imgs_to_save] vutils.save_image(imgs, 'imgs/' + str(iteration) + '.png') outputsD_fake_forD = netD(fake_inputs.detach()) labsize = outputsD_fake_forD.size() labels_fake = torch.zeros(labsize[0], labsize[1], labsize[2], labsize[3]) #Fake labels label_fake_noise = torch.rand( labels_fake.size()) * 0.5 - 0.25 #Label smoothing labels_fake = labels_fake + label_fake_noise labels_fake = Variable(labels_fake).cuda() lossD_fake = MSE_loss(outputsD_fake_forD, labels_fake) outputsD_real = netD(inputs) labsize = outputsD_real.size() labels_real = torch.ones(labsize[0], labsize[1], labsize[2], labsize[3]) #Real labels label_real_noise = torch.rand( labels_real.size()) * 0.5 - 0.25 #Label smoothing labels_real = labels_real + label_real_noise labels_real = Variable(labels_real).cuda() lossD_real = MSE_loss(outputsD_real, labels_real) lossD = (lossD_real + lossD_fake) * 0.5 lossD /= len(inputs) outputsD_fake_forG = netD(fake_inputs) lossG_adv = MSE_loss(outputsD_fake_forG, labels_real) lossG_smooth = L1_loss(fake_inputs, inputs) lossG = lossG_adv + l1weight * lossG_smooth lossG /= len(inputs) # print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(), "BestLossD", best_lossD, "BestLossG", best_lossG, "Iteration", iteration, "Best Loss Iteration", best_loss_iter) iteration += 1
def folder_execute(params=None): """ On this mode the training software keeps all It forks a process to run the monitor over the training logs. Arguments param, prioritize training, prioritize test, prioritize """ folder = params['folder'] allocated_gpus = params['gpus'] validation_datasets = params['validation_datasets'] driving_environments = params['driving_environments'] allocation_parameters = params['allocation_parameters'] experiments_list = os.listdir(os.path.join('configs', folder)) experiments_list = [experiment.split('.')[-2] for experiment in experiments_list] # Each gpu has maximun 2 slots allocated_gpus = {gpu: allocation_parameters['gpu_value'] for gpu in allocated_gpus} executing_processes = [] free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources(allocated_gpus, executing_processes, allocation_parameters) # Is a queue of tasks to be executed. The priority is always train. # then test then val. # TODO: change the priority to test the ones that have already been trained. tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'], [], [], validation_datasets, driving_environments) # No process is executing right now. print(tasks_queue) # TODO: the while should go outside, so the monitorer process is independent of the type of execution while True: # if not done or executing get to the list # If amount of resources is smaller than a threshold. while resources_on_most_free_gpu >= min([allocation_parameters['train_cost'], allocation_parameters['validation_cost'], allocation_parameters['drive_cost']]) \ and tasks_queue != []: # Allocate all the gpus popped_thing = heapq.heappop(tasks_queue) process_specs = popped_thing[2] # To get directly the dict # Get the train status, that will affect in scheduling a validation or drive process train_status = monitorer.get_status(folder, process_specs['experiment'], 'train')[0] if process_specs['type'] == 'train' and resources_on_most_free_gpu >= \ allocation_parameters['train_cost']: free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources( free_gpus, allocation_parameters['train_cost']) execute_train(gpu_number, process_specs['folder'], process_specs['experiment']) process_specs.update({'gpu': gpu_number}) executing_processes.append(process_specs) elif process_specs['type'] == 'validation' and resources_on_most_free_gpu >= \ allocation_parameters['validation_cost'] \ and (train_status == 'Iterating' or train_status == 'Loading' or train_status == 'Finished'): free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources( free_gpus, allocation_parameters['validation_cost']) execute_validation(gpu_number, process_specs['folder'], process_specs['experiment'], process_specs['dataset']) process_specs.update({'gpu': gpu_number}) executing_processes.append(process_specs) elif process_specs['type'] == 'drive' and resources_on_most_free_gpu >= \ allocation_parameters['drive_cost'] \ and (train_status == 'Iterating' or train_status == 'Loading' or train_status == 'Finished'): free_gpus, resources_on_most_free_gpu, gpu_number = allocate_gpu_resources( free_gpus, allocation_parameters['drive_cost']) execute_drive(gpu_number, process_specs['folder'], process_specs['experiment'], process_specs['environment'], no_screen=params['no_screen']) process_specs.update({'gpu': gpu_number}) executing_processes.append(process_specs) tasks_queue = mount_experiment_heap(folder, experiments_list, params['is_training'], executing_processes, tasks_queue, validation_datasets, 
driving_environments, False) printer.plot_folder_summaries(folder, params['is_training'], validation_datasets, driving_environments) # Check allocated process, and look which ones finished. free_gpus, resources_on_most_free_gpu, executing_processes = get_gpu_resources( allocated_gpus, executing_processes, allocation_parameters) if len(tasks_queue) == 0 and len(executing_processes) == 0: break print ("Task queue", tasks_queue) print ("") print ("exec proc", executing_processes) print("resources", free_gpus) time.sleep(10) print("ALL EXPERIMENTS EXECUTED")
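# --- Illustration: the GPU "budget" bookkeeping used above ----------------
# get_gpu_resources() / allocate_gpu_resources() are defined elsewhere. From
# the call sites, every GPU starts with allocation_parameters['gpu_value']
# worth of capacity and each scheduled task subtracts its cost (train_cost /
# validation_cost / drive_cost); a task fits only while the most-free GPU
# still has enough budget. A sketch of that accounting, inferred from usage
# here and possibly different from the repo's helpers:
def allocate_gpu_resources_sketch(free_gpus, cost):
    # free_gpus: {gpu_id: remaining_capacity}
    gpu_number = max(free_gpus, key=free_gpus.get)
    free_gpus[gpu_number] -= cost
    resources_on_most_free_gpu = max(free_gpus.values())
    return free_gpus, resources_on_most_free_gpu, gpu_number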
def execute(gpu, exp_batch, exp_alias, dataset_name): # We set the visible cuda devices os.environ["CUDA_VISIBLE_DEVICES"] = '0' # At this point the log file with the correct naming is created. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('validation', dataset_name) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": # TODO: print some cool summary or not ? return #Define the dataset. This structure is has the __get_item__ redefined in a way #that you can access the HDFILES positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name) print(full_dataset) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor() ])) # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. # The data loader is the multi threaded module from pytorch that release a number of # workers to get all the data. # TODO: batch size an number of workers go to some configuration file data_loader = torch.utils.data.DataLoader(dataset, batch_size=120, shuffle=False, num_workers=12, pin_memory=True) # TODO: here there is clearly a posibility to make a cool "conditioning" system. model = CoILModel(g_conf.MODEL_NAME) model.cuda() model.eval() criterion = Loss() latest = get_latest_evaluated_checkpoint() if latest is None: # When nothing was tested, get latest returns none, we fix that. latest = 0 latest = 200000 best_loss = 1000.0 best_error = 1000.0 best_loss_iter = 0 best_error_iter = 0 print(dataset.meta_data[0][0]) for k in dataset.meta_data: k[0] = str(k[0], 'utf-8') print(dataset.meta_data[0][0]) cpts = glob.glob( '/home-local/rohitrishabh/coil_20-06/_logs/eccv/experiment_1/checkpoints/*.pth' ) # while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE): for ckpt in cpts: # if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE): # latest = get_next_checkpoint(g_conf.TEST_SCHEDULE) latest = int(ckpt[-10:-4]) # checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias # , 'checkpoints', str(latest) + '.pth')) checkpoint = torch.load(ckpt) checkpoint_iteration = checkpoint['iteration'] print("Validation loaded ", checkpoint_iteration) accumulated_loss = 0.0 accumulated_error = 0.0 iteration_on_checkpoint = 0 for data in data_loader: input_data, float_data = data control_position = np.where( dataset.meta_data[:, 0] == 'control')[0][0] speed_position = np.where( dataset.meta_data[:, 0] == 'speed_module')[0][0] # print (torch.squeeze(input_data['rgb']).shape) # print (control_position) # print (speed_position) # Obs : Maybe we could also check for other branches ?? output = model.forward_branch( torch.squeeze(input_data['rgb']).cuda(), float_data[:, speed_position, :].cuda(), float_data[:, control_position, :].cuda()) for i in range(input_data['rgb'].shape[0]): coil_logger.write_on_csv( checkpoint_iteration, [output[i][0], output[i][1], output[i][2]]) # TODO: Change this a functional standard using the loss functions. 
loss = torch.mean( (output - dataset.extract_targets(float_data).cuda())**2).data.tolist() mean_error = torch.mean( torch.abs( output - dataset.extract_targets(float_data).cuda())).data.tolist() accumulated_error += mean_error accumulated_loss += loss error = torch.abs(output - dataset.extract_targets(float_data).cuda()) # Log a random position position = random.randint(0, len(float_data) - 1) #print (output[position].data.tolist()) coil_logger.add_message( 'Iterating in Validation', { 'Checkpoint': latest, 'Iteration': (str(iteration_on_checkpoint * 120) + '/' + str(len(dataset))), 'MeanError': mean_error, 'Loss': loss, 'Output': output[position].data.tolist(), 'GroundTruth': dataset.extract_targets(float_data) [position].data.tolist(), 'Error': error[position].data.tolist(), 'Inputs': dataset.extract_inputs(float_data)[position].data.tolist() }, latest) iteration_on_checkpoint += 1 checkpoint_average_loss = accumulated_loss / len(dataset) checkpoint_average_error = accumulated_error / len(dataset) coil_logger.add_scalar('Loss', checkpoint_average_loss, latest) coil_logger.add_scalar('Error', checkpoint_average_error, latest) print('Loss: ', checkpoint_average_loss, "----Error: ", checkpoint_average_error) if checkpoint_average_loss < best_loss: best_loss = checkpoint_average_loss best_loss_iter = latest state = { 'state_dict': model.state_dict(), 'best_loss': best_loss, 'best_loss_iter': best_loss_iter } # TODO : maybe already summarize the best model ??? torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'best_model_l2' + '.pth')) if checkpoint_average_error < best_error: best_error = checkpoint_average_error best_error_iter = latest state = { 'state_dict': model.state_dict(), 'best_error': best_error, 'best_error_iter': best_error_iter } # TODO : maybe already summarize the best model ??? torch.save( state, os.path.join('_logs', exp_batch, exp_alias, 'best_model_l1' + '.pth')) print('Best Loss: ', best_loss, "Checkpoint", best_loss_iter) print('Best Error: ', best_error, "Checkpoint", best_error_iter) coil_logger.add_message( 'Iterating in Validation', { 'Summary': { 'Error': checkpoint_average_error, 'Loss': checkpoint_average_loss, 'BestError': best_error, 'BestLoss': best_loss, 'BestLossCheckpoint': best_loss_iter, 'BestErrorCheckpoint': best_error_iter }, 'Checkpoint': latest })
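# --- Illustration: per-sample averaging across uneven batches -------------
# The validation loop above accumulates per-batch *means* and divides by
# len(dataset), which only matches a per-sample average when every batch has
# exactly the nominal batch size. Weighting each batch mean by its actual
# size (equivalently, summing per-sample losses) removes that mismatch.
# A sketch, not the project's code:
import torch


def per_sample_average_loss(batches):
    # batches: iterable of (output, target) tensor pairs from a loader
    total_loss, total_samples = 0.0, 0
    for output, target in batches:
        n = output.shape[0]
        total_loss += torch.mean((output - target) ** 2).item() * n
        total_samples += n
    return total_loss / max(total_samples, 1)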
def execute(gpu, exp_batch, exp_alias, dataset_name, architecture, suppress_output): try: # We set the visible cuda devices torch.manual_seed(2) os.environ["CUDA_VISIBLE_DEVICES"] = gpu # Validation available for: # coil_unit (UNIT + task combined) # coil_icra (Also used for finetuned models) # wgangp_lsd (Our architecture) architecture_name = architecture # At this point the log file with the correct naming is created. if architecture_name == 'coil_unit': pass elif architecture_name == 'wgangp_lsd': merge_with_yaml( os.path.join('/home/rohitrishabh/CoilWGAN/configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('validation', dataset_name) elif architecture_name == 'coil_icra': merge_with_yaml( os.path.join( '/home/adas/CleanedCode/CoIL_Codes/coil_20-06/configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('validation', dataset_name) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": # TODO: print some cool summary or not ? return if not os.path.exists('_output_logs'): os.mkdir('_output_logs') if suppress_output: sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) #Define the dataset. This structure is has the __get_item__ redefined in a way #that you can access the HDFILES positions from the root directory as a in a vector. if dataset_name != []: full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name) else: full_dataset = os.environ["COIL_DATASET_PATH"] augmenter = Augmenter(None) dataset = CoILDataset(full_dataset, transform=augmenter) # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. # The data loader is the multi threaded module from pytorch that release a number of # workers to get all the data. # TODO: batch size an number of workers go to some configuration file batchsize = 30 data_loader = torch.utils.data.DataLoader(dataset, batch_size=batchsize, shuffle=False, num_workers=1, pin_memory=True) # TODO: here there is clearly a posibility to make a cool "conditioning" system. 
if architecture_name == 'coil_unit': model_task, model_gen = CoILModel('coil_unit') model_task, model_gen = model_task.cuda(), model_gen.cuda() else: model = CoILModel(architecture_name) model.cuda() latest = 0 # print (dataset.meta_data) best_loss = 1000 best_error = 1000 best_loss_mini = 1000 best_loss_iter = 0 best_error_iter = 0 batch_size = 30 best_loss_ckpt = '' if architecture_name == 'coil_unit': ckpts = glob.glob('/home/rohitrishabh/UNIT_DA/outputs/' + exp_alias + '/checkpoints/gen*.pt') else: ckpts = glob.glob( os.path.join( '/home/adas/CleanedCode/CoIL_Codes/coil_20-06/_logs', exp_batch, exp_alias) + '/*.pth') if architecture_name == 'coil_unit': model_task.eval() model_gen.eval() else: model.eval() ckpts = sorted(ckpts) # TODO: refactor on the getting on the checkpoint organization needed for ckpt in ckpts: # if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE): # latest = get_next_checkpoint(g_conf.TEST_SCHEDULE) # ckpt = os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias # , 'checkpoints', str(latest) + '.pth') checkpoint = torch.load(ckpt) print("Validation loaded ", ckpt) if architecture_name == 'wgangp_lsd': print(ckpt, checkpoint['best_loss_iter_F']) model.load_state_dict(checkpoint['stateF_dict']) model.eval() elif architecture_name == 'coil_unit': model_task.load_state_dict(checkpoint['task']) model_gen.load_state_dict(checkpoint['b']) model_task.eval() model_gen.eval() elif architecture_name == 'coil_icra': model.load_state_dict(checkpoint['state_dict']) model.eval() accumulated_loss = 0 accumulated_error = 0 iteration_on_checkpoint = 0 datacount = 0 for data in data_loader: input_data, float_data = data controls = float_data[:, dataset.controls_position(), :] camera_angle = float_data[:, 26, :] camera_angle = camera_angle.cuda() steer = float_data[:, 0, :] steer = steer.cuda() speed = float_data[:, 10, :] speed = speed.cuda() time_use = 1.0 car_length = 3.0 extra_factor = 2.5 threshold = 1.0 pos = camera_angle > 0.0 pos = pos.type(torch.FloatTensor) neg = camera_angle <= 0.0 neg = neg.type(torch.FloatTensor) pos = pos.cuda() neg = neg.cuda() rad_camera_angle = math.pi * (torch.abs(camera_angle)) / 180.0 val = extra_factor * (torch.atan( (rad_camera_angle * car_length) / (time_use * speed + 0.05))) / 3.1415 steer -= pos * torch.min(val, torch.Tensor([0.6]).cuda()) steer += neg * torch.min(val, torch.Tensor([0.6]).cuda()) steer = steer.cpu() float_data[:, 0, :] = steer float_data[:, 0, :][float_data[:, 0, :] > 1.0] = 1.0 float_data[:, 0, :][float_data[:, 0, :] < -1.0] = -1.0 datacount += 1 control_position = 24 speed_position = 10 if architecture_name == 'wgangp_lsd': embed, output = model( torch.squeeze(input_data['rgb']).cuda(), float_data[:, speed_position, :].cuda()) loss = torch.sum( (output[0] - dataset.extract_targets(float_data).cuda() )**2).data.tolist() mean_error = torch.sum( torch.abs(output[0] - dataset.extract_targets(float_data).cuda()) ).data.tolist() elif architecture_name == 'coil_unit': embed, n_b = model_gen.encode( torch.squeeze(input_data['rgb']).cuda()) output = model_task( embed, Variable(float_data[:, speed_position, :]).cuda()) loss = torch.sum( (output[0].data - dataset.extract_targets(float_data).cuda())**2) mean_error = torch.sum( torch.abs(output[0].data - dataset.extract_targets(float_data).cuda())) elif architecture_name == 'coil_icra': output = model.forward_branch( torch.squeeze(input_data['rgb']).cuda(), float_data[:, speed_position, :].cuda(), float_data[:, control_position, :].cuda()) loss = torch.sum( (output - 
dataset.extract_targets(float_data).cuda() )**2).data.tolist() mean_error = torch.sum( torch.abs(output - dataset.extract_targets(float_data).cuda()) ).data.tolist() if loss < best_loss_mini: best_loss_mini = loss accumulated_error += mean_error accumulated_loss += loss # error = torch.abs(output[0] - dataset.extract_targets(float_data).cuda()) # Log a random position position = random.randint(0, len(float_data) - 1) iteration_on_checkpoint += 1 print(datacount, len(data_loader), accumulated_loss) checkpoint_average_loss = accumulated_loss / float( datacount * batchsize) checkpoint_average_error = accumulated_error / float( datacount * batchsize) if checkpoint_average_loss < best_loss: best_loss = checkpoint_average_loss best_loss_iter = latest best_loss_ckpt = ckpt if checkpoint_average_error < best_error: best_error = checkpoint_average_error best_error_iter = latest print("current loss", checkpoint_average_loss) print("best_loss", best_loss) coil_logger.add_message( 'Iterating', { 'Summary': { 'Error': checkpoint_average_error, 'Loss': checkpoint_average_loss, 'BestError': best_error, 'BestLoss': best_loss, 'BestLossCheckpoint': best_loss_iter, 'BestErrorCheckpoint': best_error_iter }, 'Checkpoint': latest }, latest) latest += 2000 coil_logger.add_message('Finished', {}) print("Best Validation Loss ckpt:", best_loss_ckpt) # TODO: DO ALL THE AMAZING LOGGING HERE, as a way to very the status in paralell. # THIS SHOULD BE AN INTERELY PARALLEL PROCESS except KeyboardInterrupt: coil_logger.add_message('Error', {'Message': 'Killed By User'}) except: traceback.print_exc() coil_logger.add_message('Error', {'Message': 'Something Happened'})
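# --- Illustration: the lateral-camera steering correction -----------------
# The training and validation loops above apply the same inline correction:
# a steering offset of extra_factor * atan(angle_rad * car_length /
# (time_use * speed + 0.05)) / pi, capped at 0.6, subtracted for positive
# camera angles and added for negative ones, with the result clipped to
# [-1, 1]. The same computation factored into a helper for clarity (a
# sketch, not a function that exists in the repo):
import math
import torch


def correct_steer_for_camera(steer, camera_angle, speed,
                             time_use=1.0, car_length=3.0, extra_factor=2.5):
    rad = math.pi * torch.abs(camera_angle) / 180.0
    offset = extra_factor * torch.atan(
        (rad * car_length) / (time_use * speed + 0.05)) / math.pi
    offset = torch.min(offset, torch.full_like(offset, 0.6))
    corrected = torch.where(camera_angle > 0.0, steer - offset, steer + offset)
    return corrected.clamp(-1.0, 1.0)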
def execute(gpu, exp_batch, exp_alias): # We set the visible cuda devices os.environ["CUDA_VISIBLE_DEVICES"] = gpu # At this point the log file with the correct naming is created. merge_with_yaml(os.path.join(exp_batch, exp_alias+'.yaml')) set_type_of_process('train') sys.stdout = open(str(os.getpid()) + ".out", "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias, g_conf.PROCESS_NAME)[0] == "Finished": # TODO: print some cool summary or not ? return #Define the dataset. This structure is has the __get_item__ redefined in a way #that you can access the HDFILES positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.DATASET_NAME) dataset = CoILDataset(full_dataset, transform=transforms.Compose([transforms.ToTensor()])) # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. sampler = CoILSampler(splitter.control_steer_split(dataset.measurements, dataset.meta_data)) # The data loader is the multi threaded module from pytorch that release a number of # workers to get all the data. # TODO: batch size an number of workers go to some configuration file data_loader = torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=120, shuffle=False, num_workers=12, pin_memory=True) # By instanciating the augmenter we get a callable that augment images and transform them # into tensors. augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE) # TODO: here there is clearly a posibility to make a cool "conditioning" system. model = CoILModel(g_conf.MODEL_NAME) model.cuda() print(model) criterion = Loss() # TODO: DATASET SIZE SEEMS WEIRD optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9) checkpoint_file = get_latest_saved_checkpoint() if checkpoint_file != None: checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias, 'checkpoints', str(get_latest_saved_checkpoint()))) iteration = checkpoint['iteration'] else: iteration = 0 # TODO: The checkpoint will continue, so the logs should restart ??? OR continue were it was print (dataset.meta_data) print (model) for data in data_loader: input_data, labels = data #TODO we have to divide the input with other data. #TODO, ADD ITERATION SCHEDULE input_rgb_data = augmenter(0, input_data['rgb']) # get the control commands from labels, size = [120,1] controls = labels[:, 24, :] # The output(branches) is a list of 5 branches results, each branch is with size [120,3] model.zero_grad() branches = model(input_rgb_data, labels[:, 10, :].cuda()) #print ("len ",len(branches)) # get the steer, gas and brake ground truth from labels steer_gt = labels[:, 0, :] gas_gt = labels[:, 1, :] brake_gt = labels[:, 2, :] speed_gt = labels[:, 10, :] targets = torch.cat([steer_gt, gas_gt, brake_gt], 1) loss = criterion.MSELoss(branches, targets.cuda(), controls.cuda(), speed_gt.cuda()) loss.backward() optimizer.step() # TODO: save also the optimizer state dictionary if is_ready_to_save(iteration): state = { 'iteration': iteration, 'state_dict': model.state_dict() } # TODO : maybe already summarize the best model ??? torch.save(state, os.path.join('_logs', exp_batch, exp_alias , 'checkpoints', str(iteration) + '.pth')) iteration += 1
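# --- Illustration: grouping keys by control and steering ------------------
# splitter.control_steer_split() feeds the samplers above; its output is a
# division of dataset keys that lets the sampler draw balanced batches. The
# toy function below only illustrates that idea (group sample indices by
# control command and by steering sign/magnitude); the repo's actual
# splitting logic may use different bins and a different output structure.
def toy_control_steer_split(controls, steers, steer_threshold=0.05):
    groups = {}
    for idx, (control, steer) in enumerate(zip(controls, steers)):
        if abs(steer) < steer_threshold:
            steer_bin = 'straight'
        elif steer > 0:
            steer_bin = 'right'
        else:
            steer_bin = 'left'
        groups.setdefault((control, steer_bin), []).append(idx)
    return groups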
def execute(gpu, exp_batch, exp_alias): # We set the visible cuda devices try: os.environ["CUDA_VISIBLE_DEVICES"] = gpu # At this point the log file with the correct naming is created. merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml')) set_type_of_process('train') coil_logger.add_message('Loading', {'GPU': gpu}) if not os.path.exists('_output_logs'): os.mkdir('_output_logs') sys.stdout = open(os.path.join( '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1) if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished": # TODO: print some cool summary or not ? return #Define the dataset. This structure is has the __get_item__ redefined in a way #that you can access the HDFILES positions from the root directory as a in a vector. full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME) #augmenter_cpu = iag.AugmenterCPU(g_conf.AUGMENTATION_SUITE_CPU) dataset = CoILDataset(full_dataset, transform=transforms.Compose( [transforms.ToTensor()])) # Creates the sampler, this part is responsible for managing the keys. It divides # all keys depending on the measurements and produces a set of keys for each bach. sampler = BatchSequenceSampler( splitter.control_steer_split(dataset.measurements, dataset.meta_data), g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE) # The data loader is the multi threaded module from pytorch that release a number of # workers to get all the data. # TODO: batch size an number of workers go to some configuration file data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, shuffle=False, num_workers=12, pin_memory=False) # By instanciating the augmenter we get a callable that augment images and transform them # into tensors. st = lambda aug: iag.Sometimes(aug, 0.4) oc = lambda aug: iag.Sometimes(aug, 0.3) rl = lambda aug: iag.Sometimes(aug, 0.09) augmenter = iag.Augmenter([iag.ToGPU()] + [ rl(iag.GaussianBlur( (0, 1.5))), # blur images with a sigma between 0 and 1.5 rl( iag.AdditiveGaussianNoise( loc=0, scale=(0.0, 0.05), per_channel=0.5)), # add gaussian noise to images oc(iag.Dropout((0.0, 0.10), per_channel=0.5) ), # randomly remove up to X% of the pixels oc( iag.CoarseDropout( (0.0, 0.10), size_percent=(0.08, 0.2), per_channel=0.5) ), # randomly remove up to X% of the pixels oc(iag.Add((-40, 40), per_channel=0.5) ), # change brightness of images (by -X to Y of original value) st(iag.Multiply((0.10, 2), per_channel=0.2) ), # change brightness of images (X-Y% of original value) rl(iag.ContrastNormalization(( 0.5, 1.5), per_channel=0.5)), # improve or worsen the contrast rl(iag.Grayscale((0.0, 1))), # put grayscale ] # do all of the above in random order ) # augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE) # TODO: here there is clearly a posibility to make a cool "conditioning" system. 
        model = CoILModel(g_conf.MODEL_NAME)
        model.cuda()
        print(model)

        criterion = Loss()

        # TODO: DATASET SIZE SEEMS WEIRD
        optimizer = optim.Adam(model.parameters(), lr=0.0002)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(get_latest_saved_checkpoint())))
            iteration = checkpoint['iteration']
            accumulated_time = checkpoint['total_time']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            accumulated_time = 0  # We accumulate iteration time and keep the average speed
            best_loss_iter = 0

        # TODO: The checkpoint will continue, so it should erase everything up to the iteration

        best_loss_save = 10000.0
        best_loss_save_iter = 0
        curr_loss_save = 0.0

        print(dataset.meta_data)
        print(model)
        capture_time = time.time()
        model.train()

        for data in data_loader:

            input_data, float_data = data

            # TODO: ADD ITERATION SCHEDULE
            input_rgb_data = augmenter(0, input_data['rgb'])
            augment_for_controls = 1
            adjustlr = 1

            if augment_for_controls:  # and self._config.targets_names[j] == "Steer":
                # Three-camera steering label correction (a standalone single-sample
                # sketch of this correction follows after this function).
                camera_angle = float_data[:, 26, :]
                camera_angle = camera_angle.cuda()  # self._config.variable_names.index('Angle'), i]
                print("Camera angle", camera_angle[0])
                steer = float_data[:, 0, :]
                # print("Original", steer[0])
                steer = steer.cuda()
                speed = float_data[:, 10, :]
                speed = speed.cuda()
                # print(steer)

                time_use = 1.0
                car_length = 3.0
                extra_factor = 2.5
                threshold = 1.0

                pos = camera_angle > 0.0
                pos = pos.type(torch.FloatTensor)
                neg = camera_angle <= 0.0
                neg = neg.type(torch.FloatTensor)
                pos = pos.cuda()
                neg = neg.cuda()

                rad_camera_angle = math.pi * (torch.abs(camera_angle)) / 180.0
                val = extra_factor * (torch.atan((rad_camera_angle * car_length) /
                                                 (time_use * speed + 0.05))) / 3.1415
                # print(val)
                steer -= pos * torch.min(val, torch.tensor([0.6]).cuda())
                steer += neg * torch.min(val, torch.tensor([0.6]).cuda())

                print("val", val[0])
                print("speed", speed[0])

                steer = steer.cpu()
                float_data[:, 0, :] = steer
                float_data[:, 0, :][float_data[:, 0, :] > 1.0] = 1.0
                float_data[:, 0, :][float_data[:, 0, :] < -1.0] = -1.0

            # coil_logger.add_images(input_rgb_data)

            # get the control commands from float_data, size = [120, 1]
            controls = float_data[:, dataset.controls_position(), :]
            # print(" CONTROLS ", controls.shape)

            # The output (branches) is a list of 5 branch results, each branch of size [120, 3]
            model.zero_grad()
            # print('INPUTS', dataset.extract_inputs(float_data).shape)
            branches = model(input_rgb_data, dataset.extract_inputs(float_data).cuda())
            # print("len ", len(branches))

            # targets = torch.cat([steer_gt, gas_gt, brake_gt], 1)
            # print("Extracted targets ", dataset.extract_targets(float_data).shape[0])
            loss = criterion.MSELoss(branches,
                                     dataset.extract_targets(float_data).cuda(),
                                     controls.cuda(),
                                     dataset.extract_inputs(float_data).cuda())

            # TODO: All these logging things could go out to clean up the main
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            curr_loss_save += loss.data

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(float_data).cuda())

            # TODO: For now we are computing the error for just the correct branch; it could be multi-branch.

            coil_logger.add_scalar('Loss', loss.data, iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the float_data that are actually generating output
            # TODO: iteration is repeating, and that is dumb
            coil_logger.add_message('Iterating',
                                    {'Iteration': iteration,
                                     'Loss': loss.data.tolist(),
                                     'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                     'BestLoss': best_loss,
                                     'BestLossIteration': best_loss_iter,
                                     'BestLossSave': best_loss_save,
                                     'Output': output[position].data.tolist(),
                                     'GroundTruth': dataset.extract_targets(float_data)[position].data.tolist(),
                                     'Error': error[position].data.tolist(),
                                     'Inputs': dataset.extract_inputs(float_data)[position].data.tolist()},
                                    iteration)

            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO: maybe already summarize the best model ???
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                               'checkpoints', str(iteration) + '.pth'))

            print("before best save")
            if iteration % 5 == 0 and iteration > 4:
                curr_loss_save /= 5000.0
                if curr_loss_save < best_loss_save:
                    best_loss_save = curr_loss_save
                    curr_loss_save = 0
                    state = {
                        'iteration': iteration,
                        'state_dict': model.state_dict(),
                        'best_loss': best_loss_save,
                        'total_time': accumulated_time,
                        'best_loss_iter': best_loss_save_iter
                    }
                    # TODO: maybe already summarize the best model ???
                    torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                                   'best_loss_save' + '.pth'))
            print("after best save")

            if iteration == best_loss_iter:
                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO: maybe already summarize the best model ???
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias,
                                               'best_loss' + '.pth'))

            iteration += 1

            # Learning-rate schedule every 1000 iterations (the helper is defined
            # elsewhere; an illustrative sketch follows after this function).
            if adjustlr and iteration % 1000 == 0:
                adjust_learning_rate(optimizer, iteration)

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
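
# A standalone sketch (illustrative only) of the steering label correction done in the
# `augment_for_controls` block above for a single sample: the lateral camera mounting
# angle is converted into a steering offset via an atan of the recovery manoeuvre over
# `time_use` seconds, capped at 0.6 and clipped to the valid steering range. The
# constants mirror the values used above; the function name is hypothetical.
import math


def corrected_steer(steer, camera_angle_deg, speed,
                    time_use=1.0, car_length=3.0, extra_factor=2.5):
    # Geometric offset: larger at low speed, saturating for large camera angles.
    rad = math.pi * abs(camera_angle_deg) / 180.0
    val = extra_factor * math.atan((rad * car_length) / (time_use * speed + 0.05)) / math.pi
    val = min(val, 0.6)  # cap the correction, as in the tensor version above
    if camera_angle_deg > 0.0:
        steer -= val  # positive camera angle: subtract the offset
    else:
        steer += val  # non-positive camera angle: add the offset
    return max(-1.0, min(1.0, steer))  # keep steering in [-1, 1]
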
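
# `adjust_learning_rate(optimizer, iteration)` is called above every 1000 iterations
# but is defined elsewhere in the project. The step-decay version below is a sketch
# under assumptions: the decay interval, rate, and base learning rate are illustrative,
# not the project's actual settings.
def adjust_learning_rate(optimizer, iteration,
                         base_lr=0.0002, decay_interval=50000, decay_rate=0.5):
    # Halve the learning rate every `decay_interval` iterations.
    lr = base_lr * (decay_rate ** (iteration // decay_interval))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr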