Esempio n. 1
0
def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output=True, yaml_file=None):
    """
        Run a single checkpoint over a dataset and save the absolute error of
        every sample to ``computed_error.npy``.

        Besides its parameters, this function reads the module-level ``args``
        object (``args.checkpoint``, ``args.dataset_name``, ``args.save_path``),
        which is defined outside this block.
    Args:
        gpu: value written to the CUDA_VISIBLE_DEVICES environment variable
        exp_batch: the folder (under 'configs') with the experiments
        exp_alias: the alias, experiment name (yaml file name without extension)
        dataset_name: preload-style dataset name; only the part after the last
            '_' is used as the dataset folder name
        suppress_output: accepted for interface compatibility; the output
            redirection it used to control is disabled in this version
        yaml_file: optional directory that replaces 'configs/<exp_batch>' when
            looking for '<exp_alias>.yaml'

    Returns:
        None

    """
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # Merge the experiment yaml into the global configuration. An explicit
    # yaml_file directory takes precedence over the default configs folder.
    if yaml_file is not None:
        path_to_yaml_file = os.path.join(yaml_file, exp_alias + '.yaml')
    else:
        path_to_yaml_file = os.path.join('configs', exp_batch, exp_alias + '.yaml')
    merge_with_yaml(path_to_yaml_file)

    # NOTE: setting the process type and redirecting stdout/stderr to
    # '_output_logs' were deliberately disabled in this simplified script,
    # as was the recovery of the loss window from previous evaluations.

    # The preload file has '<X>hours_' as prefix whereas the dataset folder does not,
    # so only the last '_'-separated token names the folder.
    dataset_name = dataset_name.split('_')[-1]
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
    augmenter = Augmenter(None)

    print ('full dataset path: ', full_dataset)
    dataset = CoILDataset(full_dataset, transform=augmenter, preload_name=args.dataset_name)

    # The data loader is the multi threaded module from pytorch that release a number of
    # workers to get all the data.
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=g_conf.BATCH_SIZE,
                                              shuffle=False,
                                              num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
                                              pin_memory=True)

    model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
    model.cuda()

    # Load the single checkpoint to be evaluated.
    checkpoint = torch.load(args.checkpoint)
    checkpoint_iteration = checkpoint['iteration']
    print("model loaded ", checkpoint_iteration)

    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    print ('data_loader size: ', len(data_loader))
    total_error = []
    iteration_on_checkpoint = 0

    # Pure inference: disabling autograd avoids building a graph for every
    # batch, which only costs memory here since nothing is back-propagated.
    with torch.no_grad():
        for data in data_loader:

            # Compute the forward pass on a batch from the loaded dataset
            controls = data['directions']
            branches = model(torch.squeeze(data['rgb'].cuda()),
                             dataset.extract_inputs(data).cuda())
            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(data).cuda())
            total_error.extend(error.detach().cpu().tolist())

            iteration_on_checkpoint += 1
            if iteration_on_checkpoint % 50 == 0:
                print ('iteration: ', iteration_on_checkpoint)

    total_error = np.array(total_error)
    print (len(total_error), total_error.shape)

    # np.save does not create missing directories, so make sure the target exists.
    output_dir = os.path.join(args.save_path, args.dataset_name)
    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, 'computed_error.npy'), total_error)
    '''
Esempio n. 2
0
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12):
    """
        The main training function. This functions loads the latest checkpoint
        for a given, exp_batch (folder) and exp_alias (experiment configuration).
        With this checkpoint it starts from the beginning or continue some training.

        In this variant every RGB batch is first passed through a frozen ERFNet;
        the resulting road / not-road masks (class 0 vs. everything else) are
        what is fed to the CoIL model instead of the raw images.
    Args:
        gpu: The GPU number
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: if the output are going to be saved on a file
        number_of_workers: the number of threads used for data loading

    Returns:
        None. KeyboardInterrupt, RuntimeError and any other exception raised
        inside the training are reported through coil_logger instead of
        being propagated.

    """
    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train')
        # Set the process into loading status.
        coil_logger.add_message('Loading', {'GPU': gpu})

        # Put the output to a separate file if it is the case.
        # The files are left open on purpose: they replace stdout/stderr.
        if suppress_output:
            os.makedirs('_output_logs', exist_ok=True)
            sys.stdout = open(os.path.join('_output_logs', exp_alias + '_' +
                              g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a",
                              buffering=1)
            sys.stderr = open(os.path.join('_output_logs',
                              exp_alias + '_err_'+g_conf.PROCESS_NAME + '_'
                                           + str(os.getpid()) + ".out"),
                              "a", buffering=1)

        if coil_logger.check_finish('train'):
            coil_logger.add_message('Finished', {})
            return

        # Preload option: start from the weights of another experiment.
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            checkpoint = torch.load(os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                                                  g_conf.PRELOAD_MODEL_ALIAS,
                                                 'checkpoints',
                                                 str(g_conf.PRELOAD_MODEL_CHECKPOINT)+'.pth'))

        # Get the latest checkpoint to be loaded
        # returns none if there are no checkpoints saved for this model.
        # The helper is queried once so that the file that is tested is the
        # file that is loaded; it overrides the preloaded checkpoint if any.
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                    'checkpoints', str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            best_loss_iter = 0

        # Define the dataset. This structure is has the __get_item__ redefined in a way
        # that you can access the positions from the root directory as a in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)

        # By instantiating the augmenter we get a callable that augment images and transform them
        # into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # Instantiate the class used to read a dataset.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_name=str(g_conf.NUMBER_OF_HOURS)
                                               + 'hours_' + g_conf.TRAIN_DATASET_NAME)
        print ("Loaded dataset")

        data_loader = select_balancing_strategy(dataset, iteration, number_of_workers)
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        model.cuda()
        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)

        # Set ERFnet for segmentation
        model_erf = ERFNet(20)
        model_erf = torch.nn.DataParallel(model_erf)
        model_erf = model_erf.cuda()

        print("LOAD ERFNet")

        def load_my_state_dict(model, state_dict):
            # Custom loader: copies only the entries whose name exists in the
            # model, silently ignoring the rest of the state dict.
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name not in own_state:
                    continue
                own_state[name].copy_(param)
            return model

        model_erf = load_my_state_dict(model_erf, torch.load(os.path.join('trained_models/erfnet_pretrained.pth')))
        # ERFNet is only used for inference; it is never passed to the optimizer.
        model_erf.eval()
        print ("ERFNet and weights LOADED successfully")

        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:  # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print ("Before the loss")

        criterion = Loss(g_conf.LOSS_FUNCTION)

        # Loss time series window
        for data in data_loader:

            # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times,
            # add a stop on the _logs folder that is going to be read by this process
            if g_conf.FINISH_ON_VALIDATION_STALE is not None and \
                    check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE):
                break

            ####################################
            #     Main optimization loop
            ####################################

            iteration += 1
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            capture_time = time.time()
            # get the control commands from float_data
            controls = data['directions']
            model.zero_grad()

            # Use ERFNet to convert the RGB batch into a segmentation, without
            # tracking gradients through the segmentation network.
            rgbs = data['rgb']
            with torch.no_grad():
                outputs = model_erf(rgbs)
            # Per-pixel argmax over dimension 1 of the ERFNet output.
            labels = outputs.max(1)[1].byte().cpu().data

            # Two-channel input for the CoIL model: label 0 (treated as road)
            # versus every other label.
            seg_road = (labels == 0)
            seg_not_road = (labels != 0)
            seg = torch.stack((seg_road, seg_not_road), 1).float()

            # Save the first batch's segmentation results for visual inspection.
            # Iterate over the actual batch size rather than a fixed 120 so that
            # smaller batches do not raise an IndexError.
            if iteration == 1:
                os.makedirs("./save_color/batch_road_mask", exist_ok=True)
                os.makedirs("./save_color/batch_road", exist_ok=True)
                for i in range(seg.shape[0]):
                    label = seg[i, 0, :, :]
                    label_color = Colorize()(label.unsqueeze(0))
                    label_save = ToPILImage()(label_color)
                    label_save.save("./save_color/batch_road_mask/%d.png" % (i))

                    label = labels[i, :, :]
                    label_color = Colorize()(label.unsqueeze(0))
                    label_save = ToPILImage()(label_color)
                    label_save.save("./save_color/batch_road/%d.png" % (i))

            # The output(branches) is a list with the results of each branch.
            branches = model(torch.squeeze(seg).cuda(),
                             dataset.extract_inputs(data).cuda())

            loss_function_params = {
                'branches': branches,
                'targets': dataset.extract_targets(data).cuda(),
                'controls': controls.cuda(),
                'inputs': dataset.extract_inputs(data).cuda(),
                'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                'variable_weights': g_conf.VARIABLE_WEIGHT
            }
            loss, _ = criterion(loss_function_params)
            loss.backward()
            optimizer.step()

            ####################################
            #     Saving the model if necessary
            ####################################

            if is_ready_to_save(iteration):

                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias
                                               , 'checkpoints', str(iteration) + '.pth'))

            ################################################
            #     Adding tensorboard logs.
            #     Making calculations for logging purposes.
            #     These logs are monitored by the printer module.
            ################################################
            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration)
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(data).cuda())

            # Log a random position of the batch. The bound comes from the
            # output itself: len(data) would be the number of keys of the
            # batch dict, which is unrelated to the number of samples.
            position = random.randint(0, len(output) - 1)

            accumulated_time += time.time() - capture_time

            coil_logger.add_message('Iterating',
                                    {'Iteration': iteration,
                                     'Loss': loss.data.tolist(),
                                     'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                     'BestLoss': best_loss, 'BestLossIteration': best_loss_iter,
                                     'Output': output[position].data.tolist(),
                                     'GroundTruth': dataset.extract_targets(data)[
                                         position].data.tolist(),
                                     'Error': error[position].data.tolist(),
                                     'Inputs': dataset.extract_inputs(data)[
                                         position].data.tolist()},
                                    iteration)
            loss_window.append(loss.data.tolist())
            coil_logger.write_on_error_csv('train', loss.data)
            print("Iteration: %d  Loss: %f" % (iteration, loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:

        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        # Catch-all kept as in the original: print the traceback and record
        # the failure in the logger instead of propagating it.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
Esempio n. 3
0
def execute(gpu, exp_batch, exp_alias, suppress_output=True, number_of_workers=12):
    """
        The main training function. This functions loads the latest checkpoint
        for a given, exp_batch (folder) and exp_alias (experiment configuration).
        With this checkpoint it starts from the beginning or continue some training.
    Args:
        gpu: The GPU number
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: if the output are going to be saved on a file
        number_of_workers: the number of threads used for data loading

    Returns:
        None. KeyboardInterrupt, RuntimeError and any other exception raised
        inside the training are reported through coil_logger instead of
        being propagated.

    """
    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
        set_type_of_process('train')
        # Set the process into loading status.
        coil_logger.add_message('Loading', {'GPU': gpu})

        # Put the output to a separate file if it is the case.
        # The files are left open on purpose: they replace stdout/stderr.
        if suppress_output:
            os.makedirs('_output_logs', exist_ok=True)
            sys.stdout = open(os.path.join('_output_logs', exp_alias + '_' +
                              g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a",
                              buffering=1)
            sys.stderr = open(os.path.join('_output_logs',
                              exp_alias + '_err_'+g_conf.PROCESS_NAME + '_'
                                           + str(os.getpid()) + ".out"),
                              "a", buffering=1)

        # Preload option: start from the weights of another experiment.
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            checkpoint = torch.load(os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                                                  g_conf.PRELOAD_MODEL_ALIAS,
                                                 'checkpoints',
                                                 str(g_conf.PRELOAD_MODEL_CHECKPOINT)+'.pth'))

        # Get the latest checkpoint to be loaded
        # returns none if there are no checkpoints saved for this model.
        # The helper is queried once so that the file that is tested is the
        # file that is loaded; it overrides the preloaded checkpoint if any.
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias,
                                    'checkpoints', str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            best_loss_iter = 0

        # Define the dataset. This structure is has the __get_item__ redefined in a way
        # that you can access the positions from the root directory as a in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)

        # By instantiating the augmenter we get a callable that augment images and transform them
        # into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # Instantiate the class used to read a dataset.
        dataset = CoILDataset(full_dataset, transform=augmenter,
                              preload_name=str(g_conf.NUMBER_OF_HOURS)
                                               + 'hours_' + g_conf.TRAIN_DATASET_NAME)
        print ("Loaded dataset")

        data_loader = select_balancing_strategy(dataset, iteration, number_of_workers)
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        model.cuda()
        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)

        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:  # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print ("Before the loss")

        criterion = Loss(g_conf.LOSS_FUNCTION)

        # Loss time series window
        for data in data_loader:

            # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times,
            # add a stop on the _logs folder that is going to be read by this process
            if g_conf.FINISH_ON_VALIDATION_STALE is not None and \
                    check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE):
                break

            ####################################
            #     Main optimization loop
            ####################################

            iteration += 1
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            capture_time = time.time()
            # get the control commands from float_data
            controls = data['directions']
            # The output(branches) is a list with the results of each branch.
            model.zero_grad()
            branches = model(torch.squeeze(data['rgb'].cuda()),
                             dataset.extract_inputs(data).cuda())
            loss_function_params = {
                'branches': branches,
                'targets': dataset.extract_targets(data).cuda(),
                'controls': controls.cuda(),
                'inputs': dataset.extract_inputs(data).cuda(),
                'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                'variable_weights': g_conf.VARIABLE_WEIGHT
            }
            loss, _ = criterion(loss_function_params)
            loss.backward()
            optimizer.step()

            ####################################
            #     Saving the model if necessary
            ####################################

            if is_ready_to_save(iteration):

                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(state, os.path.join('_logs', exp_batch, exp_alias
                                               , 'checkpoints', str(iteration) + '.pth'))

            ################################################
            #     Adding tensorboard logs.
            #     Making calculations for logging purposes.
            #     These logs are monitored by the printer module.
            ################################################
            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(data['rgb']), iteration)
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - dataset.extract_targets(data).cuda())

            # Log a random position of the batch. The bound comes from the
            # output itself: len(data) would be the number of keys of the
            # batch dict, which is unrelated to the number of samples.
            position = random.randint(0, len(output) - 1)

            accumulated_time += time.time() - capture_time

            coil_logger.add_message('Iterating',
                                    {'Iteration': iteration,
                                     'Loss': loss.data.tolist(),
                                     'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                                     'BestLoss': best_loss, 'BestLossIteration': best_loss_iter,
                                     'Output': output[position].data.tolist(),
                                     'GroundTruth': dataset.extract_targets(data)[
                                         position].data.tolist(),
                                     'Error': error[position].data.tolist(),
                                     'Inputs': dataset.extract_inputs(data)[
                                         position].data.tolist()},
                                    iteration)
            loss_window.append(loss.data.tolist())
            coil_logger.write_on_error_csv('train', loss.data)
            print("Iteration: %d  Loss: %f" % (iteration, loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:

        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        # Catch-all kept as in the original: print the traceback and record
        # the failure in the logger instead of propagating it.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
Esempio n. 4
0
def _mean_rep_loss(student_rep, expert_rep, batch_size, squared=False):
    """Return one mimicking-loss term for a batch as a plain Python number.

    The element-wise difference between ``student_rep`` and ``expert_rep``
    (the latter moved to the GPU) is turned into an absolute (L1) penalty, or
    a squared (L2) penalty when ``squared`` is true, summed over all elements
    and divided by ``3 * batch_size``.
    """
    difference = student_rep - expert_rep.cuda()
    penalty = difference ** 2 if squared else torch.abs(difference)
    return torch.sum(penalty).data.tolist() / (3 * batch_size)


def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    """Validate every checkpoint of an experiment as training produces it.

    The function merges the experiment configuration, builds the validation
    dataset and model, and then loops until ``maximun_checkpoint_reach``
    reports that the test schedule is exhausted. Whenever the next checkpoint
    is ready it is loaded, the whole dataset is run through the model under
    ``torch.no_grad()`` and the average L1 (mean absolute error) and L2 (MSE)
    control losses are logged. When ``g_conf.USE_REPRESENTATION_LOSS`` is set,
    the weighted representation (mimicking) losses are logged as well and
    added to the totals. The per-checkpoint total L1 error is appended to a
    loss window that drives early stopping when
    ``g_conf.FINISH_ON_VALIDATION_STALE`` is not ``None``.

    Args:
        gpu: Value written to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Experiment batch name; selects ``configs/<exp_batch>/`` and
            ``_logs/<exp_batch>/``.
        exp_alias: Experiment name; selects ``<exp_alias>.yaml`` and the
            checkpoint directory of the experiment.
        dataset_name: Dataset folder under ``$COIL_DATASET_PATH``; also used
            as the preload name and as the name of the error CSV.
        suppress_output: When truthy, ``sys.stdout`` and ``sys.stderr`` are
            redirected to files inside ``_output_logs``.

    Returns:
        None. Failures are not propagated: they are reported through
        ``coil_logger`` and the CSV of the checkpoint that was being evaluated
        (if any) is erased.
    """
    latest = None
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch,
                                     exp_alias + '.yaml'))
        # The validation dataset is always fully loaded, so we fix a very high number of hours
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        # exist_ok avoids the check-then-create race when several processes
        # start at the same time and all want this directory.
        os.makedirs('_output_logs', exist_ok=True)

        if suppress_output:
            # The redirection is meant to last for the rest of the process,
            # so the files are intentionally not closed here.
            process_tag = g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"
            sys.stdout = open(os.path.join('_output_logs',
                                           exp_alias + '_' + process_tag),
                              "a",
                              buffering=1)
            sys.stderr = open(os.path.join('_output_logs',
                                           exp_alias + '_err_' + process_tag),
                              "a",
                              buffering=1)

        # Define the dataset.
        full_dataset = [
            os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        ]
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. Preload name is just the validation data name
        dataset = CoILDataset(full_dataset,
                              transform=augmenter,
                              preload_names=[dataset_name])

        # The data loader is the multi threaded module from pytorch that release a number of
        # workers to get all the data.
        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=g_conf.BATCH_SIZE,
            shuffle=False,
            num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
            pin_memory=True)

        # Create model.
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        # The window used to keep track of the validation loss
        l1_window = []
        # If we have evaluated a checkpoint, get the validation losses of all the previously
        # evaluated checkpoints (validation loss is used for early stopping)
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:  # When latest is None there is nothing to recover
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        # Loop to validate all checkpoints as they are saved during training
        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):
            if not is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
                # Nothing new to evaluate yet: refresh the state and wait.
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)

                coil_logger.add_message('Loading',
                                        {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")
                continue

            with torch.no_grad():
                # Get and load latest checkpoint
                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                checkpoint = torch.load(
                    os.path.join('_logs', exp_batch, exp_alias,
                                 'checkpoints',
                                 str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                print("Validation loaded ", checkpoint_iteration)

                model.load_state_dict(checkpoint['state_dict'])
                model.eval()

                use_rep_loss = g_conf.USE_REPRESENTATION_LOSS
                num_batches = len(data_loader)

                accumulated_mse = 0
                accumulated_error = 0
                iteration_on_checkpoint = 0
                if use_rep_loss:
                    # Per-term sums, ordered (perception, speed, intentions).
                    accumulated_rep_term_mse = [0, 0, 0]
                    accumulated_rep_term_error = [0, 0, 0]
                    accumulated_rep_mse = 0
                    accumulated_rep_error = 0

                # Validation loop
                for data in data_loader:

                    # Compute the forward pass on a batch from  the validation dataset
                    controls = data['directions']

                    # Run model forward and get outputs
                    # First case corresponds to squeeze network, second case corresponds to driving model without
                    # mimicking losses, last case corresponds to mimic network
                    # NOTE(review): when "seg" is a sensor and
                    # USE_REPRESENTATION_LOSS is set, `intermediate_reps` is
                    # never assigned by this branch - confirm that this
                    # combination is not used.
                    if "seg" in g_conf.SENSORS.keys():
                        output = model.forward_branch(
                            data,
                            dataset.extract_inputs(data).cuda(), controls,
                            dataset.extract_intentions(data).cuda())
                    elif not use_rep_loss:
                        output = model.forward_branch(
                            data,
                            dataset.extract_inputs(data).cuda(), controls)
                    else:
                        output, intermediate_reps = model.forward_branch(
                            data,
                            dataset.extract_inputs(data).cuda(), controls)

                    write_regular_output(checkpoint_iteration, output)

                    # Compute control loss on current validation batch and accumulate it
                    targets_to_use = dataset.extract_targets(data)
                    # Upload the targets once and reuse them for every metric.
                    targets_on_gpu = targets_to_use.cuda()

                    error = torch.abs(output - targets_on_gpu)
                    mse = torch.mean(
                        (output - targets_on_gpu) ** 2).data.tolist()
                    mean_error = torch.mean(error).data.tolist()

                    accumulated_error += mean_error
                    accumulated_mse += mse

                    # Compute mimicking losses on current validation batch and accumulate it
                    if use_rep_loss:
                        expert_reps = dataset.extract_representations(data)
                        batch_size = output.shape[0]
                        # A disabled term contributes 0 and its representation
                        # is never indexed.
                        rep_flags = (g_conf.USE_PERCEPTION_REP_LOSS,
                                     g_conf.USE_SPEED_REP_LOSS,
                                     g_conf.USE_INTENTION_REP_LOSS)

                        # First L1 losses (seg mask, speed, intention mimicking losses)
                        l1_terms = [
                            _mean_rep_loss(intermediate_reps[index],
                                           expert_reps[index], batch_size)
                            if enabled else 0
                            for index, enabled in enumerate(rep_flags)
                        ]
                        rep_error = g_conf.REP_LOSS_WEIGHT * (
                            l1_terms[0] + l1_terms[1] + l1_terms[2])

                        # L2 losses now
                        l2_terms = [
                            _mean_rep_loss(intermediate_reps[index],
                                           expert_reps[index], batch_size,
                                           squared=True)
                            if enabled else 0
                            for index, enabled in enumerate(rep_flags)
                        ]
                        rep_mse = g_conf.REP_LOSS_WEIGHT * (
                            l2_terms[0] + l2_terms[1] + l2_terms[2])

                        for index in range(3):
                            accumulated_rep_term_error[index] += l1_terms[index]
                            accumulated_rep_term_mse[index] += l2_terms[index]
                        accumulated_rep_error += rep_error
                        accumulated_rep_mse += rep_mse

                    # Log a random position
                    position = random.randint(0, output.shape[0] - 1)

                    # Logging. Keys are inserted in the same order for both
                    # configurations so the logged message layout is stable.
                    message = {
                        'Checkpoint': latest,
                        # Samples consumed before this batch; the loader is
                        # built with g_conf.BATCH_SIZE, so use it rather than
                        # a hard-coded batch size.
                        'Iteration':
                        (str(iteration_on_checkpoint * g_conf.BATCH_SIZE) +
                         '/' + str(len(dataset))),
                        'MeanError': mean_error,
                        'MSE': mse,
                    }
                    if use_rep_loss:
                        total_mse = mse + rep_mse
                        total_error = mean_error + rep_error
                        message.update({
                            'RepMeanError': rep_error,
                            'RepMSE': rep_mse,
                            'MeanTotalError': total_error,
                            'TotalMSE': total_mse,
                        })
                    message.update({
                        'Output': output[position].data.tolist(),
                        'GroundTruth': targets_to_use[position].data.tolist(),
                        'Error': error[position].data.tolist(),
                        'Inputs':
                        dataset.extract_inputs(data)[position].data.tolist(),
                    })
                    coil_logger.add_message('Iterating', message, latest)
                    iteration_on_checkpoint += 1

                    printed_error = total_error if use_rep_loss else mean_error
                    print("Iteration %d  on Checkpoint %d : Error %f" %
                          (iteration_on_checkpoint, checkpoint_iteration,
                           printed_error))

                ########
                # Finish a round of validation, write results, wait for the next
                ########
                # Compute average L1 and L2 losses over whole round of validation and log them
                checkpoint_average_mse = accumulated_mse / num_batches
                checkpoint_average_error = accumulated_error / num_batches
                coil_logger.add_scalar('L2 Loss', checkpoint_average_mse,
                                       latest, True)
                coil_logger.add_scalar('Loss', checkpoint_average_error,
                                       latest, True)

                if use_rep_loss:
                    average_rep_term_mse = [
                        total / num_batches
                        for total in accumulated_rep_term_mse
                    ]
                    average_rep_term_error = [
                        total / num_batches
                        for total in accumulated_rep_term_error
                    ]
                    checkpoint_average_rep_mse = accumulated_rep_mse / num_batches
                    checkpoint_average_rep_error = accumulated_rep_error / num_batches

                    checkpoint_average_total_mse = (
                        checkpoint_average_mse + checkpoint_average_rep_mse)
                    # The total L1 error must add the L1 representation error
                    # (previously the L2 representation loss was added here).
                    checkpoint_average_total_error = (
                        checkpoint_average_error + checkpoint_average_rep_error)

                    # Log L1/L2 loss terms
                    coil_logger.add_scalar('Perception Rep Loss',
                                           average_rep_term_mse[0], latest,
                                           True)
                    coil_logger.add_scalar('Speed Rep Loss',
                                           average_rep_term_mse[1], latest,
                                           True)
                    coil_logger.add_scalar('Intentions Rep Loss',
                                           average_rep_term_mse[2], latest,
                                           True)
                    coil_logger.add_scalar('Overall Rep Loss',
                                           checkpoint_average_rep_mse,
                                           latest, True)
                    coil_logger.add_scalar('Total L2 Loss',
                                           checkpoint_average_total_mse,
                                           latest, True)

                    coil_logger.add_scalar('Perception Rep Error',
                                           average_rep_term_error[0], latest,
                                           True)
                    coil_logger.add_scalar('Speed Rep Error',
                                           average_rep_term_error[1], latest,
                                           True)
                    coil_logger.add_scalar('Intentions Rep Error',
                                           average_rep_term_error[2], latest,
                                           True)
                    coil_logger.add_scalar('Total Rep Error',
                                           checkpoint_average_rep_error,
                                           latest, True)
                    coil_logger.add_scalar('Total Loss',
                                           checkpoint_average_total_error,
                                           latest, True)
                else:
                    checkpoint_average_total_mse = checkpoint_average_mse
                    checkpoint_average_total_error = checkpoint_average_error

                if checkpoint_average_total_mse < best_mse:
                    best_mse = checkpoint_average_total_mse
                    best_mse_iter = latest

                if checkpoint_average_total_error < best_error:
                    best_error = checkpoint_average_total_error
                    best_error_iter = latest

                # Print for logging / to terminal validation results.
                # Without representation losses the totals equal the control
                # losses, so 'Error'/'Loss' can use the totals in both cases.
                summary = {}
                if use_rep_loss:
                    summary.update({
                        'Control Error': checkpoint_average_error,
                        'Control Loss': checkpoint_average_mse,
                        'Rep Error': checkpoint_average_rep_error,
                        'Rep Loss': checkpoint_average_rep_mse,
                    })
                summary.update({
                    'Error': checkpoint_average_total_error,
                    'Loss': checkpoint_average_total_mse,
                    'BestError': best_error,
                    'BestMSE': best_mse,
                    'BestMSECheckpoint': best_mse_iter,
                    'BestErrorCheckpoint': best_error_iter,
                })
                coil_logger.add_message('Iterating', {
                    'Summary': summary,
                    'Checkpoint': latest
                }, latest)

                # Save validation loss history (validation loss is used for early stopping)
                l1_window.append(checkpoint_average_total_error)
                coil_logger.write_on_error_csv(
                    dataset_name, checkpoint_average_total_error)

                # Early stopping
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(dataset_name, latest)
                        break

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        # Catch-all boundary of the validation process: any other failure is
        # printed and logged instead of being propagated to the caller.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
Esempio n. 5
0
def execute(gpu, exp_batch, exp_alias):

    from time import gmtime, strftime

    manualSeed = g_conf.SEED
    torch.cuda.manual_seed(manualSeed)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)
    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    real_dataset = g_conf.TARGET_DOMAIN_PATH
    # real_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], "FinalRealWorldDataset")

    #main data loader
    dataset = CoILDataset(full_dataset,
                          real_dataset,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements,
                                     dataset.meta_data), g_conf.BATCH_SIZE,
        g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    st = lambda aug: iag.Sometimes(aug, 0.4)
    oc = lambda aug: iag.Sometimes(aug, 0.3)
    rl = lambda aug: iag.Sometimes(aug, 0.09)
    augmenter = iag.Augmenter([iag.ToGPU()] + [
        rl(iag.GaussianBlur(
            (0, 1.5))),  # blur images with a sigma between 0 and 1.5
        rl(iag.AdditiveGaussianNoise(loc=0, scale=(
            0.0, 0.05), per_channel=0.5)),  # add gaussian noise to images
        oc(iag.Dropout((0.0, 0.10), per_channel=0.5)
           ),  # randomly remove up to X% of the pixels
        oc(
            iag.CoarseDropout(
                (0.0, 0.10), size_percent=(0.08, 0.2),
                per_channel=0.5)),  # randomly remove up to X% of the pixels
        oc(iag.Add((-40, 40), per_channel=0.5)
           ),  # change brightness of images (by -X to Y of original value)
        st(iag.Multiply((0.10, 2), per_channel=0.2)
           ),  # change brightness of images (X-Y% of original value)
        rl(iag.ContrastNormalization(
            (0.5, 1.5), per_channel=0.5)),  # improve or worsen the contrast
        rl(iag.Grayscale((0.0, 1))),  # put grayscale
    ]  # do all of the above in random order
                              )

    l1weight = g_conf.L1_WEIGHT
    task_adv_weight = g_conf.TASK_ADV_WEIGHT
    image_size = tuple([88, 200])

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    print("Configurations of ", exp_alias)
    print("GANMODEL_NAME", g_conf.GANMODEL_NAME)
    print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION)
    print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE)
    print("SKIP", g_conf.SKIP)
    print("TYPE", g_conf.TYPE)
    print("L1 WEIGHT", g_conf.L1_WEIGHT)
    print("TASK ADV WEIGHT", g_conf.TASK_ADV_WEIGHT)
    print("LAB SMOOTH", g_conf.LABSMOOTH)

    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION,
                               skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch':
        netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller':
        netD = ganmodels_nopatch_smaller._netD(
            loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch_smaller._netG(
            loss=g_conf.LOSS_FUNCTION).cuda()

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task':
        netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda()
        netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda()

        if g_conf.PRETRAINED == 'RECON':
            netF_statedict = torch.load('netF_GAN_Pretrained.wts')
            netF.load_state_dict(netF_statedict)

        elif g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth')
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()

            print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys()))
            for i, keys in enumerate(
                    zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                # if newkey.split('.')[0] == "branch" and oldkey.split('.')[0] == "branches":
                #     print("No Transfer of ",  newkey, " to ", oldkey)
                # else:
                print("Transferring ", newkey, " to ", oldkey)
                netF_state_dict[newkey] = model_IL_state_dict[oldkey]

            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        netD = ganmodels_taskAC_shared._netD().cuda()
        netG = ganmodels_taskAC_shared._netG().cuda()
        netF = ganmodels_taskAC_shared._netF().cuda()

        if g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load(g_conf.IL_AGENT_PATH)
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()

            print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys()))
            for i, keys in enumerate(
                    zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                print("Transferring ", newkey, " to ", oldkey)
                netF_state_dict[newkey] = model_IL_state_dict[oldkey]

            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")

            #####
            if g_conf.IF_AUG:
                print("Loading Aug Decoder")
                model_dec = torch.load(g_conf.DECODER_RECON_PATH)
            else:
                print("Loading Decoder")
                model_dec = torch.load(g_conf.DECODER_RECON_PATH)
            model_dec_state_dict = model_dec['stateG_dict']

            netG_state_dict = netG.state_dict()

            print(len(netG_state_dict.keys()),
                  len(model_dec_state_dict.keys()))
            for i, keys in enumerate(
                    zip(netG_state_dict.keys(), model_dec_state_dict.keys())):
                newkey, oldkey = keys
                print("Transferring ", newkey, " to ", oldkey)
                netG_state_dict[newkey] = model_dec_state_dict[oldkey]

            netG.load_state_dict(netG_state_dict)
            print("Decoder Model Loaded!")

    init_weights(netD)

    print(netD)
    print(netF)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(),
                              lr=g_conf.LR_D,
                              betas=(0.5, 0.999))
    optimG = torch.optim.Adam(netG.parameters(),
                              lr=g_conf.LR_G,
                              betas=(0.5, 0.999))
    if g_conf.TYPE == 'task':
        optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE)
        Task_Loss = TaskLoss()

    if g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        print("Using cross entropy!")
        Loss = torch.nn.CrossEntropyLoss().cuda()

    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter_F = 0
    best_loss_iter_G = 0
    best_lossF = 1000000.0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0
    n_critic = g_conf.N_CRITIC

    lossF = Variable(torch.Tensor([100.0]))
    lossG_adv = Variable(torch.Tensor([100.0]))
    lossG_smooth = Variable(torch.Tensor([100.0]))
    lossG = Variable(torch.Tensor([100.0]))

    netG.train()
    netD.train()
    netF.train()
    capture_time = time.time()

    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    fake_img_pool_src = ImagePool(50)
    fake_img_pool_tgt = ImagePool(50)

    for data in data_loader:

        set_requires_grad(netD, True)
        set_requires_grad(netF, True)
        set_requires_grad(netG, True)

        # print("ITERATION:", iteration)

        val = 0.0
        input_data, float_data, tgt_imgs = data

        if g_conf.IF_AUG:
            inputs = augmenter(0, input_data['rgb'])
            # tgt_imgs = augmenter(0, tgt_imgs)
        else:
            inputs = input_data['rgb'].cuda()
            # tgt_imgs = tgt_imgs.cuda()

        tgt_imgs = tgt_imgs.cuda()

        inputs = inputs.squeeze(1)
        inputs = inputs - val  #subtracted by 0.5
        tgt_imgs = tgt_imgs - val  #subtracted by 0.5

        controls = float_data[:, dataset.controls_position(), :]

        src_embed_inputs, src_branches = netF(
            inputs,
            dataset.extract_inputs(float_data).cuda())
        tgt_embed_inputs = netF(tgt_imgs, None)

        src_fake_inputs = netG(src_embed_inputs.detach())
        tgt_fake_inputs = netG(tgt_embed_inputs.detach())

        if iteration % 100 == 0:
            imgs_to_save = torch.cat(
                (inputs[:1] + val, src_fake_inputs[:1] + val,
                 tgt_imgs[:1] + val, tgt_fake_inputs[:1] + val), 0).cpu().data
            coil_logger.add_image("Images", imgs_to_save, iteration)
            imgs_to_save = imgs_to_save.clamp(0.0, 1.0)
            vutils.save_image(imgs_to_save,
                              './imgs_' + exp_alias + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=False)

        ##--------------------Discriminator part!!!!!!!!!!-------------------##

        ##source fake
        if g_conf.IF_POOL:
            src_fake_inputs_forD = fake_img_pool_src.query(src_fake_inputs)
            tgt_fake_inputs_forD = fake_img_pool_tgt.query(tgt_fake_inputs)
        else:
            src_fake_inputs_forD = src_fake_inputs
            tgt_fake_inputs_forD = tgt_fake_inputs

        set_requires_grad(netD, True)
        set_requires_grad(netF, False)
        set_requires_grad(netG, False)
        optimD.zero_grad()

        outputsD_fake_src_bin, __ = netD(src_fake_inputs_forD.detach())
        outputsD_fake_tgt_bin, __ = netD(tgt_fake_inputs_forD.detach())

        outputsD_real_src_bin, __ = netD(inputs)
        outputsD_real_tgt_bin, __ = netD(tgt_imgs)

        gradient_penalty_src = calc_gradient_penalty(netD, inputs,
                                                     src_fake_inputs_forD,
                                                     "recon")
        lossD_bin_src = torch.mean(
            outputsD_fake_src_bin -
            outputsD_real_src_bin) + gradient_penalty_src

        gradient_penalty_tgt = calc_gradient_penalty(netD, tgt_imgs,
                                                     tgt_fake_inputs_forD,
                                                     "recon")
        lossD_bin_tgt = torch.mean(
            outputsD_fake_tgt_bin -
            outputsD_real_tgt_bin) + gradient_penalty_tgt

        lossD = (lossD_bin_src + lossD_bin_tgt) * 0.5
        lossD.backward(retain_graph=True)
        optimD.step()

        coil_logger.add_scalar('Total LossD Bin', lossD.data, iteration)
        coil_logger.add_scalar('Src LossD Bin', lossD_bin_src.data, iteration)
        coil_logger.add_scalar('Tgt LossD Bin', lossD_bin_tgt.data, iteration)

        ##--------------------Generator part!!!!!!!!!!-----------------------##
        set_requires_grad(netD, False)
        set_requires_grad(netF, False)
        set_requires_grad(netG, True)
        optimG.zero_grad()

        #fake outputs for bin
        outputsD_bin_src_fake_forG, __ = netD(src_fake_inputs)
        outputsD_bin_tgt_fake_forG, __ = netD(tgt_fake_inputs)

        #Generator updates

        if ((iteration + 1) % n_critic) == 0:
            #for netD_bin

            optimG.zero_grad()
            outputsD_bin_fake_forG = netD(tgt_imgs)

            #Generator updates
            lossG_src_smooth = L1_loss(
                src_fake_inputs, inputs)  # L1 loss with real domain image
            lossG_tgt_smooth = L1_loss(
                tgt_fake_inputs, tgt_imgs)  # L1 loss with real domain image

            lossG_src_adv_bin = -1.0 * torch.mean(outputsD_bin_src_fake_forG)
            lossG_tgt_adv_bin = -1.0 * torch.mean(outputsD_bin_tgt_fake_forG)

            lossG_adv_bin = 0.5 * (lossG_src_adv_bin + lossG_tgt_adv_bin)

            lossG_Adv = lossG_adv_bin
            lossG_L1 = 0.5 * (lossG_src_smooth + lossG_tgt_smooth)

            lossG = (lossG_Adv + l1weight * lossG_L1) / (1.0 + l1weight)

            lossG.backward(retain_graph=True)
            optimG.step()

            coil_logger.add_scalar('Total LossG', lossG.data, iteration)
            coil_logger.add_scalar('LossG Adv', lossG_Adv.data, iteration)
            coil_logger.add_scalar('Adv Bin LossG', lossG_adv_bin.data,
                                   iteration)
            coil_logger.add_scalar('Smooth LossG', lossG_L1.data, iteration)

            #####Task network updates##########################
            set_requires_grad(netD, False)
            set_requires_grad(netF, True)
            set_requires_grad(netG, False)

            optimF.zero_grad()
            lossF_task = Task_Loss.MSELoss(
                src_branches,
                dataset.extract_targets(float_data).cuda(), controls.cuda(),
                dataset.extract_inputs(float_data).cuda())

            __, outputsD_fake_src_da = netD(src_fake_inputs_forD.detach())
            __, outputsD_real_tgt_da = netD(tgt_imgs)

            __, outputsD_fake_tgt_da = netD(tgt_fake_inputs_forD.detach())
            __, outputsD_real_src_da = netD(inputs)

            gradient_penalty_da_1 = calc_gradient_penalty(
                netD, tgt_imgs, src_fake_inputs_forD, "da")
            lossF_da_1 = torch.mean(outputsD_fake_src_da - outputsD_real_tgt_da
                                    ) + gradient_penalty_da_1

            gradient_penalty_da_2 = calc_gradient_penalty(
                netD, inputs, tgt_fake_inputs_forD, "da")
            lossF_da_2 = torch.mean(outputsD_fake_tgt_da - outputsD_real_src_da
                                    ) + gradient_penalty_da_2

            lossF_da = 0.5 * (lossF_da_1 + lossF_da_2)
            lossF = (lossF_task +
                     task_adv_weight * lossF_da) / (1.0 + task_adv_weight)

            coil_logger.add_scalar('Total Task Loss', lossF.data, iteration)
            coil_logger.add_scalar('Adv Task Loss', lossF_da.data, iteration)
            coil_logger.add_scalar('Only Task Loss', lossF_task.data,
                                   iteration)
            lossF.backward(retain_graph=True)
            optimF.step()

            if lossG.data < best_lossG:
                best_lossG = lossG.data.tolist()
                best_loss_iter_G = iteration

            if lossF.data < best_lossF:
                best_lossF = lossF.data.tolist()
                best_loss_iter_F = iteration

        #optimization for one iter done!

        position = random.randint(0, len(float_data) - 1)

        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()

        accumulated_time += time.time() - capture_time
        capture_time = time.time()

        if is_ready_to_save(iteration):

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter_G': best_loss_iter_G,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'checkpoints',
                             str(iteration) + '.pth'))

        if iteration == best_loss_iter_F and iteration > 10000:

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'best_lossF': best_lossF,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelF' + '.pth'))

        iteration += 1
Esempio n. 6
0
def execute(gpu, exp_batch, exp_alias):
    """Train the reconstruction GAN (generator ``netG`` against discriminator ``netD``).

    The experiment configuration ``configs/<exp_batch>/<exp_alias>.yaml`` is
    merged into ``g_conf``, standard output is redirected to a per-process
    file inside ``_output_logs`` and the function then iterates over the
    training data loader until it is exhausted. On every batch the
    discriminator is updated first (real images labelled 1, generated images
    labelled 0, mean-squared-error criterion) and the generator second
    (adversarial term plus an L1 term against the input images).

    Checkpoints are written whenever ``is_ready_to_save(iteration)`` is true,
    and ``best_modelG.pth`` is overwritten whenever the current iteration
    holds the lowest generator loss seen so far.

    Args:
        gpu: String assigned to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Name of the experiment folder inside ``configs``.
        exp_alias: Experiment name, i.e. the yaml file name without extension.

    Returns:
        None. Returns early, before any data is loaded, when the monitorer
        reports this process as "Finished".

    Raises:
        ValueError: If ``g_conf.GANMODEL_NAME`` is not ``'LSDcontrol'``, the
            only model family this function knows how to build. (Previously
            this surfaced as a ``NameError`` on the first use of ``netD``.)
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})

    # exist_ok avoids the check-then-create race when several training
    # processes are launched at the same time.
    os.makedirs('_output_logs', exist_ok=True)
    # NOTE: stdout stays redirected for the remaining lifetime of the process;
    # the file handle is deliberately left open.
    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)
    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    dataset = CoILDataset(
        full_dataset,
        transform=transforms.Compose([transforms.ToTensor()]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements, dataset.meta_data),
        g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE,
        g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    # Relative weight of the L1 reconstruction term in the generator loss.
    l1weight = 1.0

    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD().cuda()
        netG = ganmodels._netG(skip=g_conf.SKIP).cuda()
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(
            "Unsupported GANMODEL_NAME: %r (expected 'LSDcontrol')" %
            (g_conf.GANMODEL_NAME, ))

    init_weights(netD)
    init_weights(netG)
    print(netD)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.7, 0.999))
    # The adversarial criterion is a mean-squared error on the discriminator
    # output (this local used to be called BCE_loss although it never was one).
    adv_criterion = torch.nn.MSELoss().cuda()
    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter = 0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0

    netG.train()
    netD.train()
    capture_time = time.time()

    image_dir = './imgs_' + exp_alias
    os.makedirs(image_dir, exist_ok=True)

    # Root folder of the experiment logs; kept exactly as configured upstream.
    experiment_dir = os.path.join('/datatmp/Experiments/rohitgan/_logs',
                                  exp_batch, exp_alias)

    # Offset subtracted from the input images before the generator; currently
    # zero, i.e. the images are fed unchanged.
    val = 0.0

    for data in data_loader:

        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        inputs_in = inputs - val

        # Forward pass of the generator.
        fake_inputs = netG(inputs_in)

        if iteration % 200 == 0:
            # Dump the first two real images next to their reconstructions.
            imgs_to_save = torch.cat((inputs_in[:2], fake_inputs[:2]),
                                     0).cpu().data
            vutils.save_image(imgs_to_save,
                              image_dir + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=True)
            coil_logger.add_image("Images", imgs_to_save, iteration)

        ##--------------------Discriminator part-----------------------
        set_requires_grad(netD, True)
        optimD.zero_grad()

        # Generated images: detached so no gradient reaches the generator.
        outputsD_fake_forD = netD(fake_inputs.detach())
        # Labels have the same shape as the discriminator output. Label
        # smoothing noise was computed here in the past but never applied, so
        # plain 0/1 labels are used.
        labels_fake = Variable(torch.zeros(outputsD_fake_forD.size())).cuda()
        lossD_fake = adv_criterion(outputsD_fake_forD, labels_fake)

        # Real images.
        outputsD_real = netD(inputs)
        labels_real = Variable(torch.ones(outputsD_real.size())).cuda()
        lossD_real = adv_criterion(outputsD_real, labels_real)

        # Discriminator update.
        lossD = (lossD_real + lossD_fake) * 0.5
        lossD /= len(inputs)
        lossD.backward()
        optimD.step()

        coil_logger.add_scalar('Total LossD', lossD.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data / len(inputs),
                               iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data / len(inputs),
                               iteration)

        ##--------------------Generator part-----------------------

        #TODO change decoder architecture
        #TODO check norms of gradients later
        #TODO add auxiliary regression loss for steering

        # Freeze the discriminator while the generator is being updated.
        set_requires_grad(netD, False)
        optimG.zero_grad()
        outputsD_fake_forG = netD(fake_inputs)

        # The generator wants its images to be classified as real.
        lossG_adv = adv_criterion(outputsD_fake_forG, labels_real)
        lossG_smooth = L1_loss(fake_inputs, inputs)
        lossG = lossG_adv + l1weight * lossG_smooth
        lossG /= len(inputs)

        lossG.backward()
        optimG.step()

        coil_logger.add_scalar('Total LossG', lossG.data, iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data / len(inputs),
                               iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data / len(inputs),
                               iteration)

        # Optimization for one iteration done.

        # Random sample of the batch whose targets/inputs are logged below.
        position = random.randint(0, len(float_data) - 1)

        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()

        if lossG.data < best_lossG:
            best_lossG = lossG.data.tolist()
            best_loss_iter = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()
        print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(),
              "BestLossD", best_lossD, "BestLossG", best_lossG, "Iteration",
              iteration, "Best Loss Iteration", best_loss_iter)

        coil_logger.add_message(
            'Iterating', {
                'Iteration': iteration,
                'LossD': lossD.data.tolist(),
                'LossG': lossG.data.tolist(),
                'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                'BestLossD': best_lossD,
                'BestLossIteration': best_loss_iter,
                'BestLossG': best_lossG,
                'GroundTruth':
                dataset.extract_targets(float_data)[position].data.tolist(),
                'Inputs':
                dataset.extract_inputs(float_data)[position].data.tolist()
            }, iteration)

        save_periodic = is_ready_to_save(iteration)
        # True whenever this iteration produced the best generator loss so far
        # (and always on iteration 0, because best_loss_iter starts at 0).
        save_best = iteration == best_loss_iter

        if save_periodic or save_best:
            # Both files share the same payload, so build it only once.
            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            if save_periodic:
                torch.save(
                    state,
                    os.path.join(experiment_dir, 'checkpoints',
                                 str(iteration) + '.pth'))
            if save_best:
                torch.save(
                    state,
                    os.path.join(experiment_dir, 'best_modelG' + '.pth'))

        iteration += 1
Esempio n. 7
0
def execute(gpu,
            exp_batch,
            exp_alias,
            suppress_output=True,
            number_of_workers=12):
    """
        The main encoder training function.

        Merges ``configs/<exp_batch>/<exp_alias>.yaml`` into ``g_conf``,
        optionally resumes from the latest saved checkpoint (or from the
        preload model configured in ``g_conf``), and trains the encoder
        selected by ``g_conf.ENCODER_MODEL_TYPE`` over the whole data loader.
        All exceptions derived from ``Exception`` (and ``KeyboardInterrupt``)
        are caught, printed and reported through ``coil_logger`` instead of
        being propagated.
    Args:
        gpu: The GPU id number, as the string assigned to
            ``CUDA_VISIBLE_DEVICES``
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: if the output are going to be saved on a file
            (stdout and stderr are redirected into ``_output_logs``)
        number_of_workers: the number of threads used for data loading
    Returns:
        None
    """

    def _experience_stem(path):
        # File name without its directory and without its last extension.
        return str(path).split('/')[-1].split('.')[-2]

    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch,
                                     exp_alias + '.yaml'))
        set_type_of_process('train_encoder')
        # Set the process into loading status.
        coil_logger.add_message('Loading',
                                {'GPU': os.environ["CUDA_VISIBLE_DEVICES"]})

        # we set a seed for this exp
        seed_everything(seed=g_conf.MAGICAL_SEED)

        # Put the output to a separate file if it is the case
        if suppress_output:
            # exist_ok avoids the check-then-create race between processes
            # that start at the same time.
            os.makedirs('_output_logs', exist_ok=True)
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' +
                str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME +
                '_' + str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)

        # Preload option
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            checkpoint = torch.load(
                os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                             g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints',
                             str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth'))

        # Get the latest checkpoint to be loaded
        # returns none if there are no checkpoints saved for this model
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            # Reuse the value obtained above so the file that was tested is
            # the file that gets loaded. A checkpoint of this experiment takes
            # precedence over the preloaded one.
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 1000000000.0
            best_loss_iter = 0

        # By instantiating the augmenter we get a callable that augment images and transform them
        # into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # The preload name is derived from the first experience file, or from
        # the first two joined by '_' when more than one is configured.
        json_file_name = _experience_stem(g_conf.EXPERIENCE_FILE[0])
        if len(g_conf.EXPERIENCE_FILE) != 1:
            json_file_name = json_file_name + '_' + _experience_stem(
                g_conf.EXPERIENCE_FILE[1])
        print(g_conf.EXPERIENCE_FILE)
        dataset = CoILDataset(transform=augmenter,
                              preload_name=g_conf.PROCESS_NAME + '_' +
                              json_file_name + '_' + g_conf.DATA_USED)

        print("Loaded dataset")

        data_loader = select_balancing_strategy(dataset, iteration,
                                                number_of_workers)
        print('len(data_loader)', len(data_loader))
        print('\n' * 2, 'model and config:', g_conf.ENCODER_MODEL_TYPE,
              g_conf.ENCODER_MODEL_CONFIGURATION, '\n' * 2)

        encoder_model = EncoderModel(g_conf.ENCODER_MODEL_TYPE,
                                     g_conf.ENCODER_MODEL_CONFIGURATION)
        encoder_model.cuda()
        encoder_model.train()

        print(encoder_model)

        optimizer = optim.Adam(encoder_model.parameters(),
                               lr=g_conf.LEARNING_RATE)

        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            encoder_model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:  # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print("Before the loss")

        # Only the ETE model computes its loss outside of the network.
        if g_conf.ENCODER_MODEL_TYPE == 'ETE':
            criterion = Loss(g_conf.LOSS_FUNCTION)

        # Loss time series window
        for data in data_loader:
            print('iteration :', iteration)
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            capture_time = time.time()
            encoder_model.zero_grad()

            ####################################
            #   ENCODER_MODEL_TYPE can be: one-step-affordances, ETE, stdim,
            #   action_prediction, forward
            ####################################
            # - one-step-affordances: input RGB images, compute affordances loss.
            # - ETE: input RGB images and speed, compute action loss (steering, throttle, brake)
            # - stdim: input two consecutive RGB images, compute the feature loss
            # - action_prediction: input two consecutive RGB images, compute action classification loss
            # - forward: input two consecutive RGB images, compute action loss + feature loss
            model_type = g_conf.ENCODER_MODEL_TYPE

            if model_type == 'one-step-affordances':
                loss_function_params = {
                    'classification_gt':
                    dataset.extract_affordances_targets(
                        data, 'classification').cuda(),
                    # hazard stop, red_light....
                    'class_weights':
                    g_conf.AFFORDANCES_CLASS_WEIGHT,
                    'regression_gt':
                    dataset.extract_affordances_targets(data,
                                                        'regression').cuda(),
                    'variable_weights':
                    g_conf.AFFORDANCES_VARIABLE_WEIGHT
                }
                # we input RGB images, speed and command to train affordances
                loss = encoder_model(
                    torch.squeeze(data['rgb'].cuda()),
                    dataset.extract_inputs(data).cuda(),
                    torch.squeeze(dataset.extract_commands(data).cuda()),
                    loss_function_params)

                if iteration == 0:
                    # Snapshot of the weights before the first update. The
                    # file name spelling is kept as-is because it is a
                    # runtime path.
                    state = {
                        'iteration': iteration,
                        'state_dict': encoder_model.state_dict(),
                        'best_loss': best_loss,
                        'total_time': accumulated_time,
                        'optimizer': optimizer.state_dict(),
                        'best_loss_iter': best_loss_iter
                    }
                    torch.save(
                        state,
                        os.path.join('_logs', exp_batch, exp_alias,
                                     'checkpoints', 'inital.pth'))

            elif model_type == 'forward':
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                loss, loss_other, loss_ete = encoder_model(
                    inputs_data,
                    dataset.extract_inputs(data),
                    # We also add measurements and commands
                    dataset.extract_commands(data),
                    dataset.extract_targets(data)[0].cuda())

            elif model_type == 'ETE':
                branches = encoder_model(
                    torch.squeeze(data['rgb'].cuda()),
                    dataset.extract_inputs(data).cuda(),
                    torch.squeeze(dataset.extract_commands(data).cuda()))

                loss_function_params = {
                    'branches': branches,
                    'targets': dataset.extract_targets(
                        data).cuda(),  # steer, throttle, brake
                    'inputs': dataset.extract_inputs(data).cuda(),  # speed
                    'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                    'variable_weights': g_conf.VARIABLE_WEIGHT
                }

                loss, _ = criterion(loss_function_params)

            elif model_type == 'stdim':
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                loss, _, _ = encoder_model(
                    inputs_data,
                    dataset.extract_inputs(data),
                    # We also add measurements and commands
                    dataset.extract_commands(data))

            elif model_type == 'action_prediction':
                inputs_data = [data['rgb'][0].cuda(), data['rgb'][1].cuda()]
                loss, _, _ = encoder_model(
                    inputs_data,
                    dataset.extract_inputs(data),
                    # We also add measurements and commands
                    dataset.extract_commands(data),
                    dataset.extract_targets(data)[0].cuda())

            else:
                raise ValueError("The encoder model type is not know")

            # Every supported model type ends with the same optimisation step.
            loss.backward()
            optimizer.step()

            ####################################
            #   Saving the model if necessary
            ####################################
            if is_ready_to_save(iteration):
                state = {
                    'iteration': iteration,
                    'state_dict': encoder_model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(
                    state,
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(iteration) + '.pth'))

            # The counter is advanced before logging, so the logged values
            # below refer to the number of completed iterations.
            iteration += 1

            ################################################
            #   Adding tensorboard logs.
            #   Making calculations for logging purposes.
            #   These logs are monitored by the printer module.
            ################################################
            if model_type in ('stdim', 'action_prediction', 'forward'):
                coil_logger.add_scalar('Loss', loss.data, iteration)
                coil_logger.add_image('f_t', torch.squeeze(data['rgb'][0]),
                                      iteration)
                coil_logger.add_image('f_ti', torch.squeeze(data['rgb'][1]),
                                      iteration)

            elif model_type in ('one-step-affordances', 'ETE'):
                coil_logger.add_scalar('Loss', loss.data, iteration)
                coil_logger.add_image('Image', torch.squeeze(data['rgb']),
                                      iteration)

            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            accumulated_time += time.time() - capture_time
            coil_logger.add_message(
                'Iterating', {
                    'Iteration': iteration,
                    'Loss': loss.data.tolist(),
                    'Images/s':
                    (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                    'BestLoss': best_loss,
                    'BestLossIteration': best_loss_iter
                }, iteration)
            loss_window.append(loss.data.tolist())
            coil_logger.write_on_error_csv('train', loss.data)

            if iteration % 100 == 0:
                print('Train Iteration: {} [{}/{} ({:.0f}%)] \t Loss: {:.6f}'.
                      format(iteration, iteration, g_conf.NUMBER_ITERATIONS,
                             100. * iteration / g_conf.NUMBER_ITERATIONS,
                             loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': str(e)})

    except Exception:
        # Catch-all for ordinary errors only; SystemExit and other
        # BaseException subclasses are allowed to propagate.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
Esempio n. 8
0
def execute(gpu, exp_batch, exp_alias, dataset_name, architecture,
            suppress_output):
    """Score every saved checkpoint of an experiment on a validation dataset.

    For each checkpoint file found for the experiment, the weights are loaded,
    the whole dataset is pushed through the network, and the summed squared
    error and summed absolute error against ``dataset.extract_targets`` are
    averaged over the number of samples actually seen. One summary per
    checkpoint is sent to ``coil_logger`` and the path of the checkpoint with
    the lowest average loss is printed at the end.

    Args:
        gpu: Value written to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Experiment batch name, used to build the configuration and
            checkpoint paths.
        exp_alias: Experiment name, used to build the configuration and
            checkpoint paths.
        dataset_name: Folder joined to ``$COIL_DATASET_PATH``. When it equals
            ``[]`` the root ``$COIL_DATASET_PATH`` itself is used.
        architecture: One of ``'coil_unit'``, ``'wgangp_lsd'`` or
            ``'coil_icra'``.
        suppress_output: When truthy, ``sys.stdout`` is redirected to a file
            under ``_output_logs`` for the rest of the process.

    Returns:
        None. Failures are not propagated: they are printed and reported to
        ``coil_logger`` as an ``'Error'`` message.
    """
    # Local import: only the checkpoint ordering below needs it.
    import re

    def _natural_key(path):
        # Split the file name into alternating text / digit runs (odd indexes
        # are always the digit runs) so that numeric parts compare as
        # integers: '2000.pth' sorts before '10000.pth'.
        tokens = re.split(r'(\d+)', os.path.basename(path))
        return [int(tok) if idx % 2 else tok
                for idx, tok in enumerate(tokens)]

    try:
        # Fix the torch seed and select the visible cuda devices.
        torch.manual_seed(2)
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # Validation available for:
        # coil_unit (UNIT + task combined)
        # coil_icra (Also used for finetuned models)
        # wgangp_lsd (Our architecture)

        architecture_name = architecture
        # At this point the log file with the correct naming is created.
        if architecture_name == 'coil_unit':
            # NOTE(review): no configuration is merged for this architecture,
            # so g_conf keeps whatever values it already holds.
            pass
        elif architecture_name == 'wgangp_lsd':
            merge_with_yaml(
                os.path.join('/home/rohitrishabh/CoilWGAN/configs', exp_batch,
                             exp_alias + '.yaml'))
            set_type_of_process('validation', dataset_name)
        elif architecture_name == 'coil_icra':
            merge_with_yaml(
                os.path.join(
                    '/home/adas/CleanedCode/CoIL_Codes/coil_20-06/configs',
                    exp_batch, exp_alias + '.yaml'))
            set_type_of_process('validation', dataset_name)

            if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                                    g_conf.PROCESS_NAME)[0] == "Finished":
                # TODO: print some cool summary or not ?
                return

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            # Deliberately left open: stdout stays redirected until the
            # process ends.
            sys.stdout = open(os.path.join(
                '_output_logs',
                g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)

        # Define the dataset. This structure has __getitem__ redefined in a
        # way that the HDF file positions can be accessed from the root
        # directory as in a vector.
        if dataset_name != []:
            full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                        dataset_name)
        else:
            full_dataset = os.environ["COIL_DATASET_PATH"]

        augmenter = Augmenter(None)

        dataset = CoILDataset(full_dataset, transform=augmenter)

        # The data loader is the multi threaded module from pytorch that
        # releases a number of workers to get all the data.
        # TODO: batch size an number of workers go to some configuration file
        batchsize = 30
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  batch_size=batchsize,
                                                  shuffle=False,
                                                  num_workers=1,
                                                  pin_memory=True)

        # TODO: here there is clearly a posibility to make a cool "conditioning" system.

        if architecture_name == 'coil_unit':
            model_task, model_gen = CoILModel('coil_unit')
            model_task, model_gen = model_task.cuda(), model_gen.cuda()
        else:
            model = CoILModel(architecture_name)
            model.cuda()

        # Synthetic checkpoint label: starts at 0 and grows by 2000 for every
        # checkpoint processed.
        # NOTE(review): assumes checkpoints are 2000 iterations apart - confirm.
        latest = 0

        best_loss = 1000
        best_error = 1000
        best_loss_iter = 0
        best_error_iter = 0
        best_loss_ckpt = ''

        if architecture_name == 'coil_unit':
            ckpts = glob.glob('/home/rohitrishabh/UNIT_DA/outputs/' +
                              exp_alias + '/checkpoints/gen*.pt')
        else:
            ckpts = glob.glob(
                os.path.join(
                    '/home/adas/CleanedCode/CoIL_Codes/coil_20-06/_logs',
                    exp_batch, exp_alias) + '/*.pth')

        if architecture_name == 'coil_unit':
            model_task.eval()
            model_gen.eval()
        else:
            model.eval()

        # Order the checkpoints by the numbers in their file names. A plain
        # string sort would put '10000.pth' before '2000.pth' and break the
        # correspondence with the `latest` counter.
        ckpts = sorted(ckpts, key=_natural_key)

        # Positions read from the per-sample float vector.
        # NOTE(review): meanings inferred from how the values are used below -
        # confirm against the dataset definition.
        steer_position = 0
        speed_position = 10
        control_position = 24
        camera_angle_position = 26

        # Constants of the camera-angle steering correction.
        time_use = 1.0
        car_length = 3.0
        extra_factor = 2.5
        max_correction = 0.6

        # TODO: refactor on the getting on the checkpoint organization needed
        for ckpt in ckpts:

            checkpoint = torch.load(ckpt)
            print("Validation loaded ", ckpt)
            if architecture_name == 'wgangp_lsd':
                print(ckpt, checkpoint['best_loss_iter_F'])
                model.load_state_dict(checkpoint['stateF_dict'])
                model.eval()
            elif architecture_name == 'coil_unit':
                model_task.load_state_dict(checkpoint['task'])
                model_gen.load_state_dict(checkpoint['b'])
                model_task.eval()
                model_gen.eval()
            elif architecture_name == 'coil_icra':
                model.load_state_dict(checkpoint['state_dict'])
                model.eval()

            accumulated_loss = 0
            accumulated_error = 0
            datacount = 0  # number of batches processed
            sample_count = 0  # number of samples processed
            for data in data_loader:

                input_data, float_data = data

                camera_angle = float_data[:, camera_angle_position, :].cuda()
                steer = float_data[:, steer_position, :].cuda()
                speed = float_data[:, speed_position, :].cuda()

                # Float masks selecting positive / non-positive camera angles.
                pos = (camera_angle > 0.0).type(torch.FloatTensor).cuda()
                neg = (camera_angle <= 0.0).type(torch.FloatTensor).cuda()

                # Steering correction derived from the camera angle, capped
                # at `max_correction`. The 3.1415 divisor is kept as-is so the
                # corrected labels match the ones used elsewhere.
                rad_camera_angle = math.pi * (torch.abs(camera_angle)) / 180.0
                val = extra_factor * (torch.atan(
                    (rad_camera_angle * car_length) /
                    (time_use * speed + 0.05))) / 3.1415
                correction = torch.min(val,
                                       torch.Tensor([max_correction]).cuda())
                steer -= pos * correction
                steer += neg * correction

                # Write the corrected steer back and clip it to [-1, 1].
                steer = steer.cpu()
                float_data[:, 0, :] = steer
                float_data[:, 0, :][float_data[:, 0, :] > 1.0] = 1.0
                float_data[:, 0, :][float_data[:, 0, :] < -1.0] = -1.0

                datacount += 1
                # The last batch can be smaller than `batchsize`, so count
                # the samples actually seen.
                sample_count += len(float_data)

                # Targets are extracted after the steer correction above.
                targets = dataset.extract_targets(float_data).cuda()

                if architecture_name == 'wgangp_lsd':
                    embed, output = model(
                        torch.squeeze(input_data['rgb']).cuda(),
                        float_data[:, speed_position, :].cuda())
                    prediction = output[0]

                elif architecture_name == 'coil_unit':
                    embed, n_b = model_gen.encode(
                        torch.squeeze(input_data['rgb']).cuda())
                    output = model_task(
                        embed,
                        Variable(float_data[:, speed_position, :]).cuda())
                    prediction = output[0].data

                elif architecture_name == 'coil_icra':
                    prediction = model.forward_branch(
                        torch.squeeze(input_data['rgb']).cuda(),
                        float_data[:, speed_position, :].cuda(),
                        float_data[:, control_position, :].cuda())

                # Plain Python floats for every architecture, so the
                # accumulators and the logged summary never hold tensors.
                difference = prediction - targets
                loss = torch.sum(difference**2).data.tolist()
                mean_error = torch.sum(torch.abs(difference)).data.tolist()

                accumulated_error += mean_error
                accumulated_loss += loss

            print(datacount, len(data_loader), accumulated_loss)
            if sample_count == 0:
                # Fail with a clear message instead of a bare division error.
                raise RuntimeError(
                    'Validation dataset produced no samples: ' +
                    str(full_dataset))

            checkpoint_average_loss = accumulated_loss / float(sample_count)
            checkpoint_average_error = accumulated_error / float(sample_count)

            if checkpoint_average_loss < best_loss:
                best_loss = checkpoint_average_loss
                best_loss_iter = latest
                best_loss_ckpt = ckpt

            if checkpoint_average_error < best_error:
                best_error = checkpoint_average_error
                best_error_iter = latest

            print("current loss", checkpoint_average_loss)
            print("best_loss", best_loss)

            coil_logger.add_message(
                'Iterating', {
                    'Summary': {
                        'Error': checkpoint_average_error,
                        'Loss': checkpoint_average_loss,
                        'BestError': best_error,
                        'BestLoss': best_loss,
                        'BestLossCheckpoint': best_loss_iter,
                        'BestErrorCheckpoint': best_error_iter
                    },
                    'Checkpoint': latest
                }, latest)
            latest += 2000

        coil_logger.add_message('Finished', {})
        print("Best Validation Loss ckpt:", best_loss_ckpt)

        # TODO: DO ALL THE AMAZING LOGGING HERE, as a way to very the status in paralell.
        # THIS SHOULD BE AN INTERELY PARALLEL PROCESS

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        # Deliberate catch-all at the process boundary: print the traceback
        # and report the failure through the logger instead of propagating.
        traceback.print_exc()

        coil_logger.add_message('Error', {'Message': 'Something Happened'})
Esempio n. 9
0
def execute(gpu, exp_batch, exp_alias):
    """Train the model of one experiment, resuming from its latest checkpoint.

    The experiment configuration ``configs/<exp_batch>/<exp_alias>.yaml`` is
    merged into ``g_conf``, stdout is redirected to a file under
    ``_output_logs``, and the model named by ``g_conf.MODEL_NAME`` is trained
    on ``$COIL_DATASET_PATH/<g_conf.TRAIN_DATASET_NAME>`` with Adam. Before
    every step the steer label is corrected from the camera angle. Progress is
    reported through ``coil_logger`` and three kinds of checkpoint are written
    under ``_logs/<exp_batch>/<exp_alias>``: scheduled ones in
    ``checkpoints/<iteration>.pth``, the best single-iteration loss in
    ``best_loss.pth`` and the best windowed mean loss in
    ``best_loss_save.pth``.

    Args:
        gpu: Value written to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Experiment batch name, used to build the configuration and
            log paths.
        exp_alias: Experiment name, used to build the configuration and log
            paths.

    Returns:
        None. Returns early when ``monitorer`` reports the process as
        "Finished". Failures are not propagated: they are printed and
        reported to ``coil_logger`` as an ``'Error'`` message.
    """
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch,
                                     exp_alias + '.yaml'))
        set_type_of_process('train')

        coil_logger.add_message('Loading', {'GPU': gpu})

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        # Deliberately left open: stdout stays redirected until the process
        # ends.
        sys.stdout = open(os.path.join(
            '_output_logs',
            g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                          "a",
                          buffering=1)

        if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                                g_conf.PROCESS_NAME)[0] == "Finished":
            # TODO: print some cool summary or not ?
            return

        # Define the dataset. This structure has __getitem__ redefined in a
        # way that the HDF file positions can be accessed from the root
        # directory as in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                    g_conf.TRAIN_DATASET_NAME)

        dataset = CoILDataset(full_dataset,
                              transform=transforms.Compose(
                                  [transforms.ToTensor()]))

        # Creates the sampler, this part is responsible for managing the keys.
        # It divides all keys depending on the measurements and produces a
        # set of keys for each batch.
        sampler = BatchSequenceSampler(
            splitter.control_steer_split(dataset.measurements,
                                         dataset.meta_data), g_conf.BATCH_SIZE,
            g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)

        # The data loader is the multi threaded module from pytorch that
        # releases a number of workers to get all the data.
        # TODO: batch size an number of workers go to some configuration file
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  batch_sampler=sampler,
                                                  shuffle=False,
                                                  num_workers=12,
                                                  pin_memory=False)

        # By instantiating the augmenter we get a callable that augments
        # images and transforms them into tensors. Each wrapper below applies
        # its augmentation with the given probability.
        st = lambda aug: iag.Sometimes(aug, 0.4)
        oc = lambda aug: iag.Sometimes(aug, 0.3)
        rl = lambda aug: iag.Sometimes(aug, 0.09)
        augmenter = iag.Augmenter([iag.ToGPU()] + [
            rl(iag.GaussianBlur(
                (0, 1.5))),  # blur images with a sigma between 0 and 1.5
            rl(
                iag.AdditiveGaussianNoise(
                    loc=0, scale=(0.0, 0.05),
                    per_channel=0.5)),  # add gaussian noise to images
            oc(iag.Dropout((0.0, 0.10), per_channel=0.5)
               ),  # randomly remove up to X% of the pixels
            oc(
                iag.CoarseDropout(
                    (0.0, 0.10), size_percent=(0.08, 0.2), per_channel=0.5)
            ),  # randomly remove up to X% of the pixels
            oc(iag.Add((-40, 40), per_channel=0.5)
               ),  # change brightness of images (by -X to Y of original value)
            st(iag.Multiply((0.10, 2), per_channel=0.2)
               ),  # change brightness of images (X-Y% of original value)
            rl(iag.ContrastNormalization((
                0.5, 1.5), per_channel=0.5)),  # improve or worsen the contrast
            rl(iag.Grayscale((0.0, 1))),  # put grayscale
        ])
        # TODO: here there is clearly a posibility to make a cool "conditioning" system.

        model = CoILModel(g_conf.MODEL_NAME)
        model.cuda()
        print(model)

        criterion = Loss()

        # TODO: DATASET SIZE SEEMS WEIRD
        optimizer = optim.Adam(model.parameters(), lr=0.0002)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(checkpoint_file)))
            # Restore the weights too. Without this the counters resume but
            # training restarts from freshly initialised parameters.
            model.load_state_dict(checkpoint['state_dict'])
            iteration = checkpoint['iteration']
            accumulated_time = checkpoint['total_time']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            accumulated_time = 0  # We accumulate iteration time and keep the average speed
            best_loss_iter = 0

        # TODO: The checkpoint will continue, so it should erase everything up to the iteration

        # Best mean loss over a window of iterations, with the running sum and
        # sample count of the window currently being filled.
        best_loss_save = 10000.0
        best_loss_save_iter = 0
        curr_loss_save = 0.0
        curr_loss_count = 0

        # Switches for the steer-label correction and the learning-rate
        # schedule.
        augment_for_controls = 1
        adjustlr = 1

        # Constants of the camera-angle steering correction.
        time_use = 1.0
        car_length = 3.0
        extra_factor = 2.5

        print(dataset.meta_data)

        print(model)
        capture_time = time.time()
        model.train()
        for data in data_loader:

            input_data, float_data = data

            # TODO, ADD ITERATION SCHEDULE
            input_rgb_data = augmenter(0, input_data['rgb'])

            if augment_for_controls:
                # NOTE(review): positions 26 / 0 / 10 of the float vector are
                # read as camera angle / steer / speed - confirm against the
                # dataset definition.
                camera_angle = float_data[:, 26, :].cuda()
                print("Camera angle", camera_angle[0])
                steer = float_data[:, 0, :].cuda()
                speed = float_data[:, 10, :].cuda()

                # Float masks selecting positive / non-positive camera angles.
                pos = (camera_angle > 0.0).type(torch.FloatTensor).cuda()
                neg = (camera_angle <= 0.0).type(torch.FloatTensor).cuda()

                # Steering correction derived from the camera angle, capped
                # at 0.6. The 3.1415 divisor is kept as-is so labels stay
                # identical to earlier runs.
                rad_camera_angle = math.pi * (torch.abs(camera_angle)) / 180.0
                val = extra_factor * (torch.atan(
                    (rad_camera_angle * car_length) /
                    (time_use * speed + 0.05))) / 3.1415
                correction = torch.min(val, torch.tensor([0.6]).cuda())
                steer -= pos * correction
                steer += neg * correction

                print("val", val[0])
                print("speed", speed[0])

                # Write the corrected steer back and clip it to [-1, 1].
                steer = steer.cpu()
                float_data[:, 0, :] = steer

                float_data[:, 0, :][float_data[:, 0, :] > 1.0] = 1.0
                float_data[:, 0, :][float_data[:, 0, :] < -1.0] = -1.0

            # Control commands, targets and inputs are extracted once, after
            # the steer correction, and reused below.
            controls = float_data[:, dataset.controls_position(), :]
            targets = dataset.extract_targets(float_data)
            inputs = dataset.extract_inputs(float_data)

            model.zero_grad()
            branches = model(input_rgb_data, inputs.cuda())

            loss = criterion.MSELoss(branches, targets.cuda(),
                                     controls.cuda(), inputs.cuda())
            # Plain float copy for bookkeeping, so no tensor ends up in the
            # accumulators, the log messages or the saved checkpoints.
            loss_value = loss.data.tolist()

            # TODO: All these logging things could go out to clean up the main
            if loss_value < best_loss:
                best_loss = loss_value
                best_loss_iter = iteration

            curr_loss_save += loss_value
            curr_loss_count += 1

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - targets.cuda())

            # TODO: For now we are computing the error for just the correct branch, it could be multi- branch,

            coil_logger.add_scalar('Loss', loss.data, iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the  float_data that are actually generating output
            # TODO: itearation is repeating , and that is dumb
            coil_logger.add_message(
                'Iterating', {
                    'Iteration': iteration,
                    'Loss': loss_value,
                    'Images/s':
                    (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                    'BestLoss': best_loss,
                    'BestLossIteration': best_loss_iter,
                    'BestLossSave': best_loss_save,
                    'Output': output[position].data.tolist(),
                    'GroundTruth': targets[position].data.tolist(),
                    'Error': error[position].data.tolist(),
                    'Inputs': inputs[position].data.tolist()
                }, iteration)

            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):

                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO : maybe already summarize the best model ???
                torch.save(
                    state,
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(iteration) + '.pth'))
            print("before best save")
            if iteration % 5 == 0 and iteration > 4:
                # Mean loss of the window that just ended. Dividing by the
                # number of accumulated losses keeps it a true mean whatever
                # the window length.
                # NOTE(review): this used to divide by a fixed 5000.0 - confirm
                # that nothing depends on that scale.
                window_mean_loss = curr_loss_save / curr_loss_count
                # Always start a fresh window, not only after an improvement.
                curr_loss_save = 0.0
                curr_loss_count = 0
                if window_mean_loss < best_loss_save:

                    best_loss_save = window_mean_loss
                    best_loss_save_iter = iteration
                    state = {
                        'iteration': iteration,
                        'state_dict': model.state_dict(),
                        'best_loss': best_loss_save,
                        'total_time': accumulated_time,
                        'best_loss_iter': best_loss_save_iter
                    }
                    # TODO : maybe already summarize the best model ???
                    torch.save(
                        state,
                        os.path.join('_logs', exp_batch, exp_alias,
                                     'best_loss_save' + '.pth'))
            print("after best save")
            if iteration == best_loss_iter:

                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO : maybe already summarize the best model ???
                torch.save(
                    state,
                    os.path.join('_logs', exp_batch, exp_alias,
                                 'best_loss' + '.pth'))

            iteration += 1

            # Re-evaluate the learning rate once every 1000 iterations. The
            # previous truthiness test on `iteration % 1000` fired on every
            # iteration that was not a multiple of 1000.
            if adjustlr and iteration % 1000 == 0:
                adjust_learning_rate(optimizer, iteration)

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except:
        # Deliberate catch-all at the process boundary: print the traceback
        # and report the failure through the logger instead of propagating.
        traceback.print_exc()

        coil_logger.add_message('Error', {'Message': 'Something Happened'})
def execute(gpu, exp_batch, exp_alias):

    manualSeed = g_conf.SEED
    torch.cuda.manual_seed(manualSeed)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)
    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    real_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                "FinalRealWorldDataset")

    #main data loader
    dataset = CoILDataset(full_dataset,
                          real_dataset,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements,
                                     dataset.meta_data), g_conf.BATCH_SIZE,
        g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    #real image dataloader

    l1weight = g_conf.L1_WEIGHT
    image_size = tuple([88, 200])

    print("Configurations of ", exp_alias)
    print("GANMODEL_NAME", g_conf.GANMODEL_NAME)
    print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION)
    print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE)
    print("SKIP", g_conf.SKIP)
    print("TYPE", g_conf.TYPE)
    print("L1 WEIGHT", g_conf.L1_WEIGHT)
    print("LAB SMOOTH", g_conf.LABSMOOTH)

    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION,
                               skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch':
        netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller':
        netD = ganmodels_nopatch_smaller._netD(
            loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch_smaller._netG(
            loss=g_conf.LOSS_FUNCTION).cuda()

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task':
        netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda()
        netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda()

        if g_conf.PRETRAINED == 'RECON':
            netF_statedict = torch.load('netF_GAN_Pretrained.wts')
            netF.load_state_dict(netF_statedict)

        elif g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth')
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()

            print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys()))
            for i, keys in enumerate(
                    zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                if newkey.split('.')[0] == "branch" and oldkey.split(
                        '.')[0] == "branches":
                    print("No Transfer of ", newkey, " to ", oldkey)
                else:
                    print("Transferring ", newkey, " to ", oldkey)
                    netF_state_dict[newkey] = model_IL_state_dict[oldkey]

            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")

    init_weights(netD)
    init_weights(netG)
    #do init for netF also later but now it is in the model code itself

    print(netD)
    print(netF)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(),
                              lr=g_conf.LR_D,
                              betas=(0.7, 0.999))
    optimG = torch.optim.Adam(netG.parameters(),
                              lr=g_conf.LR_G,
                              betas=(0.7, 0.999))
    if g_conf.TYPE == 'task':
        optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE)
        Task_Loss = TaskLoss()

    if g_conf.LOSS_FUNCTION == 'LSGAN':
        Loss = torch.nn.MSELoss().cuda()
    elif g_conf.LOSS_FUNCTION == 'NORMAL':
        Loss = torch.nn.BCELoss().cuda()

    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter_F = 0
    best_loss_iter_G = 0
    best_lossF = 1000000.0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0

    gen_iterations = 0

    netG.train()
    netD.train()
    netF.train()
    capture_time = time.time()

    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    #TODO put family for losses
    fake_img_pool = ImagePool(50)

    for data in data_loader:

        set_requires_grad(netD, True)
        set_requires_grad(netF, True)
        set_requires_grad(netG, True)

        # print("ITERATION:", iteration)

        val = 0.5
        input_data, float_data, real_img = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        inputs_in = inputs - val  #subtracted by 0.5

        #TODO: make sure the F network does not get optimized by G optim
        controls = float_data[:, dataset.controls_position(), :]
        embed, branches = netF(inputs_in,
                               dataset.extract_inputs(float_data).cuda())
        print("Branch Outputs:::", branches[0][0])

        embed_inputs = embed
        fake_inputs = netG(embed_inputs.detach())
        fake_inputs_in = fake_inputs

        if iteration % 500 == 0:
            imgs_to_save = torch.cat((inputs_in[:2] + val, fake_inputs_in[:2]),
                                     0).cpu().data
            vutils.save_image(imgs_to_save,
                              './imgs_' + exp_alias + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=True)
            coil_logger.add_image("Images", imgs_to_save, iteration)

        ##--------------------Discriminator part!!!!!!!!!!-------------------##
        set_requires_grad(netD, True)
        set_requires_grad(netF, False)
        set_requires_grad(netG, False)
        optimD.zero_grad()

        ##fake
        fake_inputs_forD = fake_img_pool.query(fake_inputs.detach())
        outputsD_fake_forD = netD(fake_inputs_forD.detach())

        labsize = outputsD_fake_forD.size()
        labels_fake = torch.zeros(labsize)  #Fake labels
        label_fake_noise = torch.rand(
            labels_fake.size()) * 0.05 - 0.025  #Label smoothing

        if g_conf.LABSMOOTH == 1:
            labels_fake = labels_fake + labels_fake_noise

        labels_fake = Variable(labels_fake).cuda()
        lossD_fake = Loss(outputsD_fake_forD, labels_fake)

        ##real
        outputsD_real = netD(inputs)

        labsize = outputsD_real.size()
        labels_real = torch.ones(labsize)  #Real labels
        label_real_noise = torch.rand(
            labels_real.size()) * 0.05 - 0.025  #Label smoothing

        if g_conf.LABSMOOTH == 1:
            labels_real = labels_real + labels_real_noise

        labels_real = Variable(labels_real).cuda()
        lossD_real = Loss(outputsD_real, labels_real)

        #Discriminator updates

        lossD = (lossD_real + lossD_fake) * 0.5
        lossD /= len(inputs)
        lossD.backward()
        optimD.step()

        coil_logger.add_scalar('Total LossD', lossD.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data / len(inputs),
                               iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data / len(inputs),
                               iteration)

        ##--------------------Generator part!!!!!!!!!!-----------------------##
        set_requires_grad(netD, False)
        set_requires_grad(netF, False)
        set_requires_grad(netG, True)
        optimG.zero_grad()

        outputsD_fake_forG = netD(fake_inputs)
        #Generator updates

        lossG_adv = Loss(outputsD_fake_forG, labels_real)
        lossG_smooth = L1_loss(fake_inputs, inputs)
        lossG = (lossG_adv + l1weight * lossG_smooth) / (1.0 + l1weight)
        lossG /= len(inputs)
        print(lossG)

        lossG.backward()
        optimG.step()

        #####Task network updates##########################
        set_requires_grad(netD, False)
        set_requires_grad(netF, True)
        set_requires_grad(netG, False)

        optimF.zero_grad()
        lossF = Task_Loss.MSELoss(branches,
                                  dataset.extract_targets(float_data).cuda(),
                                  controls.cuda(),
                                  dataset.extract_inputs(float_data).cuda())
        coil_logger.add_scalar('Task Loss', lossF.data, iteration)
        lossF.backward()
        optimF.step()

        coil_logger.add_scalar('Total LossG', lossG.data, iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data / len(inputs),
                               iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data / len(inputs),
                               iteration)

        #optimization for one iter done!

        position = random.randint(0, len(float_data) - 1)
        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()

        if lossG.data < best_lossG:
            best_lossG = lossG.data.tolist()
            best_loss_iter_G = iteration

        if lossF.data < best_lossF:
            best_lossF = lossF.data.tolist()
            best_loss_iter_F = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()
        print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(),
              "BestLossD", best_lossD, "BestLossG", best_lossG, "LossF", lossF,
              "BestLossF", best_lossF, "Iteration", iteration,
              "Best Loss Iteration G", best_loss_iter_G,
              "Best Loss Iteration F", best_loss_iter_F)

        coil_logger.add_message(
            'Iterating', {
                'Iteration':
                iteration,
                'LossD':
                lossD.data.tolist(),
                'LossG':
                lossG.data.tolist(),
                'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                'BestLossD':
                best_lossD,
                'BestLossG':
                best_lossG,
                'BestLossIterationG':
                best_loss_iter_G,
                'BestLossF':
                best_lossF,
                'BestLossIterationF':
                best_loss_iter_F,
                'GroundTruth':
                dataset.extract_targets(float_data)[position].data.tolist(),
                'Inputs':
                dataset.extract_inputs(float_data)[position].data.tolist()
            }, iteration)

        if is_ready_to_save(iteration):

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter_G': best_loss_iter_G,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'checkpoints',
                             str(iteration) + '.pth'))
        if iteration == best_loss_iter_G and iteration > 10000:

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter_G': best_loss_iter_G
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelG' + '.pth'))

        if iteration == best_loss_iter_F and iteration > 10000:

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'best_lossF': best_lossF,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelF' + '.pth'))

        iteration += 1
Esempio n. 11
0
def execute(gpu, exp_batch, exp_alias):
    """Run the training process of one experiment.

    The configuration is merged from ``configs/<exp_batch>/<exp_alias>.yaml``,
    standard output is redirected to a per-process file inside
    ``_output_logs`` and, whenever ``is_ready_to_save(iteration)`` is true, a
    checkpoint is written to ``_logs/<exp_batch>/<exp_alias>/checkpoints``.
    When a saved checkpoint exists, the model weights and the bookkeeping
    counters are restored from it before training continues.

    Args:
        gpu: Value assigned to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Experiment folder name inside ``configs`` and ``_logs``.
        exp_alias: Experiment name (yaml file name without the extension).

    Returns:
        None. Returns early when the monitorer already reports this process
        as "Finished". Errors raised while training are printed and logged
        through ``coil_logger`` instead of being propagated.
    """
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch,
                                     exp_alias + '.yaml'))
        set_type_of_process('train')

        coil_logger.add_message('Loading', {'GPU': gpu})

        os.makedirs('_output_logs', exist_ok=True)

        # The redirection is meant to last for the whole process, so the file
        # is intentionally not closed here.
        sys.stdout = open(os.path.join(
            '_output_logs',
            g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                          "a",
                          buffering=1)

        if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                                g_conf.PROCESS_NAME)[0] == "Finished":
            # TODO: print some cool summary or not ?
            return

        # Define the dataset. This structure has __getitem__ redefined so
        # the HDF files under the root directory can be indexed like a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                    g_conf.TRAIN_DATASET_NAME)

        dataset = CoILDataset(full_dataset,
                              transform=transforms.Compose(
                                  [transforms.ToTensor()]))

        # Creates the sampler, this part is responsible for managing the keys.
        # It divides all keys depending on the measurements and produces a set
        # of keys for each batch.
        sampler = BatchSequenceSampler(
            splitter.control_steer_split(dataset.measurements,
                                         dataset.meta_data), g_conf.BATCH_SIZE,
            g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)

        # The data loader is the multi threaded module from pytorch that
        # releases a number of workers to get all the data.
        # TODO: batch size an number of workers go to some configuration file
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  batch_sampler=sampler,
                                                  shuffle=False,
                                                  num_workers=12,
                                                  pin_memory=True)
        # By instantiating the augmenter we get a callable that augments
        # images and transforms them into tensors.
        augmenter = iag.Augmenter(g_conf.AUGMENTATION_SUITE)

        # TODO: here there is clearly a posibility to make a cool "conditioning" system.

        model = CoILModel(g_conf.MODEL_NAME)
        model.cuda()
        # NOTE: a stray debugging `exit()` used to sit here. The SystemExit it
        # raised was swallowed by the bare `except:` below, so training never
        # started and only 'Something Happened' was logged.

        criterion = Loss()

        # TODO: DATASET SIZE SEEMS WEIRD
        optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(checkpoint_file)))
            # Restore the weights as well; otherwise the counters continue
            # while the network restarts from its initial parameters.
            model.load_state_dict(checkpoint['state_dict'])
            iteration = checkpoint['iteration']
            accumulated_time = checkpoint['total_time']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            accumulated_time = 0  # We accumulate iteration time and keep the average speed
            best_loss_iter = 0

        # TODO: The checkpoint will continue, so it should erase everything up to the iteration

        print(dataset.meta_data)

        print(model)
        capture_time = time.time()
        for data in data_loader:

            input_data, float_data = data

            # TODO, ADD ITERATION SCHEDULE
            input_rgb_data = augmenter(0, input_data['rgb'])

            # Extract each tensor once per batch and reuse it below.
            # NOTE(review): assumes the dataset extract_* helpers have no
            # side effects -- confirm.
            controls = float_data[:, dataset.controls_position(), :]
            inputs = dataset.extract_inputs(float_data)
            targets = dataset.extract_targets(float_data)
            inputs_gpu = inputs.cuda()
            targets_gpu = targets.cuda()

            print(" CONTROLS  ", controls.shape)

            model.zero_grad()
            print('INPUTS', inputs.shape)
            branches = model(input_rgb_data, inputs_gpu)

            print("Extracted targets ", targets.shape[0])
            loss = criterion.MSELoss(branches, targets_gpu, controls.cuda(),
                                     inputs_gpu)

            # TODO: All these logging things could go out to clean up the main
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            # TODO: For now we are computing the error for just the correct
            # branch, it could be multi-branch.
            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - targets_gpu)

            coil_logger.add_scalar('Loss', loss.data, iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the  float_data that are actually generating output
            coil_logger.add_message(
                'Iterating', {
                    'Iteration': iteration,
                    'Loss': loss.data.tolist(),
                    'Images/s':
                    (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                    'BestLoss': best_loss,
                    'BestLossIteration': best_loss_iter,
                    'Output': output[position].data.tolist(),
                    'GroundTruth': targets[position].data.tolist(),
                    'Error': error[position].data.tolist(),
                    'Inputs': inputs[position].data.tolist()
                }, iteration)

            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):

                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO : maybe already summarize the best model ???
                torch.save(
                    state,
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(iteration) + '.pth'))

            iteration += 1

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except Exception:
        # Top-level boundary of the training process: report and log the
        # failure. SystemExit is deliberately no longer swallowed.
        traceback.print_exc()

        coil_logger.add_message('Error', {'Message': 'Something Happened'})
Esempio n. 12
0
def execute(gpu,
            exp_batch='nocrash',
            exp_alias='resnet34imnet10S1',
            suppress_output=True,
            yaml_file=None):
    """Compute the normalized 3x3 error covariance matrix for a DART model.

    A single checkpoint (``g_conf.DART_MODEL_CHECKPOINT``) is evaluated on the
    dataset named by ``g_conf.DART_COVMAT_DATA``. For every batch the absolute
    error of the model output is turned into per-sample outer products, each
    divided by that sample's ``current_traj_length``; the products are summed
    over the whole dataset and finally divided by the episode count read from
    the last processed batch.

    Args:
        gpu: Value assigned to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Experiment folder name inside ``configs``.
        exp_alias: Experiment name (yaml file name without the extension).
        suppress_output: Unused; kept so existing callers keep working.
        yaml_file: Optional directory that replaces ``configs/<exp_batch>``
            when looking up ``<exp_alias>.yaml``.

    Returns:
        The normalized covariance matrix as a 3x3 numpy array.

    Raises:
        ValueError: If the data loader yields no batch, since the episode
            count needed for the normalization is then unknown.
    """
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # At this point the log file with the correct naming is created.
    path_to_yaml_file = os.path.join('configs', exp_batch, exp_alias + '.yaml')
    if yaml_file is not None:
        path_to_yaml_file = os.path.join(yaml_file, exp_alias + '.yaml')
    merge_with_yaml(path_to_yaml_file)

    # Dataset used for computing the DART covariance matrix; specify
    # DART_COVMAT_DATA in the config file.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.DART_COVMAT_DATA)

    augmenter = Augmenter(None)

    # Definition of the dataset to be used. The preload name is the same
    # DART_COVMAT_DATA entry.
    print('full dataset path: ', full_dataset)
    dataset = CoILDataset(full_dataset,
                          transform=augmenter,
                          preload_name=g_conf.DART_COVMAT_DATA)

    # The data loader is the multi threaded module from pytorch that releases
    # a number of workers to get all the data.
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=g_conf.BATCH_SIZE,
        shuffle=False,
        num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
        pin_memory=True)

    model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
    model.cuda()

    # Only one checkpoint is evaluated; specify DART_MODEL_CHECKPOINT in the
    # config file.
    checkpoint = torch.load(g_conf.DART_MODEL_CHECKPOINT)
    checkpoint_iteration = checkpoint['iteration']
    print("Validation loaded ", checkpoint_iteration)
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    iteration_on_checkpoint = 0

    # considering steer, throttle & brake so 3x3 matrix
    normalized_covariate_shift = torch.zeros(3, 3)

    # Stays None when the loader is empty so a clear error can be raised
    # instead of an unbound-variable failure after the loop.
    total_episodes = None

    print('data_loader size: ', len(data_loader))
    # Pure inference: gradients are never used, so do not record them.
    with torch.no_grad():
        for data in data_loader:

            # Compute the forward pass on a batch from the dataset
            controls = data['directions']
            output = model.forward_branch(
                torch.squeeze(data['rgb']).cuda(),
                dataset.extract_inputs(data).cuda(), controls)

            targets = dataset.extract_targets(data).cuda()
            error = torch.abs(output - targets).data.cpu()

            ### covariate shift segment starts
            # presumably B x 3 -> B x 3 x 1, so the matmul below yields one
            # 3 x 3 outer product per sample -- TODO confirm output shape
            error = error.unsqueeze(dim=2)
            error_transpose = torch.transpose(error, 1, 2)
            # compute covariate shift
            covariate_shift = torch.matmul(error, error_transpose)
            # expand traj length tensor to Bx3x3 (considering steer, throttle
            # & brake)
            traj_lengths = torch.stack([
                torch.stack([data['current_traj_length'].squeeze(dim=1)] * 3,
                            dim=1)
            ] * 3,
                                       dim=2)
            covariate_shift = covariate_shift / traj_lengths
            covariate_shift = torch.sum(covariate_shift, dim=0)

            normalized_covariate_shift += covariate_shift
            ### covariate shift segment ends

            # The value from the last batch is the one used after the loop.
            total_episodes = data['episode_count'][-1].data
            iteration_on_checkpoint += 1
            if iteration_on_checkpoint % 50 == 0:
                print('iteration: ', iteration_on_checkpoint)

    if total_episodes is None:
        raise ValueError('No batches were produced for dataset: ' +
                         full_dataset)

    print('total episodes: ', total_episodes)
    normalized_covariate_shift = normalized_covariate_shift / total_episodes
    print('normalized covariate shift: ', normalized_covariate_shift.shape,
          normalized_covariate_shift)

    return normalized_covariate_shift.numpy()
    '''
Esempio n. 13
0
def execute(gpu, exp_batch, exp_alias, dataset_name):
    """Validate the scheduled checkpoints of an experiment on one dataset.

    The function polls until ``maximun_checkpoint_reach`` reports that the
    last checkpoint of ``g_conf.TEST_SCHEDULE`` was evaluated. Each checkpoint
    that becomes ready is loaded from
    ``_logs/<exp_batch>/<exp_alias>/checkpoints/<n>.pth``, its weights are put
    into the model, and the whole dataset is run through it. Outputs are
    written with ``coil_logger.write_on_csv`` and the per-checkpoint average
    loss / error are logged together with the best values seen so far.

    Args:
        gpu: Value assigned to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Experiment folder name inside ``configs`` and ``_logs``.
        exp_alias: Experiment name (yaml file name without the extension).
        dataset_name: Folder name of the validation dataset inside
            ``COIL_DATASET_PATH``.

    Returns:
        None. Returns early when the monitorer already reports this process
        as "Finished".
    """
    # We set the visible cuda devices
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # At this point the log file with the correct naming is created.
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('validation', dataset_name)

    os.makedirs('_output_logs', exist_ok=True)

    # The redirection is meant to last for the whole process, so the file is
    # intentionally not closed here.
    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)

    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        # TODO: print some cool summary or not ?
        return

    # Define the dataset. This structure has __getitem__ redefined so the HDF
    # files under the root directory can be indexed like a vector.
    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)

    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]))

    # The data loader is the multi threaded module from pytorch that releases
    # a number of workers to get all the data.
    # TODO: batch size an number of workers go to some configuration file
    batch_size = 120
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=12,
                                              pin_memory=True)

    # TODO: here there is clearly a posibility to make a cool "conditioning" system.
    model = CoILModel(g_conf.MODEL_NAME)
    model.cuda()

    latest = get_latest_evaluated_checkpoint()
    if latest is None:  # When nothing was tested, get latest returns none, we fix that.
        latest = 0

    print(dataset.meta_data)
    best_loss = 1000
    best_error = 1000
    best_loss_iter = 0
    best_error_iter = 0

    # Rows of float_data holding the control command and the speed; they only
    # depend on the dataset metadata, so look them up once.
    control_position = np.where(dataset.meta_data[:, 0] == 'control')[0][0]
    speed_position = np.where(
        dataset.meta_data[:, 0] == 'speed_module')[0][0]

    while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):

        if not is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):
            time.sleep(1)
            print("Waiting for the next Validation")
            continue

        latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

        checkpoint = torch.load(
            os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                         str(latest) + '.pth'))
        checkpoint_iteration = checkpoint['iteration']
        print("Validation loaded ", checkpoint_iteration)

        # Evaluate the weights stored in this checkpoint; without this every
        # checkpoint would be scored with the same freshly built model.
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()

        accumulated_loss = 0
        accumulated_error = 0
        iteration_on_checkpoint = 0
        # Pure inference: gradients are never used, so do not record them.
        with torch.no_grad():
            for data in data_loader:

                input_data, float_data = data
                rgb = torch.squeeze(input_data['rgb'])
                print(rgb.shape)

                print(control_position)
                print(speed_position)
                # Obs : Maybe we could also check for other branches ??

                output = model.forward_branch(
                    rgb.cuda(), float_data[:, speed_position, :].cuda(),
                    float_data[:, control_position, :].cuda())

                for i in range(input_data['rgb'].shape[0]):

                    coil_logger.write_on_csv(
                        checkpoint_iteration,
                        [output[i][0], output[i][1], output[i][2]])

                # TODO: Change this a functional standard using the loss functions.

                # NOTE(review): assumes the dataset extract_* helpers have no
                # side effects, so each is called once per batch -- confirm.
                targets = dataset.extract_targets(float_data)
                difference = output - targets.cuda()
                error = torch.abs(difference)
                loss = torch.mean(difference**2).data.tolist()
                mean_error = torch.mean(error).data.tolist()
                accumulated_error += mean_error
                accumulated_loss += loss

                # Log a random position
                position = random.randint(0, len(float_data) - 1)
                coil_logger.add_message(
                    'Iterating', {
                        'Checkpoint': latest,
                        'Iteration':
                        (str(iteration_on_checkpoint * batch_size) + '/' +
                         str(len(dataset))),
                        'MeanError': mean_error,
                        'Loss': loss,
                        'Output': output[position].data.tolist(),
                        'GroundTruth': targets[position].data.tolist(),
                        'Error': error[position].data.tolist(),
                        'Inputs':
                        dataset.extract_inputs(float_data)
                        [position].data.tolist()
                    }, latest)
                iteration_on_checkpoint += 1

        # The accumulators hold one batch mean per batch, so the average is
        # taken over the number of batches, not the number of samples.
        checkpoint_average_loss = accumulated_loss / len(data_loader)
        checkpoint_average_error = accumulated_error / len(data_loader)
        coil_logger.add_scalar('Loss', checkpoint_average_loss, latest)
        coil_logger.add_scalar('Error', checkpoint_average_error, latest)

        if checkpoint_average_loss < best_loss:
            best_loss = checkpoint_average_loss
            best_loss_iter = latest

        # Compare the error against the best error (not the best loss).
        if checkpoint_average_error < best_error:
            best_error = checkpoint_average_error
            best_error_iter = latest

        coil_logger.add_message(
            'Iterating', {
                'Summary': {
                    'Error': checkpoint_average_error,
                    'Loss': checkpoint_average_loss,
                    'BestError': best_error,
                    'BestLoss': best_loss,
                    'BestLossCheckpoint': best_loss_iter,
                    'BestErrorCheckpoint': best_error_iter
                },
                'Checkpoint': latest
            })
Esempio n. 14
0
    def gen_update(self, x_a, x_b, float_data, hyperparameters):
        """Run one generator update followed by one task-network update.

        The generator loss combines adversarial, reconstruction, KL, cycle
        reconstruction, optional VGG and task terms, each weighted by the
        matching entry of ``hyperparameters``; it is back-propagated and
        applied with ``self.gen_opt``. Afterwards the task loss is recomputed
        on the detached embeddings and applied with ``self.task_opt``.

        All individual losses are stored on ``self`` (``loss_gen_*``,
        ``lossF_*``).

        Args:
            x_a: Image batch fed to ``self.gen_a``.
            x_b: Image batch fed to ``self.gen_b``.
            float_data: Measurement tensor from which the task inputs,
                targets and control commands are extracted.
            hyperparameters: Mapping providing ``train_dataset_name`` and the
                weights ``gan_w``, ``recon_x_w``, ``recon_kl_w``,
                ``recon_x_cyc_w``, ``recon_kl_cyc_w``, ``vgg_w`` and
                ``task_w``.

        Returns:
            None.
        """
        self.gen_opt.zero_grad()
        self.task_opt.zero_grad()

        # init data
        # NOTE(review): a dataset object is built on every call only to reach
        # its extract_* / controls_position helpers -- consider caching it.
        full_dataset = hyperparameters['train_dataset_name']
        dataset = CoILDataset(full_dataset,
                              transform=transforms.Compose([
                                  transforms.ToTensor(),
                                  transforms.Normalize((0.5, 0.5, 0.5),
                                                       (0.5, 0.5, 0.5))
                              ]))

        use_cycle = hyperparameters['recon_x_cyc_w'] > 0
        use_vgg = hyperparameters['vgg_w'] > 0

        # encode
        h_a, n_a = self.gen_a.encode(x_a)
        h_b, n_b = self.gen_b.encode(x_b)
        # decode (within domain)
        x_a_recon = self.gen_a.decode(h_a + n_a)
        x_b_recon = self.gen_b.decode(h_b + n_b)
        # decode (cross domain)
        x_ba = self.gen_a.decode(h_b + n_b)
        x_ab = self.gen_b.decode(h_a + n_a)
        # encode again
        h_b_recon, n_b_recon = self.gen_a.encode(x_ba)
        h_a_recon, n_a_recon = self.gen_b.encode(x_ab)
        # decode again (if needed)
        x_aba = self.gen_a.decode(h_a_recon +
                                  n_a_recon) if use_cycle else None
        x_bab = self.gen_b.decode(h_b_recon +
                                  n_b_recon) if use_cycle else None

        # task part
        identity_embed = h_a
        cycle_embed = h_a_recon

        # Extract the measurement tensors once and reuse them for both the
        # generator pass and the task pass.
        # NOTE(review): assumes the dataset extract_* helpers have no side
        # effects -- confirm.
        task_inputs = Variable(dataset.extract_inputs(float_data)).cuda()
        task_targets = Variable(dataset.extract_targets(float_data)).cuda()
        controls = Variable(
            float_data[:, dataset.controls_position(), :]).cuda()

        identity_task = self.netF(identity_embed, task_inputs)
        cycle_task = self.netF(cycle_embed, task_inputs)

        # task loss
        self.lossF_identity_task = self.Task_Loss.MSELoss(
            identity_task, task_targets, controls, task_inputs)
        self.lossF_cycle_task = self.Task_Loss.MSELoss(
            cycle_task, task_targets, controls, task_inputs)
        self.lossF_task = self.lossF_identity_task + self.lossF_cycle_task

        # reconstruction loss
        self.loss_gen_recon_x_a = self.recon_criterion(x_a_recon, x_a)
        self.loss_gen_recon_x_b = self.recon_criterion(x_b_recon, x_b)
        self.loss_gen_recon_kl_a = self.__compute_kl(h_a)
        self.loss_gen_recon_kl_b = self.__compute_kl(h_b)
        # x_aba / x_bab are None when the cycle weight is not positive, so
        # the criterion must not be evaluated in that case.
        self.loss_gen_cyc_x_a = self.recon_criterion(
            x_aba, x_a) if use_cycle else 0
        self.loss_gen_cyc_x_b = self.recon_criterion(
            x_bab, x_b) if use_cycle else 0
        self.loss_gen_recon_kl_cyc_aba = self.__compute_kl(h_a_recon)
        self.loss_gen_recon_kl_cyc_bab = self.__compute_kl(h_b_recon)
        # GAN loss
        self.loss_gen_adv_a = self.dis_a.calc_gen_loss(x_ba)
        self.loss_gen_adv_b = self.dis_b.calc_gen_loss(x_ab)
        # domain-invariant perceptual loss
        self.loss_gen_vgg_a = self.compute_vgg_loss(
            self.vgg, x_ba, x_b) if use_vgg else 0
        self.loss_gen_vgg_b = self.compute_vgg_loss(
            self.vgg, x_ab, x_a) if use_vgg else 0
        # total loss
        self.loss_gen_total = hyperparameters['gan_w'] * self.loss_gen_adv_a + \
                              hyperparameters['gan_w'] * self.loss_gen_adv_b + \
                              hyperparameters['recon_x_w'] * self.loss_gen_recon_x_a + \
                              hyperparameters['recon_kl_w'] * self.loss_gen_recon_kl_a + \
                              hyperparameters['recon_x_w'] * self.loss_gen_recon_x_b + \
                              hyperparameters['recon_kl_w'] * self.loss_gen_recon_kl_b + \
                              hyperparameters['recon_x_cyc_w'] * self.loss_gen_cyc_x_a + \
                              hyperparameters['recon_kl_cyc_w'] * self.loss_gen_recon_kl_cyc_aba + \
                              hyperparameters['recon_x_cyc_w'] * self.loss_gen_cyc_x_b + \
                              hyperparameters['recon_kl_cyc_w'] * self.loss_gen_recon_kl_cyc_bab + \
                              hyperparameters['vgg_w'] * self.loss_gen_vgg_a + \
                              hyperparameters['vgg_w'] * self.loss_gen_vgg_b + \
                              hyperparameters['task_w'] * self.lossF_task
        self.loss_gen_total.backward()
        self.gen_opt.step()

        # ---- task network update ----
        self.task_opt.zero_grad()
        # The embeddings are detached: the graph through the encoders was
        # already consumed by the generator backward pass above, and this
        # step is only meant to move the task network.
        identity_task = self.netF(identity_embed.detach(), task_inputs)
        cycle_task = self.netF(cycle_embed.detach(), task_inputs)

        # task loss
        self.lossF_identity_task = self.Task_Loss.MSELoss(
            identity_task, task_targets, controls, task_inputs)
        self.lossF_cycle_task = self.Task_Loss.MSELoss(
            cycle_task, task_targets, controls, task_inputs)
        self.lossF_task = self.lossF_identity_task + self.lossF_cycle_task

        # Without this backward pass the optimizer step below had no
        # gradients to apply (they were cleared by zero_grad just before).
        self.lossF_task.backward()
        self.task_opt.step()
def execute(gpu, exp_batch, exp_alias):
    """Adversarially train the task network ``netF`` against the critic ``netD_bin``.

    The function merges ``configs/<exp_batch>/<exp_alias>.yaml`` into ``g_conf``,
    redirects ``sys.stdout`` to a per-process file under ``_output_logs`` and then
    iterates once over the data loader.  On every iteration the critic
    ``netD_bin`` is updated; on every ``g_conf.N_CRITIC``-th iteration ``netF`` is
    updated with the task loss plus ``g_conf.TASK_ADV_WEIGHT`` times the
    adversarial loss.  Checkpoints are written below
    ``/datatmp/Experiments/rohitgan/_logs/<exp_batch>/<exp_alias>``.

    Args:
        gpu: value assigned verbatim to the ``CUDA_VISIBLE_DEVICES`` variable.
        exp_batch: experiment folder name (used for the config and checkpoint paths).
        exp_alias: experiment name, i.e. the YAML file name without extension.

    Returns:
        None.  Returns early, before any training, when ``monitorer.get_status``
        reports the process as "Finished".

    NOTE(review): ``netD_bin`` is only created in the 'LSDcontrol_task_2d' branch
    and ``netF`` only in the two '*_task*' branches, yet both are used
    unconditionally afterwards; likewise ``optimF``/``Task_Loss`` only exist when
    ``g_conf.TYPE == 'task'``.  Any other configuration fails with a NameError.
    """

    from time import gmtime, strftime

    # NOTE(review): the seed is read from g_conf *before* merge_with_yaml() runs,
    # so a SEED set in the experiment YAML is not the one used here -- confirm
    # this ordering is intended.
    manualSeed = g_conf.SEED
    torch.cuda.manual_seed(manualSeed)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on everything printed goes to a line-buffered per-process log
    # file; the handle is never closed or restored inside this function.
    sys.stdout = open(os.path.join('_output_logs',
                      g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"), "a", buffering=1)
    # Nothing to do if this experiment was already completed.
    if monitorer.get_status(exp_batch, exp_alias + '.yaml', g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], g_conf.TRAIN_DATASET_NAME)
    # Second image folder passed to CoILDataset; hard-coded to a machine-specific path.
    real_dataset = '/datatmp/Datasets/UNIT_LW1toEW12/trainB' # os.path.join(os.environ["COIL_DATASET_PATH"], "FinalRealWorldDataset")

    #main data loader
    dataset = CoILDataset(full_dataset, real_dataset, transform=transforms.Compose([transforms.ToTensor()]))

    # Batches are produced by the sampler, which is why shuffle stays False.
    sampler = BatchSequenceSampler(splitter.control_steer_split(dataset.measurements, dataset.meta_data),
                          g_conf.BATCH_SIZE, g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler,
                                              shuffle=False, num_workers=6, pin_memory=True)

    # Shorthands that apply an augmentation with probability 0.4 / 0.3 / 0.09.
    st = lambda aug: iag.Sometimes(aug, 0.4)
    oc = lambda aug: iag.Sometimes(aug, 0.3)
    rl = lambda aug: iag.Sometimes(aug, 0.09)
    # The augmenter is only applied inside the loop when g_conf.IF_AUG is truthy.
    augmenter = iag.Augmenter([iag.ToGPU()] + [
        rl(iag.GaussianBlur((0, 1.5))), # blur images with a sigma between 0 and 1.5
        rl(iag.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05), per_channel=0.5)), # add gaussian noise to images
        oc(iag.Dropout((0.0, 0.10), per_channel=0.5)), # randomly remove up to X% of the pixels
        oc(iag.CoarseDropout((0.0, 0.10), size_percent=(0.08, 0.2),per_channel=0.5)), # randomly remove up to X% of the pixels
        oc(iag.Add((-40, 40), per_channel=0.5)), # change brightness of images (by -X to Y of original value)
        st(iag.Multiply((0.10, 2), per_channel=0.2)), # change brightness of images (X-Y% of original value)
        rl(iag.ContrastNormalization((0.5, 1.5), per_channel=0.5)), # improve or worsen the contrast
        rl(iag.Grayscale((0.0, 1))), # put grayscale
        ]# do all of the above in random order
    )


    # l1weight and image_size are assigned but not used again in this function.
    l1weight = g_conf.L1_WEIGHT
    task_adv_weight = g_conf.TASK_ADV_WEIGHT
    image_size = tuple([88, 200])

    # Dump the effective configuration into the (redirected) log.
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    print("Configurations of ", exp_alias)
    print("GANMODEL_NAME", g_conf.GANMODEL_NAME)
    print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION)
    print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE)
    print("SKIP", g_conf.SKIP)
    print("TYPE", g_conf.TYPE)
    print("L1 WEIGHT", g_conf.L1_WEIGHT)
    print("TASK ADV WEIGHT", g_conf.TASK_ADV_WEIGHT)
    print("LAB SMOOTH", g_conf.LABSMOOTH)

    # Model selection.  NOTE(review): the first four branches build netD/netG
    # (and netF for 'LSDcontrol_task'), none of which provides the netD_bin
    # that the rest of the function requires.
    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION, skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch':
        netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller':
        netD = ganmodels_nopatch_smaller._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch_smaller._netG(loss=g_conf.LOSS_FUNCTION).cuda()

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task':
        netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda()

        if g_conf.PRETRAINED == 'RECON':
            # Weights file is looked up relative to the current working directory.
            netF_statedict = torch.load('netF_GAN_Pretrained.wts')
            netF.load_state_dict(netF_statedict)

        elif g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth')
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()

            print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys()))
            # Weights are transferred purely by position: the i-th entry of the
            # checkpoint overwrites the i-th entry of netF, names are not compared.
            # zip() stops at the shorter of the two key lists.
            for i, keys in enumerate(zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                # if newkey.split('.')[0] == "branch" and oldkey.split('.')[0] == "branches":
                #     print("No Transfer of ",  newkey, " to ", oldkey)
                # else:
                print("Transferring ", newkey, " to ", oldkey)
                netF_state_dict[newkey] = model_IL_state_dict[oldkey]

            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")


    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        # The only branch that creates both networks used by the training loop.
        netD_bin = ganmodels_task._netD_task().cuda()
        netF = ganmodels_task._netF().cuda()

        if g_conf.PRETRAINED == 'IL':
            print("Loading IL")
            model_IL = torch.load('Encoder_IL.pth')
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()

            print(len(netF_state_dict.keys()), len(model_IL_state_dict.keys()))
            # Positional weight transfer, same scheme as in the branch above.
            for i, keys in enumerate(zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                print("Transferring ", newkey, " to ", oldkey)
                netF_state_dict[newkey] = model_IL_state_dict[oldkey]

            netF.load_state_dict(netF_state_dict)
            print("IL Model Loaded!")

    init_weights(netD_bin)

    print(netD_bin)
    print(netF)

    optimD_bin = torch.optim.Adam(netD_bin.parameters(), lr=g_conf.LR_D, betas=(0.5, 0.999))
    # NOTE(review): optimF and Task_Loss are used unconditionally in the loop
    # below but only created for TYPE == 'task'.
    if g_conf.TYPE =='task':
        optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE)
        Task_Loss = TaskLoss()

    # Loss and L1_loss are created here but not referenced again in this function.
    if g_conf.GANMODEL_NAME == 'LSDcontrol_task_2d':
        print("Using cross entropy!")
        Loss = torch.nn.CrossEntropyLoss().cuda()

    L1_loss = torch.nn.L1Loss().cuda()

    # Book-keeping.  Only iteration, best_loss_iter_F, best_lossF, best_lossD,
    # accumulated_time and n_critic are read later; best_lossD is never updated,
    # so the checkpoints always store its initial value.
    iteration = 0
    best_loss_iter_F = 0
    best_loss_iter_G = 0
    best_lossF = 1000000.0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0
    gen_iterations = 0
    n_critic = g_conf.N_CRITIC

    # Placeholder values; lossF is overwritten by the first netF update and the
    # lossG_* placeholders are not used afterwards.
    lossF = Variable(torch.Tensor([100.0]))
    lossG_adv = Variable(torch.Tensor([100.0]))
    lossG_smooth = Variable(torch.Tensor([100.0]))
    lossG = Variable(torch.Tensor([100.0]))

    netD_bin.train()
    netF.train()
    capture_time = time.time()

    # The image folder is created but nothing is written to it in this function.
    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    #TODO check how C network is optimized in LSDSEG
    #TODO put family for losses
    #IMPORTANT WHILE RUNNING THIS, CONV.PY MUST HAVE BATCHNORMS

    # Image pools are instantiated but never queried below.
    fake_img_pool_src = ImagePool(50)
    fake_img_pool_tgt = ImagePool(50)

    for data in data_loader:

        set_requires_grad(netD_bin, True)
        set_requires_grad(netF, True)

        # print("ITERATION:", iteration)

        # val is the offset subtracted from the images; it is 0.0 here, so the
        # two subtractions below leave the tensors unchanged.
        val = 0.0
        input_data, float_data, tgt_imgs = data

        if g_conf.IF_AUG:
            inputs = augmenter(0, input_data['rgb'])
            tgt_imgs = augmenter(0, tgt_imgs)
        else:
            inputs = input_data['rgb'].cuda()
            tgt_imgs = tgt_imgs.cuda()

        inputs = inputs.squeeze(1)
        inputs = inputs - val #no-op while val == 0.0 (comment used to say 0.5)
        tgt_imgs = tgt_imgs - val #no-op while val == 0.0 (comment used to say 0.5)

        controls = float_data[:, dataset.controls_position(), :]

        # Source images yield an embedding plus branch outputs.
        # NOTE(review): the target call is not unpacked -- presumably netF
        # returns only the embedding when the second argument is None; confirm.
        src_embed_inputs, src_branches = netF(inputs, dataset.extract_inputs(float_data).cuda())
        tgt_embed_inputs = netF(tgt_imgs, None)

        ##--------------------Discriminator part!!!!!!!!!!-------------------##
        set_requires_grad(netD_bin, True)
        set_requires_grad(netF, False)
        optimD_bin.zero_grad()

        outputsD_real_src_bin = netD_bin(src_embed_inputs)
        outputsD_real_tgt_bin = netD_bin(tgt_embed_inputs)

        # Critic objective: mean(D(target) - D(source)) plus the gradient penalty
        # (looks like a WGAN-GP style loss -- confirm against calc_gradient_penalty).
        gradient_penalty = calc_gradient_penalty(netD_bin, src_embed_inputs, tgt_embed_inputs)
        lossD_bin = torch.mean(outputsD_real_tgt_bin - outputsD_real_src_bin) + gradient_penalty
        lossD_bin.backward(retain_graph=True)
        optimD_bin.step()

        coil_logger.add_scalar('Total LossD Bin', lossD_bin.data, iteration)

        # netF is updated once every n_critic critic updates.
        if ((iteration + 1) % n_critic) == 0:
            #####Task network updates##########################
            set_requires_grad(netD_bin, False)
            set_requires_grad(netF, True)
            optimF.zero_grad()

            # Fresh forward pass so the graph includes netF with gradients enabled.
            src_embed_inputs, src_branches = netF(inputs, dataset.extract_inputs(float_data).cuda())
            tgt_embed_inputs = netF(tgt_imgs, None)
            
            lossF_task = Task_Loss.MSELoss(src_branches, dataset.extract_targets(float_data).cuda(),
                                         controls.cuda(), dataset.extract_inputs(float_data).cuda())

            # Adversarial term has the opposite sign of the critic objective.
            lossF_adv = netD_bin(src_embed_inputs).mean() - netD_bin(tgt_embed_inputs).mean()
            lossF = (lossF_task + task_adv_weight * lossF_adv)

            coil_logger.add_scalar('Total Task Loss', lossF.data, iteration)
            coil_logger.add_scalar('Adv Task Loss', lossF_adv.data, iteration)
            coil_logger.add_scalar('Only Task Loss', lossF_task.data, iteration)
            lossF.backward(retain_graph=True)
            optimF.step()

            # "Best" is tracked on the pure task loss, not on the combined loss.
            if lossF_task.data < best_lossF:
                best_lossF = lossF_task.data.tolist()
                best_loss_iter_F = iteration

            print ("Iteration", iteration, "Best loss F", best_lossF, "BestLossIteration", best_loss_iter_F)

        #optimization for one iter done!

        # position is drawn but not used afterwards in this function.
        position = random.randint(0, len(float_data)-1)

        accumulated_time += time.time() - capture_time
        capture_time = time.time()


        # Periodic checkpoint, scheduled by is_ready_to_save().
        if is_ready_to_save(iteration):

            state = {
                'iteration': iteration,
                'stateD_bin_dict': netD_bin.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F

            }
            torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias
                                           , 'checkpoints', str(iteration) + '.pth'))


        # Overwrite the "best" model whenever this iteration set a new best task
        # loss, but only after the first 10000 iterations.
        if iteration == best_loss_iter_F and iteration > 10000:

            state = {
                'iteration': iteration,
                'stateD_bin_dict': netD_bin.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossF': best_lossF,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(state, os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch, exp_alias
                                           , 'best_modelF' + '.pth'))

        iteration += 1
Esempio n. 16
0
def execute(gpu, exp_batch, exp_alias):
    """Train an image-to-image GAN (generator ``netG`` vs. discriminator ``netD``).

    Merges ``configs/<exp_batch>/<exp_alias>.yaml`` into ``g_conf``, redirects
    ``sys.stdout`` to a per-process file under ``_output_logs`` and iterates once
    over the data loader.  Each iteration performs one discriminator update
    (on pooled fake images and on real images) followed by one generator update
    (adversarial loss blended with an L1 term weighted by ``g_conf.L1_WEIGHT``).
    Checkpoints are written below
    ``/datatmp/Experiments/rohitgan/_logs/<exp_batch>/<exp_alias>``.

    Args:
        gpu: value assigned verbatim to the ``CUDA_VISIBLE_DEVICES`` variable.
        exp_batch: experiment folder name (used for the config and checkpoint paths).
        exp_alias: experiment name, i.e. the YAML file name without extension.

    Returns:
        None.  Returns early, before any training, when ``monitorer.get_status``
        reports the process as "Finished".

    Raises:
        ValueError: if ``g_conf.GANMODEL_NAME`` or ``g_conf.LOSS_FUNCTION`` is
            not one of the values handled below (previously this surfaced later
            as a NameError on ``netD`` / ``Loss``).
    """

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # From here on everything printed goes to a line-buffered per-process log.
    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)
    # Nothing to do if this experiment was already completed.
    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    dataset = CoILDataset(full_dataset,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]))

    # Batches are produced by the sampler, which is why shuffle stays False.
    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements,
                                     dataset.meta_data), g_conf.BATCH_SIZE,
        g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    l1weight = g_conf.L1_WEIGHT

    # Dump the effective configuration into the (redirected) log.
    print("Configurations of ", exp_alias)
    print("GANMODEL_NAME", g_conf.GANMODEL_NAME)
    print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION)
    print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE)
    print("SKIP", g_conf.SKIP)
    print("TYPE", g_conf.TYPE)
    print("L1 WEIGHT", g_conf.L1_WEIGHT)
    print("LAB SMOOTH", g_conf.LABSMOOTH)

    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION,
                               skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch':
        netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller':
        netD = ganmodels_nopatch_smaller._netD(
            loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch_smaller._netG(
            loss=g_conf.LOSS_FUNCTION).cuda()

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task':
        netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda()
        # NOTE(review): netF is built from _netG and is not used anywhere in
        # the training loop below -- kept as-is, confirm whether it is needed.
        netF = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    else:
        # Fail early with a clear message instead of a NameError on netD below.
        raise ValueError("Unsupported GANMODEL_NAME: " +
                         str(g_conf.GANMODEL_NAME))

    init_weights(netD)
    init_weights(netG)

    print(netD)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(),
                              lr=g_conf.LR_D,
                              betas=(0.7, 0.999))
    optimG = torch.optim.Adam(netG.parameters(),
                              lr=g_conf.LR_G,
                              betas=(0.7, 0.999))
    if g_conf.TYPE == 'task':
        # NOTE(review): optimF wraps netG's parameters and neither optimF nor
        # Task_Loss is stepped/used in the loop below -- kept as-is.
        optimF = torch.optim.Adam(netG.parameters(),
                                  lr=g_conf.LEARNING_RATE,
                                  betas=(0.7, 0.999))
        Task_Loss = TaskLoss()

    if g_conf.LOSS_FUNCTION == 'LSGAN':
        Loss = torch.nn.MSELoss().cuda()
    elif g_conf.LOSS_FUNCTION == 'NORMAL':
        Loss = torch.nn.BCELoss().cuda()
    else:
        # Fail early with a clear message instead of a NameError inside the loop.
        raise ValueError("Unsupported LOSS_FUNCTION: " +
                         str(g_conf.LOSS_FUNCTION))

    L1_loss = torch.nn.L1Loss().cuda()

    iteration = 0
    best_loss_iter = 0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0

    netG.train()
    netD.train()
    capture_time = time.time()

    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    #TODO add image queue
    #TODO add auxiliary regression loss for steering
    #TODO put family for losses

    # Pool of previously generated images used to feed the discriminator.
    fake_img_pool = ImagePool(50)

    for data in data_loader:

        # Offset subtracted from the images before they enter the generator.
        val = 0.5
        input_data, float_data = data
        inputs = input_data['rgb'].cuda()
        inputs = inputs.squeeze(1)
        inputs_in = inputs - val

        fake_inputs = netG(inputs_in)  #subtracted by 0.5
        fake_inputs_in = fake_inputs

        # Every 200 iterations dump two real and two generated images.
        if iteration % 200 == 0:
            imgs_to_save = torch.cat((inputs_in[:2] + val, fake_inputs_in[:2]),
                                     0).cpu().data
            vutils.save_image(imgs_to_save,
                              './imgs_' + exp_alias + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=True)
            coil_logger.add_image("Images", imgs_to_save, iteration)

        ##--------------------Discriminator part!!!!!!!!!!-------------------##
        set_requires_grad(netD, True)
        optimD.zero_grad()

        ##fake
        # detach() keeps the discriminator update from back-propagating into netG.
        fake_inputs_forD = fake_img_pool.query(fake_inputs)
        outputsD_fake_forD = netD(fake_inputs_forD.detach())

        labsize = outputsD_fake_forD.size()
        labels_fake = torch.zeros(labsize)  #Fake labels
        # Uniform noise in [-0.025, 0.025) used for label smoothing.
        labels_fake_noise = torch.rand(
            labels_fake.size()) * 0.05 - 0.025  #Label smoothing

        if g_conf.LABSMOOTH == 1:
            # The noise tensor used to be bound to a differently spelled name,
            # which made this branch raise NameError.
            labels_fake = labels_fake + labels_fake_noise

        labels_fake = Variable(labels_fake).cuda()
        lossD_fake = Loss(outputsD_fake_forD, labels_fake)

        ##real
        outputsD_real = netD(inputs)
        print("some d outputs", outputsD_real[0])

        labsize = outputsD_real.size()
        labels_real = torch.ones(labsize)  #Real labels
        labels_real_noise = torch.rand(
            labels_real.size()) * 0.05 - 0.025  #Label smoothing

        if g_conf.LABSMOOTH == 1:
            # Same naming fix as for the fake labels above.
            labels_real = labels_real + labels_real_noise

        labels_real = Variable(labels_real).cuda()
        lossD_real = Loss(outputsD_real, labels_real)

        #Discriminator updates

        lossD = (lossD_real + lossD_fake) * 0.5
        # lossD /= len(inputs)
        lossD.backward()
        optimD.step()

        coil_logger.add_scalar('Total LossD', lossD.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data, iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data, iteration)

        ##--------------------Generator part!!!!!!!!!!-----------------------

        # Freeze the discriminator while the generator is being updated.
        set_requires_grad(netD, False)
        optimG.zero_grad()
        outputsD_fake_forG = netD(fake_inputs)
        #Generator updates

        # The generator is rewarded when its fakes are scored as "real"; the
        # (possibly smoothed) real labels are reused as the target.
        lossG_adv = Loss(outputsD_fake_forG, labels_real)
        lossG_smooth = L1_loss(fake_inputs, inputs)
        lossG = (lossG_adv + l1weight * lossG_smooth) / (1.0 + l1weight)

        lossG.backward()
        optimG.step()

        coil_logger.add_scalar('Total LossG', lossG.data, iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data, iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data, iteration)

        #optimization for one iter done!

        # Random sample of the batch whose targets/inputs get logged.
        position = random.randint(0, len(float_data) - 1)
        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()

        if lossG.data < best_lossG:
            best_lossG = lossG.data.tolist()
            best_loss_iter = iteration

        accumulated_time += time.time() - capture_time
        capture_time = time.time()
        print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(),
              "BestLossD", best_lossD, "BestLossG", best_lossG, "Iteration",
              iteration, "Best Loss Iteration", best_loss_iter)

        coil_logger.add_message(
            'Iterating', {
                'Iteration':
                iteration,
                'LossD':
                lossD.data.tolist(),
                'LossG':
                lossG.data.tolist(),
                'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                'BestLossD':
                best_lossD,
                'BestLossG':
                best_lossG,
                'BestLossIteration':
                best_loss_iter,
                'GroundTruth':
                dataset.extract_targets(float_data)[position].data.tolist(),
                'Inputs':
                dataset.extract_inputs(float_data)[position].data.tolist()
            }, iteration)

        # Periodic checkpoint, scheduled by is_ready_to_save().
        if is_ready_to_save(iteration):

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'checkpoints',
                             str(iteration) + '.pth'))

        # Overwrite the "best generator" file whenever this iteration improved
        # on the best generator loss seen so far.
        if iteration == best_loss_iter:

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter': best_loss_iter
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelG' + '.pth'))

        iteration += 1
Esempio n. 17
0
def execute(gpu, exp_batch, exp_alias, dataset_name, suppress_output):
    """Validate every scheduled training checkpoint on ``dataset_name``.

    Merges ``configs/<exp_batch>/<exp_alias>.yaml`` into ``g_conf`` and then
    loops until the last checkpoint of ``g_conf.TEST_SCHEDULE`` has been
    evaluated.  For each checkpoint that is ready, the weights are loaded from
    ``_logs/<exp_batch>/<exp_alias>/checkpoints/<n>.pth``, the whole validation
    set is run through ``model.forward_branch`` and the average MSE / absolute
    error are logged through ``coil_logger``.  While no checkpoint is ready the
    function sleeps one second and polls again.

    Args:
        gpu: sequence of GPU id strings; it is joined with ',' to build
            ``CUDA_VISIBLE_DEVICES``.  NOTE(review): a plain string is joined
            character by character -- confirm what callers pass.
        exp_batch: experiment folder name.
        exp_alias: experiment name, i.e. the YAML file name without extension.
        dataset_name: validation dataset folder below ``COIL_DATASET_PATH``;
            also used as the preload name and for the logger files.
        suppress_output: when truthy, ``sys.stdout`` and ``sys.stderr`` are
            redirected to per-process files under ``_output_logs``.

    Returns:
        None.  All exceptions are caught, logged through ``coil_logger`` and,
        when a checkpoint was in progress, its CSV output is erased.
    """
    latest = None
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpu)

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch, exp_alias+'.yaml'))

        # The validation dataset is always fully loaded, so we fix a very high number of hours
        g_conf.NUMBER_OF_HOURS = 10000
        set_type_of_process('validation', dataset_name)

        if not os.path.exists('_output_logs'):
            os.mkdir('_output_logs')

        if suppress_output:
            sys.stdout = open(os.path.join('_output_logs',
                                           exp_alias + '_' + g_conf.PROCESS_NAME + '_'
                                           + str(os.getpid()) + ".out"),
                              "a", buffering=1)
            sys.stderr = open(os.path.join('_output_logs',
                              exp_alias + '_err_' + g_conf.PROCESS_NAME + '_'
                                           + str(os.getpid()) + ".out"),
                              "a", buffering=1)

        # Define the dataset. This structure has __getitem__ redefined so that the
        # HDF file positions under the root directory can be accessed like a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
        augmenter = Augmenter(None)
        # Definition of the dataset to be used. Preload name is just the validation data name
        dataset = CoILDataset(full_dataset, transform=augmenter, preload_name=dataset_name)

        # The data loader is the multi-threaded module from pytorch that releases a
        # number of workers to get all the data.
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=g_conf.BATCH_SIZE,
                                                  shuffle=False,
                                                  num_workers=g_conf.NUMBER_OF_LOADING_WORKERS,
                                                  pin_memory=True)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        # The window used to keep track of the validation errors
        l1_window = []
        latest = get_latest_evaluated_checkpoint()
        if latest is not None:  # Some checkpoints were already evaluated: resume the window
            l1_window = coil_logger.recover_loss_window(dataset_name, None)

        model.cuda()

        best_mse = 1000
        best_error = 1000
        best_mse_iter = 0
        best_error_iter = 0

        while not maximun_checkpoint_reach(latest, g_conf.TEST_SCHEDULE):

            if is_next_checkpoint_ready(g_conf.TEST_SCHEDULE):

                latest = get_next_checkpoint(g_conf.TEST_SCHEDULE)

                checkpoint = torch.load(os.path.join('_logs', exp_batch, exp_alias
                                        , 'checkpoints', str(latest) + '.pth'))
                checkpoint_iteration = checkpoint['iteration']
                print("Validation loaded ", checkpoint_iteration)

                # The model is deliberately NOT wrapped in torch.nn.DataParallel.
                # The wrapper used to be re-applied for every checkpoint, so on a
                # multi-GPU machine the second checkpoint failed (state dict
                # loaded into the wrapper, and `model.module` became another
                # wrapper without `forward_branch`).  The forward pass was always
                # issued on the underlying module, so results are unchanged.
                model.load_state_dict(checkpoint['state_dict'])

                model.eval()

                accumulated_mse = 0
                accumulated_error = 0
                iteration_on_checkpoint = 0
                for data in data_loader:

                    # Compute the forward pass on a batch from the validation dataset
                    controls = data['directions']
                    inputs = dataset.extract_inputs(data)
                    output = model.forward_branch(torch.squeeze(data['rgb']).cuda(),
                                                  inputs.cuda(),
                                                  controls)
                    # It could be either waypoints or direct control
                    if 'waypoint1_angle' in g_conf.TARGETS:
                        write_waypoints_output(checkpoint_iteration, output)
                    else:
                        write_regular_output(checkpoint_iteration, output)

                    # Extract the targets once per batch and reuse them.
                    targets = dataset.extract_targets(data)
                    targets_gpu = targets.cuda()
                    error = torch.abs(output - targets_gpu)
                    mse = torch.mean((output - targets_gpu)**2).data.tolist()
                    mean_error = torch.mean(error).data.tolist()

                    accumulated_error += mean_error
                    accumulated_mse += mse

                    # Log a random position
                    position = random.randint(0, len(output.data.tolist())-1)

                    # Progress is reported as samples seen / dataset size, using
                    # the batch size the loader was actually built with.
                    coil_logger.add_message('Iterating',
                         {'Checkpoint': latest,
                          'Iteration': (str(iteration_on_checkpoint*g_conf.BATCH_SIZE)
                                        + '/' + str(len(dataset))),
                          'MeanError': mean_error,
                          'MSE': mse,
                          'Output': output[position].data.tolist(),
                          'GroundTruth': targets[position].data.tolist(),
                          'Error': error[position].data.tolist(),
                          'Inputs': inputs[position].data.tolist()},
                          latest)
                    iteration_on_checkpoint += 1
                    print("Iteration %d  on Checkpoint %d : Error %f" % (iteration_on_checkpoint,
                                                                checkpoint_iteration, mean_error))

                ########
                # Finish a round of validation, write results, wait for the next
                ########

                checkpoint_average_mse = accumulated_mse/(len(data_loader))
                checkpoint_average_error = accumulated_error/(len(data_loader))
                coil_logger.add_scalar('Loss', checkpoint_average_mse, latest, True)
                coil_logger.add_scalar('Error', checkpoint_average_error, latest, True)

                if checkpoint_average_mse < best_mse:
                    best_mse = checkpoint_average_mse
                    best_mse_iter = latest

                if checkpoint_average_error < best_error:
                    best_error = checkpoint_average_error
                    best_error_iter = latest

                coil_logger.add_message('Iterating',
                     {'Summary':
                         {
                          'Error': checkpoint_average_error,
                          'Loss': checkpoint_average_mse,
                          'BestError': best_error,
                          'BestMSE': best_mse,
                          'BestMSECheckpoint': best_mse_iter,
                          'BestErrorCheckpoint': best_error_iter
                         },

                      'Checkpoint': latest},
                                        latest)

                l1_window.append(checkpoint_average_error)
                coil_logger.write_on_error_csv(dataset_name, checkpoint_average_error)

                # If finishing on stale validation is enabled, stop once both
                # step counters report more than 3 steps without a decrease.
                if g_conf.FINISH_ON_VALIDATION_STALE is not None:
                    if dlib.count_steps_without_decrease(l1_window) > 3 and \
                            dlib.count_steps_without_decrease_robust(l1_window) > 3:
                        coil_logger.write_stop(dataset_name, latest)
                        break

            else:

                # No checkpoint ready yet: refresh the marker and poll again.
                latest = get_latest_evaluated_checkpoint()
                time.sleep(1)
                # print ('checkpoint: ', latest)

                coil_logger.add_message('Loading', {'Message': 'Waiting Checkpoint'})
                print("Waiting for the next Validation")

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)

    except RuntimeError as e:
        if latest is not None:
            coil_logger.erase_csv(latest)
        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        # Deliberate catch-all at the process boundary: the failure is printed
        # and reported through the logger instead of propagating.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
        # We erase the output that was unfinished due to some process stop.
        if latest is not None:
            coil_logger.erase_csv(latest)
Esempio n. 18
0
def execute(gpu,
            exp_batch,
            exp_alias,
            suppress_output=True,
            number_of_workers=12):
    """
        The main training function. This function loads the latest checkpoint
        for a given exp_batch (folder) and exp_alias (experiment configuration).
        With this checkpoint it starts from the beginning or continues a
        previous training run.

        Any exception raised while training is caught inside this function and
        reported through coil_logger as an 'Error' message; nothing propagates
        to the caller.
    Args:
        gpu: The GPU number, as the string written to CUDA_VISIBLE_DEVICES
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name
        suppress_output: if True, stdout and stderr are redirected to files
            inside the '_output_logs' folder
        number_of_workers: the number of workers used for data loading
            (forwarded to select_balancing_strategy)

    Returns:
        None

    """
    try:
        # We set the visible cuda devices to select the GPU
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu
        g_conf.VARIABLE_WEIGHT = {}
        # At this point the log file with the correct naming is created.
        # You merge the yaml file with the global configuration structure.
        merge_with_yaml(os.path.join('configs', exp_batch,
                                     exp_alias + '.yaml'))
        set_type_of_process('train')
        # Set the process into loading status.
        coil_logger.add_message('Loading', {'GPU': gpu})

        # Seed RNGs
        torch.manual_seed(g_conf.MAGICAL_SEED)
        random.seed(g_conf.MAGICAL_SEED)

        # Put the output to a separate file if it is the case
        if suppress_output:
            # exist_ok avoids a crash when another process creates the folder
            # between an existence check and the mkdir call.
            os.makedirs('_output_logs', exist_ok=True)
            # The redirection is meant to last for the whole process, so the
            # files are intentionally left open.
            sys.stdout = open(os.path.join(
                '_output_logs', exp_alias + '_' + g_conf.PROCESS_NAME + '_' +
                str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)
            sys.stderr = open(os.path.join(
                '_output_logs', exp_alias + '_err_' + g_conf.PROCESS_NAME +
                '_' + str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)

        if coil_logger.check_finish('train'):
            coil_logger.add_message('Finished', {})
            return

        # Preload option
        if g_conf.PRELOAD_MODEL_ALIAS is not None:
            checkpoint = torch.load(
                os.path.join('_logs', g_conf.PRELOAD_MODEL_BATCH,
                             g_conf.PRELOAD_MODEL_ALIAS, 'checkpoints',
                             str(g_conf.PRELOAD_MODEL_CHECKPOINT) + '.pth'))

        # Folder where the checkpoints of this experiment are read and written
        checkpoints_dir = os.path.join('_logs', exp_batch, exp_alias,
                                       'checkpoints')

        # Get the latest checkpoint to be loaded
        # returns none if there are no checkpoints saved for this model
        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            # Reuse the name we just looked up instead of querying again, so
            # the file we test for and the file we load are always the same.
            # A checkpoint of this experiment overrides a preloaded one.
            checkpoint = torch.load(
                os.path.join(checkpoints_dir, str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
        else:
            iteration = 0
            best_loss = 10000.0
            best_loss_iter = 0

        # Define the dataset.
        # Can specify a list of training datasets or just a single training dataset
        if len(g_conf.TRAIN_DATASET_NAMES) == 0:
            train_dataset_list = [g_conf.TRAIN_DATASET_NAME]
        else:
            train_dataset_list = g_conf.TRAIN_DATASET_NAMES
        full_dataset = [
            os.path.join(os.environ["COIL_DATASET_PATH"], dataset_name)
            for dataset_name in train_dataset_list
        ]

        # By instantiating the augmenter we get a callable that augment images and transform them
        # into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        # Instantiate the class used to read a dataset. The preload names
        # follow the '<NUMBER_OF_HOURS>hours_<dataset name>' convention.
        dataset = CoILDataset(full_dataset,
                              transform=augmenter,
                              preload_names=[
                                  str(g_conf.NUMBER_OF_HOURS) + 'hours_' +
                                  dataset_name
                                  for dataset_name in train_dataset_list
                              ],
                              train_dataset=True)
        print("Loaded dataset")

        # Create dataloader, model, and optimizer
        data_loader = select_balancing_strategy(dataset, iteration,
                                                number_of_workers)
        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        model.cuda()
        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)

        # If we have a previous checkpoint, load model, optimizer, and record of previous
        # train loss values (used for the learning rate schedule)
        if checkpoint_file is not None or g_conf.PRELOAD_MODEL_ALIAS is not None:
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            accumulated_time = checkpoint['total_time']
            loss_window = coil_logger.recover_loss_window('train', iteration)
        else:  # We accumulate iteration time and keep the average speed
            accumulated_time = 0
            loss_window = []

        print("Before the loss")

        # Define control loss function
        criterion = Loss(g_conf.LOSS_FUNCTION)

        # Save the untrained model as checkpoint 0 when the schedule asks for it
        if iteration == 0 and is_ready_to_save(iteration):

            state = {
                'iteration': iteration,
                'state_dict': model.state_dict(),
                'best_loss': best_loss,
                'total_time': accumulated_time,
                'optimizer': optimizer.state_dict(),
                'best_loss_iter': best_loss_iter
            }
            torch.save(state,
                       os.path.join(checkpoints_dir, str(iteration) + '.pth'))

        # Training loop
        for data in data_loader:

            # Basically in this mode of execution, we validate every X Steps, if it goes up 3 times,
            # add a stop on the _logs folder that is going to be read by this process
            if g_conf.FINISH_ON_VALIDATION_STALE is not None and \
                    check_loss_validation_stopped(iteration, g_conf.FINISH_ON_VALIDATION_STALE):
                break

            ####################################
            #   Main optimization loop
            ####################################

            iteration += 1

            # Adjust learning rate based on training loss
            if iteration % 1000 == 0:
                adjust_learning_rate_auto(optimizer, loss_window)

            capture_time = time.time()
            model.zero_grad()

            controls = data['directions']

            # Run model forward and get outputs
            # First case corresponds to training squeeze network, second case corresponds to training driving model without
            # mimicking losses, last case corresponds to training mimic network
            if "seg" in g_conf.SENSORS.keys():
                branches = model(data,
                                 dataset.extract_inputs(data).cuda(),
                                 dataset.extract_intentions(data).cuda())
            elif not g_conf.USE_REPRESENTATION_LOSS:
                branches = model(data, dataset.extract_inputs(data).cuda())
            else:
                branches, intermediate_reps = model(
                    data,
                    dataset.extract_inputs(data).cuda())

            # Compute control loss
            targets_to_use = dataset.extract_targets(data)
            loss_function_params = {
                'branches': branches,
                'targets': targets_to_use.cuda(),
                'controls': controls.cuda(),
                'inputs': dataset.extract_inputs(data).cuda(),
                'branch_weights': g_conf.BRANCH_LOSS_WEIGHT,
                'variable_weights': g_conf.VARIABLE_WEIGHT
            }
            loss, _ = criterion(loss_function_params)

            # Compute mimicking loss: squared difference between the model's
            # intermediate representations and the expert ones, summed and
            # divided by the first dimension of the first branch output.
            if g_conf.USE_REPRESENTATION_LOSS:
                expert_reps = dataset.extract_representations(data)
                # Seg mask mimicking loss
                if g_conf.USE_PERCEPTION_REP_LOSS:
                    perception_rep_loss_elementwise = (
                        intermediate_reps[0] - expert_reps[0].cuda())**2
                    perception_rep_loss = g_conf.PERCEPTION_REP_WEIGHT * torch.sum(
                        perception_rep_loss_elementwise) / branches[0].shape[0]
                else:
                    perception_rep_loss = torch.tensor(0.).cuda()
                # Speed mimicking loss
                if g_conf.USE_SPEED_REP_LOSS:
                    speed_rep_loss_elementwise = (intermediate_reps[1] -
                                                  expert_reps[1].cuda())**2
                    speed_rep_loss = g_conf.SPEED_REP_WEIGHT * torch.sum(
                        speed_rep_loss_elementwise) / branches[0].shape[0]
                else:
                    speed_rep_loss = torch.tensor(0.).cuda()
                # Stop intentions mimicking loss
                if g_conf.USE_INTENTION_REP_LOSS:
                    intentions_rep_loss_elementwise = (
                        intermediate_reps[2] - expert_reps[2].cuda())**2
                    intentions_rep_loss = g_conf.INTENTIONS_REP_WEIGHT * torch.sum(
                        intentions_rep_loss_elementwise) / branches[0].shape[0]
                else:
                    intentions_rep_loss = torch.tensor(0.).cuda()
                rep_loss = g_conf.REP_LOSS_WEIGHT * (
                    perception_rep_loss + speed_rep_loss + intentions_rep_loss)
                overall_loss = loss + rep_loss
            else:
                overall_loss = loss
            overall_loss.backward()
            optimizer.step()

            ####################################
            #   Saving the model if necessary
            ####################################

            if is_ready_to_save(iteration):

                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'optimizer': optimizer.state_dict(),
                    'best_loss_iter': best_loss_iter
                }
                torch.save(
                    state,
                    os.path.join(checkpoints_dir, str(iteration) + '.pth'))

            ################################################
            #   Adding tensorboard logs.
            #   Making calculations for logging purposes.
            #   These logs are monitored by the printer module.
            ################################################

            coil_logger.add_scalar('Loss', loss.data, iteration)
            if g_conf.USE_REPRESENTATION_LOSS:
                coil_logger.add_scalar('Perception Rep Loss',
                                       perception_rep_loss.data, iteration)
                coil_logger.add_scalar('Speed Rep Loss', speed_rep_loss.data,
                                       iteration)
                coil_logger.add_scalar('Intentions Rep Loss',
                                       intentions_rep_loss.data, iteration)
                coil_logger.add_scalar('Overall Rep Loss', rep_loss.data,
                                       iteration)
                coil_logger.add_scalar('Total Loss', overall_loss.data,
                                       iteration)
            if 'rgb' in data:
                coil_logger.add_image('Image', torch.squeeze(data['rgb']),
                                      iteration)
            if overall_loss.data < best_loss:
                best_loss = overall_loss.data.tolist()
                best_loss_iter = iteration

            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - targets_to_use.cuda())

            # Log a random position of the batch. `data` is a dict, so
            # len(data) would be the number of keys rather than the number of
            # samples; the range is taken from the tensor that is indexed.
            position = random.randint(0, output.shape[0] - 1)

            accumulated_time += time.time() - capture_time

            # Log to terminal and log file. Without the representation loss
            # overall_loss is the control loss itself, so 'Loss' carries the
            # same value in both configurations.
            log_entry = {
                'Iteration': iteration,
                'Loss': overall_loss.data.tolist(),
            }
            if g_conf.USE_REPRESENTATION_LOSS:
                log_entry['Control Loss'] = loss.data.tolist()
                log_entry['Rep Loss'] = rep_loss.data.tolist()
            log_entry.update({
                'Images/s':
                (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                'BestLoss':
                best_loss,
                'BestLossIteration':
                best_loss_iter,
                'Output':
                output[position].data.tolist(),
                'GroundTruth':
                targets_to_use[position].data.tolist(),
                'Error':
                error[position].data.tolist(),
                'Inputs':
                dataset.extract_inputs(data)[position].data.tolist()
            })
            coil_logger.add_message('Iterating', log_entry, iteration)

            # Save training loss history (useful for restoring training runs since learning rate is adjusted
            # based on training loss)
            loss_window.append(overall_loss.data.tolist())
            coil_logger.write_on_error_csv('train', overall_loss.data)
            print("Iteration: %d  Loss: %f" % (iteration, overall_loss.data))

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except RuntimeError as e:

        coil_logger.add_message('Error', {'Message': str(e)})

    except:
        # Catch-all kept as is: whatever goes wrong, the traceback is printed
        # and the failure is recorded as an 'Error' message in the log.
        traceback.print_exc()
        coil_logger.add_message('Error', {'Message': 'Something Happened'})
def execute(gpu, exp_batch, exp_alias):
    """
        Training entry point for the GAN variant: builds a discriminator
        (netD), a generator (netG) and, for the 'LSDcontrol_task' model, a
        task network (netF), then iterates over the data loader updating
        them and saving checkpoints.

        Unlike the other execute functions in this file there is no
        try/except around the body, so any error propagates to the caller.

        NOTE(review): this function looks unfinished. Several names it reads
        are never assigned inside it (see the NOTE(review) comments below);
        unless they exist at module level it cannot complete one iteration.
    Args:
        gpu: The GPU number, as the string written to CUDA_VISIBLE_DEVICES
        exp_batch: the folder with the experiments
        exp_alias: the alias, experiment name

    Returns:
        None

    """

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    merge_with_yaml(os.path.join('configs', exp_batch, exp_alias + '.yaml'))
    set_type_of_process('train')

    coil_logger.add_message('Loading', {'GPU': gpu})
    if not os.path.exists('_output_logs'):
        os.mkdir('_output_logs')
    # stdout is always redirected to a per-process file (there is no
    # suppress_output switch in this variant); stderr is left untouched.
    sys.stdout = open(os.path.join(
        '_output_logs', g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                      "a",
                      buffering=1)
    # Nothing to do if the monitorer already reports this process as finished.
    if monitorer.get_status(exp_batch, exp_alias + '.yaml',
                            g_conf.PROCESS_NAME)[0] == "Finished":
        return

    full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                g_conf.TRAIN_DATASET_NAME)
    # The second dataset folder name is hard-coded.
    real_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                "FinalRealWorldDataset")

    #main data loader
    dataset = CoILDataset(full_dataset,
                          real_dataset,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]))

    sampler = BatchSequenceSampler(
        splitter.control_steer_split(dataset.measurements,
                                     dataset.meta_data), g_conf.BATCH_SIZE,
        g_conf.NUMBER_IMAGES_SEQUENCE, g_conf.SEQUENCE_STRIDE)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=sampler,
                                              shuffle=False,
                                              num_workers=6,
                                              pin_memory=True)

    l1weight = g_conf.L1_WEIGHT
    # NOTE(review): image_size is not read anywhere else in this function.
    image_size = tuple([88, 200])

    # NOTE(review): n_critic is only assigned for TRAIN_TYPE == 'WGAN' but is
    # read unconditionally in the generator section of the loop. clamp_value
    # is only referenced from commented-out code.
    if g_conf.TRAIN_TYPE == 'WGAN':
        clamp_value = g_conf.CLAMP
        n_critic = g_conf.N_CRITIC

    print("Configurations of ", exp_alias)
    print("GANMODEL_NAME", g_conf.GANMODEL_NAME)
    print("LOSS_FUNCTION", g_conf.LOSS_FUNCTION)
    print("LR_G, LR_D, LR", g_conf.LR_G, g_conf.LR_D, g_conf.LEARNING_RATE)
    print("SKIP", g_conf.SKIP)
    print("TYPE", g_conf.TYPE)
    print("L1 WEIGHT", g_conf.L1_WEIGHT)
    print("LAB SMOOTH", g_conf.LABSMOOTH)

    # Select the network implementations from the configured model name.
    # NOTE(review): netF is only created in the 'LSDcontrol_task' branch, yet
    # it is used unconditionally below (print, train(), forward passes). If
    # no branch matches, netD and netG are not created either.
    if g_conf.GANMODEL_NAME == 'LSDcontrol':
        netD = ganmodels._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels._netG(loss=g_conf.LOSS_FUNCTION,
                               skip=g_conf.SKIP).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch':
        netD = ganmodels_nopatch._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch._netG(loss=g_conf.LOSS_FUNCTION).cuda()
    elif g_conf.GANMODEL_NAME == 'LSDcontrol_nopatch_smaller':
        netD = ganmodels_nopatch_smaller._netD(
            loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_nopatch_smaller._netG(
            loss=g_conf.LOSS_FUNCTION).cuda()

    elif g_conf.GANMODEL_NAME == 'LSDcontrol_task':
        netD = ganmodels_task._netD(loss=g_conf.LOSS_FUNCTION).cuda()
        netG = ganmodels_task._netG(loss=g_conf.LOSS_FUNCTION).cuda()
        netF = ganmodels_task._netF(loss=g_conf.LOSS_FUNCTION).cuda()

        # Optional initialization of netF from weight files whose names are
        # hard-coded and resolved relative to the working directory.
        if g_conf.PRETRAINED == 'RECON':
            netF_statedict = torch.load('netF_GAN_Pretrained.wts')
            netF.load_state_dict(netF_statedict)
        elif g_conf.PRETRAINED == 'IL':
            model_IL = torch.load('best_loss_20-06_EpicClearWeather.pth')
            model_IL_state_dict = model_IL['state_dict']

            netF_state_dict = netF.state_dict()
            # Keys of both state dicts are paired purely by position (zip),
            # not by name. The loop index i is not used.
            for i, keys in enumerate(
                    zip(netF_state_dict.keys(), model_IL_state_dict.keys())):
                newkey, oldkey = keys
                # Pairs where netF has a "branch" entry and the loaded model
                # a "branches" entry are skipped.
                if newkey.split('.')[0] == "branch" and oldkey.split(
                        '.')[0] == "branches":
                    print("No Transfer of ", newkey, " to ", oldkey)
                else:
                    # The value of oldkey (loaded model) is copied into
                    # newkey (netF); the printed message words it the other
                    # way round.
                    print("Transferring ", newkey, " to ", oldkey)
                    netF_state_dict[newkey] = model_IL_state_dict[oldkey]
                # NOTE(review): this call sits inside the for loop, so the
                # whole state dict is reloaded on every key pair - confirm
                # whether it was meant to run once after the loop.
                netF.load_state_dict(netF_state_dict)

    init_weights(netD)
    init_weights(netG)
    #do init for netF also later but now it is in the model code itself

    print(netD)
    print(netF)
    print(netG)

    optimD = torch.optim.Adam(netD.parameters(),
                              lr=g_conf.LR_D,
                              betas=(0.5, 0.999))
    optimG = torch.optim.Adam(netG.parameters(),
                              lr=g_conf.LR_G,
                              betas=(0.5, 0.999))
    # NOTE(review): optimF and Task_Loss only exist when TYPE == 'task', but
    # the task-network update in the loop uses them unconditionally.
    if g_conf.TYPE == 'task':
        optimF = torch.optim.Adam(netF.parameters(), lr=g_conf.LEARNING_RATE)
        Task_Loss = TaskLoss()

    # NOTE(review): Loss is only referenced from a trailing comment further
    # down; the active losses below are plain means of the netD outputs.
    if g_conf.LOSS_FUNCTION == 'LSGAN':
        Loss = torch.nn.MSELoss().cuda()
    elif g_conf.LOSS_FUNCTION == 'NORMAL':
        Loss = torch.nn.BCEWithLogitsLoss().cuda()

    L1_loss = torch.nn.L1Loss().cuda()

    # Training always starts from iteration 0: no checkpoint is loaded here.
    iteration = 0
    best_loss_iter_F = 0
    best_loss_iter_G = 0
    best_lossF = 1000000.0
    best_lossD = 1000000.0
    best_lossG = 1000000.0
    accumulated_time = 0
    # Placeholder values so the generator/task losses can be logged on
    # iterations where the generator branch does not run.
    lossF = Variable(torch.Tensor([100.0]))

    lossG_adv = Variable(torch.Tensor([100.0]))
    lossG_smooth = Variable(torch.Tensor([100.0]))
    lossG = Variable(torch.Tensor([100.0]))

    netG.train()
    netD.train()
    netF.train()
    capture_time = time.time()

    # Folder for the sample images written every 500 iterations.
    if not os.path.exists('./imgs_' + exp_alias):
        os.mkdir('./imgs_' + exp_alias)

    #TODO put family for losses

    # NOTE(review): fake_img_pool is only referenced from commented-out code.
    fake_img_pool = ImagePool(50)

    for data in data_loader:

        set_requires_grad(netD, True)
        set_requires_grad(netF, True)
        set_requires_grad(netG, True)

        # print("ITERATION:", iteration)

        # val is 0.0, so the two subtractions below currently leave the
        # images unchanged.
        val = 0.0
        input_data, float_data, tgt_imgs = data
        inputs = input_data['rgb'].cuda()
        tgt_imgs = tgt_imgs.cuda()

        inputs = inputs.squeeze(1)
        inputs = inputs - val  # shift by val (0.0 here, i.e. no shift)
        tgt_imgs = tgt_imgs - val  # shift by val (0.0 here, i.e. no shift)

        #TODO: make sure the F network does not get optimized by G optim
        controls = float_data[:, dataset.controls_position(), :]
        # netF is called in two ways: with measurements it returns an
        # (embedding, branches) pair, with None a single value.
        src_embed_inputs, src_branches = netF(
            inputs,
            dataset.extract_inputs(float_data).cuda())
        tgt_embed_inputs = netF(tgt_imgs, None)

        # The embeddings are detached before being fed to the generator.
        src_fake_inputs = netG(src_embed_inputs.detach())
        tgt_fake_inputs = netG(tgt_embed_inputs.detach())

        if iteration % 500 == 0:
            # NOTE(review): inputs_in and fake_inputs_in are not assigned
            # anywhere in this function; this branch runs at iteration 0.
            imgs_to_save = torch.cat(
                (inputs_in[:2] + val, fake_inputs_in[:2] + val), 0).cpu().data
            vutils.save_image(imgs_to_save,
                              './imgs_' + exp_alias + '/' + str(iteration) +
                              '_real_and_fake.png',
                              normalize=True)
            coil_logger.add_image("Images", imgs_to_save, iteration)

        ##--------------------Discriminator part!!!!!!!!!!-------------------##
        set_requires_grad(netD, True)
        set_requires_grad(netF, False)
        set_requires_grad(netG, False)
        optimD.zero_grad()

        ##fake
        # fake_inputs_forD = fake_img_pool.query(fake_inputs)
        outputsD_src_fake_forD = netD(src_fake_inputs.detach())

        labsize = outputsD_src_fake_forD.size()
        if g_conf.LOSS_FUNCTION == 'NORMAL':
            labsize = labsize[0]
        print("Discriminator label size", labsize)

        # NOTE(review): labsize is already the result of .size() (or its
        # first element), so calling .size() on it again looks wrong -
        # confirm. The two noise tensors are not used afterwards.
        if g_conf.LABSMOOTH:
            label_real_noise = torch.rand(
                labsize.size()) * 0.1  #Label smoothing
            label_fake_noise = torch.rand(labsize.size()) * 0.1

        # Four label tensors (values 1 to 4) are built below, one per
        # source/target x fake/real combination.
        # NOTE(review): none of them is read by the losses that follow.
        labels_src_fake = torch.zeros(labsize).type(
            torch.LongTensor) + 1  #Fake labels
        labels_src_fake = Variable(labels_src_fake).cuda()

        ##source real
        outputsD_src_real_forD = netD(inputs)  # Pass real domain image here
        labels_src_real = torch.zeros(labsize).type(
            torch.LongTensor) + 2  #Real labels
        labels_src_real = Variable(labels_src_real).cuda()

        ##target fake
        outputsD_tgt_fake_forD = netD(tgt_fake_inputs.detach())
        labels_tgt_fake = torch.zeros(labsize).type(torch.LongTensor) + 3
        labels_tgt_fake = Variable(labels_tgt_fake).cuda()

        ##target real
        outputsD_tgt_real_forD = netD(tgt_imgs)  # Pass real domain image here
        labels_tgt_real = torch.zeros(labsize).type(
            torch.LongTensor) + 4  #Real labels
        labels_tgt_real = Variable(labels_tgt_real).cuda()

        #discriminator losses
        # Mean discriminator output, with a negative sign for real images.
        lossD_src_fake = torch.mean(outputsD_src_fake_forD)
        lossD_src_real = -1.0 * torch.mean(outputsD_src_real_forD)
        lossD_tgt_fake = torch.mean(outputsD_tgt_fake_forD)
        lossD_tgt_real = -1.0 * torch.mean(outputsD_tgt_real_forD)

        # NOTE(review): src_fa is not assigned in this function, and the
        # next three lines bind the function object itself instead of
        # calling it; none of the gp_* values is read afterwards.
        gp_src = calc_gradient_penalty(netD, inputs, src_fa)
        gp_tgt = calc_gradient_penalty
        gp_src_tgt = calc_gradient_penalty
        gp_tgt_src = calc_gradient_penalty

        ### Gradient Penalty ###
        # NOTE(review): fake_inputs is not assigned in this function.
        gradient_penalty = calc_gradient_penalty(netD, inputs, fake_inputs)

        # Average of the four discriminator terms.
        # NOTE(review): this value is overwritten by the second lossD
        # assignment below before it is used.
        lossD = (lossD_src_real + lossD_src_fake + lossD_tgt_real +
                 lossD_tgt_fake) * 0.25

        # alpha = torch.rand((g_conf.BATCH_SIZE, 1, 1, 1))
        # alpha = alpha.cuda()
        #
        # x_hat = alpha * inputs.data + (1 - alpha) * fake_inputs.data
        # x_hat.requires_grad = True
        #
        # pred_hat = netD(x_hat)
        # gradients = grad(outputs=pred_hat, inputs=x_hat, grad_outputs=torch.ones(pred_hat.size()).cuda(),
        #                 create_graph=True, retain_graph=True, only_inputs=True)[0]
        #
        # gradient_penalty = 10 * ((gradients.view(gradients.size()[0], -1).norm(2, 1) - 1) ** 2).mean()

        #Discriminator updates

        # NOTE(review): outputsD_fake_forD and outputsD_real are not
        # assigned in this function (the tensors computed above carry
        # _src_/_tgt_ in their names).
        lossD = torch.mean(
            outputsD_fake_forD -
            outputsD_real) + gradient_penalty  #(lossD_real + lossD_fake) * 0.5
        # lossD /= len(inputs)
        print("Loss d", lossD)
        lossD.backward(retain_graph=True)
        optimD.step()

        # if g_conf.TRAIN_TYPE == 'WGAN':
        #     for p in netD.parameters():
        #         p.data.clamp_(-clamp_value, clamp_value)

        # NOTE(review): lossD_real and lossD_fake are not assigned in this
        # function (only the lossD_src_* / lossD_tgt_* variants are).
        coil_logger.add_scalar('Total LossD', lossD.data, iteration)
        coil_logger.add_scalar('Real LossD', lossD_real.data / len(inputs),
                               iteration)
        coil_logger.add_scalar('Fake LossD', lossD_fake.data / len(inputs),
                               iteration)

        ##--------------------Generator part!!!!!!!!!!-----------------------##
        set_requires_grad(netD, False)
        set_requires_grad(netF, False)
        set_requires_grad(netG, True)

        # Generator and task network are only updated once every n_critic
        # iterations; the discriminator is updated on every iteration.
        if ((iteration + 1) % n_critic) == 0:
            optimG.zero_grad()
            # NOTE(review): fake_inputs is not assigned in this function.
            outputsD_fake_forG = netD(fake_inputs)

            #Generator updates
            lossG_adv = -1.0 * torch.mean(
                outputsD_fake_forG)  #Loss(outputsD_fake_forG, labels_real)
            lossG_smooth = L1_loss(fake_inputs, inputs)
            # Weighted average of the adversarial and the L1 term.
            lossG = (lossG_adv + l1weight * lossG_smooth) / (1.0 + l1weight)
            # lossG /= len(inputs)
            print(lossG)
            lossG.backward(retain_graph=True)
            optimG.step()

            #####Task network updates##########################
            set_requires_grad(netD, False)
            set_requires_grad(netF, True)
            set_requires_grad(netG, False)

            optimF.zero_grad()
            # This empty tensor is replaced right away by the next statement.
            lossF = Variable(torch.Tensor())
            # NOTE(review): branches is not assigned in this function; the
            # netF forward pass above stores its second output as
            # src_branches.
            lossF = Task_Loss.MSELoss(
                branches,
                dataset.extract_targets(float_data).cuda(), controls.cuda(),
                dataset.extract_inputs(float_data).cuda())
            coil_logger.add_scalar('Task Loss', lossF.data, iteration)
            lossF.backward()
            optimF.step()

        # On iterations without a generator update these log the values
        # left over from the last update (or the initial placeholders).
        coil_logger.add_scalar('Total LossG', lossG.data, iteration)
        coil_logger.add_scalar('Adv LossG', lossG_adv.data / len(inputs),
                               iteration)
        coil_logger.add_scalar('Smooth LossG', lossG_smooth.data / len(inputs),
                               iteration)

        #optimization for one iter done!

        # Random index used to pick the sample reported in the log message.
        position = random.randint(0, len(float_data) - 1)
        # Track the best (lowest) losses seen so far and when they happened.
        if lossD.data < best_lossD:
            best_lossD = lossD.data.tolist()
        # print (lossG.item(), best_lossG)
        if lossG.item() < best_lossG:
            best_lossG = lossG.item()
            best_loss_iter_G = iteration

        if lossF.item() < best_lossF:
            best_lossF = lossF.item()
            best_loss_iter_F = iteration

        # Time is measured from the end of the previous iteration, so it
        # includes data loading as well as the updates.
        accumulated_time += time.time() - capture_time
        capture_time = time.time()
        print("LossD", lossD.data.tolist(), "LossG", lossG.data.tolist(),
              "BestLossD", best_lossD, "BestLossG", best_lossG, "LossF", lossF,
              "BestLossF", best_lossF, "Iteration", iteration,
              "Best Loss Iteration G", best_loss_iter_G,
              "Best Loss Iteration F", best_loss_iter_F)

        # iteration starts at 0 and is incremented at the end of the loop,
        # so 'Images/s' is 0 for the first message.
        coil_logger.add_message(
            'Iterating', {
                'Iteration':
                iteration,
                'LossD':
                lossD.data.tolist(),
                'LossG':
                lossG.data.tolist(),
                'Images/s': (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                'BestLossD':
                best_lossD,
                'BestLossG':
                best_lossG,
                'BestLossIterationG':
                best_loss_iter_G,
                'BestLossF':
                best_lossF,
                'BestLossIterationF':
                best_loss_iter_F,
                'GroundTruth':
                dataset.extract_targets(float_data)[position].data.tolist(),
                'Inputs':
                dataset.extract_inputs(float_data)[position].data.tolist()
            }, iteration)

        # Periodic checkpoint.
        # NOTE(review): all three torch.save calls below write under the
        # hard-coded absolute root '/datatmp/Experiments/rohitgan/_logs'.
        if is_ready_to_save(iteration):

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter_G': best_loss_iter_G,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'checkpoints',
                             str(iteration) + '.pth'))
        # After 10000 iterations, overwrite 'best_modelG.pth' whenever the
        # current iteration set a new best generator loss.
        if iteration == best_loss_iter_G and iteration > 10000:

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'total_time': accumulated_time,
                'best_loss_iter_G': best_loss_iter_G
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelG' + '.pth'))

        # Same for the task network, stored as 'best_modelF.pth'.
        if iteration == best_loss_iter_F and iteration > 10000:

            state = {
                'iteration': iteration,
                'stateD_dict': netD.state_dict(),
                'stateG_dict': netG.state_dict(),
                'stateF_dict': netF.state_dict(),
                'best_lossD': best_lossD,
                'best_lossG': best_lossG,
                'best_lossF': best_lossF,
                'total_time': accumulated_time,
                'best_loss_iter_F': best_loss_iter_F
            }
            torch.save(
                state,
                os.path.join('/datatmp/Experiments/rohitgan/_logs', exp_batch,
                             exp_alias, 'best_modelF' + '.pth'))

        iteration += 1
Esempio n. 20
0
def execute(gpu, exp_batch, exp_alias, suppress_output=True):
    """Run the training process for one experiment, resuming if possible.

    Merges ``configs/<exp_batch>/<exp_alias>.yaml`` into the global
    configuration, builds the dataset, model, loss and Adam optimizer, and
    iterates over the data loader until it is exhausted. A checkpoint is
    written to ``_logs/<exp_batch>/<exp_alias>/checkpoints/<iteration>.pth``
    whenever ``is_ready_to_save(iteration)`` is true, and the learning rate
    is passed to ``adjust_learning_rate`` every 1000 iterations.

    Args:
        gpu: String assigned to the ``CUDA_VISIBLE_DEVICES`` environment
            variable.
        exp_batch: Experiment batch folder name, used under ``configs`` and
            ``_logs``.
        exp_alias: Experiment name; selects the yaml file and the log folder.
        suppress_output: When true, ``sys.stdout`` is redirected (append
            mode, line buffered) to
            ``_output_logs/<PROCESS_NAME>_<pid>.out``.

    Returns:
        None. Failures are not propagated: a ``KeyboardInterrupt`` or any
        ``Exception`` is reported through ``coil_logger`` as an 'Error'
        message (with the traceback printed for the latter).
    """
    # TODO: probable race condition, the train has to be started before.
    try:
        # We set the visible cuda devices
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu

        # At this point the log file with the correct naming is created.
        merge_with_yaml(os.path.join('configs', exp_batch,
                                     exp_alias + '.yaml'))
        set_type_of_process('train')

        coil_logger.add_message('Loading', {'GPU': gpu})

        # exist_ok avoids the check-then-create race when several processes
        # start at the same time and all try to create the folder.
        os.makedirs('_output_logs', exist_ok=True)

        # Put the output to a separate file. The handle is intentionally kept
        # open for the lifetime of the process.
        if suppress_output:
            sys.stdout = open(os.path.join(
                '_output_logs',
                g_conf.PROCESS_NAME + '_' + str(os.getpid()) + ".out"),
                              "a",
                              buffering=1)

        checkpoint_file = get_latest_saved_checkpoint()
        if checkpoint_file is not None:
            # Reuse the name fetched above: querying a second time could
            # return a different (newer) checkpoint than the one tested.
            checkpoint = torch.load(
                os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                             str(checkpoint_file)))
            iteration = checkpoint['iteration']
            best_loss = checkpoint['best_loss']
            best_loss_iter = checkpoint['best_loss_iter']
            accumulated_time = checkpoint['total_time']
        else:
            iteration = 0
            best_loss = 10000.0
            best_loss_iter = 0
            # We accumulate iteration time and keep the average speed
            accumulated_time = 0

        # TODO: The checkpoint will continue, so it should erase everything up to the iteration on tensorboard
        # Define the dataset. This structure has the __get_item__ redefined in a way
        # that you can access the HD_FILES positions from the root directory as in a vector.
        full_dataset = os.path.join(os.environ["COIL_DATASET_PATH"],
                                    g_conf.TRAIN_DATASET_NAME)

        # By instantiating the augmenter we get a callable that augments images and
        # transforms them into tensors.
        augmenter = Augmenter(g_conf.AUGMENTATION)

        dataset = CoILDataset(full_dataset, transform=augmenter)

        data_loader = select_balancing_strategy(dataset, iteration)

        model = CoILModel(g_conf.MODEL_TYPE, g_conf.MODEL_CONFIGURATION)
        model.cuda()

        if checkpoint_file is not None:
            model.load_state_dict(checkpoint['state_dict'])

        print(model)

        criterion = Loss(g_conf.LOSS_FUNCTION)

        optimizer = optim.Adam(model.parameters(), lr=g_conf.LEARNING_RATE)

        print(dataset.meta_data)

        # TODO: test experiment continuation. Is the data sampler going to continue where it started.. ?
        capture_time = time.time()
        for data in data_loader:

            input_data, float_data = data

            # get the control commands from float_data, size = [120,1]
            controls = float_data[:, dataset.controls_position(), :]

            # Extract the measurement inputs and the targets once per batch;
            # they are reused by the forward pass, the loss and the logging.
            inputs = dataset.extract_inputs(float_data)
            targets = dataset.extract_targets(float_data)
            inputs_gpu = inputs.cuda()
            targets_gpu = targets.cuda()

            model.zero_grad()

            # The output(branches) is a list of 5 branches results, each branch is with size [120,3]
            branches = model(torch.squeeze(input_data['rgb'].cuda()),
                             inputs_gpu)

            loss = criterion(branches,
                             targets_gpu,
                             controls.cuda(),
                             inputs_gpu,
                             branch_weights=g_conf.BRANCH_LOSS_WEIGHT,
                             variable_weights=g_conf.VARIABLE_WEIGHT)

            # TODO: All these logging things could go out to clean up the main
            if loss.data < best_loss:
                best_loss = loss.data.tolist()
                best_loss_iter = iteration

            # Log a random position
            position = random.randint(0, len(float_data) - 1)

            # TODO: For now we are computing the error for just the correct branch, it could be multi-branch,
            output = model.extract_branch(torch.stack(branches[0:4]), controls)
            error = torch.abs(output - targets_gpu)

            coil_logger.add_scalar('Loss', loss.data, iteration)
            coil_logger.add_image('Image', torch.squeeze(input_data['rgb']),
                                  iteration)

            loss.backward()
            optimizer.step()

            accumulated_time += time.time() - capture_time
            capture_time = time.time()

            # TODO: Get only the float_data that are actually generating output
            # TODO: iteration is repeating, and that is dumb
            coil_logger.add_message(
                'Iterating', {
                    'Iteration': iteration,
                    'Loss': loss.data.tolist(),
                    'Images/s':
                    (iteration * g_conf.BATCH_SIZE) / accumulated_time,
                    'BestLoss': best_loss,
                    'BestLossIteration': best_loss_iter,
                    'Output': output[position].data.tolist(),
                    'GroundTruth': targets[position].data.tolist(),
                    'Error': error[position].data.tolist(),
                    'Inputs': inputs[position].data.tolist()
                }, iteration)

            # TODO: save also the optimizer state dictionary
            if is_ready_to_save(iteration):

                state = {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'total_time': accumulated_time,
                    'best_loss_iter': best_loss_iter
                }
                # TODO : maybe already summarize the best model ???
                torch.save(
                    state,
                    os.path.join('_logs', exp_batch, exp_alias, 'checkpoints',
                                 str(iteration) + '.pth'))

            iteration += 1
            print(iteration)

            if iteration % 1000 == 0:
                adjust_learning_rate(optimizer, iteration)

            del data

        coil_logger.add_message('Finished', {})

    except KeyboardInterrupt:
        coil_logger.add_message('Error', {'Message': 'Killed By User'})

    except Exception:
        # Top-level boundary of the process: report any failure, but let
        # SystemExit / GeneratorExit propagate instead of swallowing them.
        traceback.print_exc()

        coil_logger.add_message('Error', {'Message': 'Something Happened'})