Example #1
    def lr_find(self, freeze_until=None, start_lr=1e-7, end_lr=1, num_it=100):
        """Gridsearch the optimal learning rate for the training

        Args:
           freeze_until (str, optional): last layer to freeze
           start_lr (float, optional): initial learning rate
           end_lr (float, optional): final learning rate
           num_it (int, optional): number of iterations to perform
        """

        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(start_lr)
        gamma = (end_lr / start_lr)**(1 / (num_it - 1))
        scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

        self.lr_recorder = [start_lr * gamma**idx for idx in range(num_it)]
        self.loss_recorder = []

        for batch_idx, (x, target) in enumerate(self.train_loader):
            x, target = self.to_cuda(x, target)

            # Forward
            batch_loss = self._get_loss(x, target)
            self._backprop_step(batch_loss)
            # Update LR
            scheduler.step()

            # Record
            self.loss_recorder.append(batch_loss.item())
            # Stop after the number of iterations
            if batch_idx + 1 == num_it:
                break
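
A constant multiplicative factor is what turns MultiplicativeLR into the exponential LR sweep described in the docstring above: every scheduler step multiplies the current LR by gamma, so after num_it steps the LR has moved geometrically from start_lr to end_lr. A minimal standalone sketch of that mechanism (plain SGD on a dummy module, not the trainer class above):

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiplicativeLR

start_lr, end_lr, num_it = 1e-7, 1.0, 100
gamma = (end_lr / start_lr) ** (1 / (num_it - 1))

model = torch.nn.Linear(4, 1)
optimizer = SGD(model.parameters(), lr=start_lr)
scheduler = MultiplicativeLR(optimizer, lambda step: gamma)  # each step() multiplies the LR by gamma

lrs = []
for _ in range(num_it):
    optimizer.step()  # a real run would compute a loss and call backward() first
    lrs.append(optimizer.param_groups[0]["lr"])
    scheduler.step()

assert abs(lrs[0] - start_lr) < 1e-12
assert abs(lrs[-1] - end_lr) < 1e-6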
Example #2
 def __init__(self, args={}):
     self.args = args
     self.parse_args(args)
     self.classifier = ConvNet()
     self.optimizer = optim.Adam(self.classifier.parameters(),
                                 lr=self.lr,
                                 betas=(0.9, 0.98),
                                 eps=1e-9)
     self.loss_function = nn.CrossEntropyLoss()
     lmbda = lambda epoch: self.lr_factor
     self.lr_scheduler = MultiplicativeLR(self.optimizer, lr_lambda=lmbda)
Example #3
    def lr_find(
        self,
        freeze_until: Optional[str] = None,
        start_lr: float = 1e-7,
        end_lr: float = 1,
        norm_weight_decay: Optional[float] = None,
        num_it: int = 100,
    ) -> None:
        """Gridsearch the optimal learning rate for the training

        Args:
           freeze_until (str, optional): last layer to freeze
           start_lr (float, optional): initial learning rate
           end_lr (float, optional): final learning rate
           norm_weight_decay (float, optional): weight decay to apply to normalization parameters
           num_it (int, optional): number of iterations to perform
        """

        if num_it > len(self.train_loader):
            raise ValueError("the value of `num_it` needs to be lower than the number of available batches")

        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(start_lr, norm_weight_decay)
        gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
        scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

        self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
        self.loss_recorder = []

        if self.amp:
            self.scaler = torch.cuda.amp.GradScaler()

        for batch_idx, (x, target) in enumerate(self.train_loader):
            x, target = self.to_cuda(x, target)

            # Forward
            batch_loss = self._get_loss(x, target)
            self._backprop_step(batch_loss)
            # Update LR
            scheduler.step()

            # Record
            if torch.isnan(batch_loss) or torch.isinf(batch_loss):
                if batch_idx == 0:
                    raise ValueError("loss value is NaN or inf.")
                else:
                    break
            self.loss_recorder.append(batch_loss.item())
            # Stop after the number of iterations
            if batch_idx + 1 == num_it:
                break

        self.lr_recorder = self.lr_recorder[:len(self.loss_recorder)]
Example #4
    def configure_optimizers(self):
        lr_params = self.hparams.Optim
        optim_args = lr_params["args"][lr_params["name"]]
        optimizers = {"adam": Adam, "sgd": SGD, "rmsprop": RMSprop}
        # Define optimizer
        optimizer = optimizers[lr_params["name"]](
            self.parameters(), lr=self.hparams.lr, **optim_args
        )

        # Define Learning Rate Scheduling
        def lambda1(val):
            return lambda epoch: epoch // val

        sched_params = self.hparams.Optim["Schedule"]
        sched_name = sched_params["name"]
        if not sched_name:
            return optimizer

        sched_args = sched_params["args"][sched_name]

        if sched_name == "step":
            scheduler = StepLR(optimizer, **sched_args)
        elif sched_name == "multiplicative":
            scheduler = MultiplicativeLR(
                optimizer, lr_lambda=[lambda1(sched_args["val"])]
            )
        elif sched_name == "lambda":
            scheduler = LambdaLR(optimizer, lr_lambda=[lambda1(sched_args["val"])])
        else:
            raise NotImplementedError("Unimplemented Scheduler!")

        return [optimizer], [scheduler]
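
The method above assumes a particular layout for self.hparams (the attribute access suggests a Namespace or OmegaConf object). The exact schema is not shown in the snippet; the dict below is a hypothetical illustration of the fields the lookups imply, nothing more:

# Hypothetical hparams layout implied by the lookups above (field names are guesses)
hparams = {
    "lr": 1e-3,
    "Optim": {
        "name": "adam",
        "args": {"adam": {"betas": (0.9, 0.999)}},
        "Schedule": {
            "name": "multiplicative",
            "args": {"multiplicative": {"val": 2}},  # feeds lambda1(2), i.e. lambda epoch: epoch // 2
        },
    },
}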
Example #5
def lr_range_test(model, train, test, train_loader, test_loader):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.0001)
    lmbda = lambda epoch: 1.4
    #scheduler = OneCycleLR(optimizer,max_lr=0.5,total_steps=25)
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
    learning_lr_trace = []
    for epoch in range(1, 25):

        # get_last_lr() is the supported accessor; get_lr() warns when called outside of step()
        print(f'Epoch: {epoch} Learning_Rate {scheduler.get_last_lr()}')
        learning_lr_trace.append(scheduler.get_last_lr())
        train_loss, train_acc = train(model, device, train_loader, optimizer,
                                      epoch)
        test_loss, test_acc_l1 = test(model, device, test_loader)
        scheduler.step()

    return learning_lr_trace, train_acc, test_acc_l1
Example #6
 def configure_optimizers(self):
     optimizer = torch.optim.Adam(self.parameters(),
                                  lr=self.hparams.args.lr)  #,momentum=0.9)
     lmbda = lambda epoch: 1.05
     scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
     return {
         'optimizer': optimizer,
         'lr_scheduler': scheduler
         #'monitor': 'train_loss'
     }
Example #7
    def configure_optimizers(self):
        # log config here because it's the only place where we always have the logger (it's never called during inference)
        with NamedTemporaryFile(suffix=".yml") as f:
            self.config.to_yaml_file(f.name)
            self.logger.log_artifact(f.name, "config.yml")

        no_decay = {"bias", "norm.weight"}  # norm.weight only applies to nn.LayerNorm
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.config.experiment.tts_training.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        def get_optimizer(lr):
            return AdamW(
                optimizer_grouped_parameters,
                lr=lr,
                weight_decay=self.config.experiment.tts_training.weight_decay,
            )

        if self.config.experiment.tts_training.lr_scheduler is not None:
            schedule_config = self.config.experiment.tts_training.lr_scheduler
            assert schedule_config.start_schedule_epoch >= 1, "start_schedule_epoch has to be >= 1"
            start = schedule_config.start_schedule_epoch
            end = schedule_config.end_schedule_epoch
            end = self.config.experiment.max_epochs if end is None else end
            gamma = np.log(schedule_config.initial_lr) - np.log(schedule_config.final_lr)
            gamma /= end - start

            def exp_dec(current):
                if start <= current <= end:
                    a = np.exp(-1 * gamma * float(current)) * schedule_config.initial_lr
                    b = np.exp(-1 * gamma * float(current-1)) * schedule_config.initial_lr
                    decay = a / b
                else:
                    decay = 1
                self.logger.log_metric("learning_rate_decay", decay)
                return decay

            optimizer = get_optimizer(schedule_config.initial_lr)
            scheduler = MultiplicativeLR(optimizer, exp_dec)
            return [optimizer], [scheduler]
        else:
            optimizer = get_optimizer(self.config.experiment.tts_training.learning_rate)
            return optimizer
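
Within the scheduling window, exp_dec reduces to the constant factor exp(-gamma), since the two exponentials differ by exactly one epoch; applied over end - start epochs this multiplies the LR by final_lr / initial_lr in total. A small standalone check of that arithmetic (the example values are assumptions, no logger involved):

import numpy as np

initial_lr, final_lr = 1e-3, 1e-5  # assumed values for illustration
start, end = 1, 50
gamma = (np.log(initial_lr) - np.log(final_lr)) / (end - start)

per_epoch_factor = np.exp(-gamma)               # what exp_dec returns inside the window
cumulative = per_epoch_factor ** (end - start)  # total decay over the window
assert np.isclose(cumulative, final_lr / initial_lr)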
Example #8
def get_generator(args):
    if args.model == 'rrdb':
        generator = RRDBNet()
        optimizer_G = torch.optim.Adam(generator.parameters(),
                                       lr=args.lr,
                                       betas=args.betas)
        if args.multistep_lr:
            scheduler_G = MultiStepLR(optimizer_G,
                                      milestones=args.multistep_milestones,
                                      gamma=args.multistep_gamma)
        else:
            lr_lambda = lambda epoch: 1
            scheduler_G = MultiplicativeLR(optimizer_G, lr_lambda)
        return generator, optimizer_G, scheduler_G
    raise NotImplementedError(str(args.model) + " is not implemented")
Example #9
def get_scheduler(sched_params, optimizer):
    sched_name = sched_params["name"]

    if not sched_name:
        return optimizer

    sched_args = sched_params["args"][sched_name]

    if sched_name == "step":
        scheduler = StepLR(optimizer, **sched_args)
    elif sched_name == "multiplicative":
        scheduler = MultiplicativeLR(optimizer, lr_lambda=[lambda1(sched_args["val"])])
    elif sched_name == "lambda":
        scheduler = LambdaLR(optimizer, lr_lambda=[lambda1(sched_args["val"])])
    else:
        raise NotImplementedError("Unimplemented Scheduler!")

    return [scheduler]
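
lambda1 is not defined in this snippet; Example #4 above uses the same pattern with a closure factory, which is presumably what this function expects:

def lambda1(val):
    # factory returning an LR lambda parameterized by val, as in Example #4
    return lambda epoch: epoch // val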
Example #10
def get_linear_schedule_with_minlr(optimizer: Optimizer,
                                   num_warmup_steps: int,
                                   num_training_steps: int,
                                   last_epoch: int = -1,
                                   min_lr: float = 1e-07):
    """
    Creates a scheduler with a learning rate that decreases linearly but saturates at the ``min_lr`` value.

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (:obj:`int`):
            The number of steps for the warmup phase.
        num_training_steps (:obj:`int`):
            The total number of training steps.
        last_epoch (:obj:`int`, `optional`, defaults to -1):
            The index of the last epoch when resuming training.
        min_lr (:obj:`float`, `optional`, defaults to 1e-07):
            The value of the minimum learning rate at which the schedule saturates.

    Return:
        :obj:`torch.optim.lr_scheduler.MultiplicativeLR` with the appropriate schedule.
    """

    init_lr = optimizer.defaults['lr']

    def lr_lambda(current_step: int):

        steps_done = float(num_training_steps - current_step)

        if current_step > 1:
            mul_fac = steps_done / max(steps_done + 1, 1)
        else:
            mul_fac = steps_done / (num_training_steps)

        if mul_fac * init_lr > min_lr:
            return mul_fac
        else:
            return 1

    return MultiplicativeLR(optimizer, lr_lambda, last_epoch)
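
A minimal usage sketch for the helper above, assuming it is importable from the surrounding module; it attaches the schedule to a plain SGD optimizer and steps it once per training step:

import torch
from torch.optim import SGD

model = torch.nn.Linear(2, 1)
optimizer = SGD(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_minlr(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=1000,
                                           min_lr=1e-7)

for step in range(1000):
    optimizer.step()   # a real loop would compute a loss and call backward() first
    scheduler.step()   # multiplies the LR by lr_lambda(step), or by 1 once the floor check trips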
Example #11
def init_network(params, params_gan):
    netD = None
    optimizerD = None
    train_gan = params['train_gan']
    ngpu = torch.cuda.device_count()

    modelMDE = FPNNet(num_channels=params['num_channels'])
    # save model architecture
    with open(f'{MODEL_DIR}/network_layers.txt', 'w') as f:
        print(modelMDE, file=f)
    if train_gan:
        netD = Discriminator(ngpu)
        with open(f'{MODEL_DIR}/discrim_layers.txt', 'w') as f:
            print(netD, file=f)

    # wrap into DataParallel to run in several GPUs
    if params['parallel'] and ngpu > 1:
        print(f"Using {ngpu} GPUs")
        modelMDE = nn.DataParallel(modelMDE, list(range(ngpu)))
        if train_gan: netD = nn.DataParallel(netD, list(range(ngpu)))

    modelMDE.to(device)
    optimizer = torch.optim.Adam(modelMDE.parameters(), lr=params['lr'], weight_decay=4e-5)
    lmbda = lambda epoch: params['lr_decay']
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)

    total_params = sum(p.numel() for p in modelMDE.parameters())
    print(f'\nNum of parameters in MDE net: {total_params}')

    if train_gan:
        netD.to(device)
        netD.apply(weights_init)
        total_params_d = sum(p.numel() for p in netD.parameters())
        print(f'\nNum of parameters in Discriminator net: {total_params_d}')
        optimizerD = torch.optim.Adam(netD.parameters(), lr=params_gan['lr'], betas=(params_gan['beta1_d'], 0.999))
    return modelMDE, netD, optimizer, optimizerD, scheduler
Example #12
    # Load EfficientNet Model
    model = IVFEfficientNet(ARCHITECTURE)
    #model.load('model/regression_epoch-18.pt')
    #model.load('model/efficientnet-b4regression_epoch-20.pt')
    #model.load('model/efficientnet-b4_finetune_regression_epoch-10.pt')

    # loss function
    criterion = nn.SmoothL1Loss()

    # optimizer
    # optimizer = optim.SGD(model.parameters(), lr=3e-3, momentum=0.9, nesterov = True)
    optimizer = optim.Adam(model.parameters(),
                           lr=LEARNING_RATE,
                           weight_decay=L2_COEFFICIENT)
    lmbda = lambda epoch: LEARNING_RATE_DECAY
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)

    model._train(train_loader,
                 epochs=EPOCHS,
                 loss_function=criterion,
                 optimizer=optimizer,
                 valid_loader=valid_loader,
                 scheduler=scheduler,
                 save_filename=MODEL_SAVE_NAME)

    outputs = model(overall_loader)
    outputs = scaler.inverse_transform(outputs)
    outputs = pd.DataFrame(outputs).rename(columns={
        0: 'pred BS',
        1: 'pred ICM',
        2: 'pred TE'
Example #13
    def fit(
        self,
        X_train,
        y_train,
        X_validation=None,
        y_validation=None,
        loss_key="opt",
        batch_size=128,
        num_workers=0,
        learning_rate=1e-3,
        learning_rate_lambda=0.995,
        max_epoch=10000,
        early_stopping=100,
        device="cpu",
        verbose=False,
    ):
        """
        Train the model using gradient descent back propagation

        Parameters
        ----------
        X_train : {array-like, sparse matrix} of shape (n_samples, n_features)
            Features matrix used to train the model
        y_train : vector-like of shape (n_samples, 1)
            The target vector used to train the model
        X_validation : {array-like, sparse matrix} of shape (n_samples, n_features)
            Features matrix used for early stopping of the training
        y_validation : vector-like of shape (n_samples, 1)
            The target vector used for early stopping of the training
        loss_key: string (default = 'opt')
            Which field of the loss dictionary to optimize
        batch_size: int (default = 128)
            Batch size
        num_workers: int (default = 0)
            Number of cpus to use
        learning_rate: float (default = 1e-3)
            Gradient descent learning rate
        learning_rate_lambda: float (default = 0.995)
            The rate of decreasing learning_rate
        max_epoch: int (default = 10000)
            The maximum number of optimization epochs
        early_stopping: int (default = 100)
            The number of epochs without improving the best validation loss allowed before stopping
        device : 'cpu' or 'gpu' (default = 'cpu')
            Device used by pytorch for training the model and using the trained model for encoding/decoding
        verbose: True or False (default = False)
            Verbosity
        """
        assert X_train.shape[1] == self.input_dim
        self.to(device)
        train_loader = torch.utils.data.DataLoader(
            TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train)),
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
        )
        if X_validation is not None:
            validation_loader = torch.utils.data.DataLoader(
                TensorDataset(torch.Tensor(X_validation),
                              torch.Tensor(y_validation)),
                batch_size=batch_size,
                shuffle=True,
                num_workers=num_workers,
            )
        else:
            validation_loader = None

        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        scheduler = MultiplicativeLR(
            optimizer, lr_lambda=(lambda epoch: learning_rate_lambda))
        best_validation_loss = None
        iter_no_improve = 0
        for epoch in range(max_epoch):
            self.train()
            training_loss = 0
            for data in train_loader:
                Xb = data[0].to(device)
                optimizer.zero_grad()
                output = self(Xb)
                loss = self.loss(output, Xb)[loss_key]
                loss.backward()
                optimizer.step()
                training_loss += loss.detach().cpu().numpy()
            self.eval()
            validation_loss = 0
            if validation_loader:
                with torch.no_grad():
                    for data in validation_loader:
                        Xb = data[0].to(device)
                        output = self(Xb)
                        loss = self.loss(output, Xb)[loss_key]
                        validation_loss += loss.detach().cpu().numpy()
                    if best_validation_loss is None or validation_loss < best_validation_loss:
                        best_validation_loss = validation_loss
                        iter_no_improve = 0
                    else:
                        iter_no_improve += 1
                    if iter_no_improve > early_stopping:
                        if verbose:
                            print(f"Early stopping after {epoch} epochs")
                        break
            scheduler.step()
            if verbose:
                print(
                    f"[{epoch}] training loss={training_loss}, validation loss={validation_loss}"
                )
        return self
Example #14
 def set_scheduler(self, lmbda):
     self.scheduler = MultiplicativeLR(self.optimizer,
                                       lr_lambda=lmbda,
                                       verbose=False)
Example #15
def collate(batch):
    return dist_custom_collate(batch, dist_bins, 64)


training_loader = DataLoader(training_data,
                             batch_size=4,
                             shuffle=True,
                             num_workers=0,
                             pin_memory=True,
                             collate_fn=collate)
accumulate = 1

torch.autograd.set_detect_anomaly(True)

optimizer = AdamW(network.parameters(), lr=1e-6)
scheduler = MLR(optimizer, lambda x: 1.1)  # MLR: presumably MultiplicativeLR imported under an alias


class FocalLoss(nn.CrossEntropyLoss):
    ''' Focal loss for classification tasks on imbalanced datasets '''
    def __init__(self, gamma, alpha=None, ignore_index=-100, reduction='mean'):
        super().__init__(weight=alpha,
                         ignore_index=ignore_index,
                         reduction='mean')
        self.reduction = reduction
        self.gamma = gamma

    def forward(self, input_, target):
        cross_entropy = super().forward(input_, target)
        # Temporarily mask out ignore index to '0' for valid gather-indices input.
        # This won't contribute final loss as the cross_entropy contribution
Example #16
        return MetaAgent([agent(params, env_arg) for env_arg in train_envs])

    rho = 16
    n_rules = int(
        sum([t.Size(s).numel() for s in param_shapes.values()]) / rho)
    population = GaussianMixturePopulation(
        {k: t.Size(v[:-1])
         for k, v in param_shapes.items()}, (n_rules, 5), constructor, 0.1,
        device)

    iterations = 500
    pop_size = 500

    optim = SGD(population.parameters(), lr=0.02)
    lr_decay = 0.995  # t.exp(t.log(t.scalar_tensor(0.5)) / 100)  # halves every 100 steps
    sched = MultiplicativeLR(optim, lr_lambda=lambda step: lr_decay)
    pbar = tqdm.tqdm(range(iterations))
    best_so_far = -1e9
    train_writer, test_writer = util.get_writers('hebbian')

    def fitness_shaping(x):
        return normalize(compute_centered_ranks(x))

    for i in pbar:
        optim.zero_grad()
        with Pool(cpu_count() // 2) as pool:
            raw_fitness = population.fitness_grads(pop_size, pool,
                                                   fitness_shaping)

        train_writer.add_scalar('fitness', raw_fitness.mean(), i)
        train_writer.add_scalar('fitness/std', raw_fitness.std(), i)
Example #17
def initalize_schedulers(ae_optim, disc_optim, cfg):
    ae_sched = MultiplicativeLR(ae_optim,
                                lr_lambda=partial(lambda_rule_ae, cfg=cfg))
    disc_sched = MultiplicativeLR(disc_optim,
                                  lr_lambda=partial(lambda_rule_disc, cfg=cfg))
    return ae_sched, disc_sched
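
lambda_rule_ae and lambda_rule_disc are not shown here. Since they are bound with partial(..., cfg=cfg) and passed as lr_lambda, they must accept the epoch index plus a cfg keyword and return a multiplicative factor. A purely hypothetical shape, for illustration only:

def lambda_rule_ae(epoch, cfg):
    # hypothetical: decay the autoencoder LR by a constant factor taken from the config
    return cfg.ae_lr_decay


def lambda_rule_disc(epoch, cfg):
    # hypothetical: hold the discriminator LR flat until a configured epoch, then decay it
    return 1.0 if epoch < cfg.disc_decay_start else cfg.disc_lr_decay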
Example #18
    num_train = int(P_TRAIN * num_cars)
    num_test = num_cars - num_train
    train_data, test_data = random_split(dataset, [num_train, num_test])

    # set up the train and test data loaders
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

    # load ResNet-50 with every layer frozen except for layer3-bottleneck5 and beyond,
    # and a new fully-connected network which outputs a 196-dim vector
    device = get_device()
    model = load_resnet50_layer3_bottleneck5(num_car_models)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lambda epoch: LR_DECAY)

    # set up the output logger
    output_dir = '/home/mchobanyan/data/research/transfer/vis/finetune-car-resnet50'
    model_dir = os.path.join(output_dir, 'models')
    create_folder(model_dir)
    logger = TrainingLogger(filepath=os.path.join(output_dir, 'training-log.csv'))

    for epoch in tqdm(range(NUM_EPOCHS)):
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = test_epoch(model, test_loader, criterion, device)
        scheduler.step()
        logger.add_entry(epoch, train_loss, test_loss, train_acc, test_acc)
        checkpoint(model, os.path.join(model_dir, f'model_epoch{epoch}.pt'))
Example #19
def train(
        train_data,
        exp_dir=datetime.now().strftime("corrector_model/%Y-%m-%d_%H%M"),
        learning_rate=0.00005,
        rsize=10,
        epochs=1,
        checkpoint_path='',
        seed=6548,
        batch_size=4,
        edge_loss=False,
        model_type='cnet',
        model_cap='normal',
        optimizer_type='radam',
        reset_optimizer=True,  # if true, does not load optimizer checkpoints
        safe_descent=True,
        activation_type='mish',
        activation_args={},
        io=None,
        dynamic_lr=True,
        dropout=0,
        rotations=False,
        use_batch_norm=True,
        batch_norm_momentum=None,
        batch_norm_affine=True,
        use_gc=True,
        no_lr_schedule=False,
        diff_features_only=False):

    start_time = time.time()

    io.cprint("-------------------------------------------------------" +
              "\nexport dir = " + '/checkpoints/' + exp_dir +
              "\nbase_learning_rate = " + str(learning_rate) +
              "\nuse_batch_norm = " + str(use_batch_norm) +
              "\nbatch_norm_momentum = " + str(batch_norm_momentum) +
              "\nbatch_norm_affine = " + str(batch_norm_affine) +
              "\nno_lr_schedule = " + str(no_lr_schedule) + "\nuse_gc = " +
              str(use_gc) + "\nrsize = " + str(rsize) + "\npython_version: " +
              sys.version + "\ntorch_version: " + torch.__version__ +
              "\nnumpy_version: " + np.version.version + "\nmodel_type: " +
              model_type + "\nmodel_cap: " + model_cap + "\noptimizer: " +
              optimizer_type + "\nactivation_type: " + activation_type +
              "\nsafe_descent: " + str(safe_descent) + "\ndynamic_lr: " +
              str(dynamic_lr) + "\nrotations: " + str(rotations) +
              "\nepochs = " + str(epochs) +
              (("\ncheckpoint = " + checkpoint_path) if
               (checkpoint_path != None and checkpoint_path != '') else '') +
              "\nseed = " + str(seed) + "\nbatch_size = " + str(batch_size) +
              "\n#train_data = " +
              str(sum([bin.size(0) for bin in train_data["train_bins"]])) +
              "\n#test_data = " + str(len(train_data["test_samples"])) +
              "\n#validation_data = " + str(len(train_data["val_samples"])) +
              "\nedge_loss = " + str(edge_loss) +
              "\n-------------------------------------------------------" +
              "\nstart_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")

    # initialize torch & cuda ---------------------------------------------------------------------

    torch.manual_seed(seed)
    np.random.seed(seed)

    device = utils.getDevice(io)

    # extract train- & test data (and move to device) --------------------------------------------

    # train_bins = [bin.float().to(device) for bin in train_data["train_bins"]]
    # test_samples = [sample.float().to(device) for sample in train_data["test_samples"]]
    # val_samples = [sample.float().to(device) for sample in train_data["val_samples"]]

    train_bins = [bin.float() for bin in train_data["train_bins"]]
    test_samples = [sample.float() for sample in train_data["test_samples"]]
    val_samples = [sample.float() for sample in train_data["val_samples"]]

    # Initialize Model ------------------------------------------------------------------------------

    model_args = {
        'model_type': model_type,
        'model_cap': model_cap,
        'input_channels': test_samples[0].size(1),
        'output_channels': test_samples[0].size(1),
        'rsize': rsize,
        'emb_dims': 1024,
        'activation_type': activation_type,
        'activation_args': activation_args,
        'dropout': dropout,
        'batch_norm': use_batch_norm,
        'batch_norm_affine': batch_norm_affine,
        'batch_norm_momentum': batch_norm_momentum,
        'diff_features_only': diff_features_only
    }

    model = getModel(model_args).to(device)

    # init optimizer & scheduler -------------------------------------------------------------------

    lookahead_sync_period = 6

    optimizer = None
    if optimizer_type == 'radam':
        optimizer = RAdam(model.parameters(),
                          lr=learning_rate,
                          betas=(0.9, 0.999),
                          eps=1e-8,
                          use_gc=use_gc)
    elif optimizer_type == 'lookahead':
        optimizer = Ranger(model.parameters(),
                           lr=learning_rate,
                           alpha=0.9,
                           k=lookahead_sync_period)

    # make sure that either a LR schedule is given or dynamic LR is enabled
    assert dynamic_lr or not no_lr_schedule

    scheduler = None if no_lr_schedule else MultiplicativeLR(
        optimizer, lr_lambda=MultiplicativeAnnealing(epochs))

    # set train settings & load previous model state ------------------------------------------------------------

    checkpoint = getEmptyCheckpoint()
    last_epoch = 0

    if checkpoint_path is not None and checkpoint_path != '':
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'][-1])
        if not reset_optimizer:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'][-1])
        last_epoch = len(checkpoint['model_state_dict'])
        print('> loaded checkpoint! (%d epochs)' % (last_epoch))

    checkpoint['train_settings'].append({
        'learning_rate': learning_rate,
        'scheduler': scheduler,
        'epochs': epochs,
        'seed': seed,
        'batch_size': batch_size,
        'edge_loss': edge_loss,
        'optimizer': optimizer_type,
        'safe_descent': str(safe_descent),
        'dynamic_lr': str(dynamic_lr),
        'rotations': str(rotations),
        'train_data_count': sum([bin.size(0) for bin in train_data["train_bins"]]),
        'test_data_count': len(train_data["test_samples"]),
        'validation_data_count': len(train_data["val_samples"]),
        'model_args': model_args
    })

    # set up report interval (for logging) and batch size -------------------------------------------------------------------

    report_interval = 100
    loss_function = torch.nn.MSELoss(reduction='mean')

    # begin training ###########################################################################################################################

    io.cprint("\nBeginning Training..\n")

    for epoch in range(last_epoch + 1, last_epoch + epochs + 1):

        io.cprint(
            "Epoch: %d ------------------------------------------------------------------------------------------"
            % (epoch))
        io.cprint("Current LR: %.10f" % (optimizer.param_groups[0]['lr']))

        model.train()
        optimizer.zero_grad()

        checkpoint['train_batch_loss'].append([])
        checkpoint['train_batch_N'].append([])
        checkpoint['train_batch_lr_adjust'].append([])
        checkpoint['train_batch_loss_reduction'].append([])
        checkpoint['lr'].append(optimizer.param_groups[0]['lr'])

        # draw random batches from random bins
        binbatches = utils.drawBinBatches([bin.size(0) for bin in train_bins],
                                          batchsize=batch_size)

        checkpoint['train_batch_N'][-1] = [
            train_bins[bin_id][batch_ids].size(1)
            for (bin_id, batch_ids) in binbatches
        ]

        failed_loss_optims = 0
        cum_lr_adjust_fac = 0
        cum_loss_reduction = 0

        # pre-compute random rotations if needed
        batch_rotations = [None] * len(binbatches)
        if rotations:
            start_rotations = time.time()
            batch_rotations = torch.zeros(
                (len(binbatches), batch_size, test_samples[0].size(1),
                 test_samples[0].size(1)),
                device=device)
            for i in range(len(binbatches)):
                for j in range(batch_size):
                    batch_rotations[i, j] = utils.getRandomRotation(
                        test_samples[0].size(1), device=device)
            print("created batch rotations (%ds)" %
                  (time.time() - start_rotations))

        b = 0  # batch counter

        train_start = time.time()

        for (bin_id, batch_ids) in binbatches:

            b += 1

            # print ("handling batch %d" % (b))

            # prediction & loss ----------------------------------------

            batch_sample = train_bins[bin_id][batch_ids].to(
                model.base.device)  # size: (B x N x d x 2)

            batch_loss = getBatchLoss(model,
                                      batch_sample,
                                      loss_function,
                                      edge_loss=edge_loss,
                                      rotations=batch_rotations[b - 1])
            batch_loss.backward()

            checkpoint['train_batch_loss'][-1].append(batch_loss.item())

            new_loss = 0.0
            lr_adjust = 1.0
            loss_reduction = 0.0

            # if safe descent is enabled, try to optimize the descent step so that a reduction in loss is guaranteed
            if safe_descent:

                # create backups to restore states before the optimizer step
                model_state_backup = copy.deepcopy(model.state_dict())
                opt_state_backup = copy.deepcopy(optimizer.state_dict())

                # make an optimizer step
                optimizer.step()

                # in each iteration, check if the optimizer gave an improvement
                # if not, restore the original states, reduce the learning rate and try again
                # no gradient needed for the plain loss calculation
                with torch.no_grad():
                    for i in range(10):

                        new_loss = getBatchLoss(
                            model,
                            batch_sample,
                            loss_function,
                            edge_loss=edge_loss,
                            rotations=batch_rotations[b - 1]).item()

                        # if the model performs better now we continue, if not we try a smaller learning step
                        if (new_loss < batch_loss.item()):
                            # print("lucky! (%f -> %f) reduction: %.4f%%" % (batch_loss.item(), new_loss, 100 * (batch_loss.item()-new_loss) / batch_loss.item()))
                            break
                        else:
                            # print("try again.. (%f -> %f)" % (batch_loss.item(), new_loss))
                            model.load_state_dict(model_state_backup)
                            optimizer.load_state_dict(opt_state_backup)
                            lr_adjust *= 0.7
                            optimizer.step(lr_adjust=lr_adjust)

                loss_reduction = 100 * (batch_loss.item() -
                                        new_loss) / batch_loss.item()

                if new_loss >= batch_loss.item():
                    failed_loss_optims += 1
                else:
                    cum_lr_adjust_fac += lr_adjust
                    cum_loss_reduction += loss_reduction

            else:

                cum_lr_adjust_fac += lr_adjust
                optimizer.step()

            checkpoint['train_batch_lr_adjust'][-1].append(lr_adjust)
            checkpoint['train_batch_loss_reduction'][-1].append(loss_reduction)

            # reset gradients
            optimizer.zero_grad()

            # statistics calculation and output -------------------------

            if b % report_interval == 0:

                last_100_loss = sum(checkpoint['train_batch_loss'][-1]
                                    [b - report_interval:b]) / report_interval
                improvement_indicator = '+' if epoch > 1 and last_100_loss < checkpoint[
                    'train_loss'][-1] else ''

                io.cprint(
                    '  Batch %4d to %4d | loss: %.10f%1s | av. dist. per neighbor: %.10f | E%3d | T:%5ds | Failed Optims: %3d (%05.2f%%) | Av. Adjust LR: %.6f | Av. Loss Reduction: %07.4f%%'
                    % (b - (report_interval - 1), b, last_100_loss,
                       improvement_indicator, np.sqrt(last_100_loss), epoch,
                       time.time() - train_start, failed_loss_optims, 100 *
                       (failed_loss_optims / report_interval),
                       (cum_lr_adjust_fac /
                        (report_interval - failed_loss_optims)
                        if failed_loss_optims < report_interval else -1),
                       (cum_loss_reduction /
                        (report_interval - failed_loss_optims)
                        if failed_loss_optims < report_interval else -1)))

                failed_loss_optims = 0
                cum_lr_adjust_fac = 0
                cum_loss_reduction = 0

        checkpoint['train_loss'].append(
            sum(checkpoint['train_batch_loss'][-1]) / b)
        checkpoint['train_time'].append(time.time() - train_start)

        io.cprint(
            '----\n  TRN | time: %5ds | loss: %.10f| av. dist. per neighbor: %.10f'
            % (checkpoint['train_time'][-1], checkpoint['train_loss'][-1],
               np.sqrt(checkpoint['train_loss'][-1])))

        torch.cuda.empty_cache()

        ####################
        # Test & Validation
        ####################

        with torch.no_grad():

            if use_batch_norm:

                model.eval_bn()

                eval_bn_start = time.time()

                # run through all train samples again to accumulate layer-wise input distribution statistics (mean and variance) with fixed weights
                # these statistics are later used for the BatchNorm layers during inference
                for (bin_id, batch_ids) in binbatches:
                    input = train_bins[bin_id][batch_ids][:, :, :, 0].squeeze(
                        -1)  # size: (B x N x d)
                    model(input.transpose(1,
                                          2).to(model.base.device)).transpose(
                                              1, 2)  # size: (B x N x d)

                io.cprint('Accumulated BN Layer statistics (%ds)' %
                          (time.time() - eval_bn_start))

            model.eval()

            test_start = time.time()

            test_loss = getTestLoss(model,
                                    test_samples,
                                    loss_function,
                                    edge_loss=edge_loss)

            checkpoint['test_loss'].append(test_loss)
            checkpoint['test_time'].append(time.time() - test_start)

            io.cprint(
                '  TST | time: %5ds | loss: %.10f| av. dist. per neighbor: %.10f'
                % (checkpoint['test_time'][-1], checkpoint['test_loss'][-1],
                   np.sqrt(checkpoint['test_loss'][-1])))

            val_start = time.time()

            val_loss = getTestLoss(model,
                                   val_samples,
                                   loss_function,
                                   edge_loss=edge_loss)

            checkpoint['val_loss'].append(val_loss)
            checkpoint['val_time'].append(time.time() - val_start)

            io.cprint(
                '  VAL | time: %5ds | loss: %.10f| av. dist. per neighbor: %.10f'
                % (checkpoint['val_time'][-1], checkpoint['val_loss'][-1],
                   np.sqrt(checkpoint['val_loss'][-1])))

        ####################
        # Scheduler Step
        ####################

        if not no_lr_schedule:
            scheduler.step()

        if epoch > 1 and dynamic_lr and sum(
                checkpoint['train_batch_lr_adjust'][-1]) > 0:
            io.cprint("----\n  dynamic lr adjust: %.10f" %
                      (0.5 *
                       (1 + sum(checkpoint['train_batch_lr_adjust'][-1]) /
                        len(checkpoint['train_batch_lr_adjust'][-1]))))
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.5 * (
                    1 + sum(checkpoint['train_batch_lr_adjust'][-1]) /
                    len(checkpoint['train_batch_lr_adjust'][-1]))

        # Save model and optimizer state ..
        checkpoint['model_state_dict'].append(copy.deepcopy(
            model.state_dict()))
        checkpoint['optimizer_state_dict'].append(
            copy.deepcopy(optimizer.state_dict()))

        torch.save(checkpoint, exp_dir + '/corrector_checkpoints.t7')

    io.cprint("\n-------------------------------------------------------" +
              ("\ntotal_time: %.2fh" % ((time.time() - start_time) / 3600)) +
              ("\ntrain_time: %.2fh" %
               (sum(checkpoint['train_time']) / 3600)) +
              ("\ntest_time: %.2fh" % (sum(checkpoint['test_time']) / 3600)) +
              ("\nval_time: %.2fh" % (sum(checkpoint['val_time']) / 3600)) +
              "\n-------------------------------------------------------" +
              "\nend_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")
Example #20
class CNNModel():
    def __init__(self, args={}):
        self.args = args
        self.parse_args(args)
        self.classifier = ConvNet()
        self.optimizer = optim.Adam(self.classifier.parameters(),
                                    lr=self.lr,
                                    betas=(0.9, 0.98),
                                    eps=1e-9)
        self.loss_function = nn.CrossEntropyLoss()
        lmbda = lambda epoch: self.lr_factor
        self.lr_scheduler = MultiplicativeLR(self.optimizer, lr_lambda=lmbda)

    def parse_args(self, args):
        self.lr = args['learning_rate'] if 'learning_rate' in args else 0.001
        self.max_epoch = args['max_epoch'] if 'max_epoch' in args else 100
        self.early_stop = args['early_stop'] if 'early_stop' in args else False
        self.batch_size = args['batch_size'] if 'batch_size' in args else 64
        self.shuffle = args['shuffle'] if 'shuffle' in args else False
        self.adjust_lr = args[
            'adaptive_learning_rate'] if 'adaptive_learning_rate' in args else False
        self.early_stop_idx_limit = 10
        self.lr_factor = 0.95
        self.min_lr = 5e-6

    @staticmethod
    def adjust_learning_rate(optimizer, factor=.5, min_lr=0.00001):
        for i, param_group in enumerate(optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * factor, min_lr)
            param_group['lr'] = new_lr
            logger.info('adjusting learning rate from %.6f to %.6f' %
                        (old_lr, new_lr))

    def train_model(self, train_X, train_Y):
        if self.early_stop:
            best_acc = 0
            best_model = None
            early_stop_idx = 0

            train_X, dev_X = np.split(train_X, [int(len(train_X) * .8)])
            train_Y, dev_Y = np.split(train_Y, [int(len(train_Y) * .8)])

            tensor_dev_X = torch.Tensor(dev_X)
            tensor_dev_Y = torch.Tensor(dev_Y).type(torch.LongTensor)
            dev = TensorDataset(tensor_dev_X, tensor_dev_Y)
            dev_loader = DataLoader(dev,
                                    batch_size=self.batch_size,
                                    shuffle=False)

        tensor_train_X = torch.Tensor(train_X)
        tensor_train_Y = torch.Tensor(train_Y).type(torch.LongTensor)
        train = TensorDataset(tensor_train_X, tensor_train_Y)
        train_loader = DataLoader(train,
                                  batch_size=self.batch_size,
                                  shuffle=self.shuffle)
        prev_loss = np.inf

        for epoch in range(self.max_epoch):
            running_loss = 0.0
            for i, data in enumerate(train_loader):
                features, labels = data
                self.optimizer.zero_grad()
                outputs = self.classifier(
                    features.view(features.size(0), 1, 28, 28))
                loss = self.loss_function(outputs, labels)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()

            print("epoch: ", epoch, "training loss: ", running_loss)

            if self.adjust_lr and running_loss > prev_loss:
                old_lr = self.optimizer.param_groups[0]['lr']
                self.lr_scheduler.step()
                new_lr = self.optimizer.param_groups[0]['lr']
                print("Adjusting learning rate from %.5f to %.5f" %
                      (old_lr, new_lr))

            prev_loss = running_loss

            if self.early_stop:
                with torch.no_grad():
                    dev_correct = 0.
                    dev_total = 0.
                    dev_loss = 0.
                    for data in dev_loader:
                        features, labels = data
                        outputs = self.classifier(
                            features.view(features.size(0), 1, 28, 28))
                        loss = self.loss_function(outputs, labels)
                        _, predicted = torch.max(outputs.data, 1)
                        dev_total += labels.size(0)
                        dev_correct += (predicted == labels).sum().item()
                        dev_loss += loss.item()

                    current_acc = dev_correct / dev_total

                    if current_acc > best_acc:
                        print("Best dev accuracy obtained: %.3f" % current_acc)
                        best_model = copy.deepcopy(self.classifier)
                        best_acc = current_acc
                        early_stop_idx = 0
                    else:
                        early_stop_idx += 1

                if early_stop_idx >= self.early_stop_idx_limit:
                    print("early stop triggered")
                    self.classifier = best_model
                    break

        return self

    def score(self, test_X, test_Y):
        tensor_test_X = torch.Tensor(test_X)
        tensor_test_Y = torch.Tensor(test_Y).type(torch.LongTensor)
        test = TensorDataset(tensor_test_X, tensor_test_Y)
        test_loader = DataLoader(test,
                                 batch_size=self.batch_size,
                                 shuffle=False)
        correct = 0.0
        total = 0.0
        with torch.no_grad():
            for data in test_loader:
                features, labels = data
                outputs = self.classifier(
                    features.view(features.size(0), 1, 28, 28))
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        return correct / total

    @staticmethod
    def Name():
        return "CNN"
Example #21
def train(train_data,
          exp_dir=datetime.now().strftime("detector_model/%Y-%m-%d_%H%M"),
          learning_rate=0.00005,
          rsize=10,
          epochs=1,
          checkpoint_path='',
          seed=6548,
          batch_size=4,
          model_type='cnet',
          model_cap='normal',
          optimizer='radam',
          safe_descent=True,
          activation_type='mish',
          activation_args={},
          io=None,
          dynamic_lr=True,
          dropout=0,
          rotations=False,
          use_batch_norm=True,
          batch_norm_momentum=None,
          batch_norm_affine=True,
          use_gc=True,
          no_lr_schedule=False,
          diff_features_only=False,
          scale_min=1,
          scale_max=1,
          noise=0):

    start_time = time.time()

    scale_min = scale_min if scale_min < 1 else 1
    scale_max = scale_max if scale_max > 1 else 1

    io.cprint("-------------------------------------------------------" +
              "\nexport dir = " + '/checkpoints/' + exp_dir +
              "\nbase_learning_rate = " + str(learning_rate) +
              "\nuse_batch_norm = " + str(use_batch_norm) +
              "\nbatch_norm_momentum = " + str(batch_norm_momentum) +
              "\nbatch_norm_affine = " + str(batch_norm_affine) +
              "\nno_lr_schedule = " + str(no_lr_schedule) + "\nuse_gc = " +
              str(use_gc) + "\nrsize = " + str(rsize) + "\npython_version: " +
              sys.version + "\ntorch_version: " + torch.__version__ +
              "\nnumpy_version: " + np.version.version + "\nmodel_type: " +
              model_type + "\nmodel_cap: " + model_cap + "\noptimizer: " +
              optimizer + "\nactivation_type: " + activation_type +
              "\nsafe_descent: " + str(safe_descent) + "\ndynamic_lr: " +
              str(dynamic_lr) + "\nrotations: " + str(rotations) +
              "\nscaling: " + str(scale_min) + " to " + str(scale_max) +
              "\nnoise: " + str(noise) + "\nepochs = " + str(epochs) +
              (("\ncheckpoint = " +
                checkpoint_path) if checkpoint_path != '' else '') +
              "\nseed = " + str(seed) + "\nbatch_size = " + str(batch_size) +
              "\n#train_data = " +
              str(sum([bin.size(0) for bin in train_data["train_bins"]])) +
              "\n#test_data = " + str(len(train_data["test_samples"])) +
              "\n#validation_data = " + str(len(train_data["val_samples"])) +
              "\n-------------------------------------------------------" +
              "\nstart_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")

    # initialize torch & cuda ---------------------------------------------------------------------

    torch.manual_seed(seed)
    np.random.seed(seed)

    device = utils.getDevice(io)

    # extract train- & test data (and move to device) --------------------------------------------

    pts = train_data["pts"].to(device)
    val_pts = train_data["val_pts"].to(device)

    train_bins = train_data["train_bins"]
    test_samples = train_data["test_samples"]
    val_samples = train_data["val_samples"]

    # the maximum noise offset for each point is equal to the distance to its nearest neighbor
    max_noise = torch.square(pts[train_data["knn"][:, 0]] -
                             pts).sum(dim=1).sqrt()

    # Initialize Model ------------------------------------------------------------------------------

    model_args = {
        'model_type': model_type,
        'model_cap': model_cap,
        'input_channels': pts.size(1),
        'output_channels': 2,
        'rsize': rsize,
        'emb_dims': 1024,
        'activation_type': activation_type,
        'activation_args': activation_args,
        'dropout': dropout,
        'batch_norm': use_batch_norm,
        'batch_norm_affine': batch_norm_affine,
        'batch_norm_momentum': batch_norm_momentum,
        'diff_features_only': diff_features_only
    }

    model = getModel(model_args).to(device)

    # init optimizer & scheduler -------------------------------------------------------------------

    lookahead_sync_period = 6

    opt = None
    if optimizer == 'radam':
        opt = RAdam(model.parameters(),
                    lr=learning_rate,
                    betas=(0.9, 0.999),
                    eps=1e-8,
                    use_gc=use_gc)
    elif optimizer == 'lookahead':
        opt = Ranger(model.parameters(),
                     lr=learning_rate,
                     alpha=0.9,
                     k=lookahead_sync_period)

    # make sure that either a LR schedule is given or dynamic LR is enabled
    assert dynamic_lr or not no_lr_schedule

    scheduler = None if no_lr_schedule else MultiplicativeLR(
        opt, lr_lambda=MultiplicativeAnnealing(epochs))

    # set train settings & load previous model state ------------------------------------------------------------

    checkpoint = getEmptyCheckpoint()
    last_epoch = 0

    if (checkpoint_path != ''):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'][-1])
        opt.load_state_dict(checkpoint['optimizer_state_dict'][-1])
        last_epoch = len(checkpoint['model_state_dict'])
        print('> loaded checkpoint! (%d epochs)' % (last_epoch))

    checkpoint['train_settings'].append({
        'learning_rate': learning_rate,
        'scheduler': scheduler,
        'epochs': epochs,
        'seed': seed,
        'batch_size': batch_size,
        'optimizer': optimizer,
        'safe_descent': str(safe_descent),
        'dynamic_lr': str(dynamic_lr),
        'rotations': str(rotations),
        'scale_min': scale_min,
        'scale_max': scale_max,
        'noise': noise,
        'train_data_count': sum([bin.size(0) for bin in train_data["train_bins"]]),
        'test_data_count': len(train_data["test_samples"]),
        'validation_data_count': len(train_data["val_samples"]),
        'model_args': model_args
    })

    # calculate class weights ---------------------------------------------------------------------

    av_c1_freq = sum([
        torch.sum(bin[:, :, 1]).item() for bin in train_data["train_bins"]
    ]) / sum([bin[:, :, 1].numel() for bin in train_data["train_bins"]])
    class_weights = torch.tensor([av_c1_freq,
                                  1 - av_c1_freq]).float().to(device)

    io.cprint("\nC0 Weight: %.4f" % (class_weights[0].item()))
    io.cprint("C1 Weight: %.4f" % (class_weights[1].item()))

    # Adjust Weights in favor of C1 (edge:true class)
    # class_weights[0] = class_weights[0] / 2
    # class_weights[1] = 1 - class_weights[0]
    # io.cprint("\nAdjusted C0 Weight: %.4f" % (class_weights[0].item()))
    # io.cprint("Adjusted C1 Weight: %.4f" % (class_weights[1].item()))

    # set up report interval (for logging) and batch size -------------------------------------------------------------------

    report_interval = 100

    # begin training ###########################################################################################################################

    io.cprint("\nBeginning Training..\n")

    for epoch in range(last_epoch + 1, last_epoch + epochs + 1):

        io.cprint(
            "Epoch: %d ------------------------------------------------------------------------------------------"
            % (epoch))
        io.cprint("Current LR: %.10f" % (opt.param_groups[0]['lr']))

        model.train()
        opt.zero_grad()

        checkpoint['train_batch_loss'].append([])
        checkpoint['train_batch_N'].append([])
        checkpoint['train_batch_acc'].append([])
        checkpoint['train_batch_C0_acc'].append([])
        checkpoint['train_batch_C1_acc'].append([])
        checkpoint['train_batch_lr_adjust'].append([])
        checkpoint['train_batch_loss_reduction'].append([])
        checkpoint['lr'].append(opt.param_groups[0]['lr'])

        # draw random batches from random bins
        binbatches = utils.drawBinBatches([bin.size(0) for bin in train_bins],
                                          batchsize=batch_size)

        checkpoint['train_batch_N'][-1] = [
            train_bins[bin_id][batch_ids].size(1)
            for (bin_id, batch_ids) in binbatches
        ]

        failed_loss_optims = 0
        cum_lr_adjust_fac = 0
        cum_loss_reduction = 0

        # pre-compute random rotations if needed
        batch_rotations = [None] * len(binbatches)
        if rotations:
            start_rotations = time.time()
            batch_rotations = torch.zeros(
                (len(binbatches), batch_size, pts.size(1), pts.size(1)),
                device=device)
            for i in range(len(binbatches)):
                for j in range(batch_size):
                    batch_rotations[i,
                                    j] = utils.getRandomRotation(pts.size(1),
                                                                 device=device)
            print("created batch rotations (%ds)" %
                  (time.time() - start_rotations))

        b = 0  # batch counter

        train_start = time.time()

        for (bin_id, batch_ids) in binbatches:

            b += 1

            batch_pts_ids = train_bins[bin_id][batch_ids][:, :,
                                                          0]  # size: (B x N)
            batch_input = pts[batch_pts_ids]  # size: (B x N x d)
            batch_target = train_bins[bin_id][batch_ids][:, :, 1].to(
                device)  # size: (B x N)

            if batch_rotations[b - 1] is not None:
                batch_input = batch_input.matmul(batch_rotations[b - 1])

            if noise > 0:
                noise_v = torch.randn(
                    batch_input.size(),
                    device=batch_input.device)  # size: (B x N x d)
                noise_v.div_(
                    torch.square(noise_v).sum(
                        dim=2).sqrt()[:, :, None])  # norm to unit vectors
                # in-place op so the noise is actually applied to batch_input
                batch_input.addcmul_(noise_v,
                                     max_noise[batch_pts_ids][:, :, None],
                                     value=noise)

            if scale_min < 1 or scale_max > 1:
                # batch_scales = scale_min + torch.rand(batch_input.size(0), device=batch_input.device) * (scale_max - scale_min)
                batch_scales = torch.rand(batch_input.size(0),
                                          device=batch_input.device)
                batch_scales.mul_(scale_max - scale_min)
                batch_scales.add_(scale_min)
                # in-place multiply so the random scaling takes effect
                batch_input.mul_(batch_scales[:, None, None])

            batch_input = batch_input.transpose(1, 2)  # size: (B x d x N)

            # prediction & loss ----------------------------------------

            batch_prediction = model(batch_input).transpose(
                1, 2)  # size: (B x N x 2)
            batch_loss = cross_entropy(batch_prediction.reshape(-1, 2),
                                       batch_target.view(-1),
                                       class_weights,
                                       reduction='mean')
            batch_loss.backward()

            checkpoint['train_batch_loss'][-1].append(batch_loss.item())

            new_loss = 0.0
            lr_adjust = 1.0
            loss_reduction = 0.0

            # if safe descent is enabled, try to optimize the descent step so that a reduction in loss is guaranteed
            if safe_descent:

                # create backups to restore states before the optimizer step
                model_state_backup = copy.deepcopy(model.state_dict())
                opt_state_backup = copy.deepcopy(opt.state_dict())

                # make an optimizer step
                opt.step()

                # in each iteration, check if the optimizer step gave an improvement;
                # if not, restore the original states, reduce the learning rate and try again
                # no gradient needed for the plain loss calculation
                with torch.no_grad():
                    for i in range(10):

                        # new_batch_prediction = model(batch_input).transpose(1,2).contiguous()
                        new_batch_prediction = model(batch_input).transpose(
                            1, 2)
                        new_loss = cross_entropy(new_batch_prediction.reshape(
                            -1, 2),
                                                 batch_target.view(-1),
                                                 class_weights,
                                                 reduction='mean').item()

                        # if the model performs better now we continue, if not we try a smaller learning step
                        if (new_loss < batch_loss.item()):
                            # print("lucky! (%f -> %f) reduction: %.4f%%" % (batch_loss.item(), new_loss, 100 * (batch_loss.item()-new_loss) / batch_loss.item()))
                            break
                        else:
                            # print("try again.. (%f -> %f)" % (batch_loss.item(), new_loss))
                            model.load_state_dict(model_state_backup)
                            opt.load_state_dict(opt_state_backup)
                            lr_adjust *= 0.7
                            opt.step(lr_adjust=lr_adjust)
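                            # note: relies on a custom optimizer whose step() accepts
                            # an lr_adjust factor (not part of the standard torch.optim API)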

                loss_reduction = 100 * (batch_loss.item() -
                                        new_loss) / batch_loss.item()

                if new_loss >= batch_loss.item():
                    failed_loss_optims += 1
                else:
                    cum_lr_adjust_fac += lr_adjust
                    cum_loss_reduction += loss_reduction

            else:

                cum_lr_adjust_fac += lr_adjust
                opt.step()

            checkpoint['train_batch_lr_adjust'][-1].append(lr_adjust)
            checkpoint['train_batch_loss_reduction'][-1].append(loss_reduction)

            # reset gradients
            opt.zero_grad()

            # make class prediction and save stats -----------------------

            success_vector = torch.argmax(batch_prediction,
                                          dim=2) == batch_target

            c0_idx = batch_target == 0
            c1_idx = batch_target == 1

            checkpoint['train_batch_acc'][-1].append(
                torch.sum(success_vector).item() / success_vector.numel())
            # guard against batches that contain no points of a given class
            checkpoint['train_batch_C0_acc'][-1].append(
                torch.sum(success_vector[c0_idx]).item() /
                max(torch.sum(c0_idx).item(), 1))
            checkpoint['train_batch_C1_acc'][-1].append(
                torch.sum(success_vector[c1_idx]).item() /
                max(torch.sum(c1_idx).item(), 1))

            # statistics calculation and output -------------------------

            if b % report_interval == 0:

                last_100_loss = sum(checkpoint['train_batch_loss'][-1]
                                    [b - report_interval:b]) / report_interval
                last_100_acc = sum(checkpoint['train_batch_acc'][-1]
                                   [b - report_interval:b]) / report_interval
                last_100_acc_c0 = sum(
                    checkpoint['train_batch_C0_acc'][-1]
                    [b - report_interval:b]) / report_interval
                last_100_acc_c1 = sum(
                    checkpoint['train_batch_C1_acc'][-1]
                    [b - report_interval:b]) / report_interval

                io.cprint(
                    '  Batch %4d to %4d | loss: %.5f%1s| acc: %.4f%1s| C0 acc: %.4f%1s| C1 acc: %.4f%1s| E%3d | T:%5ds | Failed Optims: %3d (%05.2f%%) | Av. Adjust LR: %.6f | Av. Loss Reduction: %07.4f%%'
                    % (b - (report_interval - 1), b,
                       last_100_loss,
                       '+' if epoch > 1 and last_100_loss < checkpoint['train_loss'][-1] else '',
                       last_100_acc,
                       '+' if epoch > 1 and last_100_acc > checkpoint['train_acc'][-1] else '',
                       last_100_acc_c0,
                       '+' if epoch > 1 and last_100_acc_c0 > checkpoint['train_C0_acc'][-1] else '',
                       last_100_acc_c1,
                       '+' if epoch > 1 and last_100_acc_c1 > checkpoint['train_C1_acc'][-1] else '',
                       epoch,
                       time.time() - train_start,
                       failed_loss_optims,
                       100 * (failed_loss_optims / report_interval),
                       (cum_lr_adjust_fac / (report_interval - failed_loss_optims)
                        if failed_loss_optims < report_interval else -1),
                       (cum_loss_reduction / (report_interval - failed_loss_optims)
                        if failed_loss_optims < report_interval else -1)))

                failed_loss_optims = 0
                cum_lr_adjust_fac = 0
                cum_loss_reduction = 0

        checkpoint['train_loss'].append(
            sum(checkpoint['train_batch_loss'][-1]) / b)
        checkpoint['train_acc'].append(
            sum(checkpoint['train_batch_acc'][-1]) / b)
        checkpoint['train_C0_acc'].append(
            sum(checkpoint['train_batch_C0_acc'][-1]) / b)
        checkpoint['train_C1_acc'].append(
            sum(checkpoint['train_batch_C1_acc'][-1]) / b)
        checkpoint['train_time'].append(time.time() - train_start)

        io.cprint(
            '----\n  TRN | time: %5ds | loss: %.10f | acc: %.4f | C0 acc: %.4f | C1 acc: %.4f'
            % (checkpoint['train_time'][-1], checkpoint['train_loss'][-1],
               checkpoint['train_acc'][-1], checkpoint['train_C0_acc'][-1],
               checkpoint['train_C1_acc'][-1]))

        torch.cuda.empty_cache()

        ####################
        # Test & Validation
        ####################

        with torch.no_grad():

            if use_batch_norm:

                model.eval_bn()

                eval_bn_start = time.time()

                # run through all train samples again to accumulate layer-wise input distribution statistics (mean and variance) with fixed weights
                # these statistics are later used for the BatchNorm layers during inference
                for (bin_id, batch_ids) in binbatches:

                    batch_pts_ids = train_bins[bin_id][
                        batch_ids][:, :, 0]  # size: (B x N)
                    batch_input = pts[batch_pts_ids]  # size: (B x N x d)

                    # batch_input = batch_input.transpose(1,2).contiguous()             # size: (B x d x N)
                    batch_input = batch_input.transpose(1,
                                                        2)  # size: (B x d x N)
                    model(batch_input)

                io.cprint('Accumulated BN Layer statistics (%ds)' %
                          (time.time() - eval_bn_start))

            model.eval()

            if len(test_samples) > 0:

                test_start = time.time()

                test_loss, test_acc, test_acc_c0, test_acc_c1 = getTestLoss(
                    pts, test_samples, model, class_weights)

                checkpoint['test_loss'].append(test_loss)
                checkpoint['test_acc'].append(test_acc)
                checkpoint['test_C0_acc'].append(test_acc_c0)
                checkpoint['test_C1_acc'].append(test_acc_c1)

                checkpoint['test_time'].append(time.time() - test_start)

                io.cprint(
                    '  TST | time: %5ds | loss: %.10f | acc: %.4f | C0 acc: %.4f | C1 acc: %.4f'
                    %
                    (checkpoint['test_time'][-1], checkpoint['test_loss'][-1],
                     checkpoint['test_acc'][-1], checkpoint['test_C0_acc'][-1],
                     checkpoint['test_C1_acc'][-1]))

            else:
                io.cprint('  TST | n/a (no samples)')

            if len(val_samples) > 0:

                val_start = time.time()

                val_loss, val_acc, val_acc_c0, val_acc_c1 = getTestLoss(
                    val_pts, val_samples, model, class_weights)

                checkpoint['val_loss'].append(val_loss)
                checkpoint['val_acc'].append(val_acc)
                checkpoint['val_C0_acc'].append(val_acc_c0)
                checkpoint['val_C1_acc'].append(val_acc_c1)

                checkpoint['val_time'].append(time.time() - val_start)

                io.cprint(
                    '  VAL | time: %5ds | loss: %.10f | acc: %.4f | C0 acc: %.4f | C1 acc: %.4f'
                    % (checkpoint['val_time'][-1], checkpoint['val_loss'][-1],
                       checkpoint['val_acc'][-1], checkpoint['val_C0_acc'][-1],
                       checkpoint['val_C1_acc'][-1]))

            else:
                io.cprint('  VAL | n/a (no samples)')

        ####################
        # Scheduler Step
        ####################

        if not no_lr_schedule:
            scheduler.step()
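        # dynamic lr: scale the base lr by 0.5 * (1 + mean batch lr_adjust), i.e.
        # halfway between "no change" and the average safe-descent adjustment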

        if epoch > 1 and dynamic_lr and sum(
                checkpoint['train_batch_lr_adjust'][-1]) > 0:
            io.cprint("----\n  dynamic lr adjust: %.10f" %
                      (0.5 *
                       (1 + sum(checkpoint['train_batch_lr_adjust'][-1]) /
                        len(checkpoint['train_batch_lr_adjust'][-1]))))
            for param_group in opt.param_groups:
                param_group['lr'] *= 0.5 * (
                    1 + sum(checkpoint['train_batch_lr_adjust'][-1]) /
                    len(checkpoint['train_batch_lr_adjust'][-1]))

        # Save model and optimizer state ..
        checkpoint['model_state_dict'].append(copy.deepcopy(
            model.state_dict()))
        checkpoint['optimizer_state_dict'].append(
            copy.deepcopy(opt.state_dict()))

        torch.save(checkpoint, exp_dir + '/detector_checkpoints.t7')

    io.cprint("\n-------------------------------------------------------" +
              ("\ntotal_time: %.2fh" % ((time.time() - start_time) / 3600)) +
              ("\ntrain_time: %.2fh" %
               (sum(checkpoint['train_time']) / 3600)) +
              ("\ntest_time: %.2fh" % (sum(checkpoint['test_time']) / 3600)) +
              ("\nval_time: %.2fh" % (sum(checkpoint['val_time']) / 3600)) +
              "\n-------------------------------------------------------" +
              "\nend_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")
Exemple #22
0
def record_lr(
    model: torch.nn.Module,
    train_loader: DataLoader,
    batch_transforms,
    optimizer,
    start_lr: float = 1e-7,
    end_lr: float = 1,
    num_it: int = 100,
    amp: bool = False,
):
    """Gridsearch the optimal learning rate for the training.
    Adapted from https://github.com/frgfm/Holocron/blob/master/holocron/trainer/core.py
    """

    if num_it > len(train_loader):
        raise ValueError(
            "the value of `num_it` needs to be lower than the number of available batches"
        )

    model = model.train()
    # Update param groups & LR
    optimizer.defaults["lr"] = start_lr
    for pgroup in optimizer.param_groups:
        pgroup["lr"] = start_lr

    gamma = (end_lr / start_lr)**(1 / (num_it - 1))
    scheduler = MultiplicativeLR(optimizer, lambda step: gamma)

    lr_recorder = [start_lr * gamma**idx for idx in range(num_it)]
    loss_recorder = []

    if amp:
        scaler = torch.cuda.amp.GradScaler()

    for batch_idx, (images, targets) in enumerate(train_loader):
        if torch.cuda.is_available():
            images = images.cuda()

        images = batch_transforms(images)

        # Forward, Backward & update
        optimizer.zero_grad()
        if amp:
            with torch.cuda.amp.autocast():
                train_loss = model(images, targets)["loss"]
            scaler.scale(train_loss).backward()
            # Gradient clipping
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            # Update the params
            scaler.step(optimizer)
            scaler.update()
        else:
            train_loss = model(images, targets)["loss"]
            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
        # Update LR
        scheduler.step()

        # Record
        if not torch.isfinite(train_loss):
            if batch_idx == 0:
                raise ValueError("loss value is NaN or inf.")
            else:
                break
        loss_recorder.append(train_loss.item())
        # Stop after the number of iterations
        if batch_idx + 1 == num_it:
            break

    return lr_recorder[:len(loss_recorder)], loss_recorder
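As a usage note, the recorded (lr, loss) curve is typically inspected to pick a learning rate near the steepest loss decrease; the small helper below (suggest_lr, an illustrative name not found in the snippet above) sketches one such heuristic.

from typing import List


def suggest_lr(lr_recorder: List[float], loss_recorder: List[float]) -> float:
    """Return the learning rate at the steepest drop of the recorded loss curve."""
    deltas = [b - a for a, b in zip(loss_recorder[:-1], loss_recorder[1:])]
    return lr_recorder[deltas.index(min(deltas))]


# e.g.: lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer)
#       print("suggested lr: %.2e" % suggest_lr(lrs, losses))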
Exemple #23
0
class DeepSeqNet(Module):

    def __init__(self):
        super(DeepSeqNet, self).__init__()

    def _compile(self, optimizer, learning_rate):
        self._set_optim(optimizer, learning_rate)
        self._set_scheduler()
        self._set_criterion()

    def _set_optim(self, optimizer, learning_rate):
        optimizer = optimizer.lower()
        if optimizer == "adam":
            self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        elif optimizer == "rmsprop":
            self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate)
        else:
            self.optimizer = optim.SGD(self.parameters(), lr=learning_rate)

    def _set_scheduler(self):
        self.scheduler = MultiplicativeLR(self.optimizer, lr_lambda=(lambda x: 0.95))

    def _set_criterion(self):
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x_txt, x_num):

        txt_features = self.txt_net_forward(x_txt)
        num_features = self.num_net_forward(x_num)

        features = torch.cat((txt_features, num_features), 1)
        out_features = self.dropout(features)

        logits = self.fc(out_features)

        return logits

    def txt_net_forward(self, x_txt):
        raise NotImplementedError()

    def num_net_forward(self, x_num):
        for linear in self.linear_layers:
            x_num = self.activation_layer(linear(x_num))
        return x_num

    def fit(self, x_txt, x_num, y):

        self.train()

        self.optimizer.zero_grad()

        y_ = self.forward(x_txt, x_num)

        loss = self.criterion(y_, y)
        loss.backward()

        self.optimizer.step()

        return loss

    def evaluate(self, data_iterator):

        self.eval()

        labels, preds = [], []
        for _, (x_txt, x_num, y) in enumerate(data_iterator):

            x_txt, x_num = x_txt.t(), x_num.t()
            if torch.cuda.is_available():
                x_txt, x_num = x_txt.cuda(), x_num.cuda()

            y_ = self.forward(x_txt, x_num)
            pred = torch.argmax(y_, 1)

            preds.extend(pred.cpu().numpy())
            labels.extend(y.numpy())

        score = accuracy_score(labels, np.array(preds).flatten())

        return score

    def run_epoch(self, train_iterator, val_iterator):

        train_losses = []
        val_accuracies = []
        losses = []
        for i, (x_txt, x_num, y) in enumerate(train_iterator):

            x_txt, x_num = x_txt.t(), x_num.t()
            if torch.cuda.is_available():
                x_txt, x_num = x_txt.cuda(), x_num.cuda()
                y = y.cuda()

            loss = self.fit(x_txt, x_num, y)
            losses.append(loss.item())

            if i % 100 == 0 and i != 0:
                avg_train_loss = float(np.mean(losses))
                train_losses.append(avg_train_loss)
                losses = []

                val_accuracy = self.evaluate(val_iterator)
                val_accuracies.append(val_accuracy)
                print("Iteration: %4d | train loss: %3.2f | val acc.: %.2f" % ((i + 1),
                                                                               avg_train_loss * 100,
                                                                               val_accuracy * 100))

        # Run the scheduler to reduce the learning rate
        self.scheduler.step()

        return train_losses, val_accuracies
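DeepSeqNet leaves txt_net_forward abstract and expects subclasses to provide linear_layers, activation_layer, dropout and fc before calling _compile. A toy subclass could look like the sketch below; BagOfEmbeddingsNet and all layer sizes are made up for illustration, it assumes the DeepSeqNet class above is in scope, and the exact input shapes depend on the data pipeline, which this snippet does not show.

import torch.nn as nn


class BagOfEmbeddingsNet(DeepSeqNet):
    """Toy concrete subclass: mean-pooled embeddings for the text branch,
    one hidden layer for the numeric branch."""

    def __init__(self, vocab_size, num_features, n_classes,
                 embed_dim=64, hidden_dim=32, optimizer="adam", lr=1e-3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # attributes expected by DeepSeqNet.num_net_forward / forward
        self.linear_layers = nn.ModuleList([nn.Linear(num_features, hidden_dim)])
        self.activation_layer = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(embed_dim + hidden_dim, n_classes)
        self._compile(optimizer, lr)

    def txt_net_forward(self, x_txt):
        # token ids -> mean-pooled embeddings over the sequence dimension
        return self.embedding(x_txt).mean(dim=0)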
Exemple #24
0
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = MultiplicativeLR(optimizer,
                                     lr_lambda=lambda epoch: self.decay)
        return [optimizer], [scheduler]
Exemple #25
0
    def _set_scheduler(self):
        self.scheduler = MultiplicativeLR(self.optimizer, lr_lambda=(lambda x: 0.95))
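Both _set_scheduler snippets above pass a constant factor to MultiplicativeLR, which multiplies the current learning rate by that factor on every scheduler step, so after n steps the rate is lr_0 * 0.95**n. A minimal self-contained check (the toy parameter and numbers are chosen here purely for illustration):

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiplicativeLR

param = torch.nn.Parameter(torch.zeros(1))
opt = SGD([param], lr=0.1)
scheduler = MultiplicativeLR(opt, lr_lambda=lambda epoch: 0.95)

for step in range(3):
    opt.step()       # no-op here (no gradients), but keeps the expected call order
    scheduler.step()
    print(opt.param_groups[0]["lr"])  # ~0.095, ~0.09025, ~0.0857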