Example #1
def create_optimizer(args, model: Net):
    from torch.optim import Adam
    from torch.optim.lr_scheduler import StepLR

    opt1 = Adam(model.partial_parameters(True), lr=args.lr_ddr)
    opt2 = Adam(model.partial_parameters(False), lr=args.lr)

    lrs1 = StepLR(opt1, args.lr_steps, args.lr_gamma)
    lrs1.last_epoch = args.start_epoch - 1
    lrs2 = StepLR(opt2, args.lr_steps, args.lr_gamma)
    lrs2.last_epoch = args.start_epoch - 1
    return opt1, opt2, lrs1, lrs2
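A possible way to drive the two optimizer/scheduler pairs returned above, assuming hypothetical `args.start_epoch`/`args.num_epochs` fields and a made-up `train_one_epoch` helper that steps both optimizers per batch; both schedulers advance once per epoch so each parameter group keeps its own decayed learning rate:

# Hypothetical driver loop for the objects returned above (not from the source).
opt1, opt2, lrs1, lrs2 = create_optimizer(args, model)
for epoch in range(args.start_epoch, args.num_epochs):
    train_one_epoch(model, opt1, opt2, epoch)  # assumed helper: calls opt1.step()/opt2.step() per batch
    lrs1.step()  # decay both parameter groups on the epoch boundary
    lrs2.step()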
Example #2
def train(args):
    """ The function to run the training loop.
    Args:
        dataset: The dataset is provided by ElasticDL for the elastic training.
        Now, the dataset if tf.data.Dataset and we need to convert
        the data in dataset to torch.tensor. Later, ElasticDL will
        pass a torch.utils.data.DataLoader.
        elastic_controller: The controller for elastic training.
    """
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    train_data = torchvision.datasets.ImageFolder(args.training_data)
    test_data = torchvision.datasets.ImageFolder(args.validation_data)

    allreduce_controller = create_elastic_controller(
        batch_size=args.batch_size,
        dataset_size=len(train_data.imgs),
        num_epochs=args.num_epochs,
        shuffle=True,
    )
    train_dataset = ElasticDataset(
        train_data.imgs, allreduce_controller.data_shard_service
    )
    train_loader = DataLoader(
        dataset=train_dataset, batch_size=args.batch_size, num_workers=2
    )

    test_dataset = ElasticDataset(test_data.imgs)
    test_loader = DataLoader(
        dataset=test_dataset, batch_size=args.batch_size, num_workers=2
    )

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate)
    optimizer = DistributedOptimizer(optimizer, fixed_global_batch_size=True)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

    # Register the model and optimizer to be broadcast.
    allreduce_controller.set_broadcast_model(model)
    allreduce_controller.set_broadcast_optimizer(optimizer)
    epoch = 0
    # Wrap the per-batch training function so it runs under the elastic controller.
    elastic_train_one_batch = allreduce_controller.elastic_run(train_one_batch)
    # Move the model to the selected device (respects --no_cuda).
    model.to(device)
    with allreduce_controller.scope():
        for batch_idx, (data, target) in enumerate(train_loader):
            model.train()
            target = target.type(torch.LongTensor)
            data, target = data.to(device), target.to(device)
            loss = elastic_train_one_batch(model, optimizer, data, target)
            print("loss = {}, step = {}".format(loss, batch_idx))
            new_epoch = allreduce_controller.get_current_epoch()
            if new_epoch > epoch:
                epoch = new_epoch
                # Set epoch of the scheduler
                scheduler.last_epoch = epoch - 1
                scheduler.step()
                test(model, device, test_loader)
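Example #2 wraps a `train_one_batch` function that is never shown. A minimal sketch of what such a function could look like, matching the call signature used above; the loss choice (`F.nll_loss`, which assumes `Net` ends in `log_softmax`) is an assumption, not part of the source:

import torch.nn.functional as F

def train_one_batch(model, optimizer, data, target):
    # Standard supervised step; the elastic wrapper handles the AllReduce details.
    optimizer.zero_grad()
    output = model(data)
    loss = F.nll_loss(output, target)  # assumed loss; swap in whatever Net expects
    loss.backward()
    optimizer.step()
    return loss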
Example #3
import math

from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau, StepLR


def create_lr_scheduler(optimizer, configs):
    """Create learning rate scheduler for training process"""
    if configs.lr_type == 'step_lr':
        lr_scheduler = StepLR(optimizer,
                              step_size=configs.lr_step_size,
                              gamma=configs.lr_factor)
    elif configs.lr_type == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         factor=configs.lr_factor,
                                         patience=configs.lr_patience)
    elif configs.lr_type == 'cosin':
        # Scheduler https://arxiv.org/pdf/1812.01187.pdf
        lf = lambda x: (((1 + math.cos(x * math.pi / configs.num_epochs)) / 2)
                        **1.0) * 0.9 + 0.1  # cosine
        lr_scheduler = LambdaLR(optimizer, lr_lambda=lf)
        lr_scheduler.last_epoch = configs.start_epoch - 1  # do not move
        # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
        # plot_lr_scheduler(optimizer, scheduler, epochs)
    else:
        raise TypeError('Unknown lr_type: {}'.format(configs.lr_type))

    return lr_scheduler
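A hypothetical usage of the factory above; the config field names follow the function, but the values and the tiny model are made up for illustration:

import torch
from argparse import Namespace

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
configs = Namespace(lr_type='step_lr', lr_step_size=10, lr_factor=0.1,
                    lr_patience=5, num_epochs=100, start_epoch=0)
lr_scheduler = create_lr_scheduler(optimizer, configs)

for epoch in range(configs.num_epochs):
    # ... train for one epoch ...
    lr_scheduler.step()  # for 'plateau', pass the monitored metric instead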
Example #4
def _train(train_img_path, train_txt_path, val_img_path, val_txt_path,
           path_to_log_dir, path_to_restore_checkpoint_file, training_options):
    batch_size = training_options['batch_size']
    initial_learning_rate = training_options['learning_rate']
    initial_patience = training_options['patience']
    num_steps_to_show_loss = 100
    num_steps_to_check = 1000

    step = 0
    patience = initial_patience
    best_accuracy = 0.0
    duration = 0.0

    model = Model(21)
    model.cuda()

    transform = transforms.Compose([
        transforms.Resize([285, 285]),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
    train_loader = torch.utils.data.DataLoader(BarcodeDataset(
        train_img_path, train_txt_path, transform),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)
    evaluator = Evaluator(val_img_path, val_txt_path)
    optimizer = optim.SGD(model.parameters(),
                          lr=initial_learning_rate,
                          momentum=0.9,
                          weight_decay=0.0005)
    scheduler = StepLR(optimizer,
                       step_size=training_options['decay_steps'],
                       gamma=training_options['decay_rate'])

    if path_to_restore_checkpoint_file is not None:
        assert os.path.isfile(
            path_to_restore_checkpoint_file
        ), '%s not found' % path_to_restore_checkpoint_file
        step = model.restore(path_to_restore_checkpoint_file)
        scheduler.last_epoch = step
        print('Model restored from file: %s' % path_to_restore_checkpoint_file)

    path_to_losses_npy_file = os.path.join(path_to_log_dir, 'losses.npy')
    if os.path.isfile(path_to_losses_npy_file):
        losses = np.load(path_to_losses_npy_file)
    else:
        losses = np.empty([0], dtype=np.float32)

    while True:
        for batch_idx, (images, digits_labels) in enumerate(train_loader):
            start_time = time.time()
            images, digits_labels = images.cuda(), [
                digit_label.cuda() for digit_label in digits_labels
            ]
            (digit2_logits, digit3_logits, digit4_logits, digit5_logits,
             digit6_logits, digit7_logits, digit8_logits, digit9_logits,
             digit10_logits, digit11_logits, digit12_logits,
             digit13_logits) = model.train()(images)
            loss = _loss(digit2_logits, digit3_logits, digit4_logits,
                         digit5_logits, digit6_logits, digit7_logits,
                         digit8_logits, digit9_logits, digit10_logits,
                         digit11_logits, digit12_logits, digit13_logits,
                         digits_labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            step += 1
            duration += time.time() - start_time

            if step % num_steps_to_show_loss == 0:
                examples_per_sec = batch_size * num_steps_to_show_loss / duration
                duration = 0.0
                print(
                    '=> %s: step %d, loss = %f, learning_rate = %f (%.1f examples/sec)'
                    % (datetime.now(), step, loss.item(),
                       scheduler.get_lr()[0], examples_per_sec))

            if step % num_steps_to_check != 0:
                continue

            losses = np.append(losses, loss.item())
            np.save(path_to_losses_npy_file, losses)

            print('=> Evaluating on validation dataset...')
            accuracy = evaluator.evaluate(model)
            print('==> accuracy = %f, best accuracy %f' %
                  (accuracy, best_accuracy))

            if accuracy > best_accuracy:
                path_to_checkpoint_file = model.store(path_to_log_dir,
                                                      step=step)
                print('=> Model saved to file: %s' % path_to_checkpoint_file)
                patience = initial_patience
                best_accuracy = accuracy
            else:
                patience -= 1

            print('=> patience = %d' % patience)
            if patience == 0:
                return
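Examples #4 and #5 call `scheduler.step()` once per batch and restore with `scheduler.last_epoch = step`, so `decay_steps` counts optimizer iterations rather than epochs. A small self-contained sketch of that per-step decay pattern (the toy parameter and numbers are illustrative only):

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = SGD([param], lr=0.1)
scheduler = StepLR(optimizer, step_size=1000, gamma=0.5)  # halve the lr every 1000 *steps*

for step in range(3000):
    optimizer.step()   # one batch worth of optimization
    scheduler.step()   # advance the per-step decay schedule
print(optimizer.param_groups[0]['lr'])  # 0.1 * 0.5 ** 3 == 0.0125
# When resuming from a checkpoint, setting scheduler.last_epoch = restored_step
# (as in the examples above) realigns this schedule with the saved step count.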
Example #5
def _train(path_to_train_lmdb_dir, path_to_val_lmdb_dir, path_to_log_dir,
           path_to_restore_checkpoint_file, training_options, max_steps):
    batch_size = training_options['batch_size']
    initial_learning_rate = training_options['learning_rate']
    initial_patience = training_options['patience']
    num_steps_to_show_loss = 100
    num_steps_to_check = training_options["validation_interval"]

    step = 0
    patience = initial_patience
    best_accuracy = 0.0
    duration = 0.0

    model = Model()
    model.cuda()

    transform = transforms.Compose([
        transforms.RandomCrop([54, 54]),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
    train_loader = torch.utils.data.DataLoader(Dataset(path_to_train_lmdb_dir, transform),
                                               batch_size=batch_size, shuffle=True,
                                               num_workers=0, pin_memory=True)
    evaluator = Evaluator(path_to_val_lmdb_dir)
    optimizer = optim.SGD(model.parameters(), lr=initial_learning_rate, momentum=0.9, weight_decay=0.0005)
    scheduler = StepLR(optimizer, step_size=training_options['decay_steps'], gamma=training_options['decay_rate'])

    if path_to_restore_checkpoint_file is not None:
        assert os.path.isfile(path_to_restore_checkpoint_file), '%s not found' % path_to_restore_checkpoint_file
        step = model.restore(path_to_restore_checkpoint_file)
        scheduler.last_epoch = step
        print('Model restored from file: %s' % path_to_restore_checkpoint_file)

    path_to_losses_npy_file = os.path.join(path_to_log_dir, 'losses.npy')
    if os.path.isfile(path_to_losses_npy_file):
        losses = np.load(path_to_losses_npy_file)
    else:
        losses = np.empty([0], dtype=np.float32)

    path_to_test_losses_npy_file = os.path.join(path_to_log_dir, 'test_losses.npy')
    if os.path.isfile(path_to_test_losses_npy_file):
        test_losses = np.load(path_to_test_losses_npy_file)
    else:
        test_losses = np.empty([0], dtype=np.float32)

    train_loss_array = []
    val_loss_array = []
    model_checkpoints = []
    model_saved = False

    # Counts validation checks; a checkpoint is saved every second check
    model_save_counter = 0

    while True:
        for batch_idx, (images, length_labels, digits_labels, _) in enumerate(train_loader):
            start_time = time.time()
            images, length_labels, digits_labels = images.cuda(), length_labels.cuda(), [digit_labels.cuda() for digit_labels in digits_labels]
            length_logits, digit1_logits, digit2_logits, digit3_logits, digit4_logits, digit5_logits = model.train()(images)
            loss = _loss(length_logits, digit1_logits, digit2_logits, digit3_logits, digit4_logits, digit5_logits, length_labels, digits_labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            step += 1
            duration += time.time() - start_time

            if step % num_steps_to_show_loss == 0:
                examples_per_sec = batch_size * num_steps_to_show_loss / duration
                duration = 0.0
                print('=> %s: step %d, loss = %f, learning_rate = %f (%.1f examples/sec)' % (
                    datetime.now(), step, loss.item(), scheduler.get_lr()[0], examples_per_sec))

            if step % num_steps_to_check != 0:
                continue

            model_save_counter += 1
            losses = np.append(losses, loss.item())
            np.save(path_to_losses_npy_file, losses)
            train_loss_array.append((step, loss.item()))

            print('=> Evaluating on validation dataset...')
            accuracy, test_loss_args = evaluator.evaluate(model)
            test_loss = _loss(*test_loss_args)
            val_loss_array.append((step, test_loss.item()))

            print('==> accuracy = %f, best accuracy %f' % (accuracy, best_accuracy))
            # print(f'==> loss = {test_loss}')

            # Save a checkpoint every second validation check (and at a few fixed early steps)
            if model_save_counter >= 2 or step in [1000, 2000, 3000, 4000, 5000]:
                path_to_checkpoint_file = model.store(path_to_log_dir, step=step)
                print('=> Model saved to file: %s' % path_to_checkpoint_file)
                model_save_counter = 0
                model_saved = True
                model_checkpoints.append((step, f"model-{step}.pth"))

            if accuracy > best_accuracy:
                patience = initial_patience
                best_accuracy = accuracy
            else:
                patience -= 1

            print("Train losses: ", train_loss_array)
            print("Saved Model Checkpoints: ", model_checkpoints)

            print('=> patience = %d' % patience)
            if patience == 0 or step >= max_steps:
                if not model_saved:
                    path_to_checkpoint_file = model.store(path_to_log_dir, step=step)
                    print('=> Model MANUALLY saved to file: %s' % path_to_checkpoint_file)
                    model_checkpoints.append((step, f"model-{step}.pth"))

                training_output = {
                    "model_checkpoints": model_checkpoints,
                    "train_loss": train_loss_array,
                    "val_loss": val_loss_array,
                }
                print("TRAINING OUTPUT -----------------------------")
                print(training_output)
                return training_output
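Both SVHN-style trainers above stop when a patience counter, reset on every new best validation accuracy, reaches zero. The same early-stopping logic in isolation; the helper name and state dict are illustrative, not from the source:

def should_stop(accuracy, state, initial_patience=100):
    """Hypothetical helper mirroring the patience logic used above."""
    if accuracy > state['best_accuracy']:
        state['best_accuracy'] = accuracy
        state['patience'] = initial_patience   # reset on improvement
    else:
        state['patience'] -= 1                 # otherwise burn one unit of patience
    return state['patience'] == 0

state = {'best_accuracy': 0.0, 'patience': 100}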