def create_optimizer(args, model: Net):
    """Build two Adam optimizers, each paired with a StepLR scheduler.

    Args:
        args: Options object. Reads ``lr_ddr``, ``lr``, ``lr_steps``,
            ``lr_gamma`` and ``start_epoch``.
        model: Network exposing ``partial_parameters(flag)``. The parameters
            returned for ``True`` are optimized with ``args.lr_ddr`` and
            those returned for ``False`` with ``args.lr``.

    Returns:
        Tuple ``(opt1, opt2, lrs1, lrs2)``: the ``True``-group optimizer,
        the ``False``-group optimizer, and their schedulers in that order.
    """
    from torch.optim import Adam
    from torch.optim.lr_scheduler import StepLR

    ddr_optimizer = Adam(model.partial_parameters(True), lr=args.lr_ddr)
    base_optimizer = Adam(model.partial_parameters(False), lr=args.lr)

    schedulers = []
    for optimizer in (ddr_optimizer, base_optimizer):
        scheduler = StepLR(optimizer, args.lr_steps, args.lr_gamma)
        # Position the scheduler's epoch counter for a (possibly resumed) run.
        scheduler.last_epoch = args.start_epoch - 1
        schedulers.append(scheduler)

    ddr_scheduler, base_scheduler = schedulers
    return ddr_optimizer, base_optimizer, ddr_scheduler, base_scheduler
def train(args):
    """Run the elastic allreduce training loop on an image-folder dataset.

    Args:
        args: Parsed options. Reads ``no_cuda``, ``training_data``,
            ``validation_data``, ``batch_size``, ``num_epochs`` and
            ``learning_rate``.

    The training and validation sets are loaded with
    ``torchvision.datasets.ImageFolder``. Training samples are served through
    the elastic controller's data shard service, and the learning rate is
    halved (``StepLR(step_size=1, gamma=0.5)``) each time the controller
    reports a new epoch, after which the model is evaluated with ``test``.
    """
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    train_data = torchvision.datasets.ImageFolder(args.training_data)
    test_data = torchvision.datasets.ImageFolder(args.validation_data)

    allreduce_controller = create_elastic_controller(
        batch_size=args.batch_size,
        dataset_size=len(train_data.imgs),
        num_epochs=args.num_epochs,
        shuffle=True,
    )
    train_dataset = ElasticDataset(
        train_data.imgs, allreduce_controller.data_shard_service
    )
    train_loader = DataLoader(
        dataset=train_dataset, batch_size=args.batch_size, num_workers=2
    )

    test_dataset = ElasticDataset(test_data.imgs)
    test_loader = DataLoader(
        dataset=test_dataset, batch_size=args.batch_size, num_workers=2
    )

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate)
    optimizer = DistributedOptimizer(optimizer, fixed_global_batch_size=True)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

    # Set the model and optimizer to broadcast.
    allreduce_controller.set_broadcast_model(model)
    allreduce_controller.set_broadcast_optimizer(optimizer)
    epoch = 0
    # Use the elastic function to wrap the training function with a batch.
    elastic_train_one_batch = allreduce_controller.elastic_run(train_one_batch)

    # Move the model to the same device the batches are sent to, so that
    # ``args.no_cuda`` is honoured even when a GPU is present.
    model.to(device)

    with allreduce_controller.scope():
        for batch_idx, (data, target) in enumerate(train_loader):
            model.train()
            target = target.type(torch.LongTensor)
            data, target = data.to(device), target.to(device)
            loss = elastic_train_one_batch(model, optimizer, data, target)
            print("loss = {}, step = {}".format(loss, batch_idx))

            new_epoch = allreduce_controller.get_current_epoch()
            if new_epoch > epoch:
                epoch = new_epoch
                # Set epoch of the scheduler
                scheduler.last_epoch = epoch - 1
                scheduler.step()
                test(model, device, test_loader)
def create_lr_scheduler(optimizer, configs):
    """Create learning rate scheduler for training process.

    Args:
        optimizer: The optimizer whose learning rate is scheduled.
        configs: Options object. ``configs.lr_type`` selects the scheduler:

            * ``'step_lr'``: ``StepLR`` using ``lr_step_size`` and
              ``lr_factor``.
            * ``'plateau'``: ``ReduceLROnPlateau`` using ``lr_factor`` and
              ``lr_patience``.
            * ``'cosin'``: ``LambdaLR`` with a cosine factor that goes from
              1.0 at epoch 0 to 0.1 at ``num_epochs``; also reads
              ``start_epoch``.

    Returns:
        The constructed scheduler.

    Raises:
        TypeError: If no supported scheduler type is selected.
    """
    lr_type = configs.lr_type

    if lr_type == 'step_lr':
        return StepLR(optimizer, step_size=configs.lr_step_size,
                      gamma=configs.lr_factor)

    if lr_type == 'plateau':
        return ReduceLROnPlateau(optimizer, factor=configs.lr_factor,
                                 patience=configs.lr_patience)

    # The selector is ``lr_type`` like the other branches. The historical
    # spelling ``optimizer_type == 'cosin'`` is still honoured so existing
    # configurations keep working.
    if lr_type == 'cosin' or getattr(configs, 'optimizer_type', None) == 'cosin':
        # Scheduler https://arxiv.org/pdf/1812.01187.pdf
        def cosine_factor(x):
            # Multiplier applied to the base lr at epoch ``x``.
            return (((1 + math.cos(x * math.pi / configs.num_epochs)) / 2) ** 1.0) * 0.9 + 0.1

        lr_scheduler = LambdaLR(optimizer, lr_lambda=cosine_factor)
        lr_scheduler.last_epoch = configs.start_epoch - 1  # do not move
        # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
        return lr_scheduler

    raise TypeError("Unsupported lr_type: {!r}".format(lr_type))
def _train(train_img_path, train_txt_path, val_img_path, val_txt_path,
           path_to_log_dir, path_to_restore_checkpoint_file,
           training_options):
    """Train the barcode model until validation accuracy stops improving.

    Args:
        train_img_path: Image location passed to ``BarcodeDataset``.
        train_txt_path: Label location passed to ``BarcodeDataset``.
        val_img_path: Image location passed to ``Evaluator``.
        val_txt_path: Label location passed to ``Evaluator``.
        path_to_log_dir: Directory that receives ``losses.npy`` and the
            checkpoints written by ``model.store``.
        path_to_restore_checkpoint_file: Checkpoint to resume from, or
            ``None`` to start from scratch.
        training_options: Dict with the keys ``batch_size``,
            ``learning_rate``, ``patience``, ``decay_steps`` and
            ``decay_rate``.

    Returns:
        None. The function returns once ``patience`` consecutive validation
        passes have failed to beat the best accuracy.

    The model and every batch are moved with ``.cuda()``, so a CUDA device
    is required.
    """
    batch_size = training_options['batch_size']
    initial_learning_rate = training_options['learning_rate']
    initial_patience = training_options['patience']
    num_steps_to_show_loss = 100
    num_steps_to_check = 1000

    step = 0
    patience = initial_patience
    best_accuracy = 0.0
    duration = 0.0

    model = Model(21)
    model.cuda()

    transform = transforms.Compose([
        transforms.Resize([285, 285]),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
    train_loader = torch.utils.data.DataLoader(
        BarcodeDataset(train_img_path, train_txt_path, transform),
        batch_size=batch_size, shuffle=True,
        num_workers=4, pin_memory=True)
    evaluator = Evaluator(val_img_path, val_txt_path)
    optimizer = optim.SGD(model.parameters(), lr=initial_learning_rate,
                          momentum=0.9, weight_decay=0.0005)
    scheduler = StepLR(optimizer, step_size=training_options['decay_steps'],
                       gamma=training_options['decay_rate'])

    if path_to_restore_checkpoint_file is not None:
        assert os.path.isfile(
            path_to_restore_checkpoint_file
        ), '%s not found' % path_to_restore_checkpoint_file
        step = model.restore(path_to_restore_checkpoint_file)
        # NOTE(review): this only repositions the scheduler's step counter;
        # confirm the optimizer's current lr is what a resumed run expects.
        scheduler.last_epoch = step
        print('Model restored from file: %s' % path_to_restore_checkpoint_file)

    path_to_losses_npy_file = os.path.join(path_to_log_dir, 'losses.npy')
    if os.path.isfile(path_to_losses_npy_file):
        losses = np.load(path_to_losses_npy_file)
    else:
        losses = np.empty([0], dtype=np.float32)

    while True:
        for images, digits_labels in train_loader:
            start_time = time.time()
            images, digits_labels = images.cuda(), [
                digit_label.cuda() for digit_label in digits_labels
            ]
            # The model yields one logits output per digit position; they are
            # forwarded to the loss in order, followed by the labels.
            digit_logits = model.train()(images)
            loss = _loss(*digit_logits, digits_labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            step += 1
            duration += time.time() - start_time

            if step % num_steps_to_show_loss == 0:
                examples_per_sec = batch_size * num_steps_to_show_loss / duration
                duration = 0.0
                # Read the rate actually applied by the optimizer; calling
                # scheduler.get_lr() outside step() is unreliable on current
                # PyTorch releases.
                current_lr = optimizer.param_groups[0]['lr']
                print(
                    '=> %s: step %d, loss = %f, learning_rate = %f (%.1f examples/sec)'
                    % (datetime.now(), step, loss.item(), current_lr,
                       examples_per_sec))

            if step % num_steps_to_check != 0:
                continue

            losses = np.append(losses, loss.item())
            np.save(path_to_losses_npy_file, losses)

            print('=> Evaluating on validation dataset...')
            accuracy = evaluator.evaluate(model)
            print('==> accuracy = %f, best accuracy %f' % (accuracy, best_accuracy))

            if accuracy > best_accuracy:
                # Only improved models are checkpointed.
                path_to_checkpoint_file = model.store(path_to_log_dir, step=step)
                print('=> Model saved to file: %s' % path_to_checkpoint_file)
                patience = initial_patience
                best_accuracy = accuracy
            else:
                patience -= 1

            print('=> patience = %d' % patience)
            if patience == 0:
                return
def _train(path_to_train_lmdb_dir, path_to_val_lmdb_dir, path_to_log_dir,
           path_to_restore_checkpoint_file, training_options, max_steps):
    """Train the model and return checkpoint and loss bookkeeping.

    Args:
        path_to_train_lmdb_dir: Location passed to the training ``Dataset``.
        path_to_val_lmdb_dir: Location passed to ``Evaluator``.
        path_to_log_dir: Directory that receives ``losses.npy`` and the
            checkpoints written by ``model.store``.
        path_to_restore_checkpoint_file: Checkpoint to resume from, or
            ``None`` to start from scratch.
        training_options: Dict with the keys ``batch_size``,
            ``learning_rate``, ``patience``, ``validation_interval``,
            ``decay_steps`` and ``decay_rate``.
        max_steps: Training stops at the first validation pass whose step
            count is at least this value.

    Returns:
        Dict with ``"model_checkpoints"`` (list of ``(step, filename)``),
        ``"train_loss"`` and ``"val_loss"`` (lists of ``(step, loss)``),
        returned when patience reaches zero or ``max_steps`` is reached.

    The model and every batch are moved with ``.cuda()``, so a CUDA device
    is required.
    """
    batch_size = training_options['batch_size']
    initial_learning_rate = training_options['learning_rate']
    initial_patience = training_options['patience']
    num_steps_to_show_loss = 100
    num_steps_to_check = training_options["validation_interval"]
    # Steps at which a checkpoint is always written (only takes effect when
    # the step is also a validation step).
    forced_checkpoint_steps = (1000, 2000, 3000, 4000, 5000)

    step = 0
    patience = initial_patience
    best_accuracy = 0.0
    duration = 0.0

    model = Model()
    model.cuda()

    transform = transforms.Compose([
        transforms.RandomCrop([54, 54]),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
    train_loader = torch.utils.data.DataLoader(
        Dataset(path_to_train_lmdb_dir, transform),
        batch_size=batch_size, shuffle=True,
        num_workers=0, pin_memory=True)
    evaluator = Evaluator(path_to_val_lmdb_dir)
    optimizer = optim.SGD(model.parameters(), lr=initial_learning_rate,
                          momentum=0.9, weight_decay=0.0005)
    scheduler = StepLR(optimizer, step_size=training_options['decay_steps'],
                       gamma=training_options['decay_rate'])

    if path_to_restore_checkpoint_file is not None:
        assert os.path.isfile(path_to_restore_checkpoint_file), \
            '%s not found' % path_to_restore_checkpoint_file
        step = model.restore(path_to_restore_checkpoint_file)
        # NOTE(review): this only repositions the scheduler's step counter;
        # confirm the optimizer's current lr is what a resumed run expects.
        scheduler.last_epoch = step
        print('Model restored from file: %s' % path_to_restore_checkpoint_file)

    path_to_losses_npy_file = os.path.join(path_to_log_dir, 'losses.npy')
    if os.path.isfile(path_to_losses_npy_file):
        losses = np.load(path_to_losses_npy_file)
    else:
        losses = np.empty([0], dtype=np.float32)

    train_loss_array = []
    val_loss_array = []
    model_checkpoints = []
    # NOTE(review): set on the first periodic save and never reset, so the
    # final "manual" save only happens if no checkpoint was ever written.
    model_saved = False

    # Counts validation passes since the last periodic checkpoint.
    model_save_counter = 0

    while True:
        for images, length_labels, digits_labels, _ in train_loader:
            start_time = time.time()
            images = images.cuda()
            length_labels = length_labels.cuda()
            digits_labels = [digit_labels.cuda() for digit_labels in digits_labels]

            (length_logits, digit1_logits, digit2_logits, digit3_logits,
             digit4_logits, digit5_logits) = model.train()(images)
            loss = _loss(length_logits, digit1_logits, digit2_logits,
                         digit3_logits, digit4_logits, digit5_logits,
                         length_labels, digits_labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            step += 1
            duration += time.time() - start_time

            if step % num_steps_to_show_loss == 0:
                examples_per_sec = batch_size * num_steps_to_show_loss / duration
                duration = 0.0
                # Read the rate actually applied by the optimizer; calling
                # scheduler.get_lr() outside step() is unreliable on current
                # PyTorch releases.
                current_lr = optimizer.param_groups[0]['lr']
                print('=> %s: step %d, loss = %f, learning_rate = %f (%.1f examples/sec)' % (
                    datetime.now(), step, loss.item(), current_lr, examples_per_sec))

            if step % num_steps_to_check != 0:
                continue

            model_save_counter += 1

            losses = np.append(losses, loss.item())
            np.save(path_to_losses_npy_file, losses)
            train_loss_array.append((step, loss.item()))

            print('=> Evaluating on validation dataset...')
            accuracy, test_loss_args = evaluator.evaluate(model)
            test_loss = _loss(*test_loss_args)
            val_loss_array.append((step, test_loss.item()))
            print('==> accuracy = %f, best accuracy %f' % (accuracy, best_accuracy))

            # Checkpoint on every second validation pass, and at the forced
            # steps listed above.
            if model_save_counter >= 2 or step in forced_checkpoint_steps:
                path_to_checkpoint_file = model.store(path_to_log_dir, step=step)
                print('=> Model saved to file: %s' % path_to_checkpoint_file)
                model_save_counter = 0
                model_saved = True
                model_checkpoints.append((step, f"model-{step}.pth"))

            if accuracy > best_accuracy:
                patience = initial_patience
                best_accuracy = accuracy
            else:
                patience -= 1

            print("Train losses: ", train_loss_array)
            print("Saved Model Checkpoints: ", model_checkpoints)
            print('=> patience = %d' % patience)

            if patience == 0 or step >= max_steps:
                if not model_saved:
                    path_to_checkpoint_file = model.store(path_to_log_dir, step=step)
                    print('=> Model MANUALLY saved to file: %s' % path_to_checkpoint_file)
                    model_checkpoints.append((step, f"model-{step}.pth"))

                training_output = {
                    "model_checkpoints": model_checkpoints,
                    "train_loss": train_loss_array,
                    "val_loss": val_loss_array,
                }
                print("TRAINING OUTPUT -----------------------------")
                print(training_output)
                return training_output