def predict(args, model):
    """Entrypoint for predict mode"""
    test_loader = dataset.get_test_loader(args)
    train_loader, val_loader = dataset.get_train_val_loader(args, predict=True)
    if args.fp16:
        model = amp.initialize(model, opt_level='O1')
    logging.info('Starting prediction')

    output = {}
    for k, loader in [('test', test_loader), ('val', val_loader)]:
        output[k] = {}
        res = infer(args, model, loader)
        for i, v in res.items():
            d = loader.dataset.data[i]
            name = '{}_{}_{}'.format(d[0], d[1], d[2])
            if name not in output[k]:
                output[k][name] = []
            output[k][name].append(v)

    logging.info('Saving predictions to {}'.format(args.load + '.output' + args.pred_suffix))
    with open(args.load + '.output' + args.pred_suffix, 'wb') as file:
        pickle.dump(output, file)
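# The pickle written above maps split -> '{d0}_{d1}_{d2}' name -> a list of
# per-item predictions. A minimal sketch of reading that file back and averaging
# the entries per name; the helper name and the mean aggregation are illustrative
# assumptions (they are not part of the original code), and predictions are
# assumed to be numpy-compatible arrays of equal shape.
def load_averaged_predictions(path):
    with open(path, 'rb') as f:
        output = pickle.load(f)
    averaged = {}
    for split, per_name in output.items():
        averaged[split] = {name: np.mean(np.stack(preds), axis=0)
                           for name, preds in per_name.items()}
    return averaged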
def train(args, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size=args.lr_decay_step,
                                           gamma=args.lr_decay)
    criterion = nn.MSELoss()

    if args.retrain:
        optimizer.load_state_dict(torch.load(args.load + '_optimizer'))
        # reset lr to initial learning rate
        for g in optimizer.param_groups:
            g['lr'] = args.lr

    train_loader, validation_loader = dataset.get_train_val_loader(args)
    train_iterations = len(train_loader)
    val_iterations = len(validation_loader)
    best_loss = 10000

    for epoch in range(args.epochs):
        current_lr = optimizer.param_groups[0]['lr']
        logging.info(f'Train: epoch {epoch} learning rate: {current_lr}')
        model.train()
        optimizer.zero_grad()

        # Train set
        for i, (images, targets) in enumerate(train_loader):
            # rotate and resize batch if requested
            if args.transform:
                # Choose random rotation angle and scaling for this batch
                angle = random.choice(range(360))
                scale = random.choice(np.linspace(0.2, 2, 49))
                new_height = int(np.round(images.size()[2] * scale))
                new_width = int(np.round(images.size()[3] * scale))
                new_ims, new_targets = transform_input(images[0], targets[0], angle,
                                                       new_height, new_width)
                # Image 0 is already transformed above, so continue from index 1
                for l in range(1, len(images)):
                    new_im, new_target = transform_input(images[l], targets[l], angle,
                                                         new_height, new_width)
                    new_ims = torch.cat((new_ims, new_im), dim=0)
                    new_targets = torch.cat((new_targets, new_target), dim=0)
                images = copy.deepcopy(new_ims)
                targets = copy.deepcopy(new_targets)
                del new_ims, new_targets

            images = images.to(device)
            targets = targets.to(device)
            output = model(images).to(torch.double)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Update on training progress every 5th iteration
            if (i + 1) % 5 == 0:
                logging.info(f'epoch {epoch + 1}/{args.epochs}, '
                             f'step {i + 1}/{train_iterations}, loss {loss}')

        # Validation set
        loss_log = np.zeros(len(validation_loader))
        for i, (images, targets) in enumerate(validation_loader):
            # rotate and resize if requested
            if args.transform:
                # Choose random rotation angle and scaling for this batch
                angle = random.choice(range(360))
                scale = random.choice(np.linspace(0.2, 2, 49))
                new_height = int(np.round(images.size()[2] * scale))
                new_width = int(np.round(images.size()[3] * scale))
                new_ims, new_targets = transform_input(images[0], targets[0], angle,
                                                       new_height, new_width)
                for l in range(1, len(images)):
                    new_im, new_target = transform_input(images[l], targets[l], angle,
                                                         new_height, new_width)
                    new_ims = torch.cat((new_ims, new_im), dim=0)
                    new_targets = torch.cat((new_targets, new_target), dim=0)
                images = copy.deepcopy(new_ims)
                targets = copy.deepcopy(new_targets)
                del new_ims, new_targets

            model.eval()
            images = images.to(device)
            targets = targets.to(device)
            output = model(images)
            loss = criterion(output, targets)
            loss_log[i] = loss.item()

            # Update on validation loss
            logging.info(f'===== VALIDATION epoch {epoch + 1}/{args.epochs}, '
                         f'step {i + 1}/{val_iterations}, validation loss {loss} =====')

        if np.mean(loss_log) < best_loss:
            best_loss = np.mean(loss_log)
            logging.info(f'Saving best to {args.save} with loss {best_loss}')
            torch.save(model.state_dict(), str(args.save + '/' + args.backbone))
            torch.save(optimizer.state_dict(),
                       str(args.save + '/' + args.backbone + '_optimizer'))

        exp_lr_scheduler.step()
def train(args, model):
    train_loader, val_loader = dataset.get_train_val_loader(args)
    optimizer = torch.optim.Adam(model.parameters(), lr=0, weight_decay=args.wd)
    if args.horovod:
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            backward_passes_per_step=args.gradient_accumulation)
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    if args.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    if args.load is not None:
        best_acc = score(args, model, val_loader)
    else:
        best_acc = float('-inf')

    if args.mode == 'val':
        return

    if args.pl_epoch is not None:
        test_loader = dataset.get_test_loader(args, exclude_leak=True)
        pl_data = set()

    for epoch in range(args.start_epoch, args.epochs):
        if args.pl_epoch is not None:
            pseudo_label(args, epoch, pl_data, model, val_loader, test_loader,
                         train_loader)
        with torch.no_grad():
            avg_norm = np.mean([v.norm().item() for v in model.parameters()])
            logging.info('Train: epoch {} avg_norm: {}'.format(epoch, avg_norm))

        model.train()
        optimizer.zero_grad()

        cum_loss = 0
        cum_acc = 0
        cum_count = 0
        tic = time.time()
        for i, (X, S, _, Y) in enumerate(train_loader):
            lr = get_learning_rate(args, epoch + i / len(train_loader))
            for g in optimizer.param_groups:
                g['lr'] = lr

            X = X.cuda()
            S = S.cuda()
            Y = Y.cuda()
            X, S, Y = transform_input(args, X, S, Y)

            loss, acc = model.train_forward(X, S, Y)
            if args.fp16:
                '''
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.synchronize()
                if (i + 1) % args.gradient_accumulation == 0:
                    with optimizer.skip_synchronize():
                        optimizer.step()
                        optimizer.zero_grad()
                '''
                apply_grads = (i + 1) % args.gradient_accumulation == 0
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # With a Horovod DistributedOptimizer, gradients are synchronized
                # manually on accumulation steps and step() runs inside
                # skip_synchronize(); a plain optimizer just steps.
                if hasattr(optimizer, "synchronize") and apply_grads:
                    optimizer.synchronize()
                if apply_grads:
                    if hasattr(optimizer, "skip_synchronize"):
                        with optimizer.skip_synchronize():
                            optimizer.step()
                    else:
                        optimizer.step()
                    optimizer.zero_grad()
            else:
                loss.backward()
                if (i + 1) % args.gradient_accumulation == 0:
                    optimizer.step()
                    optimizer.zero_grad()

            cum_count += 1
            cum_loss += loss.item()
            cum_acc += acc
            if (i + 1) % args.disp_batches == 0:
                logging.info(
                    'Epoch: {:3d} Iter: {:4d} -> speed: {:6.1f} lr: {:.9f} loss: {:.6f} acc: {:.6f}'
                    .format(epoch, i + 1,
                            cum_count * args.batch_size / (time.time() - tic),
                            optimizer.param_groups[0]['lr'],
                            cum_loss / cum_count, cum_acc / cum_count))
                cum_loss = 0
                cum_acc = 0
                cum_count = 0
                tic = time.time()

        acc = score(args, model, val_loader)
        torch.save(model.state_dict(), str(args.save + '.{}'.format(epoch)))
        if acc >= best_acc:
            best_acc = acc
            logging.info('Saving best to {} with score {}'.format(
                args.save, best_acc))
            torch.save(model.state_dict(), str(args.save))
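# get_learning_rate() is called above with a fractional epoch but is not shown
# in this file. A minimal sketch of one common schedule (linear warmup followed
# by cosine decay); args.lr and args.warmup_epochs are assumed names used only
# for this illustration, not taken from the original code.
def get_learning_rate(args, epoch):
    if epoch < args.warmup_epochs:
        # ramp linearly from 0 to args.lr during warmup
        return args.lr * epoch / args.warmup_epochs
    # cosine decay from args.lr down to 0 over the remaining epochs
    progress = (epoch - args.warmup_epochs) / max(args.epochs - args.warmup_epochs, 1e-8)
    return args.lr * 0.5 * (1.0 + np.cos(np.pi * progress))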
def train(args, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size=args.lr_decay_step,
                                           gamma=args.lr_decay)
    criterion = nn.MSELoss()

    if args.retrain:
        optimizer.load_state_dict(torch.load(args.load + '_optimizer'))
        # reset lr to initial learning rate
        for g in optimizer.param_groups:
            g['lr'] = args.lr

    train_loader, validation_loader = dataset.get_train_val_loader(args)
    train_iterations = len(train_loader)
    val_iterations = len(validation_loader)
    best_loss = 10000

    for epoch in range(args.epochs):
        current_lr = optimizer.param_groups[0]['lr']
        logging.info(f'Train: epoch {epoch} learning rate: {current_lr}')
        model.train()
        optimizer.zero_grad()

        # Train set
        for i, (images, targets) in enumerate(train_loader):
            # rotate and resize batch if requested
            if args.transform:
                # Choose random rotation angle and scaling for this batch
                angle = random.choice(range(360))
                scale = random.choice(np.linspace(0.2, 2, 49))
                new_height = int(np.round(images.size()[2] * scale))
                new_width = int(np.round(images.size()[3] * scale))

                # Get transformed images and targets
                for image_ind in range(len(images)):
                    if image_ind == 0:
                        new_ims, new_targets = transform_input(images[0], targets[0],
                                                               angle, new_height, new_width)
                    else:
                        new_im, new_target = transform_input(images[image_ind],
                                                             targets[image_ind],
                                                             angle, new_height, new_width)
                        new_ims = torch.cat((new_ims, new_im), dim=0)
                        new_targets = torch.cat((new_targets, new_target), dim=0)
                images = copy.deepcopy(new_ims)
                targets = copy.deepcopy(new_targets)
                del new_ims, new_targets

                # For debugging - check transformed images and targets
                # testim = images[0, 0, :, :].detach().numpy()
                # testtargs = targets[0, :].detach().numpy()
                #
                # trck_pts = np.zeros([2, 8])
                # trck_pts[0, :] = testtargs[0:8] * testim.shape[0]
                # trck_pts[1, :] = testtargs[8:16] * testim.shape[1]
                # trck_pts = np.transpose(trck_pts)
                # for ind in range(trck_pts.shape[0]):
                #     pt = trck_pts[ind, :]
                #     testim[np.int(pt[0]) - 4:np.int(pt[0]) + 4, np.int(pt[1]) - 4:np.int(pt[1]) + 4] = 1
                #
                # plt.imshow(testim)
                # plt.savefig(args.save + 'test.png', dpi=300, quality=100, format='png')

            images = images.to(device)
            targets = targets.to(device)
            output = model(images).to(torch.double)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Update on training progress every 5th iteration
            if (i + 1) % 5 == 0:
                logging.info(f'epoch {epoch + 1}/{args.epochs}, '
                             f'step {i + 1}/{train_iterations}, loss {loss}')

        # Validation set
        loss_log = np.zeros(len(validation_loader))
        for i, (images, targets) in enumerate(validation_loader):
            # rotate and resize if requested
            if args.transform:
                # Choose random rotation angle and scaling for this batch
                angle = random.choice(range(360))
                scale = random.choice(np.linspace(0.2, 2, 49))
                new_height = int(np.round(images.size()[2] * scale))
                new_width = int(np.round(images.size()[3] * scale))

                # Get transformed images and targets
                for image_ind in range(len(images)):
                    if image_ind == 0:
                        new_ims, new_targets = transform_input(images[0], targets[0],
                                                               angle, new_height, new_width)
                    else:
                        new_im, new_target = transform_input(images[image_ind],
                                                             targets[image_ind],
                                                             angle, new_height, new_width)
                        new_ims = torch.cat((new_ims, new_im), dim=0)
                        new_targets = torch.cat((new_targets, new_target), dim=0)
                images = copy.deepcopy(new_ims)
                targets = copy.deepcopy(new_targets)
                del new_ims, new_targets

            model.eval()
            images = images.to(device)
            targets = targets.to(device)
            output = model(images)
            loss = criterion(output, targets)
            loss_log[i] = loss.item()

            # Update on validation loss
            logging.info(f'===== VALIDATION epoch {epoch + 1}/{args.epochs}, '
                         f'step {i + 1}/{val_iterations}, validation loss {loss} =====')

        if np.mean(loss_log) < best_loss:
            best_loss = np.mean(loss_log)
            logging.info(f'Saving best to {args.save} with loss {best_loss}')
            torch.save(model.state_dict(), str(args.save + '/' + args.backbone))
            torch.save(optimizer.state_dict(),
                       str(args.save + '/' + args.backbone + '_optimizer'))

        exp_lr_scheduler.step()
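# The two keypoint trainers above reference a module-level `device` and a set of
# CLI arguments. A minimal sketch of that setup; the flag names are inferred from
# the attributes used above, and the defaults (and the resnet50 backbone) are
# illustrative assumptions only.
import argparse

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--lr-decay-step', type=int, default=10)
    parser.add_argument('--lr-decay', type=float, default=0.1)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--transform', action='store_true',
                        help='randomly rotate and rescale each batch')
    parser.add_argument('--retrain', action='store_true')
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--save', type=str, default='checkpoints')
    parser.add_argument('--backbone', type=str, default='resnet50')
    return parser.parse_args()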
def train(args, model):
    train_loader, val_loader = dataset.get_train_val_loader(args)
    optimizer = torch.optim.Adam(model.parameters(), lr=0, weight_decay=args.wd)
    if args.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if args.load is not None:
        best_acc = score(args, model, val_loader)
    else:
        best_acc = float("-inf")

    if args.mode == "val":
        return

    if args.pl_epoch is not None:
        test_loader = dataset.get_test_loader(args, exclude_leak=True)
        pl_data = set()

    for epoch in range(args.start_epoch, args.epochs):
        if args.pl_epoch is not None:
            pseudo_label(
                args, epoch, pl_data, model, val_loader, test_loader, train_loader
            )
        with torch.no_grad():
            avg_norm = np.mean([v.norm().item() for v in model.parameters()])
            logging.info("Train: epoch {} avg_norm: {}".format(epoch, avg_norm))

        print(f'GRAD: {torch.is_grad_enabled()}')
        model.train()
        optimizer.zero_grad()

        cum_loss = 0
        cum_acc = 0
        cum_count = 0
        tic = time.time()
        for i, (X, S, _, Y) in enumerate(train_loader):
            lr = get_learning_rate(args, epoch + i / len(train_loader))
            for g in optimizer.param_groups:
                g["lr"] = lr

            X = X.cuda()
            S = S.cuda()
            Y = Y.cuda()
            X, S, Y = transform_input(args, X, S, Y)

            loss, acc = model.train_forward(X, S, Y)
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if (i + 1) % args.gradient_accumulation == 0:
                optimizer.step()
                optimizer.zero_grad()

            cum_count += 1
            cum_loss += loss.item()
            cum_acc += acc
            if (i + 1) % args.disp_batches == 0:
                logging.info(
                    "Epoch: {:3d} Iter: {:4d} -> speed: {:6.1f} lr: {:.9f} loss: {:.6f} acc: {:.6f}".format(
                        epoch,
                        i + 1,
                        cum_count * args.batch_size / (time.time() - tic),
                        optimizer.param_groups[0]["lr"],
                        cum_loss / cum_count,
                        cum_acc / cum_count,
                    )
                )
                cum_loss = 0
                cum_acc = 0
                cum_count = 0
                tic = time.time()

        acc = score(args, model, val_loader)
        torch.save(model.state_dict(), str(args.save + ".{}".format(epoch)))
        if acc >= best_acc:
            best_acc = acc
            logging.info("Saving best to {} with score {}".format(args.save, best_acc))
            torch.save(model.state_dict(), str(args.save))
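# The loop above saves a checkpoint per epoch to args.save + '.{epoch}' and the
# best-scoring weights to args.save. A minimal sketch of resuming from one of
# those files before calling train(); the helper name is illustrative, and it
# assumes args.load points at the checkpoint and args.start_epoch is set to the
# epoch after it (how the original code performs the load is not shown here).
def resume_from_checkpoint(args, model):
    state_dict = torch.load(args.load, map_location='cpu')
    model.load_state_dict(state_dict)
    # e.g. args.start_epoch = saved_epoch + 1 before calling train(args, model)
    return model.cuda()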