def main(args):
    args = parse_args()
    tag = args.tag
    device = torch.device('cuda:0')
    no_epochs = args.epochs
    batch_size = args.batch
    linear_hidden = args.linear
    conv_hidden = args.conv

    # Get train/test paths -> later on implement cross-validation
    steps = get_paths(as_tuples=True, shuffle=True, tag=tag)
    split_idx = int(len(steps) * .8)  # 80/20 train/test split
    steps_train, steps_test = steps[:split_idx], steps[split_idx:]

    transform = transforms.Compose(
        [DepthSegmentationPreprocess(no_data_points=1),
         ToSupervised()])

    dataset_train = SimpleDataset(ids=steps_train,
                                  batch_size=batch_size,
                                  transform=transform,
                                  **SENSORS)
    dataset_test = SimpleDataset(ids=steps_test,
                                 batch_size=batch_size,
                                 transform=transform,
                                 **SENSORS)

    dataloader_params = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': 8
    }  # we've already shuffled the paths

    dataset_train = DataLoader(dataset_train, **dataloader_params)
    dataset_test = DataLoader(dataset_test, **dataloader_params)

    batch = next(iter(dataset_test))
    action_shape = batch['action'][0].shape
    img_shape = batch['img'][0].shape

    # Nets
    net = DDPGActor(img_shape=img_shape,
                    numeric_shape=[len(NUMERIC_FEATURES)],
                    output_shape=[2],
                    linear_hidden=linear_hidden,
                    conv_filters=conv_hidden)
    # net = DDPGCritic(actor_out_shape=action_shape, img_shape=img_shape,
    #                  numeric_shape=[len(NUMERIC_FEATURES)],
    #                  linear_hidden=linear_hidden, conv_filters=conv_hidden)
    print(len(steps))
    print(net)
    print(get_n_params(net))

    # Save path
    net_path = f'../data/models/imitation/{DATE_TIME}/{net.name}'
    os.makedirs(net_path, exist_ok=True)

    optim_steps = args.optim_steps
    logging_idx = int(len(dataset_train.dataset) / (batch_size * optim_steps))

    writer_train = SummaryWriter(f'{net_path}/train', max_queue=30, flush_secs=5)
    writer_test = SummaryWriter(f'{net_path}/test', max_queue=1, flush_secs=5)

    # Optimizer and scheduler
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.0005)
    if args.scheduler == 'cos':
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=optim_steps, T_mult=2)
    elif args.scheduler == 'one_cycle':
        scheduler = OneCycleLR(optimizer,
                               max_lr=0.001,
                               epochs=no_epochs,
                               steps_per_epoch=optim_steps)

    # Loss functions
    loss_function = torch.nn.MSELoss(reduction='sum')
    test_loss_function = torch.nn.MSELoss(reduction='sum')

    best_train_loss = 1e10
    best_test_loss = 1e10

    for epoch_idx in range(no_epochs):
        train_loss = .0
        running_loss = .0
        # critic_running_loss = .0
        avg_max_grad = 0.
        avg_avg_grad = 0.

        for idx, batch in enumerate(iter(dataset_train)):
            global_step = int((len(dataset_train.dataset) / batch_size * epoch_idx) + idx)
            batch = unpack_batch(batch=batch, device=device)
            loss, grad = train(input=batch,
                               label=batch['action'],
                               net=net,
                               optimizer=optimizer,
                               loss_fn=loss_function)
            # loss, grad = train(input=batch, label=batch['q'], net=net,
            #                    optimizer=optimizer, loss_fn=loss_function)

            avg_max_grad += max([element.max() for element in grad])
            avg_avg_grad += sum([element.mean() for element in grad]) / len(grad)

            running_loss += loss
            train_loss += loss

            writer_train.add_scalar(tag=f'{net.name}/running_loss',
                                    scalar_value=loss / batch_size,
                                    global_step=global_step)
            writer_train.add_scalar(tag=f'{net.name}/max_grad',
                                    scalar_value=avg_max_grad,
                                    global_step=global_step)
            writer_train.add_scalar(tag=f'{net.name}/mean_grad',
                                    scalar_value=avg_avg_grad,
                                    global_step=global_step)

            if idx % logging_idx == logging_idx - 1:
                print(
                    f'Actor Epoch: {epoch_idx + 1}, Batch: {idx+1}, '
                    f'Loss: {running_loss/logging_idx}, Lr: {scheduler.get_last_lr()[0]}'
                )
                if (running_loss / logging_idx) < best_train_loss:
                    best_train_loss = running_loss / logging_idx
                    torch.save(net.state_dict(), f'{net_path}/train/train.pt')
                writer_train.add_scalar(tag=f'{net.name}/lr',
                                        scalar_value=scheduler.get_last_lr()[0],
                                        global_step=global_step)
                running_loss = 0.0
                avg_max_grad = 0.
                avg_avg_grad = 0.
                scheduler.step()

        print(f'{net.name} best train loss for epoch {epoch_idx+1} - {best_train_loss}')
        writer_train.add_scalar(tag=f'{net.name}/global_loss',
                                scalar_value=train_loss / len(dataset_train.dataset),
                                global_step=(epoch_idx + 1))

        test_loss = .0
        with torch.no_grad():
            for idx, batch in enumerate(iter(dataset_test)):
                batch = unpack_batch(batch=batch, device=device)
                pred = net(**batch)
                loss = test_loss_function(pred, batch['action'])
                # loss = test_loss_function(pred.view(-1), batch['q'])
                test_loss += loss

        if (test_loss / len(dataset_test)) < best_test_loss:
            best_test_loss = (test_loss / len(dataset_test))
            torch.save(net.state_dict(), f'{net_path}/test/test_{epoch_idx+1}.pt')

        print(f'{net.name} test loss {(test_loss/len(dataset_test)):.3f}')
        print(f'{net.name} best test loss {best_test_loss:.3f}')
        writer_test.add_scalar(tag=f'{net.name}/global_loss',
                               scalar_value=(test_loss / len(dataset_test.dataset)),
                               global_step=(epoch_idx + 1))

    torch.save(optimizer.state_dict(), f=f'{net_path}/{optimizer.__class__.__name__}.pt')
    torch.save(scheduler.state_dict(), f=f'{net_path}/{scheduler.__class__.__name__}.pt')
    json.dump(vars(args), fp=open(f'{net_path}/args.json', 'w'), sort_keys=True, indent=4)

    writer_train.flush()
    writer_test.flush()
    writer_train.close()
    writer_test.close()

    # Save a rendering of the network architecture
    batch = next(iter(dataset_test))
    batch = unpack_batch(batch=batch, device=device)
    y = net(**batch)
    g = make_dot(y, params=dict(net.named_parameters()))
    g.save(filename=f'{DATE_TIME}_{net.name}.dot', directory=net_path)
    check_call([
        'dot', '-Tpng', '-Gdpi=200', f'{net_path}/{DATE_TIME}_{net.name}.dot',
        '-o', f'{net_path}/{DATE_TIME}_{net.name}.png'
    ])
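# The script above steps its scheduler once per logging interval rather than once
# per batch, so with the 'cos' option the first cosine cycle spans optim_steps
# scheduler steps (roughly one epoch) and each later cycle doubles in length
# (T_mult=2). Below is a minimal standalone sketch of that behaviour with a dummy
# parameter; the values of optim_steps and no_epochs here are assumptions, not
# values from the script.
import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

optim_steps, no_epochs = 4, 3            # assumed values for illustration only
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=0.001, weight_decay=0.0005)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=optim_steps, T_mult=2)

for step in range(no_epochs * optim_steps):
    optimizer.step()                     # a real training step would go here
    print(f'step {step:2d}  lr {scheduler.get_last_lr()[0]:.6f}')
    scheduler.step()                     # restart cycles of length 4, then 8, then 16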
def main(opt):
    train_data, valid_data = get_train_valid_split_data_names(
        opt.img_folder, opt.ano_folder, valid_size=1 / 8)

    # Load the data
    print("load data")
    train_dataset = Phase1Dataset(train_data, load_size=(640, 640), augment=True, limit=opt.limit)
    print("train data length : %d" % (len(train_dataset)))
    valid_dataset = Phase1Dataset(valid_data, load_size=(640, 640), augment=False, limit=opt.limit)
    print("valid data length : %d" % (len(valid_dataset)))

    # Create the DataLoaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    # GPU setup (PyTorch requires the device to be specified explicitly)
    device = torch.device('cuda' if opt.gpus > 0 else 'cpu')

    # Build the model
    heads = {'hm': 1}
    model = get_pose_net(18, heads, 256).to(device)

    # Define the optimizer
    if opt.optimizer == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr)  # , momentum=m, dampening=d, weight_decay=w, nesterov=n)
    elif opt.optimizer == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    elif opt.optimizer == "RAdam":
        optimizer = optim.RAdam(model.parameters(), lr=opt.lr)

    # Define the loss function
    criterion = HMLoss()

    # Define the learning-rate schedule
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=0.00001)

    start_epoch = 0
    # Optionally resume from a checkpoint (done after the optimizer exists so its state can be restored)
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer)

    best_validation_loss = 1e10

    # Create the output directory
    os.makedirs(os.path.join(opt.save_dir, opt.task, 'visualized'), exist_ok=True)

    # Training. TODO: evaluate on the validation data and save the model at the end of every epoch
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        print("learning rate : %f" % scheduler.get_last_lr()[0])
        train(train_loader, model, optimizer, criterion, device, opt.num_epochs, epoch)
        if opt.optimizer == "SGD":
            scheduler.step()

        # Save the latest model
        save_model(os.path.join(opt.save_dir, opt.task, 'model_last.pth'),
                   epoch, model, optimizer, scheduler)

        # Evaluate on the validation data
        validation_loss, accumulate_datas = valid(valid_loader, model, criterion, device)

        # Save the model whenever the best score improves
        if validation_loss < best_validation_loss:
            best_validation_loss = validation_loss
            save_model(os.path.join(opt.save_dir, opt.task, 'model_best.pth'),
                       epoch, model, optimizer, scheduler)
            print("saved best model")
            visualization(os.path.join(opt.save_dir, opt.task, 'visualized'), accumulate_datas)
def main():
    global args, best_performance
    set_seed(args.rand_seed)

    if args.model == 'FCNet':
        # dataloader
        train_loader, valid_loader, test_loader = get_FCNet_train_valid_test_loader(
            root=args.data_root,
            target=args.target,
            max_Miller=args.max_Miller,
            diffraction=args.diffraction,
            cell_type=args.cell_type,
            permute_hkl=args.fcnet_permute_hkl,
            randomize_hkl=args.fcnet_randomize_hkl,
            batch_size=args.batch_size,
            num_data_workers=args.num_data_workers)
        # construct model
        model = FCNet(max_Miller=args.max_Miller,
                      fc_dims=args.fcnet_fc_dims,
                      dropout=args.dropout)
    elif args.model == 'PointNet':
        # dataloader
        train_loader, valid_loader, test_loader = get_PointNet_train_valid_test_loader(
            root=args.data_root,
            target=args.target,
            max_Miller=args.max_Miller,
            diffraction=args.diffraction,
            cell_type=args.cell_type,
            randomly_scale_intensity=args.pointnet_randomly_scale_intensity,
            systematic_absence=args.pointnet_systematic_absence,
            batch_size=args.batch_size,
            num_data_workers=args.num_data_workers)
        # construct model
        model = PointNet(conv_filters=args.pointnet_conv_filters,
                         fc_dims=args.pointnet_fc_dims,
                         dropout=args.dropout)
    else:
        raise NotImplementedError

    # send model to device
    if torch.cuda.is_available():
        print('running on GPU:\n')
    else:
        print('running on CPU\n')
    model = model.to(args.device)

    # show number of trainable model parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('Number of trainable model parameters: {:d}'.format(trainable_params))

    # define loss function and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # HDFS
    if args.hdfs_dir is not None:
        os.system(f'hdfs dfs -mkdir -p {args.hdfs_dir}')

    # optionally resume from a checkpoint
    if args.restore_path != '':
        assert os.path.isfile(args.restore_path)
        print("=> loading checkpoint '{}'".format(args.restore_path), flush=True)
        checkpoint = torch.load(args.restore_path, map_location=torch.device('cpu'))
        args.start_epoch = checkpoint['epoch'] + 1
        best_performance = checkpoint['best_performance']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.restore_path, checkpoint['epoch']), flush=True)

    # learning-rate scheduler
    scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer,
                                            T_0=args.epochs,
                                            eta_min=1E-8)

    print('\nStart training..', flush=True)
    for epoch in range(args.start_epoch, args.start_epoch + args.epochs):
        lr = scheduler.get_last_lr()
        logging.info('Epoch: {}, LR: {:.6f}'.format(epoch, lr[0]))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        performance = validate(valid_loader, model, criterion)
        scheduler.step()

        # check performance
        is_best = performance > best_performance
        best_performance = max(performance, best_performance)

        # save checkpoint
        save_checkpoint(
            {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_performance': best_performance,
                'optimizer': optimizer.state_dict(),
            }, is_best, args)

    # test best model
    print('---------Evaluate Model on Test Set---------------', flush=True)
    best_model = load_best_model()
    print('best validation performance: {:.3f}'.format(best_model['best_performance']))
    model.load_state_dict(best_model['state_dict'])
    validate(test_loader, model, criterion, test_mode=True)
while True:
    # for j in range(50):
    for batch, (inp, target) in enumerate(dl):
        inp, target = inp.to(device), target.to(device)
        opt.zero_grad()
        out = model(inp)
        loss = F.cross_entropy(out, target)
        loss.backward()
        opt.step()
        sched.step()

        acc = accuracy(F.softmax(out), target)[0].item()
        loss_i = loss.item()
        print(
            f'{epoch:2}/{batch:3} Loss: {loss_i:.7f} Accuracy: {acc:.7f} '
            f'LR: {sched.get_last_lr()[0]:.7f}'
        )
        wandb.log(
            {
                'loss': loss_i,
                'accuracy': acc,
                'lr': sched.get_last_lr()[0]
            },
            step=global_step)
        global_step += 1
    epoch += 1
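# The fragment above references model, dl, opt, sched, device, accuracy, epoch,
# and global_step without defining them. The block below is a hypothetical setup
# that makes the fragment runnable; every choice here (dataset, model, optimizer,
# scheduler period, the accuracy helper) is an assumption, not the original
# author's configuration.
import torch
import torch.nn.functional as F
import wandb
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dl = DataLoader(TensorDataset(torch.randn(512, 10), torch.randint(0, 2, (512,))),
                batch_size=32, shuffle=True)
model = torch.nn.Linear(10, 2).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
sched = CosineAnnealingWarmRestarts(opt, T_0=len(dl))  # one restart cycle per epoch

def accuracy(out, target, topk=(1,)):
    # top-k accuracy as a percentage, mirroring the common torchvision helper
    pred = out.topk(max(topk), dim=1).indices
    correct = pred.eq(target.view(-1, 1))
    return [correct[:, :k].float().sum().mul_(100.0 / target.size(0)) for k in topk]

wandb.init(project='example', mode='disabled')  # disabled mode: no account needed
epoch, global_step = 0, 0
# Note: the loop above runs indefinitely (while True); interrupt it manually.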
def main(args):
    args = parse_args()
    tag = args.tag
    device = torch.device('cuda:0')
    no_epochs = args.epochs
    batch_size = args.batch
    linear_hidden = args.linear
    conv_hidden = args.conv

    # Get train/test paths -> later on implement cross-validation
    steps = get_paths(as_tuples=True, shuffle=True, tag=tag)
    split_idx = int(len(steps) * .8)  # 80/20 train/test split
    steps_train, steps_test = steps[:split_idx], steps[split_idx:]

    transform = transforms.Compose([
        DepthSegmentationPreprocess(no_data_points=args.no_data),
        ToSupervised()
    ])

    dataset_train = SimpleDataset(ids=steps_train,
                                  batch_size=batch_size,
                                  transform=transform,
                                  **SENSORS)
    dataset_test = SimpleDataset(ids=steps_test,
                                 batch_size=batch_size,
                                 transform=transform,
                                 **SENSORS)

    dataloader_params = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': 8
    }  # we've already shuffled the paths

    dataset_train = DataLoader(dataset_train, **dataloader_params)
    dataset_test = DataLoader(dataset_test, **dataloader_params)

    batch = next(iter(dataset_test))
    action_shape = batch['action'][0].shape
    img_shape = batch['img'][0].shape

    # Nets
    actor_net = DDPGActor(img_shape=img_shape,
                          numeric_shape=[len(NUMERIC_FEATURES)],
                          output_shape=[2],
                          linear_hidden=linear_hidden,
                          conv_filters=conv_hidden)
    critic_net = DDPGCritic(actor_out_shape=action_shape,
                            img_shape=img_shape,
                            numeric_shape=[len(NUMERIC_FEATURES)],
                            linear_hidden=linear_hidden,
                            conv_filters=conv_hidden)
    print(len(steps))
    print(actor_net)
    print(get_n_params(actor_net))
    print(critic_net)
    print(get_n_params(critic_net))

    # Save paths
    actor_net_path = f'../data/models/offline/{DATE_TIME}/{actor_net.name}'
    critic_net_path = f'../data/models/offline/{DATE_TIME}/{critic_net.name}'
    os.makedirs(actor_net_path, exist_ok=True)
    os.makedirs(critic_net_path, exist_ok=True)

    optim_steps = args.optim_steps
    logging_idx = int(len(dataset_train.dataset) / (batch_size * optim_steps))

    actor_writer_train = SummaryWriter(f'{actor_net_path}/train', max_queue=30, flush_secs=5)
    critic_writer_train = SummaryWriter(f'{critic_net_path}/train', max_queue=1, flush_secs=5)
    actor_writer_test = SummaryWriter(f'{actor_net_path}/test', max_queue=30, flush_secs=5)
    critic_writer_test = SummaryWriter(f'{critic_net_path}/test', max_queue=1, flush_secs=5)

    # Optimizers and schedulers
    actor_optimizer = torch.optim.Adam(actor_net.parameters(), lr=0.001)
    critic_optimizer = torch.optim.Adam(critic_net.parameters(), lr=0.001)
    actor_scheduler = CosineAnnealingWarmRestarts(actor_optimizer, T_0=optim_steps, T_mult=2)
    critic_scheduler = CosineAnnealingWarmRestarts(critic_optimizer, T_0=optim_steps, T_mult=2)

    # Loss function
    loss_function = torch.nn.MSELoss(reduction='sum')

    actor_best_train_loss = 1e10
    critic_best_train_loss = 1e10
    actor_best_test_loss = 1e10
    critic_best_test_loss = 1e10

    for epoch_idx in range(no_epochs):
        actor_train_loss = .0
        critic_train_loss = .0
        actor_running_loss = .0
        critic_running_loss = .0
        actor_avg_max_grad = .0
        critic_avg_max_grad = .0
        actor_avg_avg_grad = .0
        critic_avg_avg_grad = .0

        for idx, batch in enumerate(iter(dataset_train)):
            global_step = int((len(dataset_train.dataset) / batch_size * epoch_idx) + idx)
            batch = unpack_batch(batch=batch, device=device)
            actor_loss, critic_loss, actor_grad, critic_grad = train_rl(
                batch=batch,
                actor_net=actor_net,
                critic_net=critic_net,
                actor_optimizer=actor_optimizer,
                critic_optimizer=critic_optimizer,
                loss_fn=loss_function)
            del batch
            gc.collect()

            actor_avg_max_grad += max([element.max() for element in actor_grad])
            critic_avg_max_grad += max([element.max() for element in critic_grad])
            actor_avg_avg_grad += sum([element.mean() for element in actor_grad]) / len(actor_grad)
            critic_avg_avg_grad += sum([element.mean() for element in critic_grad]) / len(critic_grad)

            actor_running_loss += actor_loss
            critic_running_loss += critic_loss
            actor_train_loss += actor_loss
            critic_train_loss += critic_loss

            actor_writer_train.add_scalar(tag=f'{actor_net.name}/running_loss',
                                          scalar_value=actor_loss / batch_size,
                                          global_step=global_step)
            actor_writer_train.add_scalar(tag=f'{actor_net.name}/max_grad',
                                          scalar_value=actor_avg_max_grad,
                                          global_step=global_step)
            actor_writer_train.add_scalar(tag=f'{actor_net.name}/mean_grad',
                                          scalar_value=actor_avg_avg_grad,
                                          global_step=global_step)
            critic_writer_train.add_scalar(tag=f'{critic_net.name}/running_loss',
                                           scalar_value=critic_loss / batch_size,
                                           global_step=global_step)
            critic_writer_train.add_scalar(tag=f'{critic_net.name}/max_grad',
                                           scalar_value=critic_avg_max_grad,
                                           global_step=global_step)
            critic_writer_train.add_scalar(tag=f'{critic_net.name}/mean_grad',
                                           scalar_value=critic_avg_avg_grad,
                                           global_step=global_step)

            if idx % logging_idx == logging_idx - 1:
                print(f'Actor Epoch: {epoch_idx + 1}, Batch: {idx+1}, Loss: {actor_running_loss/logging_idx}')
                print(f'Critic Epoch: {epoch_idx + 1}, Batch: {idx+1}, Loss: {critic_running_loss/logging_idx}')
                if (critic_running_loss / logging_idx) < critic_best_train_loss:
                    critic_best_train_loss = critic_running_loss / logging_idx
                    torch.save(actor_net.state_dict(), f'{actor_net_path}/train/train.pt')
                    torch.save(critic_net.state_dict(), f'{critic_net_path}/train/train.pt')

                actor_writer_train.add_scalar(tag=f'{actor_net.name}/lr',
                                              scalar_value=actor_scheduler.get_last_lr()[0],
                                              global_step=global_step)
                critic_writer_train.add_scalar(tag=f'{critic_net.name}/lr',
                                               scalar_value=critic_scheduler.get_last_lr()[0],
                                               global_step=global_step)
                actor_scheduler.step()
                critic_scheduler.step()

                actor_running_loss = .0
                actor_avg_max_grad = .0
                actor_avg_avg_grad = .0
                critic_running_loss = .0
                critic_avg_max_grad = .0
                critic_avg_avg_grad = .0

        print(f'{critic_net.name} best train loss for epoch {epoch_idx+1} - {critic_best_train_loss}')
        actor_writer_train.add_scalar(tag=f'{actor_net.name}/global_loss',
                                      scalar_value=(actor_train_loss / len(dataset_train.dataset)),
                                      global_step=(epoch_idx + 1))
        critic_writer_train.add_scalar(tag=f'{critic_net.name}/global_loss',
                                       scalar_value=(critic_train_loss / len(dataset_train.dataset)),
                                       global_step=(epoch_idx + 1))

        actor_test_loss = .0
        critic_test_loss = .0
        with torch.no_grad():
            for idx, batch in enumerate(iter(dataset_test)):
                batch = unpack_batch(batch=batch, device=device)
                q_pred = critic_net(**batch)
                action_pred = actor_net(**batch)
                critic_loss = loss_function(q_pred.view(-1), batch['q']).abs().sum()
                actor_loss = loss_function(action_pred, batch['action']).abs().sum()
                critic_test_loss += critic_loss
                actor_test_loss += actor_loss

        if critic_test_loss / len(dataset_test.dataset) < critic_best_test_loss:
            critic_best_test_loss = critic_test_loss / len(dataset_test.dataset)
        if actor_test_loss / len(dataset_test.dataset) < actor_best_test_loss:
            actor_best_test_loss = actor_test_loss / len(dataset_test.dataset)
            torch.save(critic_net.state_dict(), f'{critic_net_path}/test/test_{epoch_idx+1}.pt')
            torch.save(actor_net.state_dict(), f'{actor_net_path}/test/test_{epoch_idx+1}.pt')

        print(f'{critic_net.name} test loss {(critic_test_loss/len(dataset_test.dataset)):.3f}')
        print(f'{actor_net.name} test loss {(actor_test_loss/len(dataset_test.dataset)):.3f}')
        print(f'{critic_net.name} best test loss {critic_best_test_loss:.3f}')
        print(f'{actor_net.name} best test loss {actor_best_test_loss:.3f}')

        critic_writer_test.add_scalar(tag=f'{critic_net.name}/global_loss',
                                      scalar_value=(critic_test_loss / len(dataset_test.dataset)),
                                      global_step=(epoch_idx + 1))
        actor_writer_test.add_scalar(tag=f'{actor_net.name}/global_loss',
                                     scalar_value=(actor_test_loss / len(dataset_test.dataset)),
                                     global_step=(epoch_idx + 1))

        torch.cuda.empty_cache()
        gc.collect()

    torch.save(actor_optimizer.state_dict(), f=f'{actor_net_path}/{actor_optimizer.__class__.__name__}.pt')
    torch.save(critic_optimizer.state_dict(), f=f'{critic_net_path}/{critic_optimizer.__class__.__name__}.pt')
    json.dump(vars(args), fp=open(f'{actor_net_path}/args.json', 'w'), sort_keys=True, indent=4)
    json.dump(vars(args), fp=open(f'{critic_net_path}/args.json', 'w'), sort_keys=True, indent=4)

    actor_writer_train.flush()
    actor_writer_test.flush()
    actor_writer_train.close()
    actor_writer_test.close()
    critic_writer_train.flush()
    critic_writer_test.flush()
    critic_writer_train.close()
    critic_writer_test.close()

    batch = next(iter(dataset_test))
    batch = unpack_batch(batch=batch, device=device)

    # Actor architecture save
    y = actor_net(**batch)
    g = make_dot(y, params=dict(actor_net.named_parameters()))
    g.save(filename=f'{DATE_TIME}_{actor_net.name}.dot', directory=actor_net_path)

    # Critic architecture save
    y = critic_net(**batch)
    g = make_dot(y, params=dict(critic_net.named_parameters()))
    g.save(filename=f'{DATE_TIME}_{critic_net.name}.dot', directory=critic_net_path)

    check_call([
        'dot', '-Tpng', '-Gdpi=200',
        f'{critic_net_path}/{DATE_TIME}_{critic_net.name}.dot', '-o',
        f'{critic_net_path}/{DATE_TIME}_{critic_net.name}.png'
    ])
    check_call([
        'dot', '-Tpng', '-Gdpi=200',
        f'{actor_net_path}/{DATE_TIME}_{actor_net.name}.dot', '-o',
        f'{actor_net_path}/{DATE_TIME}_{actor_net.name}.png'
    ])
def run_training(data_type="screw",
                 model_dir="models",
                 epochs=256,
                 pretrained=True,
                 test_epochs=10,
                 freeze_resnet=20,
                 learninig_rate=0.03,
                 optim_name="SGD",
                 batch_size=64,
                 head_layer=8):
    torch.multiprocessing.freeze_support()
    # TODO: use script params for hyperparameters
    # Temperature hyperparameter, currently not used
    temperature = 0.2

    device = "cuda"

    weight_decay = 0.00003
    momentum = 0.9
    # TODO: use f-strings also for the date
    model_name = f"model-{data_type}" + '-{date:%Y-%m-%d_%H_%M_%S}'.format(
        date=datetime.datetime.now())

    # augmentation:
    size = 256
    min_scale = 0.5

    # create training dataset and dataloader
    after_cutpaste_transform = transforms.Compose([])
    after_cutpaste_transform.transforms.append(transforms.ToTensor())
    after_cutpaste_transform.transforms.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]))

    train_transform = transforms.Compose([])
    # train_transform.transforms.append(transforms.RandomResizedCrop(size, scale=(min_scale, 1)))
    # train_transform.transforms.append(transforms.GaussianBlur(int(size / 10), sigma=(0.1, 2.0)))
    train_transform.transforms.append(transforms.Resize((256, 256)))
    train_transform.transforms.append(CutPaste(transform=after_cutpaste_transform))
    # train_transform.transforms.append(transforms.ToTensor())

    train_data = MVTecAT("Data",
                         data_type,
                         transform=train_transform,
                         size=int(size * (1 / min_scale)))
    dataloader = DataLoader(train_data,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=8,
                            collate_fn=cut_paste_collate_fn,
                            persistent_workers=True,
                            pin_memory=True,
                            prefetch_factor=5)

    # Writer will output to ./runs/ directory by default
    writer = SummaryWriter(Path("logdirs") / model_name)

    # create model:
    head_layers = [512] * head_layer + [128]
    print(head_layers)
    model = ProjectionNet(pretrained=pretrained, head_layers=head_layers)
    model.to(device)

    if freeze_resnet > 0:
        model.freeze_resnet()

    loss_fn = torch.nn.CrossEntropyLoss()
    if optim_name == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learninig_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        scheduler = CosineAnnealingWarmRestarts(optimizer, epochs)
        # scheduler = None
    elif optim_name == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learninig_rate,
                               weight_decay=weight_decay)
        scheduler = None
    else:
        print(f"ERROR unknown optimizer: {optim_name}")

    step = 0
    import torch.autograd.profiler as profiler
    num_batches = len(dataloader)

    def get_data_inf():
        while True:
            for out in enumerate(dataloader):
                yield out

    dataloader_inf = get_data_inf()

    # From paper: "Note that, unlike conventional definition for an epoch,
    # we define 256 parameter update steps as one epoch."
    for step in tqdm(range(epochs * 256)):
        epoch = int(step / 256)
        if epoch == freeze_resnet:
            model.unfreeze()

        batch_embeds = []
        batch_idx, data = next(dataloader_inf)
        x1, x2 = data
        x1 = x1.to(device)
        x2 = x2.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        xc = torch.cat((x1, x2), axis=0)
        embeds, logits = model(xc)

        # embeds = F.normalize(embeds, p=2, dim=1)
        # embeds1, embeds2 = torch.split(embeds, x1.size(0), dim=0)
        # ip = torch.matmul(embeds1, embeds2.T)
        # ip = ip / temperature
        # y = torch.arange(0, x1.size(0), device=device)
        # loss = loss_fn(ip, torch.arange(0, x1.size(0), device=device))

        y = torch.tensor([0, 1], device=device)
        y = y.repeat_interleave(x1.size(0))
        loss = loss_fn(logits, y)

        # regularize weights:
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step(epoch + batch_idx / num_batches)

        writer.add_scalar('loss', loss.item(), step)

        # predicted = torch.argmax(ip, axis=0)
        predicted = torch.argmax(logits, axis=1)
        # print(logits)
        # print(predicted)
        # print(y)
        accuracy = torch.true_divide(torch.sum(predicted == y), predicted.size(0))
        writer.add_scalar('acc', accuracy, step)
        if scheduler is not None:
            writer.add_scalar('lr', scheduler.get_last_lr()[0], step)

        # save embeddings for validation:
        if test_epochs > 0 and epoch % test_epochs == 0:
            batch_embeds.append(embeds.cpu().detach())

        writer.add_scalar('epoch', epoch, step)

        # run tests
        if test_epochs > 0 and epoch % test_epochs == 0:
            # run AUC calculation
            # TODO: create dataset only once.
            # TODO: train predictor here or in the model class itself. Should not be in the eval part.
            # TODO: we might not want to use the training data because of dropout etc.,
            #       but it should give an indication of the model performance.
            # batch_embeds = torch.cat(batch_embeds)
            # print(batch_embeds.shape)
            model.eval()
            roc_auc = eval_model(model_name,
                                 data_type,
                                 device=device,
                                 save_plots=False,
                                 size=size,
                                 show_training_data=False,
                                 model=model)
            # train_embed=batch_embeds)
            model.train()
            writer.add_scalar('eval_auc', roc_auc, step)

    torch.save(model.state_dict(), Path(model_dir) / f"{model_name}.tch")
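# The script above passes a fractional epoch to scheduler.step(), which is the
# pattern the PyTorch docs describe for annealing CosineAnnealingWarmRestarts
# within an epoch. Below is a minimal standalone sketch of that call pattern;
# the epoch and batch counts are illustrative assumptions.
import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.03)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10)  # first restart after 10 "epochs"

num_batches = 8
for epoch in range(3):
    for batch_idx in range(num_batches):
        optimizer.step()                                   # a real training step would go here
        scheduler.step(epoch + batch_idx / num_batches)    # anneal smoothly within the epoch
    print(f'epoch {epoch}  lr {scheduler.get_last_lr()[0]:.6f}')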
class WarmRestartsCustomScheduler(_LRScheduler):
    """Custom learning-rate scheduler based on the 3rd place solution.

    This sets the learning-rate schedule to:
        warm restarts for epochs 1-28,
        LR=1e-5 for epochs 29-32,
        LR=5e-6 for epochs 33-35.

    The general version looks like this:
        # from:
        # https://github.com/naivelamb/kaggle-cloud-organization/blob/master/main_seg.py
        if epoch < start_epoch + n_epochs - 1:
            if epoch != 0:
                scheduler.step()
                scheduler = warm_restart(scheduler, T_mult=2)
        elif (epoch < start_epoch + n_epochs + 2
              and epoch >= start_epoch + n_epochs - 1):
            optimizer.param_groups[0]['lr'] = 1e-5
        else:
            optimizer.param_groups[0]['lr'] = 5e-6
    """

    def __init__(self,
                 optimizer,
                 T_0,
                 T_mult=2,
                 eta_min=0,
                 num_wr_epochs=28,
                 mid_const_lr_epochs_range=[29, 32],
                 constant_lrs=[1e-5, 5e-6],
                 last_epoch=-1):
        """
        Args:
            optimizer (torch.optim.Optimizer):
            T_0:
            T_mult:
            eta_min:
            num_wr_epochs (int): The number of warm-restart epochs to do.
            mid_const_lr_epochs_range (list-like[int]): [min, max] where max is
                not included. This is the epoch interval where the first lr of
                `constant_lrs` is used.
            constant_lrs (list-like[float]): the learning rates to use for the
                mid and end intervals after warm restarts end.
        """
        self.num_wr_epochs = num_wr_epochs
        assert len(mid_const_lr_epochs_range) == 2, \
            "`mid_const_lr_epochs_range` must be a list-like with length 2."
        self.mid_const_lr_epochs_range = mid_const_lr_epochs_range
        assert len(constant_lrs) == 2, \
            "`constant_lrs` must be a list-like with length 2."
        self.constant_lrs = constant_lrs

        self.optimizer = optimizer
        self.warm_restarts = CosineAnnealingWarmRestarts(
            self.optimizer, T_0, T_mult, eta_min)
        super().__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        """No calculation done here."""
        return self.get_last_lr()

    def step(self, epoch=None):
        """Computes a step for the learning rate scheduler.

        Here, a step is an epoch. This is where the learning rates are set and
        the `last_epoch` counter is updated.
        """
        # warm restarts
        if self.last_epoch < self.num_wr_epochs + 1:
            self.warm_restarts.step()
            self.last_epoch = self.warm_restarts.last_epoch
            self._last_lr = self.warm_restarts.get_last_lr()
        # constant LR (first round)
        elif (self.last_epoch >= self.mid_const_lr_epochs_range[0]
              and self.last_epoch < self.mid_const_lr_epochs_range[1]):
            self.last_epoch += 1
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[0]
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
        # constant LR (second round)
        else:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[1]
            self.last_epoch += 1
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
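# A minimal usage sketch of the scheduler above, assuming it is stepped once per
# epoch. The model, optimizer, learning rate, and T_0 below are illustrative
# assumptions chosen only so that all three phases of the schedule are visible.
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scheduler = WarmRestartsCustomScheduler(optimizer,
                                        T_0=7,
                                        T_mult=2,
                                        eta_min=0,
                                        num_wr_epochs=28,
                                        mid_const_lr_epochs_range=[29, 32],
                                        constant_lrs=[1e-5, 5e-6])

for epoch in range(35):
    # ... train for one epoch ...
    optimizer.step()   # optimizer steps before the scheduler, per PyTorch convention
    scheduler.step()   # one scheduler step == one epoch
    print(epoch + 1, scheduler.get_last_lr()[0])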