def forward_log_prob(self, returns, cell, args, n_steps, outputs, conditioned=True):
    """Compute the log-probability of a trajectory of outputs under the model.

    Unrolls the recurrent cell for ``n_steps``; at each step evaluates the
    probability the predicted distribution assigns to the (normalized) return
    implied by ``outputs`` and accumulates the product of per-step
    probabilities.

    Args:
        returns: initial input tensor; only ``returns.shape[1]`` (batch size)
            is read directly before it is fed to ``self.get_distrib``.
        cell: recurrent state threaded through ``self.get_distrib``.
        args: namespace providing ``step_condition`` (1-based step index) and
            ``value_condition`` — read only when ``conditioned`` is True.
        n_steps: number of steps to unroll.
        outputs: tensor of cumulative values, sliced as ``outputs[:, j:j+1]``.
        conditioned: when True, the value at step ``args.step_condition - 1``
            is derived from ``args.value_condition`` instead of ``outputs``.

    Returns:
        Tensor of shape (batch,) holding the log of the accumulated
        probability product.
    """
    batch_size = returns.shape[1]
    sample_returns = returns
    probabilities = torch.ones(batch_size).to(self.device)
    # Loss() carries no per-step state here; build it once instead of
    # reconstructing it on every loop iteration.
    loss = Loss()
    for j in range(n_steps):
        next_distrib, cell = self.get_distrib(sample_returns, cell)
        next_distrib = next_distrib.squeeze(2)
        if conditioned and j == args.step_condition - 1:
            # NOTE(review): when step_condition == 1 this slice is empty
            # (outputs[:, -1:0]) — presumably step_condition >= 2; confirm.
            value = args.value_condition / outputs[:, j - 1:j]
            normalized_value = (value - self.mean) / self.dev
            rescaled_value = normalized_value.squeeze(1)
        else:
            if j == 0:
                value = outputs[:, j:j + 1]
            else:
                # Per-step return: ratio of consecutive cumulative values.
                value = outputs[:, j:j + 1] / outputs[:, j - 1:j]
            normalized_value = (value - self.mean) / self.dev
            rescaled_value = normalized_value.squeeze(1)
        # Feed the normalized value back as the next step's input.
        sample_returns = normalized_value.transpose(1, 0)
        aux = loss.compute_probs(next_distrib, rescaled_value).squeeze(1)
        probabilities *= aux
    return torch.log(probabilities)
def main():
    """Entry point: dispatch to train / val / test according to ``args.phase``."""
    global args
    net = UNet(3, 1)
    net.load(opt.ckpt_path)
    criterion = Loss('soft_dice_loss')
    torch.cuda.set_device(0)
    net = net.cuda()
    criterion = criterion.cuda()
    if args.phase == 'train':
        # Training: decay the learning rate whenever the epoch loss goes up.
        train_set = NucleiDetector(opt, phase=args.phase)
        loader = DataLoader(train_set,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.num_workers,
                            pin_memory=opt.pin_memory)
        lr = opt.lr
        optimizer = torch.optim.Adam(net.parameters(), lr=lr,
                                     weight_decay=opt.weight_decay)
        prev_loss = None  # no epoch has run yet
        for epoch in range(opt.epoch + 1):
            epoch_loss = train(loader, net, criterion, epoch, optimizer,
                               opt.model_save_freq, opt.model_save_path)
            if prev_loss is not None and epoch_loss > prev_loss:
                lr *= opt.lr_decay
                for group in optimizer.param_groups:
                    group['lr'] = lr
                save_lr(net.model_name, opt.lr_save_path, lr)
            prev_loss = epoch_loss
    elif args.phase == 'val':
        # Validation pass over the held-out split.
        val_set = NucleiDetector(opt, phase='val')
        loader = DataLoader(val_set,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.num_workers,
                            pin_memory=opt.pin_memory)
        val(loader, net, criterion)
    else:
        # Test pass, one image at a time.
        test_set = NucleiDetector(opt, phase='test')
        loader = DataLoader(test_set,
                            batch_size=1,
                            shuffle=True,
                            num_workers=opt.num_workers,
                            pin_memory=opt.pin_memory)
        test(loader, net, opt)
def test(epoch):
    """Evaluate the VAE on the test set for one epoch.

    Accumulates the loss over ``test_loader``, saves a side-by-side image of
    originals vs. reconstructions for the first batch, and prints the average
    test loss.

    Args:
        epoch: current epoch number, used only for logging and file naming.
    """
    # makedirs with exist_ok avoids the check-then-create race of
    # os.path.exists + os.mkdir.
    os.makedirs(os.path.join(os.getcwd(), 'Results'), exist_ok=True)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += Loss(recon_batch, data, mu, logvar).item()
            if i == 0:
                n = min(data.size(0), 8)
                # Use the actual batch size instead of args.batch_size so the
                # reshape also works when the batch is smaller than configured.
                comparison = torch.cat([
                    data[:n],
                    recon_batch.view(data.size(0), 3, 32, 32)[:n]
                ])
                save_image(comparison.data.cpu(),
                           'Results/epoch_' + str(epoch) + '.png', nrow=n)
    test_loss /= len(test_loader.dataset)
    print('====> Test in Epoch %d' % (epoch))
    print('====> Test set loss: {:.4f}'.format(test_loss))
def __init__(self, backbone=None, num_classes=21):
    """Assemble the SSD300 detector around the given backbone.

    Builds the extra feature layers, one 3x3 conv pair (location +
    confidence) per feature map, and the shared default-box machinery used by
    the loss, encoder and post-processing.
    """
    super(SSD300, self).__init__()
    self.feature_extractor = backbone
    self.num_classes = num_classes
    # Default bounding boxes per cell, one entry per feature map.
    self.num_defaults = [4, 6, 6, 6, 4, 4]
    # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
    self._build_additional_features(self.feature_extractor.out_channels)
    loc_layers = []
    conf_layers = []
    for boxes, channels in zip(self.num_defaults,
                               self.feature_extractor.out_channels):
        # Every default box predicts 4 offsets and num_classes scores.
        loc_layers.append(
            nn.Conv2d(channels, boxes * 4, kernel_size=3, padding=1))
        conf_layers.append(
            nn.Conv2d(channels, boxes * self.num_classes,
                      kernel_size=3, padding=1))
    # Location regression layers and classification layers.
    self.loc = nn.ModuleList(loc_layers)
    self.conf = nn.ModuleList(conf_layers)
    self._init_weights()
    # All 8732 default boxes of SSD300, shape [8732, 4].
    default_box = dboxes300()
    self.compute_loss = Loss(default_box)
    self.encoder = Encoder(default_box)
    self.postprocess = PostProcess(default_box)
def forward_conditioned(self, returns, cell, args, n_steps):
    """Unroll the model for ``n_steps``, conditioning one step on a target value.

    At step ``args.step_condition - 1`` the likelihood ("weights") of
    ``args.value_condition`` under the predicted distribution is computed and
    the conditioned value becomes the next input; at every other step a
    return is sampled (reparametrized), folded into the running product of
    returns, and the product is appended to ``outputs``.

    Args:
        returns: initial input tensor; ``returns.shape[1]`` is the batch size.
        cell: recurrent state threaded through ``self.get_distrib``.
        args: provides ``step_condition`` (1-based) and ``value_condition``.
        n_steps: number of steps to unroll.

    Returns:
        Tuple ``(outputs, weights)``: cumulative return products and the
        per-sample likelihood of the conditioned value.
        NOTE(review): ``weights`` is bound only inside the conditioned
        branch — if ``args.step_condition - 1`` is never reached within
        ``n_steps`` this raises NameError; confirm callers guarantee it.
    """
    batch_size = returns.shape[1]
    sample_returns = returns
    # Last input is used for converting returns to values
    return_product = torch.ones(1, batch_size, 1).to(self.device)
    outputs = torch.zeros(batch_size, 0, 1).to(self.device)
    for j in range(n_steps):
        next_distrib, cell = self.get_distrib(sample_returns, cell)
        next_distrib = next_distrib.squeeze(2)
        if j == args.step_condition - 1:
            # Get likelihood of value
            loss = Loss()
            # NOTE(review): when step_condition == 1 this slice is empty
            # (outputs[:, -1:0]) — presumably step_condition >= 2; confirm.
            value = args.value_condition / outputs[:, j - 1:j]
            rescaled_value = (value.squeeze(1) - self.mean) / self.dev
            weights = loss.compute_probs(next_distrib,
                                         rescaled_value).squeeze(1)
            # Next sample returns is the value
            sample_returns = args.value_condition
        else:
            # Sample from next distrib
            sample_returns = sample_from_distrib_reparametrized(
                next_distrib, batch_size, j)
            rescaled_sample_return = (sample_returns * self.dev + self.mean)
            return_product = return_product * rescaled_sample_return
            # Cat distrib
            # NOTE(review): on the conditioned step outputs gains no column
            # and return_product is not updated — confirm this matches the
            # intended indexing in forward_log_prob.
            outputs = torch.cat(
                [outputs, return_product.permute(1, 0, 2)], 1)
    return outputs, weights
def evaluate_dataset(csv_path, target_index, problem, model, parameter_dict,
                     method='holdout', seed=20, max_iter=50):
    """Benchmark Bayesian-optimization acquisitions on one dataset.

    Runs expected improvement and GP-UCB (beta=0.5) plus a random-search
    baseline over the model's hyper-parameter space, re-seeding the RNG
    before each run so the optimizers see comparable randomness.

    Args:
        csv_path: path to the CSV file with the data.
        target_index: column index of the target variable.
        problem: problem type forwarded to the ``Loss`` wrapper.
        model: model whose hyper-parameters are optimized.
        parameter_dict: search-space specification for BO.
        method: evaluation scheme of the loss wrapper (e.g. 'holdout').
        seed: RNG seed reset before each optimizer run.
        max_iter: BO iterations per acquisition function.

    Returns:
        Tuple ``(ei_h, ucb1_h, r)`` of histories for EI, GP-UCB (beta=0.5)
        and the cumulative-max random baseline.
    """
    print('Now evaluating {}...'.format(csv_path))
    x, y = build(csv_path, target_index)
    wrapper = Loss(model, x, y, method=method, problem=problem)

    def _run_bo(acquisition):
        # One helper for the repeated seed/GP/BO boilerplate: a fresh GP and
        # re-seeded RNG per run keeps acquisition functions comparable.
        np.random.seed(seed)
        gp = GaussianProcess(SquaredExponential(), optimize=True,
                             usegrads=True)
        bo = BO(gp, acquisition, wrapper.evaluate_loss, parameter_dict,
                n_jobs=1)
        bo.run(max_iter=max_iter)
        return bo

    print('Evaluating EI')
    bo_ei = _run_bo(Acquisition(mode='expected_improvement'))

    print('Evaluating GP-gpucb beta = 0.5')
    bo_ucb = _run_bo(Acquisition(mode='gpucb', beta=0.5))

    print('Evaluating random')
    np.random.seed(seed)
    # n_eval = max_iter + 1 matches BO's initial design point + iterations.
    r = evaluate_random(bo_ei, wrapper.evaluate_loss, n_eval=max_iter + 1)
    r = cum_max(r)

    ei_h = np.array(bo_ei.history)
    ucb1_h = np.array(bo_ucb.history)
    return ei_h, ucb1_h, r
def train():
    """Train the poetry language model with fastNLP's Trainer.

    Loads the dataset and vocabulary, builds the model, loss and perplexity
    metric, selects an optimizer named in the config, and runs the training
    loop with timing and early-stopping callbacks.

    Raises:
        ValueError: if ``config.optimizer`` names an unsupported optimizer.
    """
    config = Config()
    train_data, dev_data, vocabulary = get_dataset(config.data_path)
    poetry_model = PoetryModel(vocabulary_size=len(vocabulary),
                               embedding_size=config.embedding_size,
                               hidden_size=config.hidden_size)
    loss = Loss(pred='output', target='target')
    perplexity = Perplexity(pred='output', target='target')
    print("optimizer:", config.optimizer)
    print("momentum:", config.momentum)
    if config.optimizer == 'adam':
        optimizer = Adam(lr=config.lr, weight_decay=config.weight_decay)
    elif config.optimizer == 'sgd':
        optimizer = SGD(lr=config.lr, momentum=config.momentum)
    elif config.optimizer == 'adagrad':
        optimizer = Adagrad(lr=config.lr, weight_decay=config.weight_decay)
    elif config.optimizer == 'adadelta':
        optimizer = Adadelta(lr=config.lr, rho=config.rho, eps=config.eps,
                             weight_decay=config.weight_decay)
    else:
        # Fail fast with a clear message instead of a NameError when the
        # unbound `optimizer` is passed to Trainer below.
        raise ValueError('unsupported optimizer: {}'.format(config.optimizer))
    timing = TimingCallback()
    early_stop = EarlyStopCallback(config.patience)
    trainer = Trainer(train_data=train_data,
                      model=poetry_model,
                      loss=loss,
                      metrics=perplexity,
                      n_epochs=config.epoch,
                      batch_size=config.batch_size,
                      print_every=config.print_every,
                      validate_every=config.validate_every,
                      dev_data=dev_data,
                      save_path=config.save_path,
                      optimizer=optimizer,
                      check_code_level=config.check_code_level,
                      metric_key="-PPL",
                      sampler=RandomSampler(),
                      prefetch=False,
                      use_tqdm=True,
                      device=config.device,
                      callbacks=[timing, early_stop])
    trainer.train()
def train(epoch, print_loss=False):
    """Run one training epoch of the VAE and report the average loss.

    Args:
        epoch: current epoch number, used for logging only.
        print_loss: when True, print per-batch losses every
            ``args.log_interval`` batches.
    """
    model.train()
    total_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        batch_loss = Loss(recon_batch, data, mu, logvar)
        batch_loss.backward()
        total_loss += batch_loss.item()
        optimizer.step()
        if print_loss and batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                batch_loss.item() / len(data)))
    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, total_loss / len(train_loader.dataset)))
batch_size=64, shuffle=False, num_workers=2) # write header with open('log.csv', 'w') as f: writer = csv.writer(f) writer.writerow( ["iteration", "train_loss", "val_loss", "acc", "val_acc"]) # device_ids = [2,] # build model and optimizer # model = nn.DataParallel(ResNet6n(10, n = 18), device_ids = device_ids) model = ResNet6n(10, n=18) model.cuda() criterion = Loss() criterion.cuda() # model.load_state_dict(torch.load("weights.pkl")) # train i = 0 correct, total = 0, 0 train_loss, counter = 0, 0 for epoch in range(1000000): # iteration over all train data for (data1, data2) in zip(loader1, loader2): # update lr if i == 0: optimizer = optim.SGD(model.parameters(), lr=1e-1,
def main(args):
    """Train a segmentation model with PaddlePaddle from a config file.

    Builds the dataset/augmentation pipeline, selects the model named in the
    config, wires up VisualDL logging (resuming iteration/epoch counters when
    ``--pretrain`` is given), then fits and saves the model.

    Args:
        args: parsed CLI namespace; reads ``config``, ``save_dir`` and
            ``pretrain``.
    """
    config = Config(args.config)
    cfg = config(vars(args), mode=['train', 'init'])
    mdname = cfg['train']['model']
    # VisualDL log file: <save_dir>/<vdl_dir>/<model>/vdlrecords.<model>.log
    vdl_dir = os.path.join(args.save_dir, cfg['init']['vdl_dir'])
    vdl_dir = os.path.join(vdl_dir, mdname)
    vdl_name = 'vdlrecords.' + mdname + '.log'
    vdl_log_dir = os.path.join(vdl_dir, vdl_name)
    fil_list = os.path.join(cfg['train']['root_path'],
                            cfg['train']['train_list'])
    mean = cfg['train']['mean']
    std = cfg['train']['std']
    custom = cfg['train']['custom']['type']
    if custom == True:
        # Custom normalization statistics override the defaults.
        print('use custom data')
        mean = cfg['train']['custom']['mean']
        std = cfg['train']['custom']['std']
    # Image augmentation pipeline (applied to the training subset only).
    trfm = imgehance(size=cfg['train']['sz'])
    # Load the dataset and split into train/valid subsets.
    ds = SDataSet(path=cfg['train']['root_path'], fl=fil_list,
                  sz=cfg['train']['sz'])
    train_ds = SubSet(ds, mode='train', mean=mean, std=std, transform=trfm)
    val_ds = SubSet(ds, mode='valid', mean=mean, std=std, transform=None)
    # Select the model architecture by name.
    net = modelset(mode=mdname, num_classes=cfg['init']['num_classes'])
    # Static input/label specs for paddle.Model (NCHW image, int64 mask).
    input = InputSpec([None, 3, 64, 64], 'float32', 'image')
    label = InputSpec([None, 1, 64, 64], 'int64', 'label')
    model = paddle.Model(net, input, label)
    # print(model.summary((-1, 3, 64, 64)))
    iters = 0
    epochs = 0
    if args.pretrain:
        # Resume: load weights and recover step/epoch counters from the
        # existing VisualDL scalar records.
        model.load(path=os.path.join(args.save_dir, mdname) + '/' +
                   str(mdname))
        vdlreader = LogReader(file_path=vdl_log_dir)
        iters = vdlreader.get_data('scalar', 'train%miou')[-1].id + 1
        epochs = vdlreader.get_data('scalar', 'eval%miou')[-1].id + 1
    elif os.path.exists(vdl_dir):
        # Fresh run: drop stale logs so new records start clean.
        shutil.rmtree(vdl_dir)
    write = LogWriter(logdir=vdl_dir, file_name=vdl_name)
    opt = paddle.optimizer.Momentum(learning_rate=cfg['train']['lr'],
                                    parameters=model.parameters())
    model.prepare(
        optimizer=opt,
        loss=Loss(),
        metrics=Miou(num_classes=cfg['init']['num_classes'], name='miou'),
    )
    model.fit(
        train_ds,
        val_ds,
        epochs=cfg['train']['epoch'],
        batch_size=cfg['train']['batchsz'],
        log_freq=1,
        save_freq=cfg['train']['save_freq'],
        save_dir=os.path.join(args.save_dir, mdname) + '/' + str(mdname),
        verbose=1,
        num_workers=cfg['train']['num_workers'],
        # Custom VisualDL callback; resumes iters/epochs when pretraining.
        callbacks=VDL(write=write, iters=iters, epochs=epochs)
    )
    print('save model in {}'.format(os.path.join(args.save_dir, mdname)))
    model.save(path=os.path.join(args.save_dir, mdname) + '/' + str(mdname))
def main():
    """DINO self-distillation training on Imagenette with a DeiT-small backbone.

    Parses CLI args, builds augmented/plain dataloaders, a student/teacher
    pair of ViTs with projection heads, then trains the student with the DINO
    loss while updating the teacher as an EMA of the student. Periodically
    logs embeddings and a KNN accuracy to TensorBoard and checkpoints the
    best model.
    """
    parser = argparse.ArgumentParser(
        "DINO training CLI",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-b", "--batch-size", type=int, default=4)
    parser.add_argument("-d", "--device", type=str, choices=("cpu", "cuda"),
                        default="cuda")
    parser.add_argument("-l", "--logging-freq", type=int, default=200)
    # NOTE(review): type=int with a 0.9995 default — an explicit CLI value
    # would be truncated to int; presumably this should be type=float.
    parser.add_argument("--momentum-teacher", type=int, default=0.9995)
    parser.add_argument("-c", "--n-crops", type=int, default=4)
    parser.add_argument("-e", "--n-epochs", type=int, default=100)
    parser.add_argument("-o", "--out-dim", type=int, default=1024)
    parser.add_argument("-t", "--tensorboard-dir", type=str, default="logs")
    parser.add_argument("--clip-grad", type=float, default=2.0)
    parser.add_argument("--norm-last-layer", action="store_true")
    parser.add_argument("--batch-size-eval", type=int, default=8)
    parser.add_argument("--teacher-temp", type=float, default=0.04)
    parser.add_argument("--student-temp", type=float, default=0.1)
    parser.add_argument("--pretrained", action="store_true")
    parser.add_argument("-w", "--weight-decay", type=float, default=0.4)
    args = parser.parse_args()
    print(vars(args))

    # Parameters
    vit_name, dim = "deit_small_patch16_224", 384
    path_dataset_train = pathlib.Path("data/imagenette2-320/train")
    path_dataset_val = pathlib.Path("data/imagenette2-320/val")
    path_labels = pathlib.Path("data/imagenette_labels.json")
    logging_path = pathlib.Path(args.tensorboard_dir)
    device = torch.device(args.device)
    n_workers = 1  # only up to 2 workers on this machine

    # Data related
    with path_labels.open("r") as f:
        label_mapping = json.load(f)

    # Two global crops are handled separately; the rest are local crops.
    transform_aug = DataAugmentation(size=224, n_local_crops=args.n_crops - 2)
    transform_plain = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        transforms.Resize((224, 224)),
    ])

    dataset_train_aug = ImageFolder(path_dataset_train,
                                    transform=transform_aug)
    dataset_train_plain = ImageFolder(path_dataset_train,
                                      transform=transform_plain)
    dataset_val_plain = ImageFolder(path_dataset_val,
                                    transform=transform_plain)

    if dataset_train_plain.classes != dataset_val_plain.classes:
        raise ValueError("Inconsistent classes")

    data_loader_train_aug = DataLoader(
        dataset_train_aug,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
    )
    data_loader_train_plain = DataLoader(
        dataset_train_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    data_loader_val_plain = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    # Every 50th validation image: a small subset for embedding logging.
    data_loader_val_plain_subset = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        sampler=SubsetRandomSampler(
            list(range(0, len(dataset_val_plain), 50))),
        num_workers=n_workers,
    )

    # Logging
    writer = SummaryWriter(logging_path)
    writer.add_text("arguments", json.dumps(vars(args)))

    # Neural network related
    student_vit = timm.create_model(vit_name, pretrained=args.pretrained)
    teacher_vit = timm.create_model(vit_name, pretrained=args.pretrained)

    student = MultiCropWrapper(
        student_vit,
        Head(
            dim,
            args.out_dim,
            norm_last_layer=args.norm_last_layer,
        ),
    )
    teacher = MultiCropWrapper(teacher_vit, Head(dim, args.out_dim))
    student, teacher = student.to(device), teacher.to(device)

    # The teacher starts as a copy of the student and is never trained
    # directly — it only receives the EMA update at the bottom of the loop.
    teacher.load_state_dict(student.state_dict())
    for p in teacher.parameters():
        p.requires_grad = False

    # Loss related
    loss_inst = Loss(
        args.out_dim,
        teacher_temp=args.teacher_temp,
        student_temp=args.student_temp,
    ).to(device)
    # Linear LR scaling rule relative to a base batch size of 256.
    lr = 0.0005 * args.batch_size / 256
    optimizer = torch.optim.AdamW(
        student.parameters(),
        lr=lr,
        weight_decay=args.weight_decay,
    )

    # Training loop
    n_batches = len(dataset_train_aug) // args.batch_size
    best_acc = 0
    n_steps = 0

    for e in range(args.n_epochs):
        for i, (images, _) in tqdm.tqdm(enumerate(data_loader_train_aug),
                                        total=n_batches):
            if n_steps % args.logging_freq == 0:
                student.eval()

                # Embedding projection of the validation subset.
                embs, imgs, labels_ = compute_embedding(
                    student.backbone,
                    data_loader_val_plain_subset,
                )
                writer.add_embedding(
                    embs,
                    metadata=[label_mapping[l] for l in labels_],
                    label_img=imgs,
                    global_step=n_steps,
                    tag="embeddings",
                )

                # KNN accuracy of the student backbone; checkpoint on
                # improvement.
                current_acc = compute_knn(
                    student.backbone,
                    data_loader_train_plain,
                    data_loader_val_plain,
                )
                writer.add_scalar("knn-accuracy", current_acc, n_steps)
                if current_acc > best_acc:
                    torch.save(student, logging_path / "best_model.pth")
                    best_acc = current_acc

                student.train()

            images = [img.to(device) for img in images]

            # Teacher only sees the two global crops; student sees all crops.
            teacher_output = teacher(images[:2])
            student_output = student(images)

            loss = loss_inst(student_output, teacher_output)

            optimizer.zero_grad()
            loss.backward()
            clip_gradients(student, args.clip_grad)
            optimizer.step()

            # EMA update of the teacher from the student weights.
            with torch.no_grad():
                for student_ps, teacher_ps in zip(student.parameters(),
                                                  teacher.parameters()):
                    teacher_ps.data.mul_(args.momentum_teacher)
                    teacher_ps.data.add_(
                        (1 - args.momentum_teacher) *
                        student_ps.detach().data)

            writer.add_scalar("train_loss", loss, n_steps)

            n_steps += 1
def main():
    """DINO training CLI with selectable ViT backbones and multi-backend logging.

    Variant of the DINO training entry point: chooses among
    vit_tiny/vit_small/vit_base, selects the optimizer by name, and logs to
    TensorBoard, Neptune and Weights & Biases simultaneously. Trains the
    student with the DINO loss and updates the teacher as an EMA of the
    student, checkpointing on best KNN accuracy.
    """
    parser = argparse.ArgumentParser(
        "DINO training CLI",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="vit_tiny",
        choices=["vit_tiny", "vit_small", "vit_base"],
    )
    parser.add_argument("-b", "--batch-size", type=int, default=32)
    parser.add_argument("-d", "--device", type=int, default=0)
    parser.add_argument("--gpu", action="store_true")
    parser.add_argument("-l", "--logging-freq", type=int, default=200)
    # NOTE(review): type=int with a 0.9995 default — an explicit CLI value
    # would be truncated to int; presumably this should be type=float.
    parser.add_argument("--momentum-teacher", type=int, default=0.9995)
    parser.add_argument("-c", "--n-crops", type=int, default=4)
    parser.add_argument("-e", "--n-epochs", type=int, default=100)
    parser.add_argument("-o", "--out-dim", type=int, default=1024)
    parser.add_argument("-t", "--tensorboard-dir", type=str, default="")
    parser.add_argument("--optimizer", type=str, default="AdamW")
    parser.add_argument("--clip-grad", type=float, default=2.0)
    parser.add_argument("--norm-last-layer", action="store_true")
    parser.add_argument("--batch-size-eval", type=int, default=64)
    parser.add_argument("--teacher-temp", type=float, default=0.04)
    parser.add_argument("--student-temp", type=float, default=0.1)
    parser.add_argument("--pretrained", action="store_true")
    parser.add_argument("-w", "--weight-decay", type=float, default=0.4)
    args = parser.parse_args()
    print(vars(args))

    # Parameters: model factory and embedding dim per variant.
    models = {
        "vit_tiny": [vit_tiny, 192],
        "vit_small": [vit_small, 384],
        "vit_base": [vit_base, 768],
    }
    path_dataset_train = pathlib.Path("data/imagenette2-320/train")
    path_dataset_val = pathlib.Path("data/imagenette2-320/val")
    path_labels = pathlib.Path("data/imagenette_labels.json")

    if args.gpu:
        torch.cuda.empty_cache()
        torch.cuda.set_device(args.device)
        device = torch.cuda.current_device()
        print(f"Current CUDA device: {device}")
    else:
        device = torch.device("cpu")
        print(f"Current device: {device}")

    n_workers = 4

    ##################
    # Data preparation
    ##################
    with path_labels.open("r") as f:
        label_mapping = json.load(f)

    # Two global crops are handled separately; the rest are local crops.
    transform_aug = DataAugmentation(size=224, n_local_crops=args.n_crops - 2)
    transform_plain = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        transforms.Resize((224, 224)),
    ])

    dataset_train_aug = ImageFolder(path_dataset_train,
                                    transform=transform_aug)
    dataset_train_plain = ImageFolder(path_dataset_train,
                                      transform=transform_plain)
    dataset_val_plain = ImageFolder(path_dataset_val,
                                    transform=transform_plain)

    if dataset_train_plain.classes != dataset_val_plain.classes:
        raise ValueError("Inconsistent classes")

    train_dataloader_aug = DataLoader(
        dataset_train_aug,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
    )
    train_dataloader_plain = DataLoader(
        dataset_train_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    val_dataloader_plain = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    # Every 50th validation image: a small subset for embedding logging.
    val_dataloader_plain_subset = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        sampler=SubsetRandomSampler(
            list(range(0, len(dataset_val_plain), 50))),
        num_workers=n_workers,
    )
    print(f"[INFO] Data loaded")

    #########
    # Logging
    #########
    run = neptune.init(project="beomus/dino-test")
    run["config/parameters"] = json.dumps(vars(args))
    writer = SummaryWriter(log_dir=args.tensorboard_dir)
    writer.add_text("arguments", json.dumps(vars(args)))
    # All checkpoints/artifacts go into TensorBoard's run directory.
    logging_path = pathlib.Path(writer.log_dir)
    wandb.init(project="dino", entity="beomus")
    wandb.config.update(args)
    print(f"[INFO] Logging started")

    #######################
    # Models initialization
    #######################
    model_fn, dim = models[args.model]
    student_vit = model_fn()
    teacher_vit = model_fn()

    student = MultiCropWrapper(
        student_vit,
        MlpHead(in_dim=dim, out_dim=args.out_dim,
                norm_last_layer=args.norm_last_layer),
    )
    teacher = MultiCropWrapper(teacher_vit, MlpHead(dim, args.out_dim))
    student, teacher = student.to(device), teacher.to(device)

    # The teacher starts as a copy of the student and is never trained
    # directly — it only receives the EMA update at the bottom of the loop.
    teacher.load_state_dict(student.state_dict())
    for p in teacher.parameters():
        p.requires_grad = False
    print(f"[INFO]: Model initialized")

    ######
    # Loss
    ######
    loss_inst = Loss(
        out_dim=args.out_dim,
        teacher_temp=args.teacher_temp,
        student_temp=args.student_temp,
    ).to(device)
    # Linear LR scaling rule relative to a base batch size of 256.
    lr = 0.0005 * args.batch_size / 256
    optimizer_kwargs = {
        "params": student.parameters(),
        "lr": lr,
        "weight_decay": args.weight_decay,
        "amsgrad": True,
    }
    if args.optimizer == "SGD":
        # SGD takes momentum but no amsgrad flag.
        optimizer_kwargs["momentum"] = 0.9
        optimizer_kwargs.pop("amsgrad")
    # Resolve the optimizer class by name from torch.optim.
    optimizer = getattr(torch.optim, args.optimizer)(**optimizer_kwargs)
    # optimizer = torch.optim.AdamW(
    #     student.parameters(), lr=lr, weight_decay=args.weight_decay
    # )

    # Snapshot the model architecture and optimizer config as text artifacts.
    model_name = f"{type(student).__name__}"
    with open(f"{logging_path / model_name}_arch.txt", "w") as f:
        f.write(str(student))
    run[f"config/model/{model_name}_arch"].upload(
        f"{logging_path / model_name}_arch.txt")

    optimizer_name = f"{type(optimizer).__name__}"
    with open(f"{logging_path / optimizer_name}.txt", "w") as f:
        f.write(str(optimizer))
    run[f"config/{optimizer_name}"].upload(
        f"{logging_path / optimizer_name}.txt")

    ###############
    # Training loop
    ###############
    n_batches = len(dataset_train_aug) // args.batch_size
    n_steps, best_acc = 0, 0

    print(f"[INFO]: Training started")
    for epoch in range(args.n_epochs):
        for i, (images, _) in tqdm.tqdm(enumerate(train_dataloader_aug),
                                        total=n_batches):
            if n_steps % args.logging_freq == 0:
                student.eval()

                # Embedding projection of the validation subset.
                embs, imgs, labels_ = compute_embedding(
                    student.backbone, val_dataloader_plain_subset)
                writer.add_embedding(
                    embs,
                    metadata=[label_mapping[l] for l in labels_],
                    label_img=imgs,
                    global_step=n_steps,
                    tag="embeddings",
                )

                # KNN accuracy of the student backbone; checkpoint on
                # improvement.
                current_acc = compute_knn(student.backbone,
                                          train_dataloader_plain,
                                          val_dataloader_plain)
                writer.add_scalar("knn-accuracy", current_acc, n_steps)
                run["metrics/acc"].log(current_acc)
                wandb.log({"accuracy": current_acc})
                if current_acc > best_acc:
                    model_path = str(logging_path / "model_best.pth")
                    torch.save(student, model_path)
                    run["model_checkpoints/my_model"].upload(model_path)
                    best_acc = current_acc

                student.train()

            images = [img.to(device) for img in images]

            # Teacher only sees the two global crops; student sees all crops.
            teacher_output = teacher(images[:2])
            student_output = student(images)

            loss = loss_inst(student_output, teacher_output)

            optimizer.zero_grad()
            loss.backward()
            clip_gradients(student, args.clip_grad)
            optimizer.step()

            # EMA update of the teacher from the student weights.
            with torch.no_grad():
                for student_ps, teacher_ps in zip(student.parameters(),
                                                  teacher.parameters()):
                    teacher_ps.data.mul_(args.momentum_teacher)
                    teacher_ps.data.add_(
                        (1 - args.momentum_teacher) *
                        student_ps.detach().data)

            writer.add_scalar("train_loss", loss, n_steps)
            run["metrics/loss"].log(loss)
            wandb.log({"loss": loss})

            n_steps += 1
    print(f"[INFO]: Training ended")
    run.stop()