Exemple #1
0
    def forward_log_prob(self,
                         returns,
                         cell,
                         args,
                         n_steps,
                         outputs,
                         conditioned=True):
        """Compute the log-probability of a trajectory of observed outputs.

        Rolls the recurrent model forward for ``n_steps``, obtaining a
        predictive distribution at each step via ``self.get_distrib`` and
        accumulating the probability of the (normalized) observed step value
        under that distribution.

        Args:
            returns: initial recurrent input; ``returns.shape[1]`` is read as
                the batch size.
            cell: recurrent cell state threaded through ``self.get_distrib``.
            args: namespace providing ``step_condition`` and
                ``value_condition``.
            n_steps: number of rollout steps.
            outputs: observed cumulative outputs, sliced as
                ``outputs[:, j:j + 1]`` per step.
            conditioned: when True, step ``args.step_condition - 1`` scores
                the conditioning value instead of the observed output.

        Returns:
            Tensor of shape ``(batch_size,)``: log of the product of the
            per-step probabilities.
        """

        batch_size = returns.shape[1]
        sample_returns = returns

        # Running product of per-step probabilities, one entry per batch item.
        probabilities = torch.ones(batch_size).to(self.device)
        for j in range(n_steps):

            next_distrib, cell = self.get_distrib(sample_returns, cell)
            next_distrib = next_distrib.squeeze(2)

            if conditioned and j == args.step_condition - 1:
                # Convert the imposed condition value to a per-step return
                # (ratio to the previous cumulative output), then normalize
                # with the model's statistics.
                # NOTE(review): when args.step_condition == 1 this reads the
                # empty slice outputs[:, -1:0] — confirm step_condition >= 2.
                # NOTE(review): unlike the else-branch, sample_returns is NOT
                # updated here — confirm that is intended.
                value = args.value_condition / outputs[:, j - 1:j]
                normalized_value = (value - self.mean) / self.dev
                rescaled_value = normalized_value.squeeze(1)
            else:
                if j == 0:
                    value = outputs[:, j:j + 1]
                else:
                    # Per-step return: ratio of consecutive cumulative outputs.
                    value = outputs[:, j:j + 1] / outputs[:, j - 1:j]
                normalized_value = (value - self.mean) / self.dev
                rescaled_value = normalized_value.squeeze(1)
                # Feed the normalized observed value back as the next input.
                sample_returns = normalized_value.transpose(1, 0)

            # Probability of the observed (rescaled) value under this step's
            # predicted distribution.
            loss = Loss()
            aux = loss.compute_probs(next_distrib, rescaled_value).squeeze(1)
            probabilities *= aux

        return torch.log(probabilities)
def main():
    """Entry point: build the UNet and dispatch to train, val or test.

    Reads the run phase from the module-level ``args`` and all remaining
    hyper-parameters from the module-level ``opt``.
    """
    global args

    net = UNet(3, 1)
    net.load(opt.ckpt_path)
    criterion = Loss('soft_dice_loss')
    torch.cuda.set_device(0)
    net = net.cuda()
    criterion = criterion.cuda()

    if args.phase == 'train':
        train_loader = DataLoader(NucleiDetector(opt, phase=args.phase),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  pin_memory=opt.pin_memory)
        current_lr = opt.lr
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=current_lr,
                                     weight_decay=opt.weight_decay)
        last_loss = None  # no epoch has run yet
        for epoch in range(opt.epoch + 1):
            epoch_loss = train(train_loader, net, criterion, epoch, optimizer,
                               opt.model_save_freq, opt.model_save_path)
            # Decay the learning rate whenever the loss stops improving.
            if last_loss is not None and epoch_loss > last_loss:
                current_lr *= opt.lr_decay
                for group in optimizer.param_groups:
                    group['lr'] = current_lr
                save_lr(net.model_name, opt.lr_save_path, current_lr)
            last_loss = epoch_loss
    elif args.phase == 'val':
        # Validation pass over the held-out split.
        val_loader = DataLoader(NucleiDetector(opt, phase='val'),
                                batch_size=opt.batch_size,
                                shuffle=True,
                                num_workers=opt.num_workers,
                                pin_memory=opt.pin_memory)
        val(val_loader, net, criterion)
    else:
        # Test pass, one image at a time.
        test_loader = DataLoader(NucleiDetector(opt, phase='test'),
                                 batch_size=1,
                                 shuffle=True,
                                 num_workers=opt.num_workers,
                                 pin_memory=opt.pin_memory)
        test(test_loader, net, opt)
Exemple #3
0
def test(epoch):
    """Evaluate the VAE on the test set and save a reconstruction grid.

    Uses the module-level ``model``, ``test_loader``, ``device``, ``Loss``
    and ``args``. Writes ``Results/epoch_<epoch>.png`` comparing the first
    up-to-8 inputs of the first batch with their reconstructions, then
    prints the average per-sample test loss.

    Args:
        epoch: epoch index, used only for logging and the output filename.
    """
    # makedirs(exist_ok=True) replaces the racy exists()+mkdir pair.
    os.makedirs(os.path.join(os.getcwd(), 'Results'), exist_ok=True)

    model.eval()
    test_loss = 0

    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += Loss(recon_batch, data, mu, logvar).item()

            if i == 0:
                n = min(data.size(0), 8)

                # NOTE(review): view() assumes the first batch holds exactly
                # args.batch_size samples — confirm the loader never yields a
                # short first batch.
                comparison = torch.cat([
                    data[:n],
                    recon_batch.view(args.batch_size, 3, 32, 32)[:n]
                ])
                # Build the path with os.path.join, consistent with the
                # directory creation above.
                save_image(comparison.data.cpu(),
                           os.path.join('Results',
                                        'epoch_' + str(epoch) + '.png'),
                           nrow=n)

    test_loss /= len(test_loader.dataset)
    print('====> Test in Epoch %d' % (epoch))
    print('====> Test set loss: {:.4f}'.format(test_loss))
Exemple #4
0
    def __init__(self, backbone=None, num_classes=21):
        """Assemble the SSD300 detector on top of a backbone.

        Builds the extra feature layers, one 3x3 conv head per feature map
        for box regression and classification, and the default-box helpers
        (loss, encoder, post-processing).

        Args:
            backbone: feature extractor exposing ``out_channels``.
            num_classes: number of object classes including background.
        """
        super(SSD300, self).__init__()
        self.feature_extractor = backbone
        self.num_classes = num_classes
        # Default (anchor) boxes per spatial location on each feature map.
        self.num_defaults = [4, 6, 6, 6, 4, 4]
        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        self._build_additional_features(self.feature_extractor.out_channels)

        # One 3x3 conv head per feature map: 4 box offsets and
        # num_classes confidences for every default box.
        loc_layers = []
        conf_layers = []
        for boxes_per_cell, channels in zip(
                self.num_defaults, self.feature_extractor.out_channels):
            loc_layers.append(
                nn.Conv2d(channels,
                          boxes_per_cell * 4,
                          kernel_size=3,
                          padding=1))
            conf_layers.append(
                nn.Conv2d(channels,
                          boxes_per_cell * self.num_classes,
                          kernel_size=3,
                          padding=1))

        # Register the heads so their parameters are tracked.
        self.loc = nn.ModuleList(loc_layers)
        self.conf = nn.ModuleList(conf_layers)
        self._init_weights()

        # All default bounding boxes (shape [8732, 4]) shared by the loss,
        # the target encoder and post-processing.
        default_box = dboxes300()
        self.compute_loss = Loss(default_box)
        self.encoder = Encoder(default_box)
        self.postprocess = PostProcess(default_box)
Exemple #5
0
    def forward_conditioned(self, returns, cell, args, n_steps):
        """Roll out sampled returns while conditioning a single step.

        At step ``args.step_condition - 1`` no sample is drawn; instead the
        likelihood of the conditioning value under the predicted distribution
        is computed as importance ``weights``. Every other step draws a
        reparametrized sample, folds it into the running cumulative-return
        product and appends the product to ``outputs``.

        Args:
            returns: initial recurrent input; ``returns.shape[1]`` is read as
                the batch size.
            cell: recurrent cell state threaded through ``self.get_distrib``.
            args: namespace providing ``step_condition`` and
                ``value_condition``.
            n_steps: number of rollout steps.

        Returns:
            Tuple ``(outputs, weights)``.

        NOTE(review): ``weights`` is only assigned in the conditioned branch;
        if ``args.step_condition`` lies outside ``[1, n_steps]`` the final
        return raises NameError — confirm callers guarantee the range.
        """

        batch_size = returns.shape[1]
        sample_returns = returns

        # Last input is used for converting returns to values
        return_product = torch.ones(1, batch_size, 1).to(self.device)
        outputs = torch.zeros(batch_size, 0, 1).to(self.device)

        for j in range(n_steps):

            next_distrib, cell = self.get_distrib(sample_returns, cell)
            next_distrib = next_distrib.squeeze(2)

            if j == args.step_condition - 1:

                # Get likelihood of value
                loss = Loss()

                # NOTE(review): when step_condition == 1 this reads the empty
                # slice outputs[:, -1:0] — confirm step_condition >= 2.
                value = args.value_condition / outputs[:, j - 1:j]
                rescaled_value = (value.squeeze(1) - self.mean) / self.dev

                weights = loss.compute_probs(next_distrib,
                                             rescaled_value).squeeze(1)

                # Next sample returns is the value
                # NOTE(review): this assigns the raw condition value (likely a
                # scalar), unlike the tensor produced in the sampling branch —
                # confirm get_distrib accepts it on the next iteration.
                sample_returns = args.value_condition

            else:

                # Sample from next distrib
                sample_returns = sample_from_distrib_reparametrized(
                    next_distrib, batch_size, j)

                # De-normalize the sample and fold it into the cumulative
                # product of returns.
                rescaled_sample_return = (sample_returns * self.dev +
                                          self.mean)
                return_product = return_product * rescaled_sample_return

                # Cat distrib
                outputs = torch.cat(
                    [outputs, return_product.permute(1, 0, 2)], 1)

        return outputs, weights
Exemple #6
0
def evaluate_dataset(csv_path, target_index, problem, model, parameter_dict, method='holdout', seed=20, max_iter=50):
    """Benchmark Bayesian-optimisation strategies on one dataset.

    Loads the CSV, wraps the model's cross-validated loss, then runs
    expected-improvement and GP-UCB (beta = 0.5) Bayesian optimisation plus
    a random-search baseline, re-seeding NumPy identically before each run.

    Args:
        csv_path: path to the dataset CSV.
        target_index: column index of the target variable.
        problem: problem type forwarded to the ``Loss`` wrapper.
        model: estimator whose hyper-parameters are optimised.
        parameter_dict: hyper-parameter search space.
        method: evaluation scheme for the wrapper (default 'holdout').
        seed: NumPy seed applied before every strategy.
        max_iter: BO iterations per strategy.

    Returns:
        Tuple ``(ei_history, ucb_history, random_history)`` as arrays.
    """
    print('Now evaluating {}...'.format(csv_path))
    x, y = build(csv_path, target_index)

    wrapper = Loss(model, x, y, method=method, problem=problem)

    def _run_bo(**acq_kwargs):
        # Re-seed first so every strategy sees the same random stream;
        # construction order (kernel, GP, acquisition, BO) matters for it.
        np.random.seed(seed)
        kernel = SquaredExponential()
        gp = GaussianProcess(kernel, optimize=True, usegrads=True)
        acq = Acquisition(**acq_kwargs)
        bo = BO(gp, acq, wrapper.evaluate_loss, parameter_dict, n_jobs=1)
        bo.run(max_iter=max_iter)
        return bo

    print('Evaluating EI')
    bo_ei = _run_bo(mode='expected_improvement')

    print('Evaluating GP-gpucb beta = 0.5')
    bo_ucb = _run_bo(mode='gpucb', beta=0.5)

    print('Evaluating random')
    np.random.seed(seed)
    random_history = cum_max(
        evaluate_random(bo_ei, wrapper.evaluate_loss, n_eval=max_iter + 1))

    return np.array(bo_ei.history), np.array(bo_ucb.history), random_history
def train():
    """Train the poetry language model using the settings in ``Config``."""
    config = Config()

    train_data, dev_data, vocabulary = get_dataset(config.data_path)

    model = PoetryModel(vocabulary_size=len(vocabulary),
                        embedding_size=config.embedding_size,
                        hidden_size=config.hidden_size)
    loss_fn = Loss(pred='output', target='target')
    ppl_metric = Perplexity(pred='output', target='target')

    print("optimizer:", config.optimizer)
    print("momentum:", config.momentum)
    # Select the optimizer by name; an unknown name leaves `optimizer`
    # unbound, exactly as in the original control flow.
    choice = config.optimizer
    if choice == 'adam':
        optimizer = Adam(lr=config.lr, weight_decay=config.weight_decay)
    elif choice == 'sgd':
        optimizer = SGD(lr=config.lr, momentum=config.momentum)
    elif choice == 'adagrad':
        optimizer = Adagrad(lr=config.lr, weight_decay=config.weight_decay)
    elif choice == 'adadelta':
        optimizer = Adadelta(lr=config.lr,
                             rho=config.rho,
                             eps=config.eps,
                             weight_decay=config.weight_decay)

    # Timing first, early stopping second (callback order preserved).
    callbacks = [TimingCallback(), EarlyStopCallback(config.patience)]

    poetry_trainer = Trainer(train_data=train_data,
                             model=model,
                             loss=loss_fn,
                             metrics=ppl_metric,
                             n_epochs=config.epoch,
                             batch_size=config.batch_size,
                             print_every=config.print_every,
                             validate_every=config.validate_every,
                             dev_data=dev_data,
                             save_path=config.save_path,
                             optimizer=optimizer,
                             check_code_level=config.check_code_level,
                             metric_key="-PPL",
                             sampler=RandomSampler(),
                             prefetch=False,
                             use_tqdm=True,
                             device=config.device,
                             callbacks=callbacks)
    poetry_trainer.train()
Exemple #8
0
def train(epoch, print_loss=False):
    """Run one VAE training epoch over the module-level ``train_loader``.

    Args:
        epoch: epoch index, used only in log messages.
        print_loss: when True, also print per-batch losses every
            ``args.log_interval`` batches.
    """
    model.train()
    total_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()

        recon_batch, mu, logvar = model(data)
        batch_loss = Loss(recon_batch, data, mu, logvar)
        batch_loss.backward()
        total_loss += batch_loss.item()
        optimizer.step()

        # Periodic per-batch progress report.
        if print_loss and batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                batch_loss.item() / len(data)))

    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, total_loss / len(train_loader.dataset)))
Exemple #9
0
                                             batch_size=64,
                                             shuffle=False,
                                             num_workers=2)

    # write header
    with open('log.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(
            ["iteration", "train_loss", "val_loss", "acc", "val_acc"])

    # device_ids = [2,]
    # build model and optimizer
    # model = nn.DataParallel(ResNet6n(10, n = 18), device_ids = device_ids)
    model = ResNet6n(10, n=18)
    model.cuda()
    criterion = Loss()
    criterion.cuda()
    # model.load_state_dict(torch.load("weights.pkl"))

    # train
    i = 0
    correct, total = 0, 0
    train_loss, counter = 0, 0

    for epoch in range(1000000):
        # iteration over all train data
        for (data1, data2) in zip(loader1, loader2):
            # update lr
            if i == 0:
                optimizer = optim.SGD(model.parameters(),
                                      lr=1e-1,
Exemple #10
0
def main(args):
    """Train a segmentation model with PaddlePaddle from a config file.

    Merges CLI options into the config, builds datasets, model, optimizer
    and VisualDL logging, optionally resumes from a checkpoint, runs
    ``model.fit`` and saves the final weights.

    Args:
        args: parsed CLI namespace providing at least ``config``,
            ``save_dir`` and ``pretrain``.
    """
    config = Config(args.config)
    cfg = config(vars(args), mode=['train', 'init'])
    mdname = cfg['train']['model']
    # VisualDL records live under <save_dir>/<vdl_dir>/<model name>/.
    vdl_dir = os.path.join(args.save_dir, cfg['init']['vdl_dir'])
    vdl_dir = os.path.join(vdl_dir, mdname)
    vdl_name = 'vdlrecords.' + mdname + '.log'
    vdl_log_dir = os.path.join(vdl_dir, vdl_name)

    fil_list = os.path.join(cfg['train']['root_path'],
                            cfg['train']['train_list'])
    mean = cfg['train']['mean']
    std = cfg['train']['std']

    # Custom datasets override the normalization statistics.
    # NOTE(review): `custom == True` is False for truthy non-bool values
    # (e.g. the string 'true') — confirm the config stores a real bool.
    custom = cfg['train']['custom']['type']
    if custom == True:
        print('use custom data')
        mean = cfg['train']['custom']['mean']
        std = cfg['train']['custom']['std']

    # Image augmentation pipeline (training split only, see below).
    trfm = imgehance(size=cfg['train']['sz'])

    # Load the full dataset, then split into train/valid subsets.
    ds = SDataSet(path=cfg['train']['root_path'],
                  fl=fil_list,
                  sz=cfg['train']['sz'])

    train_ds = SubSet(ds, mode='train', mean=mean, std=std, transform=trfm)
    val_ds = SubSet(ds, mode='valid', mean=mean, std=std, transform=None)

    # Select the network architecture by name.
    net = modelset(mode=mdname, num_classes=cfg['init']['num_classes'])

    # Wrap the network in the high-level paddle.Model train/eval API.
    input = InputSpec([None, 3, 64, 64], 'float32', 'image')
    label = InputSpec([None, 1, 64, 64], 'int64', 'label')
    model = paddle.Model(net, input, label)
    #print(model.summary((-1, 3, 64, 64)))  #
    iters = 0
    epochs = 0
    if args.pretrain:
        # Resume: load weights and recover the last train/eval step ids
        # from the existing VisualDL records.
        model.load(path=os.path.join(args.save_dir, mdname) + '/' +
                   str(mdname))
        vdlreader = LogReader(file_path=vdl_log_dir)
        iters = vdlreader.get_data('scalar', 'train%miou')[-1].id + 1
        epochs = vdlreader.get_data('scalar', 'eval%miou')[-1].id + 1
    elif os.path.exists(vdl_dir):
        # Fresh run: drop stale logs so VisualDL starts from scratch.
        shutil.rmtree(vdl_dir)

    write = LogWriter(logdir=vdl_dir, file_name=vdl_name)

    opt = paddle.optimizer.Momentum(learning_rate=cfg['train']['lr'],
                                    parameters=model.parameters())
    model.prepare(
        optimizer=opt,
        loss=Loss(),
        metrics=Miou(num_classes=cfg['init']['num_classes'], name='miou'),
    )

    model.fit(
        train_ds,
        val_ds,
        epochs=cfg['train']['epoch'],
        batch_size=cfg['train']['batchsz'],
        log_freq=1,
        save_freq=cfg['train']['save_freq'],
        save_dir=os.path.join(args.save_dir, mdname) + '/' + str(mdname),
        verbose=1,
        num_workers=cfg['train']['num_workers'],
        callbacks=VDL(write=write, iters=iters,
                      epochs=epochs)  #VDL(logdir=vdl_dir)#
    )

    print('save model in {}'.format(os.path.join(args.save_dir, mdname)))
    model.save(path=os.path.join(args.save_dir, mdname) + '/' + str(mdname))
Exemple #11
0
def main():
    """DINO self-supervised training entry point (timm ViT backbone).

    Parses CLI arguments, builds Imagenette data loaders (multi-crop
    augmented for training, plain for KNN evaluation), creates student and
    teacher ViTs wrapped in multi-crop heads, then runs the DINO loop:
    gradient steps on the student plus an EMA update of the teacher, with
    periodic embedding and KNN-accuracy logging to TensorBoard.
    """
    parser = argparse.ArgumentParser(
        "DINO training CLI",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-b", "--batch-size", type=int, default=4)
    parser.add_argument("-d",
                        "--device",
                        type=str,
                        choices=("cpu", "cuda"),
                        default="cuda")
    parser.add_argument("-l", "--logging-freq", type=int, default=200)
    parser.add_argument("--momentum-teacher", type=int, default=0.9995)
    parser.add_argument("-c", "--n-crops", type=int, default=4)
    parser.add_argument("-e", "--n-epochs", type=int, default=100)
    parser.add_argument("-o", "--out-dim", type=int, default=1024)
    parser.add_argument("-t", "--tensorboard-dir", type=str, default="logs")
    parser.add_argument("--clip-grad", type=float, default=2.0)
    parser.add_argument("--norm-last-layer", action="store_true")
    parser.add_argument("--batch-size-eval", type=int, default=8)
    parser.add_argument("--teacher-temp", type=float, default=0.04)
    parser.add_argument("--student-temp", type=float, default=0.1)
    parser.add_argument("--pretrained", action="store_true")
    parser.add_argument("-w", "--weight-decay", type=float, default=0.4)

    args = parser.parse_args()
    print(vars(args))
    # Parameters
    vit_name, dim = "deit_small_patch16_224", 384
    path_dataset_train = pathlib.Path("data/imagenette2-320/train")
    path_dataset_val = pathlib.Path("data/imagenette2-320/val")
    path_labels = pathlib.Path("data/imagenette_labels.json")

    logging_path = pathlib.Path(args.tensorboard_dir)
    device = torch.device(args.device)

    n_workers = 1  # on my little machine, 2 at most

    # Data related
    with path_labels.open("r") as f:
        label_mapping = json.load(f)

    # Training uses multi-crop augmentation (2 global + n_crops-2 local);
    # evaluation uses a plain resize + normalize transform.
    transform_aug = DataAugmentation(size=224, n_local_crops=args.n_crops - 2)
    transform_plain = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        transforms.Resize((224, 224)),
    ])

    dataset_train_aug = ImageFolder(path_dataset_train,
                                    transform=transform_aug)
    dataset_train_plain = ImageFolder(path_dataset_train,
                                      transform=transform_plain)
    dataset_val_plain = ImageFolder(path_dataset_val,
                                    transform=transform_plain)

    if dataset_train_plain.classes != dataset_val_plain.classes:
        raise ValueError("Inconsistent classes")

    data_loader_train_aug = DataLoader(
        dataset_train_aug,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
    )
    data_loader_train_plain = DataLoader(
        dataset_train_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    data_loader_val_plain = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    # Sparse validation subset (every 50th sample) for embedding logging.
    data_loader_val_plain_subset = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        sampler=SubsetRandomSampler(list(range(0, len(dataset_val_plain),
                                               50))),
        num_workers=n_workers,
    )

    # Logging
    writer = SummaryWriter(logging_path)
    writer.add_text("arguments", json.dumps(vars(args)))

    # Neural network related
    student_vit = timm.create_model(vit_name, pretrained=args.pretrained)
    teacher_vit = timm.create_model(vit_name, pretrained=args.pretrained)

    student = MultiCropWrapper(
        student_vit,
        Head(
            dim,
            args.out_dim,
            norm_last_layer=args.norm_last_layer,
        ),
    )
    teacher = MultiCropWrapper(teacher_vit, Head(dim, args.out_dim))
    student, teacher = student.to(device), teacher.to(device)

    # Teacher starts as an exact copy of the student and is never
    # trained by backprop — only by the EMA update below.
    teacher.load_state_dict(student.state_dict())

    for p in teacher.parameters():
        p.requires_grad = False

    # Loss related
    loss_inst = Loss(
        args.out_dim,
        teacher_temp=args.teacher_temp,
        student_temp=args.student_temp,
    ).to(device)
    # Linear LR scaling rule relative to a batch size of 256.
    lr = 0.0005 * args.batch_size / 256
    optimizer = torch.optim.AdamW(
        student.parameters(),
        lr=lr,
        weight_decay=args.weight_decay,
    )

    # Training loop
    n_batches = len(dataset_train_aug) // args.batch_size
    best_acc = 0
    n_steps = 0

    for e in range(args.n_epochs):
        for i, (images, _) in tqdm.tqdm(enumerate(data_loader_train_aug),
                                        total=n_batches):
            # Periodic evaluation: log embeddings and KNN accuracy,
            # checkpoint the student when accuracy improves.
            if n_steps % args.logging_freq == 0:
                student.eval()

                # Embedding
                embs, imgs, labels_ = compute_embedding(
                    student.backbone,
                    data_loader_val_plain_subset,
                )
                writer.add_embedding(
                    embs,
                    metadata=[label_mapping[l] for l in labels_],
                    label_img=imgs,
                    global_step=n_steps,
                    tag="embeddings",
                )

                # KNN
                current_acc = compute_knn(
                    student.backbone,
                    data_loader_train_plain,
                    data_loader_val_plain,
                )
                writer.add_scalar("knn-accuracy", current_acc, n_steps)
                if current_acc > best_acc:
                    torch.save(student, logging_path / "best_model.pth")
                    best_acc = current_acc

                student.train()

            images = [img.to(device) for img in images]

            # The teacher sees only the first two crops — presumably the
            # global crops produced by DataAugmentation; the student sees all.
            teacher_output = teacher(images[:2])
            student_output = student(images)

            loss = loss_inst(student_output, teacher_output)

            optimizer.zero_grad()
            loss.backward()
            clip_gradients(student, args.clip_grad)
            optimizer.step()

            # EMA update of the teacher from the student's parameters.
            with torch.no_grad():
                for student_ps, teacher_ps in zip(student.parameters(),
                                                  teacher.parameters()):
                    teacher_ps.data.mul_(args.momentum_teacher)
                    teacher_ps.data.add_(
                        (1 - args.momentum_teacher) * student_ps.detach().data)

            writer.add_scalar("train_loss", loss, n_steps)

            n_steps += 1
Exemple #12
0
def main():
    """DINO self-supervised training entry point (local ViT models).

    Same structure as the timm-based variant: parses CLI arguments, builds
    Imagenette data loaders, creates student/teacher ViTs with MLP heads,
    and runs the DINO loop with an EMA teacher update. This variant selects
    the backbone from a local model registry, supports a configurable
    optimizer, and logs to TensorBoard, Neptune and Weights & Biases.
    """
    parser = argparse.ArgumentParser(
        "DINO training CLI",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="vit_tiny",
        choices=["vit_tiny", "vit_small", "vit_base"],
    )
    parser.add_argument("-b", "--batch-size", type=int, default=32)
    parser.add_argument("-d", "--device", type=int, default=0)
    parser.add_argument("--gpu", action="store_true")
    parser.add_argument("-l", "--logging-freq", type=int, default=200)
    parser.add_argument("--momentum-teacher", type=int, default=0.9995)
    parser.add_argument("-c", "--n-crops", type=int, default=4)
    parser.add_argument("-e", "--n-epochs", type=int, default=100)
    parser.add_argument("-o", "--out-dim", type=int, default=1024)
    parser.add_argument("-t", "--tensorboard-dir", type=str, default="")
    parser.add_argument("--optimizer", type=str, default="AdamW")
    parser.add_argument("--clip-grad", type=float, default=2.0)
    parser.add_argument("--norm-last-layer", action="store_true")
    parser.add_argument("--batch-size-eval", type=int, default=64)
    parser.add_argument("--teacher-temp", type=float, default=0.04)
    parser.add_argument("--student-temp", type=float, default=0.1)
    parser.add_argument("--pretrained", action="store_true")
    parser.add_argument("-w", "--weight-decay", type=float, default=0.4)

    args = parser.parse_args()
    print(vars(args))
    # Parameters
    # Registry mapping model name to (constructor, embedding dim).
    models = {
        "vit_tiny": [vit_tiny, 192],
        "vit_small": [vit_small, 384],
        "vit_base": [vit_base, 768],
    }
    path_dataset_train = pathlib.Path("data/imagenette2-320/train")
    path_dataset_val = pathlib.Path("data/imagenette2-320/val")
    path_labels = pathlib.Path("data/imagenette_labels.json")

    if args.gpu:
        torch.cuda.empty_cache()
        torch.cuda.set_device(args.device)
        device = torch.cuda.current_device()
        print(f"Current CUDA device: {device}")
    else:
        device = torch.device("cpu")
        print(f"Current device: {device}")

    n_workers = 4

    ##################
    # Data preparation
    ##################
    with path_labels.open("r") as f:
        label_mapping = json.load(f)

    # Training uses multi-crop augmentation (2 global + n_crops-2 local);
    # evaluation uses a plain resize + normalize transform.
    transform_aug = DataAugmentation(size=224, n_local_crops=args.n_crops - 2)
    transform_plain = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        transforms.Resize((224, 224)),
    ])

    dataset_train_aug = ImageFolder(path_dataset_train,
                                    transform=transform_aug)
    dataset_train_plain = ImageFolder(path_dataset_train,
                                      transform=transform_plain)
    dataset_val_plain = ImageFolder(path_dataset_val,
                                    transform=transform_plain)

    if dataset_train_plain.classes != dataset_val_plain.classes:
        raise ValueError("Inconsistent classes")

    train_dataloader_aug = DataLoader(
        dataset_train_aug,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
    )
    train_dataloader_plain = DataLoader(
        dataset_train_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    val_dataloader_plain = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    # Sparse validation subset (every 50th sample) for embedding logging.
    val_dataloader_plain_subset = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        sampler=SubsetRandomSampler(list(range(0, len(dataset_val_plain),
                                               50))),
        num_workers=n_workers,
    )
    print(f"[INFO] Data loaded")

    #########
    # Logging
    #########
    run = neptune.init(project="beomus/dino-test")
    run["config/parameters"] = json.dumps(vars(args))
    writer = SummaryWriter(log_dir=args.tensorboard_dir)
    writer.add_text("arguments", json.dumps(vars(args)))
    logging_path = pathlib.Path(writer.log_dir)

    wandb.init(project="dino", entity="beomus")
    wandb.config.update(args)

    print(f"[INFO] Logging started")

    #######################
    # Models initialization
    #######################
    model_fn, dim = models[args.model]
    student_vit = model_fn()
    teacher_vit = model_fn()

    student = MultiCropWrapper(
        student_vit,
        MlpHead(in_dim=dim,
                out_dim=args.out_dim,
                norm_last_layer=args.norm_last_layer),
    )
    teacher = MultiCropWrapper(teacher_vit, MlpHead(dim, args.out_dim))
    student, teacher = student.to(device), teacher.to(device)

    # Teacher starts as an exact copy of the student and is never
    # trained by backprop — only by the EMA update below.
    teacher.load_state_dict(student.state_dict())

    for p in teacher.parameters():
        p.requires_grad = False

    print(f"[INFO]: Model initialized")

    ######
    # Loss
    ######
    loss_inst = Loss(
        out_dim=args.out_dim,
        teacher_temp=args.teacher_temp,
        student_temp=args.student_temp,
    ).to(device)
    # Linear LR scaling rule relative to a batch size of 256.
    lr = 0.0005 * args.batch_size / 256

    # Build the optimizer by name from torch.optim; SGD uses momentum
    # instead of AdamW's amsgrad flag.
    optimizer_kwargs = {
        "params": student.parameters(),
        "lr": lr,
        "weight_decay": args.weight_decay,
        "amsgrad": True,
    }
    if args.optimizer == "SGD":
        optimizer_kwargs["momentum"] = 0.9
        optimizer_kwargs.pop("amsgrad")
    optimizer = getattr(torch.optim, args.optimizer)(**optimizer_kwargs)

    # optimizer = torch.optim.AdamW(
    #     student.parameters(), lr=lr, weight_decay=args.weight_decay
    # )

    # Snapshot the model and optimizer descriptions into the run logs.
    model_name = f"{type(student).__name__}"
    with open(f"{logging_path / model_name}_arch.txt", "w") as f:
        f.write(str(student))
    run[f"config/model/{model_name}_arch"].upload(
        f"{logging_path / model_name}_arch.txt")

    optimizer_name = f"{type(optimizer).__name__}"
    with open(f"{logging_path / optimizer_name}.txt", "w") as f:
        f.write(str(optimizer))
    run[f"config/{optimizer_name}"].upload(
        f"{logging_path / optimizer_name}.txt")

    ###############
    # Training loop
    ###############
    n_batches = len(dataset_train_aug) // args.batch_size
    n_steps, best_acc = 0, 0

    print(f"[INFO]: Training started")
    for epoch in range(args.n_epochs):
        for i, (images, _) in tqdm.tqdm(enumerate(train_dataloader_aug),
                                        total=n_batches):
            # Periodic evaluation: log embeddings and KNN accuracy,
            # checkpoint the student when accuracy improves.
            if n_steps % args.logging_freq == 0:
                student.eval()

                # embedding
                embs, imgs, labels_ = compute_embedding(
                    student.backbone, val_dataloader_plain_subset)
                writer.add_embedding(
                    embs,
                    metadata=[label_mapping[l] for l in labels_],
                    label_img=imgs,
                    global_step=n_steps,
                    tag="embeddings",
                )

                # KNN
                current_acc = compute_knn(student.backbone,
                                          train_dataloader_plain,
                                          val_dataloader_plain)
                writer.add_scalar("knn-accuracy", current_acc, n_steps)
                run["metrics/acc"].log(current_acc)
                wandb.log({"accuracy": current_acc})
                if current_acc > best_acc:
                    model_path = str(logging_path / "model_best.pth")
                    torch.save(student, model_path)
                    run["model_checkpoints/my_model"].upload(model_path)
                    best_acc = current_acc

                student.train()

            images = [img.to(device) for img in images]

            # The teacher sees only the first two crops — presumably the
            # global crops produced by DataAugmentation; the student sees all.
            teacher_output = teacher(images[:2])
            student_output = student(images)

            loss = loss_inst(student_output, teacher_output)

            optimizer.zero_grad()
            loss.backward()
            clip_gradients(student, args.clip_grad)
            optimizer.step()

            # EMA update of the teacher from the student's parameters.
            with torch.no_grad():
                for student_ps, teacher_ps in zip(student.parameters(),
                                                  teacher.parameters()):
                    teacher_ps.data.mul_(args.momentum_teacher)
                    teacher_ps.data.add_(
                        (1 - args.momentum_teacher) * student_ps.detach().data)

            writer.add_scalar("train_loss", loss, n_steps)
            run["metrics/loss"].log(loss)
            wandb.log({"loss": loss})

            n_steps += 1

    print(f"[INFO]: Training ended")
    run.stop()