Example 1
def main(args):

    args = parse_args()
    tag = args.tag
    device = torch.device('cuda:0')

    no_epochs = args.epochs
    batch_size = args.batch

    linear_hidden = args.linear
    conv_hidden = args.conv

    # Get train/test paths -> later on implement cross-validation
    steps = get_paths(as_tuples=True, shuffle=True, tag=tag)
    split_idx = int(len(steps) * .8)
    steps_train, steps_test = steps[:split_idx], steps[split_idx:]

    transform = transforms.Compose(
        [DepthSegmentationPreprocess(no_data_points=1),
         ToSupervised()])

    dataset_train = SimpleDataset(ids=steps_train,
                                  batch_size=batch_size,
                                  transform=transform,
                                  **SENSORS)
    dataset_test = SimpleDataset(ids=steps_test,
                                 batch_size=batch_size,
                                 transform=transform,
                                 **SENSORS)

    dataloader_params = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': 8
    }  #we've already shuffled paths

    dataset_train = DataLoader(dataset_train, **dataloader_params)
    dataset_test = DataLoader(dataset_test, **dataloader_params)

    batch = next(iter(dataset_test))
    action_shape = batch['action'][0].shape
    img_shape = batch['img'][0].shape
    #Nets
    net = DDPGActor(img_shape=img_shape,
                    numeric_shape=[len(NUMERIC_FEATURES)],
                    output_shape=[2],
                    linear_hidden=linear_hidden,
                    conv_filters=conv_hidden)
    # net = DDPGCritic(actor_out_shape=action_shape, img_shape=img_shape, numeric_shape=[len(NUMERIC_FEATURES)],
    #                         linear_hidden=linear_hidden, conv_filters=conv_hidden)

    print(len(steps))
    print(net)
    print(get_n_params(net))
    # save path
    net_path = f'../data/models/imitation/{DATE_TIME}/{net.name}'
    os.makedirs(net_path, exist_ok=True)
    optim_steps = args.optim_steps
    logging_idx = int(len(dataset_train.dataset) / (batch_size * optim_steps))

    writer_train = SummaryWriter(f'{net_path}/train',
                                 max_queue=30,
                                 flush_secs=5)
    writer_test = SummaryWriter(f'{net_path}/test', max_queue=1, flush_secs=5)

    #Optimizers
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=0.001,
                                 weight_decay=0.0005)

    if args.scheduler == 'cos':
        scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                T_0=optim_steps,
                                                T_mult=2)
    elif args.scheduler == 'one_cycle':
        scheduler = OneCycleLR(optimizer,
                               max_lr=0.001,
                               epochs=no_epochs,
                               steps_per_epoch=optim_steps)
    else:
        raise ValueError(f'Unknown scheduler: {args.scheduler}')

    #Loss function
    loss_function = torch.nn.MSELoss(reduction='sum')
    test_loss_function = torch.nn.MSELoss(reduction='sum')

    best_train_loss = 1e10
    best_test_loss = 1e10

    for epoch_idx in range(no_epochs):
        train_loss = .0
        running_loss = .0
        # critic_running_loss = .0
        avg_max_grad = 0.
        avg_avg_grad = 0.
        for idx, batch in enumerate(iter(dataset_train)):
            global_step = int((len(dataset_train.dataset) / batch_size *
                               epoch_idx) + idx)
            batch = unpack_batch(batch=batch, device=device)
            loss, grad = train(input=batch,
                               label=batch['action'],
                               net=net,
                               optimizer=optimizer,
                               loss_fn=loss_function)
            # loss, grad = train(input=batch, label=batch['q'], net=net, optimizer=optimizer, loss_fn=loss_function)

            avg_max_grad += max([element.max() for element in grad])
            avg_avg_grad += sum([element.mean()
                                 for element in grad]) / len(grad)

            running_loss += loss
            train_loss += loss

            writer_train.add_scalar(tag=f'{net.name}/running_loss',
                                    scalar_value=loss / batch_size,
                                    global_step=global_step)
            writer_train.add_scalar(tag=f'{net.name}/max_grad',
                                    scalar_value=avg_max_grad,
                                    global_step=global_step)
            writer_train.add_scalar(tag=f'{net.name}/mean_grad',
                                    scalar_value=avg_avg_grad,
                                    global_step=global_step)

            if idx % logging_idx == logging_idx - 1:
                print(
                    f'Actor Epoch: {epoch_idx + 1}, Batch: {idx+1}, Loss: {running_loss/logging_idx}, Lr: {scheduler.get_last_lr()[0]}'
                )
                if (running_loss / logging_idx) < best_train_loss:
                    best_train_loss = running_loss / logging_idx
                    torch.save(net.state_dict(), f'{net_path}/train/train.pt')

                writer_train.add_scalar(
                    tag=f'{net.name}/lr',
                    scalar_value=scheduler.get_last_lr()[0],
                    global_step=global_step)
                running_loss = 0.0
                avg_max_grad = 0.
                avg_avg_grad = 0.
                scheduler.step()

        print(
            f'{net.name} best train loss for epoch {epoch_idx+1} - {best_train_loss}'
        )
        writer_train.add_scalar(tag=f'{net.name}/global_loss',
                                scalar_value=train_loss /
                                len(dataset_train.dataset),
                                global_step=(epoch_idx + 1))
        test_loss = .0
        with torch.no_grad():
            for idx, batch in enumerate(iter(dataset_test)):
                batch = unpack_batch(batch=batch, device=device)
                pred = net(**batch)
                loss = test_loss_function(pred, batch['action'])
                # loss = test_loss_function(pred.view(-1), batch['q'])

                test_loss += loss

        if (test_loss / len(dataset_test.dataset)) < best_test_loss:
            best_test_loss = (test_loss / len(dataset_test.dataset))

        torch.save(net.state_dict(), f'{net_path}/test/test_{epoch_idx+1}.pt')

        print(f'{net.name} test loss {(test_loss/len(dataset_test.dataset)):.3f}')
        print(f'{net.name} best test loss {best_test_loss:.3f}')
        writer_test.add_scalar(tag=f'{net.name}/global_loss',
                               scalar_value=(test_loss /
                                             len(dataset_test.dataset)),
                               global_step=(epoch_idx + 1))

    torch.save(optimizer.state_dict(),
               f=f'{net_path}/{optimizer.__class__.__name__}.pt')
    torch.save(scheduler.state_dict(),
               f=f'{net_path}/{scheduler.__class__.__name__}.pt')
    with open(f'{net_path}/args.json', 'w') as fp:
        json.dump(vars(args), fp=fp, sort_keys=True, indent=4)

    writer_train.flush()
    writer_test.flush()
    writer_train.close()
    writer_test.close()

    batch = next(iter(dataset_test))
    batch = unpack_batch(batch=batch, device=device)
    y = net(**batch)
    g = make_dot(y, params=dict(net.named_parameters()))
    g.save(filename=f'{DATE_TIME}_{net.name}.dot', directory=net_path)
    check_call([
        'dot', '-Tpng', '-Gdpi=200', f'{net_path}/{DATE_TIME}_{net.name}.dot',
        '-o', f'{net_path}/{DATE_TIME}_{net.name}.png'
    ])
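
A hedged sketch of the argument parser this script appears to expect: parse_args() is not shown above, so the flag names, types, and defaults below are assumptions inferred only from the args.* attributes used in main().

import argparse

def parse_args():
    # Hypothetical parser reconstructed from the attributes used above;
    # every default value here is an illustrative assumption.
    parser = argparse.ArgumentParser()
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--batch', type=int, default=32)
    parser.add_argument('--linear', type=int, nargs='+', default=[256, 256])
    parser.add_argument('--conv', type=int, nargs='+', default=[32, 64])
    parser.add_argument('--optim_steps', type=int, default=10)
    parser.add_argument('--scheduler', choices=['cos', 'one_cycle'], default='cos')
    return parser.parse_args()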
Example 2
def main(opt):
    train_data, valid_data = get_train_valid_split_data_names(opt.img_folder, opt.ano_folder, valid_size=1/8)

    # Load the data
    print("load data")
    train_dataset = Phase1Dataset(train_data, load_size=(640, 640), augment=True, limit=opt.limit)
    print("train data length : %d" % (len(train_dataset)))
    valid_dataset = Phase1Dataset(valid_data, load_size=(640, 640), augment=False, limit=opt.limit)
    print("valid data length : %d" % (len(valid_dataset)))
    # Create the DataLoaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    # Device setup (in PyTorch the device has to be specified explicitly)
    device = torch.device('cuda' if opt.gpus > 0 else 'cpu')

    # Build the model
    heads = {'hm': 1}
    model = get_pose_net(18, heads, 256).to(device)

    # Define the optimizer
    if opt.optimizer == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr)  #, momentum=m, dampening=d, weight_decay=w, nesterov=n)
    elif opt.optimizer == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    elif opt.optimizer == "RAdam":
        optimizer = optim.RAdam(model.parameters(), lr=opt.lr)

    # Define the loss function
    criterion = HMLoss()
    # Define the learning-rate schedule
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=0.00001)

    start_epoch = 0
    best_validation_loss = 1e10

    # Optionally resume from a checkpoint (the optimizer must already be defined here)
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer)

    # Create the output directory for saved models and visualizations
    os.makedirs(os.path.join(opt.save_dir, opt.task, 'visualized'), exist_ok=True)

    # Training loop. TODO: evaluate on the validation data and save the model at the end of every epoch
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        print("learning rate : %f" % scheduler.get_last_lr()[0])
        train(train_loader, model, optimizer, criterion, device, opt.num_epochs, epoch)
        if opt.optimizer == "SGD":
            scheduler.step()

        # Save the latest model
        save_model(os.path.join(opt.save_dir, opt.task, 'model_last.pth'),
                   epoch, model, optimizer, scheduler)

        # Evaluate on the validation data
        validation_loss, accumulate_datas = valid(valid_loader, model, criterion, device)
        # Save the model when the best validation loss improves
        if validation_loss < best_validation_loss:
            best_validation_loss = validation_loss
            save_model(os.path.join(opt.save_dir, opt.task, 'model_best.pth'),
                       epoch, model, optimizer, scheduler)
            print("saved best model")
            visualization(os.path.join(opt.save_dir, opt.task, 'visualized'),
                        accumulate_datas)
Example 3
def main():
    global args, best_performance

    set_seed(args.rand_seed)

    if args.model == 'FCNet':
        # dataloader
        train_loader, valid_loader, test_loader = get_FCNet_train_valid_test_loader(
            root=args.data_root,
            target=args.target,
            max_Miller=args.max_Miller,
            diffraction=args.diffraction,
            cell_type=args.cell_type,
            permute_hkl=args.fcnet_permute_hkl,
            randomize_hkl=args.fcnet_randomize_hkl,
            batch_size=args.batch_size,
            num_data_workers=args.num_data_workers)
        # construct model
        model = FCNet(max_Miller=args.max_Miller,
                      fc_dims=args.fcnet_fc_dims,
                      dropout=args.dropout)
    elif args.model == 'PointNet':
        # dataloader
        train_loader, valid_loader, test_loader = get_PointNet_train_valid_test_loader(
            root=args.data_root,
            target=args.target,
            max_Miller=args.max_Miller,
            diffraction=args.diffraction,
            cell_type=args.cell_type,
            randomly_scale_intensity=args.pointnet_randomly_scale_intensity,
            systematic_absence=args.pointnet_systematic_absence,
            batch_size=args.batch_size,
            num_data_workers=args.num_data_workers)
        # construct model
        model = PointNet(conv_filters=args.pointnet_conv_filters,
                         fc_dims=args.pointnet_fc_dims,
                         dropout=args.dropout)
    else:
        raise NotImplementedError

    # send model to device
    if torch.cuda.is_available():
        print('running on GPU:\n')
    else:
        print('running on CPU\n')
    model = model.to(args.device)

    # show number of trainable model parameters
    trainable_params = sum(p.numel() for p in model.parameters()
                           if p.requires_grad)
    print(
        'Number of trainable model parameters: {:d}'.format(trainable_params))

    # define loss function
    criterion = torch.nn.NLLLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # HDFS
    if args.hdfs_dir is not None:
        os.system(f'hdfs dfs -mkdir -p {args.hdfs_dir}')

    # optionally resume from a checkpoint
    if args.restore_path != '':
        assert os.path.isfile(args.restore_path)
        print("=> loading checkpoint '{}'".format(args.restore_path),
              flush=True)
        checkpoint = torch.load(args.restore_path,
                                map_location=torch.device('cpu'))
        args.start_epoch = checkpoint['epoch'] + 1
        best_performance = checkpoint['best_performance']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.restore_path, checkpoint['epoch']),
              flush=True)

    # learning-rate scheduler
    scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer,
                                            T_0=args.epochs,
                                            eta_min=1E-8)

    print('\nStart training..', flush=True)
    for epoch in range(args.start_epoch, args.start_epoch + args.epochs):
        lr = scheduler.get_last_lr()
        logging.info('Epoch: {}, LR: {:.6f}'.format(epoch, lr[0]))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        performance = validate(valid_loader, model, criterion)

        scheduler.step()

        # check performance
        is_best = performance > best_performance
        best_performance = max(performance, best_performance)

        # save checkpoint
        save_checkpoint(
            {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_performance': best_performance,
                'optimizer': optimizer.state_dict(),
            }, is_best, args)

    # test best model
    print('---------Evaluate Model on Test Set---------------', flush=True)
    best_model = load_best_model()
    print('best validation performance: {:.3f}'.format(
        best_model['best_performance']))
    model.load_state_dict(best_model['state_dict'])
    validate(test_loader, model, criterion, test_mode=True)
Example 4
while True:
    # for j in range(50):
    for batch, (inp, target) in enumerate(dl):
        inp, target = inp.to(device), target.to(device)
        opt.zero_grad()

        out = model(inp)

        loss = F.cross_entropy(out, target)
        loss.backward()

        opt.step()
        sched.step()

        acc = accuracy(F.softmax(out, dim=1), target)[0].item()
        loss_i = loss.item()

        print(
            f'{epoch:2}/{batch:3} Loss: {loss_i:.7f} Accuracy: {acc:.7f} LR: {sched.get_last_lr()[0]:.7f}'
        )
        wandb.log(
            {
                'loss': loss_i,
                'accuracy': acc,
                'lr': sched.get_last_lr()[0]
            },
            step=global_step)

        global_step += 1
    epoch += 1
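
The loop above assumes its setup (data loader, model, optimizer, scheduler, counters, the accuracy helper, and a wandb run) was created earlier. A hedged sketch of that missing setup follows; the dataset, model, and hyperparameter choices are illustrative assumptions, not the original configuration.

import torch
import torch.nn.functional as F
import torchvision
import wandb
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Placeholder dataset and model; the original snippet does not show them.
ds = torchvision.datasets.CIFAR10('data', train=True, download=True,
                                  transform=torchvision.transforms.ToTensor())
dl = DataLoader(ds, batch_size=128, shuffle=True, num_workers=4)
model = torchvision.models.resnet18(num_classes=10).to(device)

opt = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
sched = CosineAnnealingWarmRestarts(opt, T_0=len(dl), T_mult=2)

wandb.init(project='example')  # assumed project name
epoch, global_step = 0, 0

def accuracy(output, target, topk=(1,)):
    # Minimal top-k accuracy helper matching how the loop indexes the result.
    with torch.no_grad():
        maxk = max(topk)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        return [correct[:k].reshape(-1).float().mean() * 100.0 for k in topk]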
Example 5
def main(args):

    args = parse_args()
    tag = args.tag
    device = torch.device('cuda:0')

    no_epochs = args.epochs
    batch_size = args.batch

    linear_hidden = args.linear
    conv_hidden = args.conv

    # Get train/test paths -> later on implement cross-validation
    steps = get_paths(as_tuples=True, shuffle=True, tag=tag)
    split_idx = int(len(steps) * .8)
    steps_train, steps_test = steps[:split_idx], steps[split_idx:]

    transform = transforms.Compose([
        DepthSegmentationPreprocess(no_data_points=args.no_data),
        ToSupervised()
    ])

    dataset_train = SimpleDataset(ids=steps_train,
                                  batch_size=batch_size,
                                  transform=transform,
                                  **SENSORS)
    dataset_test = SimpleDataset(ids=steps_test,
                                 batch_size=batch_size,
                                 transform=transform,
                                 **SENSORS)

    dataloader_params = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': 8
    }  #we've already shuffled paths

    dataset_train = DataLoader(dataset_train, **dataloader_params)
    dataset_test = DataLoader(dataset_test, **dataloader_params)

    batch = next(iter(dataset_test))
    action_shape = batch['action'][0].shape
    img_shape = batch['img'][0].shape
    #Nets
    actor_net = DDPGActor(img_shape=img_shape,
                          numeric_shape=[len(NUMERIC_FEATURES)],
                          output_shape=[2],
                          linear_hidden=linear_hidden,
                          conv_filters=conv_hidden)
    critic_net = DDPGCritic(actor_out_shape=action_shape,
                            img_shape=img_shape,
                            numeric_shape=[len(NUMERIC_FEATURES)],
                            linear_hidden=linear_hidden,
                            conv_filters=conv_hidden)

    print(len(steps))
    print(actor_net)
    print(get_n_params(actor_net))
    print(critic_net)
    print(get_n_params(critic_net))
    # save path
    actor_net_path = f'../data/models/offline/{DATE_TIME}/{actor_net.name}'
    critic_net_path = f'../data/models/offline/{DATE_TIME}/{critic_net.name}'
    os.makedirs(actor_net_path, exist_ok=True)
    os.makedirs(critic_net_path, exist_ok=True)
    optim_steps = args.optim_steps
    logging_idx = int(len(dataset_train.dataset) / (batch_size * optim_steps))

    actor_writer_train = SummaryWriter(f'{actor_net_path}/train',
                                       max_queue=30,
                                       flush_secs=5)
    critic_writer_train = SummaryWriter(f'{critic_net_path}/train',
                                        max_queue=1,
                                        flush_secs=5)
    actor_writer_test = SummaryWriter(f'{actor_net_path}/test',
                                      max_queue=30,
                                      flush_secs=5)
    critic_writer_test = SummaryWriter(f'{critic_net_path}/test',
                                       max_queue=1,
                                       flush_secs=5)

    #Optimizers
    actor_optimizer = torch.optim.Adam(actor_net.parameters(), lr=0.001)
    critic_optimizer = torch.optim.Adam(critic_net.parameters(), lr=0.001)

    actor_scheduler = CosineAnnealingWarmRestarts(actor_optimizer,
                                                  T_0=optim_steps,
                                                  T_mult=2)
    critic_scheduler = CosineAnnealingWarmRestarts(critic_optimizer,
                                                   T_0=optim_steps,
                                                   T_mult=2)
    #Loss function
    loss_function = torch.nn.MSELoss(reduction='sum')

    actor_best_train_loss = 1e10
    critic_best_train_loss = 1e10
    actor_best_test_loss = 1e10
    critic_best_test_loss = 1e10

    for epoch_idx in range(no_epochs):
        actor_train_loss = .0
        critic_train_loss = .0
        actor_running_loss = .0
        critic_running_loss = .0
        actor_avg_max_grad = .0
        critic_avg_max_grad = .0
        actor_avg_avg_grad = .0
        critic_avg_avg_grad = .0
        for idx, batch in enumerate(iter(dataset_train)):
            global_step = int((len(dataset_train.dataset) / batch_size *
                               epoch_idx) + idx)
            batch = unpack_batch(batch=batch, device=device)
            actor_loss, critic_loss, actor_grad, critic_grad = train_rl(
                batch=batch,
                actor_net=actor_net,
                critic_net=critic_net,
                actor_optimizer=actor_optimizer,
                critic_optimizer=critic_optimizer,
                loss_fn=loss_function)
            del batch
            gc.collect()

            actor_avg_max_grad += max(
                [element.max() for element in actor_grad])
            critic_avg_max_grad += max(
                [element.max() for element in critic_grad])
            actor_avg_avg_grad += sum(
                [element.mean() for element in actor_grad]) / len(actor_grad)
            critic_avg_avg_grad += sum(
                [element.mean() for element in critic_grad]) / len(critic_grad)

            actor_running_loss += actor_loss
            critic_train_loss += critic_loss
            actor_train_loss += actor_loss
            critic_running_loss += critic_loss

            actor_writer_train.add_scalar(tag=f'{actor_net.name}/running_loss',
                                          scalar_value=actor_loss / batch_size,
                                          global_step=global_step)
            actor_writer_train.add_scalar(tag=f'{actor_net.name}/max_grad',
                                          scalar_value=actor_avg_max_grad,
                                          global_step=global_step)
            actor_writer_train.add_scalar(tag=f'{actor_net.name}/mean_grad',
                                          scalar_value=actor_avg_avg_grad,
                                          global_step=global_step)

            critic_writer_train.add_scalar(
                tag=f'{critic_net.name}/running_loss',
                scalar_value=critic_loss / batch_size,
                global_step=global_step)
            critic_writer_train.add_scalar(tag=f'{critic_net.name}/max_grad',
                                           scalar_value=critic_avg_max_grad,
                                           global_step=global_step)
            critic_writer_train.add_scalar(tag=f'{critic_net.name}/mean_grad',
                                           scalar_value=critic_avg_avg_grad,
                                           global_step=global_step)

            if idx % logging_idx == logging_idx - 1:
                print(
                    f'Actor Epoch: {epoch_idx + 1}, Batch: {idx+1}, Loss: {actor_running_loss/logging_idx}'
                )
                print(
                    f'Critic Epoch: {epoch_idx + 1}, Batch: {idx+1}, Loss: {critic_running_loss/logging_idx}'
                )
                if (critic_running_loss /
                        logging_idx) < critic_best_train_loss:
                    critic_best_train_loss = critic_running_loss / logging_idx
                    torch.save(actor_net.state_dict(),
                               f'{actor_net_path}/train/train.pt')
                    torch.save(critic_net.state_dict(),
                               f'{critic_net_path}/train/train.pt')

                actor_writer_train.add_scalar(
                    tag=f'{actor_net.name}/lr',
                    scalar_value=actor_scheduler.get_last_lr()[0],
                    global_step=global_step)
                critic_writer_train.add_scalar(
                    tag=f'{critic_net.name}/lr',
                    scalar_value=critic_scheduler.get_last_lr()[0],
                    global_step=global_step)

                actor_scheduler.step()
                critic_scheduler.step()
                actor_running_loss = .0
                actor_avg_max_grad = .0
                actor_avg_avg_grad = .0
                critic_running_loss = .0
                critic_avg_max_grad = .0
                critic_avg_avg_grad = .0

        print(
            f'{critic_net.name} best train loss for epoch {epoch_idx+1} - {critic_best_train_loss}'
        )
        actor_writer_train.add_scalar(
            tag=f'{actor_net.name}/global_loss',
            scalar_value=(actor_train_loss / (len(dataset_train.dataset))),
            global_step=(epoch_idx + 1))
        critic_writer_train.add_scalar(
            tag=f'{critic_net.name}/global_loss',
            scalar_value=(critic_train_loss / (len(dataset_train.dataset))),
            global_step=(epoch_idx + 1))
        actor_test_loss = .0
        critic_test_loss = .0
        with torch.no_grad():
            for idx, batch in enumerate(iter(dataset_test)):
                batch = unpack_batch(batch=batch, device=device)
                q_pred = critic_net(**batch)
                action_pred = actor_net(**batch)
                critic_loss = loss_function(q_pred.view(-1),
                                            batch['q']).abs().sum()
                actor_loss = loss_function(action_pred,
                                           batch['action']).abs().sum()

                critic_test_loss += critic_loss
                actor_test_loss += actor_loss

        if critic_test_loss / len(
                dataset_test.dataset) < critic_best_test_loss:
            critic_best_test_loss = (critic_test_loss /
                                     len(dataset_test.dataset))
        if actor_test_loss / len(dataset_test.dataset) < actor_best_test_loss:
            actor_best_test_loss = (actor_test_loss /
                                    len(dataset_test.dataset))

        torch.save(critic_net.state_dict(),
                   f'{critic_net_path}/test/test_{epoch_idx+1}.pt')
        torch.save(actor_net.state_dict(),
                   f'{actor_net_path}/test/test_{epoch_idx+1}.pt')

        print(
            f'{critic_net.name} test loss {(critic_test_loss/len(dataset_test.dataset)):.3f}'
        )
        print(
            f'{actor_net.name} test loss {(actor_test_loss/len(dataset_test.dataset)):.3f}'
        )
        print(f'{critic_net.name} best test loss {critic_best_test_loss:.3f}')
        print(f'{actor_net.name} best test loss {actor_best_test_loss:.3f}')

        critic_writer_test.add_scalar(
            tag=f'{critic_net.name}/global_loss',
            scalar_value=(critic_test_loss / (len(dataset_test.dataset))),
            global_step=(epoch_idx + 1))
        actor_writer_test.add_scalar(
            tag=f'{actor_net.name}/global_loss',
            scalar_value=(actor_test_loss / (len(dataset_test.dataset))),
            global_step=(epoch_idx + 1))
        torch.cuda.empty_cache()
        gc.collect()

    torch.save(actor_optimizer.state_dict(),
               f=f'{actor_net_path}/{actor_optimizer.__class__.__name__}.pt')
    torch.save(critic_optimizer.state_dict(),
               f=f'{critic_net_path}/{critic_optimizer.__class__.__name__}.pt')
    with open(f'{actor_net_path}/args.json', 'w') as fp:
        json.dump(vars(args), fp=fp, sort_keys=True, indent=4)
    with open(f'{critic_net_path}/args.json', 'w') as fp:
        json.dump(vars(args), fp=fp, sort_keys=True, indent=4)

    actor_writer_train.flush()
    actor_writer_test.flush()
    actor_writer_train.close()
    actor_writer_test.close()
    critic_writer_train.flush()
    critic_writer_test.flush()
    critic_writer_train.close()
    critic_writer_test.close()

    batch = next(iter(dataset_test))
    batch = unpack_batch(batch=batch, device=device)

    #Actor architecture save
    y = actor_net(**batch)
    g = make_dot(y, params=dict(actor_net.named_parameters()))
    g.save(filename=f'{DATE_TIME}_{actor_net.name}.dot',
           directory=actor_net_path)
    #Critic architecture save
    y = critic_net(**batch)
    g = make_dot(y, params=dict(critic_net.named_parameters()))
    g.save(filename=f'{DATE_TIME}_{critic_net.name}.dot',
           directory=critic_net_path)

    check_call([
        'dot', '-Tpng', '-Gdpi=200',
        f'{critic_net_path}/{DATE_TIME}_{critic_net.name}.dot', '-o',
        f'{critic_net_path}/{DATE_TIME}_{critic_net.name}.png'
    ])
    check_call([
        'dot', '-Tpng', '-Gdpi=200',
        f'{actor_net_path}/{DATE_TIME}_{actor_net.name}.dot', '-o',
        f'{actor_net_path}/{DATE_TIME}_{actor_net.name}.png'
    ])
Example 6
def run_training(data_type="screw",
                 model_dir="models",
                 epochs=256,
                 pretrained=True,
                 test_epochs=10,
                 freeze_resnet=20,
                 learninig_rate=0.03,
                 optim_name="SGD",
                 batch_size=64,
                 head_layer=8):
    torch.multiprocessing.freeze_support()
    # TODO: use script params for hyperparameter
    # Temperature Hyperparameter currently not used
    temperature = 0.2
    device = "cuda"

    weight_decay = 0.00003
    momentum = 0.9
    #TODO: use f strings also for the date LOL
    model_name = f"model-{data_type}" + '-{date:%Y-%m-%d_%H_%M_%S}'.format(
        date=datetime.datetime.now())

    #augmentation:
    size = 256
    min_scale = 0.5

    # create Training Dataset and Dataloader
    after_cutpaste_transform = transforms.Compose([])
    after_cutpaste_transform.transforms.append(transforms.ToTensor())
    after_cutpaste_transform.transforms.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]))

    train_transform = transforms.Compose([])
    # train_transform.transforms.append(transforms.RandomResizedCrop(size, scale=(min_scale,1)))
    # train_transform.transforms.append(transforms.GaussianBlur(int(size/10), sigma=(0.1,2.0)))
    train_transform.transforms.append(transforms.Resize((256, 256)))
    train_transform.transforms.append(
        CutPaste(transform=after_cutpaste_transform))
    # train_transform.transforms.append(transforms.ToTensor())

    train_data = MVTecAT("Data",
                         data_type,
                         transform=train_transform,
                         size=int(size * (1 / min_scale)))
    dataloader = DataLoader(train_data,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=8,
                            collate_fn=cut_paste_collate_fn,
                            persistent_workers=True,
                            pin_memory=True,
                            prefetch_factor=5)

    # Writer will output to ./runs/ directory by default
    writer = SummaryWriter(Path("logdirs") / model_name)

    # create Model:
    head_layers = [512] * head_layer + [128]
    print(head_layers)
    model = ProjectionNet(pretrained=pretrained, head_layers=head_layers)
    model.to(device)

    if freeze_resnet > 0:
        model.freeze_resnet()

    loss_fn = torch.nn.CrossEntropyLoss()
    if optim_name == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learninig_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        scheduler = CosineAnnealingWarmRestarts(optimizer, epochs)
        #scheduler = None
    elif optim_name == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learninig_rate,
                               weight_decay=weight_decay)
        scheduler = None
    else:
        print(f"ERROR unkown optimizer: {optim_name}")

    step = 0
    import torch.autograd.profiler as profiler
    num_batches = len(dataloader)

    def get_data_inf():
        while True:
            for out in enumerate(dataloader):
                yield out

    dataloader_inf = get_data_inf()
    # From paper: "Note that, unlike conventional definition for an epoch,
    #              we define 256 parameter update steps as one epoch.
    for step in tqdm(range(epochs * 256)):
        epoch = int(step / 256)
        if epoch == freeze_resnet:
            model.unfreeze()

        batch_embeds = []
        batch_idx, data = next(dataloader_inf)
        x1, x2 = data
        x1 = x1.to(device)
        x2 = x2.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        xc = torch.cat((x1, x2), axis=0)
        embeds, logits = model(xc)

        #         embeds = F.normalize(embeds, p=2, dim=1)
        #         embeds1, embeds2 = torch.split(embeds,x1.size(0),dim=0)
        #         ip = torch.matmul(embeds1, embeds2.T)
        #         ip = ip / temperature

        #         y = torch.arange(0,x1.size(0), device=device)
        #         loss = loss_fn(ip, torch.arange(0,x1.size(0), device=device))

        y = torch.tensor([0, 1], device=device)
        y = y.repeat_interleave(x1.size(0))
        loss = loss_fn(logits, y)

        # regulize weights:
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step(epoch + batch_idx / num_batches)

        writer.add_scalar('loss', loss.item(), step)

        #         predicted = torch.argmax(ip,axis=0)
        predicted = torch.argmax(logits, axis=1)
        #         print(logits)
        #         print(predicted)
        #         print(y)
        accuracy = torch.true_divide(torch.sum(predicted == y),
                                     predicted.size(0))
        writer.add_scalar('acc', accuracy, step)
        if scheduler is not None:
            writer.add_scalar('lr', scheduler.get_last_lr()[0], step)

        # save embed for validation:
        if test_epochs > 0 and epoch % test_epochs == 0:
            batch_embeds.append(embeds.cpu().detach())

        writer.add_scalar('epoch', epoch, step)

        # run tests
        if test_epochs > 0 and epoch % test_epochs == 0:
            # run auc calculation
            #TODO: create dataset only once.
            #TODO: train predictor here or in the model class itself. Should not be in the eval part
            #TODO: we might not want to use the training datat because of droupout etc. but it should give a indecation of the model performance???
            # batch_embeds = torch.cat(batch_embeds)
            # print(batch_embeds.shape)
            model.eval()
            roc_auc = eval_model(model_name,
                                 data_type,
                                 device=device,
                                 save_plots=False,
                                 size=size,
                                 show_training_data=False,
                                 model=model)
            #train_embed=batch_embeds)
            model.train()
            writer.add_scalar('eval_auc', roc_auc, step)

    torch.save(model.state_dict(), Path(model_dir) / f"{model_name}.tch")
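
A hedged invocation sketch for run_training above: "screw" is one MVTec AD category and the remaining values simply mirror the defaults in the signature, so this only illustrates the call shape.

# Illustrative call only; adjust the category and hyperparameters as needed.
run_training(data_type="screw",
             model_dir="models",
             epochs=256,
             optim_name="sgd",
             batch_size=64,
             head_layer=8)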
class WarmRestartsCustomScheduler(_LRScheduler):
    """Custom Learning Rate Scheduler based on the 3rd Place Solution.

    This is for setting the learning rate schedule:
    Warm Restarts for epochs (1-28)
    LR=1e-5 (29-32), LR=5e-6 (33-35)

    The general version looks like this:
    # from:
    # https://github.com/naivelamb/kaggle-cloud-organization/blob/master/main_seg.py
    if epoch < start_epoch + n_epochs - 1:
        if epoch != 0:
            scheduler.step()
            scheduler=warm_restart(scheduler, T_mult=2)
    elif (epoch < start_epoch + n_epochs + 2 and
          epoch >= start_epoch + n_epochs - 1):
        optimizer.param_groups[0]['lr'] = 1e-5
    else:
        optimizer.param_groups[0]['lr'] = 5e-6

    """
    def __init__(self,
                 optimizer,
                 T_0,
                 T_mult=2,
                 eta_min=0,
                 num_wr_epochs=28,
                 mid_const_lr_epochs_range=[29, 32],
                 constant_lrs=[1e-5, 5e-6],
                 last_epoch=-1):
        """
        Args:
            optimizer (torch.optim.Optimizer):
            T_0:
            T_mult:
            eta_min:
            num_wr_epochs (int): The number of warm restart epochs to do
            mid_const_lr_epochs_range (list-like[int]): [min, max] where max
                is not included. This is the epoch interval where the first
                lr of constant_lr is used
            constant_lrs (list-like[float]): the learning rates to use for the
                mid and end intervals after warm restarts ends.
        """
        self.num_wr_epochs = num_wr_epochs
        assert len(mid_const_lr_epochs_range) == 2, \
            "`mid_const_lr_epochs_range` must be a list-like with length 2."
        self.mid_const_lr_epochs_range = mid_const_lr_epochs_range
        assert len(constant_lrs) == 2, \
            "`constant_lrs` must be a list-like with length 2."
        self.constant_lrs = constant_lrs

        self.optimizer = optimizer
        self.warm_restarts = CosineAnnealingWarmRestarts(
            self.optimizer, T_0, T_mult, eta_min)
        super().__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        """No calculation done here.
        """
        return self.get_last_lr()

    def step(self, epoch=None):
        """Computes a step for the learning rate scheduler.

        Here, a step is an epoch. This is where the learning rates are set
        and the last_epoch counter is updated.
        """
        # warm restarts
        if self.last_epoch < self.num_wr_epochs + 1:
            self.warm_restarts.step()
            self.last_epoch = self.warm_restarts.last_epoch
            self._last_lr = self.warm_restarts.get_last_lr()
        # constant LR (first round)
        elif (self.last_epoch >= self.mid_const_lr_epochs_range[0]
              and self.last_epoch < self.mid_const_lr_epochs_range[1]):
            self.last_epoch += 1
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[0]
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
        # constant LR (second round)
        else:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[1]
            self.last_epoch += 1
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
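
A minimal usage sketch for WarmRestartsCustomScheduler (an illustration, not taken from the original source): the model, optimizer, and epoch count are placeholder assumptions, chosen only to show step() moving through the warm-restart phase and then the two constant-LR phases.

import torch

model = torch.nn.Linear(10, 1)  # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scheduler = WarmRestartsCustomScheduler(optimizer,
                                        T_0=4,
                                        T_mult=2,
                                        eta_min=0,
                                        num_wr_epochs=28,
                                        mid_const_lr_epochs_range=[29, 32],
                                        constant_lrs=[1e-5, 5e-6])

for epoch in range(35):
    # ... one training epoch would run here ...
    scheduler.step()  # one step per epoch, as documented in the class
    print(f'epoch {epoch + 1}: lr={scheduler.get_last_lr()[0]:.2e}')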