Example #1
def main(cfg: DictConfig):
    print('Cassava Leaf Disease Classification')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Config  -------------------------------------------------------------------
    data_dir = './input'
    seed_everything(cfg.data.seed)

    # Comet_ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False)

    # Log Parameters
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))

    # Data Module  ---------------------------------------------------------------
    transform = get_transforms(transform_name=cfg.data.transform,
                               img_size=cfg.data.img_size)
    cv = StratifiedKFold(n_splits=cfg.data.n_splits,
                         shuffle=True,
                         random_state=cfg.data.seed)
    dm = CassavaDataModule(data_dir,
                           cfg,
                           transform,
                           cv,
                           use_merge=True,
                           sample=DEBUG)

    # Model  ----------------------------------------------------------------------
    net = Timm_model(cfg.train.model_type, pretrained=True)

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Loss fn  ---------------------------------------------------------------------
    df = pd.read_csv('./input/merged.csv')
    weight = df['label'].value_counts().sort_index().tolist()
    weight = [w / len(df) for w in weight]
    weight = torch.tensor(weight).cuda()
    del df

    criterion = get_loss_fn(cfg.train.loss_fn, weight=weight, smoothing=0.05)

    # Optimizer, Scheduler  --------------------------------------------------------
    if cfg.train.use_sam:
        base_optimizer = RAdam
        optimizer = SAM(net.parameters(),
                        base_optimizer,
                        lr=cfg.train.lr,
                        weight_decay=cfg.train.weight_decay)
    else:
        optimizer = RAdam(net.parameters(),
                          lr=cfg.train.lr,
                          weight_decay=cfg.train.weight_decay)

    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=cfg.train.epoch,
                                               eta_min=0)

    # Lightning Module  -------------------------------------------------------------
    model = CassavaLightningSystem(net,
                                   cfg,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   experiment=experiment)

    # Trainer  -------------------------------------------------------------------------
    trainer = Trainer(
        logger=False,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        amp_backend='apex',
        amp_level='O2',
        num_sanity_val_steps=0,  # Skip Sanity Check
        automatic_optimization=not cfg.train.use_sam,
        # resume_from_checkpoint='./checkpoints/epoch=3-step=14047.ckpt'
    )

    # Train
    trainer.fit(model, datamodule=dm)
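
When cfg.train.use_sam is true, the Trainer above turns off automatic optimization because SAM needs two forward/backward passes per update. A minimal sketch of that two-step update, using the net, criterion and SAM optimizer built above and assuming the widely used SAM implementation that exposes first_step/second_step (this is not taken from CassavaLightningSystem):

# Sketch only; `loader` stands for the training dataloader provided by the
# DataModule, and first_step/second_step follow the common public SAM package.
for imgs, labels in loader:
    imgs, labels = imgs.cuda(), labels.cuda()

    # 1st pass: compute gradients and move to the perturbed ("worst-case") weights
    criterion(net(imgs), labels).backward()
    optimizer.first_step(zero_grad=True)

    # 2nd pass: gradients at the perturbed point, then the actual parameter update
    criterion(net(imgs), labels).backward()
    optimizer.second_step(zero_grad=True)
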
Example #2
def train_runner(model: nn.Module,
                 model_name: str,
                 results_dir: str,
                 experiment: str = '',
                 debug: bool = False,
                 img_size: int = IMG_SIZE,
                 learning_rate: float = 1e-2,
                 fold: int = 0,
                 checkpoint: str = '',
                 epochs: int = 15,
                 batch_size: int = 8,
                 num_workers: int = 4,
                 from_epoch: int = 0,
                 save_oof: bool = False,
                 save_train_oof: bool = False,
                 gpu_number: int = 1):
    """
    Model training runner

    Args:
        model        : PyTorch model
        model_name   : string name of the model, used for checkpoint files
        results_dir  : directory to save results
        experiment   : string name for naming experiments
        debug        : if True, runs the debugging on a few images
        img_size     : size of images for training
        learning_rate: initial learning rate (default = 1e-2)
        fold         : training fold (default = 0)
        checkpoint   : path to a checkpoint to resume from ('' trains from scratch)
        epochs       : number of additional epochs to train
        batch_size   : number of images in a batch
        num_workers  : number of workers available
        from_epoch   : epoch number to continue training from
        save_oof     : saves oof validation predictions (default = False)
        save_train_oof: saves oof train predictions (default = False)
        gpu_number   : index of the GPU to use (default = 1)
    """
    device = torch.device(
        f'cuda:{gpu_number}' if torch.cuda.is_available() else 'cpu')
    print(device)

    # defaults, overwritten below when resuming from a checkpoint
    start_epoch = 0
    best_val_loss = 1e+5
    best_val_metric = 0

    # load model weights to continue training
    if checkpoint != '':
        model, ckpt = load_model(model, checkpoint)
        best_val_metric = ckpt['valid_miou']
        best_val_loss = ckpt['valid_loss']
        start_epoch = ckpt['epoch'] + 1
        print('Loaded model from {}, epoch {}'.format(checkpoint,
                                                      start_epoch - 1))
    model.to(device)

    # creates directories for checkpoints, tensorboard and predictions
    checkpoints_dir = f'{results_dir}/checkpoints/{model_name}'
    predictions_dir = f'{results_dir}/oof/{model_name}'
    tensorboard_dir = f'{results_dir}/tensorboard/{model_name}'
    validations_dir = f'{results_dir}/oof_val/{model_name}'
    os.makedirs(checkpoints_dir, exist_ok=True)
    os.makedirs(predictions_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(validations_dir, exist_ok=True)
    logger = Logger(tensorboard_dir)
    print('\n', model_name, '\n')
    model = model.to(device)

    # datasets for train and validation
    df = pd.read_csv(f'{TRAIN_DIR}folds.csv')
    df_train = df[df.fold != fold]
    df_val = df[df.fold == fold]
    print(len(df_train.ImageId.values), len(df_val.ImageId.values))

    train_dataset = SARDataset(
        sars_dir=TRAIN_SAR,
        masks_dir=TRAIN_MASKS,
        labels_df=df_train,
        img_size=img_size,
        transforms=TRANSFORMS["medium"],
        preprocess=True,
        normalise=True,
        debug=debug,
    )
    valid_dataset = SARDataset(
        sars_dir=TRAIN_SAR,
        masks_dir=TRAIN_MASKS,
        labels_df=df_val,
        img_size=img_size,
        transforms=TRANSFORMS["d4"],
        preprocess=True,
        normalise=True,
        debug=debug,
    )
    # dataloaders for train and validation
    dataloader_train = DataLoader(train_dataset,
                                  num_workers=num_workers,
                                  batch_size=batch_size,
                                  shuffle=True)

    dataloader_valid = DataLoader(valid_dataset,
                                  num_workers=num_workers,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  drop_last=True)
    print('{} training images, {} validation images'.format(
        len(train_dataset), len(valid_dataset)))

    # optimizers and schedulers
    # optimizer = AdamW(model.parameters(), lr=learning_rate)
    optimizer = RAdam(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='max',
                                               patience=3,
                                               verbose=True,
                                               factor=0.2,
                                               min_lr=1e-7)
    num_batches = len(train_dataset) // batch_size + 1
    scheduler_cos = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=num_batches,
                                                   eta_min=1e-6,
                                                   last_epoch=-1)
    # load optimizer state continue training
    #if checkpoint != '':
    #    optimizer = load_optim(optimizer, checkpoint, device)

    # criteria
    criterion1 = nn.BCEWithLogitsLoss()
    criterion = BCEJaccardLoss(bce_weight=2,
                               jaccard_weight=0.5,
                               log_loss=False,
                               log_sigmoid=True)
    #criterion = JaccardLoss(log_sigmoid=True, log_loss=False)

    # logging
    #if make_log:
    report_batch = 20
    report_epoch = 20
    log_file = os.path.join(checkpoints_dir, f'{experiment}fold_{fold}.log')
    logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG)
    logging.info(
        f'Parameters:\n model_name: {model_name}\n results_dir: {results_dir}\n experiment: {experiment}\n'
        f' img_size: {img_size}\n learning_rate: {learning_rate}\n fold: {fold}\n epochs: {epochs}\n'
        f' batch_size: {batch_size}\n num_workers: {num_workers}\n start_epoch: {start_epoch}\n'
        f' save_oof: {save_oof}\n optimizer: {optimizer}\n scheduler: {scheduler}\n checkpoint: {checkpoint}\n'
    )

    train_losses, val_losses = [], []
    # training cycle
    print("Start training")
    for epoch in range(start_epoch, start_epoch + epochs + 1):
        print("Epoch", epoch)
        epoch_losses = []
        progress_bar = tqdm(dataloader_train, total=len(dataloader_train))
        progress_bar.set_description('Epoch {}'.format(epoch))
        with torch.set_grad_enabled(True):  # gradients are on by default during training; made explicit here
            for batch_num, (img, target, _) in enumerate(progress_bar):
                img = img.to(device)
                target = target.float().to(device)
                prediction = model(img).to(device)

                loss = criterion(prediction, target)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
                optimizer.step()
                epoch_losses.append(loss.detach().cpu().numpy())

                scheduler_cos.step()
                if debug:
                    # get current learning rate
                    for param_group in optimizer.param_groups:
                        learning_rate = param_group['lr']
                    print(f'current learning_rate: {learning_rate}')

                if batch_num and batch_num % report_batch == 0:
                    for param_group in optimizer.param_groups:
                        learning_rate = param_group['lr']
                    logging.info(
                        f'epoch: {epoch}; step: {batch_num}; learning_rate: {learning_rate}; loss: {np.mean(epoch_losses)} \n'
                    )

        # log loss history
        print("Epoch {}, Train Loss: {}".format(epoch, np.mean(epoch_losses)))
        train_losses.append(np.mean(epoch_losses))
        logger.scalar_summary('loss_train', np.mean(epoch_losses), epoch)
        logging.info(
            f'epoch: {epoch}; step: {batch_num}; loss: {np.mean(epoch_losses)} \n'
        )

        # validate model
        val_loss = validate_loss(model, dataloader_valid, criterion1, epoch,
                                 validations_dir, device)

        valid_metrics = validate(model, dataloader_valid, criterion, epoch,
                                 validations_dir, save_oof, debug, device)
        # logging metrics
        logger.scalar_summary('loss_valid', valid_metrics['val_loss'], epoch)
        logger.scalar_summary('miou_valid', valid_metrics['miou'], epoch)
        valid_loss, val_metric = valid_metrics['val_loss'], valid_metrics[
            'miou']
        logging.info(
            f'epoch: {epoch}; val_bce: {val_loss}; val_loss: {valid_loss}; val_miou: {val_metric}\n'
        )
        val_losses.append(valid_metrics['val_loss'])

        # get current learning rate
        for param_group in optimizer.param_groups:
            learning_rate = param_group['lr']
        print(f'learning_rate: {learning_rate}')
        logging.info(f'learning_rate: {learning_rate}\n')
        scheduler.step(val_metric)

        # save the best metric
        if valid_metrics['miou'] > best_val_metric:
            best_val_metric = valid_metrics['miou']
            # save model, optimizer and losses after every epoch
            print(
                f"Saving model with the best val metric {valid_metrics['miou']}, epoch {epoch}"
            )
            checkpoint_filename = "{}_best_val_miou.pth".format(model_name)
            checkpoint_filepath = os.path.join(checkpoints_dir,
                                               checkpoint_filename)
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                    'loss': np.mean(epoch_losses),
                    'valid_loss': valid_metrics['val_loss'],
                    'valid_miou': valid_metrics['miou'],
                }, checkpoint_filepath)
        # save the best loss
        if valid_metrics['val_loss'] < best_val_loss:
            best_val_loss = valid_metrics['val_loss']
            # save model, optimizer and losses after every epoch
            print(
                f"Saving model with the best val loss {valid_metrics['val_loss']}, epoch {epoch}"
            )
            checkpoint_filename = "{}_best_val_loss.pth".format(model_name)
            checkpoint_filepath = os.path.join(checkpoints_dir,
                                               checkpoint_filename)
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                    'loss': np.mean(epoch_losses),
                    'valid_loss': valid_metrics['val_loss'],
                    'valid_miou': valid_metrics['miou'],
                }, checkpoint_filepath)
        # save model, optimizer and losses after every n epoch
        elif epoch % report_epoch == 0:
            print(
                f"Saving model at epoch {epoch}, val loss {valid_metrics['val_loss']}"
            )
            checkpoint_filename = "{}_epoch_{}.pth".format(model_name, epoch)
            checkpoint_filepath = os.path.join(checkpoints_dir,
                                               checkpoint_filename)
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                    'loss': np.mean(epoch_losses),
                    'valid_loss': valid_metrics['val_loss'],
                    'valid_miou': valid_metrics['miou'],
                }, checkpoint_filepath)
        # Early stopping
        if learning_rate <= 1e-7:
            print(
                f"Stop training, reached minimal LR: {learning_rate} at epoch {epoch}"
            )
            break
Example #3
    def __init__(self, T_max, eta_min=0):
        super().__init__(lambda opt: _scheduler.CosineAnnealingLR(
            opt, T_max, eta_min=eta_min))
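
The fragment above is evidently the __init__ of a small wrapper class whose base class stores a callable that builds the scheduler once an optimizer exists. A self-contained sketch of that assumed context (the class names are invented for illustration):

import torch.optim.lr_scheduler as _scheduler

class SchedulerFactory:
    """Assumed base class: holds a callable mapping an optimizer to a scheduler."""
    def __init__(self, scheduler_fn):
        self.scheduler_fn = scheduler_fn

    def __call__(self, optimizer):
        return self.scheduler_fn(optimizer)

class CosineAnnealing(SchedulerFactory):
    def __init__(self, T_max, eta_min=0):
        super().__init__(lambda opt: _scheduler.CosineAnnealingLR(
            opt, T_max, eta_min=eta_min))
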
Example #4
def train():

    ### Load Dataset
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),  # 3*H*W, [0, 1]
        normalize,
    ])

    dataset = PETADataLoader(
        listfile=
        '/home/duycuong/PycharmProjects/research_py3/tensorflow_slim/data/PETA/train_list_v2.txt',
        transform=transform)
    data_loader = data.DataLoader(dataset,
                                  BATCH_SIZE,
                                  num_workers=args.num_worker,
                                  shuffle=True)

    #print (dataset.get_classNum())
    ### Build Model
    net_ = dm.DeepMAR_res50(dataset.get_classNum())

    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        net_.load_weights(args.resume)

    if args.cuda:
        if len(gpu_ids) > 1:
            net_ = torch.nn.DataParallel(net_, device_ids=gpu_ids).cuda()
        else:
            #device = torch.device('cuda:1')
            #torch.cuda.set_device(gpu_ids[0])
            net_ = net_.cuda()
        cudnn.benchmark = True

    optimizer = optim.SGD(net_.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    criterion = F.binary_cross_entropy_with_logits  #https://discuss.pytorch.org/t/bceloss-vs-bcewithlogitsloss/33586/8
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.max_epoch)

    net_.train()

    # loss counters
    total_loss = 0
    epoch = 0
    print('Loading the dataset...', len(dataset))

    epoch_size = len(dataset) // BATCH_SIZE
    print('Using the specified args:')
    print(args)

    step_index = 0

    # create batch iterator
    batch_iterator = iter(data_loader)

    iter_counter = 0
    for iteration in range(0, args.max_iter):
        iter_counter += 1
        if iteration != 0 and (iteration % epoch_size == 0):
            # reset epoch loss counters
            writer.add_scalar('data/total_loss_epoch',
                              total_loss / len(dataset), epoch)
            total_loss = 0
            epoch += 1

            if epoch > args.max_epoch:
                break

        if iteration % epoch_size == 0:
            if epoch > args.warm_up:
                scheduler.step(epoch - args.warm_up)
            elif epoch == args.warm_up:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = args.lr
            else:
                lrr = 1e-4 + (args.lr - 1e-4) * iteration / (epoch_size *
                                                             args.warm_up)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lrr

        if iter_counter >= len(batch_iterator):
            batch_iterator = iter(data_loader)
            iter_counter = 0

        # load train data
        images, targets = next(batch_iterator)
        #print (images.shape)

        if args.cuda:
            images = Variable(images.cuda())
            with torch.no_grad():
                targets = Variable(targets.cuda())
                #targets = [Variable(ann.cuda()) for ann in targets]
        else:
            images = Variable(images)
            with torch.no_grad():
                targets = Variable(targets)
                #targets = [Variable(ann) for ann in targets]

        # forward
        t0 = time.time()
        out = net_(images)

        # backprop
        optimizer.zero_grad()
        loss_ = criterion(out, targets)
        loss_.backward()
        optimizer.step()
        t1 = time.time()

        # add log
        total_loss += loss_.item()
        #map_loss += 0#loss_map.item()

        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + '||epoch:' + repr(epoch) +
                  ' || Loss: %.4f ||' % (loss_.item()),
                  end=' ')
            writer.add_scalar('data/total_loss_iter', loss_.item(), iteration)

        ###  save
        if iteration != 0 and iteration % 1000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(
                net_.state_dict(),
                os.path.join(save_dir, 'models',
                             modelName + '_iter-' + repr(iteration) + '.pth'))
    torch.save(
        net_.state_dict(),
        os.path.join(save_dir, 'models',
                     modelName + '_iter-' + repr(iteration) + '.pth'))
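
The loop above implements the warm-up by writing param_group['lr'] directly before letting the cosine schedule take over. On a recent PyTorch (roughly 1.11+) the same behaviour can be composed from built-in schedulers; a minimal sketch with placeholder epoch counts:

from torch import nn, optim
from torch.optim import lr_scheduler

# Minimal sketch: linear warm-up followed by cosine annealing via
# LinearLR + SequentialLR instead of editing param_group['lr'] by hand.
model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

warm_up_epochs, max_epochs = 5, 100
warmup = lr_scheduler.LinearLR(optimizer, start_factor=1e-3,
                               total_iters=warm_up_epochs)
cosine = lr_scheduler.CosineAnnealingLR(optimizer,
                                        T_max=max_epochs - warm_up_epochs)
scheduler = lr_scheduler.SequentialLR(optimizer,
                                      schedulers=[warmup, cosine],
                                      milestones=[warm_up_epochs])

for epoch in range(max_epochs):
    optimizer.step()   # the real training steps would go here
    scheduler.step()   # advance the composed schedule once per epoch
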
Example #5
def train(args):
    device = torch.device(args.device)

    text_field = TextField()
    label_field = LabelField()
    train_dataset, valid_dataset, test_dataset = load_data(
        root='data', text_field=text_field, label_field=label_field)
    # Our model will be run in 'open-vocabulary' mode.
    text_field.build_vocab(train_dataset, valid_dataset, test_dataset)
    label_field.build_vocab(train_dataset)
    text_field.vocab.load_vectors(args.word_vector)

    train_loader, valid_loader, test_loader = data.Iterator.splits(
        datasets=(train_dataset, valid_dataset, test_dataset),
        batch_size=args.batch_size,
        device=device)

    config_path = os.path.join(args.save_dir, 'config.yml')
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    model = SSTModel(num_words=len(text_field.vocab),
                     num_classes=len(label_field.vocab),
                     **config['model'])
    model.word_embedding.weight.data.set_(text_field.vocab.vectors)
    model.word_embedding.weight.requires_grad = not args.fix_word_embeddings
    print(model)
    model.to(device)

    num_params = sum(p.numel() for p in model.parameters())
    num_intrinsic_params = num_params - model.word_embedding.weight.numel()
    logger.info(f'* # of params: {num_params}')
    logger.info(f'  - Intrinsic: {num_intrinsic_params}')
    logger.info(f'  - Word embedding: {num_params - num_intrinsic_params}')

    trainable_params = [p for p in model.parameters() if p.requires_grad]
    if args.optimizer == 'adam':
        optimizer = optim.Adam(trainable_params)
    elif args.optimizer == 'adadelta':
        optimizer = optim.Adadelta(trainable_params)
    else:
        raise ValueError('Unknown optimizer')
    assert not args.warm_restart or args.cosine_lr
    if args.cosine_lr:
        if not args.warm_restart:
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer=optimizer, T_max=len(train_loader) * args.max_epoch)
        else:
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer=optimizer, T_max=len(train_loader) * 2)
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                   mode='max',
                                                   factor=0.5,
                                                   patience=4,
                                                   verbose=True)
    criterion = nn.CrossEntropyLoss()

    def run_iter(batch):
        text, length = batch.text
        label = batch.label
        logit = model(inputs=text, length=length)
        clf_loss = criterion(input=logit, target=label)
        pred = logit.max(1)[1]
        accuracy = torch.eq(pred, label).float().mean()
        if model.training:
            if args.l2_weight > 0:
                l2_norm = sum(p.pow(2).sum() for p in trainable_params).sqrt()
            else:
                l2_norm = 0
            loss = clf_loss + args.l2_weight * l2_norm
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(trainable_params, max_norm=5)
            optimizer.step()
        return clf_loss.item(), accuracy.item()

    def validate(loader):
        model.eval()
        clf_loss_sum = accuracy_sum = 0
        num_valid_data = len(loader.dataset)
        with torch.no_grad():
            for valid_batch in loader:
                clf_loss, accuracy = run_iter(valid_batch)
                clf_loss_sum += clf_loss * valid_batch.batch_size
                accuracy_sum += accuracy * valid_batch.batch_size
        clf_loss = clf_loss_sum / num_valid_data
        accuracy = accuracy_sum / num_valid_data
        return clf_loss, accuracy

    train_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'train'))
    valid_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'valid'))

    validate_every = len(train_loader) // args.verbosity
    best_valid_accuracy = 0
    global_step = 0
    logger.info('Training starts!')
    for train_batch in train_loader:
        if not model.training:
            model.train()
        train_clf_loss, train_accuracy = run_iter(train_batch)
        global_step += 1
        if args.cosine_lr:
            if not args.warm_restart:
                scheduler.step()
            else:
                if scheduler.last_epoch == scheduler.T_max:
                    scheduler.T_max = scheduler.T_max * 2
                    scheduler.step(0)
                    logger.info('Warm-restarted the learning rate!')
                else:
                    scheduler.step()

        train_summary_writer.add_scalar(tag='clf_loss',
                                        scalar_value=train_clf_loss,
                                        global_step=global_step)
        train_summary_writer.add_scalar(tag='accuracy',
                                        scalar_value=train_accuracy,
                                        global_step=global_step)

        if global_step % validate_every == 0:
            progress = train_loader.iterations / len(train_loader)
            logger.info(f'* Epoch {progress:.2f}')
            logger.info(f'  - lr = {optimizer.param_groups[0]["lr"]:.6f}')
            logger.info(f'  - Validation starts')
            valid_clf_loss, valid_accuracy = validate(valid_loader)
            _, test_accuracy = validate(test_loader)
            if not args.cosine_lr:
                scheduler.step(valid_accuracy)
            valid_summary_writer.add_scalar(tag='clf_loss',
                                            scalar_value=valid_clf_loss,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(tag='accuracy',
                                            scalar_value=valid_accuracy,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(
                tag='lr',
                scalar_value=optimizer.param_groups[0]['lr'],
                global_step=global_step)
            logger.info(f'  - Valid clf loss: {valid_clf_loss:.5f}')
            logger.info(f'  - Valid accuracy: {valid_accuracy:.5f}')
            logger.info(f'  - Test accuracy: {test_accuracy:.5f}')
            if valid_accuracy > best_valid_accuracy:
                best_valid_accuracy = valid_accuracy
                model_filename = (f'best-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f'  - Saved the new best model to: {model_path}')
            elif args.save_every_epoch and global_step % (validate_every *
                                                          10) == 0:
                model_filename = (f'model-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f'  - Saved the new model to: {model_path}')

        if train_loader.epoch > args.max_epoch:
            break
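
The warm_restart branch above doubles T_max by hand whenever a cosine cycle completes. PyTorch also ships lr_scheduler.CosineAnnealingWarmRestarts, which produces the same doubling-period restarts via T_mult=2; a minimal sketch:

from torch import nn, optim
from torch.optim import lr_scheduler

# Minimal sketch: built-in warm restarts with a doubling period
# (first cycle of T_0 steps, then 2*T_0, 4*T_0, ...).
model = nn.Linear(10, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=100,
                                                     T_mult=2, eta_min=0.0)

for step in range(700):
    optimizer.step()   # the real training step would go here
    scheduler.step()
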
Example #6
def train_prune_32bit(model,
                      dataloader,
                      test_loader,
                      best_model_wts_init,
                      args,
                      prune_rate=50.):
    device = model.device
    momentum = model.momentum
    learning_rate = model.lr
    num_epochs = model.num_epochs
    milestones = model.milestones
    gamma = model.gamma
    weight_decay = model.weight_decay
    nesterov = model.nesterov

    if args.label_regularize == 'labelsmooth':
        criterion = LabelSmoothingLoss(model.device, model.num_classes, 0.1, 1)
    else:
        criterion = model.criterion
    batch_number = len(dataloader.dataset) // dataloader.batch_size

    if args.batch_wd:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=learning_rate,
                                    momentum=momentum,
                                    nesterov=nesterov,
                                    weight_decay=weight_decay)
    else:
        batch_params = [
            module for module in model.parameters() if module.ndimension() == 1
        ]
        other_params = [
            module for module in model.parameters() if module.ndimension() > 1
        ]
        optimizer = torch.optim.SGD([{
            'params': batch_params,
            'weight_decay': 0
        }, {
            'params': other_params,
            'weight_decay': weight_decay
        }],
                                    lr=learning_rate,
                                    momentum=momentum,
                                    nesterov=nesterov)

    if args.lr_type == 'step':
        scheduler = lr_scheduler.MultiStepLR(gamma=gamma,
                                             milestones=milestones,
                                             optimizer=optimizer)
    elif args.lr_type == 'cos':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                   T_max=num_epochs,
                                                   eta_min=0.0005,
                                                   last_epoch=-1)
    else:
        raise ValueError(f'Unsupported lr_type: {args.lr_type}')
    losses = []
    test_losses = []
    accuracies = []
    test_accuracies = []
    best_acc = 0
    best_model_wts = copy.deepcopy(model.state_dict())
    ortho_miles = []

    for epoch in range(num_epochs):
        #get mask
        if epoch == 0:
            masks = weight_prune(model, prune_rate)
            model.load_state_dict(best_model_wts_init, strict=False)
            model.set_masks(masks)
        model.train()
        if args.label_regularize == 'labelsimilar':
            similarity = fc_similarity(model, device)
            criterion = LabelSimilarLoss(model.device, model.num_classes,
                                         similarity, 0.1, 1)
        scheduler.step()

        for i, (images, labels) in enumerate(tqdm(dataloader)):
            images = images.type(torch.FloatTensor).to(device)
            labels = labels.type(torch.LongTensor).to(device)

            if args.input_regularize:
                if args.input_regularize == 'cutmix':
                    lam, images, labels_a, labels_b = cutmix_32bit(
                        images, labels, device)
                elif args.input_regularize == 'mixup':
                    lam, images, labels_a, labels_b = mixup_32bit(
                        images, labels, device)
                optimizer.zero_grad()

                outputs = model(images)

                loss = lam * criterion(outputs, labels_a) + (
                    1 - lam) * criterion(outputs, labels_b)
            else:
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)

            if args.ortho:
                loss += args.ortho_lr * l2_reg_ortho_32bit(model, device)

            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            if (i + 1) % (batch_number // 4) == 0:
                tqdm.write(
                    'Epoch[{}/{}] , Step[{}/{}], Loss: {:.4f}, lr = {}'.format(
                        epoch + 1, num_epochs, i + 1, len(dataloader),
                        loss.item(), optimizer.param_groups[0]['lr']))
        #print('|| Train || === ', end = '')
        model.set_masks(masks)
        #tr_accuracy, tr_loss = eval_16bit(model, dataloader)
        print('|| Test  || === ', end='')
        test_accuracy, test_loss = eval_32bit(model, test_loader)
        if test_accuracy > best_acc:
            best_acc = test_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        # accuracies.append(tr_accuracy)
        # losses.append(tr_loss)
        test_accuracies.append(test_accuracy)
        test_losses.append(test_loss)

    return losses, accuracies, test_losses, test_accuracies, best_model_wts
Example #7
def main():
    config = vars(parse_args())

    if config['name'] is None:
        if config['deep_supervision']:
            config['name'] = '%s_%s_wDS' % (config['dataset'], config['arch'])
        else:
            config['name'] = '%s_%s_woDS' % (config['dataset'], config['arch'])
    os.makedirs('models/%s' % config['name'], exist_ok=True)

    print('-' * 20)
    for key in config:
        print('%s: %s' % (key, config[key]))
    print('-' * 20)

    with open('models/%s/config.yml' % config['name'], 'w') as f:
        yaml.dump(config, f)

    # define loss function (criterion)
    if config['loss'] == 'BCEWithLogitsLoss':
        # "WithLogits": the sigmoid is applied to the raw outputs before the cross-entropy
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[config['loss']]().cuda()

    cudnn.benchmark = True

    # create model
    print("=> creating model %s" % config['arch'])
    model = archs.__dict__[config['arch']](config['num_classes'],
                                           config['input_channels'],
                                           config['deep_supervision'])

    model = model.cuda()

    params = filter(lambda p: p.requires_grad, model.parameters())
    if config['optimizer'] == 'Adam':
        optimizer = optim.Adam(params,
                               lr=config['lr'],
                               weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'SGD':
        optimizer = optim.SGD(params,
                              lr=config['lr'],
                              momentum=config['momentum'],
                              nesterov=config['nesterov'],
                              weight_decay=config['weight_decay'])
    else:
        raise NotImplementedError

    if config['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=config['epochs'],
                                                   eta_min=config['min_lr'])
    elif config['scheduler'] == 'ReduceLROnPlateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   factor=config['factor'],
                                                   patience=config['patience'],
                                                   verbose=1,
                                                   min_lr=config['min_lr'])
    elif config['scheduler'] == 'MultiStepLR':
        scheduler = lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[int(e) for e in config['milestones'].split(',')],
            gamma=config['gamma'])
    elif config['scheduler'] == 'ConstantLR':
        scheduler = None
    else:
        raise NotImplementedError

    # Data loading code
    img_ids = glob(
        os.path.join('inputs', config['dataset'], 'images',
                     '*' + config['img_ext']))
    img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    train_img_ids, val_img_ids = train_test_split(img_ids,
                                                  test_size=0.2,
                                                  random_state=41)
    # data augmentation:
    train_transform = Compose([
        transforms.RandomRotate90(),
        transforms.Flip(),
        OneOf([
            transforms.HueSaturationValue(),
            transforms.RandomBrightness(),
            transforms.RandomContrast(),
        ],
              p=1),  # pick exactly one of these, weighted by their normalised probabilities
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(),
    ])

    val_transform = Compose([
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(),
    ])

    train_dataset = Dataset(img_ids=train_img_ids,
                            img_dir=os.path.join('inputs', config['dataset'],
                                                 'images'),
                            mask_dir=os.path.join('inputs', config['dataset'],
                                                  'masks'),
                            img_ext=config['img_ext'],
                            mask_ext=config['mask_ext'],
                            num_classes=config['num_classes'],
                            transform=train_transform)
    val_dataset = Dataset(img_ids=val_img_ids,
                          img_dir=os.path.join('inputs', config['dataset'],
                                               'images'),
                          mask_dir=os.path.join('inputs', config['dataset'],
                                                'masks'),
                          img_ext=config['img_ext'],
                          mask_ext=config['mask_ext'],
                          num_classes=config['num_classes'],
                          transform=val_transform)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_workers'],
        drop_last=True)  # drop the last batch if it cannot be filled completely
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=config['batch_size'],
                                             shuffle=False,
                                             num_workers=config['num_workers'],
                                             drop_last=False)

    log = OrderedDict([
        ('epoch', []),
        ('lr', []),
        ('loss', []),
        ('iou', []),
        ('val_loss', []),
        ('val_iou', []),
    ])

    best_iou = 0
    trigger = 0
    for epoch in range(config['epochs']):
        print('Epoch [%d/%d]' % (epoch, config['epochs']))

        # train for one epoch
        train_log = train(config, train_loader, model, criterion, optimizer)
        # evaluate on validation set
        val_log = validate(config, val_loader, model, criterion)

        if config['scheduler'] == 'CosineAnnealingLR':
            scheduler.step()
        elif config['scheduler'] == 'ReduceLROnPlateau':
            scheduler.step(val_log['loss'])

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
              (train_log['loss'], train_log['iou'], val_log['loss'],
               val_log['iou']))

        log['epoch'].append(epoch)
        log['lr'].append(config['lr'])
        log['loss'].append(train_log['loss'])
        log['iou'].append(train_log['iou'])
        log['val_loss'].append(val_log['loss'])
        log['val_iou'].append(val_log['iou'])

        pd.DataFrame(log).to_csv('models/%s/log.csv' % config['name'],
                                 index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(),
                       'models/%s/model.pth' % config['name'])
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if config['early_stopping'] >= 0 and trigger >= config[
                'early_stopping']:
            print("=> early stopping")
            break

        torch.cuda.empty_cache()
Example #8
def train():
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCO_ROOT):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because " +
                  "--dataset_root was not specified.")
            args.dataset_root = COCO_ROOT
        cfg = coco
        dataset = COCODetection(root=args.dataset_root,
                                transform=SSDAugmentation(
                                    cfg['min_dim'], MEANS))
    elif args.dataset == 'VOC':
        # if args.dataset_root == COCO_ROOT:
        #     parser.error('Must specify dataset if specifying dataset_root')
        cfg = voc
        dataset = VOCDetection(root=args.dataset_root,
                               transform=SSDAugmentation(
                                   cfg['min_dim'], MEANS))

    if args.visdom:
        import visdom
        viz = visdom.Visdom()

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net

    if args.cuda:
        net = torch.nn.DataParallel(ssd_net)
        cudnn.benchmark = True

    if args.resume:
        print('[DEBUG] Resuming training, loading {}...'.format(args.resume))
        ssd_net.load_weights(args.resume)
    else:
        vgg_weights = torch.load(args.save_folder + args.basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if args.cuda:
        net = net.cuda()

    if not args.resume:
        print('[DEBUG] Initializing weights...')
        # initialize newly added layers' weights with xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'],
                             0.5,
                             True,
                             0,
                             True,
                             3,
                             0.5,
                             False,
                             args.cuda,
                             lm=True)
    sche = lr_scheduler.CosineAnnealingLR(optimizer, T_max=250)

    # initSummaty()
    log_folder = './results/' + net.__class__.__name__ + '/' + optimizer.__class__.__name__ + '/' + 'lm/' + str(
        1001) + '/'
    print("log_folder:  ", log_folder)
    writer = SummaryWriter(log_folder)

    net.train()
    # loss counters
    loc_loss = 0
    conf_loss = 0
    print('Loading the dataset...')

    epoch_size = len(dataset) // args.batch_size
    dataset_len = epoch_size
    print("[DEBUG] dataset len: {}".format(len(dataset)))

    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)

    step_index = 0

    if args.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    data_loader = data.DataLoader(dataset,
                                  args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)

    for epoch in range(args.start_iter, 250):
        batch_iterator = iter(data_loader)

        t0 = time.time()
        sche.step(epoch)
        for iteration in range(epoch_size):

            # load train data
            images, targets = next(batch_iterator)

            if args.cuda:
                images = Variable(images.cuda())
                targets = [
                    Variable(anno.cuda(), volatile=True) for anno in targets
                ]
            else:
                images = Variable(images)
                targets = np.array(
                    [Variable(ann, volatile=True) for ann in targets])

            # forward
            out = net(images)
            # backprop
            optimizer.zero_grad()
            loss_l, loss_c = criterion(out, targets)
            # loss_l, loss_c = loss_l.sum(), loss_c.sum()

            loss = loss_l + loss_c
            loss.backward()
            optimizer.step()
            loc_loss += loss_l.item()
            conf_loss += loss_c.item()
            lr_now = optimizer.param_groups[0]['lr']

            print('Epoch [{}/{}] '.format(epoch, 250) + 'iter ' +
                  repr(iteration) + '||Loss: %.4f|loss_l:%.4f|loss_c:%.4f||' %
                  (loss.item(), loss_l.item(), loss_c.item()) +
                  'lr={}'.format(lr_now),
                  end='\r')

        t1 = time.time()
        print('Epoch [{}/{}] '.format(epoch, 250) + 'timer: %.4f sec.' %
              (t1 - t0),
              end='\n')

        writer.add_scalar('loc_loss', loc_loss, epoch)
        writer.add_scalar('conf_loss', conf_loss, epoch)
        lr_now = optimizer.param_groups[0]['lr']
        writer.add_scalar('learning rate', lr_now, epoch)
        # reset epoch loss counters
        loc_loss = 0
        conf_loss = 0

        if epoch % 10 == 0:
            print('Saving state, epoch:', epoch)
            torch.save(
                ssd_net.state_dict(), 'weights/lm/ssd300_' + args.dataset +
                '_' + repr(epoch) + '.pth')
        torch.save(ssd_net.state_dict(),
                   args.save_folder + '' + args.dataset + '.pth')
Example #9
def train_model_snapshot(model, criterion, lr, dataloaders, dataset_sizes,
                         device, num_cycles, num_epochs_per_cycle):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_loss = 1000000.0
    model_w_arr = []
    prob = torch.zeros((dataset_sizes['val'], 3),
                       dtype=torch.float32).to(device)
    lbl = torch.zeros((dataset_sizes['val'], ), dtype=torch.long).to(device)
    for cycle in range(num_cycles):
        optimizer = optim.SGD(model.parameters(), lr=lr,
                              momentum=0.9)  #, weight_decay = 0.0005)
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer, num_epochs_per_cycle * len(dataloaders['train']))
        for epoch in range(num_epochs_per_cycle):
            #print('Cycle {}: Epoch {}/{}'.format(cycle, epoch, num_epochs_per_cycle - 1))
            #print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()  # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0
                idx = 0
                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        if (epoch == num_epochs_per_cycle - 1) and (phase
                                                                    == 'val'):
                            prob[idx:idx + inputs.shape[0]] += F.softmax(
                                outputs, dim=1)
                            lbl[idx:idx + inputs.shape[0]] = labels
                            idx += inputs.shape[0]
                        loss = criterion(outputs, labels)
                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                            scheduler.step()
                            #print(optimizer.param_groups[0]['lr'])

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                #print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                #    phase, epoch_loss, epoch_acc))

                # deep copy the model
                if phase == 'val' and epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
            #print()
        model_w_arr.append(copy.deepcopy(model.state_dict()))

    prob /= num_cycles
    ensemble_loss = F.nll_loss(torch.log(prob), lbl)
    ensemble_loss = ensemble_loss.item()
    time_elapsed = time.time() - since
    #print('Training complete in {:.0f}m {:.0f}s'.format(
    #    time_elapsed // 60, time_elapsed % 60))
    #print('Ensemble Loss : {:4f}, Best val Loss: {:4f}'.format(ensemble_loss, best_loss))

    # build one model per snapshot (deep copy, otherwise every entry would
    # reference the same module holding only the last snapshot's weights)
    model_arr = []
    for weights in model_w_arr:
        model.load_state_dict(weights)
        model_arr.append(copy.deepcopy(model))
    return model_arr, ensemble_loss, best_loss, prob
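
train_model_snapshot already averages the validation softmax over the cycles; the returned model_arr can be ensembled the same way at test time. A hedged usage sketch (predict_ensemble, the dataloader and the device are assumptions, not part of the original code):

import torch
import torch.nn.functional as F

def predict_ensemble(model_arr, dataloader, device):
    # Average the softmax probabilities of all snapshot models.
    probs = []
    with torch.no_grad():
        for m in model_arr:
            m.eval()
            batch_probs = []
            for inputs, _ in dataloader:
                outputs = m(inputs.to(device))
                batch_probs.append(F.softmax(outputs, dim=1).cpu())
            probs.append(torch.cat(batch_probs))
    return torch.stack(probs).mean(dim=0)
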
Example #10
def main_worker(gpu, ngpus_per_node, argss):
    global args, best_acc1
    args, best_acc1 = argss, 0
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    model = san(args.sa_type, args.layers, args.kernels, args.classes)
    criterion = nn.CrossEntropyLoss(ignore_index=args.ignore_label)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    print(args.base_lr, args.momentum, args.weight_decay)
    if args.scheduler == 'step':
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=args.step_epochs,
                                             gamma=0.1)
    elif args.scheduler == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=args.epochs)

    if main_process():
        global logger, writer
        logger = get_logger()
        writer = SummaryWriter(args.save_path)
        logger.info(args)
        logger.info("=> creating model ...")
        logger.info("Classes: {}".format(args.classes))
        logger.info(model)
    if args.distributed:
        torch.cuda.set_device(gpu)
        args.batch_size = int(args.batch_size / ngpus_per_node)
        args.batch_size_val = int(args.batch_size_val / ngpus_per_node)
        args.workers = int(
            (args.workers + ngpus_per_node - 1) / ngpus_per_node)
        model = torch.nn.parallel.DistributedDataParallel(model.cuda(),
                                                          device_ids=[gpu])
    else:
        model = torch.nn.DataParallel(model.cuda())
    if args.weight:
        if os.path.isfile(args.weight):
            if main_process():
                logger.info("=> loading weight '{}'".format(args.weight))
            checkpoint = torch.load(args.weight)
            model.load_state_dict(checkpoint['state_dict'])
            if main_process():
                logger.info("=> loaded weight '{}'".format(args.weight))
        else:
            if main_process():
                logger.info("=> no weight found at '{}'".format(args.weight))

    if args.resume:
        if os.path.isfile(args.resume):
            if main_process():
                logger.info("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(gpu))
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['top1_val']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            if main_process():
                logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
        else:
            if main_process():
                logger.info("=> no checkpoint found at '{}'".format(
                    args.resume))

    mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    train_set = torchvision.datasets.ImageFolder(
        os.path.join(args.data_root, 'train'), train_transform)
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    val_set = torchvision.datasets.ImageFolder(
        os.path.join(args.data_root, 'val'), val_transform)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_set)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_set)
    else:
        train_sampler = None
        val_sampler = None
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size_val,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler)

    loss_val, mIoU_val, mAcc_val, allAcc_val, top1_val, top5_val = validate(
        val_loader, model, criterion)
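    # NOTE: this exit() makes the snippet evaluation-only; the training loop below never runs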
    exit(0)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        loss_train, mIoU_train, mAcc_train, allAcc_train, top1_train, top5_train = train(
            train_loader, model, criterion, optimizer, epoch)
Example #11
]
lrs = [
    0.000001, 0.000001, 0.000001, 0.000001, 0.00001, 0.00001, 0.00001, 0.0001,
    0.001, 0.01
]

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD([{
    'params': p,
    'lr': l
} for p, l in zip(layers, lrs)],
                      momentum=0.9)

# Anneal the learning rates along a cosine schedule with a period of 5 epochs
scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                           5,
                                           eta_min=0,
                                           last_epoch=-1)


def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    #since = time.time()
    loss_train = []
    loss_test = []
    acc_test = []
    acc_train = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
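
The truncated `layers` list at the top of this example presumably groups a network's parameters from earliest to latest so that zip(layers, lrs) assigns the smallest learning rates to the earliest layers (discriminative fine-tuning). A hedged sketch of how ten such groups might be built for a torchvision ResNet (purely illustrative; the original model is unknown):

from torchvision import models

# Illustrative only: ten parameter groups ordered from input to output,
# matching the ten learning rates in `lrs` above.
resnet = models.resnet18(weights=None)  # the weight choice is irrelevant to the grouping
blocks = [resnet.conv1, resnet.bn1,
          resnet.layer1[0], resnet.layer1[1],
          resnet.layer2, resnet.layer3[0], resnet.layer3[1],
          resnet.layer4[0], resnet.layer4[1], resnet.fc]
layers = [list(b.parameters()) for b in blocks]  # len(layers) == len(lrs) == 10
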
Example #12
    sample_sub.update(sub)

    sample_sub.to_csv('submission.csv')


if __name__ == '__main__':
    global_start_time = time.time()
    train_loader, test_loader, label_encoder, num_classes = load_data(
        train, test, train_dir, test_dir)

    model = EfficientNetEncoderHead(depth=0, num_classes=num_classes)
    model.cuda()

    criterion = nn.CrossEntropyLoss()

    optimizer = radam(model.parameters(),
                      lr=1e-3,
                      betas=(0.9, 0.999),
                      eps=1e-3,
                      weight_decay=1e-4)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=len(train_loader) *
                                               NUM_EPOCHS,
                                               eta_min=1e-6)

    for epoch in range(1, NUM_EPOCHS + 1):
        print('-' * 50)
        train_step(train_loader, model, criterion, optimizer, epoch, scheduler)

    print('inference mode')
    generate_submission(test_loader, model, label_encoder)
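
The scheduler in this example is built with T_max equal to the total number of optimizer steps (len(train_loader) * NUM_EPOCHS), which implies scheduler.step() is called once per batch inside train_step. A hypothetical sketch of such a loop, consistent with the call signature used above (none of this is the original train_step):

def train_step(train_loader, model, criterion, optimizer, epoch, scheduler):
    # Hypothetical per-batch stepping loop; names mirror the call above.
    model.train()
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the cosine schedule once per batch
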
Example #13
0
                                     reduction='mean',
                                     beta=1.0)
# L2
criterion_L2 = nn.MSELoss(reduction='mean')
if True:
    # Hyperparameters
    learning_rate = 1e-1
    num_epochs = 200
    weight_decay = 0
    # Define the optimizer
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.9)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=200,
                                               eta_min=1e-2,
                                               last_epoch=-1)
    model = model.float()
    switch = True
    # Train the model
    train_ls, val_ls, lr = [], [], []
    for epoch in range(num_epochs):
        # Split the data
        train_x = train_x.clone().detach()
        val_x = val_x.clone().detach()
        test_x = test_x.clone().detach()
        train_y = train_y.clone().detach()
        val_y = val_y.clone().detach()

        # Split the training set
        batch_size = 12967
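# --- Added sketch (not part of the original example) ------------------------
# The schedule above anneals from lr=1e-1 down to eta_min=1e-2 over T_max=200
# epochs. For reference, the closed-form LR at epoch t is
#   eta_min + 0.5 * (base_lr - eta_min) * (1 + cos(pi * t / T_max))
import math

def _demo_cosine_lr(t, base_lr=1e-1, eta_min=1e-2, t_max=200):
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t / t_max))

# _demo_cosine_lr(0) == 0.1, _demo_cosine_lr(100) == 0.055, _demo_cosine_lr(200) == 0.01
# -----------------------------------------------------------------------------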
Beispiel #14
0
def estimate(X_train,y_train):
    i = 0
    ii = 0
    nrows=256
    ncolumns=256
    channels=1
    ntrain=0.8*len(X_train)
    nval=0.2*len(X_train)
    batch_size=16
    epochs= 2
    num_cpu = multiprocessing.cpu_count()
    num_classes = 2
    torch.manual_seed(8)
    torch.cuda.manual_seed(8)
    np.random.seed(8)
    random.seed(8)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    X = []
    X_train=np.reshape(np.array(X_train),[len(X_train),])
    for img in list(range(0,len(X_train))):
        if X_train[img].ndim>=3:
            X.append(np.moveaxis(cv2.resize(X_train[img][:,:,:3], (nrows,ncolumns),interpolation=cv2.INTER_CUBIC), -1, 0))
        else:
            smimg= cv2.cvtColor(X_train[img],cv2.COLOR_GRAY2RGB)
            X.append(np.moveaxis(cv2.resize(smimg, (nrows,ncolumns),interpolation=cv2.INTER_CUBIC), -1, 0))
        
        if y_train[img]=='COVID':
            y_train[img]=1
        elif y_train[img]=='NonCOVID' :
            y_train[img]=0
        else:
            continue

    x = np.array(X)
    y_train = np.array(y_train)
    
    
    outputs_all = []
    labels_all = []
    
    X_train, X_val, y_train, y_val = train_test_split(x, y_train, test_size=0.2, random_state=2)
    
    
    
    image_transforms = {
        'train': transforms.Compose([
            transforms.Lambda(lambda x: x / 255),
            transforms.ToPILImage(),
            transforms.Resize((230, 230)),
            transforms.RandomResizedCrop(224, scale=(0.75, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            # transforms.Affine(10, shear=(0.1, 0.1)),
            # random brightness and random contrast
            # transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.ToTensor(),
            transforms.Normalize([0.45271412, 0.45271412, 0.45271412],
                                 [0.33165374, 0.33165374, 0.33165374])
        ]),
        'valid': transforms.Compose([
            transforms.Lambda(lambda x: x / 255),
            transforms.ToPILImage(),
            transforms.Resize((230, 230)),
            transforms.CenterCrop(size=224),
            transforms.ToTensor(),
            transforms.Normalize([0.45271412, 0.45271412, 0.45271412],
                                 [0.33165374, 0.33165374, 0.33165374])
        ])
    }
    
    
    
    train_data = MyDataset(X_train, y_train,image_transforms['train'])
    
    valid_data = MyDataset(X_val, y_val,image_transforms['valid'])
    
    
    dataset_sizes = {
        'train': len(train_data),
        'valid': len(valid_data)
    }

    dataloaders = {
        # worker_init_fn must be a callable; seed each worker explicitly
        'train': data.DataLoader(train_data, batch_size=batch_size, shuffle=True,
                                 num_workers=num_cpu, pin_memory=True,
                                 worker_init_fn=lambda _: np.random.seed(7),
                                 drop_last=False),
        'valid': data.DataLoader(valid_data, batch_size=batch_size, shuffle=True,
                                 num_workers=num_cpu, pin_memory=True,
                                 worker_init_fn=lambda _: np.random.seed(7),
                                 drop_last=False)
    }

    model = ResidualAttentionModel(10)
    checkpoint0 = torch.load('model_resAttention.pth')
    model.load_state_dict(checkpoint0)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_classes)

    model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]).cuda()
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=0.06775, momentum=0.5518, weight_decay=0.000578)
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.05)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=35, gamma=0.1)

    best_acc = 0.0
    best_f1 = 0.0
    best_epoch = 0
    best_loss = 100000
    since = time.time()
    writer = SummaryWriter()
    
    model.train()
    
    for epoch in range(epochs): 
            print('epoch',epoch)
            jj=0
            for phase in ['train', 'valid']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0
                predictions=FloatTensor()
                all_labels=FloatTensor()
                

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device, non_blocking=True)
                    labels = labels.to(device, non_blocking=True)
                    predictions = predictions.to(device, non_blocking=True)
                    all_labels = all_labels.to(device, non_blocking=True)

                # zero the parameter gradients
                    optimizer.zero_grad()

                # forward
                # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        predictions = torch.cat([predictions, preds.float()])
                        all_labels = torch.cat([all_labels, labels.float()])

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                            jj += 1
                            if len(inputs) >= 16:
                                writer.add_figure(
                                    'predictions vs. actuals epoch ' + str(epoch) + ' ' + str(jj),
                                    plot_classes_preds(model, inputs, labels))

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    scheduler.step()
                

              
                epoch_f1=f1_score(all_labels.tolist(), predictions.tolist(),average='weighted')
               
                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = accuracy_score(all_labels.tolist(), predictions.tolist())
              
                if phase == 'train':
                    writer.add_scalar('Train/Loss', epoch_loss, epoch)
                    writer.add_scalar('Train/Accuracy', epoch_acc, epoch)
                
                    writer.flush()
                elif phase == 'valid':
                    writer.add_scalar('Valid/Loss', epoch_loss, epoch)
                    writer.add_scalar('Valid/Accuracy', epoch_acc, epoch)
                    writer.flush()
                

            # deep copy the model
                if phase == 'valid' and epoch_acc > best_acc:
                
                    best_f1 = epoch_f1
                    best_acc = epoch_acc
                    best_loss = epoch_loss
                    best_epoch = epoch
                    best_model_wts = copy.deepcopy(model.module.state_dict())
                    best_model_wts_module = copy.deepcopy(model.state_dict())
                
    model.load_state_dict(best_model_wts_module)
    torch.save(model, "Model_res.pth")
    torch.save(best_model_wts,"Model_res_state.pth")
    time_elapsed = time.time() - since
        
    print('Training complete in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))
    print('Best validation Acc: {:4f}'.format(best_acc))
    print('Best validation f1: {:4f}'.format(best_f1))
    print('best epoch: ', best_epoch)
     
    ## Replacing the last fully connected layer with SVM or ExtraTrees Classifiers  
    model.module.fc = nn.Identity()
   
    for param in model.parameters():
             param.requires_grad_(False)
            
    clf = svm.SVC(kernel='rbf', probability=True)
    all_best_accs = {}
    all_best_f1s = {}
    #clf = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=30, random_state=0)
    
    for phase in ['train','valid']:
                outputs_all = []
                labels_all = []
                model.eval()   # Set model to evaluate mode

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device, non_blocking=True)
                    labels = labels.to(device, non_blocking=True)
                  
                    outputs = model(inputs)
                    outputs_all.append(outputs)
                    labels_all.append(labels)
                    
                outputs = torch.cat(outputs_all)
                labels = torch.cat(labels_all)
                
                 # fit the classifier on training set and then predict on test 
                if phase == 'train': 
                         clf.fit(outputs.cpu(), labels.cpu())
                         filename = 'classifier_model.sav'
                         joblib.dump(clf, filename)
                         all_best_accs[phase]=accuracy_score(labels.cpu(), clf.predict(outputs.cpu()))
                         all_best_f1s[phase]= f1_score(labels.cpu(), clf.predict(outputs.cpu()))
                         print(phase, ' ',accuracy_score(labels.cpu(), clf.predict(outputs.cpu())))   
                if phase != 'train' :
                         predict = clf.predict(outputs.cpu())
                         all_best_accs[phase]=accuracy_score(labels.cpu(), clf.predict(outputs.cpu()))
                         all_best_f1s[phase]= f1_score(labels.cpu(), clf.predict(outputs.cpu()))
                         print(phase, ' ',accuracy_score(labels.cpu(), clf.predict(outputs.cpu())))  
                            
    print('Best Acc: ',all_best_accs)
    print('Best f1: ',all_best_f1s)
    
   
    return model
Beispiel #15
0
if P.optimizer == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=P.lr_init, momentum=0.9, weight_decay=P.weight_decay)
    lr_decay_gamma = 0.1
elif P.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=P.lr_init, betas=(.9, .999), weight_decay=P.weight_decay)
    lr_decay_gamma = 0.3
elif P.optimizer == 'lars':
    from torchlars import LARS
    base_optimizer = optim.SGD(model.parameters(), lr=P.lr_init, momentum=0.9, weight_decay=P.weight_decay)
    optimizer = LARS(base_optimizer, eps=1e-8, trust_coef=0.001)
    lr_decay_gamma = 0.1
else:
    raise NotImplementedError()

if P.lr_scheduler == 'cosine':
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, P.epochs)
elif P.lr_scheduler == 'step_decay':
    milestones = [int(0.5 * P.epochs), int(0.75 * P.epochs)]
    scheduler = lr_scheduler.MultiStepLR(optimizer, gamma=lr_decay_gamma, milestones=milestones)
else:
    raise NotImplementedError()

from training.scheduler import GradualWarmupScheduler
scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=10.0, total_epoch=P.warmup, after_scheduler=scheduler)

if P.multi_gpu:
    linear = model.module.linear
else:
    linear = model.linear

linear_optim = torch.optim.Adam(linear.parameters(), lr=1e-3, betas=(.9, .999), weight_decay=P.weight_decay)
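# --- Added sketch (not part of the original example) ------------------------
# GradualWarmupScheduler above comes from an external training.scheduler module.
# A warmup-then-cosine schedule with a similar shape (not an exact replica of
# the multiplier-based warmup) can be built from stock PyTorch >= 1.11 with
# SequentialLR; placeholder arguments, illustrative only:
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

def _demo_warmup_cosine(optimizer, warmup_epochs, total_epochs):
    warmup = LinearLR(optimizer, start_factor=0.1, total_iters=warmup_epochs)
    cosine = CosineAnnealingLR(optimizer, T_max=total_epochs - warmup_epochs)
    return SequentialLR(optimizer, schedulers=[warmup, cosine],
                        milestones=[warmup_epochs])
# -----------------------------------------------------------------------------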
Beispiel #16
0
def train(cfg):
    log_dir = create_log_dir(cfg)
    device = set_device(cfg)
    # --------------------------------------------------------------------------
    # Data Loading and Preprocessing
    # --------------------------------------------------------------------------
    # __________________________________________________________________________
    # Build MONAI preprocessing
    train_preprocess = Compose([
        ToTensorD(keys="image"),
        TorchVisionD(keys="image",
                     name="ColorJitter",
                     brightness=64.0 / 255.0,
                     contrast=0.75,
                     saturation=0.25,
                     hue=0.04),
        ToNumpyD(keys="image"),
        RandFlipD(keys="image", prob=0.5),
        RandRotate90D(keys="image", prob=0.5),
        CastToTypeD(keys="image", dtype=np.float32),
        RandZoomD(keys="image", prob=0.5, min_zoom=0.9, max_zoom=1.1),
        ScaleIntensityRangeD(keys="image",
                             a_min=0.0,
                             a_max=255.0,
                             b_min=-1.0,
                             b_max=1.0),
        ToTensorD(keys=("image", "label")),
    ])
    valid_preprocess = Compose([
        CastToTypeD(keys="image", dtype=np.float32),
        ScaleIntensityRangeD(keys="image",
                             a_min=0.0,
                             a_max=255.0,
                             b_min=-1.0,
                             b_max=1.0),
        ToTensorD(keys=("image", "label")),
    ])
    # __________________________________________________________________________
    # Create MONAI dataset
    train_json_info_list = load_decathlon_datalist(
        data_list_file_path=cfg["dataset_json"],
        data_list_key="training",
        base_dir=cfg["data_root"],
    )
    valid_json_info_list = load_decathlon_datalist(
        data_list_file_path=cfg["dataset_json"],
        data_list_key="validation",
        base_dir=cfg["data_root"],
    )

    train_dataset = PatchWSIDataset(
        train_json_info_list,
        cfg["region_size"],
        cfg["grid_shape"],
        cfg["patch_size"],
        train_preprocess,
        image_reader_name="openslide" if cfg["use_openslide"] else "cuCIM",
    )
    valid_dataset = PatchWSIDataset(
        valid_json_info_list,
        cfg["region_size"],
        cfg["grid_shape"],
        cfg["patch_size"],
        valid_preprocess,
        image_reader_name="openslide" if cfg["use_openslide"] else "cuCIM",
    )

    # __________________________________________________________________________
    # DataLoaders
    train_dataloader = DataLoader(train_dataset,
                                  num_workers=cfg["num_workers"],
                                  batch_size=cfg["batch_size"],
                                  pin_memory=True)
    valid_dataloader = DataLoader(valid_dataset,
                                  num_workers=cfg["num_workers"],
                                  batch_size=cfg["batch_size"],
                                  pin_memory=True)

    # __________________________________________________________________________
    # Get sample batch and some info
    first_sample = first(train_dataloader)
    if first_sample is None:
        raise ValueError("Fist sample is None!")

    print("image: ")
    print("    shape", first_sample["image"].shape)
    print("    type: ", type(first_sample["image"]))
    print("    dtype: ", first_sample["image"].dtype)
    print("labels: ")
    print("    shape", first_sample["label"].shape)
    print("    type: ", type(first_sample["label"]))
    print("    dtype: ", first_sample["label"].dtype)
    print(f"batch size: {cfg['batch_size']}")
    print(f"train number of batches: {len(train_dataloader)}")
    print(f"valid number of batches: {len(valid_dataloader)}")

    # --------------------------------------------------------------------------
    # Deep Learning Classification Model
    # --------------------------------------------------------------------------
    # __________________________________________________________________________
    # initialize model
    model = TorchVisionFCModel("resnet18",
                               num_classes=1,
                               use_conv=True,
                               pretrained=cfg["pretrain"])
    model = model.to(device)

    # loss function
    loss_func = torch.nn.BCEWithLogitsLoss()
    loss_func = loss_func.to(device)

    # optimizer
    if cfg["novograd"]:
        optimizer = Novograd(model.parameters(), cfg["lr"])
    else:
        optimizer = SGD(model.parameters(), lr=cfg["lr"], momentum=0.9)

    # AMP scaler
    if cfg["amp"]:
        cfg["amp"] = True if monai.utils.get_torch_version_tuple() >= (
            1, 6) else False
    else:
        cfg["amp"] = False

    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=cfg["n_epochs"])

    # --------------------------------------------
    # Ignite Trainer/Evaluator
    # --------------------------------------------
    # Evaluator
    val_handlers = [
        CheckpointSaver(save_dir=log_dir,
                        save_dict={"net": model},
                        save_key_metric=True),
        StatsHandler(output_transform=lambda x: None),
        TensorBoardStatsHandler(log_dir=log_dir,
                                output_transform=lambda x: None),
    ]
    val_postprocessing = Compose([
        ActivationsD(keys="pred", sigmoid=True),
        AsDiscreteD(keys="pred", threshold=0.5)
    ])
    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=valid_dataloader,
        network=model,
        postprocessing=val_postprocessing,
        key_val_metric={
            "val_acc":
            Accuracy(output_transform=from_engine(["pred", "label"]))
        },
        val_handlers=val_handlers,
        amp=cfg["amp"],
    )

    # Trainer
    train_handlers = [
        LrScheduleHandler(lr_scheduler=scheduler, print_lr=True),
        CheckpointSaver(save_dir=cfg["logdir"],
                        save_dict={
                            "net": model,
                            "opt": optimizer
                        },
                        save_interval=1,
                        epoch_level=True),
        StatsHandler(tag_name="train_loss",
                     output_transform=from_engine(["loss"], first=True)),
        ValidationHandler(validator=evaluator, interval=1, epoch_level=True),
        TensorBoardStatsHandler(log_dir=cfg["logdir"],
                                tag_name="train_loss",
                                output_transform=from_engine(["loss"],
                                                             first=True)),
    ]
    train_postprocessing = Compose([
        ActivationsD(keys="pred", sigmoid=True),
        AsDiscreteD(keys="pred", threshold=0.5)
    ])

    trainer = SupervisedTrainer(
        device=device,
        max_epochs=cfg["n_epochs"],
        train_data_loader=train_dataloader,
        network=model,
        optimizer=optimizer,
        loss_function=loss_func,
        postprocessing=train_postprocessing,
        key_train_metric={
            "train_acc":
            Accuracy(output_transform=from_engine(["pred", "label"]))
        },
        train_handlers=train_handlers,
        amp=cfg["amp"],
    )
    trainer.run()
Beispiel #17
0
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # epoch = checkpoint['epoch']
    # loss = checkpoint['loss']

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model_ft = model_ft.to(device)
    #Loss Function
    criterion = nn.CrossEntropyLoss()
    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(),
                             lr=lr,
                             momentum=0.9,
                             weight_decay=0.0001)
    # optimizer_ft = optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9)
    schedule = scheduler.CosineAnnealingLR(optimizer_ft,
                                           num_epoch,
                                           eta_min=1e-7)

    # checkpoint = torch.load('./models/model32/model_14_epoch.pth')
    # model_ft.load_state_dict(checkpoint['model_state_dict'])
    # optimizer_ft.load_state_dict(checkpoint['optimizer_state_dict'])

    # schedule = None
    train_model("test-refine",
                model_ft,
                dataloaders,
                dataset_sizes,
                criterion,
                optimizer_ft,
                num_epochs=num_epoch,
                lr=lr,
Beispiel #18
0
def exp(subject_id):
    import torch
    test_subj = np.r_[subject_id]

    print('test subj:' + str(test_subj))

    # 90/10 train/validation split
    train_size = int(0.9 * len(splitted['session_T']))
    test_size = len(splitted['session_T']) - train_size



    # train_set, valid_set = torch.utils.data.random_split(splitted['session_T'], [train_size, test_size])
    train_set = splitted['session_T']
    test_set = splitted['session_E']



    # model = Deep4Net(
    #     n_chans,
    #     n_classes,
    #     input_window_samples=input_window_samples,
    #     final_conv_length="auto",
    # )

    from torch.utils.data import Dataset, ConcatDataset




    crop_size = 1000
    # embedding_net = Deep4Net_origin(n_classes, n_chans, crop_size)
    # model = FcClfNet(embedding_net)

    model = ShallowFBCSPNet(
        n_chans,
        n_classes,
        input_window_samples=input_window_samples,
        final_conv_length='auto',
    )

    from braindecode.models.util import to_dense_prediction_model, get_output_shape
    to_dense_prediction_model(model)

    n_preds_per_input = get_output_shape(model, 22, input_window_samples)[2]
    print("n_preds_per_input : ", n_preds_per_input)
    print(model)


    batch_size = 8
    epochs = 200

    lr = 0.0625 * 0.01
    weight_decay = 0

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)



    # Send model to GPU
    if cuda:
        model.cuda()

    from torch.optim import lr_scheduler
    import torch.optim as optim

    import argparse
    parser = argparse.ArgumentParser(description='cross subject domain adaptation')
    parser.add_argument('--batch-size', type=int, default=50, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=50, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()
    args.gpuidx = 0
    args.seed = 0
    args.use_tensorboard = False
    args.save_model = False

    optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.5 * 0.001)
    # scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs-1)



    import pandas as pd
    results_columns = ['test_loss',  'test_accuracy']
    df = pd.DataFrame(columns=results_columns)

    for epochidx in range(1, epochs):
        print(epochidx)
        train_crop(10, model, device, train_loader,optimizer,scheduler,cuda, args.gpuidx)
        test_loss, test_score = eval_crop(model, device, test_loader)
        results = { 'test_loss': test_loss, 'test_accuracy': test_score}
        df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)
        print(results)

    return df
Beispiel #19
0
def main_worker(gpu, ngpus_per_node, args):
    cudnn.benchmark = args.cudnn_benchmark
    args.gpu = gpu

    num_classes, train_list_name, val_list_name, test_list_name, filename_seperator, image_tmpl, filter_video, label_file = get_dataset_config(
        args.dataset)
    args.num_classes = num_classes

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.modality == 'rgb':
        args.input_channels = 3
    elif args.modality == 'flow':
        args.input_channels = 2 * 5

    model, arch_name = build_model(args)
    mean = model.mean(args.modality)
    std = model.std(args.modality)

    # overwrite mean and std if they are presented in command
    if args.mean is not None:
        if args.modality == 'rgb':
            if len(args.mean) != 3:
                raise ValueError(
                    "When training with rgb, dim of mean must be three.")
        elif args.modality == 'flow':
            if len(args.mean) != 1:
                raise ValueError(
                    "When training with flow, dim of mean must be three.")
        mean = args.mean

    if args.std is not None:
        if args.modality == 'rgb':
            if len(args.std) != 3:
                raise ValueError(
                    "When training with rgb, dim of std must be three.")
        elif args.modality == 'flow':
            if len(args.std) != 1:
                raise ValueError(
                    "When training with flow, dim of std must be three.")
        std = args.std

    model = model.cuda(args.gpu)
    model.eval()

    if args.show_model:
        if args.rank == 0:
            print(model)
        return 0

    if args.pretrained is not None:
        if args.rank == 0:
            print("=> using pre-trained model '{}'".format(arch_name))
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model.load_state_dict(checkpoint['state_dict'], strict=False)
        del checkpoint  # dereference seems crucial
        torch.cuda.empty_cache()
    else:
        if args.rank == 0:
            print("=> creating model '{}'".format(arch_name))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # the batch size should be divided by number of nodes as well
            args.batch_size = int(args.batch_size / args.world_size)
            args.workers = int(args.workers / ngpus_per_node)

            if args.sync_bn:
                process_group = torch.distributed.new_group(
                    list(range(args.world_size)))
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                    model, process_group)

            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        # assign rank to 0
        model = torch.nn.DataParallel(model).cuda()
        args.rank = 0

    # define loss function (criterion) and optimizer
    train_criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    val_criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # Data loading code
    val_list = os.path.join(args.datadir, val_list_name)

    val_augmentor = get_augmentor(
        False,
        args.input_size,
        scale_range=args.scale_range,
        mean=mean,
        std=std,
        disable_scaleup=args.disable_scaleup,
        threed_data=args.threed_data,
        is_flow=True if args.modality == 'flow' else False,
        version=args.augmentor_ver)

    val_dataset = VideoDataSet(args.datadir,
                               val_list,
                               args.groups,
                               args.frames_per_group,
                               num_clips=args.num_clips,
                               modality=args.modality,
                               image_tmpl=image_tmpl,
                               dense_sampling=args.dense_sampling,
                               transform=val_augmentor,
                               is_train=False,
                               test_mode=False,
                               seperator=filename_seperator,
                               filter_video=filter_video)

    val_loader = build_dataflow(val_dataset,
                                is_train=False,
                                batch_size=args.batch_size,
                                workers=args.workers,
                                is_distributed=args.distributed)

    log_folder = os.path.join(args.logdir, arch_name)
    if args.rank == 0:
        if not os.path.exists(log_folder):
            os.makedirs(log_folder)

    if args.evaluate:
        val_top1, val_top5, val_losses, val_speed = validate(val_loader,
                                                             model,
                                                             val_criterion,
                                                             gpu_id=args.gpu)
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'evaluate_log.log'), 'a')
            print(
                'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
                .format(args.input_size, val_losses, val_top1, val_top5,
                        val_speed * 1000.0),
                flush=True)
            print(
                'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
                .format(args.input_size, val_losses, val_top1, val_top5,
                        val_speed * 1000.0),
                flush=True,
                file=logfile)
        return

    train_list = os.path.join(args.datadir, train_list_name)

    train_augmentor = get_augmentor(
        True,
        args.input_size,
        scale_range=args.scale_range,
        mean=mean,
        std=std,
        disable_scaleup=args.disable_scaleup,
        threed_data=args.threed_data,
        is_flow=True if args.modality == 'flow' else False,
        version=args.augmentor_ver)

    train_dataset = VideoDataSet(args.datadir,
                                 train_list,
                                 args.groups,
                                 args.frames_per_group,
                                 num_clips=args.num_clips,
                                 modality=args.modality,
                                 image_tmpl=image_tmpl,
                                 dense_sampling=args.dense_sampling,
                                 transform=train_augmentor,
                                 is_train=True,
                                 test_mode=False,
                                 seperator=filename_seperator,
                                 filter_video=filter_video)

    train_loader = build_dataflow(train_dataset,
                                  is_train=True,
                                  batch_size=args.batch_size,
                                  workers=args.workers,
                                  is_distributed=args.distributed)

    sgd_polices = model.parameters()
    optimizer = torch.optim.SGD(sgd_polices,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=args.nesterov)

    if args.lr_scheduler == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, args.lr_steps[0], gamma=0.1)
    elif args.lr_scheduler == 'multisteps':
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             args.lr_steps,
                                             gamma=0.1)
    elif args.lr_scheduler == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   args.epochs,
                                                   eta_min=0)
    elif args.lr_scheduler == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   'min',
                                                   verbose=True)

    best_top1 = 0.0
    # optionally resume from a checkpoint
    if args.resume:
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'log.log'), 'a')
        if os.path.isfile(args.resume):
            if args.rank == 0:
                print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume, map_location='cpu')
            else:
                checkpoint = torch.load(args.resume,
                                        map_location='cuda:{}'.format(
                                            args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_top1 = checkpoint['best_top1']
            if args.gpu is not None:
                if not isinstance(best_top1, float):
                    best_top1 = best_top1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            try:
                scheduler.load_state_dict(checkpoint['scheduler'])
            except Exception:
                pass  # older checkpoints may not contain a scheduler state
            if args.rank == 0:
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            del checkpoint  # dereference seems crucial
            torch.cuda.empty_cache()
        else:
            raise ValueError("Checkpoint is not found: {}".format(args.resume))
    else:
        if os.path.exists(os.path.join(log_folder,
                                       'log.log')) and args.rank == 0:
            shutil.copyfile(
                os.path.join(log_folder, 'log.log'),
                os.path.join(log_folder,
                             'log.log.{}'.format(int(time.time()))))
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'log.log'), 'w')

    if args.rank == 0:
        command = " ".join(sys.argv)
        tensorboard_logger.configure(os.path.join(log_folder))
        print(command, flush=True)
        print(args, flush=True)
        print(model, flush=True)
        print(command, file=logfile, flush=True)
        print(args, file=logfile, flush=True)

    if args.resume == '' and args.rank == 0:
        print(model, file=logfile, flush=True)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_top1, train_top5, train_losses, train_speed, speed_data_loader, train_steps = \
            train(train_loader, model, train_criterion, optimizer, epoch + 1,
                  display=args.print_freq, clip_gradient=args.clip_gradient,
                  gpu_id=args.gpu, rank=args.rank)
        if args.distributed:
            dist.barrier()

        # evaluate on validation set
        val_top1, val_top5, val_losses, val_speed = validate(val_loader,
                                                             model,
                                                             val_criterion,
                                                             gpu_id=args.gpu)

        # update current learning rate
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_losses)
        else:
            scheduler.step()

        if args.distributed:
            dist.barrier()

        # only logging at rank 0
        if args.rank == 0:
            print(
                'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'.format(
                    epoch + 1, args.epochs, train_losses, train_top1,
                    train_top5, train_speed * 1000.0,
                    speed_data_loader * 1000.0),
                file=logfile,
                flush=True)
            print(
                'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'.format(
                    epoch + 1, args.epochs, train_losses, train_top1,
                    train_top5, train_speed * 1000.0,
                    speed_data_loader * 1000.0),
                flush=True)
            print(
                'Val  : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch'.format(epoch + 1, args.epochs,
                                                val_losses, val_top1, val_top5,
                                                val_speed * 1000.0),
                file=logfile,
                flush=True)
            print(
                'Val  : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch'.format(epoch + 1, args.epochs,
                                                val_losses, val_top1, val_top5,
                                                val_speed * 1000.0),
                flush=True)

            # remember best prec@1 and save checkpoint
            is_best = val_top1 > best_top1
            best_top1 = max(val_top1, best_top1)

            save_dict = {
                'epoch': epoch + 1,
                'arch': arch_name,
                'state_dict': model.state_dict(),
                'best_top1': best_top1,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }

            save_checkpoint(save_dict, is_best, filepath=log_folder)
            try:
                # read the current LR from the first param group (assumes all groups share the same LR)
                lr = scheduler.optimizer.param_groups[0]['lr']
            except Exception as e:
                lr = None
            if lr is not None:
                tensorboard_logger.log_value('learning-rate', lr, epoch + 1)
            tensorboard_logger.log_value('val-top1', val_top1, epoch + 1)
            tensorboard_logger.log_value('val-loss', val_losses, epoch + 1)
            tensorboard_logger.log_value('train-top1', train_top1, epoch + 1)
            tensorboard_logger.log_value('train-loss', train_losses, epoch + 1)
            tensorboard_logger.log_value('best-val-top1', best_top1, epoch + 1)

        if args.distributed:
            dist.barrier()

    if args.rank == 0:
        logfile.close()
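# --- Added sketch (not part of the original example) ------------------------
# The example above saves scheduler.state_dict() in the checkpoint and restores
# it on resume, so the cosine schedule continues instead of restarting at the
# base LR. Distilled into a minimal save/load pair (generic names, illustrative):
import torch

def _demo_save_checkpoint(path, epoch, model, optimizer, scheduler):
    torch.save({'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()}, path)

def _demo_load_checkpoint(path, model, optimizer, scheduler):
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])
    return ckpt['epoch']
# -----------------------------------------------------------------------------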
Beispiel #20
0
    print('Test Acc: {:.4f}'.format(epoch_acc))


# Fine-tune the convolutional network
#model_ft = models.alexnet(pretrained=True)
#num_ftrs = model_ft.classifier[6].in_features

model_ft = models.vgg16(pretrained=False)
num_ftrs = model_ft.classifier[6].in_features
# features = list(model_ft.classifier.children())[:-1] # Remove last layer
# features.extend([nn.Linear(num_ftrs, len(class_names))]) # Add our layer with 4 outputs
# model_ft = nn.Sequential(*features) # Replace the model classifier

model_ft.classifier[6] = nn.Linear(num_ftrs, 196)  # Linear(in_features, out_features); out must match the number of class folders (196 here)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.Adam(model_ft.parameters(), lr=1e-4)
exp_lr_scheduler = lr_scheduler.CosineAnnealingLR(optimizer_ft, 100)  # 100 = T_max, the annealing period in epochs

# Train and evaluate
model_ft = train_model(model_ft,
                       criterion,
                       optimizer_ft,
                       exp_lr_scheduler,
                       num_epochs=100)
#visualize_model(model_ft)
#test_model(model_ft, criterion, optimizer_ft)
sys.exit(0)
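# --- Added sketch (not part of the original example) ------------------------
# The 100 passed above is T_max: with num_epochs=100 the LR reaches eta_min
# (0 by default) at the final epoch. If stepping continued past T_max, the LR
# would rise again, since the cosine schedule is periodic with period 2 * T_max.
# Self-contained check with a dummy parameter (illustrative _demo_* names):
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

_demo_param = nn.Parameter(torch.zeros(1))
_demo_opt = optim.Adam([_demo_param], lr=1e-4)
_demo_sched = lr_scheduler.CosineAnnealingLR(_demo_opt, T_max=100)
_demo_lrs = []
for _ in range(200):
    _demo_opt.step()
    _demo_sched.step()
    _demo_lrs.append(_demo_opt.param_groups[0]['lr'])
# _demo_lrs[99] is ~0 (end of the first period) and _demo_lrs[199] is back near 1e-4
# -----------------------------------------------------------------------------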
Beispiel #21
0
def get_scheduler(cfg, optimizer):
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=cfg.general.epochs,
                                               eta_min=cfg.optimizer.lr)
    return scheduler
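# --- Added note (not part of the original example) ---------------------------
# eta_min is the floor the LR decays to. If cfg.optimizer.lr here is the same
# value used as the optimizer's initial LR, the schedule stays flat (it would
# "decay" from lr to lr). A quick guard one might add when wiring up a config
# (hypothetical helper, not from the example):
def _demo_check_cosine_cfg(base_lr, eta_min):
    assert eta_min < base_lr, "eta_min should be below the initial LR for the cosine decay to have any effect"
# -----------------------------------------------------------------------------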
def exp(subject_id):
    PATH = '../datasets/'
    with open(PATH + 'bcic_datasets_prep.pkl', 'rb') as f:
        data = pickle.load(f)

    test_subj = np.r_[subject_id - 1]

    print('test subj:' + str(test_subj))
    train_subj = np.setdiff1d(np.r_[0:9], test_subj)

    # rearrange subject labels
    new_subj_id = 0
    for ids in train_subj:
        data[ids].subj_id = new_subj_id
        new_subj_id += 1
        print(data[ids].subj_id)

    tr = []
    val = []
    # hold out 10% of each training subject for validation
    for ids in train_subj:
        train_size = int(0.9 * len(data[ids]))
        test_size = len(data[ids]) - train_size
        # sequential 90/10 split per subject (random_split below is the shuffled alternative)
        indices = list(range(len(data[ids])))
        train_indices, val_indices = indices[:train_size], indices[train_size:]
        tr_i = torch.utils.data.Subset(data[ids], indices=train_indices)
        val_i = torch.utils.data.Subset(data[ids], indices=val_indices)
        # tr_i, val_i = torch.utils.data.random_split(data[ids], [train_size, test_size])
        tr.append(tr_i)
        val.append(val_i)

    train_set = torch.utils.data.ConcatDataset(tr)
    valid_set = torch.utils.data.ConcatDataset(val)

    test_set = torch.utils.data.ConcatDataset([data[ids] for ids in test_subj])

    crop_size = 1125
    embedding_net = EEGNet_v2(n_classes, 22, 1125)

    model = FcClfNet_mult(embedding_net)
    discriminator = Discriminator([model.embedding_net.num_hidden, 8],
                                  grl=True,
                                  reverse=True)
    print(model)

    batch_size = 64
    epochs = 100

    # For deep4 they should be:
    lr = 1 * 0.01
    weight_decay = 0.5 * 0.001

    batch_size = 64
    n_epochs = 200

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_set,
                                               batch_size=batch_size,
                                               shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    # Send model to GPU
    if cuda:
        model.cuda(device=device)
        discriminator.cuda(device=device)

    from torch.optim import lr_scheduler
    import torch.optim as optim

    import argparse
    parser = argparse.ArgumentParser(
        description='cross subject domain adaptation')
    parser.add_argument('--batch-size',
                        type=int,
                        default=50,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=50,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()
    args.gpuidx = 1
    args.seed = 0
    args.use_tensorboard = False
    args.save_model = False

    # optimizer_C = optim.AdamW(list(model.embedding_net.parameters())+list(model.class_classifier.parameters()), lr=0.001, weight_decay=0.5 * 0.001)
    # optimizer_D = optim.AdamW(discriminator.parameters(), lr=0.001, weight_decay=0.5 * 0.001)

    optimizer_C = optim.AdamW(model.parameters(),
                              lr=0.01,
                              weight_decay=0.5 * 0.001)
    optimizer_D = optim.AdamW(discriminator.parameters(),
                              lr=0.01,
                              weight_decay=0.5 * 0.001)

    scheduler_C = lr_scheduler.CosineAnnealingLR(optimizer_C, T_max=100)
    scheduler_D = lr_scheduler.CosineAnnealingLR(optimizer_D, T_max=100)
    scheduler = [scheduler_C, scheduler_D]
    # scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer_C, T_0 = 50)

    #
    # #test lr
    # lr = []
    # for i in range(200):
    #     scheduler.step()
    #     lr.append(scheduler.get_lr())
    #
    # import matplotlib.pyplot as plt
    # plt.plot(lr)

    import pandas as pd
    results_columns = [
        'val_loss', 'test_loss', 'val_accuracy', 'test_accuracy'
    ]
    df = pd.DataFrame(columns=results_columns)

    for epochidx in range(1, epochs):
        print(epochidx)
        train_mult(10,
                   model,
                   discriminator,
                   device,
                   train_loader,
                   optimizer_C,
                   optimizer_D,
                   scheduler,
                   cuda,
                   args.gpuidx,
                   epoch=epochidx)

        val_loss, val_score = eval_mult(model, device, valid_loader)
        test_loss, test_score = eval_mult(model, device, test_loader)
        results = {
            'val_loss': val_loss,
            'test_loss': test_loss,
            'val_accuracy': val_score,
            'test_accuracy': test_score
        }
        df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)
        print(results)

    return df
embedding_net = EmbeddingNet()
model = ClassificationNet(embedding_net, nclasses)
if cuda:
    model = nn.DataParallel(model).cuda()
    loss_fn = loss_fn.cuda()

optimizer = optim.SGD([{
    'params': model.parameters()
}, {
    'params': loss_fn.parameters()
}],
                      lr=lr,
                      nesterov=True,
                      momentum=0.9,
                      weight_decay=1e-4)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                           20,
                                           eta_min=1e-5,
                                           last_epoch=-1)

fit(dataset_tr,
    model,
    loss_fn,
    optimizer,
    scheduler,
    niterations,
    cuda,
    log_interval,
    metrics=[AccumulatedAccuracyMetric()],
    mining_tech='Doppleganger')
def train_model_snapshot(model, criterion, eval_criterion, lr, dataloaders,
                         dataset_sizes, device, num_cycles,
                         num_epochs_per_cycle):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_loss = 1000000.0
    model_w_arr = []
    for cycle in range(num_cycles):
        #initialize optimizer and scheduler each cycle
        #optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        optimizer = optim.Adam(
            [
                {
                    "params": model.model_ft.parameters(),
                    "lr": lr * 3 / time_steps
                },
                #{"params": model.fc1.parameters(), "lr": lr},
                {
                    "params": model.fc.parameters(),
                    "lr": lr
                }
            ],
            lr=lr)
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer, num_epochs_per_cycle * len(dataloaders['train']))
        for epoch in range(num_epochs_per_cycle):
            print('Cycle {}: Epoch {}/{}'.format(cycle, epoch,
                                                 num_epochs_per_cycle - 1))
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()  # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        loss = criterion(outputs, labels.reshape(-1, 1))
                        eval_loss = eval_criterion(outputs,
                                                   labels.reshape(-1, 1))
                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                            scheduler.step()

                    # statistics
                    running_loss += eval_loss.item() * inputs.size(0)

                epoch_loss = np.sqrt(running_loss / dataset_sizes[phase])

                print('{} Loss: {:.4f}'.format(phase, epoch_loss))

                # deep copy the model
                if phase == 'val' and epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
            print()
        # deep copy snapshot
        model_w_arr.append(copy.deepcopy(model.state_dict()))

    ensemble_loss = 0.0

    #predict on validation using snapshots
    for inputs, labels in dataloaders['val']:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # forward
        # track history if only in train
        pred = torch.zeros((inputs.shape[0], 1),
                           dtype=torch.float32).to(device)
        for weights in model_w_arr:
            model.load_state_dict(weights)
            model.eval()
            outputs = model(inputs)
            pred += outputs

        pred /= num_cycles
        eval_loss = eval_criterion(pred, labels.reshape(-1, 1))
        ensemble_loss += eval_loss.item() * inputs.size(0)

    ensemble_loss /= dataset_sizes['val']
    ensemble_loss = np.sqrt(ensemble_loss)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Ensemble Loss : {:4f}, Best val Loss: {:4f}'.format(
        ensemble_loss, best_loss))

    return model_w_arr, ensemble_loss, best_loss
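# --- Added sketch (not part of the original example) ------------------------
# Because train_model_snapshot re-creates the CosineAnnealingLR at the start of
# every cycle and steps it once per batch, the LR decays towards zero within a
# cycle and jumps back to the base LR when the next cycle begins; those warm
# restarts are what make the snapshots diverse. Rough closed-form shape of the
# resulting schedule (placeholder numbers, eta_min assumed 0):
import math

def _demo_snapshot_lr(step, steps_per_cycle, base_lr=1e-3):
    t = step % steps_per_cycle  # restarts at every cycle boundary
    return 0.5 * base_lr * (1 + math.cos(math.pi * t / steps_per_cycle))
# -----------------------------------------------------------------------------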
from __future__ import print_function
Beispiel #26
0
def main_worker(gpu, ngpus_per_node, config):
    """

    Args:
      gpu:
      ngpus_per_node:
      config:
    """
    global CONFIG, best_acc1
    CONFIG, best_acc1 = config, 0
    train_set = config.dataset_type(CONFIG.dataset_path, SplitEnum.training)
    val_set = config.dataset_type(CONFIG.dataset_path, SplitEnum.validation)

    if CONFIG.distributed:
        if CONFIG.dist_url == "env://" and CONFIG.rank == -1:
            CONFIG.rank = int(os.environ["RANK"])
        if CONFIG.multiprocessing_distributed:
            CONFIG.rank = CONFIG.rank * ngpus_per_node + gpu
        distributed.init_process_group(
            backend=CONFIG.dist_backend,
            init_method=CONFIG.dist_url,
            world_size=CONFIG.world_size,
            rank=CONFIG.rank,
        )

    model = make_san(
        self_attention_type=SelfAttentionTypeEnum(CONFIG.self_attention_type),
        layers=CONFIG.layers,
        kernels=CONFIG.kernels,
        num_classes=train_set.response_shape[0],
    )
    criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.ignore_label)
    optimiser = torch.optim.SGD(
        model.parameters(),
        lr=CONFIG.base_lr,
        momentum=CONFIG.momentum,
        weight_decay=CONFIG.weight_decay,
    )
    if CONFIG.scheduler == "step":
        scheduler = lr_scheduler.MultiStepLR(
            optimiser, milestones=CONFIG.step_epochs, gamma=0.1
        )
    elif CONFIG.scheduler == "cosine":
        scheduler = lr_scheduler.CosineAnnealingLR(optimiser, T_max=CONFIG.epochs)

    if is_main_process():
        global logger, writer
        logger = get_logger()
        writer = TensorBoardPytorchWriter(str(CONFIG.save_path))
        logger.info(CONFIG)
        logger.info("=> creating model ...")
        logger.info(f"Classes: {train_set.response_shape[0]}")
        logger.info(model)
    if CONFIG.distributed:
        torch.cuda.set_device(gpu)
        CONFIG.batch_size = int(CONFIG.batch_size / ngpus_per_node)
        CONFIG.batch_size_val = int(CONFIG.batch_size_val / ngpus_per_node)
        CONFIG.workers = int((CONFIG.workers + ngpus_per_node - 1) / ngpus_per_node)
        model = torch.nn.parallel.DistributedDataParallel(
            model.cuda(), device_ids=[gpu]
        )
    else:
        model = torch.nn.DataParallel(model.cuda())

    if CONFIG.weight:
        if Path(CONFIG.weight).is_file():
            if is_main_process():
                logger.info(f"=> loading weight '{CONFIG.weight}'")
            checkpoint = torch.load(CONFIG.weight)
            model.load_state_dict(checkpoint["state_dict"])
            if is_main_process():
                logger.info(f"=> loaded weight '{CONFIG.weight}'")
        else:
            if is_main_process():
                logger.info(f"=> no weight found at '{CONFIG.weight}'")

    if CONFIG.resume:
        if Path(CONFIG.resume).is_file():
            if is_main_process():
                logger.info(f"=> loading checkpoint '{CONFIG.resume}'")
            checkpoint = torch.load(
                CONFIG.resume, map_location=lambda storage, loc: storage.cuda(gpu)
            )
            CONFIG.start_epoch = checkpoint["epoch"]
            best_acc1 = checkpoint["top1_val"]
            model.load_state_dict(checkpoint["state_dict"])
            optimiser.load_state_dict(checkpoint["optimiser"])
            scheduler.load_state_dict(checkpoint["scheduler"])
            if is_main_process():
                global logger
                logger.info(
                    f"=> loaded checkpoint '{CONFIG.resume}' (epoch {checkpoint['epoch']})"
                )
        else:
            if is_main_process():
                global logger
                logger.info(f"=> no checkpoint found at '{CONFIG.resume}'")

    if CONFIG.distributed:
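        # shard each dataset across ranks; train_sampler.set_epoch() below reshuffles the shards every epoch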
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_set)
    else:
        train_sampler = None
        val_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=CONFIG.batch_size,
        shuffle=(train_sampler is None),
        num_workers=CONFIG.workers,
        pin_memory=True,
        sampler=train_sampler,
    )
    val_loader = torch.utils.data.DataLoader(
        val_set,
        batch_size=CONFIG.batch_size_val,
        shuffle=False,
        num_workers=CONFIG.workers,
        pin_memory=True,
        sampler=val_sampler,
    )

    for epoch in range(CONFIG.start_epoch, CONFIG.epochs):
        if CONFIG.distributed:
            train_sampler.set_epoch(epoch)
        (
            loss_train,
            mIoU_train,
            mAcc_train,
            allAcc_train,
            top1_train,
            top5_train,
        ) = train(train_loader, model, criterion, optimiser, epoch)
        loss_val, mIoU_val, mAcc_val, allAcc_val, top1_val, top5_val = validate(
            val_loader, model, criterion
        )
        scheduler.step()
        epoch_log = epoch + 1
        if is_main_process():
            writer.scalar("loss_train", loss_train, epoch_log)
            writer.scalar("mIoU_train", mIoU_train, epoch_log)
            writer.scalar("mAcc_train", mAcc_train, epoch_log)
            writer.scalar("allAcc_train", allAcc_train, epoch_log)
            writer.scalar("top1_train", top1_train, epoch_log)
            writer.scalar("top5_train", top5_train, epoch_log)
            writer.scalar("loss_val", loss_val, epoch_log)
            writer.scalar("mIoU_val", mIoU_val, epoch_log)
            writer.scalar("mAcc_val", mAcc_val, epoch_log)
            writer.scalar("allAcc_val", allAcc_val, epoch_log)
            writer.scalar("top1_val", top1_val, epoch_log)
            writer.scalar("top5_val", top5_val, epoch_log)

        if (epoch_log % CONFIG.save_freq == 0) and is_main_process():
            filename = CONFIG.save_path / f"train_epoch_{epoch_log}.pth"
            logger.info(f"Saving checkpoint to: {filename}")
            torch.save(
                {
                    "epoch": epoch_log,
                    "state_dict": model.state_dict(),
                    "optimiser": optimiser.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "top1_val": top1_val,
                    "top5_val": top5_val,
                },
                filename,
            )
            if top1_val > best_acc1:
                best_acc1 = top1_val
                shutil.copyfile(filename, CONFIG.save_path / "model_best.pth")
            if epoch_log / CONFIG.save_freq > 2:
                deletename = (
                    CONFIG.save_path
                    / f"train_epoch_{str(epoch_log - CONFIG.save_freq * 2)}.pth"
                )
                os.remove(deletename)
Example #27
0
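# NOTE: this excerpt begins mid-statement; the stray closing below appears to be the tail of a
# transforms.Compose([...]) argument for the CIFAR-10 training dataset.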
                                        [transforms.ToTensor(), normalize]))

train_dataset.classes = cifar_load_meta(dataset_root, base_folder, 'cifar10')
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           shuffle=True,
                                           num_workers=NUM_WORKERS,
                                           pin_memory=True)
validate_loader = torch.utils.data.DataLoader(dataset=validate_dataset,
                                              batch_size=VAL_BATCH_SIZE,
                                              shuffle=False,
                                              num_workers=NUM_WORKERS,
                                              pin_memory=True)

# schedulers config
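# CosineAnnealingLR decays the learning rate from its initial value to eta_min (default 0)
# along a half cosine over T_max=160 scheduler steps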
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, 160)

# criterions config
criterion = nn.CrossEntropyLoss()

# plugins config
plugins = []

plugins.append(LossMonitor())
plugins.append(TopKAccuracy(topk=(1, 5)))
plugins.append(IterationSummaryMonitor())
plugins.append(DistributionOfBNMonitor())
plugins.append(ClassAccuracy())


def dataforward(self, data, target):
Example #28
0
if torch.cuda.is_available():
    model = model.cuda()
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
model.to(device)

trainloader, testloader = get_cifar_loaders(args.data_loc)
optimizer = optim.SGD(
    [w for name, w in model.named_parameters() if "mask" not in name],
    lr=args.lr,
    momentum=0.9,
    weight_decay=args.weight_decay,
)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                           args.epochs,
                                           eta_min=1e-10)
criterion = nn.CrossEntropyLoss()

error_history = []
for epoch in tqdm(range(args.epochs)):
    train(model, trainloader, criterion, optimizer)
    validate(
        model,
        epoch,
        testloader,
        criterion,
        checkpoint=args.checkpoint if epoch != 2 else args.checkpoint + "_init",
    )
    scheduler.step()
Example #29
0
def main():
    args = parse_args()

    if args.name is None:
        args.name = '%s_%s' % (args.arch, datetime.now().strftime('%m%d%H'))

    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('- %s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('- %s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    if args.loss == 'CrossEntropyLoss':
        criterion = nn.CrossEntropyLoss().cuda()
    elif args.loss == 'FocalLoss':
        criterion = FocalLoss().cuda()
    elif args.loss == 'MSELoss':
        criterion = nn.MSELoss().cuda()
    elif args.loss == 'multitask':
        criterion = {
            'classification': nn.CrossEntropyLoss().cuda(),
            'regression': nn.MSELoss().cuda(),
        }
    else:
        raise NotImplementedError

    if args.pred_type == 'classification':
        num_outputs = 5
    elif args.pred_type == 'regression':
        num_outputs = 1
    elif args.loss == 'multitask':
        num_outputs = 6
    else:
        raise NotImplementedError

    cudnn.benchmark = True

    model = get_model(model_name=args.arch,
                      num_outputs=num_outputs,
                      freeze_bn=args.freeze_bn,
                      dropout_p=args.dropout_p)

    train_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.img_size)),
        transforms.RandomAffine(
            degrees=(args.rotate_min, args.rotate_max) if args.rotate else 0,
            translate=(args.translate_min, args.translate_max) if args.translate else None,
            scale=(args.rescale_min, args.rescale_max) if args.rescale else None,
            shear=(args.shear_min, args.shear_max) if args.shear else None,
        ),
        transforms.CenterCrop(args.input_size),
        transforms.RandomHorizontalFlip(p=0.5 if args.flip else 0),
        transforms.RandomVerticalFlip(p=0.5 if args.flip else 0),
        transforms.ColorJitter(
            brightness=0,
            contrast=args.contrast,
            saturation=0,
            hue=0),
        RandomErase(
            prob=args.random_erase_prob if args.random_erase else 0,
            sl=args.random_erase_sl,
            sh=args.random_erase_sh,
            r=args.random_erase_r),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    val_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.input_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    # data loading code
    if 'diabetic_retinopathy' in args.train_dataset:
        diabetic_retinopathy_dir = preprocess(
            'diabetic_retinopathy',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        diabetic_retinopathy_df = pd.read_csv('inputs/diabetic-retinopathy-resized/trainLabels.csv')
        diabetic_retinopathy_img_paths = \
            diabetic_retinopathy_dir + '/' + diabetic_retinopathy_df['image'].values + '.jpeg'
        diabetic_retinopathy_labels = diabetic_retinopathy_df['level'].values

    if 'aptos2019' in args.train_dataset:
        aptos2019_dir = preprocess(
            'aptos2019',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        aptos2019_df = pd.read_csv('inputs/train.csv')
        aptos2019_img_paths = aptos2019_dir + '/' + aptos2019_df['id_code'].values + '.png'
        aptos2019_labels = aptos2019_df['diagnosis'].values

    if args.train_dataset == 'aptos2019':
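        # build per-fold (train, val) splits of the APTOS 2019 images, stratified by diagnosis label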
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((aptos2019_img_paths[train_idx], aptos2019_img_paths[val_idx]))
            labels.append((aptos2019_labels[train_idx], aptos2019_labels[val_idx]))
    elif args.train_dataset == 'diabetic_retinopathy':
        img_paths = [(diabetic_retinopathy_img_paths, aptos2019_img_paths)]
        labels = [(diabetic_retinopathy_labels, aptos2019_labels)]
    elif 'diabetic_retinopathy' in args.train_dataset and 'aptos2019' in args.train_dataset:
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((np.hstack((aptos2019_img_paths[train_idx], diabetic_retinopathy_img_paths)), aptos2019_img_paths[val_idx]))
            labels.append((np.hstack((aptos2019_labels[train_idx], diabetic_retinopathy_labels)), aptos2019_labels[val_idx]))
    # else:
    #     raise NotImplementedError

    if args.pseudo_labels:
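        # append pseudo-labelled test-set images to the training half of every fold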
        test_df = pd.read_csv('probs/%s.csv' % args.pseudo_labels)
        test_dir = preprocess(
            'test',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        test_img_paths = test_dir + '/' + test_df['id_code'].values + '.png'
        test_labels = test_df['diagnosis'].values
        for fold in range(len(img_paths)):
            img_paths[fold] = (np.hstack((img_paths[fold][0], test_img_paths)), img_paths[fold][1])
            labels[fold] = (np.hstack((labels[fold][0], test_labels)), labels[fold][1])

    if 'messidor' in args.train_dataset:
        test_dir = preprocess(
            'messidor',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)

    folds = []
    best_losses = []
    best_scores = []

    for fold, ((train_img_paths, val_img_paths), (train_labels, val_labels)) in enumerate(zip(img_paths, labels)):
        print('Fold [%d/%d]' %(fold+1, len(img_paths)))

        if os.path.exists('models/%s/model_%d.pth' % (args.name, fold+1)):
            log = pd.read_csv('models/%s/log_%d.csv' %(args.name, fold+1))
            best_loss, best_score = log.loc[log['val_loss'].values.argmin(), ['val_loss', 'val_score']].values
            folds.append(str(fold + 1))
            best_losses.append(best_loss)
            best_scores.append(best_score)
            continue

        if args.remove_duplicate:
            md5_df = pd.read_csv('inputs/strMd5.csv')
            duplicate_img_paths = aptos2019_dir + '/' + md5_df[(md5_df.strMd5_count > 1) & (~md5_df.diagnosis.isnull())]['id_code'].values + '.png'
            print(duplicate_img_paths)
            for duplicate_img_path in duplicate_img_paths:
                train_labels = train_labels[train_img_paths != duplicate_img_path]
                train_img_paths = train_img_paths[train_img_paths != duplicate_img_path]
                val_labels = val_labels[val_img_paths != duplicate_img_path]
                val_img_paths = val_img_paths[val_img_paths != duplicate_img_path]

        # train
        train_set = Dataset(
            train_img_paths,
            train_labels,
            transform=train_transform)

        _, class_sample_counts = np.unique(train_labels, return_counts=True)
        # print(class_sample_counts)
        # weights = 1. / torch.tensor(class_sample_counts, dtype=torch.float)
        # weights = np.array([0.2, 0.1, 0.6, 0.1, 0.1])
        # samples_weights = weights[train_labels]
        # sampler = WeightedRandomSampler(
        #     weights=samples_weights,
        #     num_samples=11000,
        #     replacement=False)
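        # NOTE: `sampler` is only defined if the WeightedRandomSampler block above is re-enabled;
        # running with args.class_aware set would otherwise raise a NameError below.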
        train_loader = torch.utils.data.DataLoader(
            train_set,
            batch_size=args.batch_size,
            shuffle=False if args.class_aware else True,
            num_workers=4,
            sampler=sampler if args.class_aware else None)

        val_set = Dataset(
            val_img_paths,
            val_labels,
            transform=val_transform)
        val_loader = torch.utils.data.DataLoader(
            val_set,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=4)

        # create model
        model = get_model(model_name=args.arch,
                          num_outputs=num_outputs,
                          freeze_bn=args.freeze_bn,
                          dropout_p=args.dropout_p)
        model = model.cuda()
        if args.pretrained_model is not None:
            model.load_state_dict(torch.load('models/%s/model_%d.pth' % (args.pretrained_model, fold+1)))

        # print(model)

        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'AdamW':
            optimizer = optim.AdamW(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'RAdam':
            optimizer = RAdam(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'SGD':
            optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr,
                                  momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov)

        if args.scheduler == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=args.epochs, eta_min=args.min_lr)
        elif args.scheduler == 'ReduceLROnPlateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=args.factor, patience=args.patience,
                                                       verbose=1, min_lr=args.min_lr)

        log = {
            'epoch': [],
            'loss': [],
            'score': [],
            'val_loss': [],
            'val_score': [],
        }

        best_loss = float('inf')
        best_score = 0
        for epoch in range(args.epochs):
            print('Epoch [%d/%d]' % (epoch + 1, args.epochs))

            # train for one epoch
            train_loss, train_score = train(
                args, train_loader, model, criterion, optimizer, epoch)
            # evaluate on validation set
            val_loss, val_score = validate(args, val_loader, model, criterion)

            if args.scheduler == 'CosineAnnealingLR':
                scheduler.step()
            elif args.scheduler == 'ReduceLROnPlateau':
                scheduler.step(val_loss)

            print('loss %.4f - score %.4f - val_loss %.4f - val_score %.4f'
                  % (train_loss, train_score, val_loss, val_score))

            log['epoch'].append(epoch)
            log['loss'].append(train_loss)
            log['score'].append(train_score)
            log['val_loss'].append(val_loss)
            log['val_score'].append(val_score)

            pd.DataFrame(log).to_csv('models/%s/log_%d.csv' % (args.name, fold+1), index=False)

            if val_loss < best_loss:
                torch.save(model.state_dict(), 'models/%s/model_%d.pth' % (args.name, fold+1))
                best_loss = val_loss
                best_score = val_score
                print("=> saved best model")

        print('val_loss:  %f' % best_loss)
        print('val_score: %f' % best_score)

        folds.append(str(fold + 1))
        best_losses.append(best_loss)
        best_scores.append(best_score)

        results = pd.DataFrame({
            'fold': folds + ['mean'],
            'best_loss': best_losses + [np.mean(best_losses)],
            'best_score': best_scores + [np.mean(best_scores)],
        })

        print(results)
        results.to_csv('models/%s/results.csv' % args.name, index=False)

        torch.cuda.empty_cache()

        if not args.cv:
            break
Example #30
0
def main():
    args = vars(parse_args_func())

    #config_file = "../configs/config_SN7.json"
    config_file = args['config']  # "../configs/config_v1.json"
    config_dict = json.loads(open(config_file, 'rt').read())
    #config_dict = json.loads(open(sys.argv[1], 'rt').read())

    file_dict = config_dict['file_path']
    config = config_dict['opt_config']

    input_folder = file_dict['input_path']  # '../inputs'
    checkpoint_folder = file_dict['checkpoint_path']  # '../checkpoint'
    model_folder = file_dict['model_path']  # '../models'

    if 'False' in config['deep_supervision']:
        config['deep_supervision'] = False
    else:
        config['deep_supervision'] = True

    if 'False' in config['nesterov']:
        config['nesterov'] = False
    else:
        config['nesterov'] = True

    if 'None' in config['name']:
        config['name'] = None

    if config['name'] is None:
        config['name'] = '%s_%s_segmodel' % (config['dataset'], config['arch'])
    os.makedirs(os.path.join(model_folder, '%s' % config['name']),
                exist_ok=True)

    if not os.path.isdir(checkpoint_folder):
        os.mkdir(checkpoint_folder)
    log_name = config['name']
    log_dir = os.path.join(checkpoint_folder, log_name)
    writer = SummaryWriter(logdir=log_dir)

    print('-' * 20)
    for key in config:
        print('%s: %s' % (key, config[key]))
    print('-' * 20)

    with open(os.path.join(model_folder, '%s/config.yml' % config['name']),
              'w') as f:
        yaml.dump(config, f)

    # define loss function (criterion)
    if config['loss'] == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[config['loss']]().cuda()

    cudnn.benchmark = True

    # create model
    print("=> creating model %s" % config['arch'])
    model = archs.__dict__[config['arch']](config['num_classes'],
                                           config['input_channels'],
                                           config['deep_supervision'])

    if 'False' in config['resume']:
        config['resume'] = False
    else:
        config['resume'] = True
    resume_flag = config['resume']
    if resume_flag:
        save_path = os.path.join(model_folder, config['name'], 'model.pth')
        weights = torch.load(save_path)
        model.load_state_dict(weights)
        name_yaml = config['name']
        with open(os.path.join(model_folder, '%s/config.yml' % name_yaml),
                  'r') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        #start_epoch = config['epochs']
        start_epoch = 0
    else:
        start_epoch = 0

    model = model.cuda()
    if 'effnet' in config['arch']:
        eff_flag = True
    else:
        eff_flag = False

    if eff_flag:
        cnn_subs = list(model.encoder.eff_conv.children())[1:]
        #cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
        #cnn_params = [item for sublist in cnn_params for item in sublist]

    summary(model,
            (config['input_channels'], config['input_w'], config['input_h']))
    params = filter(lambda p: p.requires_grad, model.parameters())
    if eff_flag:
        params = list(params) + list(model.encoder.conv_a.parameters())
    model = torch.nn.DataParallel(model)

    if config['optimizer'] == 'Adam':
        optimizer = optim.Adam(params,
                               lr=config['lr'],
                               weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'SGD':
        optimizer = optim.SGD(params,
                              lr=config['lr'],
                              momentum=config['momentum'],
                              nesterov=config['nesterov'],
                              weight_decay=config['weight_decay'])
    else:
        raise NotImplementedError

    if eff_flag:
        cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
        cnn_params = [item for sublist in cnn_params for item in sublist]
        cnn_optimizer = torch.optim.Adam(cnn_params,
                                         lr=0.001,
                                         weight_decay=config['weight_decay'])
        #cnn_optimizer = None

    else:
        cnn_optimizer = None
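    # LR schedulers are only attached when the main optimizer is SGD; with Adam the LR stays constant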
    if config['optimizer'] == 'SGD':
        if config['scheduler'] == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=config['epochs'], eta_min=config['min_lr'])
        elif config['scheduler'] == 'ReduceLROnPlateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(
                optimizer,
                factor=config['factor'],
                patience=config['patience'],
                verbose=1,
                min_lr=config['min_lr'])
        elif config['scheduler'] == 'MultiStepLR':
            scheduler = lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[int(e) for e in config['milestones'].split(',')],
                gamma=config['gamma'])
        elif config['scheduler'] == 'ConstantLR':
            scheduler = None
        else:
            raise NotImplementedError
    else:
        scheduler = None

    # Data loading code
    img_ids = glob(
        os.path.join(input_folder, config['dataset'], 'images', 'training',
                     '*' + config['img_ext']))
    train_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    #img_dir = os.path.join(input_folder, config['dataset'], 'images', 'training')
    #mask_dir = os.path.join(input_folder, config['dataset'], 'annotations', 'training')
    #train_image_mask = image_to_afile(img_dir, mask_dir, None, train_img_ids, config)

    img_ids = glob(
        os.path.join(input_folder, config['val_dataset'], 'images',
                     'validation', '*' + config['img_ext']))
    val_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    img_ids = glob(
        os.path.join(input_folder, config['val_dataset'], 'images', 'test',
                     '*' + config['img_ext']))
    test_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    train_transform = Compose([
        #transforms.RandomScale ([config['scale_min'], config['scale_max']]),
        #transforms.RandomRotate90(),
        transforms.Rotate([config['rotate_min'], config['rotate_max']],
                          value=mean,
                          mask_value=0),
        transforms.Flip(),
        #transforms.HorizontalFlip (),
        transforms.HueSaturationValue(hue_shift_limit=10,
                                      sat_shift_limit=10,
                                      val_shift_limit=10),
        transforms.RandomBrightnessContrast(brightness_limit=0.10,
                                            contrast_limit=0.10,
                                            brightness_by_max=True),
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])

    val_transform = Compose([
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])

    train_dataset = Dataset(img_ids=train_img_ids,
                            img_dir=os.path.join(input_folder,
                                                 config['dataset'], 'images',
                                                 'training'),
                            mask_dir=os.path.join(input_folder,
                                                  config['dataset'],
                                                  'annotations', 'training'),
                            img_ext=config['img_ext'],
                            mask_ext=config['mask_ext'],
                            num_classes=config['num_classes'],
                            input_channels=config['input_channels'],
                            transform=train_transform,
                            from_file=None)
    val_dataset = Dataset(img_ids=val_img_ids,
                          img_dir=os.path.join(input_folder,
                                               config['val_dataset'], 'images',
                                               'validation'),
                          mask_dir=os.path.join(input_folder,
                                                config['val_dataset'],
                                                'annotations', 'validation'),
                          img_ext=config['img_ext'],
                          mask_ext=config['mask_ext'],
                          num_classes=config['num_classes'],
                          input_channels=config['input_channels'],
                          transform=val_transform,
                          from_file=None)
    test_dataset = Dataset(img_ids=test_img_ids,
                           img_dir=os.path.join(input_folder,
                                                config['val_dataset'],
                                                'images', 'test'),
                           mask_dir=os.path.join(input_folder,
                                                 config['val_dataset'],
                                                 'annotations', 'test'),
                           img_ext=config['img_ext'],
                           mask_ext=config['mask_ext'],
                           num_classes=config['num_classes'],
                           input_channels=config['input_channels'],
                           transform=val_transform,
                           from_file=None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_workers'],
        drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,  #config['batch_size'],
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,  #config['batch_size'],
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)

    log = OrderedDict([
        ('epoch', []),
        ('lr', []),
        ('loss', []),
        ('iou', []),
        ('dice', []),
        ('val_loss', []),
        ('val_iou', []),
        ('val_dice', []),
    ])

    best_iou = 0
    trigger = 0
    Best_dice = 0
    iou_AtBestDice = 0
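    # `trigger` counts epochs since the last val-IoU improvement and drives the early-stopping check below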
    for epoch in range(start_epoch, config['epochs']):
        print('{:s} Epoch [{:d}/{:d}]'.format(config['arch'], epoch,
                                              config['epochs']))
        # train for one epoch
        train_log = train(epoch, config, train_loader, model, criterion,
                          optimizer, cnn_optimizer)
        # evaluate on validation set
        val_log = validate(config, val_loader, model, criterion)
        test_log = validate(config, test_loader, model, criterion)

        # step the scheduler after validation so ReduceLROnPlateau can see val_log['loss']
        if config['optimizer'] == 'SGD':
            if config['scheduler'] == 'CosineAnnealingLR':
                scheduler.step()
            elif config['scheduler'] == 'ReduceLROnPlateau':
                scheduler.step(val_log['loss'])
            elif config['scheduler'] == 'MultiStepLR':
                scheduler.step()

        if Best_dice < test_log['dice']:
            Best_dice = test_log['dice']
            iou_AtBestDice = test_log['iou']
        print(
            'loss %.4f - iou %.4f - dice %.4f - val_loss %.4f - val_iou %.4f - val_dice %.4f - test_iou %.4f - test_dice %.4f - Best_dice %.4f - iou_AtBestDice %.4f'
            % (train_log['loss'], train_log['iou'], train_log['dice'],
               val_log['loss'], val_log['iou'], val_log['dice'],
               test_log['iou'], test_log['dice'], Best_dice, iou_AtBestDice))

        save_tensorboard(writer, train_log, val_log, test_log, epoch)
        log['epoch'].append(epoch)
        log['lr'].append(config['lr'])
        log['loss'].append(train_log['loss'])
        log['iou'].append(train_log['iou'])
        log['dice'].append(train_log['dice'])
        log['val_loss'].append(val_log['loss'])
        log['val_iou'].append(val_log['iou'])
        log['val_dice'].append(val_log['dice'])

        pd.DataFrame(log).to_csv(os.path.join(model_folder,
                                              '%s/log.csv' % config['name']),
                                 index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(
                model.state_dict(),
                os.path.join(model_folder, '%s/model.pth' % config['name']))
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if config['early_stopping'] >= 0 and trigger >= config['early_stopping']:
            print("=> early stopping")
            break

        torch.cuda.empty_cache()