Code Example #1
def eval_tta(config, augment, reporter):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], augment[
        'cv_fold'], augment['save_path']

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'],
                                    augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path + '.pth')
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model = nn.DataParallel(model).cuda()
    model.eval()

    _, src_tl, src_validloader, src_ttl = get_dataloaders(
        C.get()['dataset'],
        C.get()['batch'],
        augment['dataroot'],
        cv_ratio_test,
        cv_num,  # NOTE: cv_num is not defined in this function; it is assumed to be a module-level global (number of CV folds)
        split_idx=cv_fold,
        target=False,
        random_range=C.get()['args'].random_range)

    del src_tl, src_ttl

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')

    emd_loss = nn.DataParallel(emdModule()).cuda()

    losses = []
    corrects = []
    for data in src_validloader:
        with torch.no_grad():
            point_cloud = data['point_cloud'].cuda()
            label = torch.ones_like(data['label'], dtype=torch.int64).cuda()
            trans_pc = data['transformed']

            pred = model(trans_pc)

            if C.get()['args'].use_emd_false:
                loss_emd = (torch.mean(emd_loss(point_cloud.permute(0, 2, 1),
                                                trans_pc.permute(0, 2, 1), 0.05, 3000)[0])).unsqueeze(0) \
                           * C.get()['args'].emd_coeff
            else:
                loss_emd = torch.tensor([0.0], device=point_cloud.device)  # keep on the same device as the CE loss below

            if C.get()['args'].no_dc:
                loss = loss_emd
            else:
                loss = loss_emd + loss_fn(pred, label)
            # print(loss)
            losses.append(loss.detach().cpu().numpy())

            pred = pred.max(dim=1)[1]
            pred = pred.t()
            correct = float(
                torch.sum(pred == label).item()) / pred.size(0) * 100
            corrects.append(correct)
            del loss, correct, pred, data, label, loss_emd

    losses = np.concatenate(losses)
    losses_min = np.min(losses, axis=0).squeeze()
    corrects_max = max(corrects)
    metrics.add_dict({
        'minus_loss': -1 * np.sum(losses_min),
        'correct': np.sum(corrects_max),
        # 'cnt': len(corrects_max)
    })
    del corrects, corrects_max

    del model
    # metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # print(metrics)
    reporter(minus_loss=metrics['minus_loss'],
             top1_valid=metrics['correct'],
             elapsed_time=gpu_secs,
             done=True)
    return metrics['minus_loss']
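
The Accumulator used above, and again in Example #7, only needs to support add_dict, key lookup, and division by a key name. Below is a minimal sketch consistent with that usage; the real class ships with FastAutoAugment, and the semantics of `metrics / 'cnt'` are my reading of how Example #7 averages by the counter entry.

import collections

class Accumulator:
    """Minimal sketch of the metric accumulator assumed by these examples."""
    def __init__(self):
        self.metrics = collections.defaultdict(float)

    def add_dict(self, d):
        for k, v in d.items():
            self.metrics[k] += v

    def __getitem__(self, key):
        return self.metrics[key]

    def __truediv__(self, key):
        # `metrics / 'cnt'` divides every metric (except the counter itself) by the counter.
        out = Accumulator()
        denom = self.metrics[key]
        for k, v in self.metrics.items():
            out.metrics[k] = v if k == key else v / denom
        return out
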
Code Example #2
def train_and_eval(tag,
                   dataroot,
                   metric='last',
                   save_path=None,
                   only_eval=False,
                   unsupervised=False,
                   mode=None):
    max_epoch = C.get()['epoch']
    trainloader, unsuploader, testloader = get_dataloaders(
        C.get()['dataset'],
        C.get()['batch'],
        C.get()['batch_unsup'],
        dataroot,
        mode=mode,
        n_labeled=args.n_labeled)

    # create a model & an optimizer
    model = get_model(C.get()['model'],
                      num_class(C.get()['dataset']),
                      data_parallel=True)

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get(
                                  'momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' %
                         C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=t_max,
                                                               eta_min=0.)
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(logdir='./logs/%s/%s' % (tag, x))
        for x in ['train', 'test']
    ]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path):
        data = torch.load(save_path)
        model.load_state_dict(data['model'])
        optimizer.load_state_dict(data['optimizer'])
        epoch_start = data['epoch']

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['test'] = run_epoch(model,
                               testloader,
                               unsuploader,
                               criterion,
                               None,
                               desc_default='*test',
                               epoch=epoch_start,
                               writer=writers[1])
        for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                              ['train', 'test']):
            if setname not in rs:  # only 'test' is populated in eval-only mode
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    global best_valid_top1
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        # 'small' train mode forces supervised-only training; otherwise the
        # caller's `unsupervised` flag is respected.
        if args.train_mode == 'small':
            print('train_mode=small: supervised training only')
        rs['train'] = run_epoch(model,
                                trainloader,
                                unsuploader,
                                criterion,
                                optimizer,
                                desc_default='train',
                                epoch=epoch,
                                writer=writers[0],
                                verbose=True,
                                unsupervised=(unsupervised
                                              and args.train_mode != 'small'),
                                scheduler=scheduler)
        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        model.eval()
        if epoch % (10 if 'cifar' in C.get()['dataset'] else
                    30) == 0 or epoch == max_epoch:
            rs['test'] = run_epoch(model,
                                   testloader,
                                   unsuploader,
                                   criterion,
                                   None,
                                   desc_default='*test',
                                   epoch=epoch,
                                   writer=writers[1],
                                   verbose=True)

            if best_valid_top1 < rs['test']['top1']:
                best_valid_top1 = rs['test']['top1']

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:  # TODO
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('test_top1/best', rs['test']['top1'],
                                      epoch)

            # save checkpoint
            if save_path:
                logger.info('save model@%d to %s' % (epoch, save_path))
                torch.save(
                    {
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict()
                    }, save_path)

    del model

    return result
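
Most of these examples wrap the cosine schedule in GradualWarmupScheduler: the learning rate ramps linearly from base_lr toward multiplier * base_lr over the warmup epochs, after which the wrapped scheduler takes over (which is why t_max is reduced by the warmup length above). Below is a minimal sketch of that behavior, assuming the semantics of the warmup_scheduler package; the class name is mine and this is an illustration, not the package's actual implementation.

from torch.optim.lr_scheduler import _LRScheduler

class GradualWarmupSketch(_LRScheduler):
    """Sketch: linear warmup to multiplier * base_lr, then delegate to after_scheduler."""
    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler):
        self.multiplier = multiplier
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        super().__init__(optimizer)

    def get_lr(self):
        if self.last_epoch >= self.total_epoch:
            self.finished = True
            return [group['lr'] for group in self.after_scheduler.optimizer.param_groups]
        # interpolate between base_lr and multiplier * base_lr during warmup
        warm = (self.multiplier - 1.0) * self.last_epoch / self.total_epoch + 1.0
        return [base_lr * warm for base_lr in self.base_lrs]

    def step(self, epoch=None):
        if self.finished:
            self.after_scheduler.step()  # hand off once warmup is done
        else:
            super().step(epoch)
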
Code Example #3
def train_and_eval(config, tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last', save_path=None, only_eval=False, local_rank=0, evaluation_interval=5):

    # ckpt = torch.load(save_path)

    total_batch = config["batch"]
    if local_rank >= 0:
        dist.init_process_group(backend='nccl', init_method='env://', world_size=int(os.environ['WORLD_SIZE']))
        device = torch.device('cuda', local_rank)
        torch.cuda.set_device(device)

        config['lr'] *= dist.get_world_size()  # scale LR linearly with world size
        logger.info(f'local batch={config["batch"]} world_size={dist.get_world_size()} ----> total batch={config["batch"] * dist.get_world_size()}')
        total_batch = config["batch"] * dist.get_world_size()

    is_master = local_rank < 0 or dist.get_rank() == 0
    if is_master:
        add_filehandler(logger, 'master' + '.log')

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = config['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(config['dataset'], config['batch'], dataroot, test_ratio, split_idx=cv_fold, multinode=(local_rank >= 0))

    # create a model & an optimizer
    model = get_model(config['model'], num_class(config['dataset']), local_rank=local_rank)
    model_ema = get_model(config['model'], num_class(config['dataset']), local_rank=-1)
    model_ema.eval()

    criterion_ce = criterion = CrossEntropyLabelSmooth(num_class(config['dataset']), config.get('lb_smooth', 0))
    if config.get('mixup', 0.0) > 0.0:
        criterion = CrossEntropyMixUpLabelSmooth(num_class(config['dataset']), config.get('lb_smooth', 0))
    if config['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=config['lr'],
            momentum=config['optimizer'].get('momentum', 0.9),
            weight_decay=0.0,
            nesterov=config['optimizer'].get('nesterov', True)
        )
    elif config['optimizer']['type'] == 'rmsprop':
        optimizer = RMSpropTF(
            model.parameters(),
            lr=config['lr'],
            weight_decay=0.0,
            alpha=0.9, momentum=0.9,
            eps=0.001
        )
    else:
        raise ValueError('invalid optimizer type=%s' % config['optimizer']['type'])

    lr_scheduler_type = config['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['epoch'], eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'efficientnet':
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 0.97 ** int((x + config['lr_schedule']['warmup']['epoch']) / 2.4))
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if config['lr_schedule'].get('warmup', None) and config['lr_schedule']['warmup']['epoch'] > 0:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=config['lr_schedule']['warmup']['multiplier'],
            total_epoch=config['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler
        )

    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']]

    if config['optimizer']['ema'] > 0.0 and is_master:
        # https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856/4?u=ildoonet
        ema = EMA(config['optimizer']['ema'])
    else:
        ema = None

    result = OrderedDict()
    epoch_start = 1
    # TODO: should only_eval be forced off when no save_path is given?
    if save_path != 'test.pth':     # every rank must load the weights here; they cannot be broadcast from master alone
        if save_path and not os.path.exists(save_path):
            import torch.utils.model_zoo as model_zoo
            data = model_zoo.load_url('https://download.pytorch.org/models/resnet50-19c8e357.pth',
                               model_dir=os.path.join(os.getcwd(), 'FastAutoAugment/models'))
            if config['dataset'] == 'cifar10':
                data.pop('fc.weight')
                data.pop('fc.bias')
                model_dict = model.state_dict()
                model_dict.update(data)
                model.load_state_dict(model_dict)
                torch.save(model_dict, save_path)

        logger.info('%s file found. loading...' % save_path)
        data = torch.load(save_path)
        key = 'model' if 'model' in data else 'state_dict'

        if 'epoch' not in data:
            model.load_state_dict(data)
        else:
            logger.info('checkpoint epoch@%d' % data['epoch'])
            if not isinstance(model, (DataParallel, DistributedDataParallel)):
                model.load_state_dict({k.replace('module.', ''): v for k, v in data[key].items()})
            else:
                model.load_state_dict({k if 'module.' in k else 'module.'+k: v for k, v in data[key].items()})
            logger.info('optimizer.load_state_dict+')
            optimizer.load_state_dict(data['optimizer'])
            if data['epoch'] < config['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True
            if ema is not None:
                ema.shadow = data.get('ema', {}) if isinstance(data.get('ema', {}), dict) else data['ema'].state_dict()
        del data

    if local_rank >= 0:
        for name, x in model.state_dict().items():
            dist.broadcast(x, 0)
        logger.info(f'multinode init. local_rank={dist.get_rank()} is_master={is_master}')
        torch.cuda.synchronize()

    tqdm_disabled = bool(os.environ.get('TASK_NAME', '')) and local_rank != 0  # KakaoBrain Environment

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(config, model, trainloader, criterion, None, desc_default='train', epoch=0, writer=writers[0], is_master=is_master)

        with torch.no_grad():
            rs['valid'] = run_epoch(config, model, validloader, criterion, None, desc_default='valid', epoch=0, writer=writers[1], is_master=is_master)
            rs['test'] = run_epoch(config, model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2], is_master=is_master)
            if ema is not None and len(ema) > 0:
                model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                rs['valid'] = run_epoch(config, model_ema, validloader, criterion_ce, None, desc_default='valid(EMA)', epoch=0, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(config, model_ema, testloader_, criterion_ce, None, desc_default='*test(EMA)', epoch=0, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if local_rank >= 0:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(config, model, trainloader, criterion, optimizer, desc_default='train', epoch=epoch, writer=writers[0], verbose=(is_master and local_rank <= 0), scheduler=scheduler, ema=ema, wd=config['optimizer']['decay'], tqdm_disabled=tqdm_disabled)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if ema is not None and config['optimizer']['ema_interval'] > 0 and epoch % config['optimizer']['ema_interval'] == 0:
            logger.info(f'ema synced+ rank={dist.get_rank()}')
            model.load_state_dict(ema.state_dict())
            for name, x in model.state_dict().items():
                dist.broadcast(x, 0)
            torch.cuda.synchronize()
            logger.info(f'ema synced- rank={dist.get_rank()}')

        if is_master and (epoch % evaluation_interval == 0 or epoch == max_epoch):
            with torch.no_grad():
                rs['valid'] = run_epoch(config, model, validloader, criterion_ce, None, desc_default='valid', epoch=epoch, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(config, model, testloader_, criterion_ce, None, desc_default='*test', epoch=epoch, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)

                if ema is not None:
                    model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                    rs['valid'] = run_epoch(config, model_ema, validloader, criterion_ce, None, desc_default='valid(EMA)', epoch=epoch, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                    rs['test'] = run_epoch(config, model_ema, testloader_, criterion_ce, None, desc_default='*test(EMA)', epoch=epoch, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)

            logger.info(
                f'epoch={epoch} '
                f'[train] loss={rs["train"]["loss"]:.4f} top1={rs["train"]["top1"]:.4f} '
                f'[valid] loss={rs["valid"]["loss"]:.4f} top1={rs["valid"]["top1"]:.4f} '
                f'[test] loss={rs["test"]["loss"]:.4f} top1={rs["test"]["top1"]:.4f} '
            )

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(
                    loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                    loss_test=rs['test']['loss'], top1_test=rs['test']['top1']
                )

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s, err=%.4f' % (epoch, save_path, 1 - best_top1))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict(),
                        'ema': ema.state_dict() if ema is not None else None,
                    }, save_path)

    del model

    result['top1_test'] = best_top1
    return result
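
The EMA object here follows the pattern from the forum thread linked above: keep a shadow copy of every parameter and blend new values in after each optimizer step (run_epoch receives `ema` and is presumably the one calling the update). Below is a minimal sketch consistent with the attributes Example #3 actually touches (`shadow`, `state_dict()`, `len(ema)`); the update method name is my own.

class EMA:
    """Sketch: exponential moving average of model weights."""
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}  # parameter name -> averaged tensor

    def __len__(self):
        return len(self.shadow)

    def state_dict(self):
        return dict(self.shadow)

    def update(self, name, value):
        # First sight of a tensor seeds the average; afterwards blend with decay.
        if name not in self.shadow:
            self.shadow[name] = value.detach().clone()
        else:
            self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * value.detach()
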
Code Example #4
def train_and_eval(
        tag, dataroot, metric="last", save_path=None, only_eval=False, unsupervised=False, labeled_sample_num=4000,
):
    max_epoch = C.get()["epoch"]
    trainloader, unsuploader, testloader = get_dataloaders(
        C.get()["dataset"], C.get()["batch"], C.get()["batch_unsup"], dataroot, labeled_sample_num
    )

    # create a model & an optimizer
    model = get_model(
        C.get()["model"], num_class(C.get()["dataset"]), data_parallel=True
    )

    criterion = nn.CrossEntropyLoss()
    if C.get()["optimizer"]["type"] == "sgd":
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()["lr"],
            momentum=C.get()["optimizer"].get("momentum", 0.9),
            weight_decay=C.get()["optimizer"]["decay"],
            nesterov=C.get()["optimizer"]["nesterov"],
        )
    else:
        raise ValueError("invalid optimizer type=%s" % C.get()["optimizer"]["type"])

    lr_scheduler_type = C.get()["lr_schedule"].get("type", "cosine")
    if lr_scheduler_type == "cosine":
        t_max = C.get()["epoch"]
        if C.get()["lr_schedule"].get("warmup", None):
            t_max -= C.get()["lr_schedule"]["warmup"]["epoch"]
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=t_max, eta_min=0.0
        )
    else:
        raise ValueError("invalid lr_scheduler=%s" % lr_scheduler_type)

    if C.get()["lr_schedule"].get("warmup", None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()["lr_schedule"]["warmup"]["multiplier"],
            total_epoch=C.get()["lr_schedule"]["warmup"]["epoch"],
            after_scheduler=scheduler,
        )

    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter

        logger.warning("tag not provided, no tensorboard log.")
    else:
        from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(logdir="./logs/%s/%s" % (tag, x)) for x in ["train", "test"]
    ]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path) and args.resume:
        data = torch.load(save_path)
        model.load_state_dict(data["model"])
        optimizer.load_state_dict(data["optimizer"])
        epoch_start = data["epoch"]
        print("checkpoint loaded successfully")

    if only_eval:
        logger.info("evaluation only+")
        model.eval()
        rs = dict()
        rs["test"] = run_epoch(
            model,
            testloader,
            unsuploader,
            criterion,
            None,
            desc_default="*test",
            epoch=epoch_start,
            writer=writers[1],
            method=args.method,
        )
        for key, setname in itertools.product(
                ["loss", "top1", "top5"], ["train", "test"]
        ):
            if setname not in rs:  # only 'test' is populated in eval-only mode
                continue
            result["%s_%s" % (key, setname)] = rs[setname][key]
        result["epoch"] = 0
        return result

    # train loop
    global best_valid_top1
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        rs["train"] = run_epoch(
            model,
            trainloader,
            unsuploader,
            criterion,
            optimizer,
            desc_default="train",
            epoch=epoch,
            writer=writers[0],
            verbose=True,
            unsupervised=unsupervised,
            scheduler=scheduler,
            method=args.method,
        )
        if math.isnan(rs["train"]["loss"]):
            raise Exception("train loss is NaN.")

        model.eval()
        if (
                epoch % (10 if "cifar" in C.get()["dataset"] else 30) == 0
                or epoch == max_epoch
        ):
            rs["test"] = run_epoch(
                model,
                testloader,
                unsuploader,
                criterion,
                None,
                desc_default="*test",
                epoch=epoch,
                writer=writers[1],
                verbose=True,
                method=args.method
            )

            if best_valid_top1 < rs["test"]["top1"]:
                best_valid_top1 = rs["test"]["top1"]

            if metric == "last" or rs[metric]["loss"] < best_valid_loss:  # TODO
                if metric != "last":
                    best_valid_loss = rs[metric]["loss"]
                for key, setname in itertools.product(
                        ["loss", "top1", "top5"], ["train", "test"]
                ):
                    result["%s_%s" % (key, setname)] = rs[setname][key]
                result["epoch"] = epoch

                writers[1].add_scalar("test_top1/best", rs["test"]["top1"], epoch)

            # save checkpoint
            if save_path:
                logger.info("save model@%d to %s" % (epoch, save_path))
                torch.save(
                    {
                        "epoch": epoch,
                        "log": {
                            "train": rs["train"].get_dict(),
                            "test": rs["test"].get_dict(),
                        },
                        "optimizer": optimizer.state_dict(),
                        "model": model.state_dict(),
                    },
                    save_path,
                )

    del model

    return result
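
Every train_and_eval variant in this collection writes the same checkpoint layout: 'epoch', 'log' (per-split metric dicts), 'optimizer', and the weights under 'model' (or 'state_dict' in Example #6). Resuming outside these functions therefore takes only a few lines; the sketch below uses a stand-in model and a hypothetical file path.

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(8, 2)                      # stand-in; use get_model(...) as in the examples
optimizer = optim.SGD(model.parameters(), lr=0.1)

ckpt = torch.load('checkpoint.pth', map_location='cpu')  # hypothetical path
state = ckpt.get('model', ckpt.get('state_dict'))        # Example #6 uses 'state_dict'
model.load_state_dict(state)
optimizer.load_state_dict(ckpt['optimizer'])
epoch_start = ckpt['epoch'] + 1              # continue after the saved epoch
print(ckpt['log']['test'])                   # metrics logged at save time
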
Code Example #5
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last',
                   save_path=None, pretrained=None, only_eval=False):

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], dataroot, test_ratio, split_idx=cv_fold)

    # create a model & an optimizer
    model = get_model(C.get()['model'],
                      num_class(C.get()['dataset']),
                      data_parallel=True)

    # criterion = nn.CrossEntropyLoss()
    criterion = LabelSmoothSoftmaxCE()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get(
                                  'momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' %
                         C.get()['optimizer']['type'])

    is_master = True
    logger.debug('is_master=%s' % is_master)

    # set up the LR scheduler
    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        logger.debug('cosine learning-rate decay.')
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=t_max,
                                                               eta_min=0.)
    elif lr_scheduler_type == 'mixnet_l':
        scheduler = adjust_learning_rate_mixnet(optimizer)
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(log_dir='./logs/%s/%s' % (tag, x))
        for x in ['train', 'valid', 'test']
    ]

    result = OrderedDict()
    epoch_start = 1

    ## load model for training or evaluation
    if save_path and os.path.exists(save_path):
        data = torch.load(save_path)
        if 'model' in data:
            new_state_dict = {}
            for k, v in data['model'].items():
                if 'module.' not in k:
                    new_state_dict['module.' + k] = v
                else:
                    new_state_dict[k] = v
            model.load_state_dict(new_state_dict)
            optimizer.load_state_dict(data['optimizer'])
            logger.info('ckpt epoch@%d' % data['epoch'])
            if data['epoch'] < C.get()['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True
            logger.info('epoch=%d' % data['epoch'])
        else:
            model.load_state_dict(data)
        del data
    elif pretrained:
        assert os.path.exists(pretrained)
        ckt = torch.load(pretrained)
        model_dict = model.state_dict()
        if 'model' in ckt:
            new_state_dict = {}
            for k, v in ckt['model'].items():
                if 'module.' not in k:
                    new_state_dict['module.' + k] = v
                else:
                    new_state_dict[k] = v
            model_dict.update(new_state_dict)
            model.load_state_dict(model_dict)
        else:
            model_dict.update(ckt)
            model.load_state_dict(model_dict)

    ## Evaluate the model
    if only_eval:
        print('Eval model')
        logger.info('evaluation only+')
        model.eval()
        rs = dict()

        rs['test'] = run_epoch(model,
                               testloader_,
                               criterion,
                               None,
                               desc_default='*test',
                               epoch=0,
                               writer=writers[2])

        for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                              ['test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_valid_loss = 10e10
    best_accuracy = 0.0
    for epoch in range(epoch_start, max_epoch + 1):
        scheduler.step()
        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train', \
                epoch=epoch, writer=writers[0], verbose=is_master, scheduler=scheduler, is_train=True)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if (epoch % 1) == 0 or epoch == max_epoch:  # (epoch % 1) == 0 is always true: evaluates every epoch
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', \
                    epoch=epoch, writer=writers[1], verbose=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', \
                    epoch=epoch, writer=writers[2], verbose=is_master)

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(
                    ['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'],
                                      epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'],
                                      epoch)

                reporter(loss_valid=rs['valid']['loss'],
                         top1_valid=rs['valid']['top1'],
                         loss_test=rs['test']['loss'],
                         top1_test=rs['test']['top1'])

            # save checkpoint
            if is_master and save_path:
                if rs['test']['top1'] > best_accuracy:
                    best_accuracy = rs['test']['top1']
                    logger.info('save model@%d to %s' % (epoch, save_path))
                    torch.save(
                        {
                            'epoch': epoch,
                            'log': {
                                'train': rs['train'].get_dict(),
                                'valid': rs['valid'].get_dict(),
                                'test': rs['test'].get_dict(),
                            },
                            'optimizer': optimizer.state_dict(),
                            'model': model.state_dict()
                        }, save_path)

    del model

    return result
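
Examples #3, #5, and #6 all hand-roll the same dance around the 'module.' prefix that nn.DataParallel/DistributedDataParallel prepend to parameter names. That logic can live in one helper; a sketch follows, with the helper name mine rather than from these repositories.

def match_module_prefix(state_dict, model):
    """Add or strip 'module.' so the checkpoint keys match the target model."""
    wrapped = any(k.startswith('module.') for k in model.state_dict())
    out = {}
    for k, v in state_dict.items():
        has_prefix = k.startswith('module.')
        if wrapped and not has_prefix:
            out['module.' + k] = v
        elif not wrapped and has_prefix:
            out[k[len('module.'):]] = v
        else:
            out[k] = v
    return out

# usage, replacing the loops above:
# model.load_state_dict(match_module_prefix(data['model'], model))
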
Code Example #6
File: phase2.py Project: zwzhu-d/cores
def train_and_eval(tag,
                   dataroot,
                   metric='last',
                   resume=False,
                   save_path=None,
                   only_eval=False,
                   unsupervised=False,
                   devices=None):
    max_epoch = C.get()['epoch']
    unsup_idx = C.get()['unsup_idx']
    if os.path.exists(unsup_idx):
        unsup_idx = np.load(unsup_idx).tolist()
        print('Unsup idx:', len(unsup_idx))
        trainloader, unsuploader, testloader = get_dataloaders(
            C.get()['dataset'],
            C.get()['batch'],
            C.get()['batch_unsup'],
            dataroot,
            with_noise=True,
            random_state=C.get()['random_state'],
            unsup_idx=unsup_idx)
    else:
        trainloader, unsuploader, testloader = get_dataloaders(
            C.get()['dataset'],
            C.get()['batch'],
            C.get()['batch_unsup'],
            dataroot,
            with_noise=False,
            random_state=C.get()['random_state'])

    # create a model & an optimizer
    model = get_model(C.get()['model'],
                      num_class(C.get()['dataset']),
                      data_parallel=True,
                      devices=devices)

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=C.get()['lr'],
                              momentum=C.get()['optimizer'].get(
                                  'momentum', 0.9),
                              weight_decay=C.get()['optimizer']['decay'],
                              nesterov=C.get()['optimizer']['nesterov'])
    else:
        raise ValueError('invalid optimizer type=%s' %
                         C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        #t_max = 600
        #print('Temp Fix for AnnealingCosine, Tmax=',t_max)
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=t_max,
                                                               eta_min=0.)
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    if not tag.strip():
        from metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [
        SummaryWriter(logdir='./logs/%s/%s' % (tag, x))
        for x in ['train', 'test']
    ]

    result = OrderedDict()
    epoch_start = 1

    if (resume or only_eval) and save_path and os.path.exists(save_path):
        print('Resuming from last epoch:', save_path)
        ckpt = torch.load(save_path)
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        epoch_start = ckpt['epoch']

    elif os.path.exists(C.get()['pretrain']):
        print('Loading pretrain from:', C.get()['pretrain'])
        ckpt = torch.load(C.get()['pretrain'])
        try:
            model.load_state_dict(ckpt['state_dict'])
        except RuntimeError:
            new_state_dict = {
                'module.' + k: v
                for k, v in ckpt['state_dict'].items()
            }
            model.load_state_dict(new_state_dict)
    else:
        print('DEBUG: No pretrain available')

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['test'] = run_epoch(model,
                               testloader,
                               unsuploader,
                               criterion,
                               None,
                               desc_default='*test',
                               epoch=epoch_start,
                               writer=writers[1])
        for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                              ['train', 'test']):
            if setname not in rs:  # only 'test' is populated in eval-only mode
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    global best_valid_top1
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        model.train()
        rs = dict()
        rs['train'] = run_epoch(model,
                                trainloader,
                                unsuploader,
                                criterion,
                                optimizer,
                                desc_default='train',
                                epoch=epoch,
                                writer=writers[0],
                                verbose=False,
                                unsupervised=unsupervised,
                                scheduler=scheduler)
        print('Train At Epoch {}'.format(epoch), rs['train'])
        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        model.eval()
        if epoch % (10 if 'cifar' in C.get()['dataset'] else
                    30) == 0 or epoch == max_epoch:
            rs['test'] = run_epoch(model,
                                   testloader,
                                   unsuploader,
                                   criterion,
                                   None,
                                   desc_default='*test',
                                   epoch=epoch,
                                   writer=writers[1],
                                   verbose=False)
            print('Test At Epoch {}'.format(epoch), rs['test'])
            if best_valid_top1 < rs['test']['top1']:
                best_valid_top1 = rs['test']['top1']

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:  # TODO
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('test_top1/best', rs['test']['top1'],
                                      epoch)

            # save checkpoint
            if save_path:
                logger.info('save model@%d to %s' % (epoch, save_path))
                torch.save(
                    {
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'state_dict': model.state_dict()
                    }, save_path)

    del model

    return result
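
Example #6 selects its unsupervised subset through an index file (C.get()['unsup_idx']) holding a NumPy array of sample indices, loaded with np.load(...).tolist(). Producing such a file is one np.save away; a sketch follows, with the path and split sizes purely illustrative.

import numpy as np

rng = np.random.default_rng(0)
n_total, n_unsup = 50000, 46000          # illustrative CIFAR-10-sized split
unsup_idx = rng.choice(n_total, size=n_unsup, replace=False)
np.save('unsup_idx.npy', unsup_idx)      # hypothetical path for C.get()['unsup_idx']

# later, as in Example #6:
idx = np.load('unsup_idx.npy').tolist()
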
Code Example #7
File: search.py Project: Bahlat87/ZazuML
def eval_tta(config, augment):
    augment['num_policy'] = 1  # TODO remove
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], augment[
        'cv_fold'], augment['save_path']
    print(augment)
    # setup - provided augmentation rules
    C.get().aug = policy_decoder(augment, augment['num_policy'],
                                 augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'],
                                                  C.get()['batch'],
                                                  augment['dataroot'],
                                                  cv_ratio_test,
                                                  split_idx=cv_fold)
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)
                data = data.cuda()
                label = label.cuda()

                pred = model(data)

                loss = loss_fn(pred, label)
                losses.append(loss.detach().cpu().numpy())

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(
                    1, -1).expand_as(pred)).detach().cpu().numpy()
                corrects.append(correct)
                del loss, correct, pred, data, label

            losses = np.concatenate(losses)
            losses_min = np.min(losses, axis=0).squeeze()

            corrects = np.concatenate(corrects)
            corrects_max = np.max(corrects, axis=0).squeeze()
            metrics.add_dict({
                'minus_loss': -1 * np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': len(corrects_max)
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    # track.log(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    tune.report(minus_loss=metrics['minus_loss'],
                top1_valid=metrics['correct'],
                elapsed_time=gpu_secs,
                done=True)
    return metrics['correct']
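
Since Example #7 reports through tune.report, it is meant to run inside a Ray Tune trial. Below is a sketch of the search-side wiring under Ray Tune's function API, assuming the same module context (C, eval_tta) as the example above; the fixed augment values and paths are placeholders, and the real FastAutoAugment search samples the policy parameters into the trial config via HyperOpt.

import copy
from ray import tune

base_conf = copy.deepcopy(C.get().conf)  # base training config

def trial_fn(augment):
    # Tune samples the policy parameters into `augment`; fixed settings are merged here.
    augment.update({
        'dataroot': '/data',        # hypothetical path
        'save_path': 'model.pth',   # hypothetical checkpoint
        'cv_ratio_test': 0.4,
        'cv_fold': 0,
        'num_op': 2,
    })
    eval_tta(copy.deepcopy(base_conf), augment)

analysis = tune.run(trial_fn, num_samples=4, resources_per_trial={'gpu': 1})
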