Example #1
def train():
    global GLOBAL_STEP, reduction_arc, cell_arc
    # Dataset
    dataset = cifarDataset(
        batchSize=args.batch_size,
        dataPath=args.data_path,
        numOfWorkers=args.data_nums_workers,
        noise_rate=args.nr,
        is_cifar100=args.train_cifar100,
        filename=args.fn,
    )
    dataLoader = dataset.getDataLoader()

    if args.train_cifar100:
        num_classes = 100
        fixed_cnn = ResNet34(num_classes=num_classes)
    else:
        num_classes = 10
        fixed_cnn = SCEModel()

    if args.loss == 'SCE':
        if args.train_cifar100:
            criterion = SCELoss(alpha=6.0, beta=0.1, num_classes=num_classes)
        else:
            criterion = SCELoss(alpha=0.1, beta=1.0, num_classes=num_classes)
    elif args.loss == 'CE':
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError("Unknown loss: %s" % args.loss)

    print(criterion.__class__.__name__)
    print("Number of Trainable Parameters %.4f" %
          count_parameters_in_MB(fixed_cnn))
    fixed_cnn = torch.nn.DataParallel(fixed_cnn)
    fixed_cnn.to(device)

    fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          nesterov=True,
                                          weight_decay=args.l2_reg)

    if args.train_cifar100:
        milestone = [80, 120]
    else:
        milestone = [40, 80]
    fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optimizer, milestone, gamma=0.1)

    utilHelper = TrainUtil(checkpoint_path=args.checkpoint_path,
                           version=args.version)
    starting_epoch = 0
    train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion,
                fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)
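
Note: SCELoss is not defined in this snippet. Below is a minimal sketch of a symmetric cross entropy loss in the spirit of Wang et al., "Symmetric Cross Entropy for Robust Learning with Noisy Labels" (ICCV 2019); the clamp constants and the mean reduction are assumptions, not necessarily the repo's exact code:

import torch
import torch.nn.functional as F

class SCELoss(torch.nn.Module):
    """Sketch: symmetric cross entropy = alpha * CE + beta * RCE."""

    def __init__(self, alpha, beta, num_classes):
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, target):
        # Forward term: ordinary cross entropy.
        ce = F.cross_entropy(logits, target)
        # Reverse term: swap the roles of prediction and label, clamping
        # the one-hot label away from zero so log() stays finite.
        pred = F.softmax(logits, dim=1).clamp(min=1e-7, max=1.0)
        label = F.one_hot(target, self.num_classes).float().clamp(min=1e-4, max=1.0)
        rce = (-pred * torch.log(label)).sum(dim=1).mean()
        return self.alpha * ce + self.beta * rce
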
Example #2
def train():
    global GLOBAL_STEP, reduction_arc, cell_arc
    # Dataset
    dataset = DatasetGenerator(batchSize=args.batch_size,
                               dataPath=args.data_path,
                               numOfWorkers=args.data_nums_workers,
                               noise_rate=args.nr,
                               asym=args.asym,
                               seed=args.seed,
                               dataset_type=args.dataset_type)
    dataLoader = dataset.getDataLoader()

    if args.dataset_type == 'cifar100':
        num_classes = 100
        args.epoch = 150
        fixed_cnn = ResNet34(num_classes=num_classes)
    elif args.dataset_type == 'cifar10':
        num_classes = 10
        args.epoch = 120
        fixed_cnn = SCEModel()
    else:
        raise NotImplementedError("Unimplemented dataset type: %s" %
                                  args.dataset_type)

    if args.loss == 'SCE':
        criterion = SCELoss(alpha=args.alpha,
                            beta=args.beta,
                            num_classes=num_classes)
    elif args.loss == 'CE':
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError("Unknown loss: %s" % args.loss)

    logger.info(criterion.__class__.__name__)
    logger.info("Number of Trainable Parameters %.4f" %
                count_parameters_in_MB(fixed_cnn))
    fixed_cnn = torch.nn.DataParallel(fixed_cnn)
    fixed_cnn.to(device)

    fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          nesterov=True,
                                          weight_decay=args.l2_reg)

    fixed_cnn_scheduler = torch.optim.lr_scheduler.StepLR(fixed_cnn_optimizer,
                                                          step_size=1,
                                                          gamma=0.97)

    utilHelper = TrainUtil(checkpoint_path=args.checkpoint_path,
                           version=args.version)
    starting_epoch = 0
    train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion,
                fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)
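
Note: where Example #1 drops the learning rate at fixed milestones, this variant uses StepLR with step_size=1, i.e. an exponential decay lr_t = lr_0 * 0.97^t. A quick standalone check of that schedule (dummy parameter, no real training):

import torch
from torch.optim.lr_scheduler import StepLR

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = StepLR(opt, step_size=1, gamma=0.97)
for epoch in range(3):
    opt.step()   # one epoch of training would happen here
    sched.step()
    print(epoch, opt.param_groups[0]['lr'])  # ~0.097, ~0.0941, ~0.0913
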
Example #3
def build_BSD_500(model_state_dict, optimizer_state_dict, **kwargs):
    # epoch = kwargs.pop('epoch')
    # i_iter = kwargs.pop('i_iter')
    root = "./data/HED-BSDS"

    train_data = dataloader_BSD_Pascal.BSD_loader(root=root,
                                                  split='train',
                                                  normalisation=False,
                                                  keep_size=False)

    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              pin_memory=True,
                                              num_workers=16,
                                              shuffle=True)

    # model = DeepLab(output_stride=16, class_num=2, pretrained=False, freeze_bn=False)
    # model = NASUNetBSD(args, args.classes, depth=args.layers, c=args.channels,
    #                    keep_prob=args.keep_prob, nodes=args.nodes,
    #                    use_aux_head=args.use_aux_head, arch=args.arch,
    #                    double_down_channel=args.double_down_channel)
    model = NAOMSCBC(args,
                     args.classes,
                     args.arch,
                     channels=42,
                     pretrained=True,
                     res='101')

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
    if model_state_dict is not None:
        model.load_state_dict(model_state_dict)

    if torch.cuda.device_count() > 1:
        logging.info("Use %d %s", torch.cuda.device_count(), "GPUs !")
        model = nn.DataParallel(model)
    model = model.cuda()

    # optimizer = torch.optim.SGD(
    #     model.parameters(),
    #     lr=args.lr_max,
    #     momentum=0.9,
    #     weight_decay=args.l2_reg,
    # )
    # if optimizer_state_dict is not None:
    #     optimizer.load_state_dict(optimizer_state_dict)
    #
    # return train_queue, model, optimizer
    return train_queue, model
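
Note: all of these examples log model size via count_parameters_in_MB. In DARTS-style utility modules this helper counts trainable parameters in millions (despite the "MB" name) and skips auxiliary heads; a minimal sketch under that assumption:

import numpy as np

def count_parameters_in_MB(model):
    """Trainable parameter count in millions, excluding auxiliary heads."""
    return sum(np.prod(v.size())
               for name, v in model.named_parameters()
               if 'auxiliary' not in name) / 1e6
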
Example #4
File: test.py Project: 2BH/NAS_K49
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                    args.auxiliary, genotype)
    model = model.cuda()
    utils.load(model, args.model_path)

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    _, test_transform = utils._data_transforms_cifar10(args)
    test_data = dset.CIFAR10(root=args.data,
                             train=False,
                             download=True,
                             transform=test_transform)

    test_queue = torch.utils.data.DataLoader(test_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=2)

    model.drop_path_prob = args.drop_path_prob
    test_acc, test_obj = infer(test_queue, model, criterion)
    logging.info('test_acc %f', test_acc)
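
Note: infer is not shown above. In DARTS-style test scripts it is a plain evaluation loop returning top-1 accuracy and mean loss, matching the (test_acc, test_obj) unpacking here; a minimal sketch without the repo's own meter helpers:

import torch

def infer(test_queue, model, criterion):
    """Returns (top-1 accuracy in percent, mean loss) over the queue."""
    model.eval()
    total, correct, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for images, targets in test_queue:
            images, targets = images.cuda(), targets.cuda()
            logits = model(images)
            if isinstance(logits, tuple):  # (logits, aux_logits) models
                logits = logits[0]
            loss_sum += criterion(logits, targets).item() * targets.size(0)
            correct += (logits.argmax(dim=1) == targets).sum().item()
            total += targets.size(0)
    return 100.0 * correct / total, loss_sum / total
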
Example #5
def main():
    args = cfg.parse_args()
    torch.cuda.manual_seed(args.random_seed)

    # set visible GPU ids
    if len(args.gpu_ids) > 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_ids

    # set TensorFlow environment for evaluation (calculate IS and FID)
    _init_inception()
    inception_path = check_or_download_inception('./tmp/imagenet/')
    create_inception_graph(inception_path)

    # the first GPU in visible GPUs is dedicated for evaluation (running Inception model)
    str_ids = args.gpu_ids.split(',')
    # once CUDA_VISIBLE_DEVICES is set, the visible devices are re-indexed from 0
    args.gpu_ids = list(range(len(str_ids)))
    if len(args.gpu_ids) > 1:
        args.gpu_ids = args.gpu_ids[1:]

    # genotype G
    genotypes_root = os.path.join('exps', args.genotypes_exp, 'Genotypes')
    genotype_G = np.load(os.path.join(genotypes_root, 'latest_G.npy'))

    # import network from genotype
    basemodel_gen = eval('archs.' + args.arch + '.Generator')(args, genotype_G)
    gen_net = torch.nn.DataParallel(
        basemodel_gen, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])
    basemodel_dis = eval('archs.' + args.arch + '.Discriminator')(args)
    dis_net = torch.nn.DataParallel(
        basemodel_dis, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])

    # basemodel_gen = eval('archs.' + args.arch + '.Generator')(args=args)
    # gen_net = torch.nn.DataParallel(basemodel_gen, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])
    # basemodel_dis = eval('archs.' + args.arch + '.Discriminator')(args=args)
    # dis_net = torch.nn.DataParallel(basemodel_dis, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])

    # weight init
    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Conv2d') != -1:
            if args.init_type == 'normal':
                nn.init.normal_(m.weight.data, 0.0, 0.02)
            elif args.init_type == 'orth':
                nn.init.orthogonal_(m.weight.data)
            elif args.init_type == 'xavier_uniform':
                nn.init.xavier_uniform_(m.weight.data, 1.)
            else:
                raise NotImplementedError('{} unknown init type'.format(
                    args.init_type))
        elif classname.find('BatchNorm2d') != -1:
            nn.init.normal_(m.weight.data, 1.0, 0.02)
            nn.init.constant_(m.bias.data, 0.0)

    gen_net.apply(weights_init)
    dis_net.apply(weights_init)

    # set up data_loader
    dataset = datasets.ImageDataset(args)
    train_loader = dataset.train

    # epoch number for dis_net
    args.max_epoch_D = args.max_epoch_G * args.n_critic
    if args.max_iter_G:
        args.max_epoch_D = np.ceil(args.max_iter_G * args.n_critic /
                                   len(train_loader))
    max_iter_D = args.max_epoch_D * len(train_loader)

    # set optimizer
    gen_optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr,
        (args.beta1, args.beta2))
    dis_optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr,
        (args.beta1, args.beta2))
    gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, max_iter_D)
    dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, max_iter_D)

    # fid stat
    if args.dataset.lower() == 'cifar10':
        fid_stat = 'fid_stat/fid_stats_cifar10_train.npz'
    elif args.dataset.lower() == 'stl10':
        fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz'
    else:
        raise NotImplementedError(f'no fid stat for {args.dataset.lower()}')
    assert os.path.exists(fid_stat)

    # initial
    gen_avg_param = copy_params(gen_net)
    start_epoch = 0
    best_fid = 1e4

    # set writer
    if args.checkpoint:
        # resuming
        print(f'=> resuming from {args.checkpoint}')
        assert os.path.exists(os.path.join('exps', args.checkpoint))
        checkpoint_file = os.path.join('exps', args.checkpoint, 'Model',
                                       'checkpoint_best.pth')
        assert os.path.exists(checkpoint_file)
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch']
        best_fid = checkpoint['best_fid']
        gen_net.load_state_dict(checkpoint['gen_state_dict'])
        dis_net.load_state_dict(checkpoint['dis_state_dict'])
        gen_optimizer.load_state_dict(checkpoint['gen_optimizer'])
        dis_optimizer.load_state_dict(checkpoint['dis_optimizer'])
        avg_gen_net = deepcopy(gen_net)
        avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict'])
        gen_avg_param = copy_params(avg_gen_net)
        del avg_gen_net

        args.path_helper = checkpoint['path_helper']
        logger = create_logger(args.path_helper['log_path'])
        logger.info(
            f'=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})')
    else:
        # create new log dir
        assert args.exp_name
        args.path_helper = set_log_dir('exps', args.exp_name)
        logger = create_logger(args.path_helper['log_path'])

    logger.info(args)
    writer_dict = {
        'writer': SummaryWriter(args.path_helper['log_path']),
        'train_global_steps': start_epoch * len(train_loader),
        'valid_global_steps': start_epoch // args.val_freq,
    }

    # model size
    logger.info('Param size of G = %fMB', count_parameters_in_MB(gen_net))
    logger.info('Param size of D = %fMB', count_parameters_in_MB(dis_net))
    print_FLOPs(basemodel_gen, (1, args.latent_dim), logger)
    print_FLOPs(basemodel_dis, (1, 3, args.img_size, args.img_size), logger)

    # for visualization
    if args.draw_arch:
        from utils.genotype import draw_graph_G
        draw_graph_G(genotype_G,
                     save=True,
                     file_path=os.path.join(args.path_helper['graph_vis_path'],
                                            'latest_G'))
    fixed_z = torch.cuda.FloatTensor(
        np.random.normal(0, 1, (100, args.latent_dim)))

    # train loop
    for epoch in tqdm(range(int(start_epoch), int(args.max_epoch_D)),
                      desc='total progress'):
        lr_schedulers = (gen_scheduler,
                         dis_scheduler) if args.lr_decay else None
        train(args, gen_net, dis_net, gen_optimizer, dis_optimizer,
              gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers)

        if epoch % args.val_freq == 0 or epoch == int(args.max_epoch_D) - 1:
            backup_param = copy_params(gen_net)
            load_params(gen_net, gen_avg_param)
            inception_score, std, fid_score = validate(args, fixed_z, fid_stat,
                                                       gen_net, writer_dict)
            logger.info(
                f'Inception score mean: {inception_score}, Inception score std: {std}, '
                f'FID score: {fid_score} || @ epoch {epoch}.')
            load_params(gen_net, backup_param)
            if fid_score < best_fid:
                best_fid = fid_score
                is_best = True
            else:
                is_best = False
        else:
            is_best = False

        # save model
        avg_gen_net = deepcopy(gen_net)
        load_params(avg_gen_net, gen_avg_param)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.arch,
                'gen_state_dict': gen_net.state_dict(),
                'dis_state_dict': dis_net.state_dict(),
                'avg_gen_state_dict': avg_gen_net.state_dict(),
                'gen_optimizer': gen_optimizer.state_dict(),
                'dis_optimizer': dis_optimizer.state_dict(),
                'best_fid': best_fid,
                'path_helper': args.path_helper
            }, is_best, args.path_helper['ckpt_path'])
        del avg_gen_net
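
Note: evaluation and checkpointing here run on an exponential moving average of the generator weights, handled by copy_params and load_params (Examples #5 and #6 share this machinery). A minimal sketch of that pair; the EMA coefficient in the trailing comment is an assumption about what train() does:

def copy_params(model):
    """Detached snapshot of the model's parameters, as a flat list."""
    return [p.data.clone() for p in model.parameters()]

def load_params(model, params):
    """Write a snapshot from copy_params back into the model in place."""
    for p, new_p in zip(model.parameters(), params):
        p.data.copy_(new_p)

# Inside train(), after each generator step, the average is typically updated as:
#   for avg_p, p in zip(gen_avg_param, gen_net.parameters()):
#       avg_p.mul_(0.999).add_(p.data, alpha=0.001)
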
Example #6
def main():
    args = cfg.parse_args()
    torch.cuda.manual_seed(args.random_seed)

    # set visible GPU ids
    if len(args.gpu_ids) > 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_ids

    # set TensorFlow environment for evaluation (calculate IS and FID)
    _init_inception()
    inception_path = check_or_download_inception('./tmp/imagenet/')
    create_inception_graph(inception_path)

    # the first GPU in visible GPUs is dedicated for evaluation (running Inception model)
    str_ids = args.gpu_ids.split(',')
    # once CUDA_VISIBLE_DEVICES is set, the visible devices are re-indexed from 0
    args.gpu_ids = list(range(len(str_ids)))
    if len(args.gpu_ids) > 1:
        args.gpu_ids = args.gpu_ids[1:]

    # genotype G
    genotypes_root = os.path.join('exps', args.genotypes_exp, 'Genotypes')
    genotype_G = np.load(os.path.join(genotypes_root, 'latest_G.npy'))

    # import network from genotype
    basemodel_gen = eval('archs.' + args.arch + '.Generator')(args, genotype_G)
    gen_net = torch.nn.DataParallel(
        basemodel_gen, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])
    basemodel_dis = eval('archs.' + args.arch + '.Discriminator')(args)
    dis_net = torch.nn.DataParallel(
        basemodel_dis, device_ids=args.gpu_ids).cuda(args.gpu_ids[0])

    # fid stat
    if args.dataset.lower() == 'cifar10':
        fid_stat = 'fid_stat/fid_stats_cifar10_train.npz'
    elif args.dataset.lower() == 'stl10':
        fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz'
    else:
        raise NotImplementedError(f'no fid stat for {args.dataset.lower()}')
    assert os.path.exists(fid_stat)

    # set writer
    print(f'=> resuming from {args.checkpoint}')
    assert os.path.exists(os.path.join('exps', args.checkpoint))
    checkpoint_file = os.path.join('exps', args.checkpoint, 'Model',
                                   'checkpoint_best.pth')
    assert os.path.exists(checkpoint_file)
    checkpoint = torch.load(checkpoint_file)
    epoch = checkpoint['epoch'] - 1
    gen_net.load_state_dict(checkpoint['gen_state_dict'])
    dis_net.load_state_dict(checkpoint['dis_state_dict'])
    avg_gen_net = deepcopy(gen_net)
    avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict'])
    gen_avg_param = copy_params(avg_gen_net)
    del avg_gen_net
    assert args.exp_name
    args.path_helper = set_log_dir('exps', args.exp_name)
    logger = create_logger(args.path_helper['log_path'])
    logger.info(f'=> loaded checkpoint {checkpoint_file} (epoch {epoch})')

    logger.info(args)
    writer_dict = {
        'writer': SummaryWriter(args.path_helper['log_path']),
        'valid_global_steps': epoch // args.val_freq,
    }

    # model size
    logger.info('Param size of G = %fMB', count_parameters_in_MB(gen_net))
    logger.info('Param size of D = %fMB', count_parameters_in_MB(dis_net))
    print_FLOPs(basemodel_gen, (1, args.latent_dim), logger)
    print_FLOPs(basemodel_dis, (1, 3, args.img_size, args.img_size), logger)

    # for visualization
    if args.draw_arch:
        from utils.genotype import draw_graph_G
        draw_graph_G(genotype_G,
                     save=True,
                     file_path=os.path.join(args.path_helper['graph_vis_path'],
                                            'latest_G'))
    fixed_z = torch.cuda.FloatTensor(
        np.random.normal(0, 1, (100, args.latent_dim)))

    # test
    load_params(gen_net, gen_avg_param)
    inception_score, std, fid_score = validate(args, fixed_z, fid_stat,
                                               gen_net, writer_dict)
    logger.info(
        f'Inception score mean: {inception_score}, Inception score std: {std}, '
        f'FID score: {fid_score} || @ epoch {epoch}.')
Example #7
def train():
    # Dataset
    if args.dataset_type == 'clothing1m':
        dataset = Clothing1MDatasetLoader(batchSize=args.batch_size,
                                          dataPath=args.data_path,
                                          numOfWorkers=args.data_nums_workers)
    elif args.dataset_type == 'imagenet':
        dataset = ImageNetDatasetLoader(batchSize=args.batch_size,
                                        dataPath=args.data_path,
                                        seed=args.seed,
                                        target_class_num=200,
                                        nosiy_rate=0.4,  # (sic) keyword name as defined by the loader
                                        numOfWorkers=args.data_nums_workers)
    else:
        dataset = DatasetGenerator(batchSize=args.batch_size,
                                   dataPath=args.data_path,
                                   numOfWorkers=args.data_nums_workers,
                                   noise_rate=args.nr,
                                   asym=args.asym,
                                   seed=args.seed,
                                   dataset_type=args.dataset_type)

    dataLoader = dataset.getDataLoader()
    eta_min = 0
    ln_neg = 1

    if args.dataset_type == 'clothing1m':
        # Train Clothing1M
        args.epoch = 20
        args.l2_reg = 1e-3
        num_classes = 14
        fixed_cnn = torchvision.models.resnet50(num_classes=14)
        # fixed_cnn.fc = torch.nn.Linear(2048, 14)

    elif args.dataset_type == 'cifar100':
        # Train CIFAR100
        args.lr = 0.1
        args.epoch = 200
        num_classes = 100
        fixed_cnn = ResNet34(num_classes=num_classes)

        # NLNL
        if args.loss == 'NLNL':
            args.epoch = 2000
            ln_neg = 110

    elif args.dataset_type == 'cifar10':
        # Train CIFAR10
        args.epoch = 120
        num_classes = 10
        fixed_cnn = SCEModel(type='cifar10')

        # NLNL
        if args.loss == 'NLNL':
            args.epoch = 1000

    elif args.dataset_type == 'mnist':
        # Train mnist
        args.epoch = 50
        num_classes = 10
        fixed_cnn = SCEModel(type='mnist')
        eta_min = 0.001
        args.l2_reg = 1e-3
        # NLNL
        if args.loss == 'NLNL':
            args.epoch = 720

    elif args.dataset_type == 'imagenet':
        args.epoch = 100
        args.l2_reg = 3e-5
        num_classes = 200
        fixed_cnn = torchvision.models.resnet50(num_classes=num_classes)
    else:
        raise ValueError("Unknown dataset_type: %s" % args.dataset_type)

    logger.info("num_classes: %s" % num_classes)

    loss_options = {
        'SCE': SCELoss(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'CE': torch.nn.CrossEntropyLoss(),
        'NCE': NormalizedCrossEntropy(scale=args.alpha, num_classes=num_classes),
        'MAE': MeanAbsoluteError(scale=args.alpha, num_classes=num_classes),
        'NMAE': NormalizedMeanAbsoluteError(scale=args.alpha, num_classes=num_classes),
        'GCE': GeneralizedCrossEntropy(num_classes=num_classes, q=args.q),
        'RCE': ReverseCrossEntropy(scale=args.alpha, num_classes=num_classes),
        'NRCE': NormalizedReverseCrossEntropy(scale=args.alpha, num_classes=num_classes),
        'NGCE': NormalizedGeneralizedCrossEntropy(scale=args.alpha, num_classes=num_classes, q=args.q),
        'NCEandRCE': NCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'NCEandMAE': NCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'GCEandMAE': GCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'GCEandRCE': GCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'GCEandNCE': GCEandNCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'MAEandRCE': MAEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes),
        'NGCEandNCE': NGCEandNCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'NGCEandMAE': NGCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'NGCEandRCE': NGCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q),
        'FocalLoss': FocalLoss(gamma=args.gamma),
        'NFL': NormalizedFocalLoss(scale=args.alpha, gamma=args.gamma, num_classes=num_classes),
        'NLNL': NLNL(num_classes=num_classes, train_loader=dataLoader['train_dataset'], ln_neg=ln_neg),
        'NFLandNCE': NFLandNCE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes),
        'NFLandMAE': NFLandMAE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes),
        'NFLandRCE': NFLandRCE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes),
        'DMI': DMILoss(num_classes=num_classes)
    }

    if args.loss in loss_options:
        criterion = loss_options[args.loss]
    else:
        raise ValueError("Unknown loss: %s" % args.loss)

    logger.info(criterion.__class__.__name__)
    logger.info("Number of Trainable Parameters %.4f" % count_parameters_in_MB(fixed_cnn))

    fixed_cnn.to(device)

    if args.loss == 'DMI':
        # DMI trains in two stages: warm up with CE first, then fine-tune with DMI (see below).
        criterion = loss_options['CE']

    fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          weight_decay=args.l2_reg)

    fixed_cnn_scheduler = CosineAnnealingLR(fixed_cnn_optimizer,
                                            float(args.epoch),
                                            eta_min=eta_min)
    if args.dataset_type == 'clothing1m':
        fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optimizer, milestones=[5, 10], gamma=0.1)
    elif args.dataset_type == 'imagenet':
        fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optimizer, milestones=[30, 60, 80], gamma=0.1)

    utilHelper = TrainUtil(checkpoint_path=args.checkpoint_path, version=args.version)
    starting_epoch = 0

    for arg in vars(args):
        logger.info("%s: %s" % (arg, getattr(args, arg)))

    train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion, fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)

    if args.loss == 'DMI':
        criterion = loss_options['DMI']
        fixed_cnn_optimizer = torch.optim.SGD(params=fixed_cnn.parameters(),
                                              lr=1e-6,
                                              momentum=0.9,
                                              weight_decay=args.l2_reg)
        starting_epoch = 0
        fixed_cnn_scheduler = None
        train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion, fixed_cnn_optimizer, fixed_cnn_scheduler, utilHelper)
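
Note: the loss_options table above follows the "normalized loss" line of work (Ma et al., "Normalized Loss Functions for Deep Learning with Noisy Labels", ICML 2020). As one concrete entry, a minimal sketch of GeneralizedCrossEntropy using Zhang & Sabuncu's L_q = (1 - p_y^q) / q; the clamp epsilon is an assumption:

import torch
import torch.nn.functional as F

class GeneralizedCrossEntropy(torch.nn.Module):
    """Sketch of GCE: interpolates between CE (q -> 0) and MAE (q = 1)."""

    def __init__(self, num_classes, q=0.7):
        super().__init__()
        self.num_classes = num_classes
        self.q = q

    def forward(self, logits, target):
        pred = F.softmax(logits, dim=1).clamp(min=1e-7, max=1.0)
        # Probability the model assigns to the true class of each sample.
        p_y = pred.gather(1, target.unsqueeze(1)).squeeze(1)
        return ((1.0 - p_y.pow(self.q)) / self.q).mean()
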
Example #8
def main():
    args = parse_args()
    reset_config(config, args)

    # tensorboard
    logger, final_output_dir, tb_log_dir = create_logger(config, args.cfg, 'train', 'train')
    
    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED
    
    model = Network(config, gt.DARTS)
    model.init_weights()
    
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }
    
    logger.info("param size = %fMB", count_parameters_in_MB(model))
    
    #dump_input = torch.rand(
    #    (1, 3, config.MODEL.IMAGE_SIZE[1], config.MODEL.IMAGE_SIZE[0])
    #)
    #logger.info(get_model_summary(model, dump_input))
    
    gpus = [int(i) for i in config.GPUS.split(',')]
    criterion = JointsMSELoss(use_target_weight=config.LOSS.USE_TARGET_WEIGHT).to(device)
    model = nn.DataParallel(model, device_ids=gpus).to(device)
    
    logger.info("Logger is set - training start")

    # weights optimizer
    optimizer = torch.optim.Adam(model.parameters(), config.TRAIN.LR)

    # prepare dataloader
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.'+config.DATASET.DATASET)(
                            config,
                            config.DATASET.ROOT,
                            config.TRAIN.TRAIN_SET,
                            True,
                            transforms.Compose([
                               transforms.ToTensor(),
                               normalize,
                            ]))
                            
    valid_dataset = eval('dataset.'+config.DATASET.DATASET)(
                            config,
                            config.DATASET.ROOT,
                            config.TRAIN.TEST_SET,
                            False,
                            transforms.Compose([
                               transforms.ToTensor(),
                               normalize,
                             ]))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=config.TRAIN.BATCH_SIZE*len(gpus),
                                               shuffle=True,
                                               num_workers=config.WORKERS,
                                               pin_memory=True)
                                               
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=config.TRAIN.BATCH_SIZE*len(gpus),
                                               shuffle=False,
                                               num_workers=config.WORKERS,
                                               pin_memory=True)
                                               
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # training loop
    best_top1 = 0.
    best_model = False
    for epoch in range(config.TRAIN.EPOCHS):
        # training
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # validation
        top1 = validate(
            config, valid_loader, valid_dataset, model, criterion,
            final_output_dir, tb_log_dir, writer_dict
        )

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_model = True
        else:
            best_model = False
        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_state_dict': model.module.state_dict(),
            'perf': best_top1,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)
        
        lr_scheduler.step()

    final_model_state_file = os.path.join(
        final_output_dir, 'final_state.pth'
    )
    logger.info('=> saving final model state to {}'.format(
        final_model_state_file)
    )
    logger.info('=> best accuracy is {}'.format(best_top1))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
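
Note: JointsMSELoss is the heatmap regression loss used in HRNet-style pose estimation: an MSE per joint heatmap, optionally scaled by a per-joint visibility weight. A minimal sketch under that assumption (the 1/num_joints normalization and the (B, J, 1) weight shape are guesses at the exact convention):

import torch
import torch.nn as nn

class JointsMSELoss(nn.Module):
    """Sketch: mean per-joint MSE between predicted and target heatmaps."""

    def __init__(self, use_target_weight=False):
        super().__init__()
        self.criterion = nn.MSELoss()
        self.use_target_weight = use_target_weight

    def forward(self, output, target, target_weight=None):
        batch, num_joints = output.size(0), output.size(1)
        # Flatten each joint's heatmap: (B, J, H, W) -> (B, J, H*W)
        pred = output.reshape(batch, num_joints, -1)
        gt = target.reshape(batch, num_joints, -1)
        loss = 0.0
        for j in range(num_joints):
            p, g = pred[:, j], gt[:, j]
            if self.use_target_weight:
                w = target_weight[:, j]  # (B, 1); zero for invisible joints
                loss = loss + self.criterion(p * w, g * w)
            else:
                loss = loss + self.criterion(p, g)
        return loss / num_joints
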
Example #9
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)
    num_gpus = torch.cuda.device_count()   
    genotype = eval("core.genotypes.%s" % args.arch)
    print('---------Genotype---------')
    logging.info(genotype)
    print('--------------------------') 
    model = Network(args.init_channels, args.input_channels, num_classes, args.layers, args.auxiliary, genotype)
    if num_gpus > 1:
        model = nn.DataParallel(model)
        model = model.cuda()
    else:
        model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(num_classes, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(
        model.parameters(),
        args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay
        )
    data_augmentations = args.data_aug
    if data_augmentations is None:
        data_augmentations = transforms.ToTensor()
    elif isinstance(data_augmentations, list):
        data_augmentations = transforms.Compose(data_augmentations)
    elif not isinstance(data_augmentations, transforms.Compose):
        raise NotImplementedError

    # Dataset

    train_data = K49(args.data_dir, True, data_augmentations)
    test_data = K49(args.data_dir, False, data_augmentations)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=2)

    valid_queue = torch.utils.data.DataLoader(
        test_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs))
    best_acc_top1 = 0
    for epoch in range(args.epochs):
        if args.lr_scheduler == 'cosine':
            scheduler.step()
            current_lr = scheduler.get_lr()[0]
        elif args.lr_scheduler == 'linear':
            current_lr = adjust_lr(optimizer, epoch)
        else:
            print('Wrong lr type, exit')
            sys.exit(1)
        logging.info('Epoch: %d lr %e', epoch, current_lr)
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            logging.info('Warming-up Epoch: %d, LR: %e', epoch, current_lr * (epoch + 1) / 5.0)
        if num_gpus > 1:
            model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        else:
            model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        epoch_start = time.time()
        train_acc, train_obj = train(train_queue, model, criterion_smooth, optimizer)
        logging.info('Train_acc: %f', train_acc)

        valid_acc_top1, valid_obj = infer(valid_queue, model, criterion)
        logging.info('Valid_acc_top1: %f', valid_acc_top1)
        epoch_duration = time.time() - epoch_start
        logging.info('Epoch time: %ds.', epoch_duration)
        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True
        utils.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc_top1': best_acc_top1,
            'optimizer': optimizer.state_dict(),
            }, is_best, log_path)
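
Note: criterion_smooth is a label-smoothing cross entropy; the version bundled with DARTS-style repos mixes the one-hot target with a uniform distribution over classes. A minimal sketch along those lines:

import torch
import torch.nn as nn

class CrossEntropyLabelSmooth(nn.Module):
    """Sketch: cross entropy against (1 - eps) * one_hot + eps / num_classes."""

    def __init__(self, num_classes, epsilon):
        super().__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        log_probs = self.logsoftmax(inputs)
        one_hot = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        smoothed = (1 - self.epsilon) * one_hot + self.epsilon / self.num_classes
        return (-smoothed * log_probs).mean(0).sum()
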
Example #10
File: train.py Project: 2BH/NAS_K49
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    genotype = eval("core.genotypes.%s" % args.arch)
    #if args.set == "KMNIST":
    #  model = NetworkKMNIST(args.init_channels, args.input_channels, num_classes, args.layers, args.auxiliary, genotype)
    #elif args.set == "K49":
    model = Network(args.init_channels, args.input_channels, num_classes,
                    args.layers, args.auxiliary, genotype)

    model = model.cuda()

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Data augmentations
    train_transform, valid_transform = utils.data_transforms_Kuzushiji(args)

    # Dataset
    if args.set == "KMNIST":
        train_data = KMNIST(args.data_dir, True, train_transform)
        test_data = KMNIST(args.data_dir, False, valid_transform)
    elif args.set == "K49":
        train_data = K49(args.data_dir, True, train_transform)
        test_data = K49(args.data_dir, False, valid_transform)
    else:
        raise ValueError("Unknown Dataset %s" % args.dataset)

    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=2)

    valid_queue = torch.utils.data.DataLoader(test_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))
    best_acc = 0.0
    for epoch in range(args.epochs):
        scheduler.step()
        logging.info('epoch %d/%d lr %e', epoch, args.epochs,
                     scheduler.get_lr()[0])

        genotype = eval("core.genotypes.%s" % args.arch)
        print('---------Genotype---------')
        logging.info(genotype)
        print('--------------------------')

        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer)
        logging.info('train_acc %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        if valid_acc > best_acc:
            best_acc = valid_acc
        logging.info('valid_acc %f, best_acc %f', valid_acc, best_acc)

        utils.save(model, os.path.join(log_path, 'weights.pt'))
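
Note: utils.save here (and the utils.load used in Example #4) are, in DARTS-style repos, thin wrappers around state_dict serialization; a minimal sketch under that assumption:

import torch

def save(model, model_path):
    """Persist only the weights, not the module object."""
    torch.save(model.state_dict(), model_path)

def load(model, model_path):
    """Restore weights saved by save() into an already-constructed model."""
    model.load_state_dict(torch.load(model_path))
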