Example #1
0
def train(
        embed_file,
        train_file,
        test_file,
        n_epoch=20,
        batch_size=70,
        model=1):
    """Train a paraphrase classifier on the MSR Paraphrase Corpus.

    Args:
        embed_file: Passed to ``MsrpCorpusPreprocessor``.
        train_file: Training split, read with ``load_msrp_corpus``.
        test_file: Test split, read with ``load_msrp_corpus``.
        n_epoch: Number of epochs the trainer runs for.
        batch_size: Minibatch size used by both iterators.
        model: ``2`` selects the BCNN classifier; any other value selects
            the BiCNN classifier.
    """
    # --- Corpus loading -----------------------------------------------------
    print("init preprocessor with %s" % embed_file)
    processor = MsrpCorpusPreprocessor(embed_file)
    print("load MSRParaphraseCorpus [train] from %s" % train_file)
    train_sentences, train_labels = load_msrp_corpus(train_file)
    print("load MSRParaphraseCorpus [test] from %s" % test_file)
    test_sentences, test_labels = load_msrp_corpus(test_file)

    # --- Run summary --------------------------------------------------------
    print('')
    print("initialize ...")
    print('--------------------------------')
    print('# Minibatch-size: %d' % batch_size)
    print('# epoch: %d' % n_epoch)
    print('--------------------------------')

    # --- Preprocessing: fit on the training split, reuse it for the test ----
    train_inputs = processor.fit_transform(train_sentences)
    test_inputs = processor.transform(test_sentences)

    # --- Network selection --------------------------------------------------
    if model == 2:
        print("use BCNN model")
        network = BCNN(
            channels=3,
            filter_width=3,
            embeddings=processor.embeddings,
        )
        model = BClassifier(network)
    else:
        print("use BiCNN model")
        network = BiCNN(
            channels=[3, 5],
            filter_width=[6, 14],
            embeddings=processor.embeddings,
            k_top=4,
            beta=2,
            pool_size=[(10, 10), (10, 10), (6, 6), (2, 2)]
        )
        model = BiClassifier(network)

    # --- Optimizer ----------------------------------------------------------
    optimizer = optimizers.AdaGrad(lr=0.01)
    optimizer.setup(model)

    # --- Iterators: shuffled/repeating for training, single pass for test ---
    train_iter = iterators.SerialIterator(
        datasets.TupleDataset(train_inputs, train_labels),
        batch_size,
        repeat=True,
        shuffle=True)
    test_iter = iterators.SerialIterator(
        datasets.TupleDataset(test_inputs, test_labels),
        batch_size,
        repeat=False,
        shuffle=False)

    # --- Trainer ------------------------------------------------------------
    stop_trigger = (n_epoch, 'epoch')
    updater = training.StandardUpdater(train_iter, optimizer)
    trainer = training.Trainer(updater, stop_trigger, out='logs')

    # --- Extensions (registered in the same order as before) ----------------
    report_columns = [
        'epoch', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy',
    ]
    trainer.extend(E.Evaluator(test_iter, model))
    trainer.extend(E.dump_graph('main/loss'))
    # Snapshot only once, when the final epoch completes.
    trainer.extend(E.snapshot(), trigger=(n_epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport(report_columns))
    trainer.extend(E.ProgressBar(update_interval=2))

    # --- Go -----------------------------------------------------------------
    print("trainer.run() executed")
    print('')
    trainer.run()
Example #2
0
def _load_checkpoint_or_exit(module, checkpoint_path):
    """Strictly load `checkpoint_path` into `module`, exiting if it is missing.

    NOTE: torch.load unpickles the file; only point this at trusted
    checkpoints.
    """
    if not os.path.isfile(checkpoint_path):
        print("==> no checkpoint found at '{}'".format(checkpoint_path))
        # Non-zero status so calling scripts can detect the failure.
        raise SystemExit(1)
    print("==> loading checkpoint '{}'".format(checkpoint_path))
    module.load_state_dict(torch.load(checkpoint_path))
    print("==> loaded checkpoint '{}'".format(checkpoint_path))


def _load_matching_weights(module, checkpoint_path):
    """Copy into `module` only the checkpoint entries whose keys it owns.

    Entries of the checkpoint that `module` does not have are ignored, and
    parameters missing from the checkpoint keep their current values.

    NOTE: torch.load unpickles the file; only point this at trusted
    checkpoints.
    """
    print("==> loading checkpoint '{}'".format(checkpoint_path))
    checkpoint = torch.load(checkpoint_path)
    model_dict = module.state_dict()
    model_dict.update(
        {k: v for k, v in checkpoint.items() if k in model_dict})
    module.load_state_dict(model_dict)
    print("==> loaded checkpoint '{}'".format(checkpoint_path))


def main():
    """Extract test-set features for four modalities and score them.

    Builds the image, video, audio and text test loaders, instantiates the
    models on the GPU, restores the checkpoints configured on the parsed
    arguments, runs the ``extra*`` feature extractors and hands their results
    to ``compute_mAP``.

    Raises:
        SystemExit: with status 1 when ``args.snapshot`` or
            ``args.snapshotaudio`` is set but is not an existing file.
    """
    args = arg_parse()
    print_args(args)

    print("==> Creating dataloader...")

    data_dir = args.data_path
    test_list1 = './list/image/test.txt'
    test_loader1 = get_test_set(data_dir, test_list1, args)
    test_list2 = './list/video/large/test.txt'
    test_loader2 = get_test_set_v(data_dir, test_list2, args)
    test_list3 = './list/audio/test.txt'
    test_loader3 = get_test_set(data_dir, test_list3, args)
    test_list4 = './list/text/test.txt'
    test_loader4 = get_text_set(data_dir, test_list4, args, 'test')
    data_set1 = CubTextDataset('dataset', 'list/text/test.txt', 'test')
    test_loader5 = DataLoader(dataset=data_set1, batch_size=1, shuffle=False)

    out_feature_dir1 = os.path.join(args.feature, 'image')
    out_feature_dir2 = os.path.join(args.feature, 'video')
    out_feature_dir3 = os.path.join(args.feature, 'audio')
    out_feature_dir4 = os.path.join(args.feature, 'text')

    mkdir(out_feature_dir1)
    mkdir(out_feature_dir2)
    mkdir(out_feature_dir3)
    mkdir(out_feature_dir4)

    print("==> Loading the modelwork ...")
    # Shared backbone passed to every extractor below.
    model = resnet50(num_classes=200)
    model = model.cuda()
    if args.snapshot:
        _load_checkpoint_or_exit(model, args.snapshot)

    # Audio model. The status messages report the audio checkpoint path
    # (they previously printed args.snapshot by mistake).
    model_audio = VGG16BN(n_classes=200, pretrained=False).cuda()
    if args.snapshotaudio:
        _load_checkpoint_or_exit(model_audio, args.snapshotaudio)

    # Image model, optionally wrapped in DataParallel on device 0.
    model_img = BCNN_img(n_classes=200, pretrained=False).cuda()
    if args.gpu is not None:
        model_img = nn.DataParallel(model_img, device_ids=[0])
        cudnn.benchmark = True

    if args.snapshotimg:
        # Weights are restored into the bare module. Unwrap only when the
        # DataParallel wrapper was actually applied, otherwise `.module`
        # does not exist.
        if isinstance(model_img, nn.DataParallel):
            img_core = model_img.module
        else:
            img_core = model_img
        _load_matching_weights(img_core, args.snapshotimg)
    else:
        print("==> no checkpoint found at '{}'".format(args.snapshotimg))
    model_img.eval()

    # Text model: always restored from the bundled word2vec checkpoint.
    model_rnn = LSTMClassifier().cuda()
    _load_matching_weights(model_rnn,
                           './pretrained/rnnmodel_word2vec_39.375.pkl')

    # NOTE(review): model_video is instantiated but not used by any call
    # below; kept so GPU allocation and RNG consumption are unchanged.
    model_video = BCNN(n_classes=200, pretrained=False).cuda()

    model.eval()
    # NOTE(review): model_audio and model_rnn are not switched to eval mode
    # here - confirm whether extra_i / extra_t do that themselves.

    print("Text Features ...")
    txt = extra_t(model,
                  model_rnn,
                  test_loader4,
                  test_loader5,
                  out_feature_dir4,
                  args,
                  flag='t')
    print("Image Features ...")
    img = extra_i(model,
                  model_img,
                  test_loader1,
                  out_feature_dir1,
                  args,
                  flag='i')
    print("Video Features ...")
    vid = extra(model, test_loader2, out_feature_dir2, args, flag='v')
    print("Audio Features ...")
    aud = extra_i(model,
                  model_audio,
                  test_loader3,
                  out_feature_dir3,
                  args,
                  flag='a')

    compute_mAP(img, vid, aud, txt)
Example #3
0
    def __init__(self, options, path, ckpt_basename='vgg_16'):
        """Build the model, loss, optimizer, LR scheduler and CUB-200 loaders.

        Args:
            options, dict: Hyperparameters. Keys read here: 'base_lr',
                'weight_decay', 'lr_scheduler' and 'batch_size'.
            path, dict: Paths. 'cub200' is the dataset root; an optional
                'ckpt_path' names a checkpoint to resume from.
            ckpt_basename, str: Stored on the instance as `ckpt_basename`.

        Raises:
            ValueError: options['lr_scheduler'] is neither
                'reduce_on_plateau' nor 'fixed'.
        """
        print('Prepare the network and data.')
        self._options = options
        self._path = path
        self.ckpt_basename = ckpt_basename

        # ---- Network: only the feature extractor is wrapped for multi-GPU.
        self._net = BCNN(freeze_features=True)
        net = self._net
        net.features = torch.nn.DataParallel(net.features)
        net.cuda()

        # ---- Optional resume from an existing checkpoint.
        if 'ckpt_path' in self._path:
            ckpt_file = self._path['ckpt_path']
            if not os.path.exists(ckpt_file):
                print('Ckpt {} not found!'.format(ckpt_file))
            else:
                print('Continue from', ckpt_file)
                net.load_state_dict(torch.load(ckpt_file))
        print(net)

        # ---- Criterion.
        self._criterion = torch.nn.CrossEntropyLoss().cuda()

        # ---- Solver: optimizes the parameters of `fc` only.
        self._solver = torch.optim.SGD(
            net.fc.parameters(),
            lr=self._options['base_lr'],
            momentum=0.9,
            weight_decay=self._options['weight_decay'])

        # ---- Learning-rate schedule.
        schedulers = torch.optim.lr_scheduler
        scheduler_name = self._options['lr_scheduler']
        if scheduler_name == 'fixed':
            # Multiplicative factor of 1.0 keeps the learning rate constant.
            self._scheduler = schedulers.LambdaLR(
                self._solver, lambda epoch: 1.0)
        elif scheduler_name == 'reduce_on_plateau':
            self._scheduler = schedulers.ReduceLROnPlateau(
                self._solver,
                mode='max',
                factor=0.1,
                patience=5,
                verbose=True,
                threshold=1e-4,
                min_lr=1e-6)
        else:
            raise ValueError('Unknown scheduler:', scheduler_name)

        # ---- Transforms (ImageNet normalization shared by both pipelines).
        tf = torchvision.transforms
        normalize = tf.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        train_transforms = tf.Compose([
            tf.Resize(size=448),  # Let smaller edge match
            tf.RandomHorizontalFlip(),
            tf.RandomCrop(size=448),
            tf.ToTensor(),
            normalize,
        ])
        test_transforms = tf.Compose([
            tf.Resize(size=448),
            tf.CenterCrop(size=448),
            tf.ToTensor(),
            normalize,
        ])

        # ---- Datasets.
        data_root = self._path['cub200']
        train_data = cub200.CUB200(
            root=data_root,
            train=True,
            download=True,
            transform=train_transforms)
        test_data = cub200.CUB200(
            root=data_root,
            train=False,
            download=True,
            transform=test_transforms)

        # ---- Loaders: configurable batch size for training, 16 for testing.
        make_loader = torch.utils.data.DataLoader
        self._train_loader = make_loader(
            train_data,
            batch_size=self._options['batch_size'],
            shuffle=True,
            num_workers=4,
            pin_memory=True)
        self._test_loader = make_loader(
            test_data,
            batch_size=16,
            shuffle=False,
            num_workers=4,
            pin_memory=True)
Example #4
0
# Workaround for PIL's "IOError: image file is truncated": let PIL load
# partially written image files instead of raising.
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Fix the CPU and current-GPU random seeds so runs are repeatable.
torch.manual_seed(0)
torch.cuda.manual_seed(0)

if __name__ == '__main__':
    # Command-line interface: the dataset directory and the weights file are
    # both mandatory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--model', type=str, required=True)
    args = parser.parse_args()

    data_dir = args.data
    model_path = args.model

    net = BCNN(pretrained=False)

    # This script is GPU-only: wrap the model for all visible devices, or
    # abort when none is available.
    if torch.cuda.device_count() >= 1:
        net = torch.nn.DataParallel(net).cuda()
        print('cuda device : ', torch.cuda.device_count())
    else:
        raise EnvironmentError(
            'This is designed to run on GPU but no GPU is found')
    # The state dict is loaded into the DataParallel wrapper, so the
    # checkpoint presumably has "module."-prefixed keys - TODO confirm.
    net.load_state_dict(torch.load(model_path))

    test_transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=448),
        torchvision.transforms.CenterCrop(size=448),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                         std=(0.229, 0.224, 0.225))
Example #5
0
class BCNNTrainer(object):
    """Manager class to train bilinear CNN.

    Attributes:
        _options: Hyperparameters.
        _path: Useful paths.
        ckpt_basename: Prefix of the checkpoint file names written by train().
        _net: Bilinear CNN.
        _criterion: Cross-entropy loss.
        _solver: SGD with momentum.
        _scheduler: Either ReduceLROnPlateau (factor 0.1) or a constant
            LambdaLR, selected by options['lr_scheduler'].
        _train_loader: Training data.
        _test_loader: Testing data.
    """
    def __init__(self, options, path, ckpt_basename='vgg_16'):
        """Prepare the network, criterion, solver, and data.

        Args:
            options, dict: Hyperparameters. Keys read here: 'base_lr',
                'weight_decay', 'lr_scheduler' and 'batch_size'.
            path, dict: Useful paths. 'cub200' is the dataset root; an
                optional 'ckpt_path' names a checkpoint to resume from.
            ckpt_basename, str: Prefix for saved checkpoint file names.

        Raises:
            ValueError: options['lr_scheduler'] is neither
                'reduce_on_plateau' nor 'fixed'.
        """
        print('Prepare the network and data.')
        self._options = options
        self._path = path
        self.ckpt_basename = ckpt_basename
        # Network. Only the feature extractor is wrapped in DataParallel.
        self._net = BCNN(freeze_features=True)
        self._net.features = torch.nn.DataParallel(self._net.features)
        self._net.cuda()

        if 'ckpt_path' in self._path:
            if os.path.exists(self._path['ckpt_path']):
                print('Continue from', self._path['ckpt_path'])
                self._net.load_state_dict(torch.load(self._path['ckpt_path']))
            else:
                print('Ckpt {} not found!'.format(self._path['ckpt_path']))
        print(self._net)
        # Criterion.
        self._criterion = torch.nn.CrossEntropyLoss().cuda()
        # Solver: optimizes the parameters of `fc` only.
        self._solver = torch.optim.SGD(
            self._net.fc.parameters(),
            lr=self._options['base_lr'],
            momentum=0.9,
            weight_decay=self._options['weight_decay'])
        if self._options['lr_scheduler'] == 'reduce_on_plateau':
            # mode='max' because train() steps it with the test accuracy.
            self._scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                self._solver,
                mode='max',
                factor=0.1,
                patience=5,
                verbose=True,
                threshold=1e-4,
                min_lr=1e-6)
        elif self._options['lr_scheduler'] == 'fixed':
            # Constant multiplicative factor: the learning rate never changes.
            self._scheduler = torch.optim.lr_scheduler.LambdaLR(
                self._solver, lambda epoch: 1.0)
        else:
            # Single formatted message instead of a two-element args tuple.
            raise ValueError('Unknown scheduler: {}'.format(
                self._options['lr_scheduler']))

        # Imagenet normalization
        normalize = torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        train_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),  # Let smaller edge match
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.RandomCrop(size=448),
            torchvision.transforms.ToTensor(),
            normalize
        ])
        test_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),
            torchvision.transforms.CenterCrop(size=448),
            torchvision.transforms.ToTensor(), normalize
        ])
        train_data = cub200.CUB200(root=self._path['cub200'],
                                   train=True,
                                   download=True,
                                   transform=train_transforms)
        test_data = cub200.CUB200(root=self._path['cub200'],
                                  train=False,
                                  download=True,
                                  transform=test_transforms)
        self._train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=self._options['batch_size'],
            shuffle=True,
            num_workers=4,
            pin_memory=True)
        self._test_loader = torch.utils.data.DataLoader(test_data,
                                                        batch_size=16,
                                                        shuffle=False,
                                                        num_workers=4,
                                                        pin_memory=True)

    def train(self):
        """Train the network.

        Runs options['epochs'] epochs. After each epoch the test accuracy is
        measured, the scheduler is stepped, and when the accuracy beats the
        best so far the weights are saved under path['model'] both as
        '<ckpt_basename>_epoch_<n>.pth' and '<ckpt_basename>_epoch_best.pth'.
        """
        print('Training.')
        self._net.train()
        best_acc = 0.0
        best_epoch = None
        num_epochs = self._options['epochs']
        plateau_cls = torch.optim.lr_scheduler.ReduceLROnPlateau
        for epoch in range(num_epochs):
            epoch_loss = []
            num_correct = 0
            num_total = 0
            for batch_idx, (X, y) in enumerate(self._train_loader):
                # Data.
                X = X.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)

                # Clear the existing gradients.
                self._solver.zero_grad()
                # Forward pass.
                score = self._net(X)
                loss = self._criterion(score, y)
                loss_value = loss.item()
                epoch_loss.append(loss_value)
                # Prediction.
                _, prediction = torch.max(score.data, 1)
                num_total += y.size(0)
                num_correct += torch.sum(prediction == y.data).item()
                # Backward pass.
                loss.backward()
                self._solver.step()
                # Progress line, rewritten in place; epochs are shown 1-based
                # to match the per-epoch summary below.
                sys.stdout.write('\r')
                sys.stdout.write(
                    '| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                    % (epoch + 1, num_epochs, batch_idx + 1,
                       len(self._train_loader), loss_value,
                       (100. * num_correct) / num_total))
                sys.stdout.flush()
            # Guard against an empty training loader.
            train_acc = 100 * num_correct / num_total if num_total else 0.0
            mean_loss = np.mean(epoch_loss) if epoch_loss else float('nan')
            test_acc = self._accuracy(self._test_loader)
            print('\nEpoch\tTrain loss\tTrain acc\tTest acc')
            print('%d\t%4.3f\t\t%4.2f%%\t\t%4.2f%%' %
                  (epoch + 1, mean_loss, train_acc, test_acc))
            # Only ReduceLROnPlateau takes a metric; for other schedulers the
            # positional argument of step() is the (deprecated) epoch index.
            if isinstance(self._scheduler, plateau_cls):
                self._scheduler.step(test_acc)
            else:
                self._scheduler.step()
            if test_acc > best_acc:
                best_acc = test_acc
                best_epoch = epoch + 1
                print('*', end='')
                # Save model onto disk.
                save_path = os.path.join(
                    self._path['model'],
                    '{}_epoch_{}.pth'.format(self.ckpt_basename, epoch + 1))
                save_path_best = os.path.join(
                    self._path['model'],
                    '{}_epoch_best.pth'.format(self.ckpt_basename))
                torch.save(self._net.state_dict(), save_path)
                shutil.copy(save_path, save_path_best)
        if best_epoch is None:
            # No epoch ran, or none improved on the initial 0.0 accuracy.
            print('No epoch improved the test accuracy; nothing was saved.')
        else:
            print('Best at epoch %d, test accuaray %f' %
                  (best_epoch, best_acc))

    def _accuracy(self, data_loader):
        """Compute the train/test accuracy.

        The network is evaluated in eval mode and then put back into the
        mode it was in on entry, so calling this between training epochs
        does not leave the model in eval mode.

        Args:
            data_loader: Train/Test DataLoader.

        Returns:
            Train/Test accuracy in percentage; 0.0 when the loader yields
            no samples.
        """
        was_training = self._net.training
        self._net.eval()
        num_correct = 0
        num_total = 0
        try:
            with torch.no_grad():
                for X, y in data_loader:
                    # Data.
                    X = X.cuda(non_blocking=True)
                    y = y.cuda(non_blocking=True)
                    # Prediction.
                    score = self._net(X)
                    _, prediction = torch.max(score.data, 1)
                    num_total += y.size(0)
                    num_correct += torch.sum(prediction == y.data).item()
        finally:
            self._net.train(was_training)
        if num_total == 0:
            return 0.0
        return 100 * num_correct / num_total

    def getStat(self):
        """Get the mean and std value for a certain dataset.

        Prints, per channel, the average over training images of each
        image's mean and of each image's standard deviation (i.e. the std
        is averaged per image, not computed over the pooled pixels).
        """
        print('Compute mean and variance for training data.')
        train_data = cub200.CUB200(root=self._path['cub200'],
                                   train=True,
                                   transform=torchvision.transforms.ToTensor(),
                                   download=True)
        # batch_size=1 so each iteration contributes exactly one image.
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=4,
                                                   pin_memory=True)
        mean = torch.zeros(3)
        std = torch.zeros(3)
        for X, _ in tqdm(train_loader):
            for d in range(3):
                mean[d] += X[:, d, :, :].mean()
                std[d] += X[:, d, :, :].std()
        mean.div_(len(train_data))
        std.div_(len(train_data))
        print(mean)
        print(std)
Example #6
0
    def __init__(self, options, path):
        """Set up the distributed model, loss, optimizer, scheduler, loaders.

        Args:
            options, dict: Hyperparameters. Keys read here: 'dataset',
                'target', 'local_rank', 'base_lr', 'weight_decay' and
                'batch_size'.
            path, dict: Paths. 'dataset' is the dataset root; 'model' is
                read when options['target'] == 'all'.

        Raises:
            NotImplementedError: options['dataset'] is neither 'cub200'
                nor 'aircraft'.
        """
        print('Prepare the network and data.')
        self._options = options
        self._path = path

        # ---- Number of classes depends on the dataset.
        dataset_name = self._options['dataset']
        if dataset_name == 'cub200':
            num_classes = 200
        elif dataset_name == 'aircraft':
            num_classes = 100
        else:
            raise NotImplementedError("Dataset " + self._options['dataset'] +
                                      " is not implemented.")

        # ---- Network: pretrained weights for 'fc', stored weights for 'all'.
        use_pretrained = options['target'] == 'fc'
        self._net = BCNN(num_classes=num_classes, pretrained=use_pretrained)
        if options['target'] == 'all':
            stored_weights = torch.load(self._path['model'])
            self._net.load_state_dict(stored_weights)
        net_on_gpu = self._net.cuda()
        local_rank = self._options['local_rank']
        self._net = torch.nn.parallel.DistributedDataParallel(
            net_on_gpu,
            device_ids=[local_rank],
            output_device=local_rank)
        if dist.get_rank() == 0:
            print(self._net)

        # ---- Criterion.
        self._criterion = torch.nn.CrossEntropyLoss().cuda()

        # ---- Solver: base learning rate is multiplied by the world size.
        scaled_lr = self._options['base_lr'] * dist.get_world_size()
        self._solver = torch.optim.SGD(
            self._net.module.trainable_params,
            lr=scaled_lr,
            momentum=0.9,
            weight_decay=self._options['weight_decay'])
        # mode='max': the scheduler is meant to track a metric to maximize.
        self._scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self._solver,
            mode='max',
            factor=0.1,
            patience=3,
            verbose=True,
            threshold=1e-4)

        # ---- Transforms.
        tf = torchvision.transforms
        train_transforms = tf.Compose([
            tf.Resize(size=448),  # Let smaller edge match
            tf.RandomHorizontalFlip(),
            tf.RandomCrop(size=448),
            tf.ToTensor(),
            tf.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
        ])
        test_transforms = tf.Compose([
            tf.Resize(size=448),
            tf.CenterCrop(size=448),
            tf.ToTensor(),
            tf.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
        ])

        # ---- Datasets.
        if dataset_name == 'cub200':
            dataset_cls = cub200.CUB200
        elif dataset_name == 'aircraft':
            dataset_cls = aircraft.Aircraft
        else:
            raise NotImplementedError("Dataset " + self._options['dataset'] +
                                      " is not implemented.")
        data_root = self._path['dataset']
        train_data = dataset_cls(root=data_root,
                                 train=True,
                                 download=True,
                                 transform=train_transforms)
        test_data = dataset_cls(root=data_root,
                                train=False,
                                download=True,
                                transform=test_transforms)

        # ---- Samplers: partition each dataset among the workers.
        train_sampler = distributed.DistributedSampler(
            train_data,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank())
        test_sampler = distributed.DistributedSampler(
            test_data,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank())

        # ---- Loaders: ordering is delegated to the samplers.
        batch_size = self._options['batch_size']
        self._train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            sampler=train_sampler)
        self._test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=self._options['batch_size'],
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            sampler=test_sampler)
Example #7
0
class BCNNManager(object):
    """Manager class to train bilinear CNN.

    Attributes:
        _options: Hyperparameters.
        _path: Useful paths.
        _net: Bilinear CNN wrapped in DistributedDataParallel.
        _criterion: Cross-entropy loss.
        _solver: SGD with momentum.
        _scheduler: Reduce learning rate by a factor of 0.1 when plateau.
        _train_loader: Training data.
        _test_loader: Testing data.
    """
    def __init__(self, options, path):
        """Prepare the network, criterion, solver, and data.

        Args:
            options, dict: Hyperparameters. Keys read here: 'dataset',
                'target', 'local_rank', 'base_lr', 'weight_decay' and
                'batch_size'.
            path, dict: Useful paths. 'dataset' is the dataset root;
                'model' is read when options['target'] == 'all'.

        Raises:
            NotImplementedError: options['dataset'] is neither 'cub200'
                nor 'aircraft'.
        """
        print('Prepare the network and data.')
        self._options = options
        self._path = path
        # Network.
        if self._options['dataset'] == 'cub200':
            num_classes = 200
        elif self._options['dataset'] == 'aircraft':
            num_classes = 100
        else:
            raise NotImplementedError("Dataset " + self._options['dataset'] +
                                      " is not implemented.")
        self._net = BCNN(num_classes=num_classes,
                         pretrained=options['target'] == 'fc')
        # Load the model from disk.
        if options['target'] == 'all':
            self._net.load_state_dict(torch.load(self._path['model']))
        self._net = torch.nn.parallel.DistributedDataParallel(
            self._net.cuda(),
            device_ids=[self._options['local_rank']],
            output_device=self._options['local_rank'])
        if dist.get_rank() == 0:
            print(self._net)
        # Criterion.
        self._criterion = torch.nn.CrossEntropyLoss().cuda()
        # Solver. The base learning rate is multiplied by the world size.
        self._solver = torch.optim.SGD(
            self._net.module.trainable_params,
            lr=self._options['base_lr'] * dist.get_world_size(),
            momentum=0.9,
            weight_decay=self._options['weight_decay'])
        # mode='max' because train() steps it with the test accuracy.
        self._scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self._solver,
            mode='max',
            factor=0.1,
            patience=3,
            verbose=True,
            threshold=1e-4)

        train_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),  # Let smaller edge match
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.RandomCrop(size=448),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                             std=(0.229, 0.224, 0.225))
        ])
        test_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),
            torchvision.transforms.CenterCrop(size=448),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                             std=(0.229, 0.224, 0.225))
        ])
        if self._options['dataset'] == 'cub200':
            train_data = cub200.CUB200(root=self._path['dataset'],
                                       train=True,
                                       download=True,
                                       transform=train_transforms)
            test_data = cub200.CUB200(root=self._path['dataset'],
                                      train=False,
                                      download=True,
                                      transform=test_transforms)
        elif self._options['dataset'] == 'aircraft':
            train_data = aircraft.Aircraft(root=self._path['dataset'],
                                           train=True,
                                           download=True,
                                           transform=train_transforms)
            test_data = aircraft.Aircraft(root=self._path['dataset'],
                                          train=False,
                                          download=True,
                                          transform=test_transforms)
        else:
            raise NotImplementedError("Dataset " + self._options['dataset'] +
                                      " is not implemented.")
        # Partition dataset among workers using DistributedSampler
        train_sampler = distributed.DistributedSampler(
            train_data,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank())
        test_sampler = distributed.DistributedSampler(
            test_data,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank())

        # shuffle stays False: ordering is delegated to the samplers.
        self._train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=self._options['batch_size'],
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            sampler=train_sampler)
        self._test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=self._options['batch_size'],
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            sampler=test_sampler)

    def train(self):
        """Train the network.

        Every rank trains and evaluates; only rank 0 prints the per-epoch
        summary and saves a checkpoint under path['model_dir'] whenever the
        test accuracy improves. The reported train accuracy covers the
        samples seen by this rank only.
        """
        best_acc = 0.0
        best_epoch = None
        if dist.get_rank() == 0:
            print('Training.')
            print('Epoch\tTrain loss\tTrain acc\tTest acc\tTrain time')
        for t in range(self._options['epochs']):
            t0 = time.time()
            # Lets the distributed sampler use a different order per epoch.
            self._train_loader.sampler.set_epoch(t)
            epoch_loss = []
            num_correct = 0
            num_total = 0
            for X, y in self._train_loader:
                # Data. `non_blocking` replaces the old `async` keyword,
                # which is a reserved word since Python 3.7.
                X = X.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)

                # Clear the existing gradients.
                self._solver.zero_grad()
                # Forward pass.
                score = self._net(X)
                loss = self._criterion(score, y)
                epoch_loss.append(loss.item())
                # Prediction.
                _, prediction = torch.max(score.data, 1)
                num_total += y.size(0)
                # .item() keeps the counter a plain Python int.
                num_correct += torch.sum(prediction == y.data).item()
                # Backward pass.
                loss.backward()
                self._solver.step()

            test_acc = self._accuracy(self._test_loader)
            self._scheduler.step(test_acc)

            if dist.get_rank() == 0:
                # Guard against an empty local shard.
                train_acc = (100 * num_correct / num_total
                             if num_total else 0.0)
                mean_loss = (sum(epoch_loss) / len(epoch_loss)
                             if epoch_loss else float('nan'))
                if test_acc > best_acc:
                    best_acc = test_acc
                    best_epoch = t + 1
                    print('*', end='')
                    # Save model onto disk (unwrapped module weights).
                    torch.save(
                        self._net.module.state_dict(),
                        os.path.join(self._path['model_dir'],
                                     'vgg_16_epoch_%d.pth' % (t + 1)))
                print('%d\t%4.3f\t\t%4.2f%%\t\t%4.2f%%\t\t%4.2fs' %
                      (t + 1, mean_loss, train_acc,
                       test_acc, time.time() - t0))

        if dist.get_rank() == 0:
            if best_epoch is None:
                # No epoch ran, or none improved on the initial 0.0.
                print('No epoch improved the test accuracy; '
                      'nothing was saved.')
            else:
                print('Best at epoch %d, test accuaray %f' %
                      (best_epoch, best_acc))

    def _accuracy(self, data_loader):
        """Compute the train/test accuracy.

        Counts are summed across all ranks with all_reduce, so every rank
        returns the same value. The network is left in training mode.

        Args:
            data_loader: Train/Test DataLoader.

        Returns:
            Train/Test accuracy in percentage; 0.0 when no samples were
            seen on any rank.
        """
        self._net.train(False)
        num_correct = 0
        num_total = 0
        with torch.no_grad():
            for X, y in data_loader:
                # Data.
                X = X.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)

                # Prediction.
                score = self._net(X)
                _, prediction = torch.max(score.data, 1)
                num_total += y.size(0)
                num_correct += torch.sum(prediction == y.data).item()
        self._net.train(True)  # Set the model to training phase
        # Aggregate the per-rank counters.
        num_total = torch.tensor(num_total).cuda()
        num_correct = torch.tensor(num_correct).cuda()
        dist.all_reduce(num_total, op=dist.ReduceOp.SUM)
        dist.all_reduce(num_correct, op=dist.ReduceOp.SUM)
        total = num_total.item()
        if total == 0:
            return 0.0
        return 100 * num_correct.item() / total

    def getStat(self):
        """Get the mean and std value for a certain dataset.

        Prints, per channel, the average over training images of each
        image's mean and of each image's standard deviation.
        """
        print('Compute mean and variance for training data.')
        # NOTE(review): this reads path['cub200'] while __init__ reads
        # path['dataset'] - confirm which key callers provide.
        train_data = cub200.CUB200(root=self._path['cub200'],
                                   train=True,
                                   transform=torchvision.transforms.ToTensor(),
                                   download=True)
        # batch_size=1 so each iteration contributes exactly one image.
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=4,
                                                   pin_memory=True)
        mean = torch.zeros(3)
        std = torch.zeros(3)
        for X, _ in train_loader:
            for d in range(3):
                mean[d] += X[:, d, :, :].mean()
                std[d] += X[:, d, :, :].std()
        mean.div_(len(train_data))
        std.div_(len(train_data))
        print(mean)
        print(std)
Example #8
0
def main():
    """Train a BCNN for one step of a two-step schedule, checkpointing each epoch.

    Configuration is not passed in; the function reads names from the
    enclosing module scope (``step``, ``resume``, ``drop_rate``, ``warm_up``,
    ``queue_size``, ``learning_rate``, ``weight_decay``, ``num_classes``,
    ``num_train_images``, ``num_epochs``, ``logfile``, ``train_loader`` and
    ``test_loader``).

    * ``step == 1``: builds ``BCNN(pretrained=True)`` and optimizes only the
      ``fc`` parameters.
    * ``step == 2``: builds ``BCNN(pretrained=False)`` and optimizes all
      parameters; unless resuming, the weights are first loaded from
      ``model/bnn_step1_vgg16_best_epoch.pth``.

    When ``resume`` is truthy, the model, optimizer, best-accuracy bookkeeping
    and the ``Correcter`` records are restored from ``checkpoint.pth``.  The
    LR scheduler's state is not part of the checkpoint, so it always starts
    fresh.

    Every epoch trains, evaluates, optionally steps the scheduler (after
    warm-up only), saves the best weights, rewrites ``checkpoint.pth`` and
    appends a line to ``logfile``.

    Raises:
        AssertionError: if ``step`` is neither 1 nor 2, if resuming without an
            existing ``checkpoint.pth``, or if the checkpoint was written for
            a different step.
    """
    # step = args.step
    print('===> About training in a two-step process! ===')
    print('------\n' 'drop rate: [{}]\t' '\n------'.format(drop_rate))

    # step 1: only train the fc layer
    if step == 1:
        print('===> Step 1 ...')
        bnn = BCNN(pretrained=True, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.module.fc.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    # step 2: train the whole network
    elif step == 2:
        print('===> Step 2 ...')
        bnn = BCNN(pretrained=False, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    else:
        raise AssertionError('Wrong step argument')

    correcter = self_correcter.Correcter(num_train_images, num_classes,
                                         queue_size)

    loadmodel = 'checkpoint.pth'

    # check if it is resume mode
    print(
        '-----------------------------------------------------------------------------'
    )
    if resume:
        # Raised explicitly (rather than via ``assert``) so the checks still
        # run under ``python -O``; the exception type is unchanged.
        if not os.path.isfile(loadmodel):
            raise AssertionError('please make sure checkpoint.pth exists')
        print('---> loading checkpoint.pth <---')
        checkpoint = torch.load(loadmodel)
        if step != checkpoint['step']:
            raise AssertionError(
                'step in checkpoint does not match step in argument')
        start_epoch = checkpoint['epoch']
        best_accuracy = checkpoint['best_accuracy']
        best_epoch = checkpoint['best_epoch']
        bnn.load_state_dict(checkpoint['bnn_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        correcter.all_predictions = checkpoint['all_predictions']
        correcter.softmax_record = checkpoint['softmax_record']
        correcter.update_counters = checkpoint['update_counters']
    else:
        if step == 2:
            # Step 2 starts from the best weights written by step 1.
            print('--->        step2 checkpoint loaded         <---')
            bnn.load_state_dict(
                torch.load('model/bnn_step1_vgg16_best_epoch.pth'))
        else:
            print('--->        no checkpoint loaded         <---')

        start_epoch = 0
        best_accuracy = 0.0
        best_epoch = None

    print(
        '-----------------------------------------------------------------------------'
    )

    with open(logfile, "a") as f:
        f.write('------ Step: {} ...\n'.format(step))
        f.write('------\n'
                'drop rate: [{}]\tqueue_size: [{}]\t'
                'warm_up: [{}]\tinit_lr: [{}]\t'
                '\n'.format(drop_rate, queue_size, warm_up, learning_rate))

    # mode='max' because the monitored quantity is test accuracy.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='max',
                                                     factor=0.5,
                                                     patience=4,
                                                     verbose=True,
                                                     threshold=learning_rate *
                                                     1e-3)

    for epoch in range(start_epoch, num_epochs):
        epoch_start_time = time.time()

        bnn.train()

        # The first ``warm_up`` epochs skip the clean/unclean split and the
        # scheduler step.
        warm = epoch < warm_up

        if not warm:
            correcter.separate_clean_and_unclean_keys(drop_rate)
            # Runtime message is Chinese for "number of clean samples:".
            print("干净的样本数:", len(correcter.clean_key))

        train_acc, train_total = train(train_loader,
                                       epoch,
                                       bnn,
                                       optimizer,
                                       warm,
                                       correcter=correcter)

        test_acc = evaluate(test_loader, bnn)
        if not warm:
            scheduler.step(test_acc)

        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_epoch = epoch + 1
            torch.save(bnn.state_dict(),
                       'model/bnn_step{}_vgg16_best_epoch.pth'.format(step))

        epoch_end_time = time.time()
        print("all_predictions", len(correcter.all_predictions[0]))
        print("update_counters", correcter.update_counters[0])
        # 'epoch' stores the NEXT epoch index so a resumed run continues
        # after the epoch that just finished.
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'bnn_state_dict': bnn.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_epoch': best_epoch,
                'best_accuracy': best_accuracy,
                'step': step,
                'all_predictions': correcter.all_predictions,
                'softmax_record': correcter.softmax_record,
                'update_counters': correcter.update_counters
            },
            filename=loadmodel)

        epoch_runtime = epoch_end_time - epoch_start_time
        print('------\n'
              'Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t'
              'Test Accuracy: [{:6.2f}]\t'
              'Epoch Runtime: [{:6.2f}]\t'
              '\n------'.format(epoch + 1, num_epochs, train_acc, test_acc,
                                epoch_runtime))
        with open(logfile, "a") as f:
            # NOTE(review): during warm-up ``correcter.clean_key`` has not been
            # refreshed by this loop; presumably Correcter initialises it --
            # confirm against self_correcter.
            output = 'Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t' \
                     'Test Accuracy: [{:6.2f}]\t' \
                     'Epoch Runtime: [{:7.2f}]\tTrain_total[{:06d}]\tclean_key[{:06d}]'.format(
                epoch + 1, num_epochs, train_acc, test_acc,
                epoch_runtime, train_total, len(correcter.clean_key))
            f.write(output + "\n")

    # ``best_epoch`` is still None when no epoch improved on the initial
    # best_accuracy of 0.0 (e.g. the loop ran zero epochs); '{:03d}' cannot
    # format None, so render a placeholder instead of crashing at the end.
    best_epoch_text = 'N/A' if best_epoch is None else '{:03d}'.format(
        best_epoch)
    print('******\n'
          'Best Accuracy 1: [{0:6.2f}], at Epoch [{1}] '
          '\n******'.format(best_accuracy, best_epoch_text))
    with open(logfile, "a") as f:
        output = '******\n' \
                 'Best Accuracy 1: [{0:6.2f}], at Epoch [{1}]; ' \
                 '\n******'.format(best_accuracy, best_epoch_text)
        f.write(output + "\n")
Example #9
0
def main():
    """Train a BCNN for one step of a two-step schedule, checkpointing each epoch.

    Configuration is not passed in; the function reads names from the
    enclosing module scope (``step``, ``resume``, ``drop_rate``, ``T_k``,
    ``start``, ``learning_rate``, ``weight_decay``, ``num_classes``,
    ``num_epochs``, ``logfile``, ``train_loader`` and ``test_loader``).

    * ``step == 1``: builds ``BCNN(pretrained=True)`` and optimizes only the
      ``fc`` parameters.
    * ``step == 2``: builds ``BCNN(pretrained=False)`` and optimizes all
      parameters; unless resuming, the weights are first loaded from
      ``model/bnn_step1_vgg16_best_epoch.pth``.

    When ``resume`` is truthy, the model, optimizer, best-accuracy bookkeeping
    and the ``Cross_entropy`` / ``logits_softmax`` records are restored from
    ``checkpoint.pth``; otherwise both records start as empty lists.

    Every epoch adjusts the learning rate, trains (``train`` receives and
    returns the two records), evaluates, saves the best weights, rewrites
    ``checkpoint.pth`` and appends a line to ``logfile``.

    Raises:
        AssertionError: if ``step`` is neither 1 nor 2, if resuming without an
            existing ``checkpoint.pth``, or if the checkpoint was written for
            a different step.
    """
    # step = args.step
    print('===> About training in a two-step process! ===')
    print('------\n'
          'drop rate: [{}]\tT_k: [{}]\t'
          'start epoch: [{}]\t'
          '\n------'.format(drop_rate, T_k, start))
    # step 1: only train the fc layer
    if step == 1:
        print('===> Step 1 ...')
        bnn = BCNN(pretrained=True, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.module.fc.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    # step 2: train the whole network
    elif step == 2:
        print('===> Step 2 ...')
        bnn = BCNN(pretrained=False, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    else:
        raise AssertionError('Wrong step argument')

    loadmodel = 'checkpoint.pth'
    # check if it is resume mode
    print(
        '-----------------------------------------------------------------------------'
    )
    if resume:
        # Raised explicitly (rather than via ``assert``) so the checks still
        # run under ``python -O``; the exception type is unchanged.
        if not os.path.isfile(loadmodel):
            raise AssertionError('please make sure checkpoint.pth exists')
        print('---> loading checkpoint.pth <---')
        checkpoint = torch.load(loadmodel)
        if step != checkpoint['step']:
            raise AssertionError(
                'step in checkpoint does not match step in argument')
        start_epoch = checkpoint['epoch']
        best_accuracy = checkpoint['best_accuracy']
        best_epoch = checkpoint['best_epoch']
        bnn.load_state_dict(checkpoint['bnn_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        Cross_entropy = checkpoint['Cross_entropy']
        logits_softmax = checkpoint['logits_softmax']
    else:
        if step == 2:
            # Step 2 starts from the best weights written by step 1.
            print('--->        step2 checkpoint loaded         <---')
            bnn.load_state_dict(
                torch.load('model/bnn_step1_vgg16_best_epoch.pth'))
        else:
            print('--->        no checkpoint loaded         <---')
        Cross_entropy = []
        logits_softmax = []
        start_epoch = 0
        best_accuracy = 0.0
        best_epoch = None
    print(
        '-----------------------------------------------------------------------------'
    )

    with open(logfile, "a") as f:
        f.write('------ Step: {} ...\n'.format(step))
        # Trailing newline added: f.write() does not append one, so without it
        # the first epoch record was glued onto the '------' line.
        f.write('------\n'
                'drop rate: [{}]\tT_k: [{}]\t'
                'start epoch: [{}]\t'
                '\n------\n'.format(drop_rate, T_k, start))

    for epoch in range(start_epoch, num_epochs):
        epoch_start_time = time.time()

        bnn.train()
        adjust_learning_rate(optimizer, epoch)

        # train returns 'Cross_entropy', used in saving checkpoints.
        train_acc, logits_softmax, Cross_entropy = train(
            train_loader, epoch, bnn, optimizer, logits_softmax, Cross_entropy)

        # dump the output: cross_entropy, image path, image label, image id. If you want to check the selection result, just use the code below.
        # if len(Cross_entropy) > 0:
        #     pickle.dump(Cross_entropy, open(cross_entropy_savapath + 'crossentropy_epoch{}_step{}.pkl'.format(epoch + 1,step), 'wb'))

        test_acc = evaluate(test_loader, bnn)

        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_epoch = epoch + 1
            torch.save(bnn.state_dict(),
                       'model/bnn_step{}_vgg16_best_epoch.pth'.format(step))

        epoch_end_time = time.time()
        # save checkpoint; 'epoch' stores the NEXT epoch index so a resumed
        # run continues after the epoch that just finished.
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'bnn_state_dict': bnn.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_epoch': best_epoch,
                'best_accuracy': best_accuracy,
                'step': step,
                'Cross_entropy': Cross_entropy,
                'logits_softmax': logits_softmax
            },
            filename=loadmodel)

        epoch_runtime = epoch_end_time - epoch_start_time
        print('------\n'
              'Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t'
              'Test Accuracy: [{:6.2f}]\t'
              'Epoch Runtime: [{:6.2f}]\t'
              '\n------'.format(epoch + 1, num_epochs, train_acc, test_acc,
                                epoch_runtime))
        with open(logfile, "a") as f:
            output = 'Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t' \
                     'Test Accuracy: [{:6.2f}]\t' \
                     'Epoch Runtime: [{:6.2f}]\t'.format(
                epoch + 1, num_epochs, train_acc, test_acc, epoch_runtime)
            f.write(output + "\n")

    # ``best_epoch`` is still None when no epoch improved on the initial
    # best_accuracy of 0.0 (e.g. the loop ran zero epochs); '{:03d}' cannot
    # format None, so render a placeholder instead of crashing at the end.
    best_epoch_text = 'N/A' if best_epoch is None else '{:03d}'.format(
        best_epoch)
    print('******\n'
          'Best Accuracy 1: [{0:6.2f}], at Epoch [{1}] '
          '\n******'.format(best_accuracy, best_epoch_text))
    with open(logfile, "a") as f:
        output = '******\n' \
                 'Best Accuracy 1: [{0:6.2f}], at Epoch [{1}]; ' \
                 '\n******'.format(best_accuracy, best_epoch_text)
        f.write(output + "\n")