def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(train):
        return DataLoader(create_dataset(opt, train),
                          batch_size=opt.batch_size,
                          shuffle=train,
                          num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    f, params, stats = resnet(opt.depth, opt.width, num_classes)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD(params.values(), lr, 0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)
    print('\nAdditional buffers:')
    print_tensor_dict(stats)

    n_parameters = sum(p.numel() for p in params.values())
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = Variable(cast(sample[0], opt.dtype))
        targets = Variable(cast(sample[1], 'long'))
        y = data_parallel(f, inputs, params, stats, sample[2],
                          list(range(opt.ngpu)))
        return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(
            dict(params={k: v.data
                         for k, v in params.items()},
                 stats=stats,
                 optimizer=state['optimizer'].state_dict(),
                 epoch=t['epoch']),
            open(os.path.join(opt.save, 'model.pt7'), 'wb'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data,
                     torch.LongTensor(state['sample'][1]))
        meter_loss.add(state['loss'].data[0])

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        print(
            log(
                {
                    "train_loss": train_loss[0],
                    "train_acc": train_acc[0],
                    "test_loss": meter_loss.value()[0],
                    "test_acc": test_acc,
                    "epoch": state['epoch'],
                    "num_classes": num_classes,
                    "n_parameters": n_parameters,
                    "train_time": train_time,
                    "test_time": timer_test.value(),
                }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' % \
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
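# The snippets in this listing assume a module-level `parser` plus helpers such as
# `resnet`, `create_dataset`, `cast`, `data_parallel` and `print_tensor_dict` defined
# elsewhere in the repository. A minimal sketch of the assumed argparse setup: the flag
# names are inferred from the `opt.*` attributes used above, but the defaults shown here
# are assumptions, not the original definitions.
import argparse

parser = argparse.ArgumentParser(description='Wide-ResNet training (sketch of assumed options)')
parser.add_argument('--depth', default=16, type=int)
parser.add_argument('--width', default=1, type=float)
parser.add_argument('--dataset', default='CIFAR10', type=str)
parser.add_argument('--batch_size', default=128, type=int)
parser.add_argument('--nthread', default=4, type=int)
parser.add_argument('--lr', default=0.1, type=float)
parser.add_argument('--weight_decay', default=0.0005, type=float)
parser.add_argument('--epochs', default=200, type=int)
parser.add_argument('--epoch_step', default='[60,120,160]', type=str)
parser.add_argument('--lr_decay_ratio', default=0.2, type=float)
parser.add_argument('--resume', default='', type=str)
parser.add_argument('--save', default='logs/resnet', type=str)
parser.add_argument('--ngpu', default=1, type=int)
parser.add_argument('--gpu_id', default='0', type=str)
parser.add_argument('--dtype', default='float', type=str)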
def main():
    st = time.time()
    opt = parser.parse_args()
    epoch_step = json.loads(opt.epoch_step)
    print('parsed options:', vars(opt))

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    f_s, params_s = define_student(opt.depth, opt.width)
    f_t, params_t = define_teacher(opt.teacher_params)
    params = {'student.'+k: v for k, v in params_s.items()}
    params.update({'teacher.'+k: v for k, v in params_t.items()})

    params = OrderedDict((k, p.cuda().detach().requires_grad_(p.requires_grad)) for k, p in params.items())

    optimizable = [v for v in params.values() if v.requires_grad]
    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD(optimizable, lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    iter_train = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, True)
    iter_test = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, False)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)


    n_parameters = sum(p.numel() for p in optimizable)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for i in range(4)]

    def f(inputs, params, mode):
        y_s, g_s = f_s(inputs, params, mode, 'student.')
        with torch.no_grad():
            y_t, g_t = f_t(inputs, params, 'teacher.')
        return y_s, y_t, [utils.at_loss(x, y) for x, y in zip(g_s, g_t)]

    def h(sample):
        inputs, targets, mode = sample
        inputs = inputs.cuda().detach()
        targets = targets.cuda().long().detach()
        y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))
        loss_groups = [v.sum() for v in loss_groups]
        [m.add(v.item()) for m,v in zip(meters_at, loss_groups)]
        return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
                + opt.beta * sum(loss_groups), y_s

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy(); z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        loss = state['loss'].item()
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        [meter.reset() for meter in meters_at]
        state['iterator'] = tqdm(iter_train, dynamic_ncols=True)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, iter_test)

        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc,
            "test_loss": meter_loss.value()[0],
            "test_acc": classacc.value(),
            "epoch": state['epoch'],
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
            "at_losses": [m.value() for m in meters_at],
           }, state))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, iter_train, opt.epochs, optimizer)

    print("total time: {}".format(time.time()-st))
Example #3
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    log_step = 1
    assert opt.subset_size in [100, 500, 1000, -1], \
        'subset size should be 100, 500, 1000 or -1'
    assert opt.subset_id in [1, 2, 3, 4, 5, -1], \
        'subset id should be 1-5 or -1'
    if opt.subset_size in [100, 500, 1000]:
        log_step = 10000 // opt.subset_size

    torch.manual_seed(opt.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        shuffle = mode and (opt.subset_size == -1 or opt.subset_id == -1)
        sampler = None
        if mode and not shuffle:
            ind = np.loadtxt('subsets/subset_' + str(opt.subset_size) + '_' +
                             str(opt.subset_id) + '.txt',
                             dtype=np.int64)
            sampler = SubsetRandomSampler(ind)

        return DataLoader(create_dataset(opt, mode),
                          opt.batch_size,
                          sampler=sampler,
                          shuffle=shuffle,
                          num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    kwargs = {}
    if opt.level is not None:
        kwargs.update({'level': opt.level})
    f, params = resnet(opt.depth, opt.width, num_classes, opt.dropout,
                       **kwargs)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params.values() if v.requires_grad],
                   lr,
                   momentum=0.9,
                   weight_decay=opt.weight_decay,
                   nesterov=opt.nesterov)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            if k in params_tensors:
                v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in params.values() if p.requires_grad)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        y = data_parallel(f, inputs, params, sample[2],
                          list(range(opt.ngpu))).float()
        return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(
            dict(params={
                k: v
                for k, v in params.items() if k.find('dct') == -1
            },
                 epoch=t['epoch'],
                 optimizer=state['optimizer'].state_dict()),
            os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy()
        z.update(t)
        with open(os.path.join(opt.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        if state['epoch'] % log_step == 0:
            train_loss = meter_loss.value()
            train_acc = classacc.value()
            train_time = timer_train.value()
            meter_loss.reset()
            classacc.reset()
            timer_test.reset()

            with torch.no_grad():
                engine.test(h, test_loader)

            test_acc = classacc.value()[0]
            print(
                log(
                    {
                        "train_loss": train_loss[0],
                        "train_acc": train_acc[0],
                        "test_loss": meter_loss.value()[0],
                        "test_acc": test_acc,
                        "epoch": state['epoch'],
                        "num_classes": num_classes,
                        "n_parameters": n_parameters,
                        "train_time": train_time,
                        "test_time": timer_test.value(),
                    }, state))
            print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
                  (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
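# The subset variant above reads index files such as 'subsets/subset_500_1.txt' with
# np.loadtxt. A minimal sketch of how such a file could be produced; the file naming and
# per-class balancing are assumptions, only the plain-text list of integer indices is
# implied by the loading code.
import numpy as np

def write_subset_indices(targets, subset_size, subset_id, num_classes=10, seed=0):
    """Sample `subset_size` training indices (balanced across classes) and save them."""
    rng = np.random.RandomState(seed + subset_id)
    targets = np.asarray(targets)
    per_class = subset_size // num_classes
    idxs = np.concatenate([
        rng.choice(np.where(targets == c)[0], per_class, replace=False)
        for c in range(num_classes)
    ])
    np.savetxt('subsets/subset_%d_%d.txt' % (subset_size, subset_id), idxs, fmt='%d')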
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  ###multiple gpu
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode),
                          opt.batch_size,
                          shuffle=mode,
                          num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    # deal with student first
    f_s, params_s = resnet(opt.depth, opt.width, num_classes)

    # deal with teacher
    if opt.teacher_id:
        with open(os.path.join('logs', opt.teacher_id, 'log.txt'), 'r') as ff:
            line = ff.readline()
            r = line.find('json_stats')
            info = json.loads(line[r + 12:])
        f_t = resnet(info['depth'], info['width'], num_classes)[0]
        model_data = torch.load(
            os.path.join('logs', opt.teacher_id, 'model.pt7'))
        params_t = model_data['params']

        # merge teacher and student params
        params = {'student.' + k: v for k, v in params_s.items()}
        for k, v in params_t.items():
            if not (k.startswith("teacher")):
                k = k.replace("student.", "")
                params['teacher.' + k] = v.detach().requires_grad_(False)

        def f(inputs, params, mode):
            y_s, g_s = f_s(inputs, params, mode, 'student.')
            with torch.no_grad():
                y_t, g_t = f_t(inputs, params, False, 'teacher.')
            return y_s, y_t, [utils.at_loss(x, y) for x, y in zip(g_s, g_t)]
    else:
        f, params = f_s, params_s

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD((v for v in params.values() if v.requires_grad),
                   lr,
                   momentum=0.9,
                   weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in list(params_s.values()))
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for i in range(3)]

    opt.save = opt.save + "_" + opt.dataset + "_epochs_" + str(opt.epochs)
    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    writer = SummaryWriter(opt.save)

    def h(sample):
        inputs = utils.cast(sample[0], opt.dtype).detach()
        targets = utils.cast(sample[1], 'long')
        if opt.teacher_id != '':
            y_s, y_t, loss_groups = utils.data_parallel(
                f, inputs, params, sample[2], range(opt.ngpu))
            loss_groups = [v.sum() for v in loss_groups]
            [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
            return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
                   + opt.beta * sum(loss_groups), y_s
        else:
            y = utils.data_parallel(f, inputs, params, sample[2],
                                    range(opt.ngpu))[0]
            return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(
            dict(params={k: v.data
                         for k, v in params.items()},
                 optimizer=state['optimizer'].state_dict(),
                 epoch=t['epoch']),
            os.path.join(opt.save, 'model.pt7'))  # save the model checkpoint to the given path
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])  # append the train/eval flag to the sample

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(state['loss'].item())

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        [meter.reset() for meter in meters_at]
        state['iterator'] = tqdm(train_loader)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.mean
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)  # evaluate on the test set

        test_acc = classacc.value()[0]

        writer.add_scalar('loss/train', train_loss, state['epoch'])
        writer.add_scalar('acc/train', train_acc[0], state['epoch'])
        writer.add_scalar('loss/test', meter_loss.mean, state['epoch'])
        writer.add_scalar('acc/test', test_acc, state['epoch'])

        print(
            log(
                {
                    "train_loss": train_loss,
                    "train_acc": train_acc[0],
                    "test_loss": meter_loss.mean,
                    "test_acc": test_acc,
                    "epoch": state['epoch'],
                    "num_classes": num_classes,
                    "n_parameters": n_parameters,
                    "train_time": train_time,
                    "test_time": timer_test.value(),
                    "at_losses": [m.value() for m in meters_at],
                }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' % \
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)

    writer.close()
Example #5
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100

    torch.manual_seed(opt.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode), opt.batch_size, shuffle=mode,
                          num_workers=opt.nthread, pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)
     
    if opt.activation_dropout:
        print('[*********] Using activation dropout')
        
    f, params = resnet(opt.depth, opt.width, num_classes, opt.dropout_prob, opt.activation_dropout)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params.values() if v.requires_grad], lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in params.values() if p.requires_grad)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        y = data_parallel(f, inputs, params, sample[2], list(range(opt.ngpu))).float()
        return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params=params, epoch=t['epoch'], optimizer=state['optimizer'].state_dict()),
                   os.path.join(opt.save, 'model.pt7'))
        z = {**vars(opt), **t}
        with open(os.path.join(opt.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        with torch.no_grad():
            engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc[0],
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
Example #6
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100

    torch.manual_seed(opt.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode), opt.batch_size, shuffle=mode,
                          num_workers=opt.nthread, pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    f_1, params_1 = resnet(opt.depth, opt.width, num_classes)
    f_2, params_2 = resnet(opt.depth, opt.width, num_classes)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params_1.values() if v.requires_grad] + [v for v in params_2.values() if v.requires_grad], lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        raise NotImplementedError

    print('\nParameters:')
    print_tensor_dict(params_1)
    print_tensor_dict(params_2)

    n_parameters = sum([p.numel() for p in params_1.values() if p.requires_grad] + [p.numel() for p in params_2.values() if p.requires_grad])
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    classacc_ep1 = tnt.meter.ClassErrorMeter(accuracy=True)
    classacc_ep2 = tnt.meter.ClassErrorMeter(accuracy=True)

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        global _outputs, _loss

        connection_map = np.array([
            [0,0,0, 1,1,1],
            [0,0,0, 1,1,1],
            [0,0,0, 1,1,1],

            [1,1,1, 0,0,0],
            [1,1,1, 0,0,0],
            [1,1,1, 0,0,0]])

        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        net1_outputs = data_parallel(f_1, inputs, params_1, sample[2], list(range(opt.ngpu)))
        net2_outputs = data_parallel(f_2, inputs, params_2, sample[2], list(range(opt.ngpu)))
        net1_outputs = [o.float() for o in net1_outputs]
        net2_outputs = [o.float() for o in net2_outputs]

        _loss = []

        # hard supervision
        for i, o in enumerate(net1_outputs):
            _loss.append(F.cross_entropy(o, targets))

        for i, o in enumerate(net2_outputs):
            _loss.append(F.cross_entropy(o, targets))

        outputs = net1_outputs + net2_outputs
        # soft supervision
        for i, o in enumerate(outputs):
            for j, o2 in enumerate(outputs):
                if connection_map[i,j] > 0:
                    _loss.append(KL_divergence(o2.detach(),o))

        loss = sum(_loss)
        _outputs = net1_outputs

        return loss, net1_outputs[-1]

    def log(t, state):
        torch.save(dict(params=params_1, epoch=t['epoch'], optimizer=state['optimizer'].state_dict()),
                   os.path.join(opt.save, 'model.pt7'))
        z = {**vars(opt), **t}
        with open(os.path.join(opt.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        classacc.add(state['output'].data, state['sample'][1])
        classacc_ep1.add(_outputs[0].data, state['sample'][1])
        classacc_ep2.add(_outputs[1].data, state['sample'][1])
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        classacc_ep1.reset()
        classacc_ep2.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        train_acc_ep1 = classacc_ep1.value()
        train_acc_ep2 = classacc_ep2.value()

        meter_loss.reset()
        classacc.reset()
        timer_test.reset()
        classacc_ep1.reset()
        classacc_ep2.reset()

        with torch.no_grad():
            engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        test_acc_ep1 = classacc_ep1.value()[0]
        test_acc_ep2 = classacc_ep2.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc[0],
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "train_acc_ep1": train_acc_ep1[0],
            "train_acc_ep2": train_acc_ep2[0],
            "test_acc_ep1": test_acc_ep1,
            "test_acc_ep2": test_acc_ep2,

            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
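# The mutual-supervision example above calls a helper `KL_divergence(o2.detach(), o)`
# that is defined elsewhere. A sketch of the usual definition for this kind of mutual
# learning loss; the argument order follows the call site (detached "teacher" logits
# first, then the logits being trained), and the exact implementation is an assumption.
import torch.nn.functional as F

def KL_divergence_sketch(teacher_logits, student_logits):
    # KL(p_teacher || p_student) computed from raw logits; gradients flow only to the student
    return F.kl_div(F.log_softmax(student_logits, dim=1),
                    F.softmax(teacher_logits, dim=1),
                    reduction='batchmean')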
Example #7
def main():
    args = parser.parse_args()
    print('parsed options:', vars(args))
    epoch_step = json.loads(args.epoch_step)

    check_manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    ds = check_dataset(args.dataset, args.dataroot, args.augment,
                       args.download)

    if args.dataset == "awa2":
        image_shape, num_classes, train_dataset, test_dataset, all_labels = ds
        all_labels = all_labels.to("cuda:0")
    else:
        image_shape, num_classes, train_dataset, test_dataset = ds
        all_labels = torch.eye(num_classes).to("cuda:0")

    if args.ssl:
        num_labelled = args.num_labelled
        num_unlabelled = len(train_dataset) - num_labelled
        if args.dataset == "awa2":
            labelled_set, unlabelled_set = data.random_split(
                train_dataset, [num_labelled, num_unlabelled])
        else:
            td_targets = train_dataset.targets if args.dataset == "cifar10" else train_dataset.labels
            labelled_idxs, unlabelled_idxs = x_u_split(td_targets,
                                                       num_labelled,
                                                       num_classes)
            labelled_set, unlabelled_set = [
                Subset(train_dataset, labelled_idxs),
                Subset(train_dataset, unlabelled_idxs)
            ]
        labelled_set = data.ConcatDataset(
            [labelled_set for i in range(num_unlabelled // num_labelled + 1)])
        labelled_set, _ = data.random_split(
            labelled_set, [num_unlabelled,
                           len(labelled_set) - num_unlabelled])

        train_dataset = Joint(labelled_set, unlabelled_set)

    def _init_fn(worker_id):
        np.random.seed(args.seed)

    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.n_workers,
                                   worker_init_fn=_init_fn)

    test_loader = data.DataLoader(test_dataset,
                                  batch_size=args.eval_batch_size,
                                  shuffle=False,
                                  num_workers=args.n_workers,
                                  worker_init_fn=_init_fn)

    model, params = resnet(args.depth, args.width, num_classes, image_shape[0])

    if args.lp:
        num_flow_classes = num_classes if not num_classes % 2 else num_classes + 1
        prior_y = MultivariateNormal(
            torch.zeros(num_flow_classes).to("cuda:0"),
            torch.eye(num_flow_classes).to("cuda:0"))
        num_flows = 3
        flows = [
            NSF_CL(dim=num_flow_classes, K=8, B=3, hidden_dim=16)
            for _ in range(num_flows)
        ]
        convs = [
            Invertible1x1Conv(dim=num_flow_classes) for i in range(num_flows)
        ]
        flows = list(itertools.chain(*zip(convs, flows)))
        model_y = NormalizingFlowModel(prior_y, flows,
                                       num_flow_classes).to("cuda:0")
        optimizer_y = Adam(model_y.parameters(), lr=1e-3, weight_decay=1e-5)

    def create_optimizer(args, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params.values() if v.requires_grad],
                   lr,
                   momentum=0.9,
                   weight_decay=args.weight_decay)

    optimizer = create_optimizer(args, args.lr)

    epoch = 0

    print('\nParameters:')
    print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in params.values() if p.requires_grad)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    if args.dataset == "awa2":
        classacc = tnt.meter.AverageValueMeter()
    else:
        classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(args.save):
        os.mkdir(args.save)

    global counter
    counter = 0

    def compute_loss(sample):
        if not args.ssl:
            inputs = cast(sample[0], args.dtype)
            targets = cast(sample[1], 'long')
            y = data_parallel(model, inputs, params, sample[2],
                              list(range(args.ngpu))).float()
            if args.dataset == "awa2":
                return F.binary_cross_entropy_with_logits(y,
                                                          targets.float()), y
            else:
                return F.cross_entropy(y, targets), y
        else:
            global counter
            l = sample[0]
            u = sample[1]
            inputs_l = cast(l[0], args.dtype)
            targets_l = cast(l[1], 'long')
            inputs_u = cast(u[0], args.dtype)
            y_l = data_parallel(model, inputs_l, params, sample[2],
                                list(range(args.ngpu))).float()
            y_u = data_parallel(model, inputs_u, params, sample[2],
                                list(range(args.ngpu))).float()
            if args.dataset == "awa2":
                loss = F.binary_cross_entropy_with_logits(
                    y_l, targets_l.float())
            else:
                loss = F.cross_entropy(y_l, targets_l)

            if args.min_entropy:
                if args.dataset == "awa2":
                    labels_pred = F.sigmoid(y_u)
                    entropy = -torch.sum(labels_pred * torch.log(labels_pred),
                                         dim=1)
                else:
                    labels_pred = F.softmax(y_u, dim=1)
                    entropy = -torch.sum(labels_pred * torch.log(labels_pred),
                                         dim=1)
                if counter >= 10:
                    loss_entropy = args.unl_weight * torch.mean(entropy)
                    loss += loss_entropy

            elif args.semantic_loss:
                if args.dataset == "awa2":
                    labels_pred = F.sigmoid(y_u)
                else:
                    labels_pred = F.softmax(y_u, dim=1)
                part1 = torch.stack([
                    labels_pred**all_labels[i]
                    for i in range(all_labels.shape[0])
                ])
                part2 = torch.stack([(1 - labels_pred)**(1 - all_labels[i])
                                     for i in range(all_labels.shape[0])])
                sem_loss = -torch.log(
                    torch.sum(torch.prod(part1 * part2, dim=2), dim=0))
                if counter >= 10:
                    semantic_loss = args.unl_weight * torch.mean(sem_loss)
                    loss += semantic_loss

            elif args.lp:
                model_y.eval()
                if args.dataset == "awa2":
                    labels_pred = F.sigmoid(y_u)
                else:
                    labels_pred = F.softmax(y_u, dim=1)
                if num_classes % 2:
                    labels_pred = torch.cat(
                        (labels_pred, torch.zeros(
                            (labels_pred.shape[0], 1)).to("cuda:0")),
                        dim=1)
                _, nll_ypred = model_y(labels_pred)
                if counter >= 10:
                    loss_nll_ypred = args.unl_weight * torch.mean(nll_ypred)
                    loss += loss_nll_ypred

                model_y.train()
                optimizer_y.zero_grad()
                if args.dataset == "awa2":
                    a = targets_l.float() * 120. + (1 - targets_l.float()) * 1.1
                    b = (1 - targets_l.float()) * 120. + targets_l.float() * 1.1
                    beta_targets = Beta(a, b).rsample()
                    if num_classes % 2:
                        beta_targets = torch.cat(
                            (beta_targets,
                             torch.zeros(
                                 (beta_targets.shape[0], 1)).to("cuda:0")),
                            dim=1)
                    zs, nll_y = model_y(beta_targets)
                else:
                    one_hot_targets = F.one_hot(targets_l, num_classes).float()
                    one_hot_targets = one_hot_targets * 120 + (
                        1 - one_hot_targets) * 1.1
                    dirichlet_targets = torch.stack(
                        [Dirichlet(i).sample() for i in one_hot_targets])
                    zs, nll_y = model_y(dirichlet_targets)
                loss_nll_y = torch.mean(nll_y)
                loss_nll_y.backward()
                optimizer_y.step()
            return loss, y_l

    def compute_loss_test(sample):
        inputs = cast(sample[0], args.dtype)
        targets = cast(sample[1], 'long')
        y = data_parallel(model, inputs, params, sample[2],
                          list(range(args.ngpu))).float()
        if args.dataset == "awa2":
            return F.binary_cross_entropy_with_logits(y, targets.float()), y
        else:
            return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(
            dict(params=params,
                 epoch=t['epoch'],
                 optimizer=state['optimizer'].state_dict()),
            os.path.join(args.save, 'model.pt7'))
        z = {**vars(args), **t}
        with open(os.path.join(args.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        if args.dataset == "awa2":
            if not args.ssl or not state['train']:
                acc = calculate_accuracy(F.sigmoid(state['output'].data),
                                         state['sample'][1])
            else:
                acc = calculate_accuracy(F.sigmoid(state['output'].data),
                                         state['sample'][0][1])
            classacc.add(acc)
        else:
            if not args.ssl or not state['train']:
                classacc.add(state['output'].data, state['sample'][1])
            else:
                classacc.add(state['output'].data, state['sample'][0][1])
        meter_loss.add(loss)

        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(args,
                                                  lr * args.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()[0]
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        with torch.no_grad():
            engine.test(compute_loss_test, test_loader)

        test_acc = classacc.value()[0]
        print(
            log(
                {
                    "train_loss": train_loss[0],
                    "train_acc": train_acc,
                    "test_loss": meter_loss.value()[0],
                    "test_acc": test_acc,
                    "epoch": state['epoch'],
                    "num_classes": num_classes,
                    "n_parameters": n_parameters,
                    "train_time": train_time,
                    "test_time": timer_test.value(),
                }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (args.save, state['epoch'], args.epochs, test_acc))

        global counter
        counter += 1

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(compute_loss, train_loader, args.epochs, optimizer)
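# `Joint(labelled_set, unlabelled_set)` above is a dataset wrapper defined elsewhere; each
# item must be a pair (labelled_sample, unlabelled_sample), because compute_loss unpacks
# sample[0] and sample[1] that way. A minimal sketch of such a wrapper, inferred from how
# its items are consumed (the original class may differ).
from torch.utils import data

class JointSketch(data.Dataset):
    def __init__(self, labelled, unlabelled):
        # the calling code arranges both sets to have equal length before wrapping them
        assert len(labelled) == len(unlabelled)
        self.labelled = labelled
        self.unlabelled = unlabelled

    def __len__(self):
        return len(self.labelled)

    def __getitem__(self, i):
        # returns ((x_l, y_l), (x_u, y_u)) so the training closure can use both streams
        return self.labelled[i], self.unlabelled[i]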
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(train):
        return DataLoader(create_dataset(opt, train), batch_size=opt.batch_size, shuffle=train,
                          num_workers=opt.nthread, pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    f, params, stats = resnet(opt.depth, opt.width, num_classes)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD(params.values(), lr, 0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)
    print('\nAdditional buffers:')
    print_tensor_dict(stats)

    n_parameters = sum(p.numel() for p in params.values())
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = Variable(cast(sample[0], opt.dtype))
        targets = Variable(cast(sample[1], 'long'))
        y = data_parallel(f, inputs, params, stats, sample[2], list(range(opt.ngpu)))
        return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        stats=stats,
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   open(os.path.join(opt.save, 'model.pt7'), 'wb'))
        z = vars(opt).copy(); z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data, torch.LongTensor(state['sample'][1]))
        meter_loss.add(state['loss'].data[0])

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc[0],
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' % \
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
Example #9
def main():
    st = time.time()
    opt = parser.parse_args()
    epoch_step = json.loads(opt.epoch_step)
    print('parsed options:', vars(opt))

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    f_s, params_s = define_student(opt.depth, opt.width)

    if opt.teacher_id:
        assert opt.teacher_id == "resnet34"
        f_t, params_t = define_teacher(opt.teacher_params)
        params = {'student.'+k: v for k, v in params_s.items()}
        params.update({'teacher.'+k: v for k, v in params_t.items()})
        def f(inputs, params, mode):
            y_s, g_s = f_s(inputs, params, mode, 'student.')
            with torch.no_grad():
                y_t, g_t = f_t(inputs, params, 'teacher.')
            return y_s, y_t, [utils.at_loss(x, y) for x, y in zip(g_s, g_t)]
    else:
        f, params = f_s, params_s

    params = OrderedDict((k, p.cuda().detach().requires_grad_(p.requires_grad)) for k, p in params.items())

    optimizable = [v for v in params.values() if v.requires_grad]
    def create_optimizer(opt, lr):
        # print('creating optimizer with lr = ', lr)
        return SGD(optimizable, lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    iter_train = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, True)
    iter_test = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, False)
    # train_size = len(iter_train.dataset)
    # test_size = len(iter_test.dataset)
    # steps_per_epoch = round(train_size / opt.batch_size)
    # total_steps = opt.epochs * steps_per_epoch
    # print("train size: {}, test size: {}, steps per epoch: {}, total steps: {}".format(train_size, test_size, steps_per_epoch, total_steps))

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in optimizable)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for i in range(4)]

    if opt.teacher_id != '':
        classacc_t = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
        t_test_acc_top1, t_test_acc_top5 = [], []
        with torch.no_grad():
            for i, (inputs, targets) in enumerate(iter_test):
                inputs = inputs.cuda().detach()
                targets = targets.cuda().long().detach()
                y_t, _ = f_t(inputs, params, 'teacher.')
                classacc_t.add(y_t, targets)
                t_test_acc_top1.append(classacc_t.value()[0])
                t_test_acc_top5.append(classacc_t.value()[1])
                classacc_t.reset()
        print("teacher top1 test acc: {}, teacher top5 test acc: {}".format(np.mean(t_test_acc_top1), np.mean(t_test_acc_top5)))

    def h(sample):
        inputs, targets, mode = sample
        inputs = inputs.cuda().detach()
        targets = targets.cuda().long().detach()
        if opt.teacher_id != '':
            if opt.kt_method == "at":
                y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))
                loss_groups = [v.sum() for v in loss_groups]
                [m.add(v.item()) for m,v in zip(meters_at, loss_groups)]
                return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) + opt.beta * sum(loss_groups), y_s
            elif opt.kt_method == "st":
                y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, sample[2], range(opt.ngpu))
                return torch.sqrt(torch.mean((y_s - y_t) ** 2)), y_s
        else:
            y = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))[0]
            return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy(); z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])
        # if state['sample'][2]:
        #     curr_lr = 0.5 * opt.lr * (1 + np.cos(np.pi * state['t'] / total_steps))
        #     state['optimizer'] = create_optimizer(opt, curr_lr)

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        loss = state['loss'].item()
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        [meter.reset() for meter in meters_at]
        state['iterator'] = tqdm(iter_train, dynamic_ncols=True)

        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, iter_test)
        test_acc = classacc.value()
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc,
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
            "at_losses": [m.value() for m in meters_at],
            "kt_method": opt.kt_method,
            "curr_lr": state['optimizer'].param_groups[0]['lr'],
        }, state))
        print('==> id: %s (%d/%d), test_top1_acc: \33[91m%.2f\033[0m, test_top5_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc[0], test_acc[1]))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, iter_train, opt.epochs, optimizer)
    print("total time: {}".format(time.time()-st))
Example #10
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    if torch.cuda.is_available():
        # to prevent opencv from initializing CUDA in workers
        torch.randn(8).cuda()
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    def create_iterator(mode):
        ds = create_dataset(opt, mode)
        return ds.parallel(batch_size=opt.batchSize,
                           shuffle=mode,
                           num_workers=opt.nthread,
                           pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    f, vectors, scalars, stats = resnet(opt.depth, opt.width, num_classes)
    params = vectors.copy()
    params.update(scalars)

    def create_optimizer(opt):
        print('creating optimizer with lr = ', opt.lr)
        return NDAdam([{
            'params': scalars.values(),
            'weight_decay': opt.weightDecay
        }, {
            'params': vectors.values(),
            'vec_axes': [1, 2, 3]
        }],
                      lr=opt.lr,
                      betas=(0.9, 0.99))

    optimizer = create_optimizer(opt)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)
    print('\nAdditional buffers:')
    print_tensor_dict(stats)

    n_parameters = sum(p.numel() for p in params.values())
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = Variable(cast(sample[0], opt.dtype))
        targets = Variable(cast(sample[1], 'long'))
        y = data_parallel(f, inputs, params, stats, sample[2],
                          tuple(range(opt.ngpu)))
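        # logit regularization: penalize the mean squared L2 norm of the
        # logits, 0.5 * E[||y||^2]; it enters the loss scaled by opt.logitDecay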
        logit_loss = 0.5 * torch.mean(torch.sum(y * y, 1))
        return F.cross_entropy(y, targets) + opt.logitDecay * logit_loss, y

    def log(t, state):
        torch.save(
            dict(params={k: v.data
                         for k, v in params.items()},
                 stats=stats,
                 optimizer=state['optimizer'].state_dict(),
                 epoch=t['epoch']),
            open(os.path.join(opt.save, 'model.pt7'), 'wb'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data,
                     torch.LongTensor(state['sample'][1]))
        meter_loss.add(state['loss'].item())

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader)

        epoch = state['epoch']
        for group in optimizer.param_groups:
            group['lr'] = opt.lr * 0.5 * (
                1 + math.cos(math.pi * float(epoch) / opt.epochs))

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        print(
            log(
                {
                    "train_loss": train_loss[0],
                    "train_acc": train_acc[0],
                    "test_loss": meter_loss.value()[0],
                    "test_acc": test_acc,
                    "epoch": state['epoch'],
                    "num_classes": num_classes,
                    "n_parameters": n_parameters,
                    "train_time": train_time,
                    "test_time": timer_test.value(),
                }, state))
        print('==> id: %s (%d/%d), test_acc: \033[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
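
# --- Hedged sketch (added for illustration; not part of the example above) ---
# on_start_epoch anneals the learning rate once per epoch with a cosine
# schedule, lr_e = lr * 0.5 * (1 + cos(pi * e / epochs)). A standalone
# illustration of the same formula (the helper name is ours, not the example's):
import math

def cosine_lr(base_lr, epoch, total_epochs):
    # decays smoothly from base_lr at epoch 0 towards 0 at total_epochs
    return base_lr * 0.5 * (1 + math.cos(math.pi * float(epoch) / total_epochs))

# e.g. cosine_lr(0.1, 0, 200) -> 0.1, cosine_lr(0.1, 100, 200) -> 0.05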
Example #11
def main():
    st_total = time.time()
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode),
                          opt.batch_size,
                          shuffle=mode,
                          num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)
    train_size = len(train_loader.dataset)
    test_size = len(test_loader.dataset)
    steps_per_epoch = round(train_size / opt.batch_size)
    total_steps = opt.epochs * steps_per_epoch
    print(
        "train size: {}, test size: {}, steps per epoch: {}, total steps: {}".
        format(train_size, test_size, steps_per_epoch, total_steps))

    # deal with student first
    f_s, params_s = resnet(opt.depth, opt.width, num_classes)
    print(type(f_s), type(params_s))

    # deal with teacher
    if opt.teacher_id:
        with open(os.path.join('logs', opt.teacher_id, 'log.txt'), 'r') as ff:
            line = ff.readline()
            r = line.find('json_stats')
            info = json.loads(line[r + 12:])
        f_t, _ = resnet(info['depth'], info['width'], num_classes)
        model_data = torch.load(
            os.path.join('logs', opt.teacher_id, 'model.pt7'))
        params_t = model_data['params']

        # merge teacher and student params
        params = {'student.' + k: v for k, v in params_s.items()}
        for k, v in params_t.items():
            params['teacher.' + k] = v.detach().requires_grad_(False)

        if opt.kt_method == "at":

            def f(inputs, params, mode):
                y_s, g_s = f_s(inputs, params, mode, 'student.')
                with torch.no_grad():
                    y_t, g_t = f_t(inputs, params, False, 'teacher.')
                return y_s, y_t, [
                    utils.at_loss(x, y) for x, y in zip(g_s, g_t)
                ]
        elif opt.kt_method == "st":

            def f(inputs, params, mode):
                y_s, g_s = f_s(inputs, params, mode, 'student.')
                with torch.no_grad():
                    y_t, g_t = f_t(inputs, params, False, 'teacher.')
                return y_s, y_t, [
                    utils.at_loss(x, y) for x, y in zip(g_s, g_t)
                ]
        else:
            raise EOFError("Not found kt method.")

    else:
        f, params = f_s, params_s

    def create_optimizer(opt, lr):
        # print('creating optimizer with lr = ', lr)
        return SGD((v for v in params.values() if v.requires_grad),
                   lr,
                   momentum=0.9,
                   weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in params_s.values())
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for _ in range(3)]

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = utils.cast(sample[0], opt.dtype).detach()
        targets = utils.cast(sample[1], 'long')
        if opt.teacher_id != '':
            if opt.kt_method == "at":
                y_s, y_t, loss_groups = utils.data_parallel(
                    f, inputs, params, sample[2], range(opt.ngpu))
                loss_groups = [v.sum() for v in loss_groups]
                for m, v in zip(meters_at, loss_groups):
                    m.add(v.item())
                return utils.distillation(
                    y_s, y_t, targets, opt.temperature,
                    opt.alpha) + opt.beta * sum(loss_groups), y_s
            elif opt.kt_method == "st":
                y_s, y_t, loss_groups = utils.data_parallel(
                    f, inputs, params, sample[2], range(opt.ngpu))
                return torch.sqrt(torch.mean((y_s - y_t)**2)), y_s
        else:
            y = utils.data_parallel(f, inputs, params, sample[2],
                                    range(opt.ngpu))[0]
            return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(
            dict(params={k: v.data
                         for k, v in params.items()},
                 optimizer=state['optimizer'].state_dict(),
                 epoch=t['epoch']), os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

        if state['sample'][2]:
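            # during training (sample[2] is the train flag appended just above),
            # set the learning rate from a per-iteration cosine schedule:
            #   lr_t = 0.5 * lr * (1 + cos(pi * t / total_steps))
            # note that create_optimizer builds a fresh SGD each step, so
            # momentum buffers are discarded every iteration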
            curr_lr = 0.5 * opt.lr * (1 +
                                      np.cos(np.pi * state['t'] / total_steps))
            state['optimizer'] = create_optimizer(opt, curr_lr)
        # print(len(state['sample']), state['sample'][0].size(), state['sample'][1].size(), state['sample'][2])

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(state['loss'].item())

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        for meter in meters_at:
            meter.reset()
        state['iterator'] = tqdm(train_loader)

        # epoch = state['epoch'] + 1
        # if epoch in epoch_step:
        #     lr = state['optimizer'].param_groups[0]['lr']
        #     state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.mean
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()
        print(
            log(
                {
                    "train_loss": train_loss,
                    "train_acc": train_acc,
                    "test_loss": meter_loss.mean,
                    "test_acc": test_acc,
                    "epoch": state['epoch'],
                    "num_classes": num_classes,
                    "n_parameters": n_parameters,
                    "train_time": train_time,
                    "test_time": timer_test.value(),
                    "at_losses": [m.value() for m in meters_at],
                    "kt_method": opt.kt_method,
                    "curr_lr": state['optimizer'].param_groups[0]['lr'],
                }, state))
        print(
            '==> id: %s (%d/%d), test_top1_acc: \033[91m%.2f\033[0m, test_top5_acc: \033[91m%.2f\033[0m'
            % (opt.save, state['epoch'], opt.epochs, test_acc[0], test_acc[1]))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)

    print("total time (h): {}".format((time.time() - st_total) / 3600.))