def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(train):
        return DataLoader(create_dataset(opt, train), batch_size=opt.batch_size,
                          shuffle=train, num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    f, params, stats = resnet(opt.depth, opt.width, num_classes)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD(params.values(), lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)
    print('\nAdditional buffers:')
    print_tensor_dict(stats)

    n_parameters = sum(p.numel() for p in params.values())
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = Variable(cast(sample[0], opt.dtype))
        targets = Variable(cast(sample[1], 'long'))
        y = data_parallel(f, inputs, params, stats, sample[2],
                          list(range(opt.ngpu)))
        return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        stats=stats,
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   open(os.path.join(opt.save, 'model.pt7'), 'wb'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data, torch.LongTensor(state['sample'][1]))
        meter_loss.add(state['loss'].data[0])

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc[0],
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
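# ---------------------------------------------------------------------------
# For reference: a minimal sketch of the `print_tensor_dict` helper these
# scripts call. This is an assumption about the utility, not its verbatim
# source -- it just lists each tensor's index, name, shape and dtype.
def print_tensor_dict(params):
    kmax = max(len(key) for key in params)
    for i, (key, v) in enumerate(params.items()):
        # index, padded name, shape, dtype
        print(str(i).ljust(5), key.ljust(kmax + 3),
              str(tuple(v.shape)).ljust(23), v.dtype)
# ---------------------------------------------------------------------------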
def main():
    st = time.time()
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    f_s, params_s = define_student(opt.depth, opt.width)
    f_t, params_t = define_teacher(opt.teacher_params)

    # merge teacher and student params; only the student stays trainable
    params = {'student.' + k: v for k, v in params_s.items()}
    params.update({'teacher.' + k: v for k, v in params_t.items()})
    params = OrderedDict((k, p.cuda().detach().requires_grad_(p.requires_grad))
                         for k, p in params.items())
    optimizable = [v for v in params.values() if v.requires_grad]

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD(optimizable, lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    iter_train = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, True)
    iter_test = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, False)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in optimizable)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for i in range(4)]

    def f(inputs, params, mode):
        y_s, g_s = f_s(inputs, params, mode, 'student.')
        with torch.no_grad():
            y_t, g_t = f_t(inputs, params, 'teacher.')
        return y_s, y_t, [utils.at_loss(x, y) for x, y in zip(g_s, g_t)]

    def h(sample):
        inputs, targets, mode = sample
        inputs = inputs.cuda().detach()
        targets = targets.cuda().long().detach()
        y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, mode,
                                                    range(opt.ngpu))
        loss_groups = [v.sum() for v in loss_groups]
        [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
        return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
            + opt.beta * sum(loss_groups), y_s

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        loss = state['loss'].item()
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        [meter.reset() for meter in meters_at]
        state['iterator'] = tqdm(iter_train, dynamic_ncols=True)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, iter_test)

        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc,
            "test_loss": meter_loss.value()[0],
            "test_acc": classacc.value(),
            "epoch": state['epoch'],
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
            "at_losses": [m.value() for m in meters_at],
        }, state))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, iter_train, opt.epochs, optimizer)

    print("total time: {}".format(time.time() - st))
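# ---------------------------------------------------------------------------
# `utils.at_loss` above compares student/teacher activation groups. A minimal
# sketch assuming the standard attention-transfer formulation (L2 distance
# between normalized spatial attention maps); the shipped utility may differ
# in details such as the normalization.
import torch.nn.functional as F

def at(x):
    # attention map: channel-wise mean of squared activations, flattened and
    # L2-normalized per sample
    return F.normalize(x.pow(2).mean(1).view(x.size(0), -1))

def at_loss(x, y):
    # mean squared distance between the two attention maps
    return (at(x) - at(y)).pow(2).mean()
# ---------------------------------------------------------------------------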
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    log_step = 1
    assert opt.subset_size in [100, 500, 1000, -1], \
        'subset size should be 100, 500, 1000 or -1'
    assert opt.subset_id in [1, 2, 3, 4, 5, -1], 'subset id should be 1-5 or -1'
    if opt.subset_size in [100, 500, 1000]:
        log_step = 10000 // opt.subset_size
    torch.manual_seed(opt.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        shuffle = mode and (opt.subset_size == -1 or opt.subset_id == -1)
        sampler = None
        if mode and not shuffle:
            ind = np.loadtxt('subsets/subset_' + str(opt.subset_size) + '_' +
                             str(opt.subset_id) + '.txt', dtype=np.int64)
            sampler = SubsetRandomSampler(ind)
        return DataLoader(create_dataset(opt, mode), opt.batch_size,
                          sampler=sampler, shuffle=shuffle,
                          num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    kwargs = {}
    if opt.level is not None:
        kwargs.update({'level': opt.level})
    f, params = resnet(opt.depth, opt.width, num_classes, opt.dropout, **kwargs)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params.values() if v.requires_grad], lr,
                   momentum=0.9, weight_decay=opt.weight_decay,
                   nesterov=opt.nesterov)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            if k in params_tensors:
                v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in params.values() if p.requires_grad)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        y = data_parallel(f, inputs, params, sample[2],
                          list(range(opt.ngpu))).float()
        return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params={k: v for k, v in params.items()
                                if k.find('dct') == -1},
                        epoch=t['epoch'],
                        optimizer=state['optimizer'].state_dict()),
                   os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy()
        z.update(t)
        with open(os.path.join(opt.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        if state['epoch'] % log_step == 0:
            train_loss = meter_loss.value()
            train_acc = classacc.value()
            train_time = timer_train.value()
            meter_loss.reset()
            classacc.reset()
            timer_test.reset()

            with torch.no_grad():
                engine.test(h, test_loader)

            test_acc = classacc.value()[0]
            print(log({
                "train_loss": train_loss[0],
                "train_acc": train_acc[0],
                "test_loss": meter_loss.value()[0],
                "test_acc": test_acc,
                "epoch": state['epoch'],
                "num_classes": num_classes,
                "n_parameters": n_parameters,
                "train_time": train_time,
                "test_time": timer_test.value(),
            }, state))
            print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
                  (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
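# ---------------------------------------------------------------------------
# The loader above reads index files such as 'subsets/subset_500_3.txt' via
# np.loadtxt. A hypothetical sketch of how such a file could be generated
# (class-balanced random indices); the real files may have been built
# differently -- this only matches the format the loader expects.
import numpy as np

def make_subset_file(targets, subset_size, subset_id, num_classes=10):
    rng = np.random.RandomState(subset_id)  # hypothetical seeding scheme
    per_class = subset_size // num_classes
    idx = []
    for c in range(num_classes):
        cls_idx = np.where(np.asarray(targets) == c)[0]
        idx.extend(rng.choice(cls_idx, per_class, replace=False))
    np.savetxt('subsets/subset_%d_%d.txt' % (subset_size, subset_id),
               np.sort(idx), fmt='%d')
# ---------------------------------------------------------------------------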
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # multiple GPUs
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode), opt.batch_size,
                          shuffle=mode, num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    # deal with student first
    f_s, params_s = resnet(opt.depth, opt.width, num_classes)

    # deal with teacher
    if opt.teacher_id:
        with open(os.path.join('logs', opt.teacher_id, 'log.txt'), 'r') as ff:
            line = ff.readline()
            r = line.find('json_stats')
            info = json.loads(line[r + 12:])
        f_t = resnet(info['depth'], info['width'], num_classes)[0]
        model_data = torch.load(os.path.join('logs', opt.teacher_id, 'model.pt7'))
        params_t = model_data['params']

        # merge teacher and student params
        params = {'student.' + k: v for k, v in params_s.items()}
        for k, v in params_t.items():
            if not k.startswith("teacher"):
                k = k.replace("student.", "")
                params['teacher.' + k] = v.detach().requires_grad_(False)

        def f(inputs, params, mode):
            y_s, g_s = f_s(inputs, params, mode, 'student.')
            with torch.no_grad():
                y_t, g_t = f_t(inputs, params, False, 'teacher.')
            return y_s, y_t, [utils.at_loss(x, y) for x, y in zip(g_s, g_t)]
    else:
        f, params = f_s, params_s

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD((v for v in params.values() if v.requires_grad), lr,
                   momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in list(params_s.values()))
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for i in range(3)]

    opt.save = opt.save + "_" + opt.dataset + "_epochs_" + str(opt.epochs)
    if not os.path.exists(opt.save):
        os.mkdir(opt.save)
    writer = SummaryWriter(opt.save)

    def h(sample):
        inputs = utils.cast(sample[0], opt.dtype).detach()
        targets = utils.cast(sample[1], 'long')
        if opt.teacher_id != '':
            y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params,
                                                        sample[2],
                                                        range(opt.ngpu))
            loss_groups = [v.sum() for v in loss_groups]
            [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
            return utils.distillation(y_s, y_t, targets, opt.temperature,
                                      opt.alpha) \
                + opt.beta * sum(loss_groups), y_s
        else:
            y = utils.data_parallel(f, inputs, params, sample[2],
                                    range(opt.ngpu))[0]
            return F.cross_entropy(y, targets), y

    def log(t, state):
        # save the model to the configured path
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        # append the train/test flag to the sample
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(state['loss'].item())

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        [meter.reset() for meter in meters_at]
        state['iterator'] = tqdm(train_loader)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.mean
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        writer.add_scalar('loss/train', train_loss, state['epoch'])
        writer.add_scalar('acc/train', train_acc[0], state['epoch'])
        writer.add_scalar('loss/test', meter_loss.mean, state['epoch'])
        writer.add_scalar('acc/test', test_acc, state['epoch'])
        print(log({
            "train_loss": train_loss,
            "train_acc": train_acc[0],
            "test_loss": meter_loss.mean,
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
            "at_losses": [m.value() for m in meters_at],
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
    writer.close()
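# ---------------------------------------------------------------------------
# `utils.distillation` above mixes a softened teacher term with hard-label
# cross-entropy. A sketch of the standard Hinton-style loss consistent with
# the call sites (y_s, y_t, targets, T, alpha); the exact scaling used by
# `utils` may differ.
import torch.nn.functional as F

def distillation(y, teacher_scores, labels, T, alpha):
    # KL between temperature-softened distributions; the T*T factor keeps
    # gradient magnitudes comparable across temperatures
    soft = F.kl_div(F.log_softmax(y / T, dim=1),
                    F.softmax(teacher_scores / T, dim=1),
                    reduction='batchmean') * (T * T * alpha)
    hard = F.cross_entropy(y, labels) * (1. - alpha)
    return soft + hard
# ---------------------------------------------------------------------------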
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    torch.manual_seed(opt.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode), opt.batch_size,
                          shuffle=mode, num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    if opt.activation_dropout:
        print('[*********] Using activation dropout')
    f, params = resnet(opt.depth, opt.width, num_classes, opt.dropout_prob,
                       opt.activation_dropout)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params.values() if v.requires_grad], lr,
                   momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in params.values() if p.requires_grad)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        y = data_parallel(f, inputs, params, sample[2],
                          list(range(opt.ngpu))).float()
        return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params=params, epoch=t['epoch'],
                        optimizer=state['optimizer'].state_dict()),
                   os.path.join(opt.save, 'model.pt7'))
        z = {**vars(opt), **t}
        with open(os.path.join(opt.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        with torch.no_grad():
            engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc[0],
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
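# ---------------------------------------------------------------------------
# `cast` above converts batches to the requested dtype. A minimal sketch of
# such a helper, assuming it recursively moves tensors to the GPU when
# available and casts by attribute name ('float', 'long', 'half'); the real
# utility may handle more container types.
import torch

def cast(tensors, dtype='float'):
    if isinstance(tensors, dict):
        return {k: cast(v, dtype) for k, v in tensors.items()}
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(cast(v, dtype) for v in tensors)
    t = tensors.cuda() if torch.cuda.is_available() else tensors
    return getattr(t, dtype)()  # e.g. t.float(), t.long()
# ---------------------------------------------------------------------------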
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    torch.manual_seed(opt.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode), opt.batch_size,
                          shuffle=mode, num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    f_1, params_1 = resnet(opt.depth, opt.width, num_classes)
    f_2, params_2 = resnet(opt.depth, opt.width, num_classes)

    def create_optimizer(opt, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params_1.values() if v.requires_grad] +
                   [v for v in params_2.values() if v.requires_grad],
                   lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        raise NotImplementedError

    print('\nParameters:')
    print_tensor_dict(params_1)
    print_tensor_dict(params_2)

    n_parameters = sum([p.numel() for p in params_1.values() if p.requires_grad] +
                       [p.numel() for p in params_2.values() if p.requires_grad])
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    classacc_ep1 = tnt.meter.ClassErrorMeter(accuracy=True)
    classacc_ep2 = tnt.meter.ClassErrorMeter(accuracy=True)

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        global _outputs, _loss
        connection_map = np.array([
            [0, 0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1, 1],
            [1, 1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0, 0]])
        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        net1_outputs = data_parallel(f_1, inputs, params_1, sample[2],
                                     list(range(opt.ngpu)))
        net2_outputs = data_parallel(f_2, inputs, params_2, sample[2],
                                     list(range(opt.ngpu)))
        net1_outputs = [o.float() for o in net1_outputs]
        net2_outputs = [o.float() for o in net2_outputs]

        _loss = []
        # hard supervision
        for i, o in enumerate(net1_outputs):
            _loss.append(F.cross_entropy(o, targets))
        for i, o in enumerate(net2_outputs):
            _loss.append(F.cross_entropy(o, targets))

        outputs = net1_outputs + net2_outputs
        # soft supervision
        for i, o in enumerate(outputs):
            for j, o2 in enumerate(outputs):
                if connection_map[i, j] > 0:
                    _loss.append(KL_divergence(o2.detach(), o))

        loss = sum(_loss)
        _outputs = net1_outputs
        return loss, net1_outputs[-1]

    def log(t, state):
        torch.save(dict(params=params_1, epoch=t['epoch'],
                        optimizer=state['optimizer'].state_dict()),
                   os.path.join(opt.save, 'model.pt7'))
        z = {**vars(opt), **t}
        with open(os.path.join(opt.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        classacc.add(state['output'].data, state['sample'][1])
        classacc_ep1.add(_outputs[0].data, state['sample'][1])
        classacc_ep2.add(_outputs[1].data, state['sample'][1])
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        classacc_ep1.reset()
        classacc_ep2.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        train_acc_ep1 = classacc_ep1.value()
        train_acc_ep2 = classacc_ep2.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()
        classacc_ep1.reset()
        classacc_ep2.reset()

        with torch.no_grad():
            engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        test_acc_ep1 = classacc_ep1.value()[0]
        test_acc_ep2 = classacc_ep2.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc[0],
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "train_acc_ep1": train_acc_ep1[0],
            "train_acc_ep2": train_acc_ep2[0],
            "test_acc_ep1": test_acc_ep1,
            "test_acc_ep2": test_acc_ep2,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
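# ---------------------------------------------------------------------------
# `KL_divergence(o2.detach(), o)` supplies the soft supervision between the
# two networks above. A sketch assuming plain KL between softmax
# distributions over logits, with the first (detached) argument acting as the
# target distribution; any temperature scaling in the real helper is omitted.
import torch.nn.functional as F

def KL_divergence(target_logits, logits):
    # KL(p_target || p_model); gradients flow only into `logits`, since the
    # caller detaches the target side
    return F.kl_div(F.log_softmax(logits, dim=1),
                    F.softmax(target_logits, dim=1),
                    reduction='batchmean')
# ---------------------------------------------------------------------------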
def main():
    args = parser.parse_args()
    print('parsed options:', vars(args))
    epoch_step = json.loads(args.epoch_step)
    check_manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    ds = check_dataset(args.dataset, args.dataroot, args.augment, args.download)
    if args.dataset == "awa2":
        image_shape, num_classes, train_dataset, test_dataset, all_labels = ds
        all_labels = all_labels.to("cuda:0")
    else:
        image_shape, num_classes, train_dataset, test_dataset = ds
        all_labels = torch.eye(num_classes).to("cuda:0")

    if args.ssl:
        num_labelled = args.num_labelled
        num_unlabelled = len(train_dataset) - num_labelled
        if args.dataset == "awa2":
            labelled_set, unlabelled_set = data.random_split(
                train_dataset, [num_labelled, num_unlabelled])
        else:
            td_targets = (train_dataset.targets if args.dataset == "cifar10"
                          else train_dataset.labels)
            labelled_idxs, unlabelled_idxs = x_u_split(td_targets, num_labelled,
                                                       num_classes)
            labelled_set, unlabelled_set = [
                Subset(train_dataset, labelled_idxs),
                Subset(train_dataset, unlabelled_idxs)
            ]
        # oversample the labelled set so it matches the unlabelled set in length
        labelled_set = data.ConcatDataset(
            [labelled_set for i in range(num_unlabelled // num_labelled + 1)])
        labelled_set, _ = data.random_split(
            labelled_set, [num_unlabelled, len(labelled_set) - num_unlabelled])
        train_dataset = Joint(labelled_set, unlabelled_set)

    def _init_fn(worker_id):
        np.random.seed(args.seed)

    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=args.n_workers,
                                   worker_init_fn=_init_fn)
    test_loader = data.DataLoader(test_dataset, batch_size=args.eval_batch_size,
                                  shuffle=False, num_workers=args.n_workers,
                                  worker_init_fn=_init_fn)

    model, params = resnet(args.depth, args.width, num_classes, image_shape[0])

    if args.lp:
        num_flow_classes = num_classes if not num_classes % 2 else num_classes + 1
        prior_y = MultivariateNormal(
            torch.zeros(num_flow_classes).to("cuda:0"),
            torch.eye(num_flow_classes).to("cuda:0"))
        num_flows = 3
        flows = [NSF_CL(dim=num_flow_classes, K=8, B=3, hidden_dim=16)
                 for _ in range(num_flows)]
        convs = [Invertible1x1Conv(dim=num_flow_classes)
                 for i in range(num_flows)]
        flows = list(itertools.chain(*zip(convs, flows)))
        model_y = NormalizingFlowModel(prior_y, flows,
                                       num_flow_classes).to("cuda:0")
        optimizer_y = Adam(model_y.parameters(), lr=1e-3, weight_decay=1e-5)

    def create_optimizer(args, lr):
        print('creating optimizer with lr = ', lr)
        return SGD([v for v in params.values() if v.requires_grad], lr,
                   momentum=0.9, weight_decay=args.weight_decay)

    optimizer = create_optimizer(args, args.lr)

    epoch = 0

    print('\nParameters:')
    print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in params.values() if p.requires_grad)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    if args.dataset == "awa2":
        classacc = tnt.meter.AverageValueMeter()
    else:
        classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(args.save):
        os.mkdir(args.save)

    global counter
    counter = 0

    def compute_loss(sample):
        if not args.ssl:
            inputs = cast(sample[0], args.dtype)
            targets = cast(sample[1], 'long')
            y = data_parallel(model, inputs, params, sample[2],
                              list(range(args.ngpu))).float()
            if args.dataset == "awa2":
                return F.binary_cross_entropy_with_logits(y, targets.float()), y
            else:
                return F.cross_entropy(y, targets), y
        else:
            global counter
            l = sample[0]
            u = sample[1]
            inputs_l = cast(l[0], args.dtype)
            targets_l = cast(l[1], 'long')
            inputs_u = cast(u[0], args.dtype)
            y_l = data_parallel(model, inputs_l, params, sample[2],
                                list(range(args.ngpu))).float()
            y_u = data_parallel(model, inputs_u, params, sample[2],
                                list(range(args.ngpu))).float()
            if args.dataset == "awa2":
                loss = F.binary_cross_entropy_with_logits(y_l, targets_l.float())
            else:
                loss = F.cross_entropy(y_l, targets_l)

            if args.min_entropy:
                if args.dataset == "awa2":
                    labels_pred = F.sigmoid(y_u)
                    entropy = -torch.sum(labels_pred * torch.log(labels_pred),
                                         dim=1)
                else:
                    labels_pred = F.softmax(y_u, dim=1)
                    entropy = -torch.sum(labels_pred * torch.log(labels_pred),
                                         dim=1)
                if counter >= 10:
                    loss_entropy = args.unl_weight * torch.mean(entropy)
                    loss += loss_entropy
            elif args.semantic_loss:
                if args.dataset == "awa2":
                    labels_pred = F.sigmoid(y_u)
                else:
                    labels_pred = F.softmax(y_u, dim=1)
                part1 = torch.stack([labels_pred ** all_labels[i]
                                     for i in range(all_labels.shape[0])])
                part2 = torch.stack([(1 - labels_pred) ** (1 - all_labels[i])
                                     for i in range(all_labels.shape[0])])
                sem_loss = -torch.log(
                    torch.sum(torch.prod(part1 * part2, dim=2), dim=0))
                if counter >= 10:
                    semantic_loss = args.unl_weight * torch.mean(sem_loss)
                    loss += semantic_loss
            elif args.lp:
                model_y.eval()
                if args.dataset == "awa2":
                    labels_pred = F.sigmoid(y_u)
                else:
                    labels_pred = F.softmax(y_u, dim=1)
                if num_classes % 2:
                    labels_pred = torch.cat(
                        (labels_pred,
                         torch.zeros((labels_pred.shape[0], 1)).to("cuda:0")),
                        dim=1)
                _, nll_ypred = model_y(labels_pred)
                if counter >= 10:
                    loss_nll_ypred = args.unl_weight * torch.mean(nll_ypred)
                    loss += loss_nll_ypred

                model_y.train()
                optimizer_y.zero_grad()
                if args.dataset == "awa2":
                    a = targets_l.float() * 120. + (1 - targets_l.float()) * 1.1
                    b = (1 - targets_l.float()) * 120. + targets_l.float() * 1.1
                    beta_targets = Beta(a, b).rsample()
                    if num_classes % 2:
                        beta_targets = torch.cat(
                            (beta_targets,
                             torch.zeros((beta_targets.shape[0], 1)).to("cuda:0")),
                            dim=1)
                    zs, nll_y = model_y(beta_targets)
                else:
                    one_hot_targets = F.one_hot(torch.tensor(targets_l),
                                                num_classes).float()
                    one_hot_targets = one_hot_targets * 120 + \
                        (1 - one_hot_targets) * 1.1
                    dirichlet_targets = torch.stack(
                        [Dirichlet(i).sample() for i in one_hot_targets])
                    zs, nll_y = model_y(dirichlet_targets)
                loss_nll_y = torch.mean(nll_y)
                loss_nll_y.backward()
                optimizer_y.step()

            return loss, y_l

    def compute_loss_test(sample):
        inputs = cast(sample[0], args.dtype)
        targets = cast(sample[1], 'long')
        y = data_parallel(model, inputs, params, sample[2],
                          list(range(args.ngpu))).float()
        if args.dataset == "awa2":
            return F.binary_cross_entropy_with_logits(y, targets.float()), y
        else:
            return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params=params, epoch=t['epoch'],
                        optimizer=state['optimizer'].state_dict()),
                   os.path.join(args.save, 'model.pt7'))
        z = {**vars(args), **t}
        with open(os.path.join(args.save, 'log.txt'), 'a') as flog:
            flog.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        loss = float(state['loss'])
        if args.dataset == "awa2":
            if not args.ssl or not state['train']:
                acc = calculate_accuracy(F.sigmoid(state['output'].data),
                                         state['sample'][1])
            else:
                acc = calculate_accuracy(F.sigmoid(state['output'].data),
                                         state['sample'][0][1])
            classacc.add(acc)
        else:
            if not args.ssl or not state['train']:
                classacc.add(state['output'].data, state['sample'][1])
            else:
                classacc.add(state['output'].data, state['sample'][0][1])
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader, dynamic_ncols=True)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(args, lr * args.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()[0]
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        with torch.no_grad():
            engine.test(compute_loss_test, test_loader)

        test_acc = classacc.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc,
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (args.save, state['epoch'], args.epochs, test_acc))
        global counter
        counter += 1

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(compute_loss, train_loader, args.epochs, optimizer)
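# ---------------------------------------------------------------------------
# `x_u_split` above partitions training targets into labelled and unlabelled
# index sets. A plausible class-balanced sketch; the actual helper may sample
# differently.
import numpy as np

def x_u_split(targets, num_labelled, num_classes):
    targets = np.asarray(targets)
    per_class = num_labelled // num_classes
    labelled, unlabelled = [], []
    for c in range(num_classes):
        idx = np.where(targets == c)[0]
        np.random.shuffle(idx)
        labelled.extend(idx[:per_class])     # equal share per class
        unlabelled.extend(idx[per_class:])   # rest goes to the unlabelled pool
    return labelled, unlabelled
# ---------------------------------------------------------------------------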
def main():
    st = time.time()
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    f_s, params_s = define_student(opt.depth, opt.width)

    if opt.teacher_id:
        assert opt.teacher_id == "resnet34"
        f_t, params_t = define_teacher(opt.teacher_params)

        # merge teacher and student params
        params = {'student.' + k: v for k, v in params_s.items()}
        params.update({'teacher.' + k: v for k, v in params_t.items()})

        def f(inputs, params, mode):
            y_s, g_s = f_s(inputs, params, mode, 'student.')
            with torch.no_grad():
                y_t, g_t = f_t(inputs, params, 'teacher.')
            return y_s, y_t, [utils.at_loss(x, y) for x, y in zip(g_s, g_t)]
    else:
        f, params = f_s, params_s

    params = OrderedDict((k, p.cuda().detach().requires_grad_(p.requires_grad))
                         for k, p in params.items())
    optimizable = [v for v in params.values() if v.requires_grad]

    def create_optimizer(opt, lr):
        # print('creating optimizer with lr = ', lr)
        return SGD(optimizable, lr, momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    iter_train = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, True)
    iter_test = get_iterator(opt.imagenetpath, opt.batch_size, opt.nthread, False)

    # train_size = len(iter_train.dataset)
    # test_size = len(iter_test.dataset)
    # steps_per_epoch = round(train_size / opt.batch_size)
    # total_steps = opt.epochs * steps_per_epoch
    # print("train size: {}, test size: {}, steps per epoch: {}, total steps: {}"
    #       .format(train_size, test_size, steps_per_epoch, total_steps))

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in optimizable)
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for i in range(4)]

    if opt.teacher_id != '':
        # sanity-check the teacher on the test set before training starts
        classacc_t = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
        t_test_acc_top1 = []
        t_test_acc_top5 = []
        with torch.no_grad():
            for i, (inputs, targets) in enumerate(iter_test):
                inputs = inputs.cuda().detach()
                targets = targets.cuda().long().detach()
                y_t, _ = f_t(inputs, params, 'teacher.')
                classacc_t.add(y_t, targets)
                t_test_acc_top1.append(classacc_t.value()[0])
                t_test_acc_top5.append(classacc_t.value()[1])
                classacc_t.reset()
        print("teacher top1 test acc: {}, teacher top5 test acc: {}".format(
            np.mean(t_test_acc_top1), np.mean(t_test_acc_top5)))

    def h(sample):
        inputs, targets, mode = sample
        inputs = inputs.cuda().detach()
        targets = targets.cuda().long().detach()
        if opt.teacher_id != '':
            if opt.kt_method == "at":
                y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params,
                                                            mode,
                                                            range(opt.ngpu))
                loss_groups = [v.sum() for v in loss_groups]
                [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
                return utils.distillation(y_s, y_t, targets, opt.temperature,
                                          opt.alpha) \
                    + opt.beta * sum(loss_groups), y_s
            elif opt.kt_method == "st":
                y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params,
                                                            mode,
                                                            range(opt.ngpu))
                return torch.sqrt(torch.mean((y_s - y_t) ** 2)), y_s
        else:
            y = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))[0]
            return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])
        # if state['sample'][2]:
        #     curr_lr = 0.5 * opt.lr * (1 + np.cos(np.pi * state['t'] / total_steps))
        #     state['optimizer'] = create_optimizer(opt, curr_lr)

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        loss = state['loss'].item()
        meter_loss.add(loss)
        if state['train']:
            state['iterator'].set_postfix(loss=loss)

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        [meter.reset() for meter in meters_at]
        state['iterator'] = tqdm(iter_train, dynamic_ncols=True)
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, iter_test)

        test_acc = classacc.value()
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc,
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
            "at_losses": [m.value() for m in meters_at],
            "kt_method": opt.kt_method,
            "curr_lr": state['optimizer'].param_groups[0]['lr'],
        }, state))
        print('==> id: %s (%d/%d), test_top1_acc: \33[91m%.2f\033[0m, '
              'test_top5_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc[0], test_acc[1]))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, iter_train, opt.epochs, optimizer)

    print("total time: {}".format(time.time() - st))
def main():
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    if torch.cuda.is_available():
        # to prevent opencv from initializing CUDA in workers
        torch.randn(8).cuda()
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    def create_iterator(mode):
        ds = create_dataset(opt, mode)
        return ds.parallel(batch_size=opt.batchSize, shuffle=mode,
                           num_workers=opt.nthread,
                           pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    f, vectors, scalars, stats = resnet(opt.depth, opt.width, num_classes)
    params = vectors.copy()
    params.update(scalars)

    def create_optimizer(opt):
        print('creating optimizer with lr = ', opt.lr)
        return NDAdam([{'params': scalars.values(),
                        'weight_decay': opt.weightDecay},
                       {'params': vectors.values(),
                        'vec_axes': [1, 2, 3]}],
                      lr=opt.lr, betas=(0.9, 0.99))

    optimizer = create_optimizer(opt)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    print_tensor_dict(params)
    print('\nAdditional buffers:')
    print_tensor_dict(stats)

    n_parameters = sum(p.numel() for p in params.values())
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = Variable(cast(sample[0], opt.dtype))
        targets = Variable(cast(sample[1], 'long'))
        y = data_parallel(f, inputs, params, stats, sample[2],
                          tuple(range(opt.ngpu)))
        logit_loss = 0.5 * torch.mean(torch.sum(y * y, 1))
        return F.cross_entropy(y, targets) + opt.logitDecay * logit_loss, y

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        stats=stats,
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   open(os.path.join(opt.save, 'model.pt7'), 'wb'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])

    def on_forward(state):
        classacc.add(state['output'].data, torch.LongTensor(state['sample'][1]))
        meter_loss.add(state['loss'].item())

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = tqdm(train_loader)
        epoch = state['epoch']
        for group in optimizer.param_groups:
            group['lr'] = opt.lr * 0.5 * (
                1 + math.cos(math.pi * float(epoch) / opt.epochs))

    def on_end_epoch(state):
        train_loss = meter_loss.value()
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()[0]
        print(log({
            "train_loss": train_loss[0],
            "train_acc": train_acc[0],
            "test_loss": meter_loss.value()[0],
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
        }, state))
        print('==> id: %s (%d/%d), test_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)
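# ---------------------------------------------------------------------------
# Quick standalone check of the per-epoch cosine annealing used in
# on_start_epoch above: lr = opt.lr * 0.5 * (1 + cos(pi * epoch / epochs)).
# The values below are examples, not the script's defaults.
import math

lr0, epochs = 0.1, 200
for epoch in (0, 50, 100, 150, 199):
    lr = lr0 * 0.5 * (1 + math.cos(math.pi * epoch / epochs))
    print(epoch, round(lr, 6))
# decays smoothly from 0.1 at epoch 0 to ~0 at the final epoch
# ---------------------------------------------------------------------------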
def main():
    st_total = time.time()
    opt = parser.parse_args()
    print('parsed options:', vars(opt))
    epoch_step = json.loads(opt.epoch_step)
    num_classes = 10 if opt.dataset == 'CIFAR10' else 100
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id

    def create_iterator(mode):
        return DataLoader(create_dataset(opt, mode), opt.batch_size,
                          shuffle=mode, num_workers=opt.nthread,
                          pin_memory=torch.cuda.is_available())

    train_loader = create_iterator(True)
    test_loader = create_iterator(False)

    train_size = len(train_loader.dataset)
    test_size = len(test_loader.dataset)
    steps_per_epoch = round(train_size / opt.batch_size)
    total_steps = opt.epochs * steps_per_epoch
    print("train size: {}, test size: {}, steps per epoch: {}, total steps: {}"
          .format(train_size, test_size, steps_per_epoch, total_steps))

    # deal with student first
    f_s, params_s = resnet(opt.depth, opt.width, num_classes)
    print(type(f_s), type(params_s))

    # deal with teacher
    if opt.teacher_id:
        with open(os.path.join('logs', opt.teacher_id, 'log.txt'), 'r') as ff:
            line = ff.readline()
            r = line.find('json_stats')
            info = json.loads(line[r + 12:])
        f_t, _ = resnet(info['depth'], info['width'], num_classes)
        model_data = torch.load(os.path.join('logs', opt.teacher_id, 'model.pt7'))
        params_t = model_data['params']

        # merge teacher and student params
        params = {'student.' + k: v for k, v in params_s.items()}
        for k, v in params_t.items():
            params['teacher.' + k] = v.detach().requires_grad_(False)

        if opt.kt_method == "at":
            def f(inputs, params, mode):
                y_s, g_s = f_s(inputs, params, mode, 'student.')
                with torch.no_grad():
                    y_t, g_t = f_t(inputs, params, False, 'teacher.')
                return y_s, y_t, [utils.at_loss(x, y)
                                  for x, y in zip(g_s, g_t)]
        elif opt.kt_method == "st":
            def f(inputs, params, mode):
                y_s, g_s = f_s(inputs, params, mode, 'student.')
                with torch.no_grad():
                    y_t, g_t = f_t(inputs, params, False, 'teacher.')
                return y_s, y_t, [utils.at_loss(x, y)
                                  for x, y in zip(g_s, g_t)]
        else:
            raise ValueError("unknown kt_method: %s" % opt.kt_method)
    else:
        f, params = f_s, params_s

    def create_optimizer(opt, lr):
        # print('creating optimizer with lr = ', lr)
        return SGD((v for v in params.values() if v.requires_grad), lr,
                   momentum=0.9, weight_decay=opt.weight_decay)

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors = state_dict['params']
        for k, v in params.items():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print('\nParameters:')
    utils.print_tensor_dict(params)

    n_parameters = sum(p.numel() for p in list(params_s.values()))
    print('\nTotal number of parameters:', n_parameters)

    meter_loss = tnt.meter.AverageValueMeter()
    classacc = tnt.meter.ClassErrorMeter(topk=[1, 5], accuracy=True)
    timer_train = tnt.meter.TimeMeter('s')
    timer_test = tnt.meter.TimeMeter('s')
    meters_at = [tnt.meter.AverageValueMeter() for i in range(3)]

    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    def h(sample):
        inputs = utils.cast(sample[0], opt.dtype).detach()
        targets = utils.cast(sample[1], 'long')
        if opt.teacher_id != '':
            if opt.kt_method == "at":
                y_s, y_t, loss_groups = utils.data_parallel(
                    f, inputs, params, sample[2], range(opt.ngpu))
                loss_groups = [v.sum() for v in loss_groups]
                [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
                return utils.distillation(y_s, y_t, targets, opt.temperature,
                                          opt.alpha) \
                    + opt.beta * sum(loss_groups), y_s
            elif opt.kt_method == "st":
                y_s, y_t, loss_groups = utils.data_parallel(
                    f, inputs, params, sample[2], range(opt.ngpu))
                return torch.sqrt(torch.mean((y_s - y_t) ** 2)), y_s
        else:
            y = utils.data_parallel(f, inputs, params, sample[2],
                                    range(opt.ngpu))[0]
            return F.cross_entropy(y, targets), y

    def log(t, state):
        torch.save(dict(params={k: v.data for k, v in params.items()},
                        optimizer=state['optimizer'].state_dict(),
                        epoch=t['epoch']),
                   os.path.join(opt.save, 'model.pt7'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        state['sample'].append(state['train'])
        if state['sample'][2]:
            # per-step cosine learning-rate schedule
            curr_lr = 0.5 * opt.lr * (1 + np.cos(np.pi * state['t'] / total_steps))
            state['optimizer'] = create_optimizer(opt, curr_lr)
        # print(len(state['sample']), state['sample'][0].size(),
        #       state['sample'][1].size(), state['sample'][2])

    def on_forward(state):
        classacc.add(state['output'].data, state['sample'][1])
        meter_loss.add(state['loss'].item())

    def on_start(state):
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        [meter.reset() for meter in meters_at]
        state['iterator'] = tqdm(train_loader)
        # epoch = state['epoch'] + 1
        # if epoch in epoch_step:
        #     lr = state['optimizer'].param_groups[0]['lr']
        #     state['optimizer'] = create_optimizer(opt, lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        train_loss = meter_loss.mean
        train_acc = classacc.value()
        train_time = timer_train.value()
        meter_loss.reset()
        classacc.reset()
        timer_test.reset()

        engine.test(h, test_loader)

        test_acc = classacc.value()
        print(log({
            "train_loss": train_loss,
            "train_acc": train_acc,
            "test_loss": meter_loss.mean,
            "test_acc": test_acc,
            "epoch": state['epoch'],
            "num_classes": num_classes,
            "n_parameters": n_parameters,
            "train_time": train_time,
            "test_time": timer_test.value(),
            "at_losses": [m.value() for m in meters_at],
            "kt_method": opt.kt_method,
            "curr_lr": state['optimizer'].param_groups[0]['lr'],
        }, state))
        print('==> id: %s (%d/%d), test_top1_acc: \33[91m%.2f\033[0m, '
              'test_top5_acc: \33[91m%.2f\033[0m' %
              (opt.save, state['epoch'], opt.epochs, test_acc[0], test_acc[1]))

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, train_loader, opt.epochs, optimizer)

    print("total time (h): {}".format((time.time() - st_total) / 3600.))