def h(sample):
    inputs = utils.cast(sample[0], opt.dtype).detach()
    targets = utils.cast(sample[1], 'long')
    if opt.teacher_id != '':
        # debugging: what is loss_groups?
        print('f = ', f)
        print('tensor inputs = ', inputs.shape)
        print('dict params = ', params.keys())
        print('sample = ', sample[2])
        print('opt.ngpu = ', range(opt.ngpu))
        y_s, y_t, loss_groups = utils.data_parallel(
            f, inputs, params, sample[2], range(opt.ngpu))
        print('y_s = ', y_s.shape)
        print('y_t = ', y_t.shape)
        print('loss_groups = ', loss_groups)
        ipdb.set_trace()
        loss_groups = [v.sum() for v in loss_groups]
        # update meters_at, i.e. the attention-transfer (AT) losses
        [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
        return utils.distillation(
            y_s, y_t, targets, opt.temperature,
            opt.alpha) + opt.beta * sum(loss_groups), y_s
    else:
        y = utils.data_parallel(f, inputs, params, sample[2],
                                range(opt.ngpu))[0]
        return F.cross_entropy(y, targets), y

def h(sample):
    # inputs are the input samples; targets are the labels
    inputs = utils.cast(sample[0], opt.dtype).detach()
    targets = utils.cast(sample[1], 'long')
    # if this is a student model, train with the given distillation loss
    if opt.teacher_id != '':
        y_s, y_t, loss_groups = utils.data_parallel(
            f, inputs, params, sample[2], range(opt.ngpu))
        # reduce each group's loss to a scalar
        loss_groups = [v.sum() for v in loss_groups]
        # accumulate the per-group losses in the meters
        [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
        # First term: the distillation loss (y_s: student output,
        #   y_t: teacher output, targets: ground-truth labels).
        # Second term: the AT (attention-transfer) loss.
        # The second return value is the student output.
        # For pure AT, alpha == 0, so the first term reduces to the
        #   cross-entropy between the student output and the true labels.
        # For pure KD, beta == 0, leaving only the distillation loss
        #   (summed here over groups 1..C).
        return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
            + opt.beta * sum(loss_groups), y_s
    # if this is a teacher network, train with standard cross-entropy
    else:
        # y is the network output
        y = utils.data_parallel(f, inputs, params, sample[2],
                                range(opt.ngpu))[0]
        return F.cross_entropy(y, targets), y

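# A minimal sketch of what `utils.distillation` above typically computes -- the
# Hinton-style KD loss used by attention-transfer codebases. This body is an
# assumption for illustration, not a verbatim copy of the repository's helper.
import torch.nn.functional as F

def distillation_sketch(y_s, y_t, labels, T, alpha):
    # soften both distributions at temperature T; scale by T^2 to keep
    # gradient magnitudes comparable to the plain cross-entropy term
    p = F.log_softmax(y_s / T, dim=1)
    q = F.softmax(y_t / T, dim=1)
    l_kl = F.kl_div(p, q, reduction='batchmean') * (T * T)
    l_ce = F.cross_entropy(y_s, labels)
    # alpha == 0 recovers pure cross-entropy, matching the comments above
    return alpha * l_kl + (1.0 - alpha) * l_ce
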
def h(sample):
    inputs = Variable(cast(sample[0], opt.dtype))
    targets = Variable(cast(sample[1], 'long'))
    if opt.teacher_id != '':
        if opt.gamma:
            ys, y_t_auto, y_t = data_parallel(f, inputs, params, stats,
                                              sample[2], np.arange(opt.ngpu))[:3]
            loss_l2 = torch.nn.MSELoss()
            T = 4
            loss_student = F.cross_entropy(ys, targets)
            loss_teacher = F.cross_entropy(y_t_auto, targets)
            loss_course = opt.beta * \
                ((y_t_auto - ys) * (y_t_auto - ys)).sum() / opt.batchSize
            y_tech_temp = torch.autograd.Variable(y_t_auto.data,
                                                  requires_grad=False)
            log_kd = rocket_distillation(ys, y_t, targets, opt.temperature,
                                         opt.alpha)
            return rocket_distillation(ys, y_t, targets, opt.temperature, opt.alpha) \
                + F.cross_entropy(y_t_auto, targets) + F.cross_entropy(ys, targets) \
                + opt.beta * ((y_tech_temp - ys) * (y_tech_temp - ys)).sum() / opt.batchSize, \
                (ys, y_t_auto, loss_student, loss_teacher, loss_course, log_kd)
        else:
            y_s, y_t, loss_groups = data_parallel(f, inputs, params, stats,
                                                  sample[2], np.arange(opt.ngpu))
            loss_groups = [v.sum() for v in loss_groups]
            [m.add(v.data[0]) for m, v in zip(meters_at, loss_groups)]
            return distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
                + opt.beta * sum(loss_groups), y_s
    else:
        if opt.gamma:
            ys, y = data_parallel(f, inputs, params, stats, sample[2],
                                  np.arange(opt.ngpu))[:2]
            loss_l2 = torch.nn.MSELoss()
            T = 4
            loss_student = F.cross_entropy(ys, targets)
            loss_teacher = F.cross_entropy(y, targets)
            loss_course = opt.beta * \
                ((y - ys) * (y - ys)).sum() / opt.batchSize
            if opt.grad_block:
                y_course = torch.autograd.Variable(y.data, requires_grad=False)
            else:
                y_course = y
            return F.cross_entropy(y, targets) + F.cross_entropy(ys, targets) \
                + opt.beta * ((y_course - ys) * (y_course - ys)).sum() / opt.batchSize, \
                (ys, y, loss_student, loss_teacher, loss_course)
        else:
            y = data_parallel(f, inputs, params, stats, sample[2],
                              np.arange(opt.ngpu))[0]
            return F.cross_entropy(y, targets), y

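# Note on the gradient-block trick above: wrapping `y.data` in a Variable with
# requires_grad=False stops the hint loss from updating the teacher branch.
# A minimal modern sketch of the same idea (PyTorch >= 0.4), on illustrative
# stand-in tensors:
import torch

y = torch.randn(8, 10, requires_grad=True)    # stand-in for teacher logits
ys = torch.randn(8, 10, requires_grad=True)   # stand-in for student logits
hint = ((y.detach() - ys) ** 2).sum() / 8     # gradients flow into ys only
hint.backward()
assert y.grad is None and ys.grad is not None
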
def forward(self, input):
    recurrent, _ = utils.data_parallel(
        self.rnn, input, self.ngpu)  # [T, b, h * 2]
    T, b, h = recurrent.size()
    t_rec = recurrent.view(T * b, h)
    output = utils.data_parallel(
        self.embedding, t_rec, self.ngpu)  # [T * b, nOut]
    output = output.view(T, b, -1)
    return output

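# A minimal sketch of the module-level `utils.data_parallel` used by these
# CRNN-style forward passes (assumption: it matches the common crnn.pytorch
# helper): dispatch to multiple GPUs only when the input is a CUDA tensor.
import torch.nn as nn

def data_parallel_sketch(model, input, ngpu):
    if input.is_cuda and ngpu > 1:
        return nn.parallel.data_parallel(model, input, list(range(ngpu)))
    return model(input)
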
def h_ensemble(sample):
    inputs = Variable(cast(sample[0], opt.dtype))
    targets = Variable(cast(sample[1], 'long'))
    y_grassmann = data_parallel(f_grassmann, inputs, params_grassmann,
                                stats_grassmann, sample[2], np.arange(opt.ngpu))
    y_oblique = data_parallel(f_oblique, inputs, params_oblique,
                              stats_oblique, sample[2], np.arange(opt.ngpu))
    y_ensemble = y_grassmann + y_oblique
    return F.cross_entropy(y_ensemble, targets), y_ensemble

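# The ensemble above sums raw logits, weighting both models equally. An
# alternative sketch (not in the original code) averages probabilities,
# which is sometimes better calibrated:
import torch.nn.functional as F

def prob_average(y_a, y_b):
    return 0.5 * (F.softmax(y_a, dim=1) + F.softmax(y_b, dim=1))
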
def forward(self, input):
    # conv features
    conv = utils.data_parallel(self.cnn, input, self.ngpu)
    b, c, h, w = conv.size()
    assert h == 1, "the height of conv must be 1"
    conv = conv.squeeze(2)
    conv = conv.permute(2, 0, 1)  # [w, b, c]
    # rnn features
    output = utils.data_parallel(self.rnn, conv, self.ngpu)
    return output

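# Shape walkthrough for the CRNN forward above, on illustrative sizes
# (batch 4, 512 channels, height already collapsed to 1, width 26):
import torch

x = torch.randn(4, 512, 1, 26)      # (b, c, h, w) coming out of the CNN
x = x.squeeze(2).permute(2, 0, 1)
assert x.shape == (26, 4, 512)      # (T, b, c): width becomes the RNN time axis
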
def h(sample):
    inputs = Variable(cast(sample[0], opt.dtype))
    targets = Variable(cast(sample[1], 'long'))
    if opt.teacher_id != '':
        y_s, y_t, loss_groups = data_parallel(f, inputs, params, stats,
                                              sample[2], np.arange(opt.ngpu))
        loss_groups = [v.sum() for v in loss_groups]
        [m.add(v.data[0]) for m, v in zip(meters_at, loss_groups)]
        return distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
            + opt.beta * sum(loss_groups), y_s
    else:
        y = data_parallel(f, inputs, params, stats, sample[2],
                          np.arange(opt.ngpu))[0]
        return F.cross_entropy(y, targets), y

def forward(self, x):
    # features = self.features(x)
    # out = F.relu(features, inplace=True)
    conv = utils.data_parallel(self.features, x, self.ngpu)
    # b, c, h, w = conv.size()
    # assert h == 1, "the height of conv must be 1"
    print(conv.size())
    conv = conv.squeeze(2)
    conv = conv.permute(2, 0, 1)  # [w, b, c]
    # rnn features
    output = utils.data_parallel(self.rnn, conv, self.ngpu)
    return output

def h(sample):
    pdb.set_trace()
    inputs = Variable(cast(sample[0], opt.dtype))
    targets = Variable(cast(sample[1], 'long'))
    y = data_parallel(f, inputs, params, stats, sample[2],
                      np.arange(opt.ngpu))
    return F.cross_entropy(y, targets), y

def __init__(self, model, lr_master, n_epochs, n_iters, train_loader,
             test_loader, feature_dim, momentum=0.9, weight_decay=1e-4,
             optimizer_state=None, logger=None, ngpu=1, gpu=0):
    self.model = utils.data_parallel(model, ngpu, gpu)
    self.n_iters = n_iters
    self.n_epochs = n_epochs
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.criterion = nn.CrossEntropyLoss().cuda()
    self.lr_master = lr_master
    self.optimizer = torch.optim.SGD(params=self.model.parameters(),
                                     lr=self.lr_master.lr,
                                     momentum=momentum,
                                     weight_decay=weight_decay,
                                     nesterov=True)
    weight_params = []
    bias_params = []
    self.logger = logger
    self.ngpu = ngpu
    self.gpu = gpu
    self.momentum = momentum
    self.weight_decay = weight_decay
    self.feature_dim = feature_dim
    self.iteration = 0
    self.criterion = F.cross_entropy
    self.ToPILImage = transforms.ToPILImage()
    self.ToTensor = transforms.ToTensor()

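# A minimal sketch of the trainer-side `utils.data_parallel(model, ngpu, gpu)`
# helper these constructors call (assumption: it wraps the model in
# nn.DataParallel when more than one GPU is requested):
import torch.nn as nn

def data_parallel_model(model, ngpus=1, gpu0=0):
    if ngpus <= 1:
        return model.cuda(gpu0)
    device_ids = list(range(gpu0, gpu0 + ngpus))
    return nn.DataParallel(model.cuda(gpu0), device_ids=device_ids)
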
def __init__(self, model, train_loader, val_loader, settings, logger,
             tensorboard_logger, optimizer_state=None, run_count=0):
    self.settings = settings
    self.model = utils.data_parallel(model=model,
                                     n_gpus=self.settings.n_gpus)
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.criterion = nn.CrossEntropyLoss().cuda()
    self.lr = self.settings.lr
    self.optimizer = torch.optim.SGD(
        params=self.model.parameters(),
        lr=self.settings.lr,
        momentum=self.settings.momentum,
        weight_decay=self.settings.weight_decay,
        nesterov=True)
    if optimizer_state is not None:
        self.optimizer.load_state_dict(optimizer_state)
    self.logger = logger
    self.tensorboard_logger = tensorboard_logger
    self.run_count = run_count

def __init__(self, model, lr_master, train_loader, test_loader,
             settings, logger=None, optimizer_state=None):
    """init trainer"""
    self.settings = settings
    self.model = utils.data_parallel(model, self.settings.nGPU,
                                     self.settings.GPU)
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.criterion = nn.CrossEntropyLoss().cuda()
    self.lr_master = lr_master
    self.optimizer = torch.optim.SGD(
        params=self.model.parameters(),
        lr=self.lr_master.lr,
        momentum=self.settings.momentum,
        weight_decay=self.settings.weightDecay,
        nesterov=True)
    if optimizer_state is not None:
        self.optimizer.load_state_dict(optimizer_state)
    self.logger = logger
    self.run_count = 0
    self.scalar_info = {}

def h(sample):
    inputs = Variable(cast(sample[0], opt.dtype))
    targets = Variable(cast(sample[1], 'long'))
    y = data_parallel(f, inputs, params, stats, sample[2],
                      tuple(range(opt.ngpu)))
    logit_loss = 0.5 * torch.mean(torch.sum(y * y, 1))
    return F.cross_entropy(y, targets) + opt.logitDecay * logit_loss, y

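# The extra term above is a logit-magnitude penalty ("logit decay"). An
# equivalent formulation, checked on an illustrative tensor:
import torch

y = torch.randn(8, 10)
penalty = 0.5 * y.pow(2).sum(dim=1).mean()
assert torch.allclose(penalty, 0.5 * torch.mean(torch.sum(y * y, 1)))
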
def h(sample):
    global _outputs, _loss
    connection_map = np.array([[0, 0, 0, 1, 1, 1],
                               [0, 0, 0, 1, 1, 1],
                               [0, 0, 0, 1, 1, 1],
                               [1, 1, 1, 0, 0, 0],
                               [1, 1, 1, 0, 0, 0],
                               [1, 1, 1, 0, 0, 0]])
    inputs = cast(sample[0], opt.dtype)
    targets = cast(sample[1], 'long')
    net1_outputs = data_parallel(f_1, inputs, params_1, sample[2],
                                 list(range(opt.ngpu)))
    net2_outputs = model_2(inputs)
    net1_outputs = [o.float() for o in net1_outputs]
    net2_outputs = [o.float() for o in net2_outputs]
    _loss = []
    # hard supervision
    for i, o in enumerate(net1_outputs):
        _loss.append(F.cross_entropy(o, targets))
    for i, o in enumerate(net2_outputs):
        _loss.append(F.cross_entropy(o, targets))
    outputs = net1_outputs + net2_outputs
    # soft supervision
    for i, o in enumerate(outputs):
        for j, o2 in enumerate(outputs):
            if connection_map[i, j] > 0:
                _loss.append(KL_divergence(o2.detach(), o))
    loss = sum(_loss)
    _outputs = net2_outputs[-1].detach()
    return loss, net1_outputs[-1]

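# A minimal sketch of the `KL_divergence(teacher, student)` term used for the
# soft supervision above (assumption: it follows the usual mutual-learning
# formulation, KL(p_teacher || p_student) on softmax outputs):
import torch.nn.functional as F

def kl_divergence_sketch(teacher_logits, student_logits):
    p = F.softmax(teacher_logits, dim=1)
    log_q = F.log_softmax(student_logits, dim=1)
    return F.kl_div(log_q, p, reduction='batchmean')
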
def h(sample):
    inputs, targets, mode = sample
    inputs = inputs.cuda().detach()
    targets = targets.cuda().long().detach()
    if opt.teacher_id != '':
        if opt.kt_method == "at":
            y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params,
                                                        mode, range(opt.ngpu))
            loss_groups = [v.sum() for v in loss_groups]
            [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
            return utils.distillation(y_s, y_t, targets, opt.temperature,
                                      opt.alpha) + opt.beta * sum(loss_groups), y_s
        elif opt.kt_method == "st":
            y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params,
                                                        sample[2], range(opt.ngpu))
            return torch.sqrt(torch.mean((y_s - y_t) ** 2)), y_s
    else:
        y = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))[0]
        return F.cross_entropy(y, targets), y

def compute_loss_test(sample):
    inputs = cast(sample[0], args.dtype)
    targets = cast(sample[1], 'long')
    y = data_parallel(model, inputs, params, sample[2],
                      list(range(args.ngpu))).float()
    if args.dataset == "awa2":
        return F.binary_cross_entropy_with_logits(y, targets.float()), y
    else:
        return F.cross_entropy(y, targets), y

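# For the multi-label AWA2 branch above, binary_cross_entropy_with_logits
# expects float targets of the same shape as the logits, e.g. multi-hot
# attribute vectors (the sizes below are illustrative, not from the source):
import torch
import torch.nn.functional as F

y = torch.randn(2, 85)                    # logits, one column per attribute
t = torch.randint(0, 2, (2, 85)).float()  # multi-hot targets
loss = F.binary_cross_entropy_with_logits(y, t)
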
def reset_model(self, model):
    self.model = utils.data_parallel(model, self.ngpu, self.gpu)
    parameters = filter(lambda p: p.requires_grad, self.model.parameters())
    self.optimizer = torch.optim.SGD(params=parameters,
                                     lr=self.lr_master.lr,
                                     momentum=self.momentum,
                                     weight_decay=self.weight_decay,
                                     nesterov=True)

def h(sample):
    inputs, targets, mode = sample
    inputs = inputs.cuda().detach()
    targets = targets.cuda().long().detach()
    y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, mode,
                                                range(opt.ngpu))
    loss_groups = [v.sum() for v in loss_groups]
    [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
    return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
        + opt.beta * sum(loss_groups), y_s

def h(sample):
    inputs = Variable(sample[0].cuda())
    targets = Variable(sample[1].cuda().long())
    y_s, y_t, loss_groups = data_parallel(f, inputs, params, stats,
                                          sample[2], np.arange(opt.ngpu))
    loss_groups = [v.sum() for v in loss_groups]
    [m.add(v.data[0]) for m, v in zip(meters_at, loss_groups)]
    return distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
        + opt.beta * sum(loss_groups), y_s

def reset_model(self, model):
    self.model = utils.data_parallel(model, self.ngpu, self.gpu)
    self.optimizer = torch.optim.SGD(
        params=self.model.parameters(),
        lr=self.lr_master.lr,
        momentum=self.momentum,
        weight_decay=self.weight_decay,
        nesterov=True)

def forward(self, input):
    # conv features
    # for i in range(len(self.cnn)):
    #     input = self.cnn[i](input)
    #     print(self.cnn[i], input.size())
    conv = utils.data_parallel(self.cnn, input, self.ngpu)
    # conv = self.cnn(input)
    b, c, h, w = conv.size()  # batch size, channels, image height, image width
    # print('conv.size():', conv.size())
    assert h == 1, "the height of conv must be 1"
    conv = conv.squeeze(2)
    conv = conv.permute(2, 0, 1)  # [w, b, c]
    # print('conv.size():', conv.size())
    # rnn features
    output = utils.data_parallel(self.rnn, conv, self.ngpu)
    return output

def update_model(self, model):
    self.model = utils.data_parallel(model=model,
                                     ngpus=self.settings.nGPU,
                                     gpu0=self.settings.GPU)
    parameters = filter(lambda p: p.requires_grad, self.model.parameters())
    self.optimizer = torch.optim.SGD(params=parameters,
                                     lr=self.lr_master.lr,
                                     momentum=self.settings.momentum,
                                     weight_decay=self.settings.weightDecay,
                                     nesterov=True)

def __init__(self, model, lr_master, n_epochs, n_iters, train_loader,
             test_loader, feature_dim, momentum=0.9, weight_decay=1e-4,
             optimizer_state=None, logger=None, ngpu=1, gpu=0):
    self.model = utils.data_parallel(model, ngpu, gpu)
    self.n_iters = n_iters
    self.n_epochs = n_epochs
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.criterion = nn.CrossEntropyLoss().cuda()
    self.lr_master = lr_master
    """
    self.optimizer = torch.optim.SGD(params=self.model.parameters(),
                                     lr=self.lr_master.lr,
                                     momentum=momentum,
                                     weight_decay=weight_decay,
                                     nesterov=True)
    """
    weight_params = []
    bias_params = []
    for name, params in self.model.named_parameters():
        if "weight" in name:
            weight_params.append({"params": params})
            # print "add d params"
        elif "bias" in name:
            bias_params.append({"params": params})
            # print "add model params"
    self.optimizer = utils.caffeSGD(params=weight_params,
                                    lr=self.lr_master.lr,
                                    momentum=momentum,
                                    weight_decay=weight_decay,
                                    nesterov=True)
    self.optimizer_2 = utils.caffeSGD(params=bias_params,
                                      lr=self.lr_master.lr * 2,
                                      momentum=momentum,
                                      weight_decay=0,
                                      nesterov=True)
    self.logger = logger
    self.ngpu = ngpu
    self.gpu = gpu
    self.momentum = momentum
    self.weight_decay = weight_decay
    self.feature_dim = feature_dim
    self.iteration = 0
    self.criterion = F.cross_entropy
    self.ToPILImage = transforms.ToPILImage()
    self.ToTensor = transforms.ToTensor()

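# The two caffeSGD optimizers above follow Caffe's convention of training
# biases with twice the learning rate and no weight decay. A sketch of the
# same split using stock torch.optim.SGD param groups (caffeSGD's exact
# semantics are an assumption here):
import torch

def make_caffe_style_sgd(model, base_lr, momentum=0.9, weight_decay=1e-4):
    weights = [p for n, p in model.named_parameters() if "weight" in n]
    biases = [p for n, p in model.named_parameters() if "bias" in n]
    return torch.optim.SGD(
        [{"params": weights, "lr": base_lr, "weight_decay": weight_decay},
         {"params": biases, "lr": base_lr * 2, "weight_decay": 0}],
        lr=base_lr, momentum=momentum, nesterov=True)
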
def h(sample):
    inputs = utils.cast(sample[0], opt.dtype).detach()
    targets = utils.cast(sample[1], 'long')
    if opt.teacher_id != '':
        if opt.kt_method == "at":
            y_s, y_t, loss_groups = utils.data_parallel(
                f, inputs, params, sample[2], range(opt.ngpu))
            loss_groups = [v.sum() for v in loss_groups]
            [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
            return utils.distillation(
                y_s, y_t, targets, opt.temperature,
                opt.alpha) + opt.beta * sum(loss_groups), y_s
        elif opt.kt_method == "st":
            y_s, y_t, loss_list = utils.data_parallel(
                f, inputs, params, sample[2], range(opt.ngpu))
            loss_list = [v.sum() for v in loss_list]
            [m.add(v.item()) for m, v in zip(meters_st, loss_list)]
            fc_loss = torch.sqrt(torch.mean((y_s - y_t) ** 2))
            loss_list.append(fc_loss)
            return loss_list, y_s
    else:
        y = utils.data_parallel(f, inputs, params, sample[2],
                                range(opt.ngpu))[0]
        return F.cross_entropy(y, targets), y

def h(sample):
    global _outputs, _loss
    inputs = cast(sample[0], opt.dtype)
    targets = cast(sample[1], 'long')
    _outputs = data_parallel(f, inputs, params, sample[2],
                             list(range(opt.ngpu)))
    _outputs = [o.float() for o in _outputs]
    _loss = []
    for o in _outputs:
        _loss.append(F.cross_entropy(o, targets))
        for o2 in _outputs:
            if o is not o2:
                _loss.append(KL_divergence(o2.detach(), o))
    loss = sum(_loss)
    return loss, _outputs[-1]

def __init__(self, model, lr_master, n_epoch, train_loader, test_loader,
             momentum=0.9, weight_decay=1e-4, optimizer_state=None,
             tencrop=False, logger=None, ngpu=1, gpu=0):
    """init trainer"""
    self.model = utils.data_parallel(model, ngpu, gpu)
    self.n_epoch = n_epoch
    self.train_loader = train_loader
    self.test_loader = test_loader
    # note: the tencrop argument is ignored; ten-crop evaluation stays disabled
    self.ten_crop = False
    self.criterion = nn.CrossEntropyLoss().cuda()
    self.lr_master = lr_master
    self.optimizer = torch.optim.SGD(
        params=self.model.parameters(),
        lr=self.lr_master.lr,
        momentum=momentum,
        weight_decay=weight_decay,
        nesterov=True)
    self.logger = logger
    self.run_count = 0
    self.scalar_info = {}
    self.ngpu = ngpu
    self.gpu = gpu
    self.momentum = momentum
    self.weight_decay = weight_decay

def h(sample):
    inputs = cast(sample[0], opt.dtype)
    targets = cast(sample[1], 'long')
    y = data_parallel(f, inputs, params, sample[2],
                      list(range(opt.ngpu))).float()
    return F.cross_entropy(y, targets), y

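# A minimal sketch of the `cast` helper these functional-style snippets rely
# on (assumption: it mirrors the wide-residual-networks utility -- move to the
# GPU when available, then convert to the named dtype):
import torch

def cast_sketch(x, dtype='float'):
    x = x.cuda() if torch.cuda.is_available() else x
    return getattr(x, dtype)()  # e.g. dtype='long' -> x.long()
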
def _network_split(self):
    r"""
    1. split the network into several segments with a pre-defined pivot set
    2. create auxiliary classifiers
    3. create optimizers for network segments and fcs
    """
    # register forward hooks on the pivot blocks
    block_count = 0
    if self.settings.netType in ["ResNet", "PreResNet", "CifarResNeXt", "DARTSNet"]:
        i = 0
        for module in self.model.modules():
            i += 1
            if isinstance(module, (BasicBlock, Bottleneck, PreBasicBlock,
                                   ResNeXtBottleneck, Cell)):
                block_count += 1
                module.block_index = block_count
                if block_count in self.settings.pivotSet:
                    module.register_forward_hook(self._forward_hook)

    if self.settings.netType in ["PreResNet", "ResNet", "CifarResNeXt", "DARTSNet"]:
        if self.settings.netType == "DARTSNet":
            shallow_model = nn.Sequential(
                nn.Conv2d(3, 3 * self.settings.init_channels, 3,
                          padding=1, bias=False),
                nn.BatchNorm2d(3 * self.settings.init_channels))
        elif self.settings.netType == "PreResNet":
            shallow_model = nn.Sequential(self.model.conv)
        elif self.settings.netType == "CifarResNeXt":
            shallow_model = nn.Sequential(
                self.model.conv_1_3x3,
                self.model.bn_1,
                self.model.relu)
        else:
            shallow_model = nn.Sequential(
                self.model.conv1,
                self.model.bn1,
                self.model.relu,
                self.model.maxpool)
        print "init shallow head done!"
    else:
        assert False, "unsupported netType: %s" % self.settings.netType

    block_count = 0
    for module in self.model.modules():
        if isinstance(module, (PreBasicBlock, Bottleneck, BasicBlock,
                               ResNeXtBottleneck, Cell)):
            # copy blocks
            if shallow_model is not None:
                shallow_model.add_module(str(len(shallow_model)), module)
            else:
                shallow_model = nn.Sequential(module)
            block_count += 1
            # if block_count reaches a pivot, close off a new segment
            if block_count in self.settings.pivotSet:
                self.segments.append(shallow_model)
                shallow_model = None
    self.segments.append(shallow_model)

    # create auxiliary classifiers
    num_classes = self.settings.nClasses
    for i in range(len(self.segments) - 1):
        if isinstance(self.segments[i][-1], Cell):
            in_channels = self.segments[i][-1].preprocess1.conv21.in_channels
        elif isinstance(self.segments[i][-1], ResNeXtBottleneck):
            in_channels = self.segments[i][-1].conv_expand.out_channels
        elif isinstance(self.segments[i][-1], (PreBasicBlock, BasicBlock)):
            in_channels = self.segments[i][-1].conv2.out_channels
        elif isinstance(self.segments[i][-1], Bottleneck):
            in_channels = self.segments[i][-1].conv3.out_channels
        self.auxfc.append(AuxClassifier(in_channels=in_channels,
                                        num_classes=num_classes))

    if self.settings.netType == "DARTSNet":
        final_fc = nn.Sequential(
            self.model.auxiliary_head,
            self.model.global_pooling,
            View(),
            self.model.classifier)
    elif self.settings.netType == "PreResNet":
        final_fc = nn.Sequential(
            self.model.bn,
            self.model.relu,
            self.model.avg_pool,
            View(),
            self.model.fc)
    elif self.settings.netType == "CifarResNeXt":
        final_fc = nn.Sequential(
            self.model.avg_pool,
            View(),
            self.model.classifier)
    elif self.settings.netType == "ResNet":
        final_fc = nn.Sequential(
            self.model.avgpool,
            View(),
            self.model.fc)
    self.auxfc.append(final_fc)

    # model parallel
    """
    self.segments = utils.data_parallel(model=self.segments,
                                        ngpus=self.settings.nGPU,
                                        gpu0=self.settings.GPU)
    """
    self.model = utils.data_parallel(model=self.model,
                                     ngpus=self.settings.nGPU,
                                     gpu0=self.settings.GPU)
    self.auxfc = utils.data_parallel(model=self.auxfc,
                                     ngpus=1,
                                     gpu0=self.settings.GPU)

    # create optimizers
    for i in range(len(self.segments)):
        temp_optim = []
        for j in range(i + 1):
            # the i-th optimizer holds the parameters of segments [0:i]
            temp_optim.append({'params': self.segments[j].parameters(),
                               'lr': self.lr_master.lr})
        # optimizers for segments and fc
        temp_seg_optim = torch.optim.SGD(
            temp_optim,
            momentum=self.settings.momentum,
            weight_decay=self.settings.weightDecay,
            nesterov=True)
        temp_fc_optim = torch.optim.SGD(
            params=self.auxfc[i].parameters(),
            lr=self.lr_master.lr,
            momentum=self.settings.momentum,
            weight_decay=self.settings.weightDecay,
            nesterov=True)
        self.seg_optimizer.append(temp_seg_optim)
        self.fc_optimizer.append(temp_fc_optim)

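# `View` above is not a torch built-in; a minimal sketch of the flatten module
# the final classifiers assume:
import torch.nn as nn

class View(nn.Module):
    """Flatten (N, C, H, W) activations to (N, C*H*W) before a linear layer."""
    def forward(self, x):
        return x.view(x.size(0), -1)
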
def _set_parallel(self):
    self.model = utils.data_parallel(self.model, self.settings.nGPU,
                                     self.settings.GPU)

def main():
    opt = Options().parse()
    epoch_step = json.loads(opt.epoch_step)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    # to prevent opencv from initializing CUDA in workers
    torch.randn(8).cuda()
    os.environ['CUDA_VISIBLE_DEVICES'] = ''

    kwargs = {'num_workers': opt.nthread, 'pin_memory': True} if opt.cuda else {}

    cv2_scale = lambda x: cv2.resize(x, dsize=(opt.imageSize, opt.imageSize),
                                     interpolation=cv2.INTER_AREA).astype(np.uint8)
    np_reshape = lambda x: np.reshape(x, (opt.imageSize, opt.imageSize,
                                          opt.nchannels))
    np_repeat = lambda x: np.repeat(x, 3, axis=2)

    #################################
    # NORMALIZATION: calculate the mean and std of the training set.
    #################################
    train_transform = tnt.transform.compose([
        cv2_scale,
        np_reshape,
        transforms.ToTensor(),
    ])
    train_loader = torch.utils.data.DataLoader(
        OmniglotOS(root=opt.dataroot, train='train',
                   transform=train_transform, target_transform=None),
        batch_size=opt.batchSize, shuffle=True, **kwargs)

    pbar = tqdm(enumerate(train_loader))
    tmp = []
    for batch_idx, (data, labels) in pbar:
        tmp.append(data)
        pbar.set_description('[{}/{} ({:.0f}%)]\t'.format(
            batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader)))
    omn_mean = torch.cat(tmp).mean()
    omn_std = torch.cat(tmp).std()
    # Free cuda memory
    tmp = []
    data = []
    labels = []

    #################################
    # TRANSFORMATIONS: transformations for the TRAIN dataset
    #################################
    train_transform = tnt.transform.compose([
        cv2_scale,
        np_reshape,
        np_repeat,
        T.AugmentationAleju(channel_is_first_axis=False,
                            hflip=opt.hflip, vflip=opt.vflip,
                            rotation_deg=opt.rotation_deg,
                            shear_deg=opt.shear_deg,
                            translation_x_px=opt.translation_px,
                            translation_y_px=opt.translation_px),
        T.Normalize([omn_mean, omn_mean, omn_mean],
                    [omn_std, omn_std, omn_std]),
        transforms.ToTensor(),
    ])
    train_loader = torch.utils.data.DataLoader(
        OmniglotOS(root=opt.dataroot, train='train',
                   transform=train_transform, target_transform=None),
        batch_size=opt.batchSize, shuffle=True, **kwargs)

    #################################
    # TRANSFORMATIONS: transformations for the EVAL and TEST datasets
    #################################
    eval_test_transform = tnt.transform.compose([
        cv2_scale,
        np_reshape,
        np_repeat,
        T.Normalize([omn_mean, omn_mean, omn_mean],
                    [omn_std, omn_std, omn_std]),
        transforms.ToTensor(),
    ])
    val_loader = torch.utils.data.DataLoader(
        OmniglotOS(root=opt.dataroot, train='val',
                   transform=eval_test_transform, target_transform=None),
        batch_size=opt.batchSize, shuffle=False, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        OmniglotOS(root=opt.dataroot, train='test',
                   transform=eval_test_transform, target_transform=None),
        batch_size=opt.batchSize, shuffle=False, **kwargs)

    num_classes = train_loader.dataset.getNumClasses()
    f, params, stats = resnet(opt.depth, opt.width, num_classes, False)

    def create_optimizer(opt, lr):
        print 'creating optimizer with lr = ', lr
        if opt.optim_method == 'SGD':
            return torch.optim.SGD(params.values(), lr, 0.9,
                                   weight_decay=opt.weightDecay)
        elif opt.optim_method == 'Adam':
            return torch.optim.Adam(params.values(), lr)

    def log(t, optimizer, params, stats, opt):
        torch.save(dict(params={k: v.data for k, v in params.iteritems()},
                        stats=stats,
                        optimizer=optimizer.state_dict(),
                        epoch=t['epoch']),
                   open(os.path.join(opt.save, 'model.pt7'), 'w'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print z

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        for k, v in params.iteritems():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print '\nParameters:'
    kmax = max(len(key) for key in params.keys())
    for i, (key, v) in enumerate(params.items()):
        print str(i).ljust(5), key.ljust(kmax + 3), \
            str(tuple(v.size())).ljust(23), torch.typename(v.data)
    print '\nAdditional buffers:'
    kmax = max(len(key) for key in stats.keys())
    for i, (key, v) in enumerate(stats.items()):
        print str(i).ljust(5), key.ljust(kmax + 3), \
            str(tuple(v.size())).ljust(23), torch.typename(v)

    n_parameters = sum(p.numel() for p in params.values() + stats.values())
    print '\nTotal number of parameters:', n_parameters

    # Save folder
    best_val_acc = 0
    if opt.save == '':
        opt.save = './logs/resnet_' + str(random.getrandbits(128))[:-20]
    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    ######################################
    # TRAIN
    ######################################
    for epoch in range(opt.epochs):
        train_all_acc = []
        train_all_losses = []
        tick = time.clock()
        for batch_idx, (data, label) in enumerate(train_loader):
            if opt.cuda:
                data = data.cuda()
                label = label.cuda()
            inputs = Variable(data)
            targets = Variable(label)
            model_training = True
            train_preds = data_parallel(f, inputs, params, stats,
                                        model_training, np.arange(opt.ngpu))
            training_loss = F.cross_entropy(train_preds, targets)
            optimizer.zero_grad()
            training_loss.backward()
            optimizer.step()
            train_probs, train_classes = torch.max(train_preds, 1)
            train_acc = accuracy_score(targets.data.cpu().numpy(),
                                       train_classes.data.cpu().numpy())
            train_all_acc.append(train_acc)
            train_all_losses.append(training_loss.data.cpu().numpy()[0])
            # Free memory
            inputs = []
            targets = []

        # Adjust learning rate
        if epoch in epoch_step:
            lr = optimizer.param_groups[0]['lr']
            optimizer = create_optimizer(opt, lr * opt.lr_decay_ratio)

        # Validation
        if epoch % opt.eval_freq == 0:
            all_preds = []
            all_targets = []
            for batch_idx, (data, label) in enumerate(val_loader):
                if opt.cuda:
                    data = data.cuda()
                inputs = Variable(data)
                model_training = False
                y = data_parallel(f, inputs, params, stats,
                                  model_training, np.arange(opt.ngpu))
                all_preds.append(y.cpu().data.numpy())
                all_targets.append(label.numpy())
            all_preds = np.vstack(all_preds).argmax(1)
            all_targets = np.hstack(all_targets)
            val_acc = accuracy_score(all_targets, all_preds)
            print("++++++++++++++++++++++++")
            print("epoch: %d, val acc: %.2f" % (epoch, val_acc))
            print("++++++++++++++++++++++++")
            if val_acc >= best_val_acc:
                log({
                    "train_loss": float(np.mean(train_all_losses)),
                    "train_acc": float(np.mean(train_all_acc)),
                    "test_acc": val_acc,
                    "epoch": epoch,
                    "num_classes": num_classes,
                    "n_parameters": n_parameters,
                }, optimizer, params, stats, opt)
                best_val_acc = val_acc
            # Free memory
            data = []
            label = []
            y = []

        tock = time.clock()
        print("epoch: %d, train loss: %f, train acc: %.2f, time: %.2f s"
              % (epoch, np.round(np.mean(train_all_losses), 6),
                 np.mean(train_all_acc), np.round((tock - tick))))

def h(sample):
    inputs = Variable(cast(sample[0], opt.dtype))
    targets = Variable(cast(sample[1], 'long'))
    y = data_parallel(f, inputs, params, stats, sample[2],
                      list(range(opt.ngpu)))
    return F.cross_entropy(y, targets), y
