def h(sample):
     inputs = utils.cast(sample[0], opt.dtype).detach()
     targets = utils.cast(sample[1], 'long')
     if opt.teacher_id != '':
         # What is loss_groups?
         print('f = ', f)
         print('tensor inputs = ', inputs.shape)
         print('dict params = ', params.keys())
         print('sample = ', sample[2])
         print('opt.ngpu = ', range(opt.ngpu))
         y_s, y_t, loss_groups = utils.data_parallel(
             f, inputs, params, sample[2], range(opt.ngpu))
         print('y_s = ', y_s.shape)
         print('y_t = ', y_t.shape)
         print('loss_groups = ', loss_groups)
         ipdb.set_trace()
         loss_groups = [v.sum() for v in loss_groups]
         # Update meters_at, i.e. the attention-transfer (at_losses) loss meters
         [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
         return utils.distillation(
             y_s, y_t, targets, opt.temperature,
             opt.alpha) + opt.beta * sum(loss_groups), y_s
     else:
         y = utils.data_parallel(f, inputs, params, sample[2],
                                 range(opt.ngpu))[0]
         return F.cross_entropy(y, targets), y
Example n. 2
 def h(sample):
     # inputs: the input samples
     # targets: the labels
     inputs = utils.cast(sample[0], opt.dtype).detach()
     targets = utils.cast(sample[1], 'long')
     # If this is the student model,
     # train with the given (distillation) loss
     if opt.teacher_id != '':
         y_s, y_t, loss_groups = utils.data_parallel(
             f, inputs, params, sample[2], range(opt.ngpu))
         # Sum each loss group into a scalar
         loss_groups = [v.sum() for v in loss_groups]
         # Record each group's loss in the corresponding meter
         [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
         # First term: distillation. y_s: student network output, y_t: teacher network output, targets: ground-truth labels
         # Second term: the attention-transfer (AT) loss
         # Third part: the student network's output, returned alongside the loss
         # For AT, alpha = 0, so the first term reduces to the cross-entropy between the student output and the true labels
         # For KD, beta = 0, leaving only the distillation loss (summed over classes 1..C here)
         return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
                 + opt.beta * sum(loss_groups), y_s
     # If this is the teacher network,
     # train with standard cross-entropy
     else:
         # y is the network's output
         y = utils.data_parallel(f, inputs, params, sample[2],
                                 range(opt.ngpu))[0]
         return F.cross_entropy(y, targets), y
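
Neither utils.distillation nor the distillation helper used in later examples is shown on this page. As a rough guide, a minimal sketch consistent with the (y_s, y_t, targets, temperature, alpha) call sites is the standard Hinton-style knowledge-distillation loss below; the actual implementations in these repos may weight or reduce the terms differently.

import torch.nn.functional as F

def distillation(y_s, y_t, labels, T, alpha):
    # Soft-target term: KL divergence between temperature-softened student
    # and teacher distributions, scaled by T^2 as in Hinton et al.
    soft = F.kl_div(F.log_softmax(y_s / T, dim=1),
                    F.softmax(y_t / T, dim=1),
                    reduction='batchmean') * (T ** 2)
    # Hard-target term: ordinary cross-entropy against the ground-truth labels.
    hard = F.cross_entropy(y_s, labels)
    return alpha * soft + (1.0 - alpha) * hard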
 def h(sample):
     inputs = Variable(cast(sample[0], opt.dtype))
     targets = Variable(cast(sample[1], 'long'))
     if opt.teacher_id != '':
         if opt.gamma:
             ys, y_t_auto, y_t = data_parallel(f, inputs, params,
                                               stats, sample[2],
                                               np.arange(opt.ngpu))[:3]
             loss_l2 = torch.nn.MSELoss()
             T = 4
             loss_student = F.cross_entropy(ys, targets)
             loss_teacher = F.cross_entropy(y_t_auto, targets)
             loss_course = opt.beta * \
                 ((y_t_auto - ys) * (y_t_auto - ys)).sum() / opt.batchSize
             y_tech_temp = torch.autograd.Variable(y_t_auto.data,
                                                   requires_grad=False)
             log_kd = rocket_distillation(ys, y_t, targets, opt.temperature,
                                          opt.alpha)
             return rocket_distillation(ys, y_t, targets, opt.temperature, opt.alpha) \
                 + F.cross_entropy(y_t_auto, targets) + F.cross_entropy(ys, targets) + opt.beta * ((y_tech_temp - ys) * (
                     y_tech_temp - ys)).sum() / opt.batchSize, (ys, y_t_auto, loss_student, loss_teacher, loss_course, log_kd)
         else:
             y_s, y_t, loss_groups = data_parallel(f, inputs, params, stats,
                                                   sample[2],
                                                   np.arange(opt.ngpu))
             loss_groups = [v.sum() for v in loss_groups]
             [m.add(v.data[0]) for m, v in zip(meters_at, loss_groups)]
             return distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
                 + opt.beta * sum(loss_groups), y_s
     else:
         if opt.gamma:
             ys, y = data_parallel(f, inputs, params, stats, sample[2],
                                   np.arange(opt.ngpu))[:2]
             loss_l2 = torch.nn.MSELoss()
             T = 4
             loss_student = F.cross_entropy(ys, targets)
             loss_teacher = F.cross_entropy(y, targets)
             loss_course = opt.beta * \
                 ((y - ys) * (y - ys)).sum() / opt.batchSize
             if opt.grad_block:
                 y_course = torch.autograd.Variable(y.data,
                                                    requires_grad=False)
             else:
                 y_course = y
             return F.cross_entropy(y, targets) + F.cross_entropy(
                 ys, targets) + opt.beta * (
                     (y_course - ys) *
                     (y_course - ys)).sum() / opt.batchSize, (ys, y,
                                                              loss_student,
                                                              loss_teacher,
                                                              loss_course)
         else:
             y = data_parallel(f, inputs, params, stats, sample[2],
                               np.arange(opt.ngpu))[0]
             return F.cross_entropy(y, targets), y
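
In the gamma (rocket-launching) branches above, the teacher logits are re-wrapped with requires_grad=False before the squared-difference "course" term, so the hint loss pulls the student toward the teacher without back-propagating into the teacher head. A compact modern equivalent of that term, with hypothetical argument names, is:

import torch

def hint_loss(student_logits, teacher_logits, beta, batch_size):
    # detach() plays the role of Variable(..., requires_grad=False) above:
    # the gradient of this penalty reaches only the student logits.
    return beta * (teacher_logits.detach() - student_logits).pow(2).sum() / batch_size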
Example n. 4
    def forward(self, input):
        recurrent, _ = utils.data_parallel(
            self.rnn, input, self.ngpu)  # [T, b, h * 2]

        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)
        output = utils.data_parallel(
            self.embedding, t_rec, self.ngpu)  # [T * b, nOut]
        output = output.view(T, b, -1)

        return output
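
The utils.data_parallel called with a (module, input, ngpu) signature in the CRNN-style forward methods on this page is not shown. A minimal sketch of such a helper, assuming it simply falls back to a plain forward pass on CPU or a single GPU, could look like this; the repos' actual helper may differ:

import torch.nn as nn

def data_parallel(model, input, ngpu):
    # Split the batch across `ngpu` GPUs only when the input is already on
    # the GPU and more than one device is requested; otherwise run normally.
    if input.is_cuda and ngpu > 1:
        return nn.parallel.data_parallel(model, input, device_ids=list(range(ngpu)))
    return model(input)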
Example n. 5
 def h_ensemble(sample):
     inputs = Variable(cast(sample[0], opt.dtype))
     targets = Variable(cast(sample[1], 'long'))
     y_grassmann = data_parallel(f_grassmann, inputs, params_grassmann,
                                 stats_grassmann, sample[2],
                                 np.arange(opt.ngpu))
     y_oblique = data_parallel(f_oblique, inputs, params_oblique,
                               stats_oblique, sample[2],
                               np.arange(opt.ngpu))
     y_ensemble = y_grassmann + y_oblique
     return F.cross_entropy(y_ensemble, targets), y_ensemble
Example n. 7
    def forward(self, input):
        # conv features
        conv = utils.data_parallel(self.cnn, input, self.ngpu)
        b, c, h, w = conv.size()
        assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)  # [w, b, c]

        # rnn features
        output = utils.data_parallel(self.rnn, conv, self.ngpu)

        return output
Example n. 9
 def h(sample):
     inputs = Variable(cast(sample[0], opt.dtype))
     targets = Variable(cast(sample[1], 'long'))
     if opt.teacher_id != '':
         y_s, y_t, loss_groups = data_parallel(f, inputs, params, stats, sample[2], np.arange(opt.ngpu))
         loss_groups = [v.sum() for v in loss_groups]
         [m.add(v.data[0]) for m,v in zip(meters_at, loss_groups)]
         return distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
                 + opt.beta * sum(loss_groups), y_s
     else:
         y = data_parallel(f, inputs, params, stats, sample[2], np.arange(opt.ngpu))[0]
         return F.cross_entropy(y, targets), y
    def forward(self, x):
        #features = self.features(x)
        # out = F.relu(features, inplace=True)
        conv = utils.data_parallel(self.features, x, self.ngpu)
        # b, c, h, w = conv.size()
        # assert h == 1, "the height of conv must be 1"
        print(conv.size())
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)  # [w, b, c]

        # rnn features
        output = utils.data_parallel(self.rnn, conv, self.ngpu)

        return output
Example n. 11
 def h(sample):
     pdb.set_trace()
     inputs = Variable(cast(sample[0], opt.dtype))
     targets = Variable(cast(sample[1], 'long'))
     y = data_parallel(f, inputs, params, stats, sample[2],
                       np.arange(opt.ngpu))
     return F.cross_entropy(y, targets), y
Example n. 12
    def __init__(self, model, lr_master, n_epochs, n_iters, train_loader, test_loader,
                 feature_dim,
                 momentum=0.9, weight_decay=1e-4, optimizer_state=None,
                 logger=None, ngpu=1, gpu=0):
        self.model = utils.data_parallel(model, ngpu, gpu)

        self.n_iters = n_iters
        self.n_epochs = n_epochs
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.criterion = nn.CrossEntropyLoss().cuda()
        self.lr_master = lr_master
        self.optimizer = torch.optim.SGD(params=self.model.parameters(),
                                         lr=self.lr_master.lr,
                                         momentum=momentum,
                                         weight_decay=weight_decay,
                                         nesterov=True,
                                         )
        weight_params = []
        bias_params = []

        self.logger = logger
        self.ngpu = ngpu
        self.gpu = gpu
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.feature_dim = feature_dim
        self.iteration = 0
        self.criterion = F.cross_entropy
        self.ToPILImage = transforms.ToPILImage()
        self.ToTensor = transforms.ToTensor()
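
Here utils.data_parallel takes (model, ngpu, gpu) and returns the wrapped module itself rather than an output tensor. A minimal sketch of that flavour, assuming it is a thin wrapper around nn.DataParallel, might be the following; the actual helper in these trainers may differ:

import torch.nn as nn

def data_parallel(model, ngpu, gpu=0):
    # Move the model to the first device and wrap it for multi-GPU training
    # only when more than one GPU is requested.
    model = model.cuda(gpu)
    if ngpu > 1:
        model = nn.DataParallel(model, device_ids=list(range(gpu, gpu + ngpu)))
    return model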
Example n. 13
    def __init__(self,
                 model,
                 train_loader,
                 val_loader,
                 settings,
                 logger,
                 tensorboard_logger,
                 optimizer_state=None,
                 run_count=0):
        self.settings = settings

        self.model = utils.data_parallel(model=model,
                                         n_gpus=self.settings.n_gpus)
        self.train_loader = train_loader
        self.val_loader = val_loader

        self.criterion = nn.CrossEntropyLoss().cuda()
        self.lr = self.settings.lr
        self.optimizer = torch.optim.SGD(
            params=self.model.parameters(),
            lr=self.settings.lr,
            momentum=self.settings.momentum,
            weight_decay=self.settings.weight_decay,
            nesterov=True)
        if optimizer_state is not None:
            self.optimizer.load_state_dict(optimizer_state)

        self.logger = logger
        self.tensorboard_logger = tensorboard_logger
        self.run_count = run_count
Example n. 14
    def __init__(self,
                 model,
                 lr_master,
                 train_loader,
                 test_loader,
                 settings,
                 logger=None,
                 optimizer_state=None):
        """
        init trainer
        """

        self.settings = settings

        self.model = utils.data_parallel(model, self.settings.nGPU,
                                         self.settings.GPU)
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.criterion = nn.CrossEntropyLoss().cuda()
        self.lr_master = lr_master
        self.optimizer = torch.optim.SGD(
            params=self.model.parameters(),
            lr=self.lr_master.lr,
            momentum=self.settings.momentum,
            weight_decay=self.settings.weightDecay,
            nesterov=True,
        )
        if optimizer_state is not None:
            self.optimizer.load_state_dict(optimizer_state)
        self.logger = logger
        self.run_count = 0
        self.scalar_info = {}
Example n. 15
File: main.py Project: zj10/ND-Adam
 def h(sample):
     inputs = Variable(cast(sample[0], opt.dtype))
     targets = Variable(cast(sample[1], 'long'))
     y = data_parallel(f, inputs, params, stats, sample[2],
                       tuple(range(opt.ngpu)))
     logit_loss = 0.5 * torch.mean(torch.sum(y * y, 1))
     return F.cross_entropy(y, targets) + opt.logitDecay * logit_loss, y
Example n. 16
    def h(sample):
        global _outputs, _loss

        connection_map = np.array([[0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1],
                                   [0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0],
                                   [1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0]])

        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        net1_outputs = data_parallel(f_1, inputs, params_1, sample[2],
                                     list(range(opt.ngpu)))
        net2_outputs = model_2(inputs)
        net1_outputs = [o.float() for o in net1_outputs]
        net2_outputs = [o.float() for o in net2_outputs]

        _loss = []

        # hard supervision
        for i, o in enumerate(net1_outputs):
            _loss.append(F.cross_entropy(o, targets))

        for i, o in enumerate(net2_outputs):
            _loss.append(F.cross_entropy(o, targets))

        outputs = net1_outputs + net2_outputs
        # soft supervision
        for i, o in enumerate(outputs):
            for j, o2 in enumerate(outputs):
                if connection_map[i, j] > 0:
                    _loss.append(KL_divergence(o2.detach(), o))

        loss = sum(_loss)
        _outputs = net2_outputs[-1].detach()

        return loss, net1_outputs[-1]
Example n. 17
 def h(sample):
     inputs, targets, mode = sample
     inputs = inputs.cuda().detach()
     targets = targets.cuda().long().detach()
     if opt.teacher_id != '':
         if opt.kt_method == "at":
             y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))
             loss_groups = [v.sum() for v in loss_groups]
             [m.add(v.item()) for m,v in zip(meters_at, loss_groups)]
             return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) + opt.beta * sum(loss_groups), y_s
         elif opt.kt_method == "st":
             y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, sample[2], range(opt.ngpu))
             return torch.sqrt(torch.mean((y_s - y_t) ** 2)), y_s
     else:
         y = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))[0]
         return F.cross_entropy(y, targets), y
Example n. 18
 def compute_loss_test(sample):
     inputs = cast(sample[0], args.dtype)
     targets = cast(sample[1], 'long')
     y = data_parallel(model, inputs, params, sample[2],
                       list(range(args.ngpu))).float()
     if args.dataset == "awa2":
         return F.binary_cross_entropy_with_logits(y, targets.float()), y
     else:
         return F.cross_entropy(y, targets), y
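
compute_loss_test switches between a multi-label and a single-label loss depending on the dataset. The snippet below (with an arbitrary class count of 85, chosen only for illustration) shows the target formats the two losses expect:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 85)                      # batch of 4, 85 classes/attributes

# Multi-label (e.g. attribute vectors): one 0/1 target per class, float dtype.
multi_label_targets = torch.randint(0, 2, (4, 85)).float()
bce = F.binary_cross_entropy_with_logits(logits, multi_label_targets)

# Single-label classification: one integer class index per sample.
class_targets = torch.randint(0, 85, (4,))
ce = F.cross_entropy(logits, class_targets)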
Example n. 19
 def reset_model(self, model):
     self.model = utils.data_parallel(model, self.ngpu, self.gpu)
     parameters = filter(lambda p: p.requires_grad, self.model.parameters())
     self.optimizer = torch.optim.SGD(params=parameters,
                                      lr=self.lr_master.lr,
                                      momentum=self.momentum,
                                      weight_decay=self.weight_decay,
                                      nesterov=True,
                                      )
Example n. 20
 def h(sample):
     inputs, targets, mode = sample
     inputs = inputs.cuda().detach()
     targets = targets.cuda().long().detach()
     y_s, y_t, loss_groups = utils.data_parallel(f, inputs, params, mode, range(opt.ngpu))
     loss_groups = [v.sum() for v in loss_groups]
     [m.add(v.item()) for m,v in zip(meters_at, loss_groups)]
     return utils.distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
             + opt.beta * sum(loss_groups), y_s
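
The loss_groups returned by these data_parallel calls are per-layer transfer losses computed inside the student/teacher wrapper; their definition is not shown here. A minimal sketch of one such term in the attention-transfer style (Zagoruyko & Komodakis), which the AT examples on this page appear to use, is:

import torch.nn.functional as F

def attention_map(fm):
    # Spatial attention map: channel-wise mean of squared activations,
    # flattened and L2-normalized per sample.
    return F.normalize(fm.pow(2).mean(1).view(fm.size(0), -1))

def at_loss(student_fm, teacher_fm):
    # One element of loss_groups: distance between the two attention maps.
    return (attention_map(student_fm) - attention_map(teacher_fm)).pow(2).mean()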
Example n. 21
 def h(sample):
     inputs = Variable(sample[0].cuda())
     targets = Variable(sample[1].cuda().long())
     y_s, y_t, loss_groups = data_parallel(f, inputs, params, stats,
                                           sample[2], np.arange(opt.ngpu))
     loss_groups = [v.sum() for v in loss_groups]
     [m.add(v.data[0]) for m, v in zip(meters_at, loss_groups)]
     return distillation(y_s, y_t, targets, opt.temperature, opt.alpha) \
             + opt.beta * sum(loss_groups), y_s
Example n. 22
 def reset_model(self, model):
     self.model = utils.data_parallel(model, self.ngpu, self.gpu)
     self.optimizer = torch.optim.SGD(
         params=self.model.parameters(),
         lr=self.lr_master.lr,
         momentum=self.momentum,
         weight_decay=self.weight_decay,
         nesterov=True,
     )
Example n. 23
    def forward(self, input):
        # conv features
        #
        # for i in range(len(self.cnn)):
        #     input = self.cnn[i](input)
        #     print(self.cnn[i],input.size())

        conv = utils.data_parallel(self.cnn, input, self.ngpu)

        # conv=self.cnn(input)
        b, c, h, w = conv.size()  # batch size, channels, image height, image width
        # print('conv.size():',conv.size())
        assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)  # [w, b, c]
        # print('conv.size():',conv.size())

        # rnn features
        output = utils.data_parallel(self.rnn, conv, self.ngpu)

        return output
    def update_model(self, model):
        self.model = utils.data_parallel(model=model,
                                         ngpus=self.settings.nGPU,
                                         gpu0=self.settings.GPU)

        parameters = filter(lambda p: p.requires_grad,
                            self.model.parameters())

        self.optimizer = torch.optim.SGD(params=parameters,
                                         lr=self.lr_master.lr,
                                         momentum=self.settings.momentum,
                                         weight_decay=self.settings.weightDecay,
                                         nesterov=True)
Example n. 25
    def __init__(self, model, lr_master, n_epochs, n_iters, train_loader, test_loader,
                 feature_dim,
                 momentum=0.9, weight_decay=1e-4, optimizer_state=None,
                 logger=None, ngpu=1, gpu=0):
        self.model = utils.data_parallel(model, ngpu, gpu)

        self.n_iters = n_iters
        self.n_epochs = n_epochs
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.criterion = nn.CrossEntropyLoss().cuda()
        self.lr_master = lr_master
        """
        self.optimizer = torch.optim.SGD(params=self.model.parameters(),
                                         lr=self.lr_master.lr,
                                         momentum=momentum,
                                         weight_decay=weight_decay,
                                         nesterov=True,
                                         )"""
        weight_params = []
        bias_params = []
        for name, params in self.model.named_parameters():
            if "weight" in name:
                weight_params.append({"params": params})
                # print "add d params"
            elif "bias" in name:
                bias_params.append({"params": params})
                # print "add model params"
        self.optimizer = utils.caffeSGD(params=weight_params,
                                        lr=self.lr_master.lr,
                                        momentum=momentum,
                                        weight_decay=weight_decay,
                                        nesterov=True,
                                        )

        self.optimizer_2 = utils.caffeSGD(params=bias_params,
                                          lr=self.lr_master.lr * 2,
                                          momentum=momentum,
                                          weight_decay=0,
                                          nesterov=True,
                                          )
        self.logger = logger
        self.ngpu = ngpu
        self.gpu = gpu
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.feature_dim = feature_dim
        self.iteration = 0
        self.criterion = F.cross_entropy
        self.ToPILImage = transforms.ToPILImage()
        self.ToTensor = transforms.ToTensor()
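
The two caffeSGD optimizers above implement the Caffe convention of doubling the learning rate and disabling weight decay for bias parameters. With current PyTorch the same grouping can be expressed with a single standard SGD optimizer via parameter groups; a small self-contained sketch follows (the model and hyperparameter values are placeholders, and it reproduces only the grouping, not any Caffe-specific update semantics):

import torch

model = torch.nn.Linear(10, 2)            # placeholder model
base_lr, momentum, weight_decay = 0.1, 0.9, 1e-4

weight_params = [p for n, p in model.named_parameters() if "weight" in n]
bias_params = [p for n, p in model.named_parameters() if "bias" in n]

optimizer = torch.optim.SGD(
    [
        {"params": weight_params, "weight_decay": weight_decay},
        {"params": bias_params, "lr": base_lr * 2, "weight_decay": 0.0},
    ],
    lr=base_lr,
    momentum=momentum,
    nesterov=True,
)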
Example n. 26
 def h(sample):
     inputs = utils.cast(sample[0], opt.dtype).detach()
     targets = utils.cast(sample[1], 'long')
     if opt.teacher_id != '':
         if opt.kt_method == "at":
             y_s, y_t, loss_groups = utils.data_parallel(
                 f, inputs, params, sample[2], range(opt.ngpu))
             loss_groups = [v.sum() for v in loss_groups]
             [m.add(v.item()) for m, v in zip(meters_at, loss_groups)]
             return utils.distillation(
                 y_s, y_t, targets, opt.temperature,
                 opt.alpha) + opt.beta * sum(loss_groups), y_s
         elif opt.kt_method == "st":
             y_s, y_t, loss_list = utils.data_parallel(
                 f, inputs, params, sample[2], range(opt.ngpu))
             loss_list = [v.sum() for v in loss_list]
             [m.add(v.item()) for m, v in zip(meters_st, loss_list)]
             fc_loss = torch.sqrt(torch.mean((y_s - y_t)**2))
             loss_list.append(fc_loss)
             return loss_list, y_s
     else:
         y = utils.data_parallel(f, inputs, params, sample[2],
                                 range(opt.ngpu))[0]
         return F.cross_entropy(y, targets), y
Example n. 27
    def h(sample):
        global _outputs, _loss

        inputs = cast(sample[0], opt.dtype)
        targets = cast(sample[1], 'long')
        _outputs = data_parallel(f, inputs, params, sample[2],
                                 list(range(opt.ngpu)))
        _outputs = [o.float() for o in _outputs]

        _loss = []
        for o in _outputs:
            _loss.append(F.cross_entropy(o, targets))
            for o2 in _outputs:
                if o is not o2:
                    _loss.append(KL_divergence(o2.detach(), o))
        loss = sum(_loss)

        return loss, _outputs[-1]
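
The KL_divergence helper used for the soft, mutual-learning supervision in Examples 16 and 27 is not shown. A minimal sketch consistent with the call pattern KL_divergence(o2.detach(), o), where the first argument is the (detached) target distribution and only the second receives gradients, is below; the temperature T is an assumption:

import torch.nn.functional as F

def KL_divergence(target_logits, logits, T=1.0):
    # KL(softmax(target_logits/T) || softmax(logits/T)); gradients flow only
    # into `logits`, since the target is detached at the call site.
    p = F.log_softmax(logits / T, dim=1)
    q = F.softmax(target_logits / T, dim=1)
    return F.kl_div(p, q, reduction='batchmean') * (T ** 2)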
Example n. 28
    def __init__(self,
                 model,
                 lr_master,
                 n_epoch,
                 train_loader,
                 test_loader,
                 momentum=0.9,
                 weight_decay=1e-4,
                 optimizer_state=None,
                 tencrop=False,
                 logger=None,
                 ngpu=1,
                 gpu=0):
        """
        init trainer
        """

        self.model = utils.data_parallel(model, ngpu, gpu)

        self.n_epoch = n_epoch
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.ten_crop = False
        self.criterion = nn.CrossEntropyLoss().cuda()
        self.lr_master = lr_master
        self.optimizer = torch.optim.SGD(
            params=self.model.parameters(),
            lr=self.lr_master.lr,
            momentum=momentum,
            weight_decay=weight_decay,
            nesterov=True,
        )
        self.logger = logger
        self.run_count = 0
        self.scalar_info = {}
        self.ngpu = ngpu
        self.gpu = gpu
        self.momentum = momentum
        self.weight_decay = weight_decay
Example n. 29
 def h(sample):
     inputs = cast(sample[0], opt.dtype)
     targets = cast(sample[1], 'long')
     y = data_parallel(f, inputs, params, sample[2], list(range(opt.ngpu))).float()
     return F.cross_entropy(y, targets), y
Example n. 30
    def _network_split(self):
        r"""
        1. split the network into several segments with pre-define pivot set
        2. create auxiliary classifiers
        3. create optimizers for network segments and fcs
        """
        # register forward hook
        block_count = 0
        if self.settings.netType in ["ResNet", "PreResNet","CifarResNeXt","DARTSNet"]:
            i=0
            for module in self.model.modules():
                i+=1
                if isinstance(module, (BasicBlock, Bottleneck, PreBasicBlock,ResNeXtBottleneck,Cell)):
                    block_count += 1
                    module.block_index = block_count
                    if block_count in self.settings.pivotSet:
                        module.register_forward_hook(self._forward_hook)

        if self.settings.netType in ["PreResNet", "ResNet","CifarResNeXt","DARTSNet"]:
            if self.settings.netType == "DARTSNet":
                shallow_model = nn.Sequential(
                            nn.Conv2d(3, 3*self.settings.init_channels, 3, padding=1, bias=False),
                            nn.BatchNorm2d(3*self.settings.init_channels)
                        )
            elif self.settings.netType == "PreResNet":
                shallow_model = nn.Sequential(self.model.conv)
            elif self.settings.netType == "CifarResNeXt":
                shallow_model = nn.Sequential(
                    self.model.conv_1_3x3,
                    self.model.bn_1,
                    self.model.relu,)
            else:
                shallow_model = nn.Sequential(
                    self.model.conv1,
                    self.model.bn1,
                    self.model.relu,
                    self.model.maxpool,)
            print "init shallow head done!"
        else:
            assert False, "unsupported netType: %s" % self.settings.netType

        block_count = 0
        for module in self.model.modules():
            if isinstance(module, (PreBasicBlock, Bottleneck, BasicBlock,ResNeXtBottleneck,Cell)):
                # copy blocks
                if shallow_model is not None:
                    shallow_model.add_module(
                        str(len(shallow_model)), module)
                else:
                    shallow_model = nn.Sequential(module)
                block_count += 1

                # if block_count is equals to pivot_num, then create new segment
                if block_count in self.settings.pivotSet:
                    self.segments.append(shallow_model)
                    shallow_model = None
            else:
                pass
        self.segments.append(shallow_model)

        # create auxiliary classifier
        num_classes = self.settings.nClasses
        for i in range(len(self.segments) - 1):
            if isinstance(self.segments[i][-1], (Cell)):
                in_channels = self.segments[i][-1].preprocess1.conv21.in_channels
            elif isinstance(self.segments[i][-1], (ResNeXtBottleneck)):
                in_channels = self.segments[i][-1].conv_expand.out_channels
            elif isinstance(self.segments[i][-1], (PreBasicBlock, BasicBlock)):
                in_channels = self.segments[i][-1].conv2.out_channels
            elif isinstance(self.segments[i][-1], Bottleneck):
                in_channels = self.segments[i][-1].conv3.out_channels

            self.auxfc.append(AuxClassifier(
                in_channels=in_channels,
                num_classes=num_classes),)
        if self.settings.netType == "DARTSNet":
            final_fc = nn.Sequential(
                self.model.auxiliary_head,
                self.model.global_pooling,
                View(),
                self.model.classifier, )
        elif self.settings.netType == "PreResNet":
            final_fc = nn.Sequential(
                self.model.bn,
                self.model.relu,
                self.model.avg_pool,
                View(),
                self.model.fc,)
        elif self.settings.netType == "CifarResNeXt":
            final_fc = nn.Sequential(
                self.model.avg_pool,
                View(),
                self.model.classifier,)
        elif self.settings.netType == "ResNet":
            final_fc = nn.Sequential(
                self.model.avgpool,
                View(),
                self.model.fc,)

        self.auxfc.append(final_fc)

        # model parallel
        """
        self.segments = utils.data_parallel(model=self.segments,
                                            ngpus=self.settings.nGPU,
                                            gpu0=self.settings.GPU)
        """
        self.model = utils.data_parallel(model=self.model,
                                         ngpus=self.settings.nGPU,
                                         gpu0=self.settings.GPU)

        self.auxfc = utils.data_parallel(model=self.auxfc,
                                         ngpus=1,
                                         gpu0=self.settings.GPU)

        # create optimizers
        for i in range(len(self.segments)):
            temp_optim = []
            for j in range(i + 1):
                # add parameters in segmenets into optimizer
                # from the i-th optimizer contains [0:i] segments
                temp_optim.append({'params': self.segments[j].parameters(),
                                   'lr': self.lr_master.lr})

            # optimizer for segments and fc

            temp_seg_optim = torch.optim.SGD(
                temp_optim,
                momentum=self.settings.momentum,
                weight_decay=self.settings.weightDecay,
                nesterov=True,)

            temp_fc_optim = torch.optim.SGD(
                params=self.auxfc[i].parameters(),
                lr=self.lr_master.lr,
                momentum=self.settings.momentum,
                weight_decay=self.settings.weightDecay,
                nesterov=True,)

            self.seg_optimizer.append(temp_seg_optim)
            self.fc_optimizer.append(temp_fc_optim)
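
The _forward_hook registered on each pivot block is not included in this snippet. A hypothetical sketch of what such a hook typically does (pivot_outputs is an assumed attribute, keyed by the block_index assigned above) is:

def _forward_hook(self, module, input, output):
    # Stash the pivot block's output so the matching auxiliary classifier
    # can be trained on this intermediate feature map.
    self.pivot_outputs[module.block_index] = output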
Example n. 31
 def _set_parallel(self):
     self.model = utils.data_parallel(self.model, self.settings.nGPU,
                                      self.settings.GPU)
Example n. 32
def main():

    opt = Options().parse()
    epoch_step = json.loads(opt.epoch_step)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    # to prevent opencv from initializing CUDA in workers
    torch.randn(8).cuda()
    os.environ['CUDA_VISIBLE_DEVICES'] = ''

    kwargs = {
        'num_workers': opt.nthread,
        'pin_memory': True
    } if opt.cuda else {}
    cv2_scale = lambda x: cv2.resize(x, dsize=(opt.imageSize, opt.imageSize),
                                     interpolation=cv2.INTER_AREA).astype(np.uint8)
    np_reshape = lambda x: np.reshape(x, (opt.imageSize, opt.imageSize, opt.nchannels))
    np_repeat = lambda x: np.repeat(x, 3, axis=2)

    #################################
    # NORMALIZATION: Calculate the mean and std of training.
    #################################
    train_transform = tnt.transform.compose([
        cv2_scale,
        np_reshape,
        transforms.ToTensor(),
    ])
    train_loader = torch.utils.data.DataLoader(OmniglotOS(
        root=opt.dataroot,
        train='train',
        transform=train_transform,
        target_transform=None),
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               **kwargs)

    pbar = tqdm(enumerate(train_loader))
    tmp = []
    for batch_idx, (data, labels) in pbar:
        tmp.append(data)
        pbar.set_description('[{}/{} ({:.0f}%)]\t'.format(
            batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader)))
    omn_mean = torch.cat(tmp).mean()
    omn_std = torch.cat(tmp).std()
    # Free cuda memory
    tmp = []
    data = []
    labels = []

    #################################
    # TRANSFORMATIONS: transformations for the TRAIN dataset
    #################################
    train_transform = tnt.transform.compose([
        cv2_scale,
        np_reshape,
        np_repeat,
        T.AugmentationAleju(channel_is_first_axis=False,
                            hflip=opt.hflip,
                            vflip=opt.vflip,
                            rotation_deg=opt.rotation_deg,
                            shear_deg=opt.shear_deg,
                            translation_x_px=opt.translation_px,
                            translation_y_px=opt.translation_px),
        T.Normalize([omn_mean, omn_mean, omn_mean],
                    [omn_std, omn_std, omn_std]),
        transforms.ToTensor(),
    ])

    train_loader = torch.utils.data.DataLoader(OmniglotOS(
        root=opt.dataroot,
        train='train',
        transform=train_transform,
        target_transform=None),
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               **kwargs)

    #################################
    # TRANSFORMATIONS: transformations for the EVAL and TEST dataset
    #################################
    eval_test_transform = tnt.transform.compose([
        cv2_scale,
        np_reshape,
        np_repeat,
        T.Normalize([omn_mean, omn_mean, omn_mean],
                    [omn_std, omn_std, omn_std]),
        transforms.ToTensor(),
    ])
    val_loader = torch.utils.data.DataLoader(OmniglotOS(
        root=opt.dataroot,
        train='val',
        transform=eval_test_transform,
        target_transform=None),
                                             batch_size=opt.batchSize,
                                             shuffle=False,
                                             **kwargs)
    test_loader = torch.utils.data.DataLoader(OmniglotOS(
        root=opt.dataroot,
        train='test',
        transform=eval_test_transform,
        target_transform=None),
                                              batch_size=opt.batchSize,
                                              shuffle=False,
                                              **kwargs)

    num_classes = train_loader.dataset.getNumClasses()
    f, params, stats = resnet(opt.depth, opt.width, num_classes, False)

    def create_optimizer(opt, lr):
        print 'creating optimizer with lr = ', lr
        if opt.optim_method == 'SGD':
            return torch.optim.SGD(params.values(),
                                   lr,
                                   0.9,
                                   weight_decay=opt.weightDecay)
        elif opt.optim_method == 'Adam':
            return torch.optim.Adam(params.values(), lr)

    def log(t, optimizer, params, stats, opt):
        torch.save(
            dict(params={k: v.data
                         for k, v in params.iteritems()},
                 stats=stats,
                 optimizer=optimizer.state_dict(),
                 epoch=t['epoch']),
            open(os.path.join(opt.save, 'model.pt7'), 'w'))
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print z

    optimizer = create_optimizer(opt, opt.lr)

    epoch = 0
    if opt.resume != '':
        state_dict = torch.load(opt.resume)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        for k, v in params.iteritems():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])

    print '\nParameters:'
    kmax = max(len(key) for key in params.keys())
    for i, (key, v) in enumerate(params.items()):
        print str(i).ljust(5), key.ljust(kmax + 3), str(tuple(
            v.size())).ljust(23), torch.typename(v.data)
    print '\nAdditional buffers:'
    kmax = max(len(key) for key in stats.keys())
    for i, (key, v) in enumerate(stats.items()):
        print str(i).ljust(5), key.ljust(kmax + 3), str(tuple(
            v.size())).ljust(23), torch.typename(v)

    n_parameters = sum(p.numel() for p in params.values() + stats.values())
    print '\nTotal number of parameters:', n_parameters

    # Save folder
    best_val_acc = 0
    if opt.save == '':
        opt.save = './logs/resnet_' + str(random.getrandbits(128))[:-20]
    if not os.path.exists(opt.save):
        os.mkdir(opt.save)

    ######################################
    # TRAIN
    ######################################
    for epoch in range(opt.epochs):
        train_all_acc = []
        train_all_losses = []
        tick = time.clock()
        for batch_idx, (data, label) in enumerate(train_loader):
            if opt.cuda:
                data = data.cuda()
                label = label.cuda()
            inputs = Variable(data)
            targets = Variable(label)

            model_training = True
            train_preds = data_parallel(f, inputs, params, stats,
                                        model_training, np.arange(opt.ngpu))
            training_loss = F.cross_entropy(train_preds, targets)
            optimizer.zero_grad()
            training_loss.backward()
            optimizer.step()

            train_probs, train_classes = torch.max(train_preds, 1)
            train_acc = accuracy_score(targets.data.cpu().numpy(),
                                       train_classes.data.cpu().numpy())

            train_all_acc.append(train_acc)
            train_all_losses.append(training_loss.data.cpu().numpy()[0])
            # Free memory
            inputs = []
            targets = []

        # Adjust learning rate
        if epoch in epoch_step:
            lr = optimizer.param_groups[0]['lr']
            optimizer = create_optimizer(opt, lr * opt.lr_decay_ratio)

        # Validation
        if epoch % opt.eval_freq == 0:
            all_preds = []
            all_targets = []
            for batch_idx, (data, label) in enumerate(val_loader):
                if opt.cuda:
                    data = data.cuda()
                inputs = Variable(data)
                model_training = False
                y = data_parallel(f, inputs, params, stats, model_training,
                                  np.arange(opt.ngpu))
                all_preds.append(y.cpu().data.numpy())
                all_targets.append(label.numpy())

            all_preds = np.vstack(all_preds).argmax(1)
            all_targets = np.hstack(all_targets)
            val_acc = accuracy_score(all_targets, all_preds)
            print("++++++++++++++++++++++++")
            print("epoch: %d, val acc: %.2f" % (epoch, val_acc))
            print("++++++++++++++++++++++++")

            if val_acc >= best_val_acc:
                log(
                    {
                        "train_loss": float(np.mean(train_all_losses)),
                        "train_acc": float(np.mean(train_all_acc)),
                        "test_acc": val_acc,
                        "epoch": epoch,
                        "num_classes": num_classes,
                        "n_parameters": n_parameters,
                    }, optimizer, params, stats, opt)
                best_val_acc = val_acc

            # Free memory
            data = []
            label = []
            y = []

        tock = time.clock()

        print("epoch: %d, train loss: %f, train acc: %.2f, time: %.2f s" %
              (epoch, np.round(np.mean(train_all_losses), 6),
               np.mean(train_all_acc), np.round((tock - tick))))
Example n. 33
 def h(sample):
     inputs = Variable(cast(sample[0], opt.dtype))
     targets = Variable(cast(sample[1], 'long'))
     y = data_parallel(f, inputs, params, stats, sample[2], list(range(opt.ngpu)))
     return F.cross_entropy(y, targets), y
Example n. 34
 def h(sample):
     inputs = cast(sample[0], opt.dtype)
     targets = cast(sample[1], 'long')
     y = data_parallel(f, inputs, params, sample[2],
                       list(range(opt.ngpu))).float()
     return F.cross_entropy(y, targets), y