Example #1
    def __init__(self, device=None, learning_rate=1e-3, act=F.relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0

        # Loss
        self.recon_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])

        # Optimizer
        self.setup_meta_learners()
Example #2
    def __init__(self, device=None, learning_rate=1e-3, act=F.leaky_relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0
        self.loss_ml = 0

        # Loss
        self.rc_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])

        # Optimizer
        self.optimizer = Adam(learning_rate)  #TODO: is Adam appropriate?
        self.optimizer.setup(self.model)
        self.optimizer.use_cleargrads()
        self.setup_meta_learners()
Example #3
    def __init__(self, device=None, learning_rate=1e-3, act=F.relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0

        # Loss
        self.recon_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])
        
        # Optimizer
        self.setup_meta_learners()
Example #4
    def __init__(self, device=None, learning_rate=1e-3, act=F.leaky_relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0
        self.loss_ml = 0

        # Loss
        self.rc_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])
        
        # Optimizer
        self.optimizer = Adam(learning_rate)  #TODO: is Adam appropriate?
        self.optimizer.setup(self.model)
        self.optimizer.use_cleargrads()
        self.setup_meta_learners()
Example #5
def test_forward():
    device = None
    act = F.leaky_relu
    model = Model(device, act)
    model_params = OrderedDict([x for x in model.namedparams()])

    x_data = np.random.rand(4, 3, 32, 32).astype(np.float32)
    y_data = np.random.randint(0, 10, 4).astype(np.int32)
    x = Variable(x_data)
    y = Variable(y_data)

    # forward
    y_pred = model(x, model_params, test=False)
    l = F.softmax_cross_entropy(y_pred, y)

    # backward
    model.cleargrads()
    l.backward(retain_grad=True)

    # replace the Variables held in model_params
    for k, v in model_params.items():
        w = Variable(np.copy(v.grad))
        w_ = F.dropout(w)
        model_params[k] = w_

    # forward
    y_pred = model(x, model_params, test=False)
    l = F.softmax_cross_entropy(y_pred, y)

    # backward
    model.cleargrads()
    l.backward(retain_grad=True)

    # check
    print("after backward")
    for k, v in model_params.items():
        if v.grad is not None:
            print(k)
Example #6
def test_forward():
    device = None
    act = F.leaky_relu
    model = Model(device, act)
    model_params = OrderedDict([x for x in model.namedparams()])

    x_data = np.random.rand(4, 3, 32, 32).astype(np.float32)
    y_data = np.random.randint(0, 10, 4).astype(np.int32)
    x = Variable(x_data)
    y = Variable(y_data)

    # forward
    y_pred = model(x, model_params, test=False)
    l = F.softmax_cross_entropy(y_pred, y)
    
    # backward
    model.cleargrads()
    l.backward(retain_grad=True)

    # replace the Variables held in model_params
    for k, v in model_params.items():
        w = Variable(np.copy(v.grad))
        w_ = F.dropout(w)
        model_params[k] = w_

    # forward
    y_pred = model(x, model_params, test=False)
    l = F.softmax_cross_entropy(y_pred, y)
    
    # backward
    model.cleargrads()
    l.backward(retain_grad=True)

    # check
    print("after backward")
    for k, v in model_params.items():
        if v.grad is not None:
            print(k)
Example #7
class Experiment000(object):
    """
    FCNN model
    """
    def __init__(self, device=None, learning_rate=1e-3, act=F.relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0

        # Loss
        self.recon_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])

        # Optimizer
        self.setup_meta_learners()

    def setup_meta_learners(self, ):
        #TODO: multiple layers, modification of inputs
        self.meta_learners = []
        self.opt_meta_learners = []

        # Meta-learner
        for k, v in self.model_params.items():
            # meta-learner taking gradient in batch dimension
            #ml = MetaLearner(np.prod(v.shape))
            ml = MetaLearner(1, )
            ml.to_gpu(self.device) if self.device is not None else None
            self.meta_learners.append(ml)

            # optimizer of meta-learner
            opt = optimizers.Adam(1e-3)
            opt.setup(ml)
            opt.use_cleargrads()
            self.opt_meta_learners.append(opt)

    def train(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Forward/Backward of learner w.r.t. cross-entropy
        y_pred0 = self.model(x_l0, self.model_params)
        loss_ce = F.softmax_cross_entropy(y_pred0, y_l)
        self.cleargrads()
        loss_ce.backward(retain_grad=True)
        loss_ce.unchain_backward()

        # Forward of meta-learner, i.e., parameter update
        for i, elm in enumerate(self.model_params.items()):
            k, p = elm
            with cuda.get_device_from_id(self.device):
                shape = p.shape
                xp = cuda.get_array_module(p.data)

                x = p.grad
                #grad = xp.reshape(x, (np.prod(shape), ))
                grad = xp.reshape(x, (np.prod(shape), 1))
                meta_learner = self.meta_learners[i]
                g = meta_learner(Variable(grad))  # forward
                w = p - F.reshape(g, shape)
                self.model_params[k] = w  # parameter update
                #self.model_params[k] = F.reshape(g, shape)

        # Forward/Backward of learner w.r.t. stochastic regularization
        y_pred0 = self.model(x_u0, self.model_params)
        y_pred1 = self.model(x_u1, self.model_params)
        loss_rec = self.recon_loss(F.softmax(y_pred0), F.softmax(y_pred1))
        self.cleargrads()
        loss_rec.backward()

        # Update meta-learner
        for meta_learner in self.meta_learners:
            meta_learner.cleargrads()
        for opt in self.opt_meta_learners:
            opt.update()

    def test(self, x, y):
        y_pred = self.model(x, self.model_params, test=True)
        acc = F.accuracy(y_pred, y)
        return acc

    def cleargrads(self, ):
        for k, v in self.model_params.items():
            v.cleargrad()
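
A minimal usage sketch (not part of the original listing) for the Experiment000 class above. It assumes the meta_st package with Model, MetaLearner, and ReconstructionLoss is importable; rand_batch is just a local helper producing dummy CIFAR-10-shaped batches like those in the test_forward examples.

import numpy as np
import chainer.functions as F
from chainer import Variable

def rand_batch():
    # dummy batch shaped like CIFAR-10 inputs, as in the test_forward examples
    return Variable(np.random.rand(4, 3, 32, 32).astype(np.float32))

x_l0, x_l1 = rand_batch(), rand_batch()  # two augmentations of labeled data
x_u0, x_u1 = rand_batch(), rand_batch()  # two augmentations of unlabeled data
y_l = Variable(np.random.randint(0, 10, 4).astype(np.int32))

exp = Experiment000(device=None, learning_rate=1e-3, act=F.relu, T=3)
exp.train(x_l0, x_l1, y_l, x_u0, x_u1)   # one learner + meta-learner step
print(exp.test(x_l0, y_l).data)          # accuracy on the labeled batch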
Example #8
class Experiment000(object):
    """
    - Stochastic Regularization
    - FCNN
    """
    def __init__(self, device=None, learning_rate=1e-3, act=F.leaky_relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0
        self.loss_ml = 0

        # Loss
        self.rc_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])

        # Optimizer
        self.optimizer = Adam(learning_rate)  #TODO: is Adam appropriate?
        self.optimizer.setup(self.model)
        self.optimizer.use_cleargrads()
        self.setup_meta_learners()

    def setup_meta_learners(self, ):
        self.meta_learners = []
        self.ml_optimizers = []

        # Meta-learner
        for _ in self.model_params:
            # meta-learner taking gradient in batch dimension
            ml = MetaLearner(inmap=1, midmap=1, outmap=1)
            ml.to_gpu(self.device) if self.device is not None else None
            self.meta_learners.append(ml)

            # optimizer of meta-learner
            opt = optimizers.Adam(1e-3)
            opt.setup(ml)
            opt.use_cleargrads()
            self.ml_optimizers.append(opt)

    def train(self, x_l0, x_l1, y_l, x_u0, x_u1):
        self._train_for_primary_task(x_l0, y_l)
        self._train_for_auxiliary_task(x_l0, x_l1, y_l, x_u0, x_u1)

        self.t += 1
        if self.t == self.T:
            self._train_meta_learners()
            self.t = 0

    def _train_for_primary_task(self, x_l0, y_l):
        y_pred = self.model(x_l0, self.model_params)
        loss_ce = F.softmax_cross_entropy(y_pred, y_l)
        self._cleargrads()
        loss_ce.backward(retain_grad=True)
        self.optimizer.update(self.model_params)

    def _train_for_auxiliary_task(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Compute gradients
        y_pred0 = self.model(x_u0, self.model_params)
        y_pred1 = self.model(x_u1, self.model_params)
        loss_rc = self.rc_loss(y_pred0, y_pred1)
        self._cleargrads()
        loss_rc.backward(retain_grad=True)

        # Update optimizee parameters by meta-learner
        model_params = self.model_params
        for i, elm in enumerate(model_params.items()):
            name, w = elm
            meta_learner = self.meta_learners[i]
            ml_optimizer = self.ml_optimizers[i]
            shape = w.shape
            with cuda.get_device_from_id(self.device):
                xp = cuda.get_array_module(w.data)
                g_old = w.grad  # no need to deep copy
                grad_data = xp.reshape(g_old, (np.prod(shape), 1))

                # refine grad, update w, and replace
                grad = Variable(grad_data)
                g = meta_learner(grad)  #TODO: use either h or c
                w -= F.reshape(g, shape)
            model_params[name] = w

        # Forward the primary task for training the meta-learners
        #TODO: use the same labeled data?
        y_pred = self.model(x_l0, self.model_params)
        self.loss_ml += F.softmax_cross_entropy(y_pred, y_l)

    def _train_meta_learners(self, ):
        self._cleargrads()
        self.loss_ml.backward(retain_grad=True)
        for opt in self.ml_optimizers:
            opt.update()
        self.loss_ml.unchain_backward()
        self.loss_ml = 0

    def test(self, x, y):
        y_pred = self.model(x, self.model_params, test=True)
        acc = F.accuracy(y_pred, y)
        return acc

    def _cleargrads(self, ):
        for k, v in self.model_params.items():
            v.cleargrad()
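
The examples construct MetaLearner objects from the meta_st package but never show their definition. Purely as a hypothetical stand-in (not the actual meta_st implementation), the sketch below shows the minimal interface such a module needs to fit the calls above: it accepts a gradient reshaped to (n_elements, 1) and returns an update of the same shape.

import chainer
import chainer.functions as F
import chainer.links as L

class ToyMetaLearner(chainer.Chain):
    """Hypothetical stand-in for meta_st's MetaLearner: an element-wise MLP
    that maps each gradient value to an update value, mirroring the
    (np.prod(shape), 1) reshape used in the examples above."""

    def __init__(self, inmap=1, midmap=4, outmap=1):
        super(ToyMetaLearner, self).__init__()
        with self.init_scope():
            self.l0 = L.Linear(inmap, midmap)
            self.l1 = L.Linear(midmap, outmap)

    def __call__(self, grad):
        # grad: Variable of shape (n_elements, 1) holding a flattened gradient
        h = F.tanh(self.l0(grad))
        return self.l1(h)  # update with the same (n_elements, 1) shape

The real meta_st MetaLearner is presumably recurrent (the "#TODO: use either h or c" comment above hints at an LSTM), which this toy version does not model.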
Example #9
class Experiment000(object):
    """
    - Stochastic Regularization
    - Resnet x 5
    - Objective of the meta-learner is accumulated over T steps instead of one
    """
    def __init__(self, device=None, learning_rate=1e-3, act=F.relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0
        self.loss_ml = 0

        # Loss
        self.recon_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])

        # Optimizer
        self.optimizer = Adam(learning_rate)
        self.optimizer.setup(self.model)
        self.optimizer.use_cleargrads()
        self.setup_meta_learners()

    def setup_meta_learners(self, ):
        #TODO: multiple layers, modification of inputs
        self.meta_learners = []
        self.opt_meta_learners = []

        # Meta-learner
        for k, v in self.model_params.items():
            # meta-learner taking gradient in batch dimension
            ml = MetaLearner(np.prod(v.shape))
            ml.to_gpu(self.device) if self.device is not None else None
            self.meta_learners.append(ml)

            # optimizer of meta-learner
            opt = optimizers.Adam(1e-3)
            opt.setup(ml)
            opt.use_cleargrads()
            self.opt_meta_learners.append(opt)

    def train(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Train learner and meta-learner
        self._train(x_l0, x_l1, y_l)
        self._train_meta_learner(x_l0, x_l1, y_l, x_u0, x_u1)

    def _train(self, x0, x1, y):
        # Cross Entropy Loss
        y_pred0 = self.model(x0, self.model_params)
        loss_ce = F.softmax_cross_entropy(y_pred0, y)
        self.cleargrads()
        loss_ce.backward()

        self.optimizer.update(self.model_params)

    def _train_meta_learner(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Stochastic Regularization (i.e., Consistency Loss)
        y_pred0 = self.model(x_u0, self.model_params)
        y_pred1 = self.model(x_u1, self.model_params)
        loss_rec = self.recon_loss(F.softmax(y_pred0), F.softmax(y_pred1))
        self.cleargrads()
        loss_rec.backward()

        # update learner using loss_rec and meta-learner
        self.update_parameter_by_meta_learner(self.model_params, loss_rec,
                                              x_l0, x_l1, y_l)

        self.t += 1
        if self.t == self.T:
            self.train_meta_learner()
            self.t = 0  # reset the step counter

    def update_parameter_by_meta_learner(self, model_params, loss, x_l0, x_l1,
                                         y_l):

        # Forward meta-learner
        namedparams = model_params
        for i, elm in enumerate(namedparams.items()):  # parameter-loop
            k, p = elm
            with cuda.get_device_from_id(self.device):
                shape = p.shape
                xp = cuda.get_array_module(p.data)

                x = p.grad
                grad = xp.reshape(x, (np.prod(shape), ))
                meta_learner = self.meta_learners[i]
                g = meta_learner(Variable(grad))  # forward
                w = p - F.reshape(g, shape)
                self.model_params[k] = w

        # Train meta-learner with main objective
        y_pred = self.model(x_l0, self.model_params)
        self.loss_ml += F.softmax_cross_entropy(y_pred, y_l)

    def train_meta_learner(self, ):
        self.cleargrads()  # need to clear W's grad due to loss_rec.backward
        for meta_learner in self.meta_learners:
            meta_learner.cleargrads()
        self.loss_ml.backward(retain_grad=True)
        for opt in self.opt_meta_learners:
            opt.update()
        self.loss_ml.unchain_backward()  #TODO: is this the proper place to unchain?
        self.loss_ml = 0

    def test(self, x, y):
        y_pred = self.model(x, self.model_params, test=True)
        acc = F.accuracy(y_pred, y)
        return acc

    def cleargrads(self, ):
        for k, v in self.model_params.items():
            v.cleargrad()
Example #10
class Experiment000(object):
    """
    FCNN model
    """
    def __init__(self, device=None, learning_rate=1e-3, act=F.relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0

        # Loss
        self.recon_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])
        
        # Optimizer
        self.optimizer = Adam(learning_rate)
        self.optimizer.setup(self.model)
        self.optimizer.use_cleargrads()
        self.setup_meta_learners()

    def setup_meta_learners(self, ):
        #TODO: multiple layers, modification of inputs
        self.meta_learners = []
        self.opt_meta_learners = []

        # Meta-learner
        for k, v in self.model_params.items():
            # meta-learner taking gradient in batch dimension
            ml = MetaLearner(np.prod(v.shape))
            ml.to_gpu(self.device) if self.device is not None else None
            self.meta_learners.append(ml)

            # optimizer of meta-learner
            opt = optimizers.Adam(1e-3)
            opt.setup(ml)
            opt.use_cleargrads()
            self.opt_meta_learners.append(opt)        
                
    def train(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Train learner and meta-learner
        self._train(x_l0, x_l1, y_l)
        self._train_meta_learner(x_l0, x_l1, y_l, x_u0, x_u1)

    def _train(self, x0, x1, y):
        # Cross Entropy Loss
        y_pred0 = self.model(x0, self.model_params)
        loss_ce = F.softmax_cross_entropy(y_pred0, y)
        self.cleargrads()
        loss_ce.backward()

        self.optimizer.update(self.model_params)

    def _train_meta_learner(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Stochastic Regularization (i.e., Consistency Loss)
        y_pred0 = self.model(x_u0, self.model_params)
        y_pred1 = self.model(x_u1, self.model_params)
        #TODO: better to not use softmax?
        loss_rec = self.recon_loss(F.softmax(y_pred0), F.softmax(y_pred1))
        self.cleargrads()
        loss_rec.backward()

        # update learner using loss_rec and meta-learner
        self.update_parameter_by_meta_learner(
            self.model_params, loss_rec, 
            x_l0, x_l1, y_l)
        self.train_meta_learner(x_l1, y_l)

    def update_parameter_by_meta_learner(
            self, model_params, loss, 
            x_l0, x_l1, y_l):

        # Forward meta-learner
        namedparams = model_params
        for i, elm in enumerate(namedparams.items()):  # parameter-loop
            k, p = elm
            with cuda.get_device_from_id(self.device):
                shape = p.shape
                xp = cuda.get_array_module(p.data)

                x = p.grad
                grad = xp.reshape(x, (np.prod(shape), ))
                meta_learner = self.meta_learners[i]
                g = meta_learner(Variable(grad))  # forward
                w = p - F.reshape(g, shape)
                self.model_params[k] = w

    def train_meta_learner(self, x_l1, y_l):
        # Train meta-learner with main objective
        y_pred = self.model(x_l1, self.model_params)

        #TODO: recurrent training leads to a memory leak
        loss_ml = F.softmax_cross_entropy(y_pred, y_l)
        
        self.cleargrads()  # need to clear W's grad due to loss_rec.backward
        for meta_learner in self.meta_learners:
            meta_learner.cleargrads()
        loss_ml.backward(retain_grad=True)
        for opt in self.opt_meta_learners:
            opt.update()

        loss_ml.unchain_backward()  #TODO: is this the proper place to unchain?

    def test(self, x, y):
        y_pred = self.model(x, self.model_params, test=True)
        acc = F.accuracy(y_pred, y)
        return acc

    def cleargrads(self, ):
        for k, v in self.model_params.items():
            v.cleargrad()
Example #11
class Experiment000(object):
    """
    FCNN model
    """
    def __init__(self, device=None, learning_rate=1e-3, act=F.relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0

        # Loss
        self.recon_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])
        
        # Optimizer
        self.setup_meta_learners()
        
    def setup_meta_learners(self, ):
        #TODO: multiple layers, modification of inputs
        self.meta_learners = []
        self.opt_meta_learners = []

        # Meta-learner
        for k, v in self.model_params.items():
            # meta-learner taking gradient in batch dimension
            #ml = MetaLearner(np.prod(v.shape))
            ml = MetaLearner(1, )
            ml.to_gpu(self.device) if self.device is not None else None
            self.meta_learners.append(ml)

            # optimizer of meta-learner
            opt = optimizers.Adam(1e-3)
            opt.setup(ml)
            opt.use_cleargrads()
            self.opt_meta_learners.append(opt)        
                
    def train(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Forward/Backward of learner w.r.t. cross-entropy
        y_pred0 = self.model(x_l0, self.model_params)
        loss_ce = F.softmax_cross_entropy(y_pred0, y_l)
        self.cleargrads()
        loss_ce.backward(retain_grad=True)
        loss_ce.unchain_backward()

        # Forward of meta-learner, i.e., parameter update
        for i, elm in enumerate(self.model_params.items()):
            k, p = elm
            with cuda.get_device_from_id(self.device):
                shape = p.shape
                xp = cuda.get_array_module(p.data)

                x = p.grad
                #grad = xp.reshape(x, (np.prod(shape), ))
                grad = xp.reshape(x, (np.prod(shape), 1))
                meta_learner = self.meta_learners[i]
                g = meta_learner(Variable(grad))  # forward
                w = p - F.reshape(g, shape)
                self.model_params[k] = w  # parameter update
                #self.model_params[k] = F.reshape(g, shape)

        # Forward/Backward of learner w.r.t. stochastic regularization
        y_pred0 = self.model(x_u0, self.model_params)
        y_pred1 = self.model(x_u1, self.model_params)
        loss_rec = self.recon_loss(F.softmax(y_pred0), 
                                   F.softmax(y_pred1))
        self.cleargrads()
        loss_rec.backward()
        
        # Update meta-learner
        for meta_learner in self.meta_learners:
            meta_learner.cleargrads()
        for opt in self.opt_meta_learners:
            opt.update()
            

    def test(self, x, y):
        y_pred = self.model(x, self.model_params, test=True)
        acc = F.accuracy(y_pred, y)
        return acc

    def cleargrads(self, ):
        for k, v in self.model_params.items():
            v.cleargrad()
Example #12
class Experiment000(object):
    """
    - Stochastic Regularization
    - FCNN
    """
    def __init__(self, device=None, learning_rate=1e-3, act=F.leaky_relu, T=3):
        # Settings
        self.device = device
        self.act = act
        self.learning_rate = learning_rate
        self.T = T
        self.t = 0
        self.loss_ml = 0

        # Loss
        self.rc_loss = ReconstructionLoss()

        # Model
        from meta_st.cifar10.cnn_model_001 import Model
        self.model = Model(device, act)
        self.model.to_gpu(device) if device is not None else None
        self.model_params = OrderedDict([x for x in self.model.namedparams()])
        
        # Optimizer
        self.optimizer = Adam(learning_rate)  #TODO: is Adam appropriate?
        self.optimizer.setup(self.model)
        self.optimizer.use_cleargrads()
        self.setup_meta_learners()

    def setup_meta_learners(self, ):
        self.meta_learners = []
        self.ml_optimizers = []

        # Meta-learner
        for _ in self.model_params:
            # meta-learner taking gradient in batch dimension
            ml = MetaLearner(inmap=1, midmap=1, outmap=1)
            ml.to_gpu(self.device) if self.device is not None else None
            self.meta_learners.append(ml)

            # optimizer of meta-learner
            opt = optimizers.Adam(1e-3)
            opt.setup(ml)
            opt.use_cleargrads()
            self.ml_optimizers.append(opt)        

    def train(self, x_l0, x_l1, y_l, x_u0, x_u1):
        self._train_for_primary_task(x_l0, y_l)
        self._train_for_auxiliary_task(x_l0, x_l1, y_l, x_u0, x_u1)
        
        self.t += 1
        if self.t == self.T:
            self._train_meta_learners()
            self.t = 0

    def _train_for_primary_task(self, x_l0, y_l):
        y_pred = self.model(x_l0, self.model_params)
        loss_ce = F.softmax_cross_entropy(y_pred, y_l)
        self._cleargrads()
        loss_ce.backward(retain_grad=True)
        self.optimizer.update(self.model_params)
        
    def _train_for_auxiliary_task(self, x_l0, x_l1, y_l, x_u0, x_u1):
        # Compute gradients
        y_pred0 = self.model(x_u0, self.model_params)
        y_pred1 = self.model(x_u1, self.model_params)
        loss_rc = self.rc_loss(y_pred0, y_pred1)
        self._cleargrads()
        loss_rc.backward(retain_grad=True)

        # Update optimizee parameters by meta-learner
        model_params = self.model_params
        for i, elm in enumerate(model_params.items()):
            name, w = elm
            meta_learner = self.meta_learners[i]
            ml_optimizer = self.ml_optimizers[i]
            shape = w.shape
            with cuda.get_device_from_id(self.device):
                xp = cuda.get_array_module(w.data)
                g_old = w.grad  # no need to deep copy
                grad_data = xp.reshape(g_old, (np.prod(shape), 1))
            
                # refine grad, update w, and replace
                grad = Variable(grad_data)
                g = meta_learner(grad)  #TODO: use either h or c
                w -= F.reshape(g, shape)
            model_params[name] = w
                
        # Forward the primary task for training the meta-learners
        #TODO: use the same labeled data?
        y_pred = self.model(x_l0, self.model_params)
        self.loss_ml += F.softmax_cross_entropy(y_pred, y_l)

    def _train_meta_learners(self, ):
        self._cleargrads()
        self.loss_ml.backward(retain_grad=True)
        for opt in self.ml_optimizers:
            opt.update()
        self.loss_ml.unchain_backward()
        self.loss_ml = 0
            
    def test(self, x, y):
        y_pred = self.model(x, self.model_params, test=True)
        acc = F.accuracy(y_pred, y)
        return acc

    def _cleargrads(self, ):
        for k, v in self.model_params.items():
            v.cleargrad()
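
ReconstructionLoss is also taken from meta_st without being shown. The examples apply it to two stochastic predictions of the same unlabeled batch (sometimes after F.softmax), so a consistency loss such as mean squared error would fit those calls; the sketch below is a hypothetical stand-in under that assumption, not the actual meta_st class.

import chainer.functions as F

class ToyReconstructionLoss(object):
    """Hypothetical stand-in: squared-error consistency between two predictions."""
    def __call__(self, y0, y1):
        return F.mean_squared_error(y0, y1)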