Example #1
    def test_ofa(self):
        ofa_model = OFA(self.model,
                        self.run_config,
                        distill_config=self.distill_config)

        start_epoch = 0
        for idx in range(len(self.run_config.n_epochs)):
            cur_idx = self.run_config.n_epochs[idx]
            for ph_idx in range(len(cur_idx)):
                cur_lr = self.run_config.init_learning_rate[idx][ph_idx]
                adam = fluid.optimizer.Adam(
                    learning_rate=cur_lr,
                    parameter_list=(
                        ofa_model.parameters() + ofa_model.netAs_param))
                for epoch_id in range(start_epoch,
                                      self.run_config.n_epochs[idx][ph_idx]):
                    # train `dynamic_batch_size[idx]` sampled sub-networks on the same batch
                    for model_no in range(self.run_config.dynamic_batch_size[
                            idx]):
                        output, _ = ofa_model(self.data)
                        loss = fluid.layers.reduce_mean(output)
                        if self.distill_config.mapping_layers is not None:
                            dis_loss = ofa_model.calc_distill_loss()
                            loss += dis_loss
                            dis_loss = dis_loss.numpy()[0]
                        else:
                            dis_loss = 0
                        print('epoch: {}, loss: {}, distill loss: {}'.format(
                            epoch_id, loss.numpy()[0], dis_loss))
                        loss.backward()
                        adam.minimize(loss)
                        adam.clear_gradients()
                start_epoch = self.run_config.n_epochs[idx][ph_idx]
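
Example #1 is a unit-test method: `self.model`, `self.data`, `self.run_config`, and `self.distill_config` are fixtures created elsewhere in the test class. Below is a minimal sketch of that setup, assuming the same RunConfig/DistillConfig keys that appear in Example #2; the concrete values, the dummy input, and the externally defined `Model` class are illustrative assumptions, not code from the source.

import unittest

import numpy as np
import paddle.fluid as fluid
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig


class TestOFA(unittest.TestCase):
    def setUp(self):
        fluid.enable_dygraph()
        # `Model` is assumed to be defined elsewhere: a fluid.dygraph.Layer built
        # from PaddleSlim super-layers so that 'models.0.fn' is a valid mapping layer.
        self.model = Model()
        teacher_model = Model()
        # one dummy MNIST-shaped batch used as `self.data` in test_ofa
        self.data = fluid.dygraph.to_variable(
            np.random.random((1, 1, 28, 28)).astype('float32'))
        # config values mirror Example #2; they are illustrative, not canonical
        self.run_config = RunConfig(
            train_batch_size=256,
            n_epochs=[[1], [2, 3], [4, 5]],
            init_learning_rate=[[0.001], [0.003, 0.001], [0.003, 0.001]],
            dynamic_batch_size=[1, 1, 1],
            total_images=50000,
            elastic_depth=(2, 5, 8))
        self.distill_config = DistillConfig(
            lambda_distill=0.01,
            teacher_model=teacher_model,
            mapping_layers=['models.0.fn'])
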
Example #2
def test_ofa():

    model = Model()
    teacher_model = Model()

    default_run_config = {
        'train_batch_size': 256,
        'n_epochs': [[1], [2, 3], [4, 5]],
        'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
        'dynamic_batch_size': [1, 1, 1],
        'total_images': 50000,  #1281167,
        'elastic_depth': (2, 5, 8)
    }
    run_config = RunConfig(**default_run_config)

    default_distill_config = {
        'lambda_distill': 0.01,
        'teacher_model': teacher_model,
        'mapping_layers': ['models.0.fn']
    }
    distill_config = DistillConfig(**default_distill_config)

    ofa_model = OFA(model, run_config, distill_config=distill_config)

    train_dataset = paddle.vision.datasets.MNIST(mode='train',
                                                 backend='cv2',
                                                 transform=transform)
    # feed_list is only used by static-graph loaders, so the undefined
    # `image`/`label` feed variables are dropped; `place` is assumed to be
    # defined elsewhere (see the sketch after this example).
    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=place,
                                        return_list=True,
                                        drop_last=True,
                                        batch_size=64)

    start_epoch = 0
    for idx in range(len(run_config.n_epochs)):
        cur_idx = run_config.n_epochs[idx]
        for ph_idx in range(len(cur_idx)):
            cur_lr = run_config.init_learning_rate[idx][ph_idx]
            # paddle.optimizer.Adam takes `parameters` (not the fluid-era `parameter_list`)
            adam = paddle.optimizer.Adam(
                learning_rate=cur_lr,
                parameters=(ofa_model.parameters() +
                            ofa_model.netAs_param))
            for epoch_id in range(start_epoch,
                                  run_config.n_epochs[idx][ph_idx]):
                for batch_id, data in enumerate(train_loader()):
                    # DataLoader already collates each batch into [images, labels]
                    # tensors, so unpack and reshape them directly
                    dy_x_data = data[0].numpy().astype('float32').reshape(
                        -1, 1, 28, 28)
                    y_data = data[1].numpy().astype('int64').reshape(-1, 1)

                    img = paddle.dygraph.to_variable(dy_x_data)
                    label = paddle.dygraph.to_variable(y_data)
                    label.stop_gradient = True

                    for model_no in range(run_config.dynamic_batch_size[idx]):
                        output, _ = ofa_model(img, label)
                        loss = paddle.mean(output)
                        dis_loss = ofa_model.calc_distill_loss()
                        loss += dis_loss
                        loss.backward()

                        if batch_id % 10 == 0:
                            print(
                                'epoch: {}, batch: {}, loss: {}, distill loss: {}'
                                .format(epoch_id, batch_id,
                                        loss.numpy()[0],
                                        dis_loss.numpy()[0]))
                    ### accumulate gradients from `dynamic_batch_size` sub-networks on the same batch of data
                    ### NOTE: gradient accumulation still needs to be fixed in PaddlePaddle
                    adam.minimize(loss)
                    adam.clear_gradients()
            start_epoch = run_config.n_epochs[idx][ph_idx]
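
Example #2 leaves several names undefined: the `paddle`/`paddleslim` imports, `Model`, `transform`, and `place`. The sketch below fills those gaps under Paddle 2.x; the normalization values and the device choice are assumptions for illustration, and `Model` must still be supplied as a paddle.nn.Layer assembled from PaddleSlim super-layers (e.g. paddleslim.nas.ofa.layers.SuperConv2D) so that the mapping layer 'models.0.fn' resolves to one of its sublayers.

import numpy as np
import paddle
from paddle.vision.transforms import Normalize
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig

# scale MNIST pixels to roughly [-1, 1]; any preprocessing the model accepts works
transform = Normalize(mean=[127.5], std=[127.5], data_format='CHW')

# run on GPU when available, otherwise fall back to CPU
place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
paddle.disable_static(place)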