Example #1
0
def build_distill_prog_with_infermodel(executor, place, train_config):
    """Build a distillation training program from saved inference models.

    Loads the student and teacher inference models, merges the teacher
    graph into the student graph, appends the distillation loss and the
    optimizer, and returns everything wrapped in a ``DistillProgramInfo``.

    Args:
        executor: paddle static executor used to load the inference models.
        place: device on which merged teacher variables are created.
        train_config (dict): must provide "model_path_prefix" and
            "teacher_model_path_prefix"; also consumed by
            ``_create_optimizer`` and ``_parse_distill_loss``.

    Returns:
        DistillProgramInfo: startup/train/test programs plus feed/fetch info.
    """
    [train_program, feed_target_names, fetch_targets] = \
        paddle.static.load_inference_model(
            path_prefix=train_config["model_path_prefix"],
            executor=executor)
    _remove_fetch_node(train_program)
    [teacher_program, teacher_feed_target_names, teacher_fetch_targets] = \
        paddle.static.load_inference_model(
            path_prefix=train_config["teacher_model_path_prefix"],
            executor=executor)
    _remove_fetch_node(teacher_program)
    # Clone before the teacher merge so evaluation runs the pure student graph.
    test_program = train_program.clone(for_test=True)

    train_program = _recover_param_attr(train_program)
    train_program = _recover_reserve_space_with_bn(train_program)
    for var in train_program.list_vars():
        var.stop_gradient = False
    # Flip every op back to training mode (inference models are saved with
    # is_test=True).
    train_graph = GraphWrapper(train_program)
    for op in train_graph.ops():
        op._op._set_attr("is_test", False)

    ############################################################################
    # distill
    ############################################################################
    assert len(feed_target_names) == len(teacher_feed_target_names), \
        "the number of feed nodes in the teacher model is not equal to the student model"
    # Map each teacher feed name onto the student feed it shares data with.
    data_name_map = {
        teacher_name: student_name
        for teacher_name, student_name in zip(teacher_feed_target_names,
                                              feed_target_names)
    }
    merge(teacher_program, train_program, data_name_map, place)

    # all feed node should set stop_gradient is False, for using pact quant algo.
    for var in train_program.list_vars():
        if var.name in data_name_map.values(
        ) or var.name in data_name_map.keys():
            var.stop_gradient = False

    train_fetch_list = []
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(train_program, startup_program):
        with fluid.unique_name.guard('merge'):
            optimizer = _create_optimizer(train_config)

            distill_loss = _parse_distill_loss(train_config)
            loss = paddle.mean(distill_loss)
            loss.stop_gradient = False
            p_g_list = paddle.static.append_backward(loss=loss)
            optimizer.apply_gradients(p_g_list)

            train_fetch_list.append(loss)

    # NOTE(review): feed_target_names fills both the train and the test feed
    # slots — confirm DistillProgramInfo really expects the same names twice.
    return DistillProgramInfo(startup_program, train_program, \
        feed_target_names, train_fetch_list, optimizer, \
        test_program, feed_target_names, fetch_targets)
Example #2
0
    def test_loss(self):
        """loss() with a user adaptation function should add exactly the
        hint-loss ops to the merged student program."""
        student_main = fluid.Program()
        student_startup = fluid.Program()
        with fluid.program_guard(student_main, student_startup):
            image = fluid.data(name="image", shape=[None, 3, 224, 224])
            s1 = conv_bn_layer(image, 8, 3, "conv1")
            s2 = conv_bn_layer(s1, 8, 3, "conv2")
            student_predict = s1 + s2

        teacher_main = fluid.Program()
        teacher_startup = fluid.Program()
        with fluid.program_guard(teacher_main, teacher_startup):
            image = fluid.data(name="image", shape=[None, 3, 224, 224])
            t1 = conv_bn_layer(image, 8, 3, "conv1")
            t2 = conv_bn_layer(t1, 8, 3, "conv2")
            skip1 = t1 + t2
            t3 = conv_bn_layer(skip1, 8, 3, "conv3")
            t4 = conv_bn_layer(t3, 8, 3, "conv4")
            skip2 = t4 + skip1
            t5 = conv_bn_layer(skip2, 8, 3, "conv5")
            teacher_predict = conv_bn_layer(t5, 8, 3, "conv6")

        merge(teacher_main, student_main, {'image': 'image'},
              fluid.CPUPlace())
        merged_ops = [
            op.type for block in student_main.blocks for op in block.ops
        ]

        def adaptation_loss(t_var, s_var):
            # 1x1-conv the student feature up to the teacher's channel count,
            # then take the mean squared difference as the hint loss.
            hint = fluid.layers.conv2d(s_var, t_var.shape[1], 1)
            return fluid.layers.reduce_mean(
                fluid.layers.square(hint - t_var))

        with fluid.program_guard(student_main):
            distill_loss = loss(
                adaptation_loss,
                student_main,
                t_var='teacher_conv6_bn_output.tmp_2',
                s_var='conv2_bn_output.tmp_2')
        loss_ops = [
            op.type for block in student_main.blocks for op in block.ops
        ]
        # Every merged op survives, and only the loss ops are new.
        self.assertTrue(set(merged_ops).issubset(loss_ops))
        self.assertTrue(
            set(loss_ops) - set(merged_ops) ==
            {'reduce_mean', 'elementwise_sub', 'square'})
Example #3
0
    def test_loss(self):
        """loss() on the default main program should add exactly the
        adaptation-loss ops after merge()."""
        image = paddle.static.data(name="image", shape=[None, 3, 224, 224])
        s1 = conv_bn_layer(image, 8, 3, "conv1")
        s2 = conv_bn_layer(s1, 8, 3, "conv2")
        student_predict = s1 + s2

        teacher_main = paddle.static.Program()
        teacher_startup = paddle.static.Program()
        with paddle.static.program_guard(teacher_main, teacher_startup):
            t_in = paddle.static.data(name="image", shape=[None, 3, 224, 224])
            t1 = conv_bn_layer(t_in, 8, 3, "conv1")
            t2 = conv_bn_layer(t1, 8, 3, "conv2")
            skip1 = t1 + t2
            t3 = conv_bn_layer(skip1, 8, 3, "conv3")
            t4 = conv_bn_layer(t3, 8, 3, "conv4")
            skip2 = t4 + skip1
            t5 = conv_bn_layer(skip2, 8, 3, "conv5")
            teacher_predict = conv_bn_layer(t5, 8, 3, "conv6")

        merge(teacher_main, paddle.static.default_main_program(),
              {'image': 'image'}, paddle.CPUPlace())
        merged_ops = [
            op.type for block in paddle.static.default_main_program().blocks
            for op in block.ops
        ]

        def adaptation_loss(t_var, s_var):
            # Mean squared error between teacher and student activations.
            return paddle.mean(
                paddle.nn.functional.square_error_cost(s_var, t_var))

        distill_loss = loss(adaptation_loss,
                            t_var='teacher_conv6_bn_output.tmp_2',
                            s_var='conv2_bn_output.tmp_2')
        loss_ops = [
            op.type for block in paddle.static.default_main_program().blocks
            for op in block.ops
        ]
        # Every merged op survives, and only the loss ops are new.
        self.assertTrue(set(merged_ops).issubset(loss_ops))
        self.assertTrue(
            set(loss_ops) - set(merged_ops) ==
            {'reduce_mean', 'elementwise_sub', 'square'})
    def test_merge(self):
        """fsp_loss on a merged student program should add exactly the
        fsp-loss ops."""
        student_main = fluid.Program()
        student_startup = fluid.Program()
        with fluid.program_guard(student_main, student_startup):
            image = fluid.data(name="image", shape=[None, 3, 224, 224])
            s1 = conv_bn_layer(image, 8, 3, "conv1")
            s2 = conv_bn_layer(s1, 8, 3, "conv2")
            student_predict = s1 + s2

        teacher_main = fluid.Program()
        teacher_startup = fluid.Program()
        with fluid.program_guard(teacher_main, teacher_startup):
            image = fluid.data(name="image", shape=[None, 3, 224, 224])
            t1 = conv_bn_layer(image, 8, 3, "conv1")
            t2 = conv_bn_layer(t1, 8, 3, "conv2")
            skip1 = t1 + t2
            t3 = conv_bn_layer(skip1, 8, 3, "conv3")
            t4 = conv_bn_layer(t3, 8, 3, "conv4")
            skip2 = t4 + skip1
            t5 = conv_bn_layer(skip2, 8, 3, "conv5")
            teacher_predict = conv_bn_layer(t5, 8, 3, "conv6")

        merge(teacher_main, student_main, {'image': 'image'},
              fluid.CPUPlace())
        merged_ops = [
            op.type for block in student_main.blocks for op in block.ops
        ]
        with fluid.program_guard(student_main):
            distill_loss = fsp_loss('teacher_conv5_bn_output.tmp_2',
                                    'teacher_conv6_bn_output.tmp_2',
                                    'conv1_bn_output.tmp_2',
                                    'conv2_bn_output.tmp_2', student_main)
        loss_ops = [
            op.type for block in student_main.blocks for op in block.ops
        ]
        # Every merged op survives, and only the fsp-loss ops are new.
        self.assertTrue(set(merged_ops).issubset(loss_ops))
        self.assertTrue(
            set(loss_ops) - set(merged_ops) ==
            {'elementwise_sub', 'reduce_mean', 'square', 'fsp'})
Example #5
0
    def test_merge(self):
        """After merge() the student program should hold exactly the union
        of the original student and teacher ops."""
        student_main = fluid.Program()
        student_startup = fluid.Program()
        with fluid.program_guard(student_main, student_startup):
            image = fluid.data(name="image", shape=[None, 3, 224, 224])
            s1 = conv_bn_layer(image, 8, 3, "conv1")
            s2 = conv_bn_layer(s1, 8, 3, "conv2")
            student_predict = s1 + s2
        student_ops = [
            op for block in student_main.blocks for op in block.ops
        ]

        teacher_main = fluid.Program()
        teacher_startup = fluid.Program()
        with fluid.program_guard(teacher_main, teacher_startup):
            image = fluid.data(name="image", shape=[None, 3, 224, 224])
            t1 = conv_bn_layer(image, 8, 3, "conv1")
            t2 = conv_bn_layer(t1, 8, 3, "conv2")
            skip1 = t1 + t2
            t3 = conv_bn_layer(skip1, 8, 3, "conv3")
            t4 = conv_bn_layer(t3, 8, 3, "conv4")
            skip2 = t4 + skip1
            t5 = conv_bn_layer(skip2, 8, 3, "conv5")
            teacher_predict = conv_bn_layer(t5, 8, 3, "conv6")
        teacher_ops = [
            op for block in teacher_main.blocks for op in block.ops
        ]

        merge(teacher_main, student_main, {'image': 'image'},
              fluid.CPUPlace())
        merged_ops = [
            op for block in student_main.blocks for op in block.ops
        ]
        # No op is dropped or duplicated by the merge.
        self.assertTrue(len(student_ops) + len(teacher_ops) == len(merged_ops))
    def test_soft_label_loss(self):
        """soft_label_loss on the merged default program should add only
        softmax/cross_entropy/reduce_mean/scale ops."""
        image = paddle.static.data(name="image", shape=[None, 3, 224, 224])
        s1 = conv_bn_layer(image, 8, 3, "conv1")
        s2 = conv_bn_layer(s1, 8, 3, "conv2")
        student_predict = s1 + s2

        teacher_main = paddle.static.Program()
        teacher_startup = paddle.static.Program()
        with paddle.static.program_guard(teacher_main, teacher_startup):
            t_in = paddle.static.data(name="image", shape=[None, 3, 224, 224])
            t1 = conv_bn_layer(t_in, 8, 3, "conv1")
            t2 = conv_bn_layer(t1, 8, 3, "conv2")
            skip1 = t1 + t2
            t3 = conv_bn_layer(skip1, 8, 3, "conv3")
            t4 = conv_bn_layer(t3, 8, 3, "conv4")
            skip2 = t4 + skip1
            t5 = conv_bn_layer(skip2, 8, 3, "conv5")
            teacher_predict = conv_bn_layer(t5, 8, 3, "conv6")

        merge(teacher_main, paddle.static.default_main_program(),
              {'image': 'image'}, paddle.CPUPlace())
        merged_ops = [
            op.type for block in paddle.static.default_main_program().blocks
            for op in block.ops
        ]
        distill_loss = soft_label_loss('teacher_conv6_bn_output.tmp_2',
                                       'conv2_bn_output.tmp_2')
        loss_ops = [
            op.type for block in paddle.static.default_main_program().blocks
            for op in block.ops
        ]
        # Every merged op survives, and only the soft-label-loss ops are new.
        self.assertTrue(set(merged_ops).issubset(loss_ops))
        self.assertTrue(
            set(loss_ops) - set(merged_ops) ==
            {'cross_entropy', 'softmax', 'reduce_mean', 'scale'})
Example #7
0
def compress(args):
    """Train a distilled student with a pretrained teacher (fluid API).

    Builds student and teacher static-graph programs, merges the teacher
    into the student program, trains on cross-entropy + soft-label
    distillation loss, and logs validation top-1/top-5 accuracy per epoch.

    Args:
        args: parsed CLI namespace; reads data, model, teacher_model,
            teacher_pretrained_model, use_gpu, batch_size, num_epochs,
            log_period.

    Raises:
        ValueError: if ``args.data`` is neither "cifar10" nor "imagenet".
    """
    # Pick dataset readers and the matching input geometry.
    if args.data == "cifar10":
        import paddle.dataset.cifar as reader
        train_reader = reader.train10()
        val_reader = reader.test10()
        class_dim = 10
        image_shape = "3,32,32"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_reader = reader.train()
        val_reader = reader.val()
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    # "C,H,W" string -> [C, H, W] ints.
    image_shape = [int(m) for m in image_shape.split(",")]

    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    student_program = fluid.Program()
    s_startup = fluid.Program()

    with fluid.program_guard(student_program, s_startup):
        with fluid.unique_name.guard():
            image = fluid.layers.data(name='image',
                                      shape=image_shape,
                                      dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            # Train and valid loaders share the same feed variables.
            train_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            valid_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            # model definition
            model = models.__dict__[args.model]()
            out = model.net(input=image, class_dim=class_dim)
            cost = fluid.layers.cross_entropy(input=out, label=label)
            avg_cost = fluid.layers.mean(x=cost)
            acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
            acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    train_reader = paddle.batch(train_reader,
                                batch_size=args.batch_size,
                                drop_last=True)
    val_reader = paddle.batch(val_reader,
                              batch_size=args.batch_size,
                              drop_last=True)
    # Clone for evaluation before the distill/optimizer ops are appended.
    val_program = student_program.clone(for_test=True)

    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_loader.set_sample_list_generator(train_reader, places)
    # NOTE(review): the train loader is bound to every device (`places`) but
    # the valid loader only to the first (`place`) — confirm intentional.
    valid_loader.set_sample_list_generator(val_reader, place)

    teacher_model = models.__dict__[args.teacher_model]()
    # define teacher program
    teacher_program = fluid.Program()
    t_startup = fluid.Program()
    with fluid.program_guard(teacher_program, t_startup):
        with fluid.unique_name.guard():
            image = fluid.layers.data(name='image',
                                      shape=image_shape,
                                      dtype='float32')
            predict = teacher_model.net(image, class_dim=class_dim)

    exe.run(t_startup)
    # Download the pretrained teacher weights if not present locally.
    if not os.path.exists(args.teacher_pretrained_model):
        _download(
            'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
            '.')
        _decompress('./ResNet50_vd_pretrained.tar')
    assert args.teacher_pretrained_model and os.path.exists(
        args.teacher_pretrained_model
    ), "teacher_pretrained_model should be set when teacher_model is not None."

    def if_exist(var):
        # Load only variables with a matching file in the checkpoint dir.
        return os.path.exists(
            os.path.join(args.teacher_pretrained_model, var.name))

    fluid.io.load_vars(exe,
                       args.teacher_pretrained_model,
                       main_program=teacher_program,
                       predicate=if_exist)

    # Graft the teacher graph into the student program; the merged teacher
    # output is referenced below via the "teacher_" name prefix.
    data_name_map = {'image': 'image'}
    merge(teacher_program, student_program, data_name_map, place)

    with fluid.program_guard(student_program, s_startup):
        # Total loss = task cross-entropy + soft-label distillation loss.
        distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
                                       student_program)
        loss = avg_cost + distill_loss
        lr, opt = create_optimizer(args)
        opt.minimize(loss)
    exe.run(s_startup)
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    parallel_main = fluid.CompiledProgram(student_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    for epoch_id in range(args.num_epochs):
        for step_id, data in enumerate(train_loader):
            lr_np, loss_1, loss_2, loss_3 = exe.run(parallel_main,
                                                    feed=data,
                                                    fetch_list=[
                                                        lr.name, loss.name,
                                                        avg_cost.name,
                                                        distill_loss.name
                                                    ])
            if step_id % args.log_period == 0:
                _logger.info(
                    "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}"
                    .format(epoch_id, step_id, lr_np[0], loss_1[0], loss_2[0],
                            loss_3[0]))
        val_acc1s = []
        val_acc5s = []
        for step_id, data in enumerate(valid_loader):
            # Second positional argument of Executor.run is the feed dict.
            val_loss, val_acc1, val_acc5 = exe.run(
                val_program,
                data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            val_acc1s.append(val_acc1)
            val_acc5s.append(val_acc5)
            if step_id % args.log_period == 0:
                _logger.info(
                    "valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}"
                    .format(epoch_id, step_id, val_loss[0], val_acc1[0],
                            val_acc5[0]))
        _logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format(
            epoch_id, np.mean(val_acc1s), np.mean(val_acc5s)))
Example #8
0
def compress(args):
    """Train a distilled student with a pretrained teacher (paddle 2.x static API).

    Builds student and teacher static-graph programs, merges the teacher
    into the student program, trains on cross-entropy + soft-label
    distillation loss, logs validation top-1/top-5 per epoch, and can
    save an inference model every epoch when ``args.save_inference`` is set.

    Args:
        args: parsed CLI namespace; reads data, model, teacher_model,
            teacher_pretrained_model, use_gpu, batch_size, num_epochs,
            log_period, save_inference.

    Raises:
        ValueError: if ``args.data`` is neither "cifar10" nor "imagenet".
    """
    # Pick dataset objects and the matching input geometry.
    if args.data == "cifar10":
        train_dataset = paddle.vision.datasets.Cifar10(mode='train')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test')
        class_dim = 10
        image_shape = "3,32,32"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    # "C,H,W" string -> [C, H, W] ints.
    image_shape = [int(m) for m in image_shape.split(",")]

    assert args.model in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)
    student_program = paddle.static.Program()
    s_startup = paddle.static.Program()
    places = paddle.static.cuda_places(
    ) if args.use_gpu else paddle.static.cpu_places()
    place = places[0]

    with paddle.static.program_guard(student_program, s_startup):
        with paddle.fluid.unique_name.guard():
            image = paddle.static.data(
                name='image', shape=[None] + image_shape, dtype='float32')
            label = paddle.static.data(
                name='label', shape=[None, 1], dtype='int64')
            # Train loader spans all devices; valid loader uses only the first.
            train_loader = paddle.io.DataLoader(
                train_dataset,
                places=places,
                feed_list=[image, label],
                drop_last=True,
                batch_size=args.batch_size,
                return_list=False,
                shuffle=True,
                use_shared_memory=True,
                num_workers=4)
            valid_loader = paddle.io.DataLoader(
                val_dataset,
                places=place,
                feed_list=[image, label],
                drop_last=False,
                return_list=False,
                use_shared_memory=True,
                batch_size=args.batch_size,
                shuffle=False)
            # model definition
            model = models.__dict__[args.model]()
            out = model.net(input=image, class_dim=class_dim)
            cost = paddle.nn.functional.loss.cross_entropy(
                input=out, label=label)
            avg_cost = paddle.mean(x=cost)
            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
            acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    # Clone for evaluation before the distill/optimizer ops are appended.
    val_program = student_program.clone(for_test=True)
    exe = paddle.static.Executor(place)

    teacher_model = models.__dict__[args.teacher_model]()
    # define teacher program
    teacher_program = paddle.static.Program()
    t_startup = paddle.static.Program()
    with paddle.static.program_guard(teacher_program, t_startup):
        with paddle.fluid.unique_name.guard():
            image = paddle.static.data(
                name='image', shape=[None] + image_shape, dtype='float32')
            predict = teacher_model.net(image, class_dim=class_dim)

    exe.run(t_startup)
    # Download the pretrained teacher weights if not present locally.
    if not os.path.exists(args.teacher_pretrained_model):
        _download(
            'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
            '.')
        _decompress('./ResNet50_vd_pretrained.tar')
    assert args.teacher_pretrained_model and os.path.exists(
        args.teacher_pretrained_model
    ), "teacher_pretrained_model should be set when teacher_model is not None."

    def if_exist(var):
        # Load a variable only if a matching checkpoint file exists; the fc
        # head is excluded for cifar10 — presumably because the pretrained
        # 1000-class head does not fit class_dim=10 (TODO confirm).
        exist = os.path.exists(
            os.path.join(args.teacher_pretrained_model, var.name))
        if args.data == "cifar10" and (var.name == 'fc_0.w_0' or
                                       var.name == 'fc_0.b_0'):
            exist = False
        return exist

    # NOTE(review): if_exist is never referenced below — paddle.static.load
    # loads the whole program; possibly leftover from a load_vars-based path.
    paddle.static.load(teacher_program, args.teacher_pretrained_model, exe)

    # Graft the teacher graph into the student program; the merged teacher
    # output is referenced below via the "teacher_" name prefix.
    data_name_map = {'image': 'image'}
    merge(teacher_program, student_program, data_name_map, place)

    with paddle.static.program_guard(student_program, s_startup):
        # Total loss = task cross-entropy + soft-label distillation loss.
        distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
                                       student_program)
        loss = avg_cost + distill_loss
        lr, opt = create_optimizer(args)
        opt.minimize(loss)
    exe.run(s_startup)
    build_strategy = paddle.static.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    parallel_main = paddle.static.CompiledProgram(
        student_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)

    for epoch_id in range(args.num_epochs):
        for step_id, data in enumerate(train_loader):
            loss_1, loss_2, loss_3 = exe.run(
                parallel_main,
                feed=data,
                fetch_list=[loss.name, avg_cost.name, distill_loss.name])
            if step_id % args.log_period == 0:
                _logger.info(
                    "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}".
                    format(epoch_id, step_id,
                           lr.get_lr(), loss_1[0], loss_2[0], loss_3[0]))
            # Learning-rate schedule is stepped once per batch.
            lr.step()
        val_acc1s = []
        val_acc5s = []
        for step_id, data in enumerate(valid_loader):
            # Second positional argument of Executor.run is the feed dict.
            val_loss, val_acc1, val_acc5 = exe.run(
                val_program,
                data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            val_acc1s.append(val_acc1)
            val_acc5s.append(val_acc5)
            if step_id % args.log_period == 0:
                _logger.info(
                    "valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}".
                    format(epoch_id, step_id, val_loss[0], val_acc1[0],
                           val_acc5[0]))
        if args.save_inference:
            paddle.fluid.io.save_inference_model(
                os.path.join("./saved_models", str(epoch_id)), ["image"],
                [out], exe, student_program)
        _logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format(
            epoch_id, np.mean(val_acc1s), np.mean(val_acc5s)))