Beispiel #1
0
    def test_transformer_sort_gradient_float32(self):
        seed = 90

        with guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True
            transformer = TransFormer('transformer',
                                      ModelHyperParams.src_vocab_size,
                                      ModelHyperParams.trg_vocab_size,
                                      ModelHyperParams.max_length + 1,
                                      ModelHyperParams.n_layer,
                                      ModelHyperParams.n_head,
                                      ModelHyperParams.d_key,
                                      ModelHyperParams.d_value,
                                      ModelHyperParams.d_model,
                                      ModelHyperParams.d_inner_hid,
                                      ModelHyperParams.prepostprocess_dropout,
                                      ModelHyperParams.attention_dropout,
                                      ModelHyperParams.relu_dropout,
                                      ModelHyperParams.preprocess_cmd,
                                      ModelHyperParams.postprocess_cmd,
                                      ModelHyperParams.weight_sharing,
                                      TrainTaskConfig.label_smooth_eps,
                                      use_py_reader=use_py_reader,
                                      is_test=False)
            if sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
                optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                                 beta1=TrainTaskConfig.beta1,
                                                 beta2=TrainTaskConfig.beta2,
                                                 epsilon=TrainTaskConfig.eps)
            else:
                optimizer = fluid.optimizer.SGD(learning_rate=0.003)
            dy_param_init = dict()
            dy_param_updated = dict()

            helper = DyGraphProgramDescTracerTestHelper(self)
            program = None

            for i in range(batch_num):
                enc_inputs, dec_inputs, label, weights = create_data()
                if i % 2 == 0:
                    outs, traced_layer = TracedLayer.trace(
                        transformer, [enc_inputs, dec_inputs, label, weights])

                    ins_static = enc_inputs + dec_inputs + [label, weights]
                    outs_static = traced_layer(ins_static)
                    helper.assertEachVar(outs, outs_static)
                    if program is not None:
                        self.assertTrue(
                            is_equal_program(program, traced_layer.program))

                    program = traced_layer.program
                    traced_layer.save_inference_model(
                        './infer_imperative_transformer',
                        feed=range(len(ins_static)),
                        fetch=range(len(outs_static)))
                else:
                    outs = transformer(enc_inputs, dec_inputs, label, weights)

                dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = outs

                if i == 0:
                    for param in transformer.parameters():
                        dy_param_init[param.name] = param.numpy()

                dy_avg_cost.backward(backward_strategy)
                optimizer.minimize(dy_avg_cost)
                transformer.clear_gradients()

                if i == batch_num - 1:
                    for param in transformer.parameters():
                        dy_param_updated[param.name] = param.numpy()

            dy_avg_cost_value = dy_avg_cost.numpy()
            dy_sum_cost_value = dy_sum_cost.numpy()
            dy_predict_value = dy_predict.numpy()
            dy_token_num_value = dy_token_num.numpy()

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            transformer = TransFormer('transformer',
                                      ModelHyperParams.src_vocab_size,
                                      ModelHyperParams.trg_vocab_size,
                                      ModelHyperParams.max_length + 1,
                                      ModelHyperParams.n_layer,
                                      ModelHyperParams.n_head,
                                      ModelHyperParams.d_key,
                                      ModelHyperParams.d_value,
                                      ModelHyperParams.d_model,
                                      ModelHyperParams.d_inner_hid,
                                      ModelHyperParams.prepostprocess_dropout,
                                      ModelHyperParams.attention_dropout,
                                      ModelHyperParams.relu_dropout,
                                      ModelHyperParams.preprocess_cmd,
                                      ModelHyperParams.postprocess_cmd,
                                      ModelHyperParams.weight_sharing,
                                      TrainTaskConfig.label_smooth_eps,
                                      use_py_reader=use_py_reader,
                                      is_test=False)
            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
            optimizer = fluid.optimizer.SGD(learning_rate=0.003)

            data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
                                                                                     -1] + label_data_input_fields
            all_inputs = make_all_inputs(data_input_names)
            enc_inputs_len = len(encoder_data_input_fields)
            dec_inputs_len = len(decoder_data_input_fields[:-1])
            enc_inputs = all_inputs[0:enc_inputs_len]
            dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
                                    dec_inputs_len]
            label = all_inputs[-2]
            weights = all_inputs[-1]
            static_param_updated = dict()
            static_param_init = dict()
            static_param_name_list = list()
            static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
                enc_inputs, dec_inputs, label, weights)
            optimizer.minimize(static_avg_cost)
            for param in transformer.parameters():
                static_param_name_list.append(param.name)
            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)
            for i in range(len(static_param_name_list)):
                static_param_init[static_param_name_list[i]] = out[i]
            static_sum_cost_value = None
            static_avg_cost_value = None
            static_predict_value = None
            static_token_num_value = None
            for i in range(batch_num):
                feed_dict = create_feed_dict_list(create_data(True))
                fetch_list = [
                    static_sum_cost, static_avg_cost, static_predict,
                    static_token_num
                ]

                fetch_list.extend(static_param_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed=feed_dict,
                              fetch_list=fetch_list)
                static_sum_cost_value = out[0]
                static_avg_cost_value = out[1]
                static_predict_value = out[2]
                static_token_num_value = out[3]
                if i == batch_num - 1:
                    for k in range(4, len(out)):
                        static_param_updated[static_param_name_list[
                            k - 4]] = out[k]

        self.assertTrue(
            np.array_equal(static_avg_cost_value, dy_avg_cost_value))
        self.assertTrue(
            np.array_equal(static_sum_cost_value, dy_sum_cost_value))
        self.assertTrue(np.array_equal(static_predict_value, dy_predict_value))
        self.assertTrue(
            np.array_equal(static_token_num_value, dy_token_num_value))

        for key, value in six.iteritems(static_param_init):
            self.assertTrue(np.array_equal(value, dy_param_init[key]))
        for key, value in six.iteritems(static_param_updated):
            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
    def test_resnet_float32(self):
        seed = 90

        batch_size = train_parameters["batch_size"]
        batch_num = 10

        traced_layer = None

        with fluid.dygraph.guard():
            paddle.manual_seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            resnet = ResNet()
            optimizer = optimizer_setting(train_parameters,
                                          parameter_list=resnet.parameters())
            np.random.seed(seed)

            batch_py_reader = fluid.io.PyReader(capacity=1)
            batch_py_reader.decorate_sample_list_generator(
                paddle.batch(self.reader_decorator(
                    paddle.dataset.flowers.train(use_xmap=False)),
                             batch_size=batch_size,
                             drop_last=True),
                places=fluid.CPUPlace())

            dy_param_init_value = {}
            for param in resnet.parameters():
                dy_param_init_value[param.name] = param.numpy()

            helper = DyGraphProgramDescTracerTestHelper(self)
            program = None

            for batch_id, data in enumerate(batch_py_reader()):
                if batch_id >= batch_num:
                    break

                img = data[0]
                label = data[1]
                label.stop_gradient = True

                out = None
                if batch_id % 5 == 0:
                    out, traced_layer = TracedLayer.trace(resnet, img)
                    if program is not None:
                        self.assertTrue(
                            is_equal_program(program, traced_layer.program))

                    traced_layer.save_inference_model(
                        './infer_imperative_resnet')

                    program = traced_layer.program
                else:
                    out = resnet(img)

                if traced_layer is not None:
                    resnet.eval()
                    traced_layer._switch(is_test=True)
                    out_dygraph = resnet(img)
                    out_static = traced_layer([img])
                    traced_layer._switch(is_test=False)
                    helper.assertEachVar(out_dygraph, out_static)
                    resnet.train()

                loss = fluid.layers.cross_entropy(input=out, label=label)
                avg_loss = fluid.layers.mean(x=loss)

                dy_out = avg_loss.numpy()

                if batch_id == 0:
                    for param in resnet.parameters():
                        if param.name not in dy_param_init_value:
                            dy_param_init_value[param.name] = param.numpy()

                avg_loss.backward()

                dy_grad_value = {}
                for param in resnet.parameters():
                    if param.trainable:
                        np_array = np.array(
                            param._grad_ivar().value().get_tensor())
                        dy_grad_value[param.name +
                                      core.grad_var_suffix()] = np_array

                optimizer.minimize(avg_loss)
                resnet.clear_gradients()

                dy_param_value = {}
                for param in resnet.parameters():
                    dy_param_value[param.name] = param.numpy()

        with new_program_scope():
            paddle.manual_seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

            resnet = ResNet()
            optimizer = optimizer_setting(train_parameters)

            np.random.seed(seed)
            train_reader = paddle.batch(
                paddle.dataset.flowers.train(use_xmap=False),
                batch_size=batch_size)

            img = fluid.layers.data(name='pixel',
                                    shape=[3, 224, 224],
                                    dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            out = resnet(img)
            loss = fluid.layers.cross_entropy(input=out, label=label)
            avg_loss = fluid.layers.mean(x=loss)
            optimizer.minimize(avg_loss)

            # initialize params and fetch them
            static_param_init_value = {}
            static_param_name_list = []
            static_grad_name_list = []
            for param in resnet.parameters():
                static_param_name_list.append(param.name)
            for param in resnet.parameters():
                if param.trainable:
                    static_grad_name_list.append(param.name +
                                                 core.grad_var_suffix())

            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            for batch_id, data in enumerate(train_reader()):
                if batch_id >= batch_num:
                    break

                static_x_data = np.array([
                    x[0].reshape(3, 224, 224) for x in data
                ]).astype('float32')
                y_data = np.array([x[1] for x in data
                                   ]).astype('int64').reshape([batch_size, 1])

                if traced_layer is not None:
                    traced_layer([static_x_data])

                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)
                fetch_list.extend(static_grad_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "pixel": static_x_data,
                                  "label": y_data
                              },
                              fetch_list=fetch_list)

                static_param_value = {}
                static_grad_value = {}
                static_out = out[0]
                param_start_pos = 1
                grad_start_pos = len(static_param_name_list) + param_start_pos
                for i in range(param_start_pos,
                               len(static_param_name_list) + param_start_pos):
                    static_param_value[static_param_name_list[
                        i - param_start_pos]] = out[i]
                for i in range(grad_start_pos,
                               len(static_grad_name_list) + grad_start_pos):
                    static_grad_value[static_grad_name_list[
                        i - grad_start_pos]] = out[i]

        print("static", static_out)
        print("dygraph", dy_out)
        self.assertTrue(np.allclose(static_out, dy_out))

        self.assertEqual(len(dy_param_init_value),
                         len(static_param_init_value))

        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
            self.assertTrue(np.isfinite(value.all()))
            self.assertFalse(np.isnan(value.any()))

        self.assertEqual(len(dy_grad_value), len(static_grad_value))
        for key, value in six.iteritems(static_grad_value):
            self.assertTrue(np.allclose(value, dy_grad_value[key]))
            self.assertTrue(np.isfinite(value.all()))
            self.assertFalse(np.isnan(value.any()))

        self.assertEqual(len(dy_param_value), len(static_param_value))
        for key, value in six.iteritems(static_param_value):
            self.assertTrue(np.allclose(value, dy_param_value[key]))
            self.assertTrue(np.isfinite(value.all()))
            self.assertFalse(np.isnan(value.any()))
Beispiel #3
0
    def ptb_rnn_cpu_float32(self, is_sparse):
        seed = 90
        hidden_size = 10
        vocab_size = 1000
        num_layers = 1
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 200
        traced_layer = None

        with fluid.dygraph.guard():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=ptb_model.parameters())
            dy_param_updated = dict()
            dy_param_init = dict()
            dy_loss = None
            last_hidden = None
            last_cell = None

            helper = DyGraphProgramDescTracerTestHelper(self)
            program = None

            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                if i % 5 == 0 and _in_legacy_dygraph():
                    outs, traced_layer = TracedLayer.trace(
                        ptb_model, [x, y, init_hidden, init_cell])
                    outs_static = traced_layer([x, y, init_hidden, init_cell])
                    helper.assertEachVar(outs, outs_static)

                    if program is not None:
                        self.assertTrue(
                            is_equal_program(traced_layer.program, program))

                    program = traced_layer.program

                    traced_layer.save_inference_model(
                        './infe_imperative_ptb_rnn', feed=list(range(4)))
                else:
                    outs = ptb_model(x, y, init_hidden, init_cell)

                dy_loss, last_hidden, last_cell = outs

                if i == 0:
                    for param in ptb_model.parameters():
                        dy_param_init[param.name] = param.numpy()
                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()
                if i == batch_num - 1:
                    for param in ptb_model.parameters():
                        dy_param_updated[param.name] = param.numpy()

            dy_loss_value = dy_loss.numpy()
            dy_last_cell_value = last_cell.numpy()
            dy_last_hidden_value = last_hidden.numpy()

        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
            sgd = SGDOptimizer(learning_rate=1e-3)
            x = fluid.layers.data(name="x",
                                  shape=[-1, num_steps],
                                  dtype='int64')
            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
            init_hidden = fluid.layers.data(name="init_hidden",
                                            shape=[1],
                                            dtype='float32')
            init_cell = fluid.layers.data(name="init_cell",
                                          shape=[1],
                                          dtype='float32')

            static_loss, static_last_hidden, static_last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            sgd.minimize(static_loss)
            static_param_updated = dict()
            static_param_init = dict()
            static_param_name_list = list()
            for param in ptb_model.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(framework.default_startup_program(),
                          fetch_list=static_param_name_list)
            for i in range(len(static_param_name_list)):
                static_param_init[static_param_name_list[i]] = out[i]
            static_loss_value = None
            static_last_cell_value = None
            static_last_hidden_value = None
            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                fetch_list = [
                    static_loss, static_last_hidden, static_last_cell
                ]
                fetch_list.extend(static_param_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "x": x_data,
                                  "y": y_data,
                                  "init_hidden": init_hidden_data,
                                  "init_cell": init_cell_data
                              },
                              fetch_list=fetch_list)
                static_loss_value = out[0]
                static_last_hidden_value = out[1]
                static_last_cell_value = out[2]

                if i == batch_num - 1:
                    for k in range(3, len(out)):
                        static_param_updated[static_param_name_list[
                            k - 3]] = out[k]

        self.assertTrue(np.array_equal(static_loss_value, dy_loss_value))
        self.assertTrue(
            np.array_equal(static_last_cell_value, dy_last_cell_value))
        self.assertTrue(
            np.array_equal(static_last_hidden_value, dy_last_hidden_value))
        for key, value in six.iteritems(static_param_init):
            self.assertTrue(np.array_equal(value, dy_param_init[key]))
        for key, value in six.iteritems(static_param_updated):
            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
Beispiel #4
0
        def run_dygraph():
            # NOTE(xiongkun03): In new executor, the inplace strategy is on by default, which will cause result of sumop have some differences. So we disable inplace.
            fluid.set_flags({'FLAGS_new_executor_use_inplace': False})
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            transformer = TransFormer(ModelHyperParams.src_vocab_size,
                                      ModelHyperParams.trg_vocab_size,
                                      ModelHyperParams.max_length + 1,
                                      ModelHyperParams.n_layer,
                                      ModelHyperParams.n_head,
                                      ModelHyperParams.d_key,
                                      ModelHyperParams.d_value,
                                      ModelHyperParams.d_model,
                                      ModelHyperParams.d_inner_hid,
                                      ModelHyperParams.prepostprocess_dropout,
                                      ModelHyperParams.attention_dropout,
                                      ModelHyperParams.relu_dropout,
                                      ModelHyperParams.preprocess_cmd,
                                      ModelHyperParams.postprocess_cmd,
                                      ModelHyperParams.weight_sharing,
                                      TrainTaskConfig.label_smooth_eps,
                                      use_py_reader=use_py_reader,
                                      is_test=False,
                                      is_sparse=is_sparse)
            if sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
                optimizer = fluid.optimizer.Adam(
                    learning_rate=learning_rate,
                    beta1=TrainTaskConfig.beta1,
                    beta2=TrainTaskConfig.beta2,
                    epsilon=TrainTaskConfig.eps,
                    parameter_list=transformer.parameters())
            else:
                optimizer = fluid.optimizer.SGD(
                    learning_rate=0.003,
                    parameter_list=transformer.parameters())
            dy_param_init = dict()
            dy_param_updated = dict()

            helper = DyGraphProgramDescTracerTestHelper(self)
            program = None

            for i in range(batch_num):
                enc_inputs, dec_inputs, label, weights = create_data()
                if False:
                    outs, traced_layer = TracedLayer.trace(
                        transformer, [enc_inputs, dec_inputs, label, weights])

                    ins_static = enc_inputs + dec_inputs + [label, weights]
                    outs_static = traced_layer(ins_static)
                    helper.assertEachVar(outs, outs_static)
                    if program is not None:
                        self.assertTrue(
                            is_equal_program(program, traced_layer.program))

                    program = traced_layer.program
                    traced_layer.save_inference_model(
                        './infer_imperative_transformer',
                        feed=list(range(len(ins_static))),
                        fetch=list(range(len(outs_static))))
                else:
                    outs = transformer(enc_inputs, dec_inputs, label, weights)

                dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = outs

                if i == 0:
                    for param in transformer.parameters():
                        dy_param_init[param.name] = param.numpy()

                dy_avg_cost.backward()
                optimizer.minimize(dy_avg_cost)
                transformer.clear_gradients()

                if i == batch_num - 1:
                    for param in transformer.parameters():
                        dy_param_updated[param.name] = param.numpy()

            dy_avg_cost_value = dy_avg_cost.numpy()
            dy_sum_cost_value = dy_sum_cost.numpy()
            dy_predict_value = dy_predict.numpy()
            dy_token_num_value = dy_token_num.numpy()

            return dy_avg_cost_value, dy_sum_cost_value, dy_predict_value, dy_token_num_value, \
                dy_param_init, dy_param_updated