def test_tensor_sub_scalar(self):
    # tensor(int64) - scalar(int)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='int64')
        b = 1
        c = paddle.zeros([2, 2, 2], dtype="int64")
        self.check_operation(a, b, c, '-')

    # tensor(float32) - scalar(int)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='float32')
        b = 1
        c = paddle.zeros([2, 2, 2], dtype="float32")
        self.check_operation(a, b, c, '-')

    # tensor(int64) - scalar(float, .0)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='int64')
        b = 1.0
        c = paddle.zeros([2, 2, 2], dtype="float32")
        self.check_operation(a, b, c, '-')

    # tensor(int64) - scalar(float, .5)
    with program_guard(Program()):
        a = paddle.full([2, 2, 2], 2, dtype='int64')
        b = 1.5
        c = paddle.full([2, 2, 2], 0.5, dtype="float32")
        self.check_operation(a, b, c, '-')

    # tensor(float32) - scalar(float)
    with program_guard(Program()):
        a = paddle.full([2, 2, 2], 2, dtype='float32')
        b = 1.5
        c = paddle.full([2, 2, 2], 0.5, dtype="float32")
        self.check_operation(a, b, c, '-')
def test_scalar_div_tensor(self):
    # scalar(int) / tensor(int64)
    with program_guard(Program()):
        a = 1
        b = paddle.full([2, 2, 2], 2, dtype='int64')
        c = paddle.full([2, 2, 2], 0.5, dtype="float32")
        self.check_operation(a, b, c, '/')

    # scalar(int) / tensor(float32)
    with program_guard(Program()):
        a = 1
        b = paddle.full([2, 2, 2], 0.5, dtype='float32')
        c = paddle.full([2, 2, 2], 2, dtype="float32")
        self.check_operation(a, b, c, '/')

    # scalar(float) / tensor(int64)
    with program_guard(Program()):
        a = 1.0
        b = paddle.full([2, 2, 2], 2, dtype='int64')
        c = paddle.full([2, 2, 2], 0.5, dtype="float32")
        self.check_operation(a, b, c, '/')

    # scalar(float) / tensor(float32)
    with program_guard(Program()):
        a = 1.0
        b = paddle.full([2, 2, 2], 0.5, dtype='float32')
        c = paddle.full([2, 2, 2], 2, dtype="float32")
        self.check_operation(a, b, c, '/')
def test_tensor_mod_scalar(self):
    # tensor(int64) % scalar(int)
    with program_guard(Program()):
        a = paddle.full([2, 2, 2], 3, dtype='int64')
        b = 2
        c = paddle.full([2, 2, 2], 1, dtype="int64")
        self.check_operation(a, b, c, '%')

    # tensor(int64) % scalar(float)
    with program_guard(Program()):
        a = paddle.full([2, 2, 2], 3, dtype='int64')
        b = 2.0
        c = paddle.full([2, 2, 2], 1, dtype="float32")
        self.check_operation(a, b, c, '%')

    # tensor(float32) % scalar(int)
    with program_guard(Program()):
        a = paddle.full([2, 2, 2], 3, dtype='float32')
        b = 2
        c = paddle.full([2, 2, 2], 1, dtype="float32")
        self.check_operation(a, b, c, '%')

    # tensor(float32) % scalar(float)
    with program_guard(Program()):
        a = paddle.full([2, 2, 2], 3, dtype='float32')
        b = 2.0
        c = paddle.full([2, 2, 2], 1, dtype="float32")
        self.check_operation(a, b, c, '%')
def test_scalar_pow_tensor(self):
    # scalar(int) ** tensor(int64)
    with program_guard(Program()):
        a = 3
        b = paddle.full([2, 2, 2], 2, dtype='int64')
        c = paddle.full([2, 2, 2], 9, dtype="int64")
        self.check_operation(a, b, c, '**')

    # scalar(float) ** tensor(int64)
    with program_guard(Program()):
        a = 3.0
        b = paddle.full([2, 2, 2], 2, dtype='int64')
        c = paddle.full([2, 2, 2], 9, dtype="float32")
        self.check_operation(a, b, c, '**')

    # scalar(int) ** tensor(float32)
    with program_guard(Program()):
        a = 3
        b = paddle.full([2, 2, 2], 2, dtype='float32')
        c = paddle.full([2, 2, 2], 9, dtype="float32")
        self.check_operation(a, b, c, '**')

    # scalar(float) ** tensor(float32)
    with program_guard(Program()):
        a = 3.0
        b = paddle.full([2, 2, 2], 2, dtype='float32')
        c = paddle.full([2, 2, 2], 9, dtype="float32")
        self.check_operation(a, b, c, '**')
def test_tensor_div_scalar(self):
    # tensor(int64) / scalar(int)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='int64')
        b = 2
        c = paddle.full([2, 2, 2], 0.5, dtype="float32")
        self.check_operation(a, b, c, '/')

    # tensor(float32) / scalar(int)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='float32')
        b = 2
        c = paddle.full([2, 2, 2], 0.5, dtype="float32")
        self.check_operation(a, b, c, '/')

    # tensor(int64) / scalar(float, .0)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='int64')
        b = 2.0
        c = paddle.full([2, 2, 2], 0.5, dtype="float32")
        self.check_operation(a, b, c, '/')

    # tensor(int64) / scalar(float, .5)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='int64')
        b = 0.5
        c = paddle.full([2, 2, 2], 2, dtype="float32")
        self.check_operation(a, b, c, '/')

    # tensor(float32) / scalar(float)
    with program_guard(Program()):
        a = paddle.ones([2, 2, 2], dtype='float32')
        b = 0.5
        c = paddle.full([2, 2, 2], 2, dtype="float32")
        self.check_operation(a, b, c, '/')
def test_scalar_sub_tensor(self):
    # scalar(int) - tensor(int64)
    with program_guard(Program()):
        a = 1
        b = paddle.ones([2, 2, 2], dtype='int64')
        c = paddle.zeros([2, 2, 2], dtype="int64")
        self.check_operation(a, b, c, '-')

    # scalar(int) - tensor(float32)
    with program_guard(Program()):
        a = 1
        b = paddle.ones([2, 2, 2], dtype='float32')
        c = paddle.zeros([2, 2, 2], dtype="float32")
        self.check_operation(a, b, c, '-')

    # scalar(float, .0) - tensor(int64)
    with program_guard(Program()):
        a = 1.0
        b = paddle.ones([2, 2, 2], dtype='int64')
        c = paddle.zeros([2, 2, 2], dtype="float32")
        self.check_operation(a, b, c, '-')

    # scalar(float, .5) - tensor(int64)
    with program_guard(Program()):
        a = 1.5
        b = paddle.full([2, 2, 2], 2, dtype='int64')
        c = paddle.full([2, 2, 2], -0.5, dtype="float32")
        self.check_operation(a, b, c, '-')

    # scalar(float) - tensor(float32)
    with program_guard(Program()):
        a = 1.5
        b = paddle.full([2, 2, 2], 2, dtype='float32')
        c = paddle.full([2, 2, 2], -0.5, dtype="float32")
        self.check_operation(a, b, c, '-')
def test_errors(self):
    # test static computation graph: dtype can not be int8
    paddle.enable_static()
    with program_guard(Program(), Program()):
        x = paddle.static.data(name='x', shape=[100], dtype=np.int8)
        y = paddle.static.data(name='y', shape=[100], dtype=np.int8)
        self.assertRaises(TypeError, paddle.inner, x, y)

    # test static computation graph: inputs must be broadcastable
    with program_guard(Program(), Program()):
        x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64)
        y = paddle.static.data(name='y', shape=[20], dtype=np.float64)
        self.assertRaises(ValueError, paddle.inner, x, y)

    np.random.seed(7)

    # test dynamic computation graph: dtype can not be int8
    paddle.disable_static()
    x_data = np.random.randn(200).astype(np.int8)
    y_data = np.random.randn(200).astype(np.int8)
    x = paddle.to_tensor(x_data)
    y = paddle.to_tensor(y_data)
    self.assertRaises(RuntimeError, paddle.inner, x, y)

    # test dynamic computation graph: inputs must be broadcastable
    x_data = np.random.rand(20, 5)
    y_data = np.random.rand(10, 2)
    x = paddle.to_tensor(x_data)
    y = paddle.to_tensor(y_data)
    self.assertRaises(ValueError, paddle.inner, x, y)

    # test dynamic computation graph: dtypes must match
    x_data = np.random.randn(200).astype(np.float32)
    y_data = np.random.randn(200).astype(np.float64)
    x = paddle.to_tensor(x_data)
    y = paddle.to_tensor(y_data)
    self.assertRaises(ValueError, paddle.inner, x, y)

    # test dynamic computation graph: the first input must be a Tensor
    x_data = np.random.randn(200).astype(np.float64)
    y_data = np.random.randn(200).astype(np.float64)
    y = paddle.to_tensor(y_data)
    self.assertRaises(ValueError, paddle.inner, x_data, y)

    # test dynamic computation graph: the second input must be a Tensor
    x_data = np.random.randn(200).astype(np.float64)
    y_data = np.random.randn(200).astype(np.float64)
    x = paddle.to_tensor(x_data)
    self.assertRaises(ValueError, paddle.inner, x, y_data)

    # test dynamic computation graph: both inputs must be Tensors
    x_data = np.random.randn(200).astype(np.float32)
    y_data = np.random.randn(200).astype(np.float32)
    self.assertRaises(ValueError, paddle.inner, x_data, y_data)
def run_static_api(self, place):
    paddle.enable_static()
    expected = calc_margin_rank_loss(
        self.x_data,
        self.y_data,
        self.label_data,
        margin=margin,
        reduction=reduction)
    with program_guard(Program(), Program()):
        x = paddle.static.data(name="x", shape=[10, 10], dtype="float64")
        y = paddle.static.data(name="y", shape=[10, 10], dtype="float64")
        label = paddle.static.data(
            name="label", shape=[10, 10], dtype="float64")
        margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
            margin=margin, reduction=reduction)
        result = margin_rank_loss(x, y, label)

        exe = paddle.static.Executor(place)
        result_numpy, = exe.run(
            feed={
                "x": self.x_data,
                "y": self.y_data,
                "label": self.label_data,
            },
            fetch_list=[result])
        self.assertTrue(np.allclose(result_numpy, expected))
        self.assertTrue('loss' in result.name)
def test_tensor_floordiv_scalar(self):
    # tensor(int64) // scalar(int)
    with program_guard(Program()):
        a = paddle.full([2, 2, 2], 3, dtype='int64')
        b = 2
        c = paddle.full([2, 2, 2], 1, dtype="int64")
        self.check_operation(a, b, c, '//')
def _test_api(self):
    paddle.enable_static()
    input = np.random.random([2, 25]).astype("float32")
    shape = [2, 5, 5]
    main_prog = Program()
    with program_guard(main_prog, Program()):
        positive_five = self.fill_constant([1], "int32", 5)
        x = self.data(name="x", shape=[2, 25], dtype="float32")
        actual_shape = self.data(name="shape", shape=[3], dtype="int32")

        # Situation 1: have shape (list, no tensor), no actual shape (Tensor)
        out_1 = self.reshape(x, shape)

        # Situation 2: have shape (list, no tensor), have actual shape (Tensor)
        out_2 = fluid.layers.reshape(
            x, shape=shape, actual_shape=actual_shape)

        # Situation 3: have shape (list, have tensor), no actual shape (Tensor)
        out_3 = self.reshape(x, shape=[positive_five, 10])

        # Situation 4: have shape (Tensor), no actual shape (Tensor)
        out_4 = self.reshape(x, shape=actual_shape)

    exe = paddle.static.Executor(place=paddle.CPUPlace())
    res_1, res_2, res_3, res_4 = exe.run(
        main_prog,
        feed={"x": input,
              "shape": np.array([2, 5, 5]).astype("int32")},
        fetch_list=[out_1, out_2, out_3, out_4])

    assert np.array_equal(res_1, input.reshape(shape))
    assert np.array_equal(res_2, input.reshape(shape))
    assert np.array_equal(res_3, input.reshape([5, 10]))
    assert np.array_equal(res_4, input.reshape(shape))
def test_api(self):
    shape = [1000, 784]
    train_program = Program()
    startup_program = Program()
    with program_guard(train_program, startup_program):
        x1 = paddle.randn(shape, 'float32')
        x2 = paddle.randn(shape, 'float64')

        dim_1 = paddle.fluid.layers.fill_constant([1], "int64", 20)
        dim_2 = paddle.fluid.layers.fill_constant([1], "int32", 50)
        x3 = paddle.randn([dim_1, dim_2, 784])

        var_shape = paddle.static.data('X', [2], 'int32')
        x4 = paddle.randn(var_shape)

        place = paddle.CUDAPlace(
            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        res = exe.run(train_program,
                      feed={'X': np.array(shape, dtype='int32')},
                      fetch_list=[x1, x2, x3, x4])

    for out in res:
        self.assertAlmostEqual(np.mean(out), .0, delta=0.1)
        self.assertAlmostEqual(np.std(out), 1., delta=0.1)
def setUp(self):
    self._places = [paddle.CPUPlace()]
    if paddle.device.is_compiled_with_cuda():
        self._places.append(paddle.CUDAPlace(0))
    self._ema_decay = 0.999
    self._param_name = "fc.weight"
    self._train_program = static.Program()
    self._startup_prog = static.Program()

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.without_graph_optimization = True
    paddle.distributed.fleet.init(is_collective=True, strategy=strategy)

    with static.program_guard(self._train_program, self._startup_prog):
        with utils.unique_name.guard():
            data = static.data(name='x', shape=[-1, 5], dtype='float32')
            hidden = static.nn.fc(x=data,
                                  size=10,
                                  weight_attr=self._param_name)
            cost = paddle.mean(hidden)

            self._test_program = static.default_main_program().clone(
                for_test=True)

            optimizer = paddle.optimizer.Adam(learning_rate=0.001)
            optimizer = paddle.distributed.fleet.distributed_optimizer(
                optimizer, strategy)
            optimizer.minimize(cost)

            self._ema = static.ExponentialMovingAverage(self._ema_decay)
            self._ema.update()
def rnn_pretrain_forward(train_program, start_program, topo=None):
    with static.program_guard(train_program,
                              start_program), paddle.utils.unique_name.guard():
        batch_size = 1
        tokens = static.data(
            name="tokens", shape=[batch_size, -1], dtype="int64")
        seq_len = static.data(name="ids", shape=[batch_size], dtype="int64")
        labels = static.data(name="labels", shape=[batch_size], dtype="int64")
        data_holders = [tokens, seq_len, labels]
        vocab_size = 10
        num_classes = 2
        pad_token_id = 0
        model = RNNModel(
            vocab_size,
            num_classes,
            direction='forward',
            padding_idx=pad_token_id,
            pooling_type='max')

        optimizer = paddle.optimizer.Adam(
            parameters=model.parameters(), learning_rate=0.001)
        criterion = paddle.nn.CrossEntropyLoss()
        preds = model(tokens, seq_len)
        loss = criterion(preds, labels)

    return train_program, start_program, loss, optimizer, data_holders
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 64
        input = static.data(
            name="input", shape=[batch_size, hidden_size], dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')

        if _global_parallel_strategy == "dp_mp_pp":
            auto.shard_tensor(
                input,
                dist_attr={
                    "process_mesh": _global_process_mesh[0],
                    "dims_mapping": [0, -1]
                })

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

    return loss, train_program, start_program
def build_program(main_program, startup_program, image_shape, archs, args,
                  is_train):
    with static.program_guard(main_program, startup_program):
        data_loader, data, label, drop_path_prob, drop_path_mask = create_data_loader(
            image_shape, is_train, args)
        logits, logits_aux = archs(data, drop_path_prob, drop_path_mask,
                                   is_train, 10)
        top1 = paddle.metric.accuracy(input=logits, label=label, k=1)
        top5 = paddle.metric.accuracy(input=logits, label=label, k=5)
        loss = paddle.mean(F.softmax_with_cross_entropy(logits, label))

        if is_train:
            if auxiliary:
                loss_aux = paddle.mean(
                    F.softmax_with_cross_entropy(logits_aux, label))
                loss = loss + auxiliary_weight * loss_aux
            step_per_epoch = int(trainset_num / args.batch_size)
            learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
                lr, T_max=step_per_epoch * args.retain_epoch)
            optimizer = paddle.optimizer.Momentum(
                learning_rate,
                momentum,
                weight_decay=paddle.regularizer.L2Decay(weight_decay),
                grad_clip=nn.ClipGradByGlobalNorm(clip_norm=5.0))
            optimizer.minimize(loss)
            outs = [loss, top1, top5]
        else:
            outs = [loss, top1, top5]
    return outs, (data, label), data_loader
def test_pipeline_amp_optimizer(self):
    """Test the pipeline pass combined with AMP."""
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.amp = True
    strategy.pipeline = True
    strategy.pipeline_configs = {
        'micro_batch_size': 1,
        'accumulate_steps': 2
    }

    train_prog, startup_prog = static.Program(), static.Program()
    with static.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = self.net()

            optimizer = paddle.fluid.optimizer.Adam(0.01)
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy)
            optimizer.minimize(avg_cost)

    ops = train_prog._pipeline_opt['section_program'].global_block().ops
    ops = [op.type for op in ops]
    self.assertEqual(ops.count('send_v2'), 1)
    self.assertEqual(ops.count('recv_v2'), 1)
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        input = static.data(
            name="input",
            shape=[batch_size, sequence_len, hidden_size],
            dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')

        auto.shard_tensor(
            input,
            dist_attr={
                "process_mesh": _global_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

        loader = paddle.io.DataLoader.from_generator(
            feed_list=[input, label], capacity=4 * batch_size, iterable=True)

    return loss, train_program, start_program, loader
def linear_static(func, device, dtype, np_x, np_weight, np_bias):
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(
                name="x", shape=[None, np_x.shape[1]], dtype=dtype)
            weight = static.data(
                name="weight", shape=np_weight.shape, dtype=dtype)
            bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype)
            x.stop_gradient = False
            weight.stop_gradient = False
            bias.stop_gradient = False
            out = func(x, weight, bias)
            mean_out = paddle.mean(out)
            static.append_backward(mean_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            out_v, x_grad_v, weight_grad_v, bias_grad_v = exe.run(
                static.default_main_program(),
                feed={
                    "x": np_x.astype(dtype),
                    "weight": np_weight.astype(dtype),
                    "bias": np_bias.astype(dtype)
                },
                fetch_list=[
                    out.name, x.name + "@GRAD", weight.name + "@GRAD",
                    bias.name + "@GRAD"
                ])
    paddle.disable_static()
    return out_v, x_grad_v, weight_grad_v, bias_grad_v
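# Illustrative usage sketch (assumption: a driving test passes
# paddle.nn.functional.linear as `func`; the shapes, dtype, and helper name
# below are hypothetical and not taken from the original test fixtures).
def _example_linear_static_call():
    np_x = np.random.randn(4, 8).astype("float32")
    np_weight = np.random.randn(8, 16).astype("float32")
    np_bias = np.random.randn(16).astype("float32")
    # returns the forward output and the gradients of x, weight, and bias
    out, x_grad, w_grad, b_grad = linear_static(
        paddle.nn.functional.linear, "cpu", "float32", np_x, np_weight,
        np_bias)
    return out, x_grad, w_grad, b_grad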
def test_static_graph(self):
    paddle.enable_static()

    dtype = 'float32'

    train_program = Program()
    startup_program = Program()
    with program_guard(train_program, startup_program):
        x = np.random.random(self.x_shape).astype(dtype)
        data_x = paddle.static.data(
            'x', shape=self.data_x_shape, dtype=dtype)

        out = paddle.empty_like(data_x)

        place = paddle.CUDAPlace(
            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        res = exe.run(train_program, feed={'x': x}, fetch_list=[out])

    self.dst_dtype = dtype
    self.dst_shape = x.shape
    self.__check_out__(res[0])

    paddle.disable_static()
def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
    paddle.enable_static()
    paddle.set_device(device)

    places = static.cpu_places() if device == 'cpu' else static.cuda_places()
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
            out = func(x) if use_func else paddle.nn.functional.relu(x)
            static.append_backward(out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            # in static mode, x data has been covered by out
            compiled_prog = static.CompiledProgram(
                static.default_main_program()).with_data_parallel(
                    loss_name=out.name, places=places)
            out_v = exe.run(compiled_prog,
                            feed={'X': np_x},
                            fetch_list=[out.name])

    paddle.disable_static()
    return out_v
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(
            name="input",
            shape=[batch_size, sequence_len, hidden_size],
            dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')

        auto.shard_tensor(
            input,
            dist_attr={
                "process_mesh": _global_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        predict = mlp(input)

        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(x=cost)

    return avg_cost, train_program, start_program
def custom_relu_static(func,
                       device,
                       dtype,
                       np_x,
                       use_func=True,
                       test_infer=False):
    paddle.enable_static()
    paddle.set_device(device)

    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
            out = func(x) if use_func else paddle.nn.functional.relu(x)
            static.append_backward(out)

            exe = static.Executor()
            exe.run(static.default_startup_program())
            # in static mode, x data has been covered by out
            out_v = exe.run(static.default_main_program(),
                            feed={'X': np_x},
                            fetch_list=[out.name])

    paddle.disable_static()
    return out_v
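# Illustrative smoke check (assumption: exercising the helper with the built-in
# paddle.nn.functional.relu via use_func=False, so no custom operator needs to
# be compiled; the input shape and helper name are hypothetical).
def _example_custom_relu_static_call():
    np_x = np.random.uniform(-1, 1, (4, 8)).astype("float32")
    out = custom_relu_static(
        paddle.nn.functional.relu, "cpu", "float32", np_x, use_func=False)
    return out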
def check_static_result(self, place):
    from paddle.distributed.fleet.meta_parallel.parallel_layers.random import dropout
    with static.program_guard(static.Program(), static.Program()):
        input = static.data(name="input", shape=[40, 40], dtype="float32")
        res1 = dropout(
            input,
            p=0.3,
            training=True,
            mode='upscale_in_train',
            rng_name='seed0')
        res2 = dropout(
            input,
            p=0.3,
            training=True,
            mode='upscale_in_train',
            rng_name='seed1')
        res3 = dropout(input, p=0.3)

        in_np = np.random.random([40, 40]).astype("float32")

        exe = static.Executor(place)
        res_list = [res1, res2]
        for i in range(2):
            out1, out2 = exe.run(static.default_main_program(),
                                 feed={"input": in_np},
                                 fetch_list=res_list)
            self.assertTrue(np.allclose(out1, out2))
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input", shape=[batch_size], dtype='int32')
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')

        auto.shard_tensor(
            input,
            dist_attr={"process_mesh": PP_MESH_0,
                       "dims_mapping": [-1]})
        auto.shard_tensor(
            label,
            dist_attr={"process_mesh": PP_MESH_1,
                       "dims_mapping": [-1, -1]})

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)

        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

    return loss, train_program, start_program
def test_attr_tensor_API(self):
    startup_program = Program()
    train_program = Program()
    with program_guard(train_program, startup_program):
        fill_value = 2.0
        input = paddle.fluid.data(
            name='input', dtype='float32', shape=[2, 3])
        output = paddle.full_like(input, fill_value)
        output_dtype = paddle.full_like(input, fill_value, dtype='float32')

        place = paddle.CPUPlace()
        if core.is_compiled_with_cuda():
            place = paddle.CUDAPlace(0)
        exe = paddle.static.Executor(place)
        exe.run(startup_program)

        img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)

        res = exe.run(train_program,
                      feed={'input': img},
                      fetch_list=[output])

        out_np = np.array(res[0])
        self.assertTrue(
            not (out_np - np.full_like(img, fill_value)).any(),
            msg="full_like output is wrong, out = " + str(out_np))
def test_static_graph(self):
    for x_stop_gradient in [False, True]:
        for vec_stop_gradient in [False, True]:

            paddle.enable_static()

            train_program = Program()
            startup_program = Program()

            self.input_x = np.random.rand(5, 100).astype("float64")
            self.input_vec = np.random.rand(100).astype("float64")

            with program_guard(train_program, startup_program):
                data_x = paddle.static.data(
                    "x", shape=[5, 100], dtype="float64")
                data_vec = paddle.static.data(
                    "vec", shape=[100], dtype="float64")

                data_x.stop_gradient = x_stop_gradient
                data_vec.stop_gradient = vec_stop_gradient

                result_vec = paddle.mv(data_x, data_vec)

                self.place = paddle.CPUPlace()
                exe = paddle.static.Executor(self.place)
                res, = exe.run(
                    feed={"x": self.input_x,
                          "vec": self.input_vec},
                    fetch_list=[result_vec])

                z_expected = np.array(np.dot(self.input_x, self.input_vec))
                self.assertTrue(np.allclose(res, z_expected))
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(
            name="input",
            shape=[batch_size, sequence_len, hidden_size],
            dtype='float32')

        if _global_parallel_strategy == "dp":
            auto.shard_tensor(
                input,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [0, -1, -1]
                })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(
                input,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [0, -1, -1]
                })

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        out = mlp(input)

    return train_program, start_program
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sqrt_hidden_size = 32
        double_hidden_size = 64

        input = static.data(name="input", shape=[8, 8, 16], dtype='int32')
        input = paddle.reshape(input, [hidden_size])
        input = paddle.reshape(input, [sqrt_hidden_size, sqrt_hidden_size])
        embedding = paddle.nn.Embedding(2, batch_size, sparse=True)
        input = embedding(input)
        input = paddle.reshape(input, [hidden_size, batch_size])
        input = paddle.transpose(input, perm=[1, 0])
        matmulinput = static.data(
            name="matmulinput",
            shape=[hidden_size, hidden_size],
            dtype='float32')
        input = layers.matmul(x=input, y=matmulinput)
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)

        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
        m = paddle.nn.Softmax()
        loss = m(loss)

    return loss, train_program, start_program
def gpt_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 16
        sequence_len = 512
        input_ids = static.data(
            name="input_ids", shape=[batch_size, sequence_len], dtype='int64')
        position_ids = static.data(
            name="position_ids",
            shape=[batch_size, sequence_len],
            dtype='int64')
        attention_mask = static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float64')
        labels = static.data(
            name="labels", shape=[batch_size, sequence_len], dtype='int64')
        loss_mask = static.data(
            name="loss_mask", shape=[batch_size, sequence_len],
            dtype='float64')

        if _global_parallel_strategy == "dp":
            auto.shard_tensor(
                input_ids,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [0, -1]
                })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(
                input_ids,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [0, -1]
                })

        gpt = GPTModel(
            vocab_size=32768,
            hidden_size=1024,
            num_hidden_layers=2,
            num_attention_heads=16,
            intermediate_size=4096,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=1024,
            type_vocab_size=16,
            initializer_range=0.02,
            pad_token_id=0,
            topo=None)

        model = GPTForPretraining(gpt)

        preds = model(input_ids, position_ids, attention_mask)

        criterion = GPTPretrainingCriterion()

        loss = criterion(preds, labels, loss_mask)

    return train_program, start_program
def test_static_empty_input_error(self):
    paddle.enable_static()

    x_list_n_n, x_list_m_n = gen_empty_input()
    for p in (p_list_n_n + p_list_m_n):
        for x in x_list_n_n:
            with static.program_guard(static.Program(), static.Program()):
                x_data = static.data("X", shape=x.shape, dtype=x.dtype)
                self.assertRaises(ValueError, paddle.linalg.cond, x_data, p)

    for p in (p_list_n_n + p_list_m_n):
        for x in x_list_m_n:
            with static.program_guard(static.Program(), static.Program()):
                x_data = static.data("X", shape=x.shape, dtype=x.dtype)
                self.assertRaises(ValueError, paddle.linalg.cond, x_data, p)