def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input", shape=[batch_size], dtype='int32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [-1]
                          })
        auto.shard_tensor(label,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [-1, -1]
                          })
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
        return loss, train_program, start_program
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sqrt_hidden_size = 32
        double_hidden_size = 64
        input = static.data(name="input", shape=[8, 8, 16], dtype='int32')
        input = paddle.reshape(input, [hidden_size])
        input = paddle.reshape(input, [sqrt_hidden_size, sqrt_hidden_size])
        embedding = paddle.nn.Embedding(2, batch_size, sparse=True)
        input = embedding(input)
        input = paddle.reshape(input, [hidden_size, batch_size])
        input = paddle.transpose(input, perm=[1, 0])
        matmulinput = static.data(name="matmulinput",
                                  shape=[hidden_size, hidden_size],
                                  dtype='float32')
        input = layers.matmul(x=input, y=matmulinput)
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
        m = paddle.nn.Softmax()
        loss = m(loss)
        return loss, train_program, start_program
def linear_static(func, device, dtype, np_x, np_weight, np_bias):
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype)
            weight = static.data(name="weight",
                                 shape=np_weight.shape,
                                 dtype=dtype)
            bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype)
            x.stop_gradient = False
            weight.stop_gradient = False
            bias.stop_gradient = False
            out = func(x, weight, bias)
            mean_out = paddle.mean(out)
            static.append_backward(mean_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            out_v, x_grad_v, weight_grad_v, bias_grad_v = exe.run(
                static.default_main_program(),
                feed={
                    "x": np_x.astype(dtype),
                    "weight": np_weight.astype(dtype),
                    "bias": np_bias.astype(dtype)
                },
                fetch_list=[
                    out.name, x.name + "@GRAD", weight.name + "@GRAD",
                    bias.name + "@GRAD"
                ])
    paddle.disable_static()
    return out_v, x_grad_v, weight_grad_v, bias_grad_v
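# Usage sketch (not part of the original tests): one plausible way to exercise
# linear_static above, using the built-in paddle.nn.functional.linear as the
# function under test. The shapes and the helper name are illustrative
# assumptions; the same numpy/paddle imports as the functions above apply.
def example_linear_static():
    np_x = np.random.random((4, 16)).astype('float32')
    np_weight = np.random.random((16, 8)).astype('float32')
    np_bias = np.random.random((8, )).astype('float32')
    # fetches the forward output and the gradients of x, weight and bias
    out, x_grad, w_grad, b_grad = linear_static(paddle.nn.functional.linear,
                                                'cpu', 'float32', np_x,
                                                np_weight, np_bias)
    print(out.shape, x_grad.shape, w_grad.shape, b_grad.shape)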
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')
        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _global_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)
        predict = mlp(input)
        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(x=cost)
        return avg_cost, train_program, start_program
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')
        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _global_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
        loader = paddle.io.DataLoader.from_generator(feed_list=[input, label],
                                                     capacity=4 * batch_size,
                                                     iterable=True)
        return loss, train_program, start_program, loader
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 64
        input = static.data(name="input",
                            shape=[batch_size, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        if _global_parallel_strategy == "dp_mp_pp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[0],
                                  "dims_mapping": [0, -1]
                              })
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
        return loss, train_program, start_program
def rnn_pretrain_forward(train_program, start_program, topo=None):
    with static.program_guard(train_program, start_program), \
            paddle.utils.unique_name.guard():
        batch_size = 1
        tokens = static.data(name="tokens",
                             shape=[batch_size, -1],
                             dtype="int64")
        seq_len = static.data(name="ids", shape=[batch_size], dtype="int64")
        labels = static.data(name="labels", shape=[batch_size], dtype="int64")
        data_holders = [tokens, seq_len, labels]
        vocab_size = 10
        num_classes = 2
        pad_token_id = 0
        model = RNNModel(vocab_size,
                         num_classes,
                         direction='forward',
                         padding_idx=pad_token_id,
                         pooling_type='max')

        optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
                                          learning_rate=0.001)
        criterion = paddle.nn.CrossEntropyLoss()
        preds = model(tokens, seq_len)
        loss = criterion(preds, labels)

        return train_program, start_program, loss, optimizer, data_holders
def gpt_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 16
        sequence_len = 512
        input_ids = static.data(name="input_ids",
                                shape=[batch_size, sequence_len],
                                dtype='int64')
        position_ids = static.data(name="position_ids",
                                   shape=[batch_size, sequence_len],
                                   dtype='int64')
        attention_mask = static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float64')
        labels = static.data(name="labels",
                             shape=[batch_size, sequence_len],
                             dtype='int64')
        loss_mask = static.data(name="loss_mask",
                                shape=[batch_size, sequence_len],
                                dtype='float64')

        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })

        gpt = GPTModel(vocab_size=32768,
                       hidden_size=1024,
                       num_hidden_layers=2,
                       num_attention_heads=16,
                       intermediate_size=4096,
                       hidden_act="gelu",
                       hidden_dropout_prob=0.1,
                       attention_probs_dropout_prob=0.1,
                       max_position_embeddings=1024,
                       type_vocab_size=16,
                       initializer_range=0.02,
                       pad_token_id=0,
                       topo=None)

        model = GPTForPretraining(gpt)
        preds = model(input_ids, position_ids, attention_mask)
        criterion = GPTPretrainingCriterion()
        loss = criterion(preds, labels, loss_mask)

        return train_program, start_program
def network():
    img = static.data(name='image', shape=[None, 784])
    hidden = static.nn.fc(x=img, size=200, activation='relu')
    hidden = F.dropout(hidden, p=0.5)
    loss = F.cross_entropy(input=static.nn.fc(x=hidden,
                                              size=10,
                                              activation='softmax'),
                           label=static.data(name='label',
                                             shape=[1],
                                             dtype='int64'))
    avg_loss = paddle.mean(loss)
    return avg_loss
def get_prog(self):
    main_program = Program()
    with program_guard(main_program):
        a = static.data(name="a", shape=[32, 32], dtype='float32')
        b = static.data(name="b", shape=[32, 32], dtype='float32')
        out = a / b
        fp16_a = a.cast(paddle.float16)
        fp16_b = b.cast(paddle.float16)
        out = fp16_a + fp16_b
    return main_program, out
def get_model(self, place, gradient_merge, batch_size, max_step):
    paddle.seed(2021)
    random.seed(2021)
    np.random.seed(2021)

    hidden_size = 128

    global _global_parallel_strategy
    global _global_process_mesh
    world_size = paddle.distributed.get_world_size()
    if world_size == 1:
        _global_parallel_strategy = "dp"
        _global_process_mesh = auto.ProcessMesh([0])
    elif world_size == 2:
        _global_parallel_strategy = "dp"
        _global_process_mesh = auto.ProcessMesh([0, 1])

    train_program = static.Program()
    startup_program = static.Program()
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    #if gradient_merge:
    #    dist_strategy.gradient_merge = True
    #    dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
    fleet.init(is_collective=True, strategy=dist_strategy)

    with static.program_guard(train_program, startup_program), \
            utils.unique_name.guard():
        input = static.data(name="input",
                            shape=[batch_size, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        input.stop_gradient = False
        loss = mlp_forward(input, label, hidden_size)

    optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
    #optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, self._params_grads, dist_startup_prog, dist_main_prog = optimizer.minimize(
        loss, startup_program)

    input_data = np.random.random(size=(128, hidden_size)).astype('float32')
    label_data = np.random.random(size=(128, 1)).astype('float32')

    def reader():
        for i in range(max_step):
            x_data = input_data[i * batch_size:(i + 1) * batch_size, :]
            y_data = label_data[i * batch_size:(i + 1) * batch_size, :]
            yield x_data, y_data

    return dist_main_prog, dist_startup_prog, [input, label], [loss], reader
def build_program(main_program,
                  startup_program,
                  image_shape,
                  dataset,
                  archs,
                  args,
                  places,
                  is_test=False):
    with static.program_guard(main_program, startup_program):
        with paddle.utils.unique_name.guard():
            data_shape = [None] + image_shape
            data = static.data(name='data', shape=data_shape, dtype='float32')
            label = static.data(name='label', shape=[None, 1], dtype='int64')
            if args.data == 'cifar10':
                paddle.assign(paddle.reshape(label, [-1, 1]), label)
            if is_test:
                data_loader = paddle.io.DataLoader(dataset,
                                                   places=places,
                                                   feed_list=[data, label],
                                                   drop_last=False,
                                                   batch_size=args.batch_size,
                                                   return_list=False,
                                                   shuffle=False)
            else:
                data_loader = paddle.io.DataLoader(dataset,
                                                   places=places,
                                                   feed_list=[data, label],
                                                   drop_last=True,
                                                   batch_size=args.batch_size,
                                                   return_list=False,
                                                   shuffle=True,
                                                   use_shared_memory=True,
                                                   num_workers=4)
            output = archs(data)
            output = static.nn.fc(output, size=args.class_dim)

            softmax_out = F.softmax(output)
            cost = F.cross_entropy(softmax_out, label=label)
            avg_cost = paddle.mean(cost)
            acc_top1 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=1)
            acc_top5 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=5)

            if not is_test:
                optimizer = create_optimizer(args)
                optimizer.minimize(avg_cost)
    return data_loader, avg_cost, acc_top1, acc_top5
def test_static(self):
    mp, sp = static.Program(), static.Program()
    with static.program_guard(mp, sp):
        x = static.data("x", shape=[10, 10], dtype="float64")
        y = static.data("y", shape=[10, 10], dtype="float64")
        out = paddle.complex(x, y)

    exe = static.Executor()
    exe.run(sp)
    [out_np] = exe.run(mp, feed={"x": self.x, "y": self.y}, fetch_list=[out])
    self.assertTrue(np.allclose(self.out, out_np))
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')

        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })

        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)
        out = mlp(input)
    return train_program, start_program
def check_static_result(self, place):
    from paddle.distributed.fleet.meta_parallel.parallel_layers.random import dropout
    with static.program_guard(static.Program(), static.Program()):
        input = static.data(name="input", shape=[40, 40], dtype="float32")
        res1 = dropout(input,
                       p=0.3,
                       training=True,
                       mode='upscale_in_train',
                       rng_name='seed0')
        res2 = dropout(input,
                       p=0.3,
                       training=True,
                       mode='upscale_in_train',
                       rng_name='seed1')
        res3 = dropout(input, p=0.3)

        in_np = np.random.random([40, 40]).astype("float32")

        exe = static.Executor(place)
        res_list = [res1, res2]
        for i in range(2):
            out1, out2 = exe.run(static.default_main_program(),
                                 feed={"input": in_np},
                                 fetch_list=res_list)
            self.assertTrue(np.allclose(out1, out2))
def custom_relu_static(func,
                       device,
                       dtype,
                       np_x,
                       use_func=True,
                       test_infer=False):
    paddle.enable_static()
    paddle.set_device(device)

    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
            out = func(x) if use_func else paddle.nn.functional.relu(x)
            static.append_backward(out)

            exe = static.Executor()
            exe.run(static.default_startup_program())
            # in static mode, x data has been covered by out
            out_v = exe.run(static.default_main_program(),
                            feed={'X': np_x},
                            fetch_list=[out.name])

    paddle.disable_static()
    return out_v
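# Usage sketch (assumed, not from the original file): custom_relu_static can be
# smoke-tested without building any custom operator by passing the built-in
# paddle.nn.functional.relu as `func`; the helper name and input shape are
# illustrative assumptions.
def example_custom_relu_static():
    np_x = np.random.random((3, 8)).astype('float32')
    out = custom_relu_static(paddle.nn.functional.relu, 'cpu', 'float32', np_x)
    print(out)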
def setUp(self):
    self._places = [paddle.CPUPlace()]
    if paddle.device.is_compiled_with_cuda():
        self._places.append(paddle.CUDAPlace(0))
    self._ema_decay = 0.999
    self._param_name = "fc.weight"
    self._train_program = static.Program()
    self._startup_prog = static.Program()

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.without_graph_optimization = True
    paddle.distributed.fleet.init(is_collective=True, strategy=strategy)

    with static.program_guard(self._train_program, self._startup_prog):
        with utils.unique_name.guard():
            data = static.data(name='x', shape=[-1, 5], dtype='float32')
            hidden = static.nn.fc(x=data,
                                  size=10,
                                  weight_attr=self._param_name)
            cost = paddle.mean(hidden)

            self._test_program = static.default_main_program().clone(
                for_test=True)

            optimizer = paddle.optimizer.Adam(learning_rate=0.001)
            optimizer = paddle.distributed.fleet.distributed_optimizer(
                optimizer, strategy)
            optimizer.minimize(cost)

            self._ema = static.ExponentialMovingAverage(self._ema_decay)
            self._ema.update()
def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
    paddle.enable_static()
    paddle.set_device(device)

    places = static.cpu_places() if device == 'cpu' else static.cuda_places()
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
            out = func(x) if use_func else paddle.nn.functional.relu(x)
            static.append_backward(out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            # in static mode, x data has been covered by out
            compiled_prog = static.CompiledProgram(
                static.default_main_program()).with_data_parallel(
                    loss_name=out.name, places=places)
            out_v = exe.run(compiled_prog,
                            feed={'X': np_x},
                            fetch_list=[out.name])

    paddle.disable_static()
    return out_v
def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
    paddle.set_device(device)

    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            # simple module
            data = static.data(name='data',
                               shape=[None, 1, 28, 28],
                               dtype='float32')
            label = static.data(name='label', shape=[None, 1], dtype='int64')

            hidden = static.nn.fc(data, size=128)
            hidden = func(hidden)
            hidden = static.nn.fc(hidden, size=128)
            predict = static.nn.fc(hidden, size=10, activation='softmax')
            loss = paddle.nn.functional.cross_entropy(input=hidden,
                                                      label=label)
            avg_loss = paddle.mean(loss)

            opt = paddle.optimizer.SGD(learning_rate=0.1)
            opt.minimize(avg_loss)

            # run start up model
            exe = static.Executor()
            exe.run(static.default_startup_program())

            # train
            for i in range(4):
                avg_loss_v = exe.run(static.default_main_program(),
                                     feed={
                                         'data': np_data,
                                         'label': np_label
                                     },
                                     fetch_list=[avg_loss])

            # save inference model
            static.save_inference_model(path_prefix, [data], [predict], exe)

            # get train predict value
            predict_v = exe.run(static.default_main_program(),
                                feed={
                                    'data': np_data,
                                    'label': np_label
                                },
                                fetch_list=[predict])

    return predict_v
def test_static_empty_input_error(self):
    paddle.enable_static()

    x_list_n_n, x_list_m_n = gen_empty_input()
    for p in (p_list_n_n + p_list_m_n):
        for x in x_list_n_n:
            with static.program_guard(static.Program(), static.Program()):
                x_data = static.data("X", shape=x.shape, dtype=x.dtype)
                self.assertRaises(ValueError, paddle.linalg.cond, x_data, p)

    for p in (p_list_n_n + p_list_m_n):
        for x in x_list_m_n:
            with static.program_guard(static.Program(), static.Program()):
                x_data = static.data("X", shape=x.shape, dtype=x.dtype)
                self.assertRaises(ValueError, paddle.linalg.cond, x_data, p)
def build_program():
    program = static.Program()
    with static.program_guard(program):
        data = static.data(name='x', shape=[None, 13], dtype='float32')
        hidden = static.nn.fc(data, size=10)
        loss = paddle.mean(hidden)
        paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
    return program
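# Minimal sketch (assumed, not part of the original file) of how the program
# returned by build_program could be run: parameters are initialized through
# the default startup program and a random batch is fed to the 'x' placeholder.
# The helper name and batch size are illustrative assumptions.
def example_run_build_program():
    paddle.enable_static()
    main_program = build_program()
    exe = static.Executor(paddle.CPUPlace())
    exe.run(static.default_startup_program())
    x = np.random.random((4, 13)).astype('float32')
    exe.run(main_program, feed={'x': x})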
def concat_static(func, dtype, np_inputs, axis_v, with_attr=False):
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x1 = static.data(name="x1", shape=[2, 3], dtype=dtype)
            x2 = static.data(name="x2", shape=[2, 3], dtype=dtype)
            if with_attr:
                axis = axis_v
            else:
                axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v)
            x1.stop_gradient = False
            x2.stop_gradient = False

            total_time = 0
            for i in range(TEST_TIME):
                start = time.time()
                out = func([x1, x2], axis)
                total_time += time.time() - start
            print("- static mode concat time cost: {} s".format(total_time /
                                                                TEST_TIME))

            # mean only support float, so here use sum
            sum_out = paddle.sum(out)
            static.append_backward(sum_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            if with_attr:
                feed_dict = {
                    "x1": np_inputs[0].astype(dtype),
                    "x2": np_inputs[1].astype(dtype)
                }
            else:
                feed_dict = {
                    "x1": np_inputs[0].astype(dtype),
                    "x2": np_inputs[1].astype(dtype),
                    "axis": axis
                }
            out_v, x1_grad_v, x2_grad_v = exe.run(
                static.default_main_program(),
                feed=feed_dict,
                fetch_list=[out.name, x1.name + "@GRAD", x2.name + "@GRAD"])
    paddle.disable_static()
    return out_v, x1_grad_v, x2_grad_v
def net(self):
    input_size = 4096
    output_size = 4096
    x = static.data(name='X', shape=[1000, 4096], dtype='float32')
    label = static.data(name='Y', shape=[1000, 4096], dtype='float32')
    model = SimpleNet(input_size, output_size)  # define the model
    mse = paddle.nn.MSELoss()
    out = model(x)
    loss = mse(out, label)
    opt = paddle.fluid.optimizer.Adam(
        learning_rate=0.0001,
        parameter_list=model.parameters())  # define the optimizer
    opt = paddle.static.amp.decorate(opt,
                                     init_loss_scaling=128.0,
                                     use_dynamic_loss_scaling=True)
    opt.minimize(loss)
    return model, loss, opt
def test_static_api_error(self):
    paddle.enable_static()
    # test raising errors when 'cond' is called in static mode
    p_list_error = ('f ro', 'fre', 'NUC', -1.6, 0, 5)
    x_list_n_n, x_list_m_n = gen_input()
    for p in p_list_error:
        for x in (x_list_n_n + x_list_m_n):
            with static.program_guard(static.Program(), static.Program()):
                x_data = static.data("X", shape=x.shape, dtype=x.dtype)
                self.assertRaises(ValueError, paddle.linalg.cond, x_data, p)

    for p in p_list_n_n:
        for x in x_list_m_n:
            with static.program_guard(static.Program(), static.Program()):
                x_data = static.data("X", shape=x.shape, dtype=x.dtype)
                self.assertRaises(ValueError, paddle.linalg.cond, x_data, p)
def test_static(self):
    mp, sp = static.Program(), static.Program()
    with static.program_guard(mp, sp):
        x = static.data("x", shape=[2, 3], dtype="complex128")
        out = paddle.angle(x)

    exe = static.Executor()
    exe.run(sp)
    [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out])
    self.assertTrue(np.allclose(self.out, out_np))
def create_data_loader(image_shape, is_train, args):
    image = static.data(name="image",
                        shape=[None] + image_shape,
                        dtype="float32")
    label = static.data(name="label", shape=[None, 1], dtype="int64")
    data_loader = paddle.io.DataLoader.from_generator(
        feed_list=[image, label],
        capacity=64,
        use_double_buffer=True,
        iterable=True)
    drop_path_prob = ''
    drop_path_mask = ''
    if is_train:
        drop_path_prob = static.data(name="drop_path_prob",
                                     shape=[args.batch_size, 1],
                                     dtype="float32")
        drop_path_mask = static.data(name="drop_path_mask",
                                     shape=[args.batch_size, 20, 4, 2],
                                     dtype="float32")

    return data_loader, image, label, drop_path_prob, drop_path_mask
def test_static_assert_true(self, x_list, p_list):
    for p in p_list:
        for x in x_list:
            with static.program_guard(static.Program(), static.Program()):
                input_data = static.data("X", shape=x.shape, dtype=x.dtype)
                output = paddle.linalg.cond(input_data, p)
                exe = static.Executor()
                result = exe.run(feed={"X": x}, fetch_list=[output])
                expected_output = np.linalg.cond(x, p)
                np.testing.assert_allclose(result[0], expected_output,
                                           rtol=5e-5)
def test_dtype_error(self):
    # in static mode
    with self.assertRaises(TypeError):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=self._shape, dtype="float32")
            out = paddle_apis[self.api](x, name="real_res")

    # in dynamic mode
    with self.assertRaises(RuntimeError):
        with fluid.dygraph.guard():
            input = np.random.random(self._shape).astype("float32")
            input_t = paddle.to_tensor(input)
            res = paddle_apis[self.api](input_t)
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')

        loss_func = paddle.nn.CrossEntropyLoss(reduction="none")

        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)

        predict = mlp(input)
        error_cost = loss_func(predict, label)
        loss = paddle.mean(error_cost)

        return loss, train_program, start_program
def decoder_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input_ids = static.data(name="input_ids",
                                shape=[batch_size, sequence_len],
                                dtype='int64')
        position_ids = static.data(name="position_ids",
                                   shape=[batch_size, sequence_len],
                                   dtype='int64')
        decoder = DecoderLayer(vocab_size=32768,
                               hidden_size=hidden_size,
                               sequence_len=sequence_len,
                               max_position_embeddings=512,
                               intermediate_size=4 * hidden_size,
                               num_heads=16,
                               dropout_ratio=0.1,
                               initializer_range=0.02)
        out = decoder(input_ids, position_ids)

    return train_program, start_program