def test_pipeline_optimizer(self):
    """Build a program under a pipeline-parallel distributed strategy.

    Initializes fleet with a collective cloud role, turns on pipeline
    parallelism (micro batch size 1, 2 accumulate steps) and runs the
    distributed optimizer's minimize over the model from self.net().
    """
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    dist_strategy = paddle.distributed.fleet.DistributedStrategy()
    dist_strategy.pipeline = True
    dist_strategy.pipeline_configs = {
        'micro_batch_size': 1,
        'accumulate_steps': 2,
    }

    train_prog = static.Program()
    startup_prog = static.Program()
    with static.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = self.net()

            inner_opt = paddle.fluid.optimizer.Adam(0.01)
            dist_opt = fleet.distributed_optimizer(
                inner_opt, strategy=dist_strategy)
            dist_opt.minimize(avg_cost)
def test_dtype_error(self):
    """A real float32 input must be rejected by the complex-only API.

    Static graph mode raises TypeError while building the program;
    dygraph mode raises RuntimeError when the op runs.
    """
    api_fn = paddle_apis[self.api]

    # in static mode: dtype check happens at graph-build time
    with self.assertRaises(TypeError):
        with static.program_guard(static.Program()):
            var = static.data(name="x", shape=self._shape, dtype="float32")
            api_fn(var, name="real_res")

    # in dynamic mode: failure surfaces at execution time
    with self.assertRaises(RuntimeError):
        with fluid.dygraph.guard():
            tensor = paddle.to_tensor(
                np.random.random(self._shape).astype("float32"))
            api_fn(tensor)
def test_mlp_serial(self):
    """Run fleet semi-auto-parallel minimize on the MLP pretrain network
    and verify no auto-parallel attribute suffix survives in the ops."""
    global _global_process_mesh
    # two-rank mesh consumed by mlp_pretrain_forward's shard annotations
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False
    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    loss, train_program, start_program = mlp_pretrain_forward(
        train_program, start_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(
        learning_rate=0.00001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        grad_clip=None)

    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)
    suffix = core.kAutoParallelSuffix()
    for block in distributed_main_program.blocks:
        for op in block.ops:
            for attr_name in op.attr_names:
                # minimize() must strip every auto-parallel helper attribute
                self.assertTrue(suffix not in attr_name)

    # print_program_with_dist_attr(distributed_main_program)
    self.assertIsNotNone(distributed_startup_program)
    self.assertIsNotNone(distributed_main_program)
def test_in_static_mode(self):
    """Compare the static-graph output of the API under test with the
    NumPy reference on random complex input for every dtype and place."""
    for dtype in self.dtypes:
        real = np.random.random(self._shape).astype(dtype)
        imag = np.random.random(self._shape).astype(dtype)
        np_input = real + 1j * imag
        expected = numpy_apis[self.api](np_input)
        for place in self.places:
            with static.program_guard(static.Program()):
                x = static.data(name="x", shape=self._shape, dtype=dtype)
                out = paddle_apis[self.api](x)

                exe = static.Executor(place)
                fetched = exe.run(feed={'x': np_input},
                                  fetch_list=[out.name])
                self.assertTrue(np.array_equal(expected, fetched[0]))
def test_in_static_mode(self):
    """Check paddle.digamma in static-graph mode against SciPy's psi.

    For every supported dtype and place, a random input is fed through a
    freshly built program and compared with the reference at rtol=1e-5.
    """

    def init_input_output(dtype):
        # random values in [0, 1); psi is the SciPy digamma reference
        input = np.random.random(self._shape).astype(dtype)
        return {'x': input}, psi(input)

    for dtype in self.dtypes:
        input_dict, sc_res = init_input_output(dtype)
        for place in self.places:
            with static.program_guard(static.Program()):
                x = static.data(name="x", shape=self._shape, dtype=dtype)
                out = paddle.digamma(x)

                exe = static.Executor(place)
                out_value = exe.run(feed=input_dict, fetch_list=[out.name])
                # idiomatic form of assertEqual(<bool>, True)
                self.assertTrue(np.allclose(out_value[0], sc_res, rtol=1e-5))
def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
    """Train a small classifier that uses `func` as its activation, save it
    as an inference model, and return the training predictions.

    Args:
        func: activation callable applied after the first fc layer
            (the custom relu under test).
        device (str): device name passed to paddle.set_device.
        np_data, np_label: numpy batches fed at every training step.
        path_prefix (str): prefix for static.save_inference_model.

    Returns:
        Fetched `predict` values from a final run of the train program.
    """
    paddle.set_device(device)

    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            # simple module
            data = static.data(
                name='data', shape=[None, 1, 28, 28], dtype='float32')
            label = static.data(name='label', shape=[None, 1], dtype='int64')

            hidden = static.nn.fc(data, size=128)
            hidden = func(hidden)
            hidden = static.nn.fc(hidden, size=128)
            predict = static.nn.fc(hidden, size=10, activation='softmax')
            # NOTE(review): the loss is computed on the 128-wide `hidden`
            # tensor, not on `predict` — looks suspicious; confirm whether
            # `input=predict` was intended.
            loss = paddle.nn.functional.cross_entropy(input=hidden, label=label)
            avg_loss = paddle.mean(loss)

            opt = paddle.optimizer.SGD(learning_rate=0.1)
            opt.minimize(avg_loss)

            # run start up model
            exe = static.Executor()
            exe.run(static.default_startup_program())

            # train
            for i in range(4):
                avg_loss_v = exe.run(static.default_main_program(),
                                     feed={
                                         'data': np_data,
                                         'label': np_label
                                     },
                                     fetch_list=[avg_loss])

            # save inference model
            static.save_inference_model(path_prefix, [data], [predict], exe)

            # get train predict value
            predict_v = exe.run(static.default_main_program(),
                                feed={
                                    'data': np_data,
                                    'label': np_label
                                },
                                fetch_list=[predict])

    return predict_v
def concat_static(func, dtype, np_inputs, axis_v, with_attr=False):
    """Time and run a concat implementation `func` in static mode on CPU.

    When `with_attr` is True the axis is passed as a plain Python value;
    otherwise it is materialized as a constant int64 tensor via paddle.full.

    Returns:
        (concat output, grad of x1, grad of x2) after backward on sum(out).
    """
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x1 = static.data(name="x1", shape=[2, 3], dtype=dtype)
            x2 = static.data(name="x2", shape=[2, 3], dtype=dtype)
            if with_attr:
                axis = axis_v
            else:
                axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v)
            x1.stop_gradient = False
            x2.stop_gradient = False

            # wall-clock the graph-construction call, averaged over TEST_TIME
            total_time = 0
            for i in range(TEST_TIME):
                start = time.time()
                out = func([x1, x2], axis)
                total_time += time.time() - start
            print("- static mode concat time cost: {} s".format(total_time /
                                                                TEST_TIME))

            # mean only support float, so here use sum
            sum_out = paddle.sum(out)
            static.append_backward(sum_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            if with_attr:
                feed_dict = {
                    "x1": np_inputs[0].astype(dtype),
                    "x2": np_inputs[1].astype(dtype)
                }
            else:
                # NOTE(review): `axis` here is the graph Variable produced by
                # paddle.full, not a numpy value; feeding it as data looks
                # dubious — verify the executor accepts it or whether a
                # numpy array was intended.
                feed_dict = {
                    "x1": np_inputs[0].astype(dtype),
                    "x2": np_inputs[1].astype(dtype),
                    "axis": axis
                }
            out_v, x1_grad_v, x2_grad_v = exe.run(
                static.default_main_program(),
                feed=feed_dict,
                fetch_list=[out.name, x1.name + "@GRAD", x2.name + "@GRAD"])

    paddle.disable_static()
    return out_v, x1_grad_v, x2_grad_v
def test_conj_static_mode(self):
    """paddle.conj in static mode must match np.conj on complex input."""
    shape = [2, 20, 2, 3]
    for dtype in self._dtypes:
        np_input = (rand(shape).astype(dtype) +
                    1j * rand(shape).astype(dtype))
        expected = np.conj(np_input)
        # the graph variable carries the complex counterpart of dtype
        complex_dtype = np.complex64 if dtype == "float32" else np.complex128
        for place in self._places:
            with static.program_guard(static.Program()):
                x = static.data(name="x", shape=shape, dtype=complex_dtype)
                out = paddle.conj(x)

                exe = static.Executor(place)
                fetched = exe.run(feed={'x': np_input},
                                  fetch_list=[out.name])
                self.assertTrue(np.array_equal(expected, fetched[0]))
def conj_static(func, shape, dtype, np_input):
    """Run a conj implementation `func` in static mode on CPU.

    Returns:
        (output value, gradient of sum(out) w.r.t. the input).
    """
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=shape, dtype=dtype)
            x.stop_gradient = False
            out = func(x)
            # reduce to a scalar so append_backward has a single loss
            loss = paddle.sum(out)
            static.append_backward(loss)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            fetches = [out.name, x.name + "@GRAD"]
            out_v, x_grad_v = exe.run(static.default_main_program(),
                                      feed={"x": np_input},
                                      fetch_list=fetches)
    paddle.disable_static()
    return out_v, x_grad_v
def test_relu2_static(device, dtype):
    """Exercise the custom relu2 op in static mode on `device`.

    Builds forward + backward, prints the program, then runs once on
    random data and prints the op output.
    """
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            data = static.data(name='X', shape=[None, 8], dtype=dtype)
            data.stop_gradient = False
            out = librelu2_op.relu2(data)
            static.append_backward(out)
            print(static.default_main_program())

            exe = static.Executor()
            exe.run(static.default_startup_program())

            np_x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
            out_v, = exe.run(static.default_main_program(),
                             feed={'X': np_x},
                             fetch_list=[out.name])
            print(out_v)
def test_relu2_static(device, dtype, use_custom=True):
    """Run relu2 (custom op or the builtin relu) through a data-parallel
    CompiledProgram on all CUDA places and print the result.

    Args:
        device (str): device name passed to paddle.set_device.
        dtype (str): dtype of the [None, 8] input variable.
        use_custom (bool): pick the custom relu2 op when True, otherwise
            paddle.nn.functional.relu.
    """
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
            out = custom_relu_op_rf.relu2(
                x) if use_custom else paddle.nn.functional.relu(x)
            static.append_backward(out)
            print(static.default_main_program())

            places = static.cuda_places()
            print(places)
            exe = static.Executor()
            # NOTE(review): loss_name is given the activation output, not a
            # scalar loss — confirm with_data_parallel tolerates this here.
            compiled_prog = static.CompiledProgram(
                static.default_main_program()).with_data_parallel(
                    loss_name=out.name, places=static.cuda_places())

            # `x` is rebound from the graph variable to the numpy feed batch
            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
            out, = exe.run(compiled_prog,
                           feed={'X': x},
                           fetch_list=[out.name])
            print(out)
def linear_static(func, dtype, np_x, np_weight, np_bias):
    """Evaluate a custom linear `func` (x, weight, bias) in static mode
    on CPU and return the forward output as a numpy array."""
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=np_x.shape, dtype=dtype)
            weight = static.data(
                name="weight", shape=np_weight.shape, dtype=dtype)
            bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype)
            out = func(x, weight, bias)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            feed = {
                "x": np_x.astype(dtype),
                "weight": np_weight.astype(dtype),
                "bias": np_bias.astype(dtype),
            }
            out_v, = exe.run(static.default_main_program(),
                             feed=feed,
                             fetch_list=[out.name])
    paddle.disable_static()
    return out_v
# Round-trip a static Program through its serialized ProgramDesc:
# serialize_to_string -> parse_from_string should reproduce the program.
import paddle
import paddle.static as static

paddle.enable_static()

startup_prog = static.Program()
main_prog = static.Program()
# BUG FIX: program_guard takes (main_program, startup_program); the
# original passed the two programs swapped, so the matmul ops were
# actually recorded into the variable named `startup_prog`.
with static.program_guard(main_prog, startup_prog):
    x = static.data(name='X', shape=[1000, 784], dtype='float32')
    y = static.data(name='Y', shape=[784, 100], dtype='float32')
    z = paddle.matmul(x=x, y=y)

    binary_str = static.default_main_program().desc.serialize_to_string()
    prog_restored = static.default_main_program().parse_from_string(binary_str)

    print(static.default_main_program())
    print(prog_restored)
def test_opt_sharding_with_pp(self):
    """Pipeline parallelism + optimizer-state sharding via data parallelism.

    Builds the pp network with dp_degree=2 acting as optimizer sharding,
    then checks the exact op sequences of the generated startup/main
    programs and the communication-group wiring.
    """
    train_prog, startup_prog = static.Program(), static.Program()
    avg_cost, strategy = self.pp_net(train_prog, startup_prog)

    self.set_strategy(strategy, 'pipeline')
    strategy.sharding = True
    strategy.sharding_configs = {
        "sharding_degree": 1,
        "pp_degree": 2,
        "dp_degree": 2,
        "_dp_as_optimizer_sharding": True,
    }
    strategy.fuse_all_reduce_ops = False
    self.optimizer(avg_cost, strategy, train_prog, startup_prog)
    # pipeline rewrites the programs; unpack the per-section programs
    train_prog = train_prog._pipeline_opt['section_program']
    startup_prog = startup_prog._pipeline_opt['startup_program']
    self.debug_program(train_prog, startup_prog)

    startup_prog_ops = startup_prog.global_block().ops
    main_prog_ops = train_prog.global_block().ops

    # check program
    startup_prog_op_types = [op.type for op in startup_prog_ops]
    main_prog_op_types = [op.type for op in main_prog_ops]

    # global, sharding, pp_send, pp_recv
    self.assertEqual(startup_prog_op_types, [
        'uniform_random', 'fill_constant', 'uniform_random', 'fill_constant',
        'uniform_random', 'fill_constant', 'uniform_random', 'fill_constant',
        'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
        'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init',
        'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init',
        'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast',
        'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast',
        'c_broadcast', 'c_broadcast'
    ])

    self.assertEqual(main_prog_op_types, [
        'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add',
        'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add',
        'softmax', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
        'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad',
        'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad',
        'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
        'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
        'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
        'sum', 'fill_constant', 'sum', 'fill_constant', 'sum',
        'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
        'sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
        'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
        'c_sync_comm_stream', 'momentum', 'momentum', 'momentum', 'momentum',
        'momentum', 'c_broadcast', 'c_broadcast', 'c_broadcast',
        'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast',
        'c_broadcast'
    ])

    # the startup program should have created ring ids for pp
    created_ring_ids = [
        op.desc.attr("ring_id") for op in startup_prog_ops
        if op.type == "c_comm_init"
    ]
    self.assertIn(self.dp_ring_id, created_ring_ids)
    self.assertIn(self.pp_pair_ring_id, created_ring_ids)

    # check correctness of pp group
    pp_group_waiting_prots = None
    for op in startup_prog_ops:
        if op.type == "c_gen_nccl_id" and \
                op.desc.output_arg_names()[0] == "comm_id_0":
            pp_group_waiting_prots = op.desc.attr("other_endpoints")
    self.assertEqual(pp_group_waiting_prots, ['127.0.0.1:36003'])

    # check correctness of sharding group
    dp_group_waiting_ports = None
    for op in startup_prog_ops:
        if op.type == "c_gen_nccl_id" \
                and op.desc.output_arg_names()[0] == "comm_id_3":
            dp_group_waiting_ports = op.desc.attr("other_endpoints")
    self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
def test_opt_sharding_with_pp_amp_gclip_boundary(self):
    """Test optimizer sharding when this rank holds no parameter.

    Boundary case: pp + amp + global-norm grad clip with the tiny
    boundary_net; also verifies the loss-grad scale value used for the
    hybrid setup (1 / (accumulate_steps * dp_degree)).
    """
    train_prog, startup_prog = static.Program(), static.Program()
    avg_cost, strategy = self.boundary_net(train_prog, startup_prog)

    self.set_strategy(strategy, 'amp')
    self.set_strategy(strategy, 'pipeline')
    strategy.sharding = True
    strategy.sharding_configs = {
        "sharding_degree": 1,
        "pp_degree": 2,
        "dp_degree": 2,
        "_dp_as_optimizer_sharding": True,
    }
    strategy.fuse_all_reduce_ops = True
    strategy.fuse_grad_size_in_MB = 32
    clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
    self.optimizer(
        avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
    # pipeline rewrites the programs; unpack the per-section programs
    train_prog = train_prog._pipeline_opt['section_program']
    startup_prog = startup_prog._pipeline_opt['startup_program']
    self.debug_program(train_prog, startup_prog)

    startup_prog_ops = startup_prog.global_block().ops
    main_prog_ops = train_prog.global_block().ops

    # check program
    startup_prog_op_types = [op.type for op in startup_prog_ops]
    main_prog_op_types = [op.type for op in main_prog_ops]

    # check loss scale for hybrid
    for op in main_prog_ops:
        if is_loss_grad_op(op):
            self.assertEqual(op.type, 'fill_constant')
            self.assertTrue(op.has_attr('value'))
            scale = strategy.pipeline_configs[
                'accumulate_steps'] * strategy.sharding_configs['dp_degree']
            loss_scale = 1.0 / scale
            self.assertAlmostEqual(float(op.attr('value')), loss_scale)

    # global, sharding, pp_send, pp_recv
    self.assertEqual(startup_prog_op_types, [
        'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant',
        'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
        'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
        'c_comm_init', 'c_broadcast'
    ])

    self.assertEqual(main_prog_op_types, [
        'recv_v2', 'cast', 'matmul', 'cast', 'reduce_mean', 'elementwise_mul',
        'fill_constant', 'elementwise_mul_grad', 'reduce_mean_grad', 'cast',
        'matmul_grad', 'c_sync_calc_stream', 'send_v2', 'fill_constant',
        'cast', 'sum', 'c_reduce_sum', 'c_sync_comm_stream',
        'check_finite_and_unscale', 'cast', 'c_allreduce_max',
        'c_allreduce_max', 'cast', 'update_loss_scaling', 'fill_constant',
        'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', 'fill_constant',
        'elementwise_max', 'elementwise_div', 'c_broadcast'
    ])
def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self):
    """Optimizer sharding with pp + amp + global-norm clip and fused
    gradient merge (fuse_grad_merge=True); checks the exact generated
    op sequences of the startup and main programs."""
    train_prog, startup_prog = static.Program(), static.Program()
    avg_cost, strategy = self.pp_net(train_prog, startup_prog)

    self.set_strategy(strategy, 'amp')
    self.set_strategy(strategy, 'pipeline')
    strategy.sharding = True
    strategy.sharding_configs = {
        "sharding_degree": 1,
        "pp_degree": 2,
        "dp_degree": 2,
        "_dp_as_optimizer_sharding": True,
    }
    strategy.fuse_all_reduce_ops = True
    strategy.fuse_grad_size_in_MB = 32
    strategy.fuse_grad_merge = True
    clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
    self.optimizer(
        avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
    # pipeline rewrites the programs; unpack the per-section programs
    train_prog = train_prog._pipeline_opt['section_program']
    startup_prog = startup_prog._pipeline_opt['startup_program']
    self.debug_program(train_prog, startup_prog)

    startup_prog_ops = startup_prog.global_block().ops
    main_prog_ops = train_prog.global_block().ops

    # check program
    startup_prog_op_types = [op.type for op in startup_prog_ops]
    main_prog_op_types = [op.type for op in main_prog_ops]

    # global, sharding, pp_send, pp_recv
    self.assertEqual(startup_prog_op_types, [
        'uniform_random', 'fill_constant', 'uniform_random', 'fill_constant',
        'uniform_random', 'fill_constant', 'uniform_random', 'fill_constant',
        'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
        'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
        'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
        'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
        'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast',
        'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast',
        'c_broadcast'
    ])

    self.assertEqual(main_prog_op_types, [
        'recv_v2', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast',
        'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast',
        'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast',
        'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax',
        'cast', 'cross_entropy2', 'mean', 'elementwise_mul',
        'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor',
        'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad',
        'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad',
        'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
        'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
        'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
        'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream',
        'send_v2', 'cast', 'sum', 'cast', 'sum', 'c_reduce_sum',
        'c_reduce_sum', 'c_sync_comm_stream', 'check_finite_and_unscale',
        'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast',
        'update_loss_scaling', 'squared_l2_norm', 'squared_l2_norm',
        'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum',
        'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', 'fill_constant',
        'elementwise_max', 'elementwise_div', 'elementwise_mul',
        'elementwise_mul', 'elementwise_mul', 'elementwise_mul',
        'elementwise_mul', 'momentum', 'momentum', 'momentum', 'momentum',
        'momentum', 'coalesce_tensor', 'c_broadcast', 'coalesce_tensor',
        'c_broadcast'
    ])
def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self):
    """Optimizer sharding with pp + amp (custom black var) + recompute
    checkpoints + fused grad merge + optimize_cast; checks the exact
    generated op sequences of the startup and main programs."""
    train_prog, startup_prog = static.Program(), static.Program()
    avg_cost, strategy = self.pp_net(train_prog, startup_prog)

    self.set_strategy(strategy, 'pipeline')
    self.set_strategy(strategy, 'amp')
    strategy.amp_configs = {
        'custom_black_varnames': ['fc_6.b_0'],
    }
    strategy.recompute = True
    strategy.recompute_configs = {
        "checkpoints":
        ["fc_0.tmp_2", "fc_1.tmp_2", "fc_2.tmp_2", "fc_3.tmp_2"]
    }
    strategy.sharding = True
    strategy.sharding_configs = {
        "sharding_degree": 1,
        "pp_degree": 2,
        "dp_degree": 2,
        "_dp_as_optimizer_sharding": True,
        'optimize_cast': True,
    }
    strategy.fuse_all_reduce_ops = True
    strategy.fuse_grad_size_in_MB = 32
    strategy.fuse_grad_merge = True
    self.optimizer(avg_cost, strategy, train_prog, startup_prog)
    # pipeline rewrites the programs; unpack the per-section programs
    train_prog = train_prog._pipeline_opt['section_program']
    startup_prog = startup_prog._pipeline_opt['startup_program']

    # self._debug = True
    self.debug_program(train_prog, startup_prog)

    startup_prog_ops = startup_prog.global_block().ops
    main_prog_ops = train_prog.global_block().ops

    # check program
    startup_prog_op_types = [op.type for op in startup_prog_ops]
    main_prog_op_types = [op.type for op in main_prog_ops]

    # global, sharding, pp_send, pp_recv
    self.assertEqual(startup_prog_op_types, [
        'uniform_random', 'fill_constant', 'uniform_random', 'fill_constant',
        'uniform_random', 'fill_constant', 'uniform_random', 'fill_constant',
        'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
        'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
        'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
        'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
        'c_comm_init', 'c_broadcast', 'cast', 'c_broadcast', 'cast',
        'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast',
        'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast'
    ])

    self.assertEqual(main_prog_op_types, [
        'recv_v2', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast',
        'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul',
        'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'cast',
        'elementwise_add', 'cast', 'softmax', 'cast', 'cross_entropy2',
        'mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor',
        'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor',
        'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad',
        'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'cast',
        'elementwise_add_grad', 'cast', 'mul_grad', 'cast', 'tanh_grad',
        'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad',
        'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul',
        'elementwise_add', 'cast', 'tanh_grad', 'cast',
        'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream',
        'send_v2', 'cast', 'sum', 'sum', 'cast', 'sum', 'c_reduce_sum',
        'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream',
        'check_finite_and_unscale', 'cast', 'c_allreduce_max',
        'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', 'cast',
        'momentum', 'cast', 'momentum', 'cast', 'momentum', 'momentum',
        'cast', 'coalesce_tensor', 'c_broadcast', 'c_broadcast',
        'coalesce_tensor', 'c_broadcast'
    ])
def search(config, args, image_size, is_server=True):
    """One SANAS architecture-search loop over args.search_steps.

    Each step samples an architecture, skips it when it exceeds the
    parameter budget (3.77 MB), otherwise trains it for args.retain_epoch
    epochs and reports the mean of the last two validation accuracies as
    the reward.
    """
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if is_server:
        ### start a server and a client
        sa_nas = SANAS(
            config,
            server_addr=(args.server_address, args.port),
            search_steps=args.search_steps,
            is_server=True)
    else:
        ### start a client
        # NOTE(review): `init_temperature` is not defined in this function —
        # presumably a module-level constant; confirm it exists.
        sa_nas = SANAS(
            config,
            server_addr=(args.server_address, args.port),
            init_temperature=init_temperature,
            is_server=False)

    image_shape = [3, image_size, image_size]
    for step in range(args.search_steps):
        archs = sa_nas.next_archs()[0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        train_fetch_list, _, train_loader = build_program(
            train_program,
            startup_program,
            image_shape,
            archs,
            args,
            is_train=True)

        current_params = count_parameters_in_MB(
            train_program.global_block().all_parameters(), 'cifar10')
        _logger.info('step: {}, current_params: {}M'.format(step,
                                                            current_params))
        # skip candidates over the parameter budget
        if current_params > float(3.77):
            continue

        test_fetch_list, _, test_loader = build_program(
            test_program,
            startup_program,
            image_shape,
            archs,
            args,
            is_train=False)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        train_reader = reader.train_valid(
            batch_size=args.batch_size, is_train=True, is_shuffle=True)
        test_reader = reader.train_valid(
            batch_size=args.batch_size, is_train=False, is_shuffle=False)
        train_loader.set_batch_generator(train_reader, places=place)
        test_loader.set_batch_generator(test_reader, places=place)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=train_fetch_list[0].name,
                build_strategy=build_strategy)
        valid_top1_list = []
        for epoch_id in range(args.retain_epoch):
            train_top1 = train(train_compiled_program, exe, epoch_id,
                               train_loader, train_fetch_list, args)
            _logger.info("TRAIN: step: {}, Epoch {}, train_acc {:.6f}".format(
                step, epoch_id, train_top1))
            valid_top1 = valid(test_program, exe, epoch_id, test_loader,
                               test_fetch_list, args)
            _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(
                epoch_id, valid_top1))
            valid_top1_list.append(valid_top1)
        # mean of the last two epochs' validation accuracy
        # NOTE(review): indexes [-1] and [-2], so args.retain_epoch must be
        # at least 2 — confirm.
        sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
def search_mobilenetv2(config, args, image_size, is_server=True):
    """LSTM-controller RLNAS search loop for MobileNetV2-like archs.

    Each step samples an architecture, trains it for args.retain_epoch
    epochs on cifar10/imagenet, evaluates on the validation set and feeds
    the mean top-1 accuracy back to the controller as reward.
    """
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if is_server:
        ### start a server and a client
        rl_nas = RLNAS(
            key='lstm',
            configs=config,
            is_sync=False,
            server_addr=(args.server_address, args.port),
            controller_batch_size=1,
            controller_decay_steps=1000,
            controller_decay_rate=0.8,
            lstm_num_layers=1,
            hidden_size=10,
            temperature=1.0)
    else:
        ### start a client
        rl_nas = RLNAS(
            key='lstm',
            configs=config,
            is_sync=False,
            server_addr=(args.server_address, args.port),
            lstm_num_layers=1,
            hidden_size=10,
            temperature=1.0,
            controller_batch_size=1,
            controller_decay_steps=1000,
            controller_decay_rate=0.8,
            is_server=False)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(
            mode='train', transform=transform, backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(
            mode='test', transform=transform, backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    for step in range(args.search_steps):
        archs = rl_nas.next_archs(1)[0][0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        # NOTE(review): the train build receives the `places` list while the
        # test build receives the single `place` — confirm this asymmetry
        # is intended by build_program.
        train_loader, avg_cost, acc_top1, acc_top5 = build_program(
            train_program, startup_program, image_shape, train_dataset, archs,
            args, places)
        test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
            test_program,
            startup_program,
            image_shape,
            val_dataset,
            archs,
            args,
            place,
            is_test=True)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=avg_cost.name, build_strategy=build_strategy)
        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                batch_time = time.time() - s_time
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0],
                                batch_time))

        # evaluate the retained model; each batch yields (cost, top1, top5)
        reward = []
        for batch_id, data in enumerate(test_loader()):
            test_fetches = [
                test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
            ]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))
        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        # top-1 accuracy is the controller reward
        rl_nas.reward(np.float32(finally_reward[1]))
def test_name_argument(self):
    """The `name` argument must be propagated into the output var name."""
    with static.program_guard(static.Program()):
        data = static.data(
            name="x", shape=self._shape, dtype=self.dtypes[0])
        result = paddle_apis[self.api](data, name="real_res")
        self.assertIn("real_res", result.name)
def get_sparse_model(model_file, param_file, ratio, save_path):
    """
    Using the unstructured sparse algorithm to compress the network.
    This interface is only used to evaluate the latency of the compressed
    network, and does not consider the loss of accuracy.

    Args:
        model_file(str), param_file(str): The inference model to be pruned.
            param_file may be None for models without a separate params file.
        ratio(float): The ratio to prune the model.
        save_path(str): The save path of pruned model.
    """
    assert os.path.exists(model_file), f'{model_file} does not exist.'
    # BUG FIX: check `param_file is None` first — os.path.exists(None)
    # raises TypeError, so the original disjunction crashed for None.
    assert param_file is None or os.path.exists(
        param_file), f'{param_file} does not exist.'
    paddle.enable_static()

    # variables never pruned (inputs / feed vars / known activations)
    SKIP = ['image', 'feed', 'pool2d_0.tmp_0']

    folder = os.path.dirname(model_file)
    model_name = model_file.split('/')[-1]
    if param_file is None:
        param_name = None
    else:
        param_name = param_file.split('/')[-1]

    main_prog = static.Program()
    startup_prog = static.Program()
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)

    [inference_program, feed_target_names, fetch_targets] = (
        fluid.io.load_inference_model(
            folder,
            exe,
            model_filename=model_name,
            params_filename=param_name))
    thresholds = {}

    # first pass: for each prunable weight, record the flat indices of the
    # `ratio` fraction with the smallest absolute values
    graph = GraphWrapper(inference_program)
    for op in graph.ops():
        for inp in op.all_inputs():
            name = inp.name()
            if inp.name() in SKIP:
                continue
            if 'tmp' in inp.name():
                continue
            # 1x1_conv
            cond_conv = len(inp._var.shape) == 4 and inp._var.shape[
                2] == 1 and inp._var.shape[3] == 1
            cond_fc = False

            if cond_fc or cond_conv:
                array = np.array(paddle.static.global_scope().find_var(name)
                                 .get_tensor())
                flatten = np.abs(array.flatten())
                index = min(len(flatten) - 1, int(ratio * len(flatten)))
                ind = np.unravel_index(
                    np.argsort(
                        flatten, axis=None), flatten.shape)
                thresholds[name] = ind[0][:index]

    # second pass: zero out the recorded indices and write back into scope
    for op in graph.ops():
        for inp in op.all_inputs():
            name = inp.name()
            if name in SKIP:
                continue
            if 'tmp' in inp.name():
                continue

            cond_conv = (len(inp._var.shape) == 4 and inp._var.shape[2] == 1
                         and inp._var.shape[3] == 1)
            cond_fc = False
            # only support 1x1_conv now
            if not (cond_conv or cond_fc):
                continue
            array = np.array(paddle.static.global_scope().find_var(name)
                             .get_tensor())

            if thresholds.get(name) is not None:
                np.put(array, thresholds.get(name), 0)
            assert (abs(1 - np.count_nonzero(array) / array.size - ratio) <
                    1e-2), 'The model sparsity is abnormal.'
            paddle.static.global_scope().find_var(name).get_tensor().set(
                array, paddle.CPUPlace())

    fluid.io.save_inference_model(
        save_path,
        feeded_var_names=feed_target_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=inference_program,
        model_filename=model_name,
        params_filename=param_name)
    print("The pruned model is saved in: ", save_path)
def get_program():
    """Build an auto-parallel-annotated static program containing a While
    loop over MLP layers.

    Returns:
        (train_program, start_program, dataloader, loop counter var, loss).
    """
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    # fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    with fluid.program_guard(train_program, start_program):
        # loop counter
        i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
        auto.shard_tensor(
            i,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1]
            })
        # loop iteration count
        loop_len = fluid.layers.fill_constant(
            shape=[1], dtype='int64', value=epoch_num)
        auto.shard_tensor(
            loop_len,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1]
            })
        # input
        input = static.data(
            name="input",
            shape=[batch_size, sequence_len, hidden_size],
            dtype='float32')
        label = static.data(
            name="label",
            shape=[batch_size, sequence_len, 1],
            dtype='float32')
        data_holder = [input, label]
        # dataloader
        dataloader = paddle.io.DataLoader.from_generator(
            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
        dataloader.set_batch_generator(
            batch_generator_creator(), places=paddle.static.cuda_places())
        # data dist_attr
        auto.shard_tensor(
            input,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })
        auto.shard_tensor(
            label,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })

        # fill constant bsz like
        tmp = paddle.fluid.layers.fill_constant_batch_size_like(
            input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0)
        auto.shard_tensor(
            tmp,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1, 0, -1, -1]
            })

        # model
        mlp_start = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_start(input)

        # seed the tensor array consumed inside the While block
        input_array = fluid.layers.array_write(pred, i)
        auto.shard_tensor(
            input_array,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })

        cond = fluid.layers.less_than(x=i, y=loop_len)
        auto.shard_tensor(
            cond,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1]
            })

        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_input = fluid.layers.array_read(array=input_array, i=i)
            auto.shard_tensor(
                pre_input,
                dist_attr={
                    "process_mesh": _g_process_mesh,
                    "dims_mapping": [-1, -1, -1]
                })

            mlp_while = MLPLayer(
                hidden_size=hidden_size,
                intermediate_size=4 * hidden_size,
                dropout_ratio=0.1,
                initializer_range=0.02)
            cur_pred = mlp_while(pre_input)

            # update loop counter and condition
            i = fluid.layers.increment(x=i, value=1, in_place=True)
            fluid.layers.array_write(cur_pred, array=input_array, i=i)
            fluid.layers.less_than(x=i, y=loop_len, cond=cond)

        end_pred = fluid.layers.array_read(array=input_array, i=i)
        auto.shard_tensor(
            end_pred,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })

        mlp_end = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_end(end_pred)

        error_cost = paddle.nn.functional.square_error_cost(pred, label)
        auto.shard_tensor(
            error_cost,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })

        loss = paddle.mean(error_cost)
        auto.shard_tensor(
            loss,
            dist_attr={
                "process_mesh": _g_process_mesh,
                "dims_mapping": [-1]
            })

    return train_program, start_program, dataloader, i, loss
def search_mobilenetv2(config, args, image_size, is_server=True):
    """Run DDPG-based reinforcement-learning NAS for MobileNetV2.

    For each search step: ask the RL controller for an architecture, train
    it for ``args.retain_epoch`` epochs, evaluate it on the validation set,
    and feed the top-1 accuracy back to the controller as reward.

    Args:
        config: RLNAS search-space configuration.
        args: parsed CLI namespace (use_gpu, server_address, port, data,
            search_steps, retain_epoch, ...).
        image_size (int): input image height/width.
        is_server (bool): start both server and client when True, else a
            client that connects to an existing server.

    NOTE(review): relies on module-level ``RLNAS``, ``build_program``,
    ``imagenet_reader``, ``T``, ``_logger`` — not visible in this chunk.
    """
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if is_server:
        ### start a server and a client
        rl_nas = RLNAS(
            key='ddpg',
            configs=config,
            is_sync=False,
            obs_dim=26,  ### step + length_of_token
            server_addr=(args.server_address, args.port))
    else:
        ### start a client
        rl_nas = RLNAS(key='ddpg',
                       configs=config,
                       is_sync=False,
                       obs_dim=26,
                       server_addr=(args.server_address, args.port),
                       is_server=False)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    for step in range(args.search_steps):
        # Observation = [step, previous action vector].
        if step == 0:
            action_prev = [1. for _ in rl_nas.range_tables]
        else:
            action_prev = rl_nas.tokens[0]
        obs = [step]
        obs.extend(action_prev)
        archs = rl_nas.next_archs(obs=obs)[0][0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        train_loader, avg_cost, acc_top1, acc_top5 = build_program(
            train_program, startup_program, image_shape, train_dataset, archs,
            args, places)
        test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
            test_program,
            startup_program,
            image_shape,
            val_dataset,
            archs,
            args,
            place,
            is_test=True)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)
        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                batch_time = time.time() - s_time
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0], batch_time))

        reward = []
        for batch_id, data in enumerate(test_loader()):
            test_fetches = [
                test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
            ]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            # average each fetched metric over the batch dimension
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        obs = np.expand_dims(obs, axis=0).astype('float32')
        actions = rl_nas.tokens
        obs_next = [step + 1]
        obs_next.extend(actions[0])
        obs_next = np.expand_dims(obs_next, axis=0).astype('float32')

        # FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
        # the alias meant the builtin `bool`, so use that directly.
        if step == args.search_steps - 1:
            terminal = np.expand_dims([True], axis=0).astype(bool)
        else:
            terminal = np.expand_dims([False], axis=0).astype(bool)

        rl_nas.reward(np.expand_dims(np.float32(finally_reward[1]), axis=0),
                      obs=obs,
                      actions=actions.astype('float32'),
                      obs_next=obs_next,
                      terminal=terminal)

        if step == 2:
            sys.exit(0)
def test_search_result(tokens, image_size, args, config):
    """Rebuild the architecture encoded by ``tokens`` via SANAS, retrain it
    for ``args.retain_epoch`` epochs, then evaluate and log final metrics.

    NOTE(review): relies on module-level names not visible in this chunk:
    ``SANAS``, ``build_program``, ``flops``, ``imagenet_reader``, ``T``,
    ``_logger``.
    """
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]

    sa_nas = SANAS(config,
                   server_addr=(args.server_address, args.port),
                   search_steps=args.search_steps,
                   is_server=True)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    # decode the token vector into a concrete architecture
    archs = sa_nas.tokens2arch(tokens)[0]

    train_program = static.Program()
    test_program = static.Program()
    startup_program = static.Program()
    train_loader, avg_cost, acc_top1, acc_top5 = build_program(
        train_program, startup_program, image_shape, train_dataset, archs,
        args, places)
    current_flops = flops(train_program)
    print('current_flops: {}'.format(current_flops))
    test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
        test_program,
        startup_program,
        image_shape,
        val_dataset,
        archs,
        args,
        place,
        is_test=True)
    test_program = test_program.clone(for_test=True)

    exe = static.Executor(place)
    exe.run(startup_program)

    build_strategy = static.BuildStrategy()
    train_compiled_program = static.CompiledProgram(
        train_program).with_data_parallel(loss_name=avg_cost.name,
                                          build_strategy=build_strategy)
    for epoch_id in range(args.retain_epoch):
        for batch_id, data in enumerate(train_loader()):
            fetches = [avg_cost.name]
            s_time = time.time()
            outs = exe.run(train_compiled_program,
                           feed=data,
                           fetch_list=fetches)[0]
            batch_time = time.time() - s_time
            if batch_id % 10 == 0:
                _logger.info(
                    'TRAIN: epoch: {}, batch: {}, cost: {}, batch_time: {}ms'.
                    format(epoch_id, batch_id, outs[0], batch_time))

    reward = []
    for batch_id, data in enumerate(test_loader()):
        test_fetches = [
            test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
        ]
        batch_reward = exe.run(test_program,
                               feed=data,
                               fetch_list=test_fetches)
        # average each fetched metric over the batch dimension
        reward_avg = np.mean(np.array(batch_reward), axis=1)
        reward.append(reward_avg)

        _logger.info(
            'TEST: batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'.
            format(batch_id, batch_reward[0], batch_reward[1],
                   batch_reward[2]))

    finally_reward = np.mean(np.array(reward), axis=0)
    _logger.info(
        'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
            finally_reward[0], finally_reward[1], finally_reward[2]))
def get_program():
    """Build the serial three-stage-MLP training setup used by the
    auto-parallel tests.

    Returns:
        (train_program, start_program, dataloader, loss, optimizer,
        feed_vars, fetch_vars)

    NOTE(review): relies on module-level ``_g_process_mesh``, ``batch_size``,
    ``sequence_len``, ``hidden_size``, ``MLPLayer`` and
    ``batch_generator_creator`` — not visible in this chunk.
    """
    strategy = fleet.DistributedStrategy()
    strategy.semi_auto = True
    # fleet.init(is_collective=True, strategy=strategy)

    main_prog = static.Program()
    init_prog = static.Program()
    with static.program_guard(main_prog, init_prog):
        # data holders
        inp = static.data(name="input",
                          shape=[batch_size, sequence_len, hidden_size],
                          dtype='float32')
        lbl = static.data(name="label",
                          shape=[batch_size, sequence_len, 1],
                          dtype='float32')
        holders = [inp, lbl]

        # non-iterable dataloader fed from the batch generator
        loader = paddle.io.DataLoader.from_generator(feed_list=holders,
                                                     capacity=4 * batch_size,
                                                     iterable=False)
        loader.set_batch_generator(batch_generator_creator(),
                                   places=paddle.static.cuda_places())

        # shard input then label on the first mesh, batch dim sharded
        for tensor in (inp, lbl):
            auto.shard_tensor(tensor,
                              dist_attr={
                                  "process_mesh": _g_process_mesh[0],
                                  "dims_mapping": [0, -1, -1]
                              })

        # three identical MLP stages applied in sequence
        out = inp
        for _ in range(3):
            stage = MLPLayer(hidden_size=hidden_size,
                             intermediate_size=4 * hidden_size,
                             dropout_ratio=0.1,
                             initializer_range=0.02)
            out = stage(out)

        cost = paddle.nn.functional.square_error_cost(out, lbl)
        loss = paddle.mean(cost)
        opt = paddle.optimizer.Adam(learning_rate=0.00001,
                                    beta1=0.9,
                                    beta2=0.999,
                                    epsilon=1e-08,
                                    grad_clip=None)

        feed_vars = {"inputs": [inp], "labels": [lbl]}
        fetch_vars = {"loss": [loss]}

    return main_prog, init_prog, loader, loss, opt, feed_vars, fetch_vars
        # (tail of print_prog, whose `def` lies above this chunk: dump each
        # op's outputs and attributes, skipping bookkeeping attrs —
        # NOTE(review): indentation reconstructed, confirm against print_prog)
        print("op outputs are {}".format(op.output_arg_names))
        for key, value in sorted(six.iteritems(op.all_attrs())):
            if key not in ['op_callstack', 'op_role_var']:
                print(" [ attrs: {}: {} ]".format(key, value))

def network():
    """Build a 784 -> 200 -> 10 MLP classifier and return its mean
    cross-entropy loss over the 'label' input."""
    img = static.data(name='image', shape=[None, 784])
    hidden = static.nn.fc(input=img, size=200, act='relu')
    hidden = F.dropout(hidden, p=0.5)
    loss = F.cross_entropy(input=static.nn.fc(hidden, size=10, act='softmax'),
                           label=static.data(name='label',
                                             shape=[1],
                                             dtype='int64'))
    avg_loss = paddle.mean(loss)
    return avg_loss

# Build a train program (with SGD) and a test program that share the same
# startup program; unique_name.guard keeps parameter names aligned so the
# two programs refer to the same parameters.
train_program_2 = static.Program()
startup_program_2 = static.Program()
test_program_2 = static.Program()
with static.program_guard(train_program_2, startup_program_2):
    with utils.unique_name.guard():
        avg_loss = network()
        sgd = paddle.optimizer.SGD(learning_rate=1e-3)
        sgd.minimize(avg_loss)
# the test startup program is not used.
with static.program_guard(test_program_2, startup_program_2):
    with utils.unique_name.guard():
        avg_loss = network()
print_prog(test_program_2)
import paddle
import paddle.static as static

# Static-graph example: build a small fc net inside a guarded
# (main, startup) program pair, then print both programs.
paddle.enable_static()

main_prog = static.Program()
startup_prog = static.Program()
with static.program_guard(main_prog, startup_prog):
    x = static.data(name="x", shape=[-1, 784], dtype='float32')
    y = static.data(name="y", shape=[-1, 1], dtype='int32')
    z = static.nn.fc(name="fc", input=x, size=10, act="relu")

print("main program is: {}".format(main_prog))
print("start up program is: {}".format(startup_prog))
def get_prune_model(model_file, param_file, ratio, save_path):
    """
    Using the structured pruning algorithm to compress the network.
    This interface is only used to evaluate the latency of the compressed
    network, and does not consider the loss of accuracy.
    Args:
        model_file(str): The inference model topology file to be pruned.
        param_file(str|None): The inference model parameter file; may be
            None when parameters are stored per-variable.
        ratio(float): The ratio to prune the model.
        save_path(str): The save path of pruned model.
    """
    assert os.path.exists(model_file), f'{model_file} does not exist.'
    # FIX: test `is None` first — the original order evaluated
    # os.path.exists(None), which raises TypeError instead of letting a
    # legitimate None param_file pass the assertion.
    assert param_file is None or os.path.exists(
        param_file), f'{param_file} does not exist.'
    paddle.enable_static()
    SKIP = ['image', 'feed', 'pool2d_0.tmp_0']

    folder = os.path.dirname(model_file)
    # os.path.basename is portable and clearer than splitting on '/'
    model_name = os.path.basename(model_file)
    if param_file is None:
        param_name = None
    else:
        param_name = os.path.basename(param_file)

    main_prog = static.Program()
    startup_prog = static.Program()
    place = paddle.CPUPlace()
    # Use the CPU place declared above explicitly (the original constructed
    # Executor() without a place even though `place` was defined for it).
    exe = paddle.static.Executor(place)
    scope = static.global_scope()
    exe.run(startup_prog)

    [inference_program, feed_target_names, fetch_targets] = (
        fluid.io.load_inference_model(folder,
                                      exe,
                                      model_filename=model_name,
                                      params_filename=param_name))

    # Collect prunable parameters: 4-D conv weights that are neither
    # skipped nor temporaries.
    prune_params = []
    graph = GraphWrapper(inference_program)
    for op in graph.ops():
        for inp in op.all_inputs():
            name = inp.name()
            if name in SKIP:
                continue
            if 'tmp' in name:
                continue
            cond_conv = len(inp._var.shape) == 4 and 'conv' in name
            # only prune conv
            if cond_conv:
                prune_params.append(name)

    # drop last conv (keep the network's final feature layer intact)
    prune_params.pop()
    ratios = [ratio] * len(prune_params)
    pruner = Pruner()
    main_program, _, _ = pruner.prune(inference_program,
                                      scope,
                                      params=prune_params,
                                      ratios=ratios,
                                      place=place,
                                      lazy=False,
                                      only_graph=False,
                                      param_backup=None,
                                      param_shape_backup=None)

    fluid.io.save_inference_model(save_path,
                                  feeded_var_names=feed_target_names,
                                  target_vars=fetch_targets,
                                  executor=exe,
                                  main_program=main_program,
                                  model_filename=model_name,
                                  params_filename=param_name)
def test_mapper_misc(self):
    """Check get_dtype_bytes for every dtype and get_comm_volume for
    broadcast/allgather/reduce/cast ops on a 10x10 float32 tensor
    (100 elements * 4 bytes = 400 bytes)."""
    self.assertEqual(get_dtype_bytes(paddle.float64), 8)
    self.assertEqual(get_dtype_bytes(paddle.float32), 4)
    self.assertEqual(get_dtype_bytes(paddle.float16), 2)
    self.assertEqual(get_dtype_bytes(paddle.bfloat16), 2)
    self.assertEqual(get_dtype_bytes(paddle.int64), 8)
    self.assertEqual(get_dtype_bytes(paddle.int32), 4)
    self.assertEqual(get_dtype_bytes(paddle.int16), 2)
    self.assertEqual(get_dtype_bytes(paddle.int8), 1)
    self.assertEqual(get_dtype_bytes(paddle.uint8), 1)
    # non-dtype input must be rejected
    self.assertRaises(ValueError, get_dtype_bytes, "unknown type")

    train_program = static.Program()
    startup_program = static.Program()
    ring_id = 0
    root_id = 0
    nranks = 2
    with fluid.program_guard(train_program, startup_program):
        input = layers.data(name="input", shape=[10, 10], dtype='float32')
        # output var shared by all appended comm ops below
        output = train_program.current_block().create_var(
            name="outofbroadcast",
            dtype='float32',
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=False)
        # broadcast: volume counted only from root (rank 0) to others
        broadcast_op = train_program.global_block().append_op(
            type="c_broadcast",
            inputs={'X': input},
            attrs={
                'ring_id': ring_id,
                'root': root_id
            },
            outputs={'Out': output})
        self.assertEqual(get_comm_volume(broadcast_op, 0, 1), 400)
        self.assertEqual(get_comm_volume(broadcast_op, 1, 0), None)
        # allgather: volume counted between distinct ranks only
        allgather_op = train_program.global_block().append_op(
            type="c_allgather",
            inputs={'X': input},
            attrs={
                'ring_id': ring_id,
                'nranks': nranks
            },
            outputs={'Out': output})
        self.assertEqual(get_comm_volume(allgather_op, 0, 1), 400)
        self.assertEqual(get_comm_volume(allgather_op, 0, 0), None)
        # reduce: volume counted only towards the root (rank 0)
        reduce_op = train_program.global_block().append_op(
            type="c_reduce_sum",
            inputs={'X': input},
            attrs={
                'ring_id': ring_id,
                'root_id': root_id
            },
            outputs={'Out': output})
        self.assertEqual(get_comm_volume(reduce_op, 0, 1), None)
        self.assertEqual(get_comm_volume(reduce_op, 1, 0), 400)
        # cast is not a communication op, so volume lookup must raise
        cast_op = train_program.global_block().append_op(
            type="cast",
            inputs={"X": input},
            outputs={"Out": output},
            attrs={
                "in_dtype": fluid.core.VarDesc.VarType.FP32,
                "out_dtype": fluid.core.VarDesc.VarType.FP32
            })
        self.assertRaises(ValueError, get_comm_volume, cast_op, 0, 1)
def final_test(config, args, image_size, token=None):
    """Retrain the architecture encoded by ``token`` from scratch and save
    an inference model after every epoch.

    Args:
        config: SANAS search-space configuration.
        args: parsed CLI namespace (use_gpu, server_address, port,
            batch_size, retain_epoch, ...).
        image_size (int): input image height/width.
        token: token vector identifying the searched architecture; required.

    NOTE(review): relies on module-level ``SANAS``, ``build_program``,
    ``count_parameters_in_MB``, ``reader``, ``train``, ``valid``, ``_logger``
    — not visible in this chunk.
    """
    # FIX: identity comparison with None (`is not None`), not `!= None`.
    assert token is not None, "If you want to start a final experiment, you must input a token."
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]

    sa_nas = SANAS(config,
                   server_addr=(args.server_address, args.port),
                   is_server=True)
    image_shape = [3, image_size, image_size]
    archs = sa_nas.tokens2arch(token)[0]

    train_program = static.Program()
    test_program = static.Program()
    startup_program = static.Program()
    train_fetch_list, (data, label), train_loader = build_program(
        train_program,
        startup_program,
        image_shape,
        archs,
        args,
        is_train=True)

    current_params = count_parameters_in_MB(
        train_program.global_block().all_parameters(), 'cifar10')
    _logger.info('current_params: {}M'.format(current_params))
    test_fetch_list, _, test_loader = build_program(test_program,
                                                    startup_program,
                                                    image_shape,
                                                    archs,
                                                    args,
                                                    is_train=False)
    test_program = test_program.clone(for_test=True)

    exe = static.Executor(place)
    exe.run(startup_program)

    train_reader = reader.train_valid(batch_size=args.batch_size,
                                      is_train=True,
                                      is_shuffle=True)
    test_reader = reader.train_valid(batch_size=args.batch_size,
                                     is_train=False,
                                     is_shuffle=False)
    train_loader.set_batch_generator(train_reader, places=place)
    test_loader.set_batch_generator(test_reader, places=place)

    build_strategy = static.BuildStrategy()
    train_compiled_program = static.CompiledProgram(
        train_program).with_data_parallel(loss_name=train_fetch_list[0].name,
                                          build_strategy=build_strategy)

    valid_top1_list = []
    for epoch_id in range(args.retain_epoch):
        train_top1 = train(train_compiled_program, exe, epoch_id,
                           train_loader, train_fetch_list, args)
        _logger.info("TRAIN: Epoch {}, train_acc {:.6f}".format(
            epoch_id, train_top1))
        valid_top1 = valid(test_program, exe, epoch_id, test_loader,
                           test_fetch_list, args)
        _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(
            epoch_id, valid_top1))
        valid_top1_list.append(valid_top1)

        # save an inference snapshot per epoch; exist_ok avoids the
        # check-then-create race of the original exists()/makedirs pair
        output_dir = os.path.join('darts_output', str(epoch_id))
        os.makedirs(output_dir, exist_ok=True)
        static.save_inference_model(output_dir, [data], test_fetch_list, exe)