def build_strategy(self, args):
    """Build and cache the distributed strategy selected by ``args.mode``.

    Args:
        args: parsed arguments. ``args.mode`` must be one of "async",
            "sync", "half_async" or "geo". In "geo" mode,
            ``args.geo_sgd_need_push_nums`` is forwarded to the factory.

    Returns:
        The created strategy object (also stored on ``self.strategy``).

    Raises:
        ValueError: if ``args.mode`` is not one of the supported modes.
    """
    # Fail fast on an unknown mode: the original left self.strategy as
    # None, which crashed later on strategy.set_debug_opt(...) whenever
    # the Debug env var was set.
    if args.mode not in ("async", "sync", "half_async", "geo"):
        raise ValueError(
            "unsupported mode: {!r}, expected one of "
            "async/sync/half_async/geo".format(args.mode))

    if args.mode == "async":
        self.strategy = StrategyFactory.create_async_strategy()
    elif args.mode == "sync":
        self.strategy = StrategyFactory.create_sync_strategy()
    elif args.mode == "half_async":
        self.strategy = StrategyFactory.create_half_async_strategy()
    else:  # "geo"
        self.strategy = StrategyFactory.create_geo_strategy(
            args.geo_sgd_need_push_nums)

    # NOTE: "".split(",") yields [""], not [] — kept as-is for backward
    # compatibility with existing consumers of these lists.
    self.dump_param = os.getenv("dump_param", "").split(",")
    self.dump_fields = os.getenv("dump_fields", "").split(",")
    self.dump_fields_path = os.getenv("dump_fields_path", "")

    # Optional debug-dump configuration, enabled by setting Debug=1.
    if int(os.getenv("Debug", "0")):
        self.strategy.set_debug_opt({
            "dump_param": self.dump_param,
            "dump_fields": self.dump_fields,
            "dump_fields_path": self.dump_fields_path
        })
    return self.strategy
def test_geo_strategy(self):
    """Check geo-strategy defaults and set/get_build_strategy round trips."""
    strategy = StrategyFactory.create_geo_strategy(5)

    program_config = strategy._program_config
    self.assertEqual(program_config.sync_mode, False)
    self.assertEqual(program_config.runtime_split_send_recv, True)
    self.assertEqual(program_config.geo_sgd_mode, True)
    self.assertEqual(program_config.geo_sgd_need_push_nums, 5)
    self.assertEqual(strategy._build_strategy.async_mode, True)

    # set_build_strategy accepts a fluid.BuildStrategy instance ...
    bs_obj = fluid.BuildStrategy()
    bs_obj.memory_optimize = False
    strategy.set_build_strategy(bs_obj)
    self.assertEqual(strategy.get_build_strategy().memory_optimize, False)

    # ... and a plain dict of attribute overrides.
    bs_dict = {'memory_optimize': True}
    strategy.set_build_strategy(bs_dict)
    self.assertEqual(strategy.get_build_strategy().memory_optimize, True)

    # Unknown keys and non-strategy inputs must raise.
    bs_dict['unknown'] = None
    self.assertRaises(Exception, strategy.set_build_strategy, bs_dict)
    bs_illegal = None
    self.assertRaises(Exception, strategy.set_build_strategy, bs_illegal)
def test(self):
    """Run distributed_optimizer.minimize with a geo strategy on a server role."""
    server_endpoints = [
        "127.0.0.1:36004",
        "127.0.0.1:36005",
        "127.0.0.1:36006",
        "127.0.0.1:36007",
    ]
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=server_endpoints))

    loss, acc, _ = self.net()

    sgd = fluid.optimizer.SGD(base_lr)
    geo_strategy = StrategyFactory.create_geo_strategy(20)
    dist_optimizer = fleet.distributed_optimizer(sgd, geo_strategy)
    dist_optimizer.minimize(loss)
def test_dist_geo_server_transpiler(self):
    """Transpile a pyramid-hash net for a pserver under a geo strategy.

    Builds a small search_pyramid_hash network, minimizes it through
    fleet's distributed optimizer in SERVER role, and materializes the
    pserver startup/main programs.
    """
    num_voc = 128
    embed_dim = 64
    # Removed unused local x_lod ([[3, 5, 2, 6]]) — it was never passed
    # anywhere; fluid.data only needs the shape.
    x = fluid.data(name='x', shape=[16, 10], dtype='int32', lod_level=1)
    hash_embd = fluid.contrib.layers.search_pyramid_hash(
        input=x,
        num_emb=embed_dim,
        space_len=num_voc * embed_dim,
        pyramid_layer=4,
        rand_len=16,
        drop_out_percent=0.5,
        is_training=True,
        use_filter=False,
        white_list_len=6400,
        black_list_len=2800,
        seed=3,
        lr=0.002,
        param_attr=fluid.ParamAttr(
            name="PyramidHash_emb_0",
            learning_rate=0, ),
        param_attr_wl=fluid.ParamAttr(
            name="Filter",
            learning_rate=0, ),
        param_attr_bl=None,
        distribute_update_vars=["PyramidHash_emb_0"],
        name=None)

    cost = fluid.layers.reduce_sum(hash_embd)

    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)

    strategy = StrategyFactory.create_geo_strategy(5)
    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(cost)

    # Fixed typo: "mian" -> "main".
    pserver_startup_program = fleet.startup_program
    pserver_main_program = fleet.main_program
def _get_distributed_strategy(self):
    """Translate ``self.user_defined_strategy`` into a transpiler strategy.

    Mapping of (a_sync, k_steps) to strategy:
        a_sync=False, k_steps == 0  -> sync
        a_sync=True,  k_steps == 0  -> async
        a_sync=True,  k_steps  > 0  -> geo (push every k_steps steps)

    Returns:
        The strategy created by StrategyFactory.

    Raises:
        ValueError: for any other combination (e.g. a_sync=False with
            k_steps > 0, or a negative k_steps).
    """
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

    k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
    a_sync = self.user_defined_strategy.a_sync

    if not a_sync and k_steps == 0:
        strategy = StrategyFactory.create_sync_strategy()
    elif a_sync and k_steps == 0:
        strategy = StrategyFactory.create_async_strategy()
    elif a_sync and k_steps > 0:
        strategy = StrategyFactory.create_geo_strategy(k_steps)
    else:
        # Original message read "k_steps must be invalid value" — fixed
        # to actually describe the failure.
        raise ValueError(
            "invalid a_sync/k_steps combination: a_sync={}, k_steps={}, "
            "please check".format(a_sync, k_steps))
    return strategy
def build_strategy(self):
    """Create and cache the trainer strategy named by the runtime environ."""
    mode = envs.get_runtime_environ("train.trainer.strategy")
    assert mode in ["async", "geo", "sync", "half_async"]

    if mode == "geo":
        # geo takes an extra argument: how many local steps between pushes.
        push_num = envs.get_global_env("train.strategy.mode.push_num", 100)
        strategy = StrategyFactory.create_geo_strategy(push_num)
    else:
        factory = {
            "async": StrategyFactory.create_async_strategy,
            "sync": StrategyFactory.create_sync_strategy,
            "half_async": StrategyFactory.create_half_async_strategy,
        }[mode]
        strategy = factory()

    assert strategy is not None
    self.strategy = strategy
    return strategy
def _build_strategy(self, context):
    """Build the distributed strategy from the runtime environ and stash it in ``context``."""
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

    mode = envs.get_runtime_environ("train.trainer.strategy")
    assert mode in ["async", "geo", "sync", "half_async"]

    if mode == "geo":
        # geo needs the number of local steps accumulated before a push.
        push_num = envs.get_global_env("train.strategy.mode.push_num", 100)
        strategy = StrategyFactory.create_geo_strategy(push_num)
    elif mode == "async":
        strategy = StrategyFactory.create_async_strategy()
    elif mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    else:  # "half_async"
        strategy = StrategyFactory.create_half_async_strategy()

    assert strategy is not None
    context["strategy"] = strategy
    return strategy
def test_geo_strategy(self):
    """Geo strategy: program-config defaults, build-strategy set/get, error paths, communicator flags."""
    strategy = StrategyFactory.create_geo_strategy(5)

    program_config = strategy._program_config
    self.assertEqual(program_config.sync_mode, False)
    self.assertEqual(program_config.runtime_split_send_recv, True)
    self.assertEqual(program_config.geo_sgd_mode, True)
    self.assertEqual(program_config.geo_sgd_need_push_nums, 5)
    self.assertEqual(strategy._build_strategy.async_mode, True)

    # set_build_strategy accepts a fluid.BuildStrategy instance ...
    bs_obj = fluid.BuildStrategy()
    bs_obj.memory_optimize = False
    strategy.set_build_strategy(bs_obj)
    self.assertEqual(strategy.get_build_strategy().memory_optimize, False)

    # ... and a plain dict of attribute overrides.
    bs_dict = {'memory_optimize': True}
    strategy.set_build_strategy(bs_dict)
    self.assertEqual(strategy.get_build_strategy().memory_optimize, True)

    # Unknown keys and non-strategy inputs must raise.
    bs_dict['unknown'] = None
    self.assertRaises(Exception, strategy.set_build_strategy, bs_dict)
    bs_illegal = None
    self.assertRaises(Exception, strategy.set_build_strategy, bs_illegal)

    # Communicator flags exposed by the trainer runtime config.
    os.environ["CPU_NUM"] = '100'
    runtime_configs = strategy.get_trainer_runtime_config(
    ).get_communicator_flags()
    self.assertIn('communicator_thread_pool_size', runtime_configs)
    self.assertIn('communicator_send_wait_times', runtime_configs)
    self.assertNotIn('communicator_independent_recv_thread', runtime_configs)
def test_pserver(self):
    """Build pserver startup/main programs for the train network with a geo strategy."""
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)

    batch_size = 128
    is_sparse = True
    is_distribute = False

    strategy = StrategyFactory.create_geo_strategy(5)

    avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)

    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    # Fixed typo: "mian" -> "main".
    pserver_startup_program = fleet.startup_program
    pserver_main_program = fleet.main_program
def _set_strategy(self, args):
    """Configure the distributed_strategy for the run; build_strategy is configured in do_training."""
    # Legacy path: when PADDLE_COMPATIBILITY_CHECK is set to a non-zero
    # value, configure a raw DistributeTranspilerConfig directly.
    if int(os.getenv("PADDLE_COMPATIBILITY_CHECK", '0')):
        self.strategy = DistributeTranspilerConfig()
        if args.run_params["sync_mode"] == "sync":
            self.strategy.sync_mode = True
            self.strategy.runtime_split_send_recv = False
            self.async_mode = False
        elif args.run_params["sync_mode"] == "half_async":
            self.strategy.sync_mode = False
            self.async_mode = False
        elif args.run_params["sync_mode"] == "async":
            self.strategy.sync_mode = False
            self.async_mode = True
        elif args.run_params["sync_mode"] == "geo_async":
            self.strategy.sync_mode = False
            self.async_mode = True
            self.strategy.geo_sgd_mode = True
            self.strategy.geo_sgd_need_push_nums = 400
        # Common transpiler settings applied for every sync_mode.
        self.strategy.mode = "pserver"
        self.strategy.slice_var_up = args.run_params['slice_var_up']
        self.strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
        #TODO: split_method=HashName, it will cause a bug, this option can open after repair
        # if args.run_params['split_method']:
        #     self.strategy.split_method = HashName
        # else:
        #     self.strategy.split_method = RoundRobin
        self.strategy.wait_port = args.run_params['wait_port']
        self.strategy.runtime_split_send_recv = args.run_params[
            'runtime_split_send_recv']
        self.strategy.use_hierarchical_allreduce = args.run_params[
            'use_hierarchical_allreduce']
        # NOTE(review): this unconditionally overwrites the 400 assigned in
        # the geo_async branch above with run_params['push_nums'] — looks
        # intentional (400 is just a placeholder default), but confirm.
        self.strategy.geo_sgd_need_push_nums = args.run_params['push_nums']
    else:
        # New path: build the strategy through StrategyFactory; start from
        # a sync strategy as the default before dispatching on sync_mode.
        self.strategy = StrategyFactory.create_sync_strategy()
        # trainer_runtime_config = TrainerRuntimeConfig()
        # trainer_runtime_config.send_queue_size = "16"
        # trainer_runtime_config.thread_pool_size="32"
        # trainer_runtime_config.max_merge_var_num="16"
        # trainer_runtime_config.is_sgd_communicator="0"
        if args.run_params["sync_mode"] == "sync":
            self.strategy = StrategyFactory.create_sync_strategy()
        elif args.run_params["sync_mode"] == "half_async":
            self.strategy = StrategyFactory.create_half_async_strategy()
        elif args.run_params["sync_mode"] == "async":
            self.strategy = StrategyFactory.create_async_strategy()
            # Async mode disables memory_optimize in the build strategy.
            build_strategy = self.strategy.get_build_strategy()
            build_strategy.memory_optimize = False
            self.strategy.set_build_strategy(build_strategy)
        elif args.run_params["sync_mode"] == "geo_async":
            self.strategy = StrategyFactory.create_geo_strategy(400)
            # Mirror the legacy transpiler settings onto the program config.
            program_config = self.strategy.get_program_config()
            program_config.slice_var_up = args.run_params['slice_var_up']
            program_config.enable_dc_asgd = args.run_params['enable_dc_asgd']
            #TODO: split_method=HashName, it will cause a bug, this option can open after repair
            # if args.run_params['split_method']:
            #     program_config.split_method = HashName
            # else:
            #     program_config.split_method = RoundRobin
            program_config.wait_port = args.run_params['wait_port']
            program_config.runtime_split_send_recv = args.run_params[
                'runtime_split_send_recv']
            program_config.use_hierarchical_allreduce = args.run_params[
                'use_hierarchical_allreduce']
            program_config.geo_sgd_need_push_nums = args.run_params[
                'push_nums']