def test_pslib_1(self):
    """Test cases for pslib."""
    import os
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    # single trainer / single pserver endpoints
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker(
        init_timeout_seconds=100,
        run_timeout_seconds=100,
        http_ip_port="127.0.0.1:36003")
    # role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    # fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="float32", lod_level=1, append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
        # exercise the private kv-server helper via name mangling
        http_server_d = {}
        http_server_d["running"] = False
        size_d = {}
        role_maker._GeneralRoleMaker__start_kv_server(http_server_d, size_d)
    except:
        print("do not support pslib test, skip")
        return
    # exercise the mock barrier implementation
    from paddle.fluid.incubate.fleet.base.role_maker import MockBarrier
    mb = MockBarrier()
    mb.barrier()
    mb.barrier_all()
    mb.all_reduce(1)
    mb.all_gather(1)
    # switch to fresh ports and run the real role generation
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36005"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36005"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36006"
    os.environ["PADDLE_IS_BARRIER_ALL_ROLE"] = "0"
    role_maker = GeneralRoleMaker(path="test_mock1")
    role_maker.generate_role()
def test_pslib_1(self):
    """Test cases for pslib."""
    import os
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        emb = fluid.layers.embedding(input=show, size=[1, 1], \
            is_sparse=True, is_distributed=True, \
            param_attr=fluid.ParamAttr(name="embedding"))
        fc = fluid.layers.fc(input=emb, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    # run minimize once per supported embed_sparse_optimizer
    strategy = {}
    strategy["embedding"] = {}
    strategy["embedding"]["sparse_accessor_class"] = "DownpourUnitAccessor"
    strategy["embedding"]["embed_sparse_optimizer"] = "naive"
    try:
        adam1 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam1 = fleet.distributed_optimizer(adam1, strategy=strategy)
        adam1.minimize([cost], [scope])
        strategy["embedding"]["embed_sparse_optimizer"] = "adagrad"
        adam2 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam2 = fleet.distributed_optimizer(adam2, strategy=strategy)
        adam2.minimize([cost], [scope])
        strategy["embedding"]["embed_sparse_optimizer"] = "adam"
        adam3 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam3 = fleet.distributed_optimizer(adam3, strategy=strategy)
        adam3.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
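# For reference, a sketch of the strategy dict shape exercised by the tests in
# this file; the keys and accessor/optimizer names below all appear in the
# tests themselves, but this particular combination is only an illustrative,
# assumed-valid example, not a recommended configuration.
example_strategy = {
    "embedding": {
        # accessors used in these tests: "DownpourUnitAccessor",
        # "DownpourSparseValueAccessor", "DownpourCtrAccessor"
        "sparse_accessor_class": "DownpourUnitAccessor",
        # one of "naive", "adagrad", "adam"
        "embed_sparse_optimizer": "adagrad",
    }
}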
def test_pslib_1(self):
    """Test cases for pslib."""
    import os
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    # role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    # fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        emb = fluid.layers.embedding(input=show, size=[1, 1], \
            is_sparse=True, is_distributed=True, \
            param_attr=fluid.ParamAttr(name="embedding"))
        bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
        bow = fluid.layers.data_norm(input=bow, epsilon=1e-4, name="norm")
        fc = fluid.layers.fc(input=bow, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(
            adam,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor"
                }
            })
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    try:
        # worker should call these methods instead of server
        # the following is only for test when with_pslib=off
        def test_func():
            """
            it is only a test function
            """
            return True

        fleet._role_maker.is_first_worker = test_func
        fleet._role_maker._barrier_worker = test_func
        fleet.save_model("./model_000")
        fleet.save_one_table(0, "./model_001")
        fleet.save_one_table(0, "./model_002", prefix="hahaha")
        fleet.load_model("./model_0003")
        fleet.load_one_table(0, "./model_004")
    except:
        print("do not support pslib test, skip")
        return
def test_pslib_1(self):
    """Test cases for pslib."""
    import os
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="float32", lod_level=1, append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    fleet.clear_one_table(0)
    # _all_reduce should raise before the MPI role maker is initialized
    from paddle.fluid.incubate.fleet.base.role_maker import \
        MPISymetricRoleMaker
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2])
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "min")
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "max")
    except:
        print("catch expected error of not inited")
    try:
        role = MPISymetricRoleMaker()
        role._all_reduce([1], [2], "unknown")
    except:
        print("catch expected error of unknown type")
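# A minimal helper (not part of the original tests) that consolidates the
# environment setup repeated in the cases above. The name _set_ps_env and its
# defaults are illustrative assumptions, not an existing Paddle API.
def _set_ps_env(trainer_endpoint="127.0.0.1:36001",
                pserver_endpoint="127.0.0.1:36002"):
    """Configure a single-trainer / single-pserver test environment."""
    import os
    os.environ["POD_IP"] = trainer_endpoint.split(":")[0]
    os.environ["PADDLE_PORT"] = trainer_endpoint.split(":")[1]
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = trainer_endpoint
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = pserver_endpoint
    os.environ["PADDLE_TRAINER_ID"] = "0"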
def test_pslib_2(self):
    """Test cases for pslib."""
    import os
    import paddle
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINERS_NUM"] = "1"
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    try:
        fleet.init(None)
    except:
        print("no mpi4py, skip test_pslib_2")
        return
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="float32", lod_level=1, append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    # a wrong TRAINING_ROLE should raise during generate_role()
    os.environ["TRAINING_ROLE"] = "wrong"
    try:
        role1 = GeneralRoleMaker(path="./test_gloo_1")
        role1.generate_role()
    except:
        print("catch expected error of wrong TRAINING_ROLE")
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
    role2 = GeneralRoleMaker(path="./test_gloo_2")
    role2._finalize()
    role2._all_gather(1)
    role2._all_gather(1)
    role2._barrier_server()
    role2._all_gather(1)
    role3 = GeneralRoleMaker(path="./test_gloo_3")
    role3._worker_gather(1)
    role3._worker_gather(1)
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    role4 = GeneralRoleMaker(path="./test_gloo_4")
    role4._worker_gather(1)
    role4._get_rank()
    role4._get_size()
    role4._all_comm.init()
    # exercise each role query twice
    role5 = GeneralRoleMaker(path="./test_gloo_5")
    role5.get_local_endpoint()
    role5.get_local_endpoint()
    role6 = GeneralRoleMaker(path="./test_gloo_6")
    role6.get_trainer_endpoints()
    role6.get_trainer_endpoints()
    role7 = GeneralRoleMaker(path="./test_gloo_7")
    role7.get_pserver_endpoints()
    role7.get_pserver_endpoints()
    role8 = GeneralRoleMaker(path="./test_gloo_8")
    role8.is_worker()
    role8.is_worker()
    role9 = GeneralRoleMaker(path="./test_gloo_9")
    role9.is_server()
    role9.is_server()
    role10 = GeneralRoleMaker(path="./test_gloo_10")
    role10.is_first_worker()
    role10.is_first_worker()
    role11 = GeneralRoleMaker(path="./test_gloo_11")
    role11.worker_index()
    role11.worker_index()
    role12 = GeneralRoleMaker(path="./test_gloo_12")
    role12.server_index()
    role12.server_index()
    role13 = GeneralRoleMaker(path="./test_gloo_13")
    role13.worker_num()
    role13.worker_num()
    role14 = GeneralRoleMaker(path="./test_gloo_14")
    role14.server_num()
    role14.server_num()
    role15 = GeneralRoleMaker(path="./test_gloo_15")
    role15._barrier_worker()
    role15._barrier_worker()
    role16 = GeneralRoleMaker(path="./test_gloo_16")
    role16._barrier_all()
    role16._barrier_all()
    role17 = GeneralRoleMaker(path="./test_gloo_17")
    role17._barrier_server()
    role17._barrier_server()
    role18 = GeneralRoleMaker(path="./test_gloo_18")
    role18._worker_num()
    role18._worker_num()
    role19 = GeneralRoleMaker(path="./test_gloo_19")
    role19._server_num()
    role19._server_num()
    role20 = GeneralRoleMaker(path="./test_gloo_20")
    a = [1]
    b = [0]
    role20._all_reduce(a, b)
    role21 = GeneralRoleMaker(path="./test_gloo_21")
    role21.all_reduce_worker([], [])
    role21.all_reduce_worker([], [])
    role21.barrier_worker()
    role21.barrier_all()
    role22 = GeneralRoleMaker(path="./test_gloo_22")
    role22._get_rank()
    role22._get_rank()
    os.environ["PADDLE_PSERVER_ID"] = "0"
    role23 = GeneralRoleMaker(path="./test_gloo_23")
    role23._get_size()
    role23._get_size()
    # feed one fake sample through InMemoryDataset and query its sizes
    with open("test_fleet_gloo_role_maker_1.txt", "w") as f:
        data = "1 1 1 1\n"
        f.write(data)
    dataset = paddle.distributed.InMemoryDataset()
    dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
    dataset._set_use_var([show, label])
    dataset.load_into_memory()
    dataset.get_memory_data_size(fleet)
    dataset.get_shuffle_data_size(fleet)
    os.remove("./test_fleet_gloo_role_maker_1.txt")

    class TmpClass():
        """
        dummy tmp class
        """

        def __init__(self):
            pass

        def all_reduce_worker(self, input, output):
            """
            dummy all reduce worker

            Args:
                input(None): fake input
                output(None): fake output
            """
            pass

        def barrier_worker(self):
            """
            dummy barrier worker
            """
            pass

    from paddle.fluid.incubate.fleet.base.fleet_base import Fleet

    class TmpFleet(Fleet):
        """
        dummy tmp fleet
        """

        def __init__(self):
            super(TmpFleet, self).__init__()
            self._role_maker = None

        def init_worker(self):
            """
            dummy init worker
            """
            pass

        def init_server(self, model_dir=None):
            """
            dummy init server

            Args:
                model_dir(None): fake model_dir
            """
            pass

        def run_server(self):
            """
            dummy run server
            """
            pass

        def stop_worker(self):
            """
            dummy stop worker
            """
            pass

        def distributed_optimizer(self, optimizer, strategy=None):
            """
            dummy distributed optimizer

            Args:
                optimizer(None): fake optimizer
                strategy(None): fake strategy
            """
            pass

        def save_inference_model(self):
            """
            dummy save inference model
            """
            pass

        def save_persistables(self):
            """
            dummy save persistables
            """
            pass

    os.environ["TRAINING_ROLE"] = "TRAINER"
    tmp = TmpFleet()
    tmp._role_maker = TmpClass()
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    tmp = RoleMakerBase()
    tmp.all_gather(1)
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    tmp.barrier_all()
    from paddle.fluid.incubate.fleet.base.role_maker import \
        MPISymetricRoleMaker
    tmp1 = MPISymetricRoleMaker()
    tmp1.all_gather(1)
    tmp1.all_gather(1)
    tmp2 = MPISymetricRoleMaker()
    tmp2.all_reduce_worker([], [])
    tmp3 = MPISymetricRoleMaker()
    tmp3.barrier_worker()
    tmp3.barrier_worker()
    tmp4 = MPISymetricRoleMaker()
    tmp4.barrier_all()
    tmp4.barrier_all()
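# Note: each GeneralRoleMaker above gets its own `path`; the gloo-based
# rendezvous writes its bootstrap files under that path, so reusing one path
# for several role makers in a single process could presumably collide.
# A minimal sketch under that assumption:
#
#     role = GeneralRoleMaker(path="./test_gloo_scratch")
#     role.generate_role()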
def test_pslib_1(self):
    """Test cases for pslib."""
    import os
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
    from paddle.fluid.incubate.fleet.parameter_server.pslib import \
        fleet_embedding, _prepare_params, _fleet_embedding, \
        _fleet_embedding_v2, FLEET_GLOBAL_DICT
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    try:
        import netifaces
    except:
        print("warning: no netifaces, skip test_pslib_1")
        return
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    role_maker = GeneralRoleMaker()
    role_maker.generate_role()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init(role_maker)
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    global FLEET_GLOBAL_DICT
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        click = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        with fleet_embedding(click_name=click.name):
            emb = fluid.layers.embedding(input=show, size=[1, 1], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
        emb = fluid.layers.data_norm(
            input=emb,
            name="a",
            epsilon=1e-4,
            param_attr={
                "batch_size": 1e4,
                "batch_sum_default": 0.0,
                "batch_square": 1e4
            })
        fc = fluid.layers.fc(input=emb, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(
            adam,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor"
                }
            })
        adam.minimize([cost], [scope])
    except:
        print("do not support pslib test, skip")
        return
    # walk _prepare_params through each of its validation error paths
    FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor"
    try:
        _prepare_params(input=show, size=[1, 1])
    except:
        print("catch expected exception of param_attr=None")
    try:
        _prepare_params(input=show, size=[1, 1], param_attr=fluid.ParamAttr())
    except:
        print("catch expected exception of name=None")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=1, param_attr=tmp)
    except:
        print("catch expected exception of size not list")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 12], param_attr=tmp)
    except:
        print("catch expected exception of size not equal")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp,
                        is_sparse=False)
    except:
        print("catch expected exception of is_sparse=False")
    try:
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp, \
            is_sparse=True, is_distributed=False)
    except:
        print("catch expected exception of is_distributed=False")
    try:
        _prepare_params(input=show, size=[-1, 1], \
            param_attr=fluid.ParamAttr(name="embedding"), \
            is_sparse=True, is_distributed=True, dtype="abc")
    except:
        print("catch expected exception of unknown dtype")
    try:
        FLEET_GLOBAL_DICT["emb_to_accessor"]["embedding"] = "unknown"
        tmp = fluid.ParamAttr(name="embedding")
        _prepare_params(input=show, size=[-1, 1], param_attr=tmp)
    except:
        print("catch expected exception of unknown accessor")
    FLEET_GLOBAL_DICT["cur_accessor"] = "DownpourCtrAccessor"
    try:
        _fleet_embedding(input=show, size=[-1, 1], is_sparse=True, \
            is_distributed=True, dtype="float32", \
            param_attr=fluid.ParamAttr(name="embedding"))
    except:
        print("catch expected exception of unknown accessor")
    try:
        _fleet_embedding_v2(input=show, size=[-1, 1], is_sparse=True, \
            is_distributed=True, dtype="float32", \
            param_attr=fluid.ParamAttr(name="embedding"))
    except:
        print("catch expected exception of unknown accessor")
    adam1 = fluid.optimizer.Adam(learning_rate=0.000005)
    adam1 = fleet.distributed_optimizer(
        adam1,
        strategy={
            "embedding": {
                "sparse_accessor_class": "DownpourSparseValueAccessor"
            }
        })
    # corrupt emb_to_table to trigger the optimizer's error paths,
    # restoring the original mapping afterwards
    try:
        pre = FLEET_GLOBAL_DICT["emb_to_table"]
        FLEET_GLOBAL_DICT["emb_to_table"] = {}
        adam1.minimize([cost], [scope])
    except:
        FLEET_GLOBAL_DICT["emb_to_table"] = pre
        print("catch expected exception of empty emb_to_table")
    try:
        pre = FLEET_GLOBAL_DICT["emb_to_table"]
        FLEET_GLOBAL_DICT["emb_to_table"] = {}
        FLEET_GLOBAL_DICT["emb_to_table"]["emb1"] = 0
        adam1.minimize([cost], [scope])
    except:
        FLEET_GLOBAL_DICT["emb_to_table"] = pre
        print("catch expected exception of error emb_to_table")
    try:
        adam2 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam2 = fleet.distributed_optimizer(adam2)
        adam2.supported_embedding_types = []
        adam2.minimize([cost], [scope])
    except:
        print("catch expected exception of embedding_types")
    try:
        adam3 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam3 = fleet.distributed_optimizer(
            adam3,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourSparseValueAccessor",
                    "sparse_embedx_dim": 999
                }
            })
        adam3.minimize([cost], [scope])
    except:
        print("catch expected exception of embedx_dim error")
    try:
        adam4 = fluid.optimizer.Adam(learning_rate=0.000005)
        adam4 = fleet.distributed_optimizer(
            adam4,
            strategy={
                "embedding": {
                    "sparse_accessor_class": "DownpourCtrAccessor",
                    "sparse_embedx_dim": 999
                }
            })
        adam4.minimize([cost], [scope])
    except:
        print("catch expected exception of embedx_dim error")
    train_program1 = fluid.Program()
    startup_program1 = fluid.Program()
    FLEET_GLOBAL_DICT["emb_to_accessor"] = {}
    with fluid.program_guard(train_program1, startup_program1):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        with fleet_embedding(click_name=click.name):
            emb = fluid.layers.embedding(input=show, size=[1, 1], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
        with fleet_embedding(click_name=click.name):
            emb1 = fluid.embedding(input=show, size=[1, 1], \
                is_sparse=True, is_distributed=True, \
                param_attr=fluid.ParamAttr(name="embedding"))
def init(self, context):
    """Initialize fleet, build per-executor models, and load datasets."""
    role_maker = None
    if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
        afs_config = self.global_config['io']['afs']
        role_maker = GeneralRoleMaker(
            hdfs_name=afs_config['fs_name'],
            hdfs_ugi=afs_config['fs_ugi'],
            path=self.global_config['output_path'] + "/gloo",
            init_timeout_seconds=1200,
            run_timeout_seconds=1200)
    fleet.init(role_maker)
    data_var_list = []
    data_var_name_dict = {}
    runnable_scope = []
    runnable_cost_op = []
    context['status'] = 'startup'

    # build a model and scope per configured executor, deduplicating data vars
    for executor in self.global_config['executor']:
        scope = fluid.Scope()
        self._exector_context[executor['name']] = {}
        self._exector_context[executor['name']]['scope'] = scope
        self._exector_context[executor['name']]['model'] = \
            model_basic.create(executor)
        model = self._exector_context[executor['name']]['model']
        self._metrics.update(model.get_metrics())
        runnable_scope.append(scope)
        runnable_cost_op.append(model.get_cost_op())
        for var in model._data_var:
            if var.name in data_var_name_dict:
                continue
            data_var_list.append(var)
            data_var_name_dict[var.name] = var

    optimizer = model_basic.YamlModel.build_optimizer({
        'metrics': self._metrics,
        'optimizer_conf': self.global_config['optimizer']
    })
    optimizer.minimize(runnable_cost_op, runnable_scope)
    for executor in self.global_config['executor']:
        scope = self._exector_context[executor['name']]['scope']
        model = self._exector_context[executor['name']]['model']
        program = model._build_param['model']['train_program']
        if not executor['is_update_sparse']:
            program._fleet_opt["program_configs"][str(
                id(model.get_cost_op().block.program))]["push_sparse"] = []
        if 'train_thread_num' not in executor:
            executor['train_thread_num'] = \
                self.global_config['train_thread_num']
        with fluid.scope_guard(scope):
            self._exe.run(model._build_param['model']['startup_program'])
        model.dump_model_program('./')

    # server init done
    if fleet.is_server():
        return 0

    self._dataset = {}
    for dataset_item in self.global_config['dataset']['data_list']:
        dataset_item['data_vars'] = data_var_list
        dataset_item.update(self.global_config['io']['afs'])
        dataset_item["batch_size"] = self.global_config['batch_size']
        self._dataset[dataset_item['name']] = \
            dataset.FluidTimeSplitDataset(dataset_item)
    # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
    #     util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
    fleet.init_worker()
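# A minimal sketch (an assumption, not a shipped configuration) of the
# global_config fields that init() reads; every key below is referenced in
# init() above, while all values are placeholders.
example_global_config = {
    'process_mode': 'brilliant_cpu',  # any other value skips GeneralRoleMaker
    'io': {'afs': {'fs_name': 'afs://example', 'fs_ugi': 'user,passwd'}},
    'output_path': './output',        # gloo files land in output_path + "/gloo"
    'executor': [{'name': 'join', 'is_update_sparse': True}],
    'optimizer': {},                  # forwarded to YamlModel.build_optimizer
    'train_thread_num': 12,           # per-executor default thread count
    'batch_size': 32,
    'dataset': {'data_list': []},
}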