def prepare_nccl2_env(self, is_local):
    """Prepare the NCCL2 environment for multi-GPU distributed training.

    :param is_local: if False and the job is distributed, transpile the
        program for NCCL2 mode using the PADDLE_* environment variables.
    :return: None
    """
    if not is_local:
        logging.debug("is_distributed: %s" % self.params["is_distributed"])
        if self.params["is_distributed"]:
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
            worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
            current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
            worker_endpoints = worker_endpoints_env.split(",")
            trainers_num = len(worker_endpoints)
            logging.debug(
                "worker_endpoints:{} trainers_num:{} "
                "current_endpoint:{} trainer_id:{}".format(
                    worker_endpoints, trainers_num, current_endpoint,
                    trainer_id))
            # prepare nccl2 env.
            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            t = fluid.DistributeTranspiler(config=config)
            t.transpile(
                trainer_id,
                trainers=worker_endpoints_env,
                current_endpoint=current_endpoint,
                program=self.train_program
                if self.params["is_do_train"] else self.test_program,
                startup_program=self.startup_program)
            self.num_trainers = trainers_num
            self.trainer_id = trainer_id
def pserver_prepare(args, train_prog, startup_prog):
    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = args.split_var
    t = fluid.DistributeTranspiler(config=config)
    envs = args.dist_env
    training_role = envs["training_role"]
    t.transpile(
        envs["trainer_id"],
        program=train_prog,
        pservers=envs["pserver_endpoints"],
        trainers=envs["num_trainers"],
        sync_mode=not args.async_mode,
        startup_program=startup_prog)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(envs["current_endpoint"])
        pserver_startup_program = t.get_startup_program(
            envs["current_endpoint"],
            pserver_program,
            startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either '
            'TRAINER or PSERVER')
def transpile2dist():
    # Legacy variant: assemble the endpoint list from PADDLE_INIT_PSERVERS
    # ("ip,ip...") and PADDLE_INIT_PORT:
    #   eplist = [':'.join([ip, port]) for ip in pserver_ips.split(",")]
    #   pserver_endpoints = ",".join(eplist)
    pserver_endpoints = os.getenv("PSERVERS")  # all pserver endpoints
    print("pserver endpoints: ", pserver_endpoints)
    trainers = int(os.getenv("TRAINERS"))  # total trainer count
    print("trainers total: ", trainers)
    trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", "0"))
    # current pserver endpoint (legacy: os.getenv("POD_IP") + ":" + port)
    current_endpoint = os.getenv("SERVER_ENDPOINT")
    # the training role: trainer/pserver
    role = os.getenv("TRAINING_ROLE", "TRAINER")
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers)
    return t, role, current_endpoint, trainer_id
def train():
    args = parse_args()
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)
    loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)
    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, data_list, loss, auc_var,
                   batch_auc_var, 1, 0)
    else:
        logger.info("run dist training")
        t = fluid.DistributeTranspiler()
        t.transpile(
            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
        if args.role == "pserver":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(
                args.current_endpoint, pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, data_list, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=trainer_id,
        program=main_program,
        pservers=pserver_endpoints,
        trainers=trainers)
    return t
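# Usage sketch (an assumption, not from the source): typical driver code
# around get_transpiler(). The env-var names mirror the other snippets here;
# run_trainer_loop() is a hypothetical placeholder.
import os

import paddle.fluid as fluid


def example_role_dispatch():
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    pserver_endpoints = os.getenv("PADDLE_PSERVER_ENDPOINTS",
                                  "127.0.0.1:6174")
    trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "127.0.0.1:6174")
    role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
    t = get_transpiler(trainer_id, fluid.default_main_program(),
                       pserver_endpoints, trainers)
    if role == "PSERVER":
        prog = t.get_pserver_program(current_endpoint)
        startup = t.get_startup_program(current_endpoint, prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(startup)
        exe.run(prog)  # blocks here, serving parameters
    else:
        run_trainer_loop(t.get_trainer_program())  # hypothetical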
def train(nn_type, use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    if nn_type == 'mlp':
        net_conf = mlp
    else:
        net_conf = conv_net
    prediction, avg_loss, acc = net_conf(img, label)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_loss)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)

    def train_loop(main_program):
        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())
        st = fluid.ExecutionStrategy()
        st.num_threads = 1
        st.allow_op_delay = False
        exe = fluid.ParallelExecutor(
            use_cuda, loss_name=avg_loss.name, exec_strategy=st)
        for pass_id in range(100):
            for batch_id, data in enumerate(train_reader()):
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
                print(loss)

    # Endpoints may also be assembled from PADDLE_PSERVER_PORT and
    # PADDLE_PSERVER_IPS ("ip,ip...") as in the other snippets here.
    pserver_endpoints = os.getenv("PADDLE_PSERVER_ENDPOINTS")
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
    if training_role == "PSERVER":
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint,
                                                pserver_prog)
        ps_exe = fluid.Executor(fluid.CPUPlace())
        ps_exe.run(pserver_startup)
        ps_exe.run(pserver_prog)
    elif training_role == "TRAINER":
        train_loop(t.get_trainer_program())
def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    port = os.getenv("PADDLE_INIT_PORT", port)
    pserver_ips = os.getenv("PADDLE_INIT_PSERVERS", ip)  # ip,ip...
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
    trainers = int(os.getenv("TRAINERS", trainer_count))
    current_endpoint = os.getenv("POD_IP", ip) + ":" + port
    trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", trainer_id))
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    pserver_prog = t.get_pserver_program(current_endpoint)
    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
    remove_ps_flag(os.getpid())
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    pserver_endpoints = ip + ":" + port
    current_endpoint = ip + ":" + port
    config = fluid.DistributeTranspilerConfig()
    config.sync_mode = sync_mode
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    pserver_prog = t.get_pserver_program(current_endpoint)
    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
def dist_transpile(trainer_id, args, train_prog, startup_prog):
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    training_role = os.getenv("PADDLE_TRAINING_ROLE")
    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = not args.no_split_var
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        program=train_prog,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=not args.async_mode,
        startup_program=startup_prog)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(
            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either '
            'TRAINER or PSERVER')
def get_transpiler(trainer_id,
                   main_program,
                   pserver_endpoints,
                   trainers,
                   sync_mode,
                   dc_asgd=False,
                   current_endpoint=None,
                   nccl_comm_num=1,
                   hogwild_mode=False):
    # NOTE: delay importing fluid until runtime, or else forking worker
    # processes will cause an error.
    config = fluid.DistributeTranspilerConfig()
    config.enable_dc_asgd = dc_asgd
    config.sync_mode = sync_mode
    config.runtime_split_send_recv = hogwild_mode
    if nccl_comm_num > 1:
        config.nccl_comm_num = nccl_comm_num
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id=trainer_id,
        program=main_program,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode,
        current_endpoint=current_endpoint)
    return t
def _transpiler_instance(self):
    main = self.get_main_program()
    t = fluid.DistributeTranspiler()
    t.transpile(
        self.trainer_id,
        program=main,
        pservers=self.pserver_eps,
        trainers=self.trainers)
    return t
def nccl2_prepare(trainer_id, startup_prog, main_prog):
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
        startup_program=startup_prog,
        program=main_prog)
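# Sketch of the environment nccl2_prepare() expects (inferred from the env
# vars it reads; the addresses are illustrative): every worker must see the
# full endpoint list, its own endpoint, and its rank within that list.
import os

os.environ["PADDLE_TRAINER_ENDPOINTS"] = \
    "192.168.1.1:6170,192.168.1.2:6170"  # all workers, in rank order
os.environ["PADDLE_CURRENT_ENDPOINT"] = "192.168.1.1:6170"  # this worker
# nccl2_prepare(0, startup_prog, main_prog) then inserts the gen_nccl_id
# bootstrap ops into startup_prog (see test_nccl2_transpile further below).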
def train():
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    loss = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_loss = fluid.layers.mean(loss)
    opt = fluid.optimizer.SGD(learning_rate=0.001)
    opt.minimize(avg_loss)
    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
    exe = fluid.Executor(place)
    # fetch the distributed training environment settings
    training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "localhost")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "localhost") + ":" + port
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=False)
    if training_role == "PSERVER":
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(startup_prog)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()
        exe.run(fluid.default_startup_program())
        for epoch in range(EPOCH_NUM):
            for batch_id, batch_data in enumerate(train_reader()):
                avg_loss_value, = exe.run(
                    trainer_prog,
                    feed=feeder.feed(batch_data),
                    fetch_list=[avg_loss])
                if (batch_id + 1) % 10 == 0:
                    print("Epoch: {0}, Batch: {1}, loss: {2}".format(
                        epoch, batch_id, avg_loss_value[0]))
        # destroy this trainer's resources on the pserver nodes
        exe.close()
    else:
        raise AssertionError(
            "PADDLE_TRAINING_ROLE should be one of [TRAINER, PSERVER]")
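# A minimal local smoke-test launcher for the train() above (an assumption,
# not from the source): spawn one pserver and one trainer on this machine
# with the env vars train() reads. The script name "dist_fit_a_line.py" is
# hypothetical.
import os
import subprocess


def launch_local_cluster():
    base = dict(os.environ,
                PADDLE_PSERVER_PORT="6174",
                PADDLE_PSERVER_IPS="127.0.0.1",
                PADDLE_TRAINERS="1",
                PADDLE_CURRENT_IP="127.0.0.1",
                PADDLE_TRAINER_ID="0")
    ps = subprocess.Popen(
        ["python", "dist_fit_a_line.py"],
        env=dict(base, PADDLE_TRAINING_ROLE="PSERVER"))
    tr = subprocess.Popen(
        ["python", "dist_fit_a_line.py"],
        env=dict(base, PADDLE_TRAINING_ROLE="TRAINER"))
    tr.wait()       # the trainer exits when training finishes...
    ps.terminate()  # ...then the pserver is shut down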
def nccl2_prepare(args, startup_prog):
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    envs = args.dist_env
    t.transpile(
        envs["trainer_id"],
        trainers=','.join(envs["trainer_endpoints"]),
        current_endpoint=envs["current_endpoint"],
        startup_program=startup_prog)
def _transpiler_instance(self, config=None, sync_mode=True):
    if not self.transpiler:
        main = self.get_main_program()
        self.transpiler = fluid.DistributeTranspiler(config=config)
        self.transpiler.transpile(
            self.trainer_id,
            program=main,
            pservers=self.pserver_eps,
            trainers=self.trainers,
            sync_mode=sync_mode)
    return self.transpiler
def train():
    args = parse_args()
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)
    loss, auc_var, batch_auc_var, py_reader, _ = ctr_dnn_model(
        args.embedding_size, args.sparse_feature_dim)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    optimizer.minimize(loss)
    if args.cloud_train:
        # the port of all pservers, needed by both trainer and pserver
        port = os.getenv("PADDLE_PORT", "6174")
        # comma separated ips of all pservers, needed by trainer and pserver
        pserver_ips = os.getenv("PADDLE_PSERVERS", "")
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        args.endpoints = ",".join(eplist)
        args.trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        args.current_endpoint = os.getenv("POD_IP", "localhost") + ":" + port
        args.role = os.getenv("TRAINING_ROLE", "TRAINER")
        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        args.is_local = bool(int(os.getenv("PADDLE_IS_LOCAL", 0)))
    if args.is_local:
        logger.info("run local training")
        main_program = fluid.default_main_program()
        train_loop(args, main_program, py_reader, loss, auc_var,
                   batch_auc_var, 1, 0)
    else:
        logger.info("run dist training")
        t = fluid.DistributeTranspiler()
        t.transpile(
            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
        if args.role == "pserver" or args.role == "PSERVER":
            logger.info("run pserver")
            prog = t.get_pserver_program(args.current_endpoint)
            startup = t.get_startup_program(
                args.current_endpoint, pserver_program=prog)
            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif args.role == "trainer" or args.role == "TRAINER":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()
            train_loop(args, train_prog, py_reader, loss, auc_var,
                       batch_auc_var, args.trainers, args.trainer_id)
        else:
            raise ValueError(
                'PADDLE_TRAINING_ROLE environment variable must be either '
                'TRAINER or PSERVER')
def _build_trainer_program_for_job(self,
                                   trainer_id=0,
                                   program=None,
                                   ps_endpoints=[],
                                   trainers=0,
                                   sync_mode=True,
                                   startup_program=None,
                                   job=None):
    transpiler = fluid.DistributeTranspiler()
    transpiler.transpile(
        trainer_id,
        program=program,
        pservers=",".join(ps_endpoints),
        trainers=trainers,
        sync_mode=sync_mode,
        startup_program=startup_program)
    main = transpiler.get_trainer_program(wait_port=False)
    job._trainer_startup_programs.append(startup_program)
    job._trainer_main_programs.append(main)
def nccl2_prepare(args, startup_prog, main_prog):
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "127.0.0.1:6170")
    trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                  "127.0.0.1:6170")
    t.transpile(
        trainer_id,
        trainers=trainer_endpoints,
        current_endpoint=current_endpoint,
        startup_program=startup_prog,
        program=main_prog)
def dist_transpile(trainer_id, args, train_prog, startup_prog):
    if trainer_id < 0:
        return None, None
    # the port of all pservers, needed by both trainer and pserver
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    # comma separated ips of all pservers, needed by trainer and pserver
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    # total number of workers/trainers in the job, needed by trainer and
    # pserver
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # the IP of the local machine, needed by pserver only
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")
    print("TRAINING_ROLE:", training_role)
    with fluid.program_guard(train_prog, startup_prog):
        config = fluid.DistributeTranspilerConfig()
        config.slice_var_up = not args.no_split_var
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            # NOTE: *MUST* use train_prog, for we are using with guard to
            # generate different programs for train and test.
            program=train_prog,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=not args.async_mode)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(
            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either '
            'TRAINER or PSERVER')
def init_distribuition_env(program):
    if status.mode == DistributionMode.LOCAL:
        log.info('Initializing local training')
    elif status.mode == DistributionMode.NCCL:
        config = F.DistributeTranspilerConfig()
        config.mode = "nccl2"
        F.DistributeTranspiler(config=config).transpile(
            status.replica_id,
            trainers=','.join(status._env),
            current_endpoint=status._this,
            program=program.train_program,
            startup_program=program.startup_program)
        log.info('Initializing distributed training with config %s' %
                 (repr(config)))
        if status.is_master:
            sleep(30)
def get_transpiler(trainer_id,
                   main_program,
                   pserver_endpoints,
                   trainers,
                   sync_mode,
                   dc_asgd=False):
    # NOTE: delay importing fluid until runtime, or else forking worker
    # processes will cause an error.
    config = fluid.DistributeTranspilerConfig()
    config.enable_dc_asgd = dc_asgd
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id=trainer_id,
        program=main_program,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    return t
def _define_pserver_executor(self, pserver_args, train_program,
                             startup_program, test_program):
    sync_mode = pserver_args['sync_mode']
    role = pserver_args['role']
    trainer_id = pserver_args['trainer_id']  # the actual trainer id
    trainers = pserver_args['trainers']
    current_endpoint = pserver_args['current_endpoint']
    endpoints = pserver_args['endpoints']  # "ip:port,ip:port" or "ip,ip;port"

    def _process_endpoints(endpoints):
        if ';' not in endpoints:
            return endpoints
        ips, port = endpoints.split(';')
        return ','.join(['%s:%s' % (ip, port) for ip in ips.split(',')])

    endpoints = _process_endpoints(endpoints)
    dist_config = fluid.DistributeTranspilerConfig()
    dist_config.slice_var_up = False
    t = fluid.DistributeTranspiler(config=dist_config)
    t.transpile(
        trainer_id,
        program=train_program,
        startup_program=startup_program,
        pservers=endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    if role == "PSERVER":
        ps_prog, ps_startup = t.get_pserver_programs(current_endpoint)
        # use CPU to execute the pserver
        self.base_exe = fluid.Executor(fluid.CPUPlace())
        self.base_exe.run(ps_startup)
        # the PSERVER loads the model; the TRAINER saves it
        self.ckp_step = self.load_model(self.model_dir)
        self.base_exe.run(ps_prog)  # blocks here
        exit()
    elif role == "TRAINER":
        place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
        self.base_exe = fluid.Executor(place)
        self.base_exe.run(startup_program)
        train_program = t.get_trainer_program()
        self.ckp_step = self.get_lastest_checkpoint(self.model_dir)
        self._define_parallel_executor(train_program, test_program)
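# Standalone sketch of the normalization done by _process_endpoints() above,
# with illustrative (assumed) addresses:
def normalize_endpoints(endpoints):
    """Accept "ip:port,ip:port" as-is, or expand "ip,ip;port"."""
    if ';' not in endpoints:
        return endpoints
    ips, port = endpoints.split(';')
    return ','.join(['%s:%s' % (ip, port) for ip in ips.split(',')])


assert normalize_endpoints("10.0.0.1,10.0.0.2;6174") == \
    "10.0.0.1:6174,10.0.0.2:6174"
assert normalize_endpoints("10.0.0.1:6174") == "10.0.0.1:6174"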
def train(args):
    if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
        os.mkdir(args.model_output_dir)
    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path,
                                            args.train_data_dir, filelist, 0,
                                            1)
    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()
    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        args.embedding_size,
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)
    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))
    optimizer.minimize(loss)
    logger.info("run dist training")
    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
    if args.role == "pserver":
        print("run pserver")
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif args.role == "trainer":
        print("run trainer")
        train_loop(args,
                   t.get_trainer_program(), word2vec_reader, py_reader, loss,
                   args.trainer_id, id_frequencys_pow)
def append_nccl2_prepare(trainer_id, startup_prog):
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    port = os.getenv("PADDLE_PSERVER_PORT")
    worker_ips = os.getenv("PADDLE_TRAINER_IPS")
    worker_endpoints = []
    for ip in worker_ips.split(","):
        worker_endpoints.append(':'.join([ip, port]))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
    num_trainers = len(worker_endpoints)
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        trainers=','.join(worker_endpoints),
        current_endpoint=current_endpoint,
        startup_program=startup_prog)
    return num_trainers, trainer_id
def _build_server_programs_for_job(self,
                                   program=None,
                                   ps_endpoints=[],
                                   trainers=0,
                                   sync_mode=True,
                                   startup_program=None,
                                   job=None):
    transpiler = fluid.DistributeTranspiler()
    trainer_id = 0
    transpiler.transpile(
        trainer_id,
        program=program,
        pservers=",".join(ps_endpoints),
        trainers=trainers,
        sync_mode=sync_mode,
        startup_program=startup_program)
    job.set_server_endpoints(ps_endpoints)
    for endpoint in ps_endpoints:
        main_prog = transpiler.get_pserver_program(endpoint)
        startup_prog = transpiler.get_startup_program(endpoint, main_prog)
        job._server_startup_programs.append(startup_prog)
        job._server_main_programs.append(main_prog)
def test_nccl2_transpile(self):
    if fluid.core.is_compiled_with_cuda():  # test nccl2 only with cuda
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            self.net_conf()
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            0,
            trainers="127.0.0.1:6174,127.0.0.1:6175",
            current_endpoint="127.0.0.1:6174",
            startup_program=startup)
        print([op.type for op in startup.global_block().ops])
        self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
        self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
                                 trainer_id):
    remove_ps_flag(os.getpid())
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    ps1 = ip + ":" + str(int(port) + 1)
    ps2 = ip + ":" + port
    pserver_endpoints = ps1 + "," + ps2
    config = fluid.DistributeTranspilerConfig()
    config.sync_mode = sync_mode
    config.slice_var_up = False
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    pserver_prog = t.get_pserver_program(ps2)
    # pserver2 holds no parameters, so its optimize block is empty
    assert (len(pserver_prog.blocks) == 2)
    assert (len(pserver_prog.blocks[1].ops) == 0)
    pserver_startup = t.get_startup_program(ps2, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
def train(args):
    if args.enable_ce:
        SEED = 102
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
    use_cuda = True if args.use_cuda else False
    parallel = True if args.parallel else False
    print("use_cuda:", use_cuda, "parallel:", parallel)
    train_reader, vocab_size = utils.construct_train_data(
        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
                                    args.hidden_size)
    # train program
    train_input_data, cos_pos, avg_cost, acc = ssr.train()
    # optimization to minimize the loss
    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    optimizer.minimize(avg_cost)
    print("run distribute training")
    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
    if args.role == "pserver":
        print("run pserver")
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif args.role == "trainer":
        print("run trainer")
        train_loop(t.get_trainer_program(), avg_cost, acc, train_input_data,
                   place, args, train_reader)
def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
    PASS_NUM = 100
    EMBED_SIZE = 32
    HIDDEN_SIZE = 256
    N = 5
    BATCH_SIZE = 32
    IS_SPARSE = is_sparse

    def __network__(words):
        embed_first = fluid.layers.embedding(
            input=words[0],
            size=[dict_size, EMBED_SIZE],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr='shared_w')
        embed_second = fluid.layers.embedding(
            input=words[1],
            size=[dict_size, EMBED_SIZE],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr='shared_w')
        embed_third = fluid.layers.embedding(
            input=words[2],
            size=[dict_size, EMBED_SIZE],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr='shared_w')
        embed_forth = fluid.layers.embedding(
            input=words[3],
            size=[dict_size, EMBED_SIZE],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr='shared_w')
        concat_embed = fluid.layers.concat(
            input=[embed_first, embed_second, embed_third, embed_forth],
            axis=1)
        hidden1 = fluid.layers.fc(
            input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
        predict_word = fluid.layers.fc(
            input=hidden1, size=dict_size, act='softmax')
        cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
        avg_cost = fluid.layers.mean(cost)
        return avg_cost, predict_word

    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)
    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
    if not is_parallel:
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])
    else:
        raise NotImplementedError()
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)
    train_reader = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(
        feed_list=[first_word, second_word, third_word, forth_word,
                   next_word],
        place=place)

    def train_loop(main_program):
        exe.run(fluid.default_startup_program())
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                avg_cost_np = exe.run(
                    main_program, feed=feeder.feed(data),
                    fetch_list=[avg_cost])
                if avg_cost_np[0] < 5.0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            ['firstw', 'secondw', 'thirdw', 'forthw'],
                            [predict_word], exe)
                    return
                if math.isnan(float(avg_cost_np[0])):
                    sys.exit("got NaN loss, training failed.")
        raise AssertionError(
            "Cost is too large {0:2.2}".format(avg_cost_np[0]))

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
def train_main(use_cuda, is_sparse, is_local=True):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    context = encoder(is_sparse)
    rnn_out = decoder_train(context, is_sparse)
    label = pd.data(
        name="target_language_next_word",
        shape=[1],
        dtype='int64',
        lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)
    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.1))
    optimize_ops, params_grads = optimizer.minimize(avg_cost)
    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)
    exe = Executor(place)

    def train_loop(main_program):
        exe.run(framework.default_startup_program())
        batch_id = 0
        for pass_id in xrange(1):
            for data in train_data():
                word_data = to_lodtensor(map(lambda x: x[0], data), place)
                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
                outs = exe.run(
                    main_program,
                    feed={
                        'src_word_id': word_data,
                        'target_language_word': trg_word,
                        'target_language_next_word': trg_word_next
                    },
                    fetch_list=[avg_cost])
                avg_cost_val = np.array(outs[0])
                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                      " avg_cost=" + str(avg_cost_val))
                if batch_id > 3:
                    break
                batch_id += 1

    if is_local:
        train_loop(framework.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())