def train(args):
    """Parameter-server deepwalk training entry point.

    Builds the skip-gram model from the raw edge list, initializes the fleet
    role, then either runs the PS server loop or executes worker-side
    training over a PGL graph.

    Args:
        args: parsed CLI namespace; reads edge_path, undirected, hidden_size,
            neg_num, is_sparse, is_distributed, epoch, batch_size, optimizer,
            lr, walk_len, win_size (plus whatever the helpers consume).
    """
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")
    # NOTE(review): if PADDLE_TRAINERS_NUM is unset this defaults to 0 and the
    # train_steps division below raises ZeroDivisionError — confirm the
    # launcher always exports it.
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    # Raw edge data: (edges, per-edge weights, node-name -> index map).
    data = load_raw_edges_fn(args.edge_path, args.undirected)
    edges = data[0]
    weights = data[1]
    node2idx = data[2]
    num_nodes = len(node2idx)

    model = DeepwalkModel(num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    log.info("init_role")
    init_role()

    # Total optimizer steps for the whole run, spread across devices/workers.
    train_steps = math.ceil(1. * num_nodes * args.epoch / args.batch_size /
                            num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    # SGD learning rate is scaled by the effective number of samples a batch
    # contributes (batch * walk length * window size).
    if args.optimizer == "sgd":
        args.lr *= args.batch_size * args.walk_len * args.win_size

    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        log.info("PS server mode")
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(F.default_startup_program())
        log.info("Startup done")
        fleet.init_worker()
        # Only the worker loads the sample data / graph.
        log.info("init worker done")
        print("LEO num_nodes:", num_nodes, len(edges))
        edges_feat = {}
        edges_feat["weight"] = np.array(weights)
        graph = pgl.graph.Graph(num_nodes, edges, edge_feat=edges_feat)
        # Bind the batch generator to the pyreader, then train.
        gen_func = build_gen_func(args, graph)
        pyreader.decorate_tensor_provider(gen_func)
        train_prog(exe, F.default_main_program(), loss, pyreader, args,
                   train_steps)
        print("fleet try to stop worker\r\n")
        fleet.stop_worker()
        print("Game over\r\n")
def test_single_run_ps_minimize(self):
    """Single-run PS smoke test: build a tiny 2-layer classifier, wrap SGD
    with the fleet distributed optimizer, then serve or train by role."""
    paddle.enable_static()

    # Network: 32-d input -> 64 tanh units -> 2-way softmax + cross entropy.
    x_var = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
    y_var = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
    hidden = fluid.layers.fc(input=x_var, size=64, act='tanh')
    probs = fluid.layers.fc(input=hidden, size=2, act='softmax')
    ce_cost = fluid.layers.cross_entropy(input=probs, label=y_var)
    mean_cost = paddle.mean(x=ce_cost)

    # Distributed optimizer setup.
    fleet.init()
    dist_strategy = paddle.distributed.fleet.DistributedStrategy()
    sgd_opt = fluid.optimizer.SGD(learning_rate=0.01)
    sgd_opt = fleet.distributed_optimizer(sgd_opt, strategy=dist_strategy)
    sgd_opt.minimize(mean_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        executor = fluid.Executor(fluid.CPUPlace())
        executor.run(paddle.static.default_startup_program())
        for step_id in range(10):
            fetched = executor.run(program=fluid.default_main_program(),
                                   feed=self.gen_data(),
                                   fetch_list=[mean_cost.name])
            print("worker_index: %d, step%d cost = %f" %
                  (fleet.worker_index(), step_id, fetched[0]))
def run(self):
    """Driver: initialize fleet, build the network, then act as a parameter
    server or a training worker depending on this process's role."""
    fleet.init()
    self.network()

    serving = fleet.is_server()
    if serving:
        self.run_server()
    elif fleet.is_worker():
        self.run_worker()
        fleet.stop_worker()

    logger.info("Run Success, Exit.")
def run(self):
    """Offline-inference driver: init fleet (with gloo), build the network,
    then serve parameters or run offline inference depending on role."""
    self.init_fleet_with_gloo()
    self.network()

    role_is_server = fleet.is_server()
    if role_is_server:
        self.run_server()
    elif fleet.is_worker():
        self.run_offline_infer()
        fleet.stop_worker()
        # self.record_result()  # result recording currently disabled

    logger.info("Run Success, Exit.")
def run_single_pass(self):
    """Apply a single named PS graph-transform pass and dump the result.

    Runs the pass named by config["applied_pass_name"] through either the new
    pass framework (debug_new_pass == 1) or the legacy fleet IR path, then
    writes the transformed main program to a prototxt file for inspection.
    """
    self.init_fleet_with_gloo()
    self.model = get_model(config)
    input_data = self.model.create_feeds()
    metrics = self.model.net(input_data)
    loss = self.model._cost
    user_defined_strategy = get_user_defined_strategy(config)
    learning_rate = config.get("hyper_parameters.optimizer.learning_rate")
    sync_mode = self.config.get("runner.sync_mode")
    inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
    startup_program = paddle.static.default_startup_program()
    inner_optimizer.minimize(loss, startup_program)

    if self.config['debug_new_pass'] == 1:
        # New path: drive the pass through ParameterServerOptimizer's
        # pass context (private APIs — test-only usage).
        print("entering run {} - new".format(
            str(config["applied_pass_name"])))
        from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
        ps_optimizer = ParameterServerOptimizer(inner_optimizer)
        ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
                                     user_defined_strategy)
        ps_optimizer._set_origin_programs([loss])
        ps_optimizer._init_ps_pass_context(loss, startup_program)
        _main = ps_optimizer.pass_ctx._attrs['cloned_main']
        append_send_ops_pass = new_pass(config["applied_pass_name"],
                                        ps_optimizer.pass_ctx._attrs)
        # Applies in place on the cloned main program.
        append_send_ops_pass.apply([_main], [None], ps_optimizer.pass_ctx)
    else:
        # Legacy path: fluid incubate CompileTimeStrategy + trainer_pass.
        print("entering run {} - old".format(
            str(config["applied_pass_name"])))
        from paddle.fluid.incubate.fleet.parameter_server.ir import public as public
        dist_strategy = get_distributed_strategy(user_defined_strategy)
        compiled_config = public.CompileTimeStrategy(
            loss.block.program, startup_program, dist_strategy,
            self.role_maker)
        _main = compiled_config.origin_main_program.clone()
        _startup = compiled_config.origin_startup_program.clone()
        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker
        _main = worker.append_send_ops_pass(_main, compiled_config)

    # Dump the transformed program per role for offline diffing.
    if fleet.is_server():
        _main_file = ps_log_root_dir + sync_mode + "_" + str(
            config["applied_pass_name"]) + '_debug:_' + str(
                self.config['debug_new_pass']) + '_server_main.prototxt'
        debug_program(_main_file, _main)
    elif fleet.is_worker():
        _main_file = ps_log_root_dir + sync_mode + "_" + str(
            config["applied_pass_name"]) + '_debug:_' + str(
                self.config['debug_new_pass']) + '_worker_main.prototxt'
        debug_program(_main_file, _main)
def main(args):
    """Distributed skip-gram training entry (PS mode, async strategy).

    Servers block in run_server(); workers build the walk-based dataloader,
    train, and the first worker saves the model parameters.
    """
    paddle.set_device("cpu")
    paddle.enable_static()
    fleet.init()

    # The real node count is not needed to lay out the program here;
    # a placeholder of 1 suffices.
    fake_num_nodes = 1
    py_reader, loss = StaticSkipGramModel(
        fake_num_nodes,
        args.neg_num,
        args.embed_size,
        sparse_embedding=True,
        shared_embedding=args.shared_embedding)

    adam = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    fleet.distributed_optimizer(adam, strategy).minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        executor = paddle.static.Executor(paddle.CPUPlace())
        executor.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = build_graph(args)

        # Wire the random-walk batch generator into the reader.
        dataset = ShardedDataset(graph.nodes, args.epoch)
        batcher = BatchRandWalk(graph, args.walk_len, args.win_size,
                                args.neg_num, args.neg_sample_type)
        loader = Dataloader(dataset,
                            batch_size=args.cpu_batch_size,
                            shuffle=True,
                            num_workers=args.sample_workers,
                            collate_fn=batcher)
        py_reader.set_batch_generator(lambda: loader)

        train_loss = train(executor, paddle.static.default_main_program(),
                           py_reader, loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(executor, "./model",
                                    paddle.static.default_main_program())
def main(args):
    """Distributed skip-gram training entry point (parameter-server mode).

    Servers block in run_server(); workers build the walk dataloader and
    train for args.epoch epochs.

    Fix: the dataset was previously loaded twice when args.num_nodes was
    None (once for num_nodes, once for the worker's graph); it is now loaded
    at most once and reused.
    """
    paddle.set_device("cpu")
    paddle.enable_static()
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # Load the graph at most once; the worker branch reuses it below.
    graph = None
    if args.num_nodes is None:
        graph = load(args.dataset)
        num_nodes = graph.num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        if graph is None:  # only when args.num_nodes was given explicitly
            graph = load(args.dataset)

        # Bind the random-walk batch generator.
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe, paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()
def test_barrier():
    """test ps barrier

    Exercises fleet.util.barrier() within the server group, within the
    worker group, and finally across everyone.
    """
    role = PaddleCloudRoleMaker(is_collective=False,
                                init_gloo=True,
                                path="./tmp_gloo")
    fleet.init(role)

    if fleet.is_server():
        fleet.util.barrier("server")
        print("test_all_servers barrier ... ok")
    elif fleet.is_worker():
        fleet.util.barrier("worker")
        print("test_all_workers barrier ... ok")

    # NOTE(review): barrier()'s argument is the communication-world name
    # ("worker"/"server"/"all"), but a message string is passed here —
    # looks like it should be "all"; confirm against the barrier API.
    fleet.util.barrier("test_barrier ... ok")
    print("all servers and workers arrive here")
def test_all_reduce():
    """test ps all reduce

    Sums a small vector within the server group, the worker group, and the
    whole world. Fix: the local previously named `input` shadowed the
    builtin; renamed to `data`.
    """
    role = PaddleCloudRoleMaker(is_collective=False,
                                init_gloo=True,
                                path="./tmp_gloo")
    fleet.init(role)

    if fleet.is_server():
        data = [1, 2]
        output = fleet.util.all_reduce(data, "sum", "server")
        print(output[0])
        # Holds for a single server; scales with server count otherwise.
        assert output[0] == 1
    elif fleet.is_worker():
        data = np.array([3, 4])
        output = fleet.util.all_reduce(data, "sum", "worker")
        print(output[0])
        # Holds for a single worker; scales with worker count otherwise.
        assert output[0] == 3

    # Reduce across all servers and workers. Like the original, `data` is
    # unbound here if neither branch above ran.
    output = fleet.util.all_reduce(data, "sum", "all")
    print(output)
def run_minimize(self):
    """Minimize the model loss through either the new ParameterServerOptimizer
    path or the legacy fleet.distributed_optimizer path, then dump the
    resulting main program per role for debugging.
    """
    self.init_fleet_with_gloo()
    self.model = get_model(self.config)
    print("cpu_num: {}".format(os.getenv("CPU_NUM")))
    self.input_data = self.model.create_feeds()
    self.metrics = self.model.net(self.input_data)
    loss = self.model._cost
    user_defined_strategy = get_user_defined_strategy(self.config)
    learning_rate = self.config.get(
        "hyper_parameters.optimizer.learning_rate")
    sync_mode = self.config.get("runner.sync_mode")
    inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)

    self.role_maker._generate_role()  # required: populates role info used below

    if self.config['debug_new_minimize'] == 1:
        # New path: call the PS meta-optimizer directly (private API).
        print("entering run_minimize -- new")
        from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
        ps_optimizer = ParameterServerOptimizer(inner_optimizer)
        ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
                                     user_defined_strategy)
        ps_optimizer.minimize_impl(loss)
    else:
        # Legacy path: go through the public fleet distributed optimizer.
        print("entering run_minimize -- old")
        fleet_obj = fleet.distributed_optimizer(
            inner_optimizer, user_defined_strategy)  # Fleet object
        fleet_obj.minimize(loss)

    # Dump the transformed main program per role for offline inspection.
    if fleet.is_server():
        _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str(
            self.config['debug_new_minimize']) + '_server_main.prototxt'
        debug_program(_main_file, loss.block.program)
    elif fleet.is_worker():
        _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str(
            self.config['debug_new_minimize']) + '_worker_main.prototxt'
        debug_program(_main_file, loss.block.program)
    elif self.role_maker._is_heter_worker():
        _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str(
            self.config['debug_new_minimize']
        ) + '_heter_worker_main.prototxt'
        debug_program(_main_file, loss.block.program)
def run_the_one_ps(self):
    """Exercise the refactored TheOnePSRuntime: run the PS optimizer pass,
    build the worker/server descriptors, write them to files, and finally
    dump the main program per role for debugging.
    """
    self.init_fleet_with_gloo()
    self.model = get_model(self.config)
    self.input_data = self.model.create_feeds()
    self.metrics = self.model.net(self.input_data)
    loss = self.model._cost
    user_defined_strategy = get_user_defined_strategy(self.config)
    learning_rate = self.config.get(
        "hyper_parameters.optimizer.learning_rate")
    sync_mode = self.config.get("runner.sync_mode")
    inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)

    self.role_maker._generate_role()  # required: populates role info used below

    if self.config['debug_the_one_ps'] == 1:
        print("entering run_the_one_ps -- new")

        from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
        ps_optimizer = ParameterServerOptimizer(inner_optimizer)
        ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
                                     user_defined_strategy)
        ps_optimizer.minimize_impl(loss)

        from paddle.distributed.ps.the_one_ps import TheOnePSRuntime
        _runtime_handle = TheOnePSRuntime()  # refactored TheOnePSRuntime from the ps package
        _runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs)
        if fleet.is_worker():
            worker_desc = _runtime_handle.ps_desc_builder.build_worker_desc(
            )
            with open(
                    ps_log_root_dir + sync_mode + '_' + 'new_worker_ps_desc',
                    'w') as f:
                f.write(worker_desc)
        if fleet.is_server():
            server_desc = _runtime_handle.ps_desc_builder.build_server_desc(
            )
            with open(
                    ps_log_root_dir + sync_mode + '_' + 'new_server_ps_desc',
                    'w') as f:
                f.write(server_desc)
    else:
        pass
        # NOTE: the legacy code path below is intentionally dead — kept as a
        # string literal for reference only.
        '''
        print("entering run_the_one_ps -- old")
        fleet_obj = fleet.distributed_optimizer(
            inner_optimizer, user_defined_strategy)
        fleet_obj.minimize(loss)
        if fleet.is_worker():
            worker_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=False, is_sync=False)
            server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False)
            with open(ps_log_root_dir + sync_mode + '_' + 'worker_ps_desc', 'w') as f:
                f.write(str(worker_desc) + str(server_desc))
        if fleet.is_server():
            server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False)
            with open(ps_log_root_dir + sync_mode + '_' + 'server_ps_desc', 'w') as f:
                f.write(str(server_desc) + str(fleet_obj._runtime_handle._get_fs_client_desc().to_string()))
        '''

    # Dump the main program per role for offline inspection.
    if fleet.is_server():
        _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str(
            self.config['debug_the_one_ps']) + '_server_main.prototxt'
        debug_program(_main_file, loss.block.program)
    elif fleet.is_worker():
        _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str(
            self.config['debug_the_one_ps']) + '_worker_main.prototxt'
        debug_program(_main_file, loss.block.program)
    elif self.role_maker._is_heter_worker():
        _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str(
            self.config['debug_the_one_ps']
        ) + '_heter_worker_main.prototxt'
        debug_program(_main_file, loss.block.program)
def fit():
    """Train and evaluate a CNN image classifier under fleet PS mode.

    Builds readers from local file lists, defines the network/loss/accuracy,
    wraps Adam with the fleet distributed optimizer, then runs the server
    loop or the worker train/eval loop depending on role.
    """
    EPOCH_NUM = 3
    BATCH_SIZE = 128
    type_size = 10
    # NOTE(review): the role is hard-coded to SERVER with current_id=0, so
    # the worker branch below can never run in this process — confirm this
    # is launched with a different role maker for trainers.
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    # Build the data list files; returns the number of classes.
    type_size = createDataList('F:/机器学习/CNN/train',
                               'D:/cnn/cnn.model.data' + "/")
    # Data provider for training.
    train_reader = dataReader("D:/cnn/cnn.model.data/trainer.list")
    train_reader = paddle.batch(paddle.reader.shuffle(
        reader=train_reader, buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    test_reader = dataReader("D:/cnn/cnn.model.data/test.list")
    test_reader = paddle.batch(paddle.reader.shuffle(
        reader=test_reader, buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    data_shape = [3, 32, 32]
    paddle.enable_static()
    images = fluid.layers.data(name='images', shape=data_shape,
                               dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Get the classifier network.
    predict = networkConfiguration(images, type_size)
    # Define loss and accuracy.
    cost = fluid.layers.cross_entropy(input=predict, label=label)  # cross entropy
    avg_cost = fluid.layers.mean(cost)  # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)  # accuracy from predictions and labels
    # Clone the test program BEFORE attaching the optimizer.
    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)  # optimizer
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        fleet.init_worker()
        ########## model training & evaluation ##########
        # Create the executor.
        use_cuda = False  # CPU vs GPU switch; CPU when use_cuda=False
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        print("完成")
        # Feeder maps python data into the images/label variables.
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
        for pass_id in range(EPOCH_NUM):
            # Training loop.
            for batch_id, data in enumerate(train_reader()):  # iterate train_reader
                train_cost, train_acc = exe.run(
                    program=fluid.default_main_program(),  # run the main program
                    feed=feeder.feed(data),  # feed one batch of data
                    fetch_list=[avg_cost, acc])  # fetch loss and accuracy
                # Print progress every 20 batches.
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                          (pass_id, batch_id, train_cost[0], train_acc[0]))
            # Evaluation after each pass.
            test_costs = []  # per-batch test losses
            test_accs = []  # per-batch test accuracies
            for batch_id, data in enumerate(test_reader()):
                test_cost, test_acc = exe.run(
                    program=test_program,  # run the test program
                    feed=feeder.feed(data),  # feed data
                    fetch_list=[avg_cost, acc])  # fetch loss and accuracy
                test_costs.append(test_cost[0])  # record each batch's loss
                test_accs.append(test_acc[0])  # record each batch's accuracy
            test_cost = (sum(test_costs) / len(test_costs))  # mean test loss
            test_acc = (sum(test_accs) / len(test_accs))  # mean test accuracy
            print('Test:%d, Cost:%0.5f, ACC:%0.5f' %
                  (pass_id, test_cost, test_acc))
        save(predict, "D:/cnn/cnn.model", exe)
def runtime_main(test_class):
    """CLI entry for a fleet test class: parse role/endpoint args, optionally
    run distributed inference from a saved model, otherwise train as pserver
    or trainer and optionally run distributed testing afterwards.

    Args:
        test_class: a test model class providing build_role/build_strategy/
            net/build_optimizer/run_* methods.
    """
    parser = argparse.ArgumentParser(description='Run Fleet test.')
    parser.add_argument('--role', type=str, required=True,
                        choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--trainer_endpoints', type=str, required=False,
                        default="")
    parser.add_argument('--gloo_path', type=str, required=False, default="")
    parser.add_argument('--current_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument('--mode', type=str, required=False, default='geo')
    parser.add_argument('--geo_sgd_need_push_nums', type=int, required=False,
                        default=2)
    parser.add_argument('--reader', type=str, required=False,
                        default='dataset')
    parser.add_argument('--test', type=int, required=False, default=0)
    parser.add_argument('--model_dir', type=str, required=False, default="")
    args = parser.parse_args()

    model = test_class()
    role = model.build_role(args)

    # for distributed inference: load a saved model and only run testing.
    if args.test and args.model_dir != "":
        avg_cost = model.net(args, is_train=False)
        dist_infer = DistributedInfer()
        dist_infer.init_distributed_infer_env(exe=model.get_executor(),
                                              loss=model.avg_cost,
                                              role_maker=role,
                                              dirname=args.model_dir)
        if fleet.is_worker():
            with paddle.static.program_guard(
                    main_program=dist_infer.get_dist_infer_program()):
                model.do_distributed_testing(fleet)
            fleet.stop_worker()
            return
        if fleet.is_server():
            return

    fleet.init(role)
    strategy = model.build_strategy(args)
    avg_cost = model.net(args)
    model.build_optimizer(avg_cost, strategy)

    if args.role == "pserver":
        model.run_pserver(args)
    else:
        # Trainer path: choose the reader style, then optionally test.
        if args.reader == "dataset":
            model.run_dataset_trainer(args)
        else:
            model.run_pyreader_trainer(args)
        if args.test:
            # Build a fresh inference program under separate name scopes.
            test_origin_program = paddle.static.Program()
            test_startup_program = paddle.static.Program()
            with paddle.static.program_guard(
                    main_program=test_origin_program,
                    startup_program=test_startup_program):
                with paddle.utils.unique_name.guard():
                    avg_cost = model.net(args, is_train=False)
            dist_infer = DistributedInfer(
                main_program=test_origin_program,
                startup_program=test_startup_program)
            with paddle.static.program_guard(
                    main_program=dist_infer.get_dist_infer_program()):
                model.do_distributed_testing(fleet)
        fleet.stop_worker()
# Flat PS training script: train a wide&deep model with the async (a_sync)
# strategy, then run inference on the worker.
fleet.init(is_collective=False)

model = WideDeepModel()
model.net(is_train=True)

optimizer = paddle.optimizer.SGD(learning_rate=0.0001)
strategy = fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(model.cost)

# Servers block in run_server(); workers fall through to the branch below.
if fleet.is_server():
    fleet.init_server()
    fleet.run_server()

if fleet.is_worker():
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    distributed_training(exe, model)
    # Reset metric accumulators before running inference.
    clear_metric_state(model, place)
    distributed_infer(exe, model)

    fleet.stop_worker()
def test_is_worker(self):
    """Initialize fleet with a collective cloud role maker and report
    whether this process holds the worker role."""
    fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))
    if fleet.is_worker():
        print("test fleet is worker")
def main(args):
    """Distributed skip-gram training with optional multi-CPU data parallelism.

    Servers block in run_server(); workers train for args.epoch epochs
    (using a data-parallel CompiledProgram when CPU_NUM > 1) and the first
    worker saves the persistable parameters.

    Fix: the dataset was previously loaded twice when args.num_nodes was
    None; it is now loaded at most once and reused. Also removed the
    redundant double int() conversion of cpu_num.
    """
    paddle.set_device("cpu")
    paddle.enable_static()
    fleet.init()

    # Load the graph at most once; the worker branch reuses it below.
    graph = None
    if args.num_nodes is None:
        graph = load(args.dataset)
        num_nodes = graph.num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(num_nodes, args.neg_num, args.embed_size,
                               sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        if graph is None:  # only when args.num_nodes was given explicitly
            graph = load(args.dataset)

        # Bind the random-walk batch generator.
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)

        # Use a multi-threaded data-parallel compiled program when CPU_NUM > 1.
        cpu_num = int(os.environ.get('CPU_NUM', 1))
        if cpu_num > 1:
            parallel_places = [paddle.CPUPlace()] * cpu_num
            exec_strategy = paddle.static.ExecutionStrategy()
            exec_strategy.num_threads = cpu_num
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.reduce_strategy = \
                paddle.static.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = paddle.static.CompiledProgram(
                paddle.static.default_main_program()).with_data_parallel(
                    loss_name=loss.name,
                    places=parallel_places,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
        else:
            compiled_prog = paddle.static.default_main_program()

        for epoch in range(args.epoch):
            train_loss = train(exe, compiled_prog, data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)

        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
def test_is_worker():
    """test_is_worker

    Asserts the current process holds the worker role. Fix: use the
    idiomatic truthiness assert instead of comparing to True with ==.
    """
    assert fleet.is_worker()
    print("{} ... ok".format(sys._getframe().f_code.co_name))
def test_init_worker(self):
    """Initialize the fleet worker runtime when this process holds the
    worker role (collective cloud role maker)."""
    fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))
    if fleet.is_worker():
        fleet.init_worker()