def test_barrier(self):
    try:
        import netifaces
    except ImportError:
        print("warning: no netifaces, skip test_barrier")
        return

    gloo = fluid.core.Gloo()
    gloo.set_rank(0)
    gloo.set_size(1)
    gloo.set_prefix("123")
    gloo.set_iface("lo")
    gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
    gloo.init()

    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_endpoints=["127.0.0.1:6003"],
        server_endpoints=["127.0.0.1:6001"])
    role._node_type_comm = gloo
    role._role_is_generated = True
    fleet_util._set_role_maker(role)

    fleet_util.barrier("worker")
def test_all_gather(self):
    try:
        import netifaces
    except ImportError:
        print("warning: no netifaces, skip test_all_gather")
        return

    gloo = fluid.core.Gloo()
    gloo.set_rank(0)
    gloo.set_size(1)
    gloo.set_prefix("123")
    gloo.set_iface("lo")
    gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
    gloo.init()

    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_endpoints=["127.0.0.1:6003"],
        server_endpoints=["127.0.0.1:6001"])
    role._node_type_comm = gloo
    role._all_comm = gloo
    role._role_is_generated = True
    fleet_util._set_role_maker(role)

    output = fleet_util.all_gather(1, comm_world="all")
    print(output)
    # self.assertTrue(len(output) == 1 and output[0] == 1)
    self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
def test(self):
    endpoints = [
        "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
        "127.0.0.1:36007"
    ]
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=endpoints)
    fleet.init(role)

    loss, acc, _ = self.net()
    optimizer = fluid.optimizer.Adam(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=base_lr,
            decay_steps=500,
            decay_rate=0.969,
            staircase=True))

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)
def test_print_on_rank():
    """test ps print on rank"""
    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    fleet.util.print_on_rank("test_print_on_rank0 ... ok", 0)
def test_tr_rolemaker(self):
    ro = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
        role=role_maker.Role.WORKER,
        current_id=0,
        worker_num=2)

    self.assertIn("127.0.0.1:36001", ro._get_pserver_endpoints())
    self.assertTrue(ro._is_worker())
    self.assertEqual(ro._role_id(), 0)
def test_ps_rolemaker(self):
    ro = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
        role=role_maker.Role.SERVER,
        current_id=0,
        worker_num=2)

    self.assertEqual(ro._server_num(), 2)
    ro._generate_role()
    self.assertTrue(ro._is_server())
    self.assertEqual(ro._role_id(), 0)
def build_role(self, args):
    if args.role.upper() == "PSERVER":
        role = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            path=args.gloo_path,
            current_id=args.current_id,
            role=role_maker.Role.SERVER,
            worker_endpoints=args.trainer_endpoints.split(","),
            server_endpoints=args.endpoints.split(","))
    else:
        role = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            path=args.gloo_path,
            current_id=args.current_id,
            role=role_maker.Role.WORKER,
            worker_endpoints=args.trainer_endpoints.split(","),
            server_endpoints=args.endpoints.split(","))
    self.role = role
    return role
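# A minimal sketch of how build_role might be driven from an argument
# parser. The option names mirror the attributes read above (args.role,
# args.gloo_path, args.current_id, args.trainer_endpoints, args.endpoints);
# the parser itself is an assumption, not part of the original harness.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--role", type=str, default="TRAINER")  # TRAINER or PSERVER
    parser.add_argument("--endpoints", type=str,
                        default="127.0.0.1:36001")  # pserver endpoints, comma-separated
    parser.add_argument("--trainer_endpoints", type=str,
                        default="127.0.0.1:36002")  # trainer endpoints, comma-separated
    parser.add_argument("--gloo_path", type=str,
                        default="./tmp_gloo")  # file store path for gloo rendezvous
    parser.add_argument("--current_id", type=int, default=0)  # rank within the role group
    return parser.parse_args()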
def test_file_shard():
    """test ps file shard"""
    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    files = fleet.util.get_file_shard(["file1", "file2", "file3"])
    print(files)
    assert len(files) == 2
def test_get_file_shard(self):
    import paddle.distributed.fleet as fleet
    self.assertRaises(Exception, fleet.util.get_file_shard, "files")

    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    files = fleet.util.get_file_shard(["1", "2", "3"])
    self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
def test_tr_rolemaker(self):
    try:
        import netifaces
    except ImportError:
        print("warning: no netifaces, skip test_tr_rolemaker")
        return

    ro = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
        role=role_maker.Role.WORKER,
        current_id=0,
        worker_num=2)

    self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
    self.assertTrue(ro.is_worker())
    self.assertEqual(ro.role_id(), 0)
def run_ut(self):
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER
        if training_role == "TRAINER" else role_maker.Role.SERVER,
        worker_num=1,
        server_endpoints=["127.0.0.1:6002"])

    if training_role == "TRAINER":
        self.run_trainer(role, strategy)
    else:
        self.run_pserver(role, strategy)
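# run_ut picks its role from the TRAINING_ROLE environment variable, so a
# single script covers both halves of the test. A minimal sketch of driving
# both sides as subprocesses (the subprocess approach and the script name
# "ut_main.py" are illustrative assumptions, not the original harness):
import os
import subprocess

procs = []
for role_name in ("PSERVER", "TRAINER"):
    env = dict(os.environ, TRAINING_ROLE=role_name)
    procs.append(subprocess.Popen(["python", "ut_main.py"], env=env))
for p in procs:
    p.wait()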
def test_get_file_shard(self):
    self.assertRaises(Exception, fleet_util.get_file_shard, "files")
    try:
        import netifaces
    except ImportError:
        print("warning: no netifaces, skip test_get_file_shard")
        return

    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet_util._set_role_maker(role)

    files = fleet_util.get_file_shard(["1", "2", "3"])
    self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
def test(self):
    endpoints = [
        "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
        "127.0.0.1:36007"
    ]
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=endpoints)
    fleet.init(role)

    loss, acc, _ = self.net()
    optimizer = fluid.optimizer.Adam(base_lr)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)
def test_pserver(self):
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)

    batch_size = 128
    is_sparse = True
    is_distribute = False

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}

    avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)

    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
def test(self):
    endpoints = [
        "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
        "127.0.0.1:36007"
    ]
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=endpoints)
    fleet.init(role)

    loss = self.net()
    scheduler = paddle.optimizer.lr.NoamDecay(
        d_model=0.01, warmup_steps=100, verbose=True)
    optimizer = fluid.optimizer.Adam(scheduler)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)
    fleet.init_server()
def test(self):
    endpoints = [
        "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
        "127.0.0.1:36007"
    ]
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=endpoints)
    fleet.init(role)

    loss = self.net()
    scheduler = paddle.optimizer.lr.InverseTimeDecay(
        learning_rate=base_lr, gamma=0.999, verbose=True)
    optimizer = fluid.optimizer.Adam(scheduler)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)
    fleet.init_server()
def test_communicator_async(self):
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    avg_cost = self.net()
    optimizer = fluid.optimizer.SGD(0.01)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"launch_barrier": False}
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    os.environ["TEST_MODE"] = "1"
    fleet.init_worker()
    time.sleep(10)
    fleet.stop_worker()
def fit():
    EPOCH_NUM = 3
    BATCH_SIZE = 128
    type_size = 10

    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    type_size = createDataList('F:/机器学习/CNN/train',
                               'D:/cnn/cnn.model.data' + "/")
    # Data provider for training
    train_reader = dataReader("D:/cnn/cnn.model.data/trainer.list")
    train_reader = paddle.batch(
        paddle.reader.shuffle(reader=train_reader, buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    test_reader = dataReader("D:/cnn/cnn.model.data/test.list")
    test_reader = paddle.batch(
        paddle.reader.shuffle(reader=test_reader, buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)

    data_shape = [3, 32, 32]
    paddle.enable_static()
    images = fluid.layers.data(name='images', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Build the classifier
    predict = networkConfiguration(images, type_size)

    # Define the loss function and accuracy
    cost = fluid.layers.cross_entropy(input=predict, label=label)  # cross entropy
    avg_cost = fluid.layers.mean(cost)  # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)  # accuracy from predictions and labels

    # Clone the test program before the optimizer rewrites the main program
    test_program = fluid.default_main_program().clone(for_test=True)

    # Define the optimization method
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        fleet.init_worker()

        ########## model training & evaluation ##########
        # Create the executor
        use_cuda = False  # use_cuda=False selects CPU, True selects GPU
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        print("done")

        # Define the data feeder
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)

        for pass_id in range(EPOCH_NUM):
            # Start training
            for batch_id, data in enumerate(train_reader()):  # iterate over train_reader
                train_cost, train_acc = exe.run(
                    program=fluid.default_main_program(),  # run the main program
                    feed=feeder.feed(data),                # feed one batch of data
                    fetch_list=[avg_cost, acc])            # fetch the loss and accuracy
                # Print training stats every 20 batches
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                          (pass_id, batch_id, train_cost[0], train_acc[0]))

            # Start testing
            test_costs = []  # losses on the test set
            test_accs = []   # accuracies on the test set
            for batch_id, data in enumerate(test_reader()):
                test_cost, test_acc = exe.run(
                    program=test_program,        # run the test program
                    feed=feeder.feed(data),      # feed the data
                    fetch_list=[avg_cost, acc])  # fetch the loss and accuracy
                test_costs.append(test_cost[0])  # record each batch's loss
                test_accs.append(test_acc[0])    # record each batch's accuracy
            test_cost = sum(test_costs) / len(test_costs)  # average loss
            test_acc = sum(test_accs) / len(test_accs)     # average accuracy
            print('Test:%d, Cost:%0.5f, ACC:%0.5f' %
                  (pass_id, test_cost, test_acc))
        save(predict, "D:/cnn/cnn.model", exe)
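# fit() pins this process to server 0 via UserDefinedRoleMaker, so every
# launch of the script acts as the same server. To run the full
# 2-server/2-worker job, each process would build its own role instead; a
# minimal sketch reading the role and id from environment variables
# (PADDLE_ROLE and PADDLE_ID are illustrative names, not a Paddle API):
import os

def build_user_defined_role():
    is_server = os.getenv("PADDLE_ROLE", "SERVER") == "SERVER"
    return role_maker.UserDefinedRoleMaker(
        current_id=int(os.getenv("PADDLE_ID", "0")),
        role=role_maker.Role.SERVER if is_server else role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])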