def _test_upload(self, fs):
    """Exercise fs.upload: a missing source raises FSFileNotExistsError,
    an existing destination raises FSFileExistsError, and a successful
    upload creates the destination."""
    src_file = os.path.abspath("./test_upload.src")
    # NOTE: the destination name keeps the original "uolpad" spelling;
    # it is only a scratch path, so renaming it is not required.
    dst_file = os.path.abspath("./test_uolpad.dst")

    # Uploading a non-existent local file must raise.
    try:
        fs.upload(src_file, dst_file)
        self.fail("upload of a missing source should have raised")
    except FSFileNotExistsError:
        pass

    local = LocalFS()
    local.touch(src_file)
    fs.delete(dst_file)

    assert fs.need_upload_download()

    fs.upload(src_file, dst_file)
    # A second upload onto the same destination must raise.
    try:
        fs.upload(src_file, dst_file)
        self.fail("upload over an existing destination should have raised")
    except FSFileExistsError:
        pass
    self.assertTrue(fs.is_exist(dst_file))

    # Clean up both the remote destination and the local source.
    fs.delete(dst_file)
    fs.delete(src_file)
def upload(self, local_path, fs_path):
    """Copy *local_path* from the local disk to *fs_path* on this filesystem.

    Raises FSFileExistsError when the destination already exists and
    FSFileNotExistsError when the local source is missing.
    """
    destination_taken = self.is_exist(fs_path)
    if destination_taken:
        raise FSFileExistsError("{} exists".format(fs_path))

    source_present = LocalFS().is_exist(local_path)
    if not source_present:
        raise FSFileNotExistsError("{} not exists".format(local_path))

    return self._try_upload(local_path, fs_path)
def _try_download(self, fs_path, local_path):
    """Fetch *fs_path* into *local_path* via the client's "get" command.

    On any failure the (possibly partial) local file is removed and the
    original exception is re-raised with its traceback intact.
    """
    cmd = "get {} {}".format(fs_path, local_path)
    try:
        ret, lines = self._run_cmd(cmd)
        if ret != 0:
            raise ExecuteError(cmd)
    except Exception:
        # Remove any partial download, then re-raise. A bare `raise`
        # (instead of the original `raise e`) keeps the full traceback.
        local_fs = LocalFS()
        local_fs.delete(local_path)
        raise
def load_checkpoint(self,
                    path,
                    slists,
                    trainer_id,
                    local_cache_path=".cache",
                    checkpoint_no=None,
                    ignore_empty=True):
    """
    Deserialize objects in slists from path
    Return really load path
    """
    # Resolve which checkpoint number to load: default to the newest one
    # found under `path`, or honour an explicit non-negative integer.
    if checkpoint_no is None:
        max_no = self._get_last_checkpoint_no(path)

        if not ignore_empty:
            assert max_no >= 0, "Can't find checkpoint"

        # No checkpoint present and caller tolerates that: nothing to load.
        if max_no < 0:
            return None

        checkpoint_no = max_no
    else:
        assert isinstance(checkpoint_no, int)
        assert checkpoint_no >= 0

    from paddle.distributed.fleet.utils.fs import LocalFS
    local_fs = LocalFS()
    if self._fs.need_upload_download():
        # Remote filesystem: stage the checkpoint into a (per-trainer)
        # local cache directory before deserializing from it.
        cache_path = "{}/{}.{}.load_cache".format(
            local_cache_path, self._checkpoint_prefix, checkpoint_no)

        if trainer_id is not None:
            cache_path = "{}.{}".format(cache_path, trainer_id)

        if not local_fs.is_exist(local_cache_path):
            local_fs.mkdirs(local_cache_path)
        # Drop any stale cache left over from a previous load.
        if local_fs.is_exist(cache_path):
            local_fs.delete(cache_path)

    real_path = "{}/{}.{}".format(path, self._checkpoint_prefix,
                                  checkpoint_no)
    load_path = real_path
    if self._fs.need_upload_download():
        self._fs.download(real_path, cache_path)
        load_path = cache_path

    for s in slists:
        s.deserialize(load_path)

    # Clean up the staging cache after a successful load. The `and`
    # short-circuits, so cache_path is only touched on the remote path.
    if self._fs.need_upload_download() and cache_path:
        local_fs.delete(cache_path)

    return real_path
def test_local(self):
    """Run the rm/touch/dirs/touch_file checks against a LocalFS instance."""
    local_fs = LocalFS()
    checks = (
        self._test_rm,
        self._test_touch,
        self._test_dirs,
        self._test_touch_file,
    )
    for check in checks:
        check(local_fs)
def _test_download_dir(self, fs):
    """Download a whole directory; afterwards assert the source member
    is visible through LocalFS (mirrors the original check)."""
    src_dir = os.path.abspath("./test_download_dir_src")
    dst_dir = os.path.abspath("./test_download_dir_dst")
    member1 = os.path.abspath("./test_download_dir_src/file1")
    member2 = os.path.abspath("./test_download_dir_src/file2")

    # Start from a clean slate.
    for stale in (dst_dir, src_dir):
        fs.delete(stale)

    # Build a source directory holding two empty files.
    fs.mkdirs(src_dir)
    fs.touch(member1)
    fs.touch(member2)

    fs.download(src_dir, dst_dir)

    local = LocalFS()
    self.assertTrue(local.is_exist(member1))

    local.delete(dst_dir)
    fs.delete(src_dir)
def save_checkpoint(self,
                    path,
                    slists,
                    trainer_id=None,
                    local_cache_path=".cache"):
    """
    Serialize objects in slists to path
    Return really saved path and checkpoint_no
    """
    if not self._fs.is_exist(path):
        self._fs.mkdirs(path)
    else:
        assert self._fs.is_dir(path), "path:{} must be a directory".format(
            path)

    # The next checkpoint number is one past the newest existing one
    # (-1 -> 0 when the directory holds no checkpoints yet).
    max_no = self._get_last_checkpoint_no(path)

    if max_no < 0:
        max_no = -1
    max_no += 1

    real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
    # Write under a ".tmp" name first so a crash never leaves a partial
    # checkpoint under the final name; published via mv at the end.
    tmp_path = "{}.tmp".format(real_path)
    saved_path = tmp_path

    from paddle.distributed.fleet.utils.fs import LocalFS
    local_fs = LocalFS()

    cache_path = None
    if self._fs.need_upload_download():
        # Remote filesystem: serialize into a (per-trainer) local cache
        # directory, then upload the whole directory in one go.
        cache_path = "{}/{}.{}.saved_cache".format(
            local_cache_path, self._checkpoint_prefix, max_no)

        if trainer_id is not None:
            cache_path = "{}.{}".format(cache_path, trainer_id)

        if not local_fs.is_exist(cache_path):
            local_fs.mkdirs(cache_path)
        else:
            assert local_fs.is_dir(cache_path), \
                "cache path:{} must be a directory".format(cache_path)

        saved_path = cache_path

    for s in slists:
        s.serialize(saved_path)

    if self._fs.need_upload_download():
        # Replace any leftover tmp upload, push the cache, then drop it.
        self._fs.delete(tmp_path)
        self._fs.upload(cache_path, tmp_path)
        local_fs.delete(cache_path)

    # Publish the checkpoint under its final name via rename.
    self._fs.mv(tmp_path, real_path)

    return real_path, max_no
def test_distributed_basic(self):
    """Auto-checkpoint training loop under a single-trainer fleet setup."""
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    # Clear any previous checkpoint state so the run starts fresh.
    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()

    logger.info("begin test_distributed_basic")
    fs = LocalFS()
    save_dir = "./run_save_0"
    fs.delete(save_dir)

    #basic
    exe, main_prog, startup_prog = self._generate()

    compiled, data_loader, optimizer, loss, image, label = \
        self._init_env(exe, main_prog, startup_prog, minimize=False)

    #fleet
    # Single-trainer collective environment expected by PaddleCloudRoleMaker.
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with fluid.program_guard(main_prog, startup_prog):
        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(loss)

    exe.run(startup_prog)

    o = None
    i = 0
    name = None
    # Three epochs inside the auto-checkpoint range; exactly one executor
    # status must be tracked while the range is active.
    for i in acp.train_epoch_range(3, 0):
        o = acp._get_train_epoch_range()
        name = o.name
        logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

        for data in data_loader():
            fetch = exe.run(fleet.main_program, feed=data, fetch_list=[loss])

        self.assertEqual(len(o._exe_status), 1)

    # After the range closes, no epoch range may remain active.
    o = acp._get_train_epoch_range()
    assert o == None, "now train epoch must not exits now"
    self.assertEqual(i, 2)

    fs.delete(save_dir)
    logger.info("end test_distributed_basic")
def _run_normal(self):
    """Plain (non-auto-checkpoint) training plus a serialize round trip.

    While training outside acp.train_epoch_range, the auto-checkpoint
    machinery must stay inactive throughout.
    """
    exe, main_prog, startup_prog = self._generate()

    save_dir = "./run_save_model"
    local_fs = LocalFS()
    local_fs.delete(save_dir)

    logger.info("begin _run_normal")

    compiled, data_loader, optimizer, loss, image, label = self._init_env(
        exe, main_prog, startup_prog)

    def check_acp_inactive():
        # Nothing should be tracked outside an acp epoch range.
        self.assertEqual(acp._get_train_epoch_range(), None)
        self.assertEqual(acp.g_acp_type, None)

    for _epoch in range(3):
        check_acp_inactive()
        for batch in data_loader():
            check_acp_inactive()
            fetch = exe.run(compiled, feed=batch, fetch_list=[loss])

    check_acp_inactive()

    # Round-trip the trained program through serialize/deserialize.
    writer = PaddleModel(exe, compiled)
    writer.serialize(save_dir)

    reader = PaddleModel(exe, compiled)
    reader.deserialize(save_dir)

    logger.info("end _run_normal")
    local_fs.delete(save_dir)
def _test_upload_dir(self, fs):
    """Upload a local directory with two files and verify it exists remotely."""
    # upload dir
    src_dir = os.path.abspath("./test_upload_dir")
    dst_dir = os.path.abspath("./test_uolpad_dir")
    child1 = os.path.abspath("./test_upload_dir/file1")
    child2 = os.path.abspath("./test_upload_dir/file2")

    local = LocalFS()
    local.mkdirs(src_dir)
    for child in (child1, child2):
        local.touch(child)

    fs.upload(src_dir, dst_dir)
    self.assertTrue(fs.is_exist(dst_dir))

    fs.delete(dst_dir)
    local.delete(src_dir)
def test_multiple(self):
    """Auto-checkpoint with two independent programs trained per epoch."""
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    # Clear any previous checkpoint state so the run starts fresh.
    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()

    logger.info("begin test_multiple")
    fs = LocalFS()
    save_dir = "./run_save_0"
    fs.delete(save_dir)

    exe, main_prog1, startup_prog1 = self._generate()
    _, main_prog2, startup_prog2 = self._generate()

    compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
        self._init_env(exe, main_prog1, startup_prog1)

    compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
        self._init_env(exe, main_prog2, startup_prog2)

    o = None
    epochs = []
    for i in acp.train_epoch_range(3, 0):
        for data in data_loader1():
            fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])

        for data in data_loader2():
            fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])

        o = acp._get_train_epoch_range()
        # Both executors must be tracked while the epoch range is active.
        self.assertEqual(len(o._exe_status), 2)
        print(o._exe_status)
        epochs.append(i)

    # The range must be closed and all three epochs must have run.
    o = acp._get_train_epoch_range()
    self.assertTrue(o == None, "now train epoch must not exits now")
    self.assertEqual(i, 2)
    self.assertEqual(epochs, [0, 1, 2])

    fs.delete(save_dir)
    logger.info("end test_multiple")
def _run_load_0(self, break_epoch_no=None):
    """Resume training from checkpoints left by a prior save run and check
    that only the not-yet-finished epochs are replayed.

    break_epoch_no: the epoch at which the save run was interrupted, or
    None when the save run completed (only the last epoch replays).
    """
    logger.info("begin _run_load_0")
    exe, main_prog, startup_prog = self._generate()

    fs = LocalFS()
    save_dir = "./run_load_0"
    fs.delete(save_dir)

    compiled, data_loader, optimizer, loss, image, label = self._init_env(
        exe, main_prog, startup_prog)

    o = None
    i = 0
    check = False

    epochs = []
    for i in acp.train_epoch_range(3, 0):
        epochs.append(i)
        for data in data_loader():
            fetch = exe.run(compiled, feed=data, fetch_list=[loss])

    o = acp._get_train_epoch_range()
    self.assertTrue(o == None, "now train epoch must not exits now")
    self.assertEqual(i, 2)

    # The epochs actually executed depend on where the save run stopped:
    # a run broken at epoch k resumes from k.
    if break_epoch_no is not None:
        if break_epoch_no == 0:
            self.assertEqual(epochs, [0, 1, 2])
        elif break_epoch_no == 1:
            self.assertEqual(epochs, [1, 2])
        elif break_epoch_no == 2:
            self.assertEqual(epochs, [2])
    else:
        self.assertEqual(epochs, [2])

    fs.delete(save_dir)
    # BUGFIX: the original logged "begin _run_load_0" here as well.
    logger.info("end _run_load_0")
def _run_save_0(self, break_epoch_no=None):
    """Train up to 3 epochs under auto-checkpoint, optionally breaking
    out early at *break_epoch_no* to simulate an interrupted run."""
    logger.info("begin _run_save_0")

    local_fs = LocalFS()
    save_dir = "./run_save_0"
    local_fs.delete(save_dir)

    exe, main_prog, startup_prog = self._generate()
    compiled, data_loader, optimizer, loss, image, label = \
        self._init_env(exe, main_prog, startup_prog)

    epoch_range = None
    i = 0
    name = None
    for i in acp.train_epoch_range(3, 0):
        epoch_range = acp._get_train_epoch_range()
        name = epoch_range.name

        for batch in data_loader():
            fetch = exe.run(compiled, feed=batch, fetch_list=[loss])

        # Exactly one executor status is tracked per epoch.
        self.assertEqual(len(epoch_range._exe_status), 1)

        # Simulate an interruption by stopping at the requested epoch.
        if break_epoch_no is not None and i == break_epoch_no:
            break

    epoch_range = acp._get_train_epoch_range()
    assert epoch_range == None, "now train epoch must not exits now"

    # Full runs end at epoch 2; interrupted runs end where they broke.
    expected_last = 2 if break_epoch_no is None else break_epoch_no
    self.assertEqual(i, expected_last)

    local_fs.delete(save_dir)
    logger.info("end _run_save_0")
def _test_checkpoint(self, fs, dir_path):
    """Save/load fleet train-status checkpoints on *fs*, then exercise the
    error paths where the cache path is occupied by a plain file."""
    file_name = "persistables"

    # Single-trainer collective environment for PaddleCloudRoleMaker.
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    # Minimal classifier so the program has persistables to checkpoint.
    image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
    feeder = fluid.DataFeeder(feed_list=[image, label],
                              place=fluid.CPUPlace())
    predict = fluid.layers.fc(input=image, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=predict, label=label)
    avg_loss = fluid.layers.mean(loss)
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

    dist_optimizer = fleet.distributed_optimizer(optimizer)
    dist_optimizer.minimize(avg_loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    status = ExeTrainStatus()
    status.epoch_no = 2
    _, n1 = fleet.save_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status,
                                  fs=fs)

    # A freshly loaded status must equal what was saved.
    status2 = ExeTrainStatus()
    fleet.load_checkpoint(exe,
                          dir_path,
                          trainer_id=0,
                          fs=fs,
                          train_status=status2)
    self.assertEqual(status2, status)

    _, n2 = fleet.save_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status,
                                  fs=fs,
                                  remain_all_checkpoint=False)
    # Checkpoint numbers increase monotonically.
    self.assertEqual(n2, n1 + 1)

    # With remain_all_checkpoint=False only the latest checkpoint survives.
    c = CheckpointSaver(fs)
    cp_nos = c.get_checkpoint_no(dir_path)
    assert len(cp_nos) == 1  # cleanup all others

    # unnormal
    # test remain_all_checkpoint
    fleet.save_checkpoint(exe,
                          dir_path,
                          trainer_id=0,
                          train_status=status,
                          fs=fs,
                          remain_all_checkpoint=False)

    # can't save under a file
    # NOTE(review): `fs` is rebound to LocalFS here; the remaining checks
    # run against the local filesystem, not the one passed in.
    fs = LocalFS()
    cache_path = "./.load_cache"
    fs.touch(cache_path)
    try:
        fleet.save_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              train_status=status,
                              fs=fs,
                              cache_path=cache_path)
        self.assertFalse(True)
    except:
        pass

    # can't load under a file
    try:
        fleet.load_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              train_status=status2,
                              fs=fs,
                              cache_path=cache_path)
        self.assertFalse(True)
    except:
        pass
    fs.delete(cache_path)
def test_local_checkpoint(self):
    """Checkpoint round trip against the local filesystem."""
    local_fs = LocalFS()
    self._test_checkpoint(local_fs, "./checkpoint_test_local")
def test_local(self):
    """Run mkdirs/list_dir/try_upload/try_download checks on LocalFS."""
    target = LocalFS()
    checks = (
        self._test_mkdirs,
        self._test_list_dir,
        self._test_try_upload,
        self._test_try_download,
    )
    for check in checks:
        check(target)