def test_normal(self):
    logger.info("begin test_normal")

    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)

    fs.delete(checker.hdfs_checkpoint_path)

    self._clear_envs()
    self._reset_generator()
    self._run_normal()
    self._readd_envs()
    logger.info("end test_normal")
def test_distributed_basic(self):
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()

    logger.info("begin test_distributed_basic")
    fs = LocalFS()
    save_dir = "./run_save_0"
    fs.delete(save_dir)

    # basic
    exe, main_prog, startup_prog = self._generate()

    compiled, data_loader, optimizer, loss, image, label = \
        self._init_env(exe, main_prog, startup_prog, minimize=False)

    # fleet
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with fluid.program_guard(main_prog, startup_prog):
        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(loss)

    exe.run(startup_prog)

    o = None
    i = 0
    name = None
    for i in acp.train_epoch_range(3, 0):
        o = acp._get_train_epoch_range()
        name = o.name
        logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

        for data in data_loader():
            fetch = exe.run(fleet.main_program, feed=data, fetch_list=[loss])

        self.assertEqual(len(o._exe_status), 1)

    o = acp._get_train_epoch_range()
    assert o is None, "train epoch range must not exist now"
    self.assertEqual(i, 2)

    fs.delete(save_dir)
    logger.info("end test_distributed_basic")
def __init__(self, config):
    self.metrics = {}
    self.config = config
    self.exe = None
    self.reader_type = config.get("runner.reader_type", "InMemoryDataset")
    self.split_interval = config.get("runner.split_interval", 5)
    self.split_per_pass = config.get("runner.split_per_pass", 1)
    self.checkpoint_per_pass = config.get("runner.checkpoint_per_pass", 6)
    self.save_delta_frequency = config.get("runner.save_delta_frequency", 6)
    self.save_first_base = config.get("runner.save_first_base", False)
    self.data_donefile = config.get("runner.data_donefile", "")
    self.data_sleep_second = config.get("runner.data_sleep_second", 10)
    self.start_day = config.get("runner.start_day")
    self.end_day = config.get("runner.end_day")
    self.save_model_path = self.config.get("runner.model_save_path")
    self.need_train_dump = self.config.get("runner.need_train_dump", False)
    self.need_infer_dump = self.config.get("runner.need_infer_dump", False)
    if config.get("runner.fs_client.uri") is not None:
        self.hadoop_fs_name = config.get("runner.fs_client.uri", "")
        self.hadoop_fs_ugi = config.get(
            "runner.fs_client.user", "") + "," + config.get(
                "runner.fs_client.passwd", "")
        configs = {
            "fs.default.name": self.hadoop_fs_name,
            "hadoop.job.ugi": self.hadoop_fs_ugi
        }
        self.hadoop_client = HDFSClient("$HADOOP_HOME", configs)
    else:
        self.hadoop_fs_name, self.hadoop_fs_ugi = "", ""
        self.hadoop_client = None
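A hedged sketch of the flat "runner.*" configuration this constructor reads: only the key names are taken from the config.get calls above; the concrete values and the plain-dict representation are illustrative assumptions (real configs are typically loaded from a YAML file).

# Hypothetical example values; key names mirror the constructor above.
example_config = {
    "runner.reader_type": "InMemoryDataset",
    "runner.split_interval": 5,
    "runner.split_per_pass": 1,
    "runner.checkpoint_per_pass": 6,
    "runner.start_day": "20240101",
    "runner.end_day": "20240102",
    "runner.model_save_path": "./output_model",
    # Optional HDFS settings; when "runner.fs_client.uri" is absent,
    # hadoop_client stays None.
    "runner.fs_client.uri": "hdfs://xxx",
    "runner.fs_client.user": "user",
    "runner.fs_client.passwd": "passwd",
}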
def test_hdfs(self): fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=5 * 1000, sleep_inter=100) self._test_rm(fs) self._test_touch(fs) self._test_dirs(fs)
def test_basic(self):
    logger.info("begin test_basic")

    checker = acp._get_checker()
    self.assertEqual(checker.run_env, "PADDLE_EDL_AUTO_CHECKPOINT")
    self.assertEqual(checker.platform, "PADDLE_CLOUD")
    self.assertEqual(checker.save_checkpoint_inter, 0)
    print(checker)

    fs = HDFSClient(checker.hdfs_home, None)
    fs.delete(checker.hdfs_checkpoint_path)

    self._reset_generator()
    self._run_save_0()

    self._reset_generator()
    self._run_load_0()

    logger.info("end test_basic")
def test_hdfs(self):
    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/", None, time_out=5 * 1000, sleep_inter=100)
    self._test_mkdirs(fs)
    self._test_list_dir(fs)
    self._test_try_upload(fs)
    self._test_try_download(fs)
    self._test_upload(fs)
    self._test_download(fs)
def test_exists(self):
    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/", None, time_out=6 * 1000, sleep_inter=100)
    self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
    self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
    self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))

    dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs1.py"))
    self.assertTrue(dirs == [])
    self.assertTrue(len(files) == 1)

    dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))
def test_timeout(self):
    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/", None, time_out=6 * 1000, sleep_inter=100)
    src = "hdfs_test_timeout"
    dst = "new_hdfs_test_timeout"
    fs.delete(dst)
    fs.mkdirs(src)
    fs.mkdirs(dst)
    fs.mkdirs(dst + "/" + src)
    output = ""
    cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
    try:
        fs.mv(src, dst, test_exists=False)
        self.fail("mv should have timed out, cmd:{} output:{}".format(
            cmd, output))
    except FSTimeOut as e:
        print("execute mv {} to {} timeout".format(src, dst))

    ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
    self.assertNotEqual(ret, 0)
    print("second mv ret:{} output:{}".format(ret, output))
def test_is_dir(self):
    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/", None, time_out=6 * 1000, sleep_inter=100)
    self.assertFalse(fs.is_dir("./test_hdfs.py"))
    s = """
java.io.IOException: Input/output error
 responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error
 at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164)
 at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118)
 at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696)
 at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297)
 at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514)
 at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092)
 at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285)
 at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
 at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
 at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353)
"""

    print("split lines:", s.splitlines())
    self.assertTrue(fs._test_match(s.splitlines()) is not None)
def test_multiple(self):
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()

    logger.info("begin test_multiple")
    fs = LocalFS()
    save_dir = "./run_save_0"
    fs.delete(save_dir)

    exe, main_prog1, startup_prog1 = self._generate()
    _, main_prog2, startup_prog2 = self._generate()

    compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
        self._init_env(exe, main_prog1, startup_prog1)

    compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
        self._init_env(exe, main_prog2, startup_prog2)

    o = None
    epochs = []
    for i in acp.train_epoch_range(3, 0):
        for data in data_loader1():
            fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])

        for data in data_loader2():
            fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])

        o = acp._get_train_epoch_range()
        self.assertEqual(len(o._exe_status), 2)
        print(o._exe_status)
        epochs.append(i)

    o = acp._get_train_epoch_range()
    self.assertTrue(o is None, "train epoch range must not exist now")
    self.assertEqual(i, 2)
    self.assertEqual(epochs, [0, 1, 2])

    fs.delete(save_dir)
    logger.info("end test_multiple")
def _test_corner_epoch_no(self, break_epoch_no):
    logger.info("begin test_corner_epoch_no")
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)

    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()
    self._run_save_0(break_epoch_no=break_epoch_no)
    self._reset_generator()
    self._run_load_0(break_epoch_no=break_epoch_no)

    fs.delete(checker.hdfs_checkpoint_path)
    logger.info("end test_corner_epoch_no")
def test(self):
    fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
    dir_path = "./checkpointsaver_test"
    fs.delete(dir_path)

    s = CheckpointSaver(fs)

    fs.mkdirs("{}/exe.exe".format(dir_path))
    fs.mkdirs("{}/exe.1".format(dir_path))
    fs.mkdirs("{}/exe".format(dir_path))

    a = s.get_checkpoint_no(dir_path)
    self.assertEqual(len(a), 0)

    fs.mkdirs("{}/__paddle_checkpoint__.0".format(dir_path))
    fs.mkdirs("{}/__paddle_checkpoint__.exe".format(dir_path))

    a = s.get_checkpoint_no(dir_path)
    self.assertEqual(len(a), 1)

    s.clean_redundant_checkpoints(dir_path)
    s.clean_redundant_checkpoints(dir_path)

    fs.delete(dir_path)
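The test above implies that get_checkpoint_no only counts directories named "__paddle_checkpoint__.<int>". The helper below is a hedged sketch of that filtering rule, inferred from the test rather than taken from CheckpointSaver's actual implementation; the function name _guess_checkpoint_nos is hypothetical.

def _guess_checkpoint_nos(dir_names):
    # Assumed rule: keep only "__paddle_checkpoint__.<int>" entries.
    nos = []
    for d in dir_names:
        prefix, sep, suffix = d.partition("__paddle_checkpoint__.")
        if prefix == "" and sep and suffix.isdigit():
            nos.append(int(suffix))
    return sorted(nos)

# e.g. _guess_checkpoint_nos(
#     ["exe", "exe.1", "__paddle_checkpoint__.0", "__paddle_checkpoint__.exe"])
# returns [0], matching the assertions in the test above.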
def test_config(self):
    config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/", config, time_out=6 * 1000, sleep_inter=100)
class TrainEpochRange(SerializableBase):
    def __init__(self, max_epoch_num, name, checkpoint_inter=None,
                 restored=True):
        self._max_epoch_num = max_epoch_num
        self._epoch_no = -1  # current epoch_no
        self._name = name
        self._restored_from = None
        self._exe_status = {}
        self._flag_generated = False

        self._checker = g_checker
        if checkpoint_inter is not None:
            self._save_checkpoint_inter = checkpoint_inter
        else:
            self._save_checkpoint_inter = self._checker.save_checkpoint_inter
        assert self._save_checkpoint_inter >= 0, "checkpointer:{} must >=0".format(
            self._save_checkpoint_inter)
        self._last_checkpoint_time = time.time()

        self._load_cp_nos = None
        self._checkpoint_epoch_no = None

        if not self._checker.valid():
            return

        self._file_name = "range_train_status"

        if not restored:
            return

        self._checkpoint_path = self._checker.get_range_checkpoint_path(name)

        config = {
            "fs.default.name": self._checker.hdfs_name,
            "hadoop.job.ugi": self._checker.hdfs_ugi
        }

        if self._checker.ce_test:
            config = None

        from paddle.distributed.fleet.utils.fs import HDFSClient
        self._hdfs = HDFSClient(self._checker.hdfs_home, config)

        self._cper = CheckpointSaver(self._hdfs)

        _thread_checker()

        self._get_last_valid_checkpoint()

    def _look_for_valid(self, cp_nos):
        cps = []
        epoch_no = -1
        for i in cp_nos[::-1]:
            t = TrainEpochRange(self._max_epoch_num, self.name, restored=False)
            self._cper.load_checkpoint(
                self._checkpoint_path, [t],
                self._checker.trainer_id,
                checkpoint_no=i,
                local_cache_path=self._checker._fs_cache)
            cps.append(t)
            logger.debug("look for valid:{} t:{}".format(i, t._serialize()))
            if epoch_no < 0:
                epoch_no = t._epoch_no
            else:
                if epoch_no - t._epoch_no >= 1:
                    return t, i
        return None, None

    def _get_last_valid_checkpoint(self):
        self._load_cp_nos = self._cper.get_checkpoint_no(self._checkpoint_path)
        logger.info("find checkpoint nos:{}".format(self._load_cp_nos))

        if len(self._load_cp_nos) < 1:
            self._restored_from = CONST_MEMORYINIT
            return

        if g_acp_type == CONST_ACP_TYPE:
            # get the last one
            self._cper.load_checkpoint(
                self._checkpoint_path, [self],
                self._checker.trainer_id,
                local_cache_path=self._checker._fs_cache)
            self._restored_from = CONST_CHECKPOINT
            self._checkpoint_epoch_no = self._epoch_no

            logger.info("load train_epoch_range checkpoint:{}".format(
                self._serialize()))
        elif g_acp_type == CONST_DACP_TYPE:
            t, i = self._look_for_valid(self._load_cp_nos)
            if t is None:
                self._restored_from = CONST_MEMORYINIT
                return

            self._cper.load_checkpoint(
                self._checkpoint_path, [self],
                self._checker.trainer_id,
                checkpoint_no=i,
                local_cache_path=self._checker._fs_cache)

            self._restored_from = CONST_CHECKPOINT
            self._checkpoint_epoch_no = self._epoch_no
            logger.info("load train_epoch_range checkpoint:{}".format(
                self._serialize()))
        else:
            assert False, "not supported acp_type:{}".format(g_acp_type)

    def _to_dict(self):
        d = {
            "max_epoch_num": self._max_epoch_num,
            "epoch_no": self._epoch_no,
            "name": self._name,
            "checkpoint_path": self._checkpoint_path,
            "restored_from": self._restored_from,
            "checkpoint_epoch_no": self._checkpoint_epoch_no
        }
        return d

    def __str__(self):
        return self._serialize([])

    @property
    def name(self):
        return self._name

    def serialize(self, path):
        file_name = "{}/{}".format(path, self._file_name)
        with open(file_name, 'w') as f:
            s = self._serialize()
            f.write(s)

    def _serialize(self, pop_keys=["restored_from", "checkpoint_epoch_no"]):
        # self
        d = self._to_dict()
        for k in pop_keys:
            d.pop(k, None)

        # registered exes
        d["exe_status"] = {}
        e = d["exe_status"]
        for k, t in six.iteritems(self._exe_status):
            e[t._key] = t._serialize()
        return json.dumps(d)

    @property
    def restored_from(self):
        return self._restored_from

    def deserialize(self, path):
        d = None
        file_name = "{}/{}".format(path, self._file_name)
        with open(file_name, 'r') as f:
            d = json.load(f)

        # self
        self._max_epoch_num = d["max_epoch_num"]
        self._epoch_no = d["epoch_no"]
        self._name = d["name"]
        self._checkpoint_path = d["checkpoint_path"]

        # exes status
        e = d["exe_status"]
        for k, v in six.iteritems(e):
            t = ExeTrainStatus()
            t._deserialize(v)
            self._exe_status[k] = t

    def next(self):
        _thread_checker()

        if self._max_epoch_num < 0:
            self._max_epoch_num = sys.maxsize
        assert self._epoch_no >= -1, "self._epoch_no:{} must >=-1".format(
            self._epoch_no)

        self._last_checkpoint_time = time.time()
        start = self._epoch_no + 1
        logger.info("started epoch_no:{} max_epoch_num:{}".format(
            start, self._max_epoch_num))

        for i in range(start, self._max_epoch_num):
            self._epoch_no = i
            yield i

            self.save_checkpoint()

    def get(self):
        return self._epoch_no

    def save_checkpoint(self):
        # don't save the last one because exe and program can't be restored.
        if self._checker.trainer_id == 0:
            if time.time() - self._last_checkpoint_time >= \
                    self._save_checkpoint_inter:
                if g_acp_type == CONST_ACP_TYPE:
                    # not save the last one
                    if self._max_epoch_num > 0 and \
                            self._epoch_no != self._max_epoch_num - 1:
                        self._save_checkpoint()
                elif g_acp_type == CONST_DACP_TYPE:
                    self._save_checkpoint()
                else:
                    assert False, "not supported acp_type:{}".format(
                        g_acp_type)
            self._last_checkpoint_time = time.time()

    def _save_checkpoint(self):
        """
        status => /jobid/xxx_range_xx/range/
        model => /exe/
        """
        if not self._checker.valid():
            return

        e = self._exe_status
        for k, t in six.iteritems(self._exe_status):
            m = PaddleModel(t._exe, t._program)
            p = self._checker.get_exe_checkpoint_path(t._hash_key)
            t._epoch_no = self.get()
            path, checkpoint_no = self._cper.save_checkpoint(
                p, [m],
                self._checker.trainer_id,
                local_cache_path=self._checker._fs_cache)
            # index info
            t._checkpoint_path = path
            t._checkpoint_no = checkpoint_no

            e[t._key] = t

            logger.debug("save executor checkpoint:{}".format(t._serialize()))

        if len(self._exe_status) > 0:
            self._cper.save_checkpoint(
                self._checkpoint_path, [self],
                local_cache_path=self._checker._fs_cache)
            logger.info("save train_epoch_range checkpoint:{}".format(
                self._serialize()))

            self._generate_flag()

    def _generate_flag(self):
        if self._flag_generated:
            return

        name = "can_be_auto_checkpoint.flag"
        path = self._checker.get_job_path() + "/" + name
        logger.info("this job can_be_auto_checkpoint")
        self._hdfs.mkdirs(self._checker.get_job_path())
        self._hdfs.touch(path, exist_ok=True)

        self._flag_generated = True
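A minimal usage sketch of the class above, assuming the auto-checkpoint module is imported as acp and that exe, compiled, data_loader, and loss have already been set up as in the tests earlier in this section; the call signature acp.train_epoch_range(3, 0) is taken from those tests, everything else is an assumption.

# Hedged sketch, not part of the original source.
for epoch_no in acp.train_epoch_range(3, 0):
    # On restart, the range resumes from the last checkpointed epoch_no
    # instead of 0, according to _get_last_valid_checkpoint above.
    for data in data_loader():
        exe.run(compiled, feed=data, fetch_list=[loss])
    # save_checkpoint() is invoked inside next() after each yielded epoch.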
def test_hdfs_checkpoint(self):
    fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
    dir_path = "./checkpoint_test_hdfs"
    self._test_checkpoint(fs, os.path.abspath(dir_path))
def _test_list_dir(self, fs):
    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000, sleep_inter=100)
    fs.ls_dir("test_not_exists")