def _run_normal(self):
    """Train three plain epochs (no auto-checkpoint active), then round-trip
    the model through PaddleModel serialize/deserialize."""
    exe, main_prog, startup_prog = self._generate()

    model_dir = "./run_save_model"
    local_fs = LocalFS()
    local_fs.delete(model_dir)
    logger.info("begin _run_normal")

    compiled, data_loader, optimizer, loss, image, label = self._init_env(
        exe, main_prog, startup_prog)

    for epoch in range(3):
        # auto-checkpoint must stay inactive throughout a plain run
        self.assertEqual(acp._get_train_epoch_range(), None)
        self.assertEqual(acp.g_acp_type, None)
        for batch in data_loader():
            self.assertEqual(acp.g_acp_type, None)
            self.assertEqual(acp._get_train_epoch_range(), None)
            exe.run(compiled, feed=batch, fetch_list=[loss])

    # still inactive after training finished
    self.assertEqual(acp.g_acp_type, None)
    self.assertEqual(acp._get_train_epoch_range(), None)

    writer = PaddleModel(exe, compiled)
    writer.serialize(model_dir)

    reader = PaddleModel(exe, compiled)
    reader.deserialize(model_dir)

    logger.info("end _run_normal")
    local_fs.delete(model_dir)
def _try_download(self, fs_path, local_path):
    """Download ``fs_path`` to ``local_path`` via the "get" shell command.

    Raises:
        ExecuteError: when the command exits non-zero.
        Any exception from the command runner is re-raised after removing
        the (possibly partial) local copy.
    """
    cmd = "get {} {}".format(fs_path, local_path)
    try:
        # fix: dropped the dead `ret = 0` pre-assignment; `ret` is only
        # read after the command ran. Second tuple item (output lines)
        # is unused here.
        ret, _ = self._run_cmd(cmd)
        if ret != 0:
            raise ExecuteError(cmd)
    except Exception:
        # clean up any partially downloaded data before propagating
        local_fs = LocalFS()
        local_fs.delete(local_path)
        # fix: bare `raise` re-raises with the original traceback intact,
        # unlike `raise e` which restarts it from here
        raise
def save_checkpoint(self,
                    path,
                    slists,
                    trainer_id=None,
                    local_cache_path=".cache"):
    """Serialize every object in ``slists`` into a new checkpoint under ``path``.

    Returns the directory that was actually written and its checkpoint number.
    """
    # Ensure the destination exists and is a directory.
    if self._fs.is_exist(path):
        assert self._fs.is_dir(path), "path:{} must be a directory".format(
            path)
    else:
        self._fs.mkdirs(path)

    # Next checkpoint number is one past the newest existing one (or 0).
    last_no = self._get_last_checkpoint_no(path)
    next_no = last_no + 1 if last_no >= 0 else 0

    real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, next_no)
    tmp_path = "{}.tmp".format(real_path)

    from paddle.distributed.fleet.utils.fs import LocalFS
    local_fs = LocalFS()

    cache_path = None
    saved_path = tmp_path
    if self._fs.need_upload_download():
        # Remote filesystem: serialize into a local cache dir first.
        cache_path = "{}/{}.{}.saved_cache".format(
            local_cache_path, self._checkpoint_prefix, next_no)
        if trainer_id is not None:
            cache_path = "{}.{}".format(cache_path, trainer_id)

        if local_fs.is_exist(cache_path):
            assert local_fs.is_dir(cache_path), \
                "cache path:{} must be a directory".format(cache_path)
        else:
            local_fs.mkdirs(cache_path)
        saved_path = cache_path

    for obj in slists:
        obj.serialize(saved_path)

    if self._fs.need_upload_download():
        # Replace any stale tmp dir, push the cache up, then drop the cache.
        self._fs.delete(tmp_path)
        self._fs.upload(cache_path, tmp_path)
        local_fs.delete(cache_path)

    # Publish the checkpoint by renaming the tmp dir into place.
    self._fs.mv(tmp_path, real_path)

    return real_path, next_no
def load_checkpoint(self,
                    path,
                    slists,
                    trainer_id,
                    local_cache_path=".cache",
                    checkpoint_no=None,
                    ignore_empty=True):
    """Deserialize every object in ``slists`` from a checkpoint under ``path``.

    Args:
        checkpoint_no: explicit checkpoint number to load; when None, the
            newest checkpoint found under ``path`` is used.
        ignore_empty: when False, assert that at least one checkpoint exists.

    Returns:
        The checkpoint path that was actually loaded, or None when no
        checkpoint exists and ``ignore_empty`` is True.
    """
    if checkpoint_no is None:
        max_no = self._get_last_checkpoint_no(path)

        if not ignore_empty:
            assert max_no >= 0, "Can't find checkpoint"

        if max_no < 0:
            return None

        checkpoint_no = max_no
    else:
        assert isinstance(checkpoint_no, int)
        assert checkpoint_no >= 0

    from paddle.distributed.fleet.utils.fs import LocalFS
    local_fs = LocalFS()

    # fix: bind cache_path unconditionally so the cleanup check at the end
    # never depends on short-circuit evaluation to avoid a NameError.
    cache_path = None
    if self._fs.need_upload_download():
        cache_path = "{}/{}.{}.load_cache".format(
            local_cache_path, self._checkpoint_prefix, checkpoint_no)

        if trainer_id is not None:
            cache_path = "{}.{}".format(cache_path, trainer_id)

        if not local_fs.is_exist(local_cache_path):
            local_fs.mkdirs(local_cache_path)
        # drop any stale cache from a previous load
        if local_fs.is_exist(cache_path):
            local_fs.delete(cache_path)

    real_path = "{}/{}.{}".format(path, self._checkpoint_prefix,
                                  checkpoint_no)
    load_path = real_path
    if self._fs.need_upload_download():
        # Remote filesystem: pull the checkpoint down and read it locally.
        self._fs.download(real_path, cache_path)
        load_path = cache_path

    for s in slists:
        s.deserialize(load_path)

    if self._fs.need_upload_download() and cache_path:
        local_fs.delete(cache_path)

    return real_path
def _test_download_dir(self, fs):
    """Exercise downloading a whole directory from ``fs``."""
    src_dir = os.path.abspath("./test_download_dir_src")
    dst_dir = os.path.abspath("./test_download_dir_dst")
    members = [
        os.path.abspath("./test_download_dir_src/file1"),
        os.path.abspath("./test_download_dir_src/file2"),
    ]

    # Start from a clean slate, then build the source tree.
    fs.delete(dst_dir)
    fs.delete(src_dir)
    fs.mkdirs(src_dir)
    for member in members:
        fs.touch(member)

    fs.download(src_dir, dst_dir)

    local = LocalFS()
    self.assertTrue(local.is_exist(members[0]))

    # Cleanup.
    local.delete(dst_dir)
    fs.delete(src_dir)
def _test_upload_dir(self, fs):
    """Exercise uploading a whole local directory to ``fs``."""
    # NOTE(review): "uolpad" in the destination name looks like a typo of
    # "upload"; preserved byte-for-byte since it is only a scratch path.
    src_dir = os.path.abspath("./test_upload_dir")
    dst_dir = os.path.abspath("./test_uolpad_dir")
    members = [
        os.path.abspath("./test_upload_dir/file1"),
        os.path.abspath("./test_upload_dir/file2"),
    ]

    # Build the local source tree.
    local = LocalFS()
    local.mkdirs(src_dir)
    for member in members:
        local.touch(member)

    fs.upload(src_dir, dst_dir)
    self.assertTrue(fs.is_exist(dst_dir))

    # Cleanup.
    fs.delete(dst_dir)
    local.delete(src_dir)
def _run_load_0(self, break_epoch_no=None):
    """Resume training via auto-checkpoint after an interrupted run.

    Args:
        break_epoch_no: epoch at which the previous (saving) run stopped;
            determines which epochs this resumed run is expected to execute.
            None means the previous run completed, so only epoch 2 reruns.
    """
    logger.info("begin _run_load_0")
    exe, main_prog, startup_prog = self._generate()

    fs = LocalFS()
    save_dir = "./run_load_0"
    fs.delete(save_dir)

    compiled, data_loader, optimizer, loss, image, label = self._init_env(
        exe, main_prog, startup_prog)

    o = None
    i = 0
    # fix: removed dead local `check = False` (never read)
    epochs = []
    for i in acp.train_epoch_range(3, 0):
        epochs.append(i)
        for data in data_loader():
            fetch = exe.run(compiled, feed=data, fetch_list=[loss])

    o = acp._get_train_epoch_range()
    # fix: identity comparison for None (was `o == None`)
    self.assertTrue(o is None, "now train epoch must not exits now")
    self.assertEqual(i, 2)

    # the resumed run should only replay epochs after the break point
    if break_epoch_no is not None:
        if break_epoch_no == 0:
            self.assertEqual(epochs, [0, 1, 2])
        elif break_epoch_no == 1:
            self.assertEqual(epochs, [1, 2])
        elif break_epoch_no == 2:
            self.assertEqual(epochs, [2])
    else:
        self.assertEqual(epochs, [2])

    fs.delete(save_dir)
    # fix: closing log previously said "begin _run_load_0" again
    logger.info("end _run_load_0")
def _test_download(self, fs):
    """Exercise single-file download, including the missing-source error path."""
    src_file = os.path.abspath("./test_download.src")
    dst_file = os.path.abspath("./test_download.dst")

    fs.delete(dst_file)
    fs.delete(src_file)

    # Downloading a non-existent source must raise.
    with self.assertRaises(FSFileNotExistsError):
        fs.download(src_file, dst_file)

    local = LocalFS()
    fs.touch(src_file)
    local.delete(dst_file)
    assert fs.need_upload_download()

    # Now the source exists, so the download must land locally.
    fs.download(src_file, dst_file)
    self.assertTrue(local.is_exist(dst_file))

    # Cleanup.
    local.delete(dst_file)
    fs.delete(src_file)
def _run_save_0(self, break_epoch_no=None):
    """Run up to three auto-checkpointed epochs, optionally stopping early
    at ``break_epoch_no`` to simulate an interrupted training job."""
    logger.info("begin _run_save_0")
    local_fs = LocalFS()
    save_dir = "./run_save_0"
    local_fs.delete(save_dir)

    exe, main_prog, startup_prog = self._generate()
    compiled, data_loader, optimizer, loss, image, label = \
        self._init_env(exe, main_prog, startup_prog)

    epoch_range = None
    epoch_no = 0
    range_name = None
    for epoch_no in acp.train_epoch_range(3, 0):
        epoch_range = acp._get_train_epoch_range()
        range_name = epoch_range.name
        for batch in data_loader():
            exe.run(compiled, feed=batch, fetch_list=[loss])

        # exactly one executor is tracked by the active epoch range
        self.assertEqual(len(epoch_range._exe_status), 1)

        if break_epoch_no is not None and epoch_no == break_epoch_no:
            break

    epoch_range = acp._get_train_epoch_range()
    assert epoch_range == None, "now train epoch must not exits now"

    expected_last = 2 if break_epoch_no is None else break_epoch_no
    self.assertEqual(epoch_no, expected_last)

    local_fs.delete(save_dir)
    logger.info("end _run_save_0")
def _test_checkpoint(self, fs, dir_path):
    """End-to-end fleet checkpoint test against ``fs`` under ``dir_path``.

    Covers: round-tripping train status through save/load, monotonically
    increasing checkpoint numbers, pruning of old checkpoints, and failure
    when the cache path is a regular file instead of a directory.
    """
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    # Minimal network so there are persistables to checkpoint.
    # fix: removed dead local `file_name = "persistables"` (never read).
    image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
    feeder = fluid.DataFeeder(feed_list=[image, label],
                              place=fluid.CPUPlace())
    predict = fluid.layers.fc(input=image, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=predict, label=label)
    avg_loss = fluid.layers.mean(loss)
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

    dist_optimizer = fleet.distributed_optimizer(optimizer)
    dist_optimizer.minimize(avg_loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    status = ExeTrainStatus()
    status.epoch_no = 2
    _, n1 = fleet.save_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status,
                                  fs=fs)

    status2 = ExeTrainStatus()
    fleet.load_checkpoint(exe,
                          dir_path,
                          trainer_id=0,
                          fs=fs,
                          train_status=status2)
    # the loaded status must equal what was saved
    self.assertEqual(status2, status)

    _, n2 = fleet.save_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status,
                                  fs=fs,
                                  remain_all_checkpoint=False)
    self.assertEqual(n2, n1 + 1)

    c = CheckpointSaver(fs)
    cp_nos = c.get_checkpoint_no(dir_path)
    assert len(cp_nos) == 1  # cleanup all others

    # unnormal
    # test remain_all_checkpoint
    fleet.save_checkpoint(exe,
                          dir_path,
                          trainer_id=0,
                          train_status=status,
                          fs=fs,
                          remain_all_checkpoint=False)

    # can't save under a file
    # fix: the old bare `try/except: pass` swallowed the AssertionError
    # raised by `self.assertFalse(True)`, so these negative cases could
    # never fail; assertRaises fails the test when nothing is raised.
    local_fs = LocalFS()
    cache_path = "./.load_cache"
    local_fs.touch(cache_path)
    with self.assertRaises(Exception):
        fleet.save_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              train_status=status,
                              fs=local_fs,
                              cache_path=cache_path)

    # can't load under a file
    with self.assertRaises(Exception):
        fleet.load_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              train_status=status2,
                              fs=local_fs,
                              cache_path=cache_path)

    local_fs.delete(cache_path)