Example #1
    def test_normal(self):
        logger.info("begin test_normal")
        checker = acp._get_checker()

        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._clear_envs()
        self._reset_generator()
        self._run_normal()
        self._readd_envs()
        logger.info("end test_normal")
Example #2
    def __init__(self,
                 max_epoch_num,
                 name,
                 checkpoint_inter=None,
                 restored=True):
        self._max_epoch_num = max_epoch_num
        self._epoch_no = -1  # current epoch_no
        self._name = name
        self._restored_from = None
        self._exe_status = {}
        self._flag_generated = False

        self._checker = g_checker
        if checkpoint_inter is not None:
            self._save_checkpoint_inter = checkpoint_inter
        else:
            self._save_checkpoint_inter = self._checker.save_checkpoint_inter
        assert self._save_checkpoint_inter >= 0, "checkpoint_inter:{} must be >= 0".format(
            self._save_checkpoint_inter)
        self._last_checkpoint_time = time.time()

        self._load_cp_nos = None
        self._checkpoint_epoch_no = None

        if not self._checker.valid():
            return

        self._file_name = "range_train_status"

        if not restored:
            return

        self._checkpoint_path = self._checker.get_range_checkpoint_path(name)

        config = {
            "fs.default.name": self._checker.hdfs_name,
            "hadoop.job.ugi": self._checker.hdfs_ugi
        }

        if self._checker.ce_test:
            config = None

        from paddle.distributed.fleet.utils.fs import HDFSClient
        self._hdfs = HDFSClient(self._checker.hdfs_home, config)

        self._cper = CheckpointSaver(self._hdfs)

        _thread_checker()

        self._get_last_valid_checkpoint()
Example #3
    def test_distributed_basic(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_distributed_basic")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        # basic
        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog, minimize=False)

        # fleet
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        with fluid.program_guard(main_prog, startup_prog):
            dist_optimizer = fleet.distributed_optimizer(optimizer)
            dist_optimizer.minimize(loss)

        exe.run(startup_prog)

        o = None
        i = 0
        name = None
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()
            name = o.name
            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

            for data in data_loader():
                fetch = exe.run(fleet.main_program,
                                feed=data,
                                fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

        o = acp._get_train_epoch_range()
        assert o is None, "train epoch range must not exist now"
        self.assertEqual(i, 2)

        fs.delete(save_dir)

        logger.info("end test_distributed_basic")
Example #4
 def __init__(self, config):
     self.metrics = {}
     self.config = config
     self.exe = None
     self.reader_type = config.get("runner.reader_type", "InMemoryDataset")
     self.split_interval = config.get("runner.split_interval", 5)
     self.split_per_pass = config.get("runner.split_per_pass", 1)
     self.checkpoint_per_pass = config.get("runner.checkpoint_per_pass", 6)
     self.save_delta_frequency = config.get("runner.save_delta_frequency",
                                            6)
     self.save_first_base = config.get("runner.save_first_base", False)
     self.data_donefile = config.get("runner.data_donefile", "")
     self.data_sleep_second = config.get("runner.data_sleep_second", 10)
     self.start_day = config.get("runner.start_day")
     self.end_day = config.get("runner.end_day")
     self.save_model_path = self.config.get("runner.model_save_path")
     self.need_train_dump = self.config.get("runner.need_train_dump", False)
     self.need_infer_dump = self.config.get("runner.need_infer_dump", False)
     if config.get("runner.fs_client.uri") is not None:
         self.hadoop_fs_name = config.get("runner.fs_client.uri", "")
         self.hadoop_fs_ugi = config.get("runner.fs_client.user",
                                         "") + "," + config.get(
                                             "runner.fs_client.passwd", "")
         configs = {
             "fs.default.name": self.hadoop_fs_name,
             "hadoop.job.ugi": self.hadoop_fs_ugi
         }
         self.hadoop_client = HDFSClient("$HADOOP_HOME", configs)
     else:
         self.hadoop_fs_name, self.hadoop_fs_ugi = "", ""
         self.hadoop_client = None
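The branch above only constructs a working HDFSClient when `runner.fs_client.uri` is present; otherwise `hadoop_client` stays None. A sketch of config values that would take the HDFS branch (only the key names come from the `config.get` calls above; the values and the `Runner` class name are placeholders):

    config = {
        "runner.fs_client.uri": "hdfs://nameservice1",  # hypothetical
        "runner.fs_client.user": "hdfs_user",           # hypothetical
        "runner.fs_client.passwd": "hdfs_passwd",       # hypothetical
    }
    runner = Runner(config)  # Runner stands in for the class defined above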
Example #5
 def test_hdfs(self):
     fs = HDFSClient("/usr/local/hadoop-2.7.7/",
                     None,
                     time_out=5 * 1000,
                     sleep_inter=100)
     self._test_rm(fs)
     self._test_touch(fs)
     self._test_dirs(fs)
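Judging by the `5 * 1000` and `100` arguments used throughout these tests, `time_out` and `sleep_inter` are in milliseconds: the client retries the underlying `hadoop fs` command, sleeping `sleep_inter` between attempts, until `time_out` is exceeded. A sketch of the same construction together with the basic operations the other examples exercise (the directory name is a placeholder):

    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/",  # hadoop home, as above
        None,                        # no extra hadoop configs
        time_out=5 * 1000,           # fail after ~5 seconds
        sleep_inter=100)             # retry every 100 ms
    if not fs.is_exist("sketch_dir"):
        fs.mkdirs("sketch_dir")
    fs.delete("sketch_dir")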
Example #6
    def test_basic(self):
        logger.info("begin test_basic")
        checker = acp._get_checker()
        self.assertEqual(checker.run_env, "PADDLE_EDL_AUTO_CHECKPOINT")
        self.assertEqual(checker.platform, "PADDLE_CLOUD")
        self.assertEqual(checker.save_checkpoint_inter, 0)
        print(checker)

        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()
        self._run_save_0()

        self._reset_generator()
        self._run_load_0()

        logger.info("end test_basic")
Example #7
    def test_hdfs(self):
        fs = HDFSClient(
            "/usr/local/hadoop-2.7.7/",
            None,
            time_out=5 * 1000,
            sleep_inter=100)
        self._test_mkdirs(fs)
        self._test_list_dir(fs)
        self._test_try_upload(fs)
        self._test_try_download(fs)

        self._test_upload(fs)
        self._test_download(fs)
Example #8
 def test_exists(self):
     fs = HDFSClient("/usr/local/hadoop-2.7.7/",
                     None,
                     time_out=6 * 1000,
                     sleep_inter=100)
     self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
     self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
     self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))
     dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs1.py"))
     self.assertTrue(dirs == [])
     self.assertTrue(len(files) == 1)
     dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))
Example #9
    def test_timeout(self):
        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
                        None,
                        time_out=6 * 1000,
                        sleep_inter=100)
        src = "hdfs_test_timeout"
        dst = "new_hdfs_test_timeout"
        fs.delete(dst)
        fs.mkdirs(src)
        fs.mkdirs(dst)
        fs.mkdirs(dst + "/" + src)
        output = ""
        cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
        try:
            fs.mv(src, dst, test_exists=False)
            # mv into an existing dst/src must not succeed; fail if it does
            self.fail("mv should have timed out, cmd:{} output:{}".format(
                cmd, output))
        except FSTimeOut as e:
            print("execute mv {} to {} timed out".format(src, dst))

        ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
        self.assertNotEqual(ret, 0)
        print("second mv ret:{} output:{}".format(ret, output))
Example #10
    def test_is_dir(self):
        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
                        None,
                        time_out=6 * 1000,
                        sleep_inter=100)
        self.assertFalse(fs.is_dir("./test_hdfs.py"))
        s = """
java.io.IOException: Input/output error
 responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error
	at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164)
	at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118)
	at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696)
	at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297)
	at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514)
	at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092)
	at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285)
	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
	at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353)
        """

        print("split lines:", s.splitlines())
        self.assertIsNotNone(fs._test_match(s.splitlines()))
Example #11
    def test_multiple(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_multiple")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        exe, main_prog1, startup_prog1 = self._generate()
        _, main_prog2, startup_prog2 = self._generate()

        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
            self._init_env(exe, main_prog1, startup_prog1)

        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
            self._init_env(exe, main_prog2, startup_prog2)

        o = None
        epochs = []
        for i in acp.train_epoch_range(3, 0):
            for data in data_loader1():
                fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])

            for data in data_loader2():
                fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])

            o = acp._get_train_epoch_range()
            self.assertEqual(len(o._exe_status), 2)
            print(o._exe_status)
            epochs.append(i)

        o = acp._get_train_epoch_range()
        self.assertIsNone(o, "train epoch range must not exist now")
        self.assertEqual(i, 2)
        self.assertEqual(epochs, [0, 1, 2])

        fs.delete(save_dir)
        logger.info("end test_multiple")
Example #12
    def _test_corner_epoch_no(self, break_epoch_no):
        logger.info("begin test_corener_epoch_no")
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()
        self._run_save_0(break_epoch_no=break_epoch_no)
        self._reset_generator()
        self._run_load_0(break_epoch_no=break_epoch_no)

        fs.delete(checker.hdfs_checkpoint_path)
        logger.info("end test_corener_epoch_no")
Example #13
    def test(self):
        fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
        dir_path = "./checkpointsaver_test"
        fs.delete(dir_path)

        s = CheckpointSaver(fs)

        fs.mkdirs("{}/exe.exe".format(dir_path))
        fs.mkdirs("{}/exe.1".format(dir_path))
        fs.mkdirs("{}/exe".format(dir_path))

        a = s.get_checkpoint_no(dir_path)
        self.assertEqual(len(a), 0)

        fs.mkdirs("{}/__paddle_checkpoint__.0".format(dir_path))
        fs.mkdirs("{}/__paddle_checkpoint__.exe".format(dir_path))

        a = s.get_checkpoint_no(dir_path)
        self.assertEqual(len(a), 1)

        s.clean_redundant_checkpoints(dir_path)
        s.clean_redundant_checkpoints(dir_path)

        fs.delete(dir_path)
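The test above pins down the naming contract: only directories named `__paddle_checkpoint__.<number>` count as checkpoints, so `get_checkpoint_no` ignores `exe.exe`, `exe.1` and `__paddle_checkpoint__.exe`. A sketch of the save/load round trip, using the same CheckpointSaver calls that TrainEpochRange makes in Example #15 (`fs` is any client from these examples and `obj` stands for a serializable object such as a PaddleModel):

    saver = CheckpointSaver(fs)
    # save returns where the checkpoint landed and its number
    path, checkpoint_no = saver.save_checkpoint("./cp_dir", [obj], 0)
    # load restores obj from that numbered checkpoint
    saver.load_checkpoint("./cp_dir", [obj], 0, checkpoint_no=checkpoint_no)
    # drop older __paddle_checkpoint__.* directories
    saver.clean_redundant_checkpoints("./cp_dir")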
Example #14
 def test_config(self):
     config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
     fs = HDFSClient("/usr/local/hadoop-2.7.7/",
                     config,
                     time_out=6 * 1000,
                     sleep_inter=100)
Example #15
class TrainEpochRange(SerializableBase):
    def __init__(self,
                 max_epoch_num,
                 name,
                 checkpoint_inter=None,
                 restored=True):
        self._max_epoch_num = max_epoch_num
        self._epoch_no = -1  # current epoch_no
        self._name = name
        self._restored_from = None
        self._exe_status = {}
        self._flag_generated = False

        self._checker = g_checker
        if checkpoint_inter is not None:
            self._save_checkpoint_inter = checkpoint_inter
        else:
            self._save_checkpoint_inter = self._checker.save_checkpoint_inter
        assert self._save_checkpoint_inter >= 0, "checkpoint_inter:{} must be >= 0".format(
            self._save_checkpoint_inter)
        self._last_checkpoint_time = time.time()

        self._load_cp_nos = None
        self._checkpoint_epoch_no = None

        if not self._checker.valid():
            return

        self._file_name = "range_train_status"

        if not restored:
            return

        self._checkpoint_path = self._checker.get_range_checkpoint_path(name)

        config = {
            "fs.default.name": self._checker.hdfs_name,
            "hadoop.job.ugi": self._checker.hdfs_ugi
        }

        if self._checker.ce_test:
            config = None

        from paddle.distributed.fleet.utils.fs import HDFSClient
        self._hdfs = HDFSClient(self._checker.hdfs_home, config)

        self._cper = CheckpointSaver(self._hdfs)

        _thread_checker()

        self._get_last_valid_checkpoint()

    def _look_for_valid(self, cp_nos):
        cps = []
        epoch_no = -1
        for i in cp_nos[::-1]:
            t = TrainEpochRange(self._max_epoch_num, self.name, restored=False)
            self._cper.load_checkpoint(
                self._checkpoint_path, [t],
                self._checker.trainer_id,
                checkpoint_no=i,
                local_cache_path=self._checker._fs_cache)
            cps.append(t)
            logger.debug("look for valid:{} t:{}".format(i, t._serialize()))
            if epoch_no < 0:
                epoch_no = t._epoch_no
            else:
                if epoch_no - t._epoch_no >= 1:
                    return t, i
        return None, None

    def _get_last_valid_checkpoint(self):
        self._load_cp_nos = self._cper.get_checkpoint_no(self._checkpoint_path)
        logger.info("find checkpoint nos:{}".format(self._load_cp_nos))

        if len(self._load_cp_nos) < 1:
            self._restored_from = CONST_MEMORYINIT
            return

        if g_acp_type == CONST_ACP_TYPE:
            # get the last one
            self._cper.load_checkpoint(
                self._checkpoint_path, [self],
                self._checker.trainer_id,
                local_cache_path=self._checker._fs_cache)
            self._restored_from = CONST_CHECKPOINT
            self._checkpoint_epoch_no = self._epoch_no

            logger.info("load tain_epoch_range checkpoint:{}".format(
                self._serialize()))

        elif g_acp_type == CONST_DACP_TYPE:
            t, i = self._look_for_valid(self._load_cp_nos)
            if t is None:
                self._restored_from = CONST_MEMORYINIT
                return

            self._cper.load_checkpoint(
                self._checkpoint_path, [self],
                self._checker.trainer_id,
                checkpoint_no=i,
                local_cache_path=self._checker._fs_cache)

            self._restored_from = CONST_CHECKPOINT
            self._checkpoint_epoch_no = self._epoch_no
            logger.info("load tain_epoch_range checkpoint:{}".format(
                self._serialize()))
        else:
            assert False, "not supported acp_type:{}".format(g_acp_type)

    def _to_dict(self):
        d = {
            "max_epoch_num": self._max_epoch_num,
            "epoch_no": self._epoch_no,
            "name": self._name,
            "checkpoint_path": self._checkpoint_path,
            "restored_from": self._restored_from,
            "checkpoint_epoch_no": self._checkpoint_epoch_no
        }
        return d

    def __str__(self):
        return self._serialize([])

    @property
    def name(self):
        return self._name

    def serialize(self, path):
        file_name = "{}/{}".format(path, self._file_name)
        with open(file_name, 'w') as f:
            s = self._serialize()
            f.write(s)

    def _serialize(self, pop_keys=["restored_from", "checkpoint_epoch_no"]):
        # self
        d = self._to_dict()
        for k in pop_keys:
            d.pop(k, None)

        # registered exes
        d["exe_status"] = {}
        e = d["exe_status"]
        for k, t in six.iteritems(self._exe_status):
            e[t._key] = t._serialize()
        return json.dumps(d)

    @property
    def restored_from(self):
        return self._restored_from

    def deserialize(self, path):
        d = None
        file_name = "{}/{}".format(path, self._file_name)
        with open(file_name, 'r') as f:
            d = json.load(f)

        # self
        self._max_epoch_num = d["max_epoch_num"]
        self._epoch_no = d["epoch_no"]
        self._name = d["name"]
        self._checkpoint_path = d["checkpoint_path"]

        # exes status
        e = d["exe_status"]
        for k, v in six.iteritems(e):
            t = ExeTrainStatus()
            t._deserialize(v)
            self._exe_status[k] = t

    def next(self):
        _thread_checker()

        if self._max_epoch_num < 0:
            self._max_epoch_num = sys.maxsize  # sys.maxint no longer exists in Python 3

        assert self._epoch_no >= -1, "self._epoch_no:{} must be >= -1".format(
            self._epoch_no)

        self._last_checkpoint_time = time.time()
        start = self._epoch_no + 1
        logger.info("started epoch_no:{} max_epoch_num:{}".format(
            start, self._max_epoch_num))

        for i in range(start, self._max_epoch_num):
            self._epoch_no = i
            yield i

            self.save_checkpoint()

    def get(self):
        return self._epoch_no

    def save_checkpoint(self):
        # do not save the last one because exe and program can't be restored.
        if self._checker.trainer_id == 0:

            if time.time() - self._last_checkpoint_time >= \
                    self._save_checkpoint_inter:
                if g_acp_type == CONST_ACP_TYPE:
                    # do not save the last one
                    if self._max_epoch_num > 0 and self._epoch_no != self._max_epoch_num - 1:
                        self._save_checkpoint()
                elif g_acp_type == CONST_DACP_TYPE:
                    self._save_checkpoint()
                else:
                    assert False, "not supported acp_type:{}".format(
                        g_acp_type)
            self._last_checkpoint_time = time.time()

    def _save_checkpoint(self):
        """
        status => /jobid/xxx_range_xx/range/
        model  => /jobid/xxx_range_xx/exe/
        """
        if not self._checker.valid():
            return

        e = self._exe_status
        for k, t in six.iteritems(self._exe_status):
            m = PaddleModel(t._exe, t._program)
            p = self._checker.get_exe_checkpoint_path(t._hash_key)
            t._epoch_no = self.get()
            path, checkpoint_no = self._cper.save_checkpoint(
                p, [m],
                self._checker.trainer_id,
                local_cache_path=self._checker._fs_cache)
            # index info
            t._checkpoint_path = path
            t._checkpoint_no = checkpoint_no

            e[t._key] = t

            logger.debug("save executor checkpoint:{}".format(t._serialize()))

        if len(self._exe_status) > 0:
            self._cper.save_checkpoint(
                self._checkpoint_path, [self],
                local_cache_path=self._checker._fs_cache)
            logger.info("save train_epoch_range checkpoint:{}".format(
                self._serialize()))

            self._generate_flag()

    def _generate_flag(self):
        if self._flag_generated:
            return

        name = "can_be_auto_checkpoint.flag"
        path = self._checker.get_job_path() + "/" + name
        logger.info("this job can_be_auto_checkpoint")
        self._hdfs.mkdirs(self._checker.get_job_path())
        self._hdfs.touch(path, exist_ok=True)

        self._flag_generated = True
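TrainEpochRange is what backs `acp.train_epoch_range` in the tests above: `next()` resumes from the restored `epoch_no + 1`, yields each epoch number, and calls `save_checkpoint()` after each yielded epoch; the second argument of `train_epoch_range` is the checkpoint interval in seconds, so 0 makes every epoch eligible. A sketch of the resulting loop, assuming the same `exe`, `compiled`, `data_loader` and `loss` as in Examples #3 and #11:

    for epoch_no in acp.train_epoch_range(3, 0):
        # the body runs once per epoch; a checkpoint is saved after each yield
        for data in data_loader():
            exe.run(compiled, feed=data, fetch_list=[loss])

    # once the range is exhausted the global range object is cleared
    assert acp._get_train_epoch_range() is None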
Example #16
 def test_hdfs_checkpoint(self):
     fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
     dir_path = "./checkpoint_test_hdfs"
     self._test_checkpoint(fs, os.path.abspath(dir_path))
Example #17
 def _test_list_dir(self, fs):
     fs = HDFSClient("/usr/local/hadoop-2.7.7/",
                     None,
                     time_out=15 * 1000,
                     sleep_inter=100)
     fs.ls_dir("test_not_exists")