Code Example #1
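A smoke test for the normal auto-checkpoint path: it clears the checkpoint directory on HDFS, clears the auto-checkpoint environment variables, resets the name generator, runs a plain training pass, and restores the environment variables afterwards.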
    def test_normal(self):
        logger.info("begin test_normal")
        checker = acp._get_checker()

        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._clear_envs()
        self._reset_generator()
        self._run_normal()
        self._readd_envs()
        logger.info("end test_normal")
Code Example #2
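A corner-case helper parameterized by break_epoch_no: against a clean HDFS checkpoint path, it runs a save pass that breaks at the given epoch, then a load pass that resumes from it, and deletes the checkpoint path when done.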
    def _test_corner_epoch_no(self, break_epoch_no):
        logger.info("begin test_corener_epoch_no")
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()
        self._run_save_0(break_epoch_no=break_epoch_no)
        self._reset_generator()
        self._run_load_0(break_epoch_no=break_epoch_no)

        fs.delete(checker.hdfs_checkpoint_path)
        logger.info("end test_corener_epoch_no")
Code Example #3
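Checks the checker's basic configuration (run environment, platform, and save interval), then exercises a full save/load round trip against a freshly cleared HDFS checkpoint path.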
    def test_basic(self):
        logger.info("begin test_basic")
        checker = acp._get_checker()
        self.assertEqual(checker.run_env, "PADDLE_EDL_AUTO_CHECKPOINT")
        self.assertEqual(checker.platform, "PADDLE_CLOUD")
        self.assertEqual(checker.save_checkpoint_inter, 0)
        print(checker)

        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()
        self._run_save_0()

        self._reset_generator()
        self._run_load_0()

        logger.info("end test_basic")
Code Example #4
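Exercises HDFSClient timeout handling with a 6-second limit: the test pre-creates dst/src so the move cannot succeed, and fs.mv is expected to raise FSTimeOut; the same shell command is then run directly through fluid.core.shell_execute_cmd and must return a non-zero exit code.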
    def test_timeout(self):
        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
                        None,
                        time_out=6 * 1000,
                        sleep_inter=100)
        src = "hdfs_test_timeout"
        dst = "new_hdfs_test_timeout"
        fs.delete(dst)
        fs.mkdirs(src)
        fs.mkdirs(dst)
        fs.mkdirs(dst + "/" + src)
        output = ""
        cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
        try:
            fs.mv(src, dst, test_exists=False)
            # the mv should time out; reaching this assert means it did not
            self.assertFalse(
                1, "can't execute cmd:{} output:{}".format(cmd, output))
        except FSTimeOut:
            print("execute mv {} to {} timeout".format(src, dst))

        ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
        self.assertNotEqual(ret, 0)
        print("second mv ret:{} output:{}".format(ret, output))
Code Example #5
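Tests CheckpointSaver bookkeeping: directories that do not follow the checkpoint naming scheme are ignored by get_checkpoint_no, only entries with a numeric __paddle_checkpoint__.N suffix are counted, and clean_redundant_checkpoints can be called repeatedly without error.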
    def test(self):
        fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
        dir_path = "./checkpointsaver_test"
        fs.delete(dir_path)

        s = CheckpointSaver(fs)

        fs.mkdirs("{}/exe.exe".format(dir_path))
        fs.mkdirs("{}/exe.1".format(dir_path))
        fs.mkdirs("{}/exe".format(dir_path))

        a = s.get_checkpoint_no(dir_path)
        self.assertEqual(len(a), 0)

        fs.mkdirs("{}/__paddle_checkpoint__.0".format(dir_path))
        fs.mkdirs("{}/__paddle_checkpoint__.exe".format(dir_path))

        a = s.get_checkpoint_no(dir_path)
        self.assertEqual(len(a), 1)

        s.clean_redundant_checkpoints(dir_path)
        s.clean_redundant_checkpoints(dir_path)

        fs.delete(dir_path)
Code Example #6
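Drives the auto-checkpoint epoch loop under fleet collective training with a single local trainer: the trainer role is set through environment variables, the optimizer is wrapped by fleet.distributed_optimizer, and acp.train_epoch_range(3, 0) is expected to run exactly three epochs and leave no active epoch range afterwards.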
    def test_distributed_basic(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_distributed_basic")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        # basic setup: build the program; minimize is deferred to the fleet optimizer below
        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog, minimize=False)

        # fleet: configure a single-trainer collective role
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        with fluid.program_guard(main_prog, startup_prog):
            dist_optimizer = fleet.distributed_optimizer(optimizer)
            dist_optimizer.minimize(loss)

        exe.run(startup_prog)

        o = None
        i = 0
        name = None
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()
            name = o.name
            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

            for data in data_loader():
                fetch = exe.run(fleet.main_program,
                                feed=data,
                                fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

        o = acp._get_train_epoch_range()
        assert o is None, "train epoch range must not exist now"
        self.assertEqual(i, 2)

        fs.delete(save_dir)

        logger.info("end test_distributed_basic")
Code Example #7
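Runs two independent programs inside a single acp.train_epoch_range loop and verifies that the epoch range tracks both executors (two entries in _exe_status) while still iterating epochs 0 through 2 exactly once each.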
    def test_multiple(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_multiple")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        exe, main_prog1, startup_prog1 = self._generate()
        _, main_prog2, startup_prog2 = self._generate()

        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
            self._init_env(exe, main_prog1, startup_prog1)

        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
            self._init_env(exe, main_prog2, startup_prog2)

        o = None
        epochs = []
        for i in acp.train_epoch_range(3, 0):
            for data in data_loader1():
                fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])

            for data in data_loader2():
                fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])

            o = acp._get_train_epoch_range()
            self.assertEqual(len(o._exe_status), 2)
            print(o._exe_status)
            epochs.append(i)

        o = acp._get_train_epoch_range()
        self.assertTrue(o is None, "train epoch range must not exist now")
        self.assertEqual(i, 2)
        self.assertEqual(epochs, [0, 1, 2])

        fs.delete(save_dir)
        logger.info("end test_multiple")