def _test_upload(self, fs):
        """Check that ``fs.upload`` copies a local file and rejects bad requests.

        Three cases are covered: uploading a local file that does not exist
        must raise ``FSFileNotExistsError``, a first upload must succeed, and
        uploading onto the now-existing destination must raise
        ``FSFileExistsError``.

        Args:
            fs: file-system client under test; ``fs.need_upload_download()``
                is required to be true.
        """
        src_file = os.path.abspath("./test_upload.src")
        dst_file = os.path.abspath("./test_uolpad.dst")
        local = LocalFS()

        try:
            # The source has not been created yet, so the upload is refused.
            with self.assertRaises(FSFileNotExistsError):
                fs.upload(src_file, dst_file)

            local.touch(src_file)
            fs.delete(dst_file)

            self.assertTrue(fs.need_upload_download())

            fs.upload(src_file, dst_file)
            # The destination exists now, so a second upload is refused.
            with self.assertRaises(FSFileExistsError):
                fs.upload(src_file, dst_file)

            self.assertTrue(fs.is_exist(dst_file))
        finally:
            # Clean up even when an assertion above fails. src_file was
            # created through LocalFS, so it is removed the same way.
            fs.delete(dst_file)
            local.delete(src_file)
Beispiel #2
0
    def upload(self, local_path, fs_path):
        """Upload ``local_path`` to ``fs_path`` on this file system.

        Args:
            local_path: path of the local source; it must exist.
            fs_path: destination path on this file system; it must not exist.

        Returns:
            Whatever ``self._try_upload(local_path, fs_path)`` returns.

        Raises:
            FSFileExistsError: ``fs_path`` already exists (checked first).
            FSFileNotExistsError: ``local_path`` does not exist locally.
        """
        destination_taken = self.is_exist(fs_path)
        if destination_taken:
            raise FSFileExistsError("{} exists".format(fs_path))

        if LocalFS().is_exist(local_path):
            return self._try_upload(local_path, fs_path)

        raise FSFileNotExistsError("{} not exists".format(local_path))
Beispiel #3
0
 def _try_download(self, fs_path, local_path):
     """Run the ``get`` command to fetch ``fs_path`` into ``local_path``.

     A non-zero status from ``self._run_cmd`` is turned into
     ``ExecuteError``. On any failure ``local_path`` is deleted through
     ``LocalFS`` before the exception is re-raised.
     """
     get_cmd = "get {} {}".format(fs_path, local_path)
     try:
         status, _ = self._run_cmd(get_cmd)
         if status != 0:
             raise ExecuteError(get_cmd)
     except Exception as err:
         # Remove whatever may have been written locally before failing.
         LocalFS().delete(local_path)
         raise err
    def load_checkpoint(self,
                        path,
                        slists,
                        trainer_id,
                        local_cache_path=".cache",
                        checkpoint_no=None,
                        ignore_empty=True):
        """
        Deserialize objects in slists from a checkpoint stored under path.

        Args:
            path: directory that holds the checkpoints.
            slists: iterable of objects exposing ``deserialize(load_path)``.
            trainer_id: when not None, appended to the local cache name.
            local_cache_path: local directory used to stage the checkpoint
                when the file system needs upload/download.
            checkpoint_no: number of the checkpoint to load; when None the
                value of ``self._get_last_checkpoint_no(path)`` is used.
            ignore_empty: when False, a missing checkpoint fails an assertion
                instead of returning None.

        Returns:
            The checkpoint path on the file system
            (``"<path>/<prefix>.<checkpoint_no>"``), or None when
            checkpoint_no is None and no checkpoint was found.

        Raises:
            AssertionError: no checkpoint found while ignore_empty is False,
                or checkpoint_no is not a non-negative int.
        """
        if checkpoint_no is None:
            max_no = self._get_last_checkpoint_no(path)

            if not ignore_empty:
                assert max_no >= 0, "Can't find checkpoint"

            if max_no < 0:
                return None

            checkpoint_no = max_no
        else:
            assert isinstance(checkpoint_no, int)
            assert checkpoint_no >= 0

        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix,
                                      checkpoint_no)

        # Ask the file system once so that staging, loading and clean-up can
        # never disagree about whether a local cache is in use.
        if not self._fs.need_upload_download():
            for s in slists:
                s.deserialize(real_path)
            return real_path

        from paddle.distributed.fleet.utils.fs import LocalFS
        local_fs = LocalFS()

        cache_path = "{}/{}.{}.load_cache".format(local_cache_path,
                                                  self._checkpoint_prefix,
                                                  checkpoint_no)
        if trainer_id is not None:
            cache_path = "{}.{}".format(cache_path, trainer_id)

        if not local_fs.is_exist(local_cache_path):
            local_fs.mkdirs(local_cache_path)
        # Drop a stale cache entry left over from an earlier load.
        if local_fs.is_exist(cache_path):
            local_fs.delete(cache_path)

        self._fs.download(real_path, cache_path)
        try:
            for s in slists:
                s.deserialize(cache_path)
        finally:
            # Do not leave the staged copy behind, even if deserialize fails.
            local_fs.delete(cache_path)

        return real_path
Beispiel #5
0
    def test_local(self):
        """Run the rm, touch, dirs and touch-file checks on a ``LocalFS``."""
        local_fs = LocalFS()

        self._test_rm(local_fs)
        self._test_touch(local_fs)
        self._test_dirs(local_fs)
        self._test_touch_file(local_fs)
    def _test_download_dir(self, fs):
        """Check that ``fs.download`` brings a directory to the local disk.

        A source directory holding two files is created through ``fs`` and
        downloaded to a local destination, which must exist afterwards.

        Args:
            fs: file-system client under test.
        """
        src_file = os.path.abspath("./test_download_dir_src")
        dst_file = os.path.abspath("./test_download_dir_dst")
        file1 = os.path.abspath("./test_download_dir_src/file1")
        file2 = os.path.abspath("./test_download_dir_src/file2")

        local = LocalFS()
        fs.delete(dst_file)
        # The destination lives on the local disk, so clear it there as well.
        local.delete(dst_file)
        fs.delete(src_file)

        fs.mkdirs(src_file)
        fs.touch(file1)
        fs.touch(file2)

        fs.download(src_file, dst_file)
        # Verify the download target; checking file1 would only re-confirm
        # the source that was created just above.
        self.assertTrue(local.is_exist(dst_file))
        local.delete(dst_file)
        fs.delete(src_file)
    def save_checkpoint(self,
                        path,
                        slists,
                        trainer_id=None,
                        local_cache_path=".cache"):
        """
        Serialize every object in slists as the next checkpoint under path.

        The data is first written to a temporary location (a local cache
        directory when the file system needs upload/download, otherwise
        "<real_path>.tmp") and then moved to its final name.

        Returns the final checkpoint path and its checkpoint number.
        """
        if self._fs.is_exist(path):
            assert self._fs.is_dir(path), "path:{} must be a directory".format(
                path)
        else:
            self._fs.mkdirs(path)

        # Next number after the latest checkpoint; numbering starts at 0.
        last_no = self._get_last_checkpoint_no(path)
        checkpoint_no = last_no + 1 if last_no >= 0 else 0

        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix,
                                      checkpoint_no)
        tmp_path = "{}.tmp".format(real_path)

        from paddle.distributed.fleet.utils.fs import LocalFS
        local_fs = LocalFS()

        cache_path = None
        saved_path = tmp_path
        if self._fs.need_upload_download():
            cache_path = "{}/{}.{}.saved_cache".format(local_cache_path,
                                                       self._checkpoint_prefix,
                                                       checkpoint_no)
            if trainer_id is not None:
                cache_path = "{}.{}".format(cache_path, trainer_id)

            if local_fs.is_exist(cache_path):
                assert local_fs.is_dir(cache_path), \
                    "cache path:{} must be a directory".format(cache_path)
            else:
                local_fs.mkdirs(cache_path)

            # Serialize locally first; the result is uploaded afterwards.
            saved_path = cache_path

        for serializable in slists:
            serializable.serialize(saved_path)

        if self._fs.need_upload_download():
            self._fs.delete(tmp_path)
            self._fs.upload(cache_path, tmp_path)
            local_fs.delete(cache_path)
        # Publish the checkpoint under its final name.
        self._fs.mv(tmp_path, real_path)

        return real_path, checkpoint_no
Beispiel #8
0
    def test_distributed_basic(self):
        """Train three epochs under ``acp.train_epoch_range`` with fleet.

        The program is minimized through a fleet distributed optimizer in a
        single-trainer collective setup. Each epoch must record exactly one
        executor status, and no train epoch range may remain once the loop
        has finished.
        """
        checker = acp._get_checker()
        hdfs = HDFSClient(checker.hdfs_home, None)
        hdfs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_distributed_basic")
        local_fs = LocalFS()
        save_dir = "./run_save_0"
        local_fs.delete(save_dir)

        # basic
        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog, minimize=False)

        # fleet
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        with fluid.program_guard(main_prog, startup_prog):
            dist_optimizer = fleet.distributed_optimizer(optimizer)
            dist_optimizer.minimize(loss)

        exe.run(startup_prog)

        # Pre-bind i so the check after the loop fails cleanly (instead of
        # raising NameError) if the range yields nothing.
        i = 0
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()
            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

            for data in data_loader():
                exe.run(fleet.main_program, feed=data, fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

        # unittest assertion: unlike a bare assert it is not stripped by -O.
        self.assertIsNone(acp._get_train_epoch_range(),
                          "now train epoch must not exits now")
        self.assertEqual(i, 2)

        local_fs.delete(save_dir)

        logger.info("end test_distributed_basic")
Beispiel #9
0
    def _run_normal(self):
        """Train without auto checkpoint and round-trip a ``PaddleModel``.

        While training three plain epochs, no train epoch range and no acp
        type may be set. Afterwards the model is serialized to a local
        directory and deserialized again.
        """
        exe, main_prog, startup_prog = self._generate()

        local_fs = LocalFS()
        model_dir = "./run_save_model"

        local_fs.delete(model_dir)
        logger.info("begin _run_normal")

        compiled, data_loader, optimizer, loss, image, label = self._init_env(
            exe, main_prog, startup_prog)

        for _ in range(3):
            self.assertEqual(acp._get_train_epoch_range(), None)
            self.assertEqual(acp.g_acp_type, None)
            for batch in data_loader():
                self.assertEqual(acp.g_acp_type, None)
                self.assertEqual(acp._get_train_epoch_range(), None)
                exe.run(compiled, feed=batch, fetch_list=[loss])

        self.assertEqual(acp.g_acp_type, None)
        self.assertEqual(acp._get_train_epoch_range(), None)

        # Save through one model wrapper, load through a fresh one.
        writer = PaddleModel(exe, compiled)
        writer.serialize(model_dir)

        reader = PaddleModel(exe, compiled)
        reader.deserialize(model_dir)

        logger.info("end _run_normal")
        local_fs.delete(model_dir)
    def _test_upload_dir(self, fs):
        """Check that ``fs.upload`` can upload a whole local directory.

        A local directory with two files is created, uploaded through ``fs``,
        and the destination must exist afterwards.

        Args:
            fs: file-system client under test.
        """
        # upload dir
        src_file = os.path.abspath("./test_upload_dir")
        dst_file = os.path.abspath("./test_uolpad_dir")
        file1 = os.path.abspath("./test_upload_dir/file1")
        file2 = os.path.abspath("./test_upload_dir/file2")

        local = LocalFS()
        local.mkdirs(src_file)
        local.touch(file1)
        local.touch(file2)

        try:
            # A destination left over from an earlier run would make the
            # upload fail for a reason unrelated to what is tested here.
            fs.delete(dst_file)

            fs.upload(src_file, dst_file)

            self.assertTrue(fs.is_exist(dst_file))
        finally:
            # Always remove both sides so later runs start clean.
            fs.delete(dst_file)
            local.delete(src_file)
Beispiel #11
0
    def test_multiple(self):
        """Train two programs inside a single ``acp.train_epoch_range``.

        Each epoch must record two executor statuses, the range must yield
        epochs 0, 1 and 2, and no train epoch range may remain afterwards.
        """
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_multiple")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        exe, main_prog1, startup_prog1 = self._generate()
        _, main_prog2, startup_prog2 = self._generate()

        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
            self._init_env(exe, main_prog1, startup_prog1)

        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
            self._init_env(exe, main_prog2, startup_prog2)

        seen_epochs = []
        for i in acp.train_epoch_range(3, 0):
            # Run one pass of each program within the same epoch.
            for batch in data_loader1():
                exe.run(compiled1, feed=batch, fetch_list=[loss1])

            for batch in data_loader2():
                exe.run(compiled2, feed=batch, fetch_list=[loss2])

            epoch_range = acp._get_train_epoch_range()
            self.assertEqual(len(epoch_range._exe_status), 2)
            print(epoch_range._exe_status)
            seen_epochs.append(i)

        remaining = acp._get_train_epoch_range()
        self.assertTrue(remaining == None,
                        "now train epoch must not exits now")
        self.assertEqual(i, 2)
        self.assertEqual(seen_epochs, [0, 1, 2])

        fs.delete(save_dir)
        logger.info("end test_multiple")
Beispiel #12
0
    def _run_load_0(self, break_epoch_no=None):
        """Resume training and check which epochs are replayed.

        Args:
            break_epoch_no: epoch at which the earlier saving run stopped.
                0, 1 and 2 expect the epochs [0, 1, 2], [1, 2] and [2] to
                run; None expects [2]; any other value skips the epoch-list
                check.
        """
        logger.info("begin _run_load_0")
        exe, main_prog, startup_prog = self._generate()

        fs = LocalFS()
        save_dir = "./run_load_0"
        fs.delete(save_dir)

        compiled, data_loader, optimizer, loss, image, label = self._init_env(
            exe, main_prog, startup_prog)

        # Pre-bind i so the check after the loop fails cleanly (instead of
        # raising NameError) if the range yields nothing.
        i = 0

        epochs = []
        for i in acp.train_epoch_range(3, 0):
            epochs.append(i)

            for data in data_loader():
                exe.run(compiled, feed=data, fetch_list=[loss])

        o = acp._get_train_epoch_range()
        self.assertTrue(o == None, "now train epoch must not exits now")
        self.assertEqual(i, 2)

        if break_epoch_no is None:
            self.assertEqual(epochs, [2])
        else:
            expected_by_break = {0: [0, 1, 2], 1: [1, 2], 2: [2]}
            expected = expected_by_break.get(break_epoch_no)
            if expected is not None:
                self.assertEqual(epochs, expected)

        fs.delete(save_dir)
        # The closing message used to repeat "begin".
        logger.info("end _run_load_0")
Beispiel #13
0
    def _run_save_0(self, break_epoch_no=None):
        """Train under ``acp.train_epoch_range``, optionally stopping early.

        Args:
            break_epoch_no: when not None, leave the epoch loop right after
                this epoch has been trained.
        """
        logger.info("begin _run_save_0")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog)

        # Pre-bind i so the checks after the loop fail cleanly (instead of
        # raising NameError) if the range yields nothing.
        i = 0
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()

            for data in data_loader():
                exe.run(compiled, feed=data, fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

            if break_epoch_no is not None and i == break_epoch_no:
                break

        # unittest assertion: unlike a bare assert it is not stripped by -O.
        self.assertIsNone(acp._get_train_epoch_range(),
                          "now train epoch must not exits now")
        if break_epoch_no is None:
            self.assertEqual(i, 2)
        else:
            self.assertEqual(i, break_epoch_no)

        fs.delete(save_dir)
        logger.info("end _run_save_0")
    def _test_checkpoint(self, fs, dir_path):
        """Exercise fleet checkpoint save/load round trips on ``fs``.

        Saves a checkpoint, loads it back and compares the train status,
        saves again with ``remain_all_checkpoint=False`` and checks that only
        one checkpoint is left, then checks that saving/loading with a plain
        file passed as ``cache_path`` raises.

        Args:
            fs: file-system client the checkpoints are written to.
            dir_path: checkpoint directory on ``fs``.
        """
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        feeder = fluid.DataFeeder(feed_list=[image, label],
                                  place=fluid.CPUPlace())
        predict = fluid.layers.fc(input=image, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=predict, label=label)
        avg_loss = fluid.layers.mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(avg_loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        status = ExeTrainStatus()
        status.epoch_no = 2
        _, n1 = fleet.save_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status,
                                      fs=fs)

        status2 = ExeTrainStatus()
        fleet.load_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              fs=fs,
                              train_status=status2)
        self.assertEqual(status2, status)

        _, n2 = fleet.save_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status,
                                      fs=fs,
                                      remain_all_checkpoint=False)
        self.assertEqual(n2, n1 + 1)

        c = CheckpointSaver(fs)
        cp_nos = c.get_checkpoint_no(dir_path)
        self.assertEqual(len(cp_nos), 1)  # cleanup all others

        # abnormal cases
        # test remain_all_checkpoint
        fleet.save_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              train_status=status,
                              fs=fs,
                              remain_all_checkpoint=False)

        local_fs = LocalFS()
        cache_path = "./.load_cache"
        local_fs.touch(cache_path)
        try:
            # The previous bare `except: pass` also swallowed the
            # AssertionError raised by `self.assertFalse(True)`, so these
            # checks could never fail. assertRaises makes them effective.
            # NOTE(review): the concrete exception type is not visible from
            # here, hence the broad `Exception` - narrow it if known.

            # can't save under a file
            with self.assertRaises(Exception):
                fleet.save_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status,
                                      fs=local_fs,
                                      cache_path=cache_path)

            # can't load under a file
            with self.assertRaises(Exception):
                fleet.load_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status2,
                                      fs=local_fs,
                                      cache_path=cache_path)
        finally:
            # Remove the placeholder file even when a check above fails.
            local_fs.delete(cache_path)
 def test_local_checkpoint(self):
     """Run the shared checkpoint scenario against a ``LocalFS``."""
     self._test_checkpoint(LocalFS(), "./checkpoint_test_local")
Beispiel #16
0
 def test_local(self):
     """Run the mkdirs, list-dir, upload and download checks on a ``LocalFS``."""
     local_fs = LocalFS()

     self._test_mkdirs(local_fs)
     self._test_list_dir(local_fs)
     self._test_try_upload(local_fs)
     self._test_try_download(local_fs)