Example #1
0
    def test_mlp_mp(self):
        global _global_parallel_strategy
        _global_parallel_strategy = "mp"
        global _global_process_mesh
        _global_process_mesh = auto.ProcessMesh([0, 1])

        dist_main_prog, dist_start_prog, loss = get_distributed_program()

        place = paddle.set_device("gpu")
        exe = paddle.static.Executor(place)
        exe.run(dist_start_prog)

        input = np.random.random(size=(80, 64)).astype('float32')
        label = np.random.random(size=(80, 1)).astype('float32')
        for step in range(20):
            if step == 10:
                path = "./output_mp{}".format(paddle.distributed.get_rank())
                os.makedirs(path, exist_ok=True)
                save_distributed_checkpoint(dist_main_prog, path, path)

            res = exe.run(dist_main_prog,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          },
                          fetch_list=[loss])

        last_res = res[0]
        ckpt_path = [
            "./output_mp0/model_state_rank0.pdmodel",
            "./output_mp1/model_state_rank1.pdmodel"
        ]
        dist_attr_path = [
            "./output_mp0/dist_attr_rank0.pdattr",
            "./output_mp1/dist_attr_rank1.pdattr"
        ]
        load_checkpoint_into_program(ckpt_path, dist_attr_path, dist_main_prog)
        for step in range(10, 20):
            res = exe.run(dist_main_prog,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          },
                          fetch_list=[loss])

        self.assertEqual(last_res, res[0])
        shutil.rmtree("./output_mp{}".format(paddle.distributed.get_rank()))
Example #2
0
 def test_input_invalid(self):
     set_default_distributed_context(None)
     global _global_parallel_strategy
     _global_parallel_strategy = "mp"
     global _global_process_mesh
     _global_process_mesh = auto.ProcessMesh([0, 1])
     dist_main_prog, _, _ = get_distributed_program()
     with self.assertRaises(TypeError):
         save_distributed_checkpoint(dist_main_prog, [""], [""],
                                     addition_info=[0])
     with self.assertRaises(ValueError):
         save_distributed_checkpoint(dist_main_prog, [""], [""],
                                     addition_info={"step": 0})
     with self.assertRaises(ValueError):
         save_distributed_checkpoint(dist_main_prog, [""], [""],
                                     addition_info={"batch": 0.0})
     with self.assertRaises(ValueError):
         load_checkpoint_into_program(["./model_state_rank.pdmodel"],
                                      ["./dist_attr_rank.pdattr"],
                                      dist_main_prog)
     with self.assertRaises(ValueError):
         load_distributed_checkpoint(["./model_state_rank.pdmodel"],
                                     ["./dist_attr_rank.pdattr"])
     with self.assertRaises(TypeError):
         load_distributed_checkpoint({"0": "./model_state_rank.pdmodel"},
                                     {"1": "./dist_attr_rank.pdattr"})
Example #3
0
    def test_mlp_pp2mp(self):
        set_default_distributed_context(None)
        global _global_parallel_strategy
        _global_parallel_strategy = "pp"
        global _global_process_mesh
        _global_process_mesh = auto.ProcessMesh([0, 1])
        global PP_MESH_0
        PP_MESH_0 = auto.ProcessMesh(mesh=[0])
        global PP_MESH_1
        PP_MESH_1 = auto.ProcessMesh(mesh=[1])
        input = np.random.random(size=(80, 64)).astype('float32')
        label = np.random.random(size=(80, 1)).astype('float32')

        dist_main_prog, dist_start_prog, loss = get_distributed_program()
        place = paddle.set_device("gpu")
        exe = paddle.static.Executor(place)
        exe.run(dist_start_prog)
        for step in range(20):
            if step == 10:
                add_info = {"batch": step, "batch_size": 4}
                save_distributed_checkpoint(dist_main_prog, ".", ".", add_info)

            if paddle.distributed.get_rank() in [0]:
                res = exe.run(dist_main_prog,
                              feed={
                                  "input": input[step * 4:(step + 1) * 4, :],
                                  "label": label[step * 4:(step + 1) * 4, :]
                              })
            else:
                res = exe.run(dist_main_prog,
                              feed={
                                  "input": input[step * 4:(step + 1) * 4, :],
                                  "label": label[step * 4:(step + 1) * 4, :]
                              },
                              fetch_list=[loss])
        if paddle.distributed.get_rank() in [1]:
            last_res = res[0]

        set_default_distributed_context(None)
        _global_parallel_strategy = "mp"
        _global_process_mesh = auto.ProcessMesh([0, 1])

        dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program(
        )
        place = paddle.set_device("gpu")
        exe = paddle.static.Executor(place)
        exe.run(dist_start_prog_load)
        ckpt_path = [
            "./model_state_rank0.pdmodel", "./model_state_rank1.pdmodel"
        ]
        dist_attr_path = [
            "./dist_attr_rank0.pdattr", "./dist_attr_rank1.pdattr"
        ]
        param_dict, pre_dist_attr, add_info = load_distributed_checkpoint(
            ckpt_path, dist_attr_path)
        batch = add_info["batch"]
        batch_size = add_info["batch_size"]
        start_index = batch * batch_size
        input = input[start_index:, :]
        label = label[start_index:, :]
        cur_dist_attr = get_dist_attr(dist_main_prog_load)
        sliced_param_dict = merge_and_slice_parameter(param_dict,
                                                      pre_dist_attr,
                                                      cur_dist_attr)
        load_parameter_into_program(sliced_param_dict, dist_main_prog_load)
        for step in range(10):
            res = exe.run(dist_main_prog_load,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          },
                          fetch_list=[loss_load])
        if paddle.distributed.get_rank() in [1]:
            self.assertEqual(last_res, res[0])
Example #4
0
    def test_mlp_mp2pp(self):
        set_default_distributed_context(None)
        global _global_parallel_strategy
        _global_parallel_strategy = "mp"
        global _global_process_mesh
        _global_process_mesh = auto.ProcessMesh([0, 1])

        input = np.random.random(size=(80, 64)).astype('float32')
        label = np.random.random(size=(80, 1)).astype('float32')

        dist_main_prog, dist_start_prog, loss = get_distributed_program()
        place = paddle.set_device("gpu")
        exe = paddle.static.Executor(place)
        exe.run(dist_start_prog)

        for step in range(20):
            if step == 10:
                save_distributed_checkpoint(dist_main_prog,
                                            ".",
                                            dist_attr_path=".")

            res = exe.run(dist_main_prog,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          },
                          fetch_list=[loss])
        last_res = res[0]

        set_default_distributed_context(None)
        _global_parallel_strategy = "pp"
        _global_process_mesh = auto.ProcessMesh([0, 1])
        global PP_MESH_0
        PP_MESH_0 = auto.ProcessMesh(mesh=[0])
        global PP_MESH_1
        PP_MESH_1 = auto.ProcessMesh(mesh=[1])

        dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program(
        )
        place = paddle.set_device("gpu")
        exe = paddle.static.Executor(place)
        exe.run(dist_start_prog_load)

        ckpt_path = [
            "./model_state_rank0.pdmodel", "./model_state_rank1.pdmodel"
        ]
        dist_attr_path = [
            "./dist_attr_rank0.pdattr", "./dist_attr_rank1.pdattr"
        ]
        load_checkpoint_into_program(ckpt_path, dist_attr_path,
                                     dist_main_prog_load)
        for step in range(10, 20):
            if paddle.distributed.get_rank() in [0]:
                res = exe.run(dist_main_prog_load,
                              feed={
                                  "input": input[step * 4:(step + 1) * 4, :],
                                  "label": label[step * 4:(step + 1) * 4, :]
                              })
            else:
                res = exe.run(dist_main_prog_load,
                              feed={
                                  "input": input[step * 4:(step + 1) * 4, :],
                                  "label": label[step * 4:(step + 1) * 4, :]
                              },
                              fetch_list=[loss_load])
        if paddle.distributed.get_rank() in [1]:
            self.assertEqual(last_res, res[0])