Example #1
    def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
        image = paddle.static.data(
            shape=[batch_size] + image_shape, dtype='float32', name='image')

        model = BatchNormActNet()
        pred_out = model(image)
        loss = paddle.mean(pred_out)
        optimizer = paddle.optimizer.Adam(learning_rate=1e-3)

        dist_strategy = fleet.DistributedStrategy()
        dist_strategy.fuse_all_reduce_ops = False
        dist_strategy.without_graph_optimization = True
        dist_strategy.amp = True
        dist_strategy.amp_configs = {
            "init_loss_scaling": 32768,
            "use_dynamic_loss_scaling": True,
        }
        fleet.init(is_collective=True, strategy=dist_strategy)
        optimizer = fleet.distributed_optimizer(optimizer)
        optimizer.minimize(loss)

        rank = paddle.distributed.get_rank()

        def reader():
            seed = int(os.environ.get("SEED", 0))
            np.random.seed(seed + rank)
            for _ in range(10):
                image_np = np.random.random(size=image.shape).astype('float32')
                yield image_np,

        main_program = paddle.static.default_main_program()
        startup_program = paddle.static.default_startup_program()
        return main_program, startup_program, [image], [loss], reader
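For reference, a minimal sketch of how the programs and reader returned by get_model above could be consumed. TestClass is a hypothetical stand-in for the class that defines get_model, and the script is assumed to run under paddle.distributed.launch on GPUs:

import paddle

paddle.enable_static()
place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
# TestClass is hypothetical; get_model itself calls fleet.init internally.
main_prog, startup_prog, feeds, fetches, reader = TestClass().get_model(place)

exe = paddle.static.Executor(place)
exe.run(startup_prog)
for (image_np,) in reader():
    loss_val, = exe.run(main_prog,
                        feed={feeds[0].name: image_np},
                        fetch_list=fetches)
    print(loss_val)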
Example #2
    def init_distributed_infer_env(self,
                                   exe,
                                   loss,
                                   role_maker=None,
                                   dirname=None):
        import paddle.distributed.fleet as fleet

        if fleet.fleet._runtime_handle is None:
            fleet.init(role_maker=role_maker)

            fake_optimizer = paddle.optimizer.SGD()
            strategy = fleet.DistributedStrategy()
            strategy.a_sync = True
            optimizer = fleet.distributed_optimizer(
                fake_optimizer, strategy=strategy)
            optimizer.minimize(
                loss, startup_program=self.origin_startup_program)

            if fleet.is_server():
                fleet.init_server(dirname=dirname)
                fleet.run_server()
            else:
                exe.run(paddle.static.default_startup_program())
                fleet.init_worker()
                self._init_dense_params(exe, dirname)
            global_startup_program = paddle.static.default_startup_program()
            global_startup_program = self.origin_startup_program
            global_main_program = paddle.static.default_main_program()
            global_main_program = self.origin_main_program
Example #3
    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
        # Input data
        device_id = 0
        if dist_strategy:
            fleet.init(is_collective=True)
        with fluid.device_guard("gpu:0"):
            images = fluid.layers.data(
                name='pixel', shape=[1, 28, 28], dtype=DTYPE)
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')

            if dist_strategy:
                data_loader = fluid.io.DataLoader.from_generator(
                    feed_list=[images, label],
                    capacity=64,
                    use_double_buffer=False,
                    iterable=False)
            # Train program
            predict = cnn_model(images)
        with fluid.device_guard("gpu:0"):
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)

        # Evaluator
        with fluid.device_guard("gpu:0"):
            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
            batch_acc = fluid.layers.accuracy(
                input=predict, label=label, total=batch_size_tensor)

        inference_program = fluid.default_main_program().clone()
        base_lr = self.lr
        passes = [30, 60, 80, 90]
        steps_per_pass = 10
        bd = [steps_per_pass * p for p in passes]
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
        opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)

        if dist_strategy:
            strategy = fleet.DistributedStrategy()
            strategy.pipeline = True
            strategy.pipeline_configs = {
                'schedule_mode': 'F-then-B',
                'micro_batch_size': batch_size
            }
            dist_opt = fleet.distributed_optimizer(
                optimizer=opt, strategy=strategy)
            dist_opt.minimize(avg_cost)
        else:
            opt.minimize(avg_cost)

        if dist_strategy:
            return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
        else:
            return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
Example #4
    def test_fleet_get_applied_optimizer(self):
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        fleet.init(is_collective=True)

        meta_list = fleet._get_applied_meta_list()
        graph_list = fleet._get_applied_graph_list()
        # minimize has not been called yet
        self.assertEqual(len(meta_list), 0)
        self.assertEqual(len(graph_list), 0)

        strategy = fleet.DistributedStrategy()
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        meta_list = fleet._get_applied_meta_list()
        graph_list = fleet._get_applied_graph_list()
        self.assertEqual(len(meta_list), 0)
        self.assertEqual(len(graph_list), 1)
Example #5
    def _init_distributed_strategy(self):
        """Initialize distributed strategy."""
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = 4
        exec_strategy.num_iteration_per_drop_scope = 1

        dist_strategy = fleet.DistributedStrategy()
        dist_strategy.execution_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        dist_strategy.fuse_all_reduce_ops = True
        if self.use_recompute:
            dist_strategy.recompute = True
        if self.use_amp:
            dist_strategy.amp = True
            dist_strategy.amp_configs = {
                "custom_white_list": ["softmax", "layer_norm", "gelu"],
                "init_loss_scaling": self.amp_loss_scaling
            }
        if self.use_sharding:
            dist_strategy.sharding = True
            dist_strategy.sharding_configs = {
                "segment_broadcast_MB": 32,
                "dp_degree": self.dp_degree,
                "sharding_degree": self.sharding_degree,
                "mp_degree": self.mp_degree,
                "pp_degree": self.pp_degree
            }
        self.dist_strategy = dist_strategy
        self._init_build_strategy()
        print(self.dist_strategy)
        return
Example #6
    def __init__(self,
                 model=None,
                 inputs_spec=None,
                 labels_spec=None,
                 cluster=None,
                 strategy=None):
        self.model = model
        self.inputs_spec = self._validate_spec(inputs_spec)
        self.labels_spec = self._validate_spec(labels_spec)
        self.cluster = cluster
        # if self.cluster is None:
        #     self.cluster = get_default_cluster()
        self.strategy = strategy
        if self.strategy is None:
            self.strategy = fleet.DistributedStrategy()

        self._executor = None
        self._cur_rank = paddle.distributed.get_rank()
        self._nranks = paddle.distributed.get_world_size()
        self._saver = DistributedSaver()
        self._logger = get_logger(logging.INFO)

        self._default_strategy = None
        self._orig_main_prog = static.default_main_program()
        self._orig_startup_prog = static.default_startup_program()
        self._orig_dist_context = get_default_distributed_context()
        self._dist_contexts = {}
        self._serial_main_progs = {}
        self._serial_startup_progs = {}
        self._dist_main_progs = defaultdict(dict)  # dist main programs
        self._dist_startup_progs = defaultdict(dict)  # dist startup programs
        self._feed_vars = {}
        self._fetch_vars = {}
Example #7
def get_dist_prog(train_program, startup_program, dist_context, rank_id):
    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    fleet._user_defined_strategy = fleet.DistributedStrategy()
    fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer()
    parallelizer = AutoParallelizer(fleet)
    parallelizer._dist_context = dist_context

    # serial forward & backward completion
    complete_train_program = auto.complete_annotation(train_program,
                                                      dist_context)

    params_grads = parallelizer._generate_backward(complete_train_program,
                                                   startup_program,
                                                   loss,
                                                   parameter_list=None,
                                                   no_grad_set=None,
                                                   callbacks=None)

    # logical partition
    partitioner = Partitioner(dist_context, rank_id)
    auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads = partitioner.partition(
        complete_train_program, startup_program, params_grads)

    partitioned_optimize_ops = parallelizer._apply_optimize(
        auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads)

    return auto_parallel_main_prog, auto_parallel_startup_prog
Example #8
def train():
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False
    # init parallel optimizer
    dist_strategy.semi_auto = True

    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    loss, train_program, start_program, loader = mlp_pretrain_forward(
        train_program, start_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)

    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    places = static.cuda_places()
    loader.set_batch_generator(batch_generator_creator(), places=places)
    exe = paddle.static.Executor(places[0])
    exe.run(distributed_startup_program)

    for data in loader():
        exe.run(distributed_main_program, feed=data, fetch_list=[loss])
Example #9
def get_dist_prog(train_program, startup_program, dist_context, rank_id):
    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    fleet._user_defined_strategy = fleet.DistributedStrategy()
    fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer()
    parallelizer = AutoParallelizer(fleet)
    parallelizer._dist_context = dist_context

    # auto completion
    completer = Completer(dist_context)
    complete_train_program = completer.complete_forward_annotation(
        train_program)
    dist_context.block_state.parse_forward_blocks(complete_train_program)
    params_grads = parallelizer._generate_backward(
        complete_train_program,
        startup_program,
        loss,
        parameter_list=None,
        no_grad_set=None,
        callbacks=None)

    partitioner = Partitioner(dist_context, rank_id)
    dist_train_program, dist_startup_prog, dist_params_grads = partitioner.partition(
        complete_train_program, startup_program, params_grads)

    partitioned_optimize_ops = parallelizer._apply_optimize(
        dist_train_program, dist_startup_prog, dist_params_grads)

    resharder = Resharder(dist_train_program, dist_startup_prog, rank_id,
                          dist_context, dist_params_grads)
    resharder.reshard()
    return dist_train_program, dist_startup_prog
Example #10
def dist_optimizer(args, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer
    Args:
        args:
        optimizer: a normal optimizer
    Returns:
        optimizer: a distributed optimizer
    """
    build_strategy, exec_strategy = create_strategy()

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.build_strategy = build_strategy

    dist_strategy.fuse_grad_size_in_MB = 16
    if args.use_amp:
        dist_strategy.amp = True
        dist_strategy.amp_configs = {
            'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
            'init_loss_scaling': args.scale_loss,
        }

    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
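A hedged usage sketch for dist_optimizer above. The args namespace is hypothetical (only use_amp and scale_loss are read here), and create_strategy is assumed to be defined in the same script:

import argparse
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

args = argparse.Namespace(use_amp=True, scale_loss=32768)  # hypothetical flags
base_optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
optimizer = dist_optimizer(args, base_optimizer)
# optimizer.minimize(loss) would follow once a loss is built inside a static program.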
Example #11
    def test_single_gpu(self):
        paddle.enable_static()
        fleet.init(is_collective=True)
        sharding_program = paddle.static.Program()
        sharding_startup_program = paddle.static.Program()
        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        with fluid.program_guard(sharding_program, sharding_startup_program):
            with fluid.unique_name.guard():
                input_x = paddle.static.data(name="x",
                                             shape=[None, 32],
                                             dtype='float32')
                input_y = paddle.static.data(name="y",
                                             shape=[None, 1],
                                             dtype='int64')
                cost = self.mlp(input_x=input_x, input_y=input_y)
                output_name = cost.name
                optimizer = fleet.distributed_optimizer(
                    fluid.optimizer.Adam(), strategy)
                optimizer.minimize(cost)

        trainer_id = fleet.worker_index()
        exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id))
        rank = fleet.worker_index()
        exe.run(sharding_startup_program)
        exe.run(program=sharding_program, feed=self.gen_data())
Example #12
 def test_util_base(self):
     import paddle.distributed.fleet as fleet
     util = fleet.UtilBase()
     strategy = fleet.DistributedStrategy()
     util._set_strategy(strategy)
     role_maker = None  # should be fleet.PaddleCloudRoleMaker()
     util._set_role_maker(role_maker)
Example #13
def get_dist_prog_with_parallelizer(train_program, startup_program,
                                    dist_context):
    global _global_process_mesh

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False

    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)

    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, startup_program)

    return distributed_main_program, distributed_startup_program
Example #14
def dist_optimizer(args, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer
    """
    build_strategy, exec_strategy = create_strategy(args)

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.build_strategy = build_strategy

    dist_strategy.fuse_grad_size_in_MB = 16
    if args.use_amp:
        dist_strategy.amp = True

        custom_black_list = ['lookup_table', 'lookup_table_v2'
                             ] if args.use_pure_fp16 else None
        dist_strategy.amp_configs = {
            'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
            'init_loss_scaling': args.scale_loss,
            'custom_black_list': custom_black_list,
            'use_pure_fp16': args.use_pure_fp16
        }
    if args.gradient_merge_steps > 1:
        dist_strategy.gradient_merge = True
        dist_strategy.gradient_merge_configs = {
            'k_steps': args.gradient_merge_steps
        }

    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
Example #15
 def apply_passes(self):
     dist_strategy = fleet.DistributedStrategy()
     dist_strategy.semi_auto = True
     dist_strategy.sharding = True
     dist_strategy.sharding_configs = {
         "sharding_degree": 2,
         "stage": 2,
     }
     fleet.init(is_collective=True, strategy=dist_strategy)
Example #16
 def setUp(self):
     strategy = fleet.DistributedStrategy()
     self.model_parallel_size = 2
     strategy.hybrid_configs = {
         "dp_degree": 1,
         "mp_degree": self.model_parallel_size,
         "pp_degree": 1
     }
     fleet.init(is_collective=True, strategy=strategy)
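Once fleet.init has been called with hybrid_configs like the above (under paddle.distributed.launch with two processes), the parallel ranks and groups can be queried from the hybrid communicate group; a small sketch:

import paddle.distributed.fleet as fleet

hcg = fleet.get_hybrid_communicate_group()
mp_rank = hcg.get_model_parallel_rank()        # 0 or 1 when mp_degree=2
mp_size = hcg.get_model_parallel_world_size()  # 2
mp_group = hcg.get_model_parallel_group()      # communication group for model parallelism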
Example #17
def dist_optimizer(args, topo):
    default_global_batch_size = topo.data_info.size * args.micro_batch_size
    if args.global_batch_size is None:
        args.global_batch_size = default_global_batch_size

    bsz_per_dp = args.global_batch_size // topo.data_info.size
    micro_batch_size = args.micro_batch_size
    assert args.global_batch_size % micro_batch_size == 0, "cannot do gradient accumulation, global_batch_size: {} micro_batch_size: {}".format(
        args.global_batch_size, micro_batch_size)
    acc_steps = bsz_per_dp // micro_batch_size

    exec_strategy = paddle.fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3

    dist_strategy.recompute = args.use_recompute
    dist_strategy.pipeline = args.pp_degree > 1

    if args.use_amp:
        dist_strategy.amp = True
        dist_strategy.amp_configs = {
            "custom_white_list": [
                'softmax', 'layer_norm', 'gelu',
                "fused_softmax_mask_upper_triangle", "elementwise_add"
            ],
            "custom_black_list":
            ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"],
            "init_loss_scaling": 32768,
            "use_dynamic_loss_scaling": True,
            "use_pure_fp16": args.amp_level == "O2",
            "use_fp16_guard": False
        }
    if args.use_sharding:
        dist_strategy.sharding = True
        dist_strategy.sharding_configs = {
            "segment_broadcast_MB": 32,
            "sharding_degree": args.sharding_degree,
            "mp_degree": args.mp_degree,
            "pp_degree": args.pp_degree,
            "dp_degree": args.dp_degree,
            "optimize_offload": False,
        }
    if args.pp_degree > 1:
        dist_strategy.pipeline_configs = {
            "schedule_mode": "1F1B",
            "micro_micro_batch_size": micro_batch_size,
            "accumulate_steps": acc_steps,
        }
    else:
        assert acc_steps == 1, "Only support accumulate steps in piplinemode. Please set you global_batch_size={}".format(
            default_global_batch_size)

    return dist_strategy
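To make the accumulation arithmetic above concrete, a small worked example with illustrative numbers (topo.data_info.size is taken as the data-parallel world size):

global_batch_size = 64   # samples consumed per optimizer step
dp_size = 2              # data-parallel world size (topo.data_info.size)
micro_batch_size = 8

bsz_per_dp = global_batch_size // dp_size    # 32 samples per data-parallel rank
acc_steps = bsz_per_dp // micro_batch_size   # 4 micro-batches accumulated per step
assert global_batch_size % micro_batch_size == 0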
Example #18
 def setUp(self):
     strategy = fleet.DistributedStrategy()
     self.pipeline_parallel_size = 2
     strategy.hybrid_configs = {
         "dp_degree": 1,
         "mp_degree": 1,
         "pp_degree": self.pipeline_parallel_size
     }
     fleet.init(is_collective=True, strategy=strategy)
     self.hcg = fleet.get_hybrid_communicate_group()
Example #19
    def get_model(self, place, gradient_merge, batch_size, max_step):
        paddle.seed(2021)
        random.seed(2021)
        np.random.seed(2021)

        hidden_size = 128

        global _global_parallel_strategy
        global _global_process_mesh
        world_size = paddle.distributed.get_world_size()
        if world_size == 1:
            _global_parallel_strategy = "dp"
            _global_process_mesh = auto.ProcessMesh([0])
        elif world_size == 2:
            _global_parallel_strategy = "dp"
            _global_process_mesh = auto.ProcessMesh([0, 1])

        train_program = static.Program()
        startup_program = static.Program()
        dist_strategy = fleet.DistributedStrategy()
        dist_strategy.semi_auto = True
        #if gradient_merge:
        #    dist_strategy.gradient_merge = True
        #    dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
        fleet.init(is_collective=True, strategy=dist_strategy)

        with static.program_guard(train_program, startup_program), \
            utils.unique_name.guard():
            input = static.data(name="input",
                                shape=[batch_size, hidden_size],
                                dtype='float32')
            label = static.data(name="label",
                                shape=[batch_size, 1],
                                dtype='float32')
            input.stop_gradient = False
            loss = mlp_forward(input, label, hidden_size)

        optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
        #optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer)
        _, self._params_grads, dist_startup_prog, dist_main_prog = optimizer.minimize(
            loss, startup_program)

        input_data = np.random.random(size=(128,
                                            hidden_size)).astype('float32')
        label_data = np.random.random(size=(128, 1)).astype('float32')

        def reader():
            for i in range(max_step):
                x_data = input_data[i * batch_size:(i + 1) * batch_size, :]
                y_data = label_data[i * batch_size:(i + 1) * batch_size, :]
                yield x_data, y_data

        return dist_main_prog, dist_startup_prog, [input,
                                                   label], [loss], reader
Example #20
 def test_util_factory(self):
     import paddle.distributed.fleet as fleet
     factory = fleet.base.util_factory.UtilFactory()
     strategy = fleet.DistributedStrategy()
     role_maker = None  # should be fleet.PaddleCloudRoleMaker()
     optimize_ops = []
     params_grads = []
     context = {}
     context["role_maker"] = role_maker
     context["valid_strategy"] = strategy
     util = factory._create_util(context)
     self.assertEqual(util.role_maker, None)
Example #21
 def apply_passes(self, main_prog, startup_prog):
     #self._config["params_grads"] = self._params_grads
     #pass_context = PassContext()
     #auto_parallel_gradient_merge_pass = new_pass(
     #    "auto_parallel_gradient_merge_pass", self._config)
     #auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog],
     #                                        pass_context)
     dist_strategy = fleet.DistributedStrategy()
     dist_strategy.gradient_merge = True
     dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
     dist_strategy.semi_auto = True
     fleet.init(is_collective=True, strategy=dist_strategy)
Example #22
def train():
    dist_strategy = fleet.DistributedStrategy()
    # init parallel optimizer
    dist_strategy.auto_search = True
    fleet.init(is_collective=True, strategy=dist_strategy)
    train_program = static.Program()
    start_program = static.Program()
    place = paddle.set_device("gpu")
    gpus = [0, 1]
    batch_size = 8
    sequence_len = 512
    vocab_size = 1000
    train_program, start_program, loss, gen_data = get_gpt_model(
        train_program, start_program, place, batch_size, sequence_len,
        vocab_size)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    places = static.cuda_places()
    exe = paddle.static.Executor(places[0])
    exe.run(distributed_startup_program)

    for step in range(10):
        tokens, position_ids, attention_mask, labels, loss_mask = gen_data()
        if loss.name in distributed_main_program.global_block().vars:
            loss_print, = exe.run(distributed_main_program,
                                  feed={
                                      "tokens": tokens,
                                      "position_ids": position_ids,
                                      "attention_mask": attention_mask,
                                      "labels": labels,
                                      "loss_mask": loss_mask
                                  },
                                  fetch_list=[loss])
            print("step: %s, loss: %f" % (step, loss_print[0]))
        else:
            exe.run(distributed_main_program,
                    feed={
                        "tokens": tokens,
                        "position_ids": position_ids,
                        "attention_mask": attention_mask,
                        "labels": labels,
                        "loss_mask": loss_mask
                    })
            print("step: %s, loss: %s" % (step, "None"))
Example #23
 def _set_strategy(self, args):
     """配置运行的distributed_strategy,
        build_strategy 配置在do_training中"""
     self.dist_strategy = fleet.DistributedStrategy()
     if args.run_params["mode"] == "sync":
         self.dist_strategy.a_sync = False
     elif args.run_params["mode"] == "async":
         self.dist_strategy.a_sync = True
     elif args.run_params["mode"] == "geo_async":
         self.dist_strategy.a_sync = True
         self.dist_strategy.a_sync_configs = {"k_steps": 2}
     elif args.run_params["mode"] == "auto":
         self.dist_strategy.auto = True
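A hedged usage sketch for the mode switch above; runner stands in for a hypothetical instance of the class that defines _set_strategy, and args only needs run_params['mode']:

from types import SimpleNamespace

args = SimpleNamespace(run_params={"mode": "geo_async"})
runner._set_strategy(args)
# runner.dist_strategy now has a_sync=True and a_sync_configs={"k_steps": 2},
# i.e. GEO-style asynchronous parameter-server training.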
Example #24
 def boundary_net(self, main_prog, startup_prog):
     with fluid.program_guard(main_prog, startup_prog):
         fleet.init(is_collective=True)
         x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32')
         with paddle.static.device_guard('gpu:0'):
             linear = fluid.Linear(4, 8, bias_attr=False)
             out = linear(x)
         with paddle.static.device_guard('gpu:1'):
             linear = fluid.Linear(8, 5, bias_attr=False)
             out = linear(out)
             avg_cost = paddle.mean(out)
         strategy = fleet.DistributedStrategy()
     return avg_cost, strategy
Example #25
 def test_dygraph_fleet_api(self):
     import paddle.distributed.fleet as fleet
     import paddle.distributed as dist
     strategy = fleet.DistributedStrategy()
     strategy.amp = True
     strategy.recompute = True
     fleet.init(is_collective=True, strategy=strategy)
     net = paddle.nn.Sequential(
         paddle.nn.Linear(10, 1), paddle.nn.Linear(1, 2))
     net = dist.fleet.distributed_model(net)
     data = np.random.uniform(-1, 1, [30, 10]).astype('float32')
     data = paddle.to_tensor(data)
     net(data)
Example #26
    def test_allgather(self):
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        process_mesh = auto.ProcessMesh(mesh=[0, 3])
        with static.program_guard(train_program, startup_program):
            x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
            x = auto.shard_tensor(x,
                                  dist_attr={
                                      "process_mesh": process_mesh,
                                      "dims_mapping": [0, -1]
                                  })

            w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
            w = auto.shard_tensor(w,
                                  dist_attr={
                                      "process_mesh": process_mesh,
                                      "dims_mapping": [-1, -1]
                                  })

            # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
            #     x.name: [-1, -1],
            #     w.name: [-1, -1]
            # }, **{"x": x,
            #       "y": w})[0]

            y = paddle.distributed.shard_op(paddle.matmul,
                                            dist_attr={
                                                "process_mesh": process_mesh,
                                                x: {
                                                    "dims_mapping": [-1, -1]
                                                },
                                                w: {
                                                    "dims_mapping": [-1, -1]
                                                }
                                            })(x, w)[0]

        rank_id = 0
        dist_context = DistributedContext()
        dist_strategy = fleet.DistributedStrategy()
        partitioner = Partitioner(dist_context, rank_id)
        completer = Completer(dist_context)
        complete_train_program = completer.complete_forward_annotation(
            train_program)
        dist_context.block_state.parse_forward_blocks(complete_train_program)
        partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition(
            complete_train_program, startup_program, [])
        resharder = Resharder(partitioned_main_prog, partitioned_startup_prog,
                              rank_id, dist_context, partitioned_params_grads)
        resharder.reshard()
        # the x should not be sliced
        self.assertTrue(check_allgather(partitioned_main_prog))
Example #27
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    fake_num_nodes = 1
    py_reader, loss = StaticSkipGramModel(
        fake_num_nodes,
        args.neg_num,
        args.embed_size,
        sparse_embedding=True,
        shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = build_graph(args)
        # bind gen
        train_ds = ShardedDataset(graph.nodes, args.epoch)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.cpu_batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)
        py_reader.set_batch_generator(lambda: data_loader)

        train_loss = train(exe, paddle.static.default_main_program(),
                           py_reader, loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
Example #28
def get_distributed_program():
    train_program = static.Program()
    startup_program = static.Program()
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)
    loss, train_program, startup_program = mlp_forward(train_program,
                                                       startup_program)
    optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
        loss, startup_program)

    return dist_main_prog, dist_startup_prog, loss
Example #29
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe,
                               paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()
Example #30
def main(args):
    paddle.enable_static()
    paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)

    fleet.init(is_collective=True)

    graph = load(args.dataset)

    loss = StaticSkipGramModel(graph.num_nodes,
                               args.neg_num,
                               args.embed_size,
                               num_emb_part=args.num_emb_part,
                               shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.sharding = True
    dist_strategy.sharding_configs = {
        "segment_anchors": None,
        "sharding_segment_strategy": "segment_broadcast_MB",
        "segment_broadcast_MB": 32,
        "sharding_degree": int(paddle.distributed.get_world_size()),
    }
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    # bind gen
    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    for epoch in range(args.epoch):
        train_loss = train(exe, paddle.static.default_main_program(),
                           data_loader, loss)
        log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
    fleet.stop_worker()

    if fleet.is_first_worker():
        fleet.save_persistables(exe, "./model",
                                paddle.static.default_main_program())