def build_role(self):
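        # Environment variables describing a parameter-server cluster with heterogeneous (GPU) trainers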
        environs = {}
        environs[
            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36012,127.0.0.1:36013"
        environs[
            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36014,127.0.0.1:36015"
        environs[
            "PADDLE_ALL_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:36016,127.0.0.1:36017"
        environs[
            "PADDLE_PREVIOUS_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:36014,127.0.0.1:36015"
        environs["PADDLE_HETER_TRAINER_DEVICE"] = "gpu"
        environs["TRAINING_ROLE"] = "HETER_TRAINER"
        environs["STAGE_ID"] = 2
        environs["STAGE_NUM"] = 2
        environs["HETER_DEVICE_TYPE"] = "gpu"
        environs["PADDLE_STAGE_TRAINERS_NUM"] = [2, 2]
        environs["PADDLE_TRAINERS_NUM"] = 2
        environs["PADDLE_TRAINER_ID"] = 0
        environs["POD_IP"] = "127.0.0.1"
        environs["PADDLE_PORT"] = "36016"
        environs["FLAGS_selected_gpus"] = 0

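        # Export everything as strings so PaddleCloudRoleMaker can read it from os.environ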
        for k, v in environs.items():
            os.environ[k] = str(v)

        self.role = role_maker.PaddleCloudRoleMaker()
        return self.role
Example #2
    def test_gradient_merge_optimizer(self):
        fleet.init(role_maker.PaddleCloudRoleMaker())

        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
        avg_cost = paddle.fluid.layers.mean(cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False
        strategy.a_sync_configs = {"launch_barrier": False}
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

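        # Count the send and sgd ops left in the trainer-side program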
        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 0)
        self.assertEqual(sgds, 0)
Example #3
    def test(self):
        os.environ["PADDLE_PSERVER_NUMS"] = "2"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ[
            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
        os.environ["TRAINING_ROLE"] = "PSERVER"

        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)
        loss, acc, _ = self.net()

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True

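        # Accessor (SGD rule) settings for the sparse embedding table __emb__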
        configs = {}
        configs['__emb__'] = {
            "table_parameters.__emb__.accessor.embed_sgd_param.name":
            "SparseNaiveSGDRule",
            "table_parameters.__emb__.accessor.embedx_sgd_param.name":
            "SparseAdamSGDRule",
        }
        strategy.sparse_table_configs = configs
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(loss)

        fleet.init_server()
Example #4
    def test_fs_gloo6(self):
        plats = platform.platform()
        if 'Linux' not in plats:
            print("skip gloo UT on MacOS/Win")
            return

        tmp = self.mkdir()

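        # Gloo rendezvous goes through the shared file-system path created above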
        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINERS_NUM"] = "0"

        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"

        os.environ["PADDLE_WITH_GLOO"] = "2"
        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
        os.environ["PADDLE_GLOO_FS_PATH"] = tmp

        role = role_maker.PaddleCloudRoleMaker()
        role._generate_role()
        self.case(role, "server")
        self.case(role, "all")
        self.clean(tmp)
Example #5
    def test_a_sync_optimizer_pserver(self):
        os.environ["TRAINING_ROLE"] = "PSERVER"
        import paddle.distributed.fleet as fleet

        main_program = paddle.fluid.Program()
        startup_program = paddle.fluid.Program()

        paddle.fluid.framework.switch_main_program(main_program)
        paddle.fluid.framework.switch_startup_program(startup_program)

        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
        fleet.init_server()
Example #6
    def test_communicator_sync(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_PSERVER_NUMS"] = "2"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
            "127.0.0.1:36001,127.0.0.2:36001"

        fleet.init(role_maker.PaddleCloudRoleMaker())
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False
        strategy.a_sync_configs = {"launch_barrier": False}

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example #7
    def test_pipeline_optimizer(self):
        import paddle.distributed.fleet as fleet
        import paddle.distributed.fleet.base.role_maker as role_maker
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')

        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.without_graph_optimization = True

        optimizer = paddle.fluid.optimizer.Adam(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)
Example #8
    def test_pipeline_optimizer(self):
        import paddle.distributed.fleet as fleet
        import paddle.distributed.fleet.base.role_maker as role_maker
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
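        # device_guard assigns the first part of the network to gpu:0 and the rest to gpu:1; the pipeline strategy below splits the program along these boundaries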
        with paddle.fluid.device_guard("gpu:0"):
            input_x = paddle.fluid.layers.data(name="x",
                                               shape=[32],
                                               dtype='float32')
            input_y = paddle.fluid.layers.data(name="y",
                                               shape=[1],
                                               dtype='int64')
            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')

        with paddle.fluid.device_guard("gpu:1"):
            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
            prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                size=2,
                                                act='softmax')
            cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                     label=input_y)
            avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.pipeline = True
        strategy.pipeline_configs = {
            'micro_batch_size': 1,
            'accumulate_steps': 2
        }

        optimizer = paddle.fluid.optimizer.Adam(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)
Example #9
    def test_get_util(self):
        import paddle.distributed.fleet as fleet
        import paddle.distributed.fleet.base.role_maker as role_maker
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        default_util = fleet.util
        self.assertEqual(default_util, None)
Example #10
    def test_a_sync_optimizer2(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        import paddle.distributed.fleet as fleet

        main_program = paddle.fluid.Program()
        startup_program = paddle.fluid.Program()

        paddle.fluid.framework.switch_main_program(main_program)
        paddle.fluid.framework.switch_startup_program(startup_program)

        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(
            name="x", shape=[32], dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
        cost = paddle.fluid.layers.cross_entropy(
            input=prediction, label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.auto = True
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        self.assertTrue(optimizer.user_defined_strategy.a_sync)
        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
        self.assertTrue(a_sync_configs['k_steps'] == 800)
Example #11
    def net(main_prog, startup_prog):
        with fluid.program_guard(main_prog, startup_prog):
            with fluid.unique_name.guard():
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                input_x = paddle.fluid.layers.data(
                    name="x", shape=[32], dtype='float32')
                input_y = paddle.fluid.layers.data(
                    name="y", shape=[1], dtype='int64')

                fc_1 = paddle.fluid.layers.fc(input=input_x,
                                              size=64,
                                              act='tanh')
                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
                prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                    size=2,
                                                    act='softmax')
                cost = paddle.fluid.layers.cross_entropy(
                    input=prediction, label=input_y)
                avg_cost = paddle.fluid.layers.mean(x=cost)

                optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
                opt = MetaOptimizerBase(optimizer)
                opt_ops, params_grads = opt.minimize(avg_cost)
                opt.apply_optimize(avg_cost,
                                   paddle.static.default_startup_program(),
                                   params_grads)
        return None
Example #12
    def test_fp16_allreduce_optimizer(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)

        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        ops = [op.type for op in avg_cost.block.ops]
        cast_out = [
            op.output('Out')[0] for op in avg_cost.block.ops
            if op.type == 'cast'
        ]

        cast_op_count = 0
        for name in ops:
            if name == 'cast':
                cast_op_count += 1
        self.assertIn('cast', ops)
        self.assertEqual(cast_op_count, 12)  # 6 + 6, cast_fp16 + cast_fp32

        for name in cast_out:
            self.assertIn('cast_fp16', name)
Example #13
def init_fleet_with_gloo(use_gloo=True):
    if use_gloo:
        os.environ["PADDLE_WITH_GLOO"] = "1"
        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)
    else:
        fleet.init()
Example #14
    def instance(self, context):
        import paddle.distributed.fleet.base.role_maker as role_maker
        import paddle.distributed.fleet as fleet
        is_collective = (context["fleet_mode"] == 'COLLECTIVE')
        role = role_maker.PaddleCloudRoleMaker(is_collective=is_collective)
        fleet.init(role)
        context['fleet'] = fleet
        context['role'] = role
        context['status'] = 'network_pass'
Example #15
def runtime_main():
    import paddle.distributed.fleet as fleet

    # model definition
    train_prog = paddle.fluid.Program()
    startup_prog = paddle.fluid.Program()
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            input_x = paddle.fluid.layers.data(name="x",
                                               shape=[32],
                                               dtype='float32')
            input_y = paddle.fluid.layers.data(name="y",
                                               shape=[1],
                                               dtype='int64')

            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
            prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                size=2,
                                                act='softmax')
            cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                     label=input_y)
            avg_cost = paddle.fluid.layers.mean(x=cost)

            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.sharding = True
            strategy.sharding_configs = {
                "sharding_segment_strategy": "segment_broadcast_MB",
                "segment_broadcast_MB": 0.2,
                "sharding_degree": 2,
            }
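            # Broadcasts are segmented into ~0.2 MB chunks; optimizer state is sharded across 2 ranks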

            optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01,
                                                        momentum=0.9)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=strategy)
            optimizer.minimize(avg_cost)

    # execution
    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    place = fluid.CUDAPlace(device_id)
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    dirname = "./ut_sharding_save_model"
    sharding.utils.save_persistables(exe,
                                     dirname,
                                     main_program=train_prog,
                                     filename=None)

    out_losses = []
    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))
Example #16
    def __init__(self, config):
        self.metrics = {}
        self.config = config
        self.input_data = None
        self.reader = None
        self.exe = None
        self.train_result_dict = {}
        self.train_result_dict["speed"] = []
        self.model = None
        self.pure_bf16 = self.config['pure_bf16']
        self.role_maker = role_maker.PaddleCloudRoleMaker()
Example #17
    def test_fs_gloo8(self):
        plats = platform.platform()
        if 'Linux' not in plats:
            print("skip gloo UT on MacOS/Win")
            return

        tmp = self.mkdir()

        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINERS_NUM"] = "0"

        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"

        os.environ["PADDLE_WITH_GLOO"] = "2"
        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
        os.environ["PADDLE_GLOO_FS_PATH"] = tmp

        def net():
            x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
            y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
            y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
            cost = paddle.fluid.layers.square_error_cost(
                input=y_predict, label=y)
            avg_cost = paddle.fluid.layers.mean(cost)
            return avg_cost

        from paddle.distributed import fleet

        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)
        avg_cost = net()

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False

        optimizer = paddle.optimizer.SGD(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

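        # Exercise the Gloo-backed collective utilities on the "server" communicator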
        comm_world = "server"
        fleet.util.barrier(comm_world)

        gather = fleet.util.all_gather(1, comm_world)
        self.assertEqual(gather[0], 1)

        all_reduce = fleet.util.all_reduce(1, "sum", comm_world)
        self.assertEqual(1, all_reduce)

        self.clean(tmp)
Example #18
    def setUp(self):
        os.environ[
            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PORT"] = "4001"
        os.environ["POD_IP"] = "127.0.0.1"
        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)
        self.strategy = paddle.distributed.fleet.DistributedStrategy()
        self.strategy.a_sync = True
Example #19
    def test_traing_role(self):
        """Test training role."""
        os.environ["TRAINING_ROLE"] = "TEST"
        try:
            import netifaces
        except ImportError:
            print("warning: no netifaces, skip test_training_role")
            return

        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
        self.assertRaises(ValueError, ro.generate_role)
Example #20
    def test_lamb_optimizer(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        startup_prog = fluid.Program()
        train_prog = fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        ops = [op.type for op in avg_cost.block.ops]
        self.assertIn('lamb', ops)
Example #21
    def test_fp16_allreduce_not_apply_fp16_net(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog,
                                      startup_prog,
                                      dtype='float16')

        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        ops = [op.type for op in avg_cost.block.ops]
        self.assertNotIn('cast', ops)
Example #22
    def test_fleet_amp_meta_optimizer_init(self):
        if not fluid.core.is_compiled_with_cuda():
            return

        main_program = paddle.static.Program()
        startup_program = paddle.static.Program()

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        with paddle.static.program_guard(main_program, startup_program):
            input_x = paddle.static.data(name="x",
                                         shape=[None, 32],
                                         dtype='float32')
            input_y = paddle.static.data(name="y",
                                         shape=[None, 1],
                                         dtype='int64')

            cost = mlp(input_x, input_y)
            optimizer = paddle.optimizer.Momentum(
                learning_rate=0.001,
                momentum=0.9,
                weight_decay=fluid.regularizer.L2Decay(1e-4),
                multi_precision=True)

            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.amp = True
            strategy.amp_configs = {'use_pure_fp16': True}
            strategy.gradient_merge = True
            strategy.gradient_merge_configs = {"k_steps": 2}
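            # Pure-fp16 AMP is stacked with gradient merge (accumulate 2 steps); both meta optimizers are applied by distributed_optimizer below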

            optimizer = fleet.distributed_optimizer(optimizer, strategy)
            optimizer.minimize(cost)

        print(fleet._get_applied_meta_list())
        loss_scale = optimizer.get_loss_scaling()

        place = paddle.CUDAPlace(0)

        exe = paddle.static.Executor(place)
        exe.run(startup_program)
        optimizer.amp_init(place)

        step = 3
        for i in range(step):
            cost_val = exe.run(program=main_program,
                               feed=gen_data(),
                               fetch_list=[cost.name])
            print(cost_val)
Example #23
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe,
                               paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()
Example #24
    def test_dgc_optimizer_backward(self):
        """ test dgc optimizer backward """
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)

        self.set_strategy(strategy, 'dgc')
        opt = fluid.optimizer.MomentumOptimizer(
            learning_rate=0.001, momentum=0.9)
        dgc_opt = DGCOptimizer(opt)
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
        params_grads = dgc_opt.backward(avg_cost, startup_prog)

        ops = [op.type for op in avg_cost.block.ops]
        self.assertNotIn('dgc', ops)
Example #25
    def test_hdfs_gloo_v2(self):
        plats = platform.platform()
        if 'Linux' not in plats:
            print("skip gloo UT on MacOS/Win")
            return

        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
        os.environ["PADDLE_WITH_GLOO"] = "1"
        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
        os.environ["PADDLE_GLOO_FS_NAME"] = ""
        os.environ["PADDLE_GLOO_FS_UGI"] = ""
        os.environ["PADDLE_GLOO_FS_PATH"] = ""

        role = role_maker.PaddleCloudRoleMaker()
        self.assertRaises(ValueError, role._generate_role)
Example #26
    def test_pipeline_optimizer(self):
        import paddle.distributed.fleet as fleet
        import paddle.distributed.fleet.base.role_maker as role_maker
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        with paddle.fluid.device_guard("gpu:0"):
            input_x = paddle.fluid.layers.data(name="x",
                                               shape=[32],
                                               dtype='float32')
            input_y = paddle.fluid.layers.data(name="y",
                                               shape=[1],
                                               dtype='int64')
            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
            fc_3 = paddle.fluid.layers.fc(input=fc_2, size=64, act='tanh')
            fc_4 = paddle.fluid.layers.fc(input=fc_3, size=64, act='tanh')
            fc_5 = paddle.fluid.layers.fc(input=fc_4, size=64, act='tanh')
            fc_6 = paddle.fluid.layers.fc(input=fc_5, size=64, act='tanh')

        with paddle.fluid.device_guard("gpu:1"):
            fc_7 = paddle.fluid.layers.fc(input=fc_6, size=64, act='tanh')
            prediction = paddle.fluid.layers.fc(input=[fc_7],
                                                size=2,
                                                act='softmax')
            cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                     label=input_y)
            avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.pipeline = True
        strategy.pipeline_configs = {
            'micro_batch_size': 1,
            'accumulate_steps': 2,
            'schedule_mode': '1F1B'
        }

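        # Recompute keeps only the listed checkpoints and recomputes intermediate activations in the backward pass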
        checkpoints = ['fc_5.tmp_0', 'fc_7.tmp_0']
        strategy.recompute = True
        strategy.recompute_configs = {
            "checkpoints": checkpoints,
            "enable_offload": False,
            "checkpoint_shape": []
        }

        optimizer = paddle.fluid.optimizer.Adam(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)
Example #27
    def test_a_sync_optimizer_trainer(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        import paddle.distributed.fleet as fleet

        main_program = paddle.fluid.Program()
        startup_program = paddle.fluid.Program()

        paddle.fluid.framework.switch_main_program(main_program)
        paddle.fluid.framework.switch_startup_program(startup_program)

        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")

        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 7)
        self.assertEqual(sgds, 0)

        fleet.init_worker()
        time.sleep(8)
        fleet.stop_worker()
Example #28
    def test_rnn_raw_optimizer(self):
        import paddle.distributed.fleet as fleet
        import paddle.distributed.fleet.base.role_maker as role_maker
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        train_program = static.Program()
        start_program = static.Program()
        train_program, start_program, loss, optimizer, data_holders = \
            rnn_pretrain_forward(train_program, start_program)
        with paddle.static.program_guard(
                train_program, start_program), paddle.utils.unique_name.guard():
            strategy = fleet.DistributedStrategy()
            strategy.without_graph_optimization = True
            strategy.fuse_all_reduce_ops = True
            fleet.init(is_collective=True, strategy=strategy)
            optimizer = fleet.distributed_optimizer(optimizer)
            optimizer.minimize(loss)
Example #29
    def test_set_user_defined_util(self):
        import paddle.distributed.fleet as fleet

        class UserDefinedUtil(fleet.UtilBase):
            def __init__(self):
                super(UserDefinedUtil, self).__init__()

            def get_user_id(self):
                return 10

        import paddle.distributed.fleet.base.role_maker as role_maker
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        my_util = UserDefinedUtil()
        fleet.util = my_util
        user_id = fleet.util.get_user_id()
        self.assertEqual(user_id, 10)
Example #30
    def test_amp_optimizer_backward(self):
        """ test amp optimizer backward """
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)

        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
                                                momentum=0.9)
        opt = AMPOptimizer(opt)

        self.set_strategy(strategy, 'amp')
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        opt._set_basic_info(avg_cost, role, opt, strategy)
        params_grads = opt.backward(avg_cost, startup_prog)

        ops = [op.type for op in avg_cost.block.ops]
        self.assertIn('cast', ops)
        self.assertNotIn('check_finite_and_unscale', ops)