Example no. 1
    def test_gradient_merge_optimizer(self):
        fleet.init(role_maker.PaddleCloudRoleMaker())

        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
        avg_cost = paddle.fluid.layers.mean(cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False
        strategy.a_sync_configs = {"launch_barrier": False}
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
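        # In sync mode (a_sync = False) the transpiled trainer program is
        # expected to end in a send_barrier op and contain no send/sgd ops.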
        self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 0)
        self.assertEqual(sgds, 0)

        fleet.init_worker()
        time.sleep(8)
        fleet.stop_worker()
Example no. 2
    def test_communicator_sync(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_PSERVER_NUMS"] = "2"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
            "127.0.0.1:36001,127.0.0.2:36001"

        fleet.init(role_maker.PaddleCloudRoleMaker())
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example no. 3
def train(args):
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    data = load_raw_edges_fn(args.edge_path, args.undirected)
    edges = data[0]
    weights = data[1]
    node2idx = data[2]
    num_nodes = len(node2idx)

    model = DeepwalkModel(num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    log.info("init_role")
    init_role()

    train_steps = math.ceil(1. * num_nodes * args.epoch /
                            args.batch_size / num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    if args.optimizer == "sgd":
        args.lr *= args.batch_size * args.walk_len * args.win_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        log.info("PS server mode")
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(F.default_startup_program())
        log.info("Startup done")
        fleet.init_worker()
        # just the worker, load the sample
        log.info("init worker done")

        print("LEO num_nodes:", num_nodes, len(edges))
        edges_feat = {}
        edges_feat["weight"] = np.array(weights)
        graph = pgl.graph.Graph(num_nodes, edges, edge_feat=edges_feat)
        # bind gen
        gen_func = build_gen_func(args, graph)

        pyreader.decorate_tensor_provider(gen_func)

        train_prog(exe, F.default_main_program(), loss, pyreader, args, train_steps)
        print("fleet try to stop worker\r\n")
        fleet.stop_worker()
        print("Game over\r\n")
    def test_gradient_merge_optimizer(self):
        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(
            name="x", shape=[32], dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
        cost = paddle.fluid.layers.cross_entropy(
            input=prediction, label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = False
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 6)
        self.assertEqual(sgds, 0)

        fleet.init_worker()
        time.sleep(8)
        fleet.stop_worker()
Example no. 5
    def run(self):
        fleet.init()
        self.network()
        if fleet.is_server():
            self.run_server()
        elif fleet.is_worker():
            self.run_worker()
            fleet.stop_worker()
        logger.info("Run Success, Exit.")
Example no. 6
    def run(self):
        self.init_fleet_with_gloo()
        self.network()
        if fleet.is_server():
            self.run_server()
        elif fleet.is_worker():
            self.run_offline_infer()
            fleet.stop_worker()
            # self.record_result()
        logger.info("Run Success, Exit.")
Example no. 7
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    fake_num_nodes = 1
    py_reader, loss = StaticSkipGramModel(
        fake_num_nodes,
        args.neg_num,
        args.embed_size,
        sparse_embedding=True,
        shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = build_graph(args)
        # bind gen
        train_ds = ShardedDataset(graph.nodes, args.epoch)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.cpu_batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)
        py_reader.set_batch_generator(lambda: data_loader)

        train_loss = train(exe, paddle.static.default_main_program(),
                           py_reader, loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
Example no. 8
def main(args):
    paddle.enable_static()
    paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)

    fleet.init(is_collective=True)

    graph = load(args.dataset)

    loss = StaticSkipGramModel(graph.num_nodes,
                               args.neg_num,
                               args.embed_size,
                               num_emb_part=args.num_emb_part,
                               shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.sharding = True
    dist_strategy.sharding_configs = {
        "segment_anchors": None,
        "sharding_segment_strategy": "segment_broadcast_MB",
        "segment_broadcast_MB": 32,
        "sharding_degree": int(paddle.distributed.get_world_size()),
    }
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    # bind gen
    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(train_ds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.sample_workers,
                             collate_fn=collate_fn)

    for epoch in range(args.epoch):
        train_loss = train(exe, paddle.static.default_main_program(),
                           data_loader, loss)
        log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
    fleet.stop_worker()

    if fleet.is_first_worker():
        fleet.save_persistables(exe, "./model",
                                paddle.static.default_main_program())
Example no. 9
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe,
                               paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()
Example no. 10
def runtime_main(test_class):
    parser = argparse.ArgumentParser(description='Run Fleet test.')
    parser.add_argument('--role',
                        type=str,
                        required=True,
                        choices=['pserver', 'trainer', 'heter_trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--trainer_endpoints',
                        type=str,
                        required=False,
                        default="")
    parser.add_argument('--heter_trainer_endpoints',
                        type=str,
                        required=False,
                        default="")
    parser.add_argument('--heter_trainer_device',
                        type=str,
                        required=False,
                        default="gpu")
    parser.add_argument('--gloo_path', type=str, required=False, default="")
    parser.add_argument('--current_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument('--mode', type=str, required=False, default='async')
    parser.add_argument('--geo_sgd_need_push_nums',
                        type=int,
                        required=False,
                        default=2)
    parser.add_argument('--reader',
                        type=str,
                        required=False,
                        default='dataset')
    args = parser.parse_args()

    model = test_class()
    role = model.build_role(args)
    fleet.init(role)
    strategy = model.build_strategy(args)
    avg_cost = model.net(args)
    model.build_optimizer(avg_cost, strategy)

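    # pserver and heter_trainer roles run the server side; plain trainers run
    # the selected reader loop and then stop the worker.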
    if args.role == "pserver" or args.role == "heter_trainer":
        model.run_pserver(args)
    else:
        if args.reader == "dataset":
            model.run_dataset_trainer(args)
        else:
            model.run_pyreader_trainer(args)
        fleet.stop_worker()
Example no. 11
    def test_a_sync_optimizer_trainer(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        import paddle.distributed.fleet as fleet

        main_program = paddle.fluid.Program()
        startup_program = paddle.fluid.Program()

        paddle.fluid.framework.switch_main_program(main_program)
        paddle.fluid.framework.switch_startup_program(startup_program)

        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        prog = paddle.fluid.default_main_program()
        self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")

        sends = 0
        sgds = 0
        for op in prog.global_block().ops:
            if op.type == "send":
                sends += 1
            if op.type == "sgd":
                sgds += 1
        self.assertEqual(sends, 7)
        self.assertEqual(sgds, 0)

        fleet.init_worker()
        time.sleep(8)
        fleet.stop_worker()
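Example no. 12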
    def run_trainer(self, role, strategy):
        place = fluid.core.CPUPlace()
        exe = fluid.Executor(place)

        fleet.init(role)
        avg_cost, x, z, y = self.net()
        optimizer = fluid.optimizer.SGD(0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        train_reader = paddle.batch(self.fake_reader(), batch_size=24)
        feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])

        for batch_id, data in enumerate(train_reader()):
            exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[])

        fleet.stop_worker()
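Example no. 13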
    def test_communicator_async(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        strategy.a_sync_configs = {"launch_barrier": False}

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        os.environ["TEST_MODE"] = "1"
        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example no. 14
def main(args):
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    data = pgl.dataset.RedditDataset(args.normalize, args.symmetry)
    log.info("Preprocess finish")
    log.info("Train Examples: %s" % len(data.train_index))
    log.info("Val Examples: %s" % len(data.val_index))
    log.info("Test Examples: %s" % len(data.test_index))
    log.info("Num nodes %s" % data.graph.num_nodes)
    log.info("Num edges %s" % data.graph.num_edges)
    log.info("Average Degree %s" % np.mean(data.graph.indegree()))

    graph = data.graph
    train_index = data.train_index
    val_index = data.val_index
    test_index = data.test_index

    train_label = data.train_label
    val_label = data.val_label
    test_label = data.test_label

    loss, acc = build_net(
        input_size=data.feature.shape[-1],
        num_class=data.num_classes,
        hidden_size=args.hidden_size,
        num_layers=len(args.samples))
    test_program = paddle.static.default_main_program().clone(for_test=True)

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = paddle.fluid.optimizer.Adam(learning_rate=args.lr)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    else:
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        train_ds = ShardedDataset(train_index, train_label)
        valid_ds = ShardedDataset(val_index, val_label)
        test_ds = ShardedDataset(test_index, test_label)

        collate_fn = partial(batch_fn, graph=graph, samples=args.samples)

        train_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        valid_loader = Dataloader(
            valid_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        test_loader = Dataloader(
            test_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        compiled_prog, cpu_num = setup_compiled_prog(loss)

        for epoch in tqdm.tqdm(range(args.epoch)):
            train_loss, train_acc = run(train_loader,
                                        data.feature,
                                        exe,
                                        compiled_prog,
                                        loss,
                                        acc,
                                        phase="train",
                                        cpu_num=cpu_num)

            valid_loss, valid_acc = run(valid_loader,
                                        data.feature,
                                        exe,
                                        test_program,
                                        loss,
                                        acc,
                                        phase="valid",
                                        cpu_num=1)

            log.info("Epoch %s Valid-Loss %s Valid-Acc %s" %
                     (epoch, valid_loss, valid_acc))
        test_loss, test_acc = run(test_loader,
                                  data.feature,
                                  exe,
                                  test_program,
                                  loss,
                                  acc,
                                  phase="test",
                                  cpu_num=1)
        log.info("Epoch %s Test-Loss %s Test-Acc %s" %
                 (epoch, test_loss, test_acc))

        fleet.stop_worker()
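Example no. 15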
def runtime_main(test_class):
    parser = argparse.ArgumentParser(description='Run Fleet test.')
    parser.add_argument('--role',
                        type=str,
                        required=True,
                        choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--trainer_endpoints',
                        type=str,
                        required=False,
                        default="")
    parser.add_argument('--gloo_path', type=str, required=False, default="")
    parser.add_argument('--current_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument('--mode', type=str, required=False, default='geo')
    parser.add_argument('--geo_sgd_need_push_nums',
                        type=int,
                        required=False,
                        default=2)
    parser.add_argument('--reader',
                        type=str,
                        required=False,
                        default='dataset')
    parser.add_argument('--test', type=int, required=False, default=0)
    parser.add_argument('--model_dir', type=str, required=False, default="")
    args = parser.parse_args()

    model = test_class()
    role = model.build_role(args)

    # for distributed inference
    if args.test and args.model_dir != "":
        avg_cost = model.net(args, is_train=False)
        dist_infer = DistributedInfer()
        dist_infer.init_distributed_infer_env(exe=model.get_executor(),
                                              loss=model.avg_cost,
                                              role_maker=role,
                                              dirname=args.model_dir)

        if fleet.is_worker():
            with paddle.static.program_guard(
                    main_program=dist_infer.get_dist_infer_program()):
                model.do_distributed_testing(fleet)
                fleet.stop_worker()
            return

        if fleet.is_server():
            return

    fleet.init(role)
    strategy = model.build_strategy(args)
    avg_cost = model.net(args)
    model.build_optimizer(avg_cost, strategy)

    if args.role == "pserver":
        model.run_pserver(args)
    else:
        if args.reader == "dataset":
            model.run_dataset_trainer(args)
        else:
            model.run_pyreader_trainer(args)

        if args.test:
            test_origin_program = paddle.static.Program()
            test_startup_program = paddle.static.Program()
            with paddle.static.program_guard(
                    main_program=test_origin_program,
                    startup_program=test_startup_program):
                with paddle.utils.unique_name.guard():
                    avg_cost = model.net(args, is_train=False)
            dist_infer = DistributedInfer(main_program=test_origin_program,
                                          startup_program=test_startup_program)
            with paddle.static.program_guard(
                    main_program=dist_infer.get_dist_infer_program()):
                model.do_distributed_testing(fleet)
        fleet.stop_worker()
Example no. 16
    def test_stop_worker(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        if fleet.is_worker():
            fleet.stop_worker()
Example no. 17
fleet.init(is_collective=False)

model = WideDeepModel()
model.net(is_train=True)

optimizer = paddle.optimizer.SGD(learning_rate=0.0001)

strategy = fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)

optimizer.minimize(model.cost)

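# Parameter servers block inside run_server(); workers initialize, run the
# training and inference loops, then call stop_worker() to shut down.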
if fleet.is_server():
    fleet.init_server()
    fleet.run_server()

if fleet.is_worker():
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)

    exe.run(paddle.static.default_startup_program())

    fleet.init_worker()

    distributed_training(exe, model)
    clear_metric_state(model, place)
    distributed_infer(exe, model)

    fleet.stop_worker()
Example no. 18
    def test_communicator_ps_gpu(self):
        with open("test_communicator_ps_gpu.txt", "w") as f:
            data = "1 0.6 1 0.7\n"
            f.write(data)

        os.environ["PADDLE_PSERVER_NUMS"] = "2"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ[
            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001"
        os.environ[
            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002,127.0.0.2:36002"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["FLAGS_selected_gpus"] = "0"
        role = role_maker.PaddleCloudRoleMaker()

        fleet.init(role)
        x = fluid.layers.data(name='x', shape=[1], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        slots_vars = [x, y]

        cost = fluid.layers.square_error_cost(input=x, label=y)
        avg_cost = fluid.layers.mean(cost)

        optimizer = fluid.optimizer.Adam(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        strategy.a_sync_configs = {
            "launch_barrier": False,
            "use_ps_gpu": 1,
        }
        startup_program = paddle.static.Program()
        main_program = paddle.static.Program()
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        dataset = paddle.distributed.InMemoryDataset()
        dataset.init(batch_size=32,
                     thread_num=1,
                     pipe_command="cat",
                     use_var=slots_vars)
        dataset.set_filelist(["test_communicator_ps_gpu.txt"])
        dataset.set_date("20211111")
        dataset.load_into_memory(is_shuffle=True)

        os.environ["TEST_MODE"] = "1"
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(startup_program)
        main_program._fleet_opt = {"stat_var_names": [x.name]}
        fleet.init_worker()

        try:
            exe.train_from_dataset(main_program, dataset)
        except ImportError as e:
            pass
        except Exception as e:
            self.assertTrue(False)
        time.sleep(10)
        fleet.stop_worker()
        os.remove("./test_communicator_ps_gpu.txt")
Example no. 19
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(num_nodes,
                               args.neg_num,
                               args.embed_size,
                               sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(train_ds,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 num_workers=args.sample_workers,
                                 collate_fn=collate_fn)

        cpu_num = int(os.environ.get('CPU_NUM', 1))
        if int(cpu_num) > 1:
            parallel_places = [paddle.CPUPlace()] * cpu_num
            exec_strategy = paddle.static.ExecutionStrategy()
            exec_strategy.num_threads = int(cpu_num)
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.reduce_strategy = paddle.static.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = paddle.static.CompiledProgram(
                paddle.static.default_main_program()).with_data_parallel(
                    loss_name=loss.name,
                    places=parallel_places,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
        else:
            compiled_prog = paddle.static.default_main_program()

        for epoch in range(args.epoch):
            train_loss = train(exe, compiled_prog, data_loader, loss)
            log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())