Exemple #1
0
def distribute_train(args):
    """Train the CTR model asynchronously through the fleet parameter-server API.

    The process initialises fleet, builds the CTR network together with a
    distributed Adam optimizer, and then either serves parameters or runs
    the training loop, depending on the role fleet reports for it.

    Args:
        args: run configuration. The fields read directly here are
            ``learning_rate``, ``epochs``, ``save_model`` and ``model_path``;
            the object is also forwarded to ``CTR.input_data``, ``CTR.net``
            and ``get_dataset``.
    """
    # The role maker determines which part this machine/process plays in the
    # distributed job (per the original note: from environment variables);
    # fleet is initialised with it before anything else is built.
    fleet.init(role_maker.PaddleCloudRoleMaker())

    # Distributed run mode: asynchronous updates, with parameters split at
    # runtime so they can be spread across nodes.
    dist_config = DistributeTranspilerConfig()
    dist_config.sync_mode = False
    dist_config.runtime_split_send_recv = True

    model = CTR()
    feed_vars = model.input_data(args)
    avg_cost, auc_var = model.net(feed_vars, args)

    # Wrap the plain optimizer with the strategy above; minimize() builds
    # the distributed program.
    base_optimizer = fluid.optimizer.Adam(args.learning_rate)
    dist_optimizer = fleet.distributed_optimizer(base_optimizer, dist_config)
    dist_optimizer.minimize(avg_cost)

    # Parameter-server node: initialise and serve, nothing else to do.
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
        return

    if not fleet.is_worker():
        return

    # Worker node: initialise, then run the fleet startup program.
    fleet.init_worker()
    executor = fluid.Executor(fluid.CPUPlace())
    executor.run(fleet.startup_program)

    dataset, file_list = get_dataset(feed_vars, args)
    for epoch in range(args.epochs):
        # Shuffle at file granularity before every epoch.
        random.shuffle(file_list)
        dataset.set_filelist(file_list)

        # Workers train on fleet.main_program, the program fleet prepared
        # for distributed execution.
        started = time.time()
        executor.train_from_dataset(
            program=fleet.main_program,
            dataset=dataset,
            fetch_list=[auc_var],
            fetch_info=["Epoch {} auc ".format(epoch)],
            print_period=100,
            debug=False)
        elapsed = time.time() - started
        logger.info("epoch %d finished, use time=%d\n" % (epoch, elapsed))

        # Only the first worker writes checkpoints.
        if args.save_model and fleet.is_first_worker():
            checkpoint_dir = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
            fleet.save_persistables(executor=executor,
                                    dirname=checkpoint_dir)

    fleet.stop_worker()
    logger.info("Distribute Train Success!")
Exemple #2
0
def main(args):
    """Build the GraphSage model and run this process as a fleet server or worker.

    Args:
        args: run configuration. ``lr``, ``optimizer`` and
            ``warm_start_from_dir`` are read here; the object is also passed
            to the model, the reader and ``train_prog``.
    """
    log.info("start")

    # Both values come from the environment. The trainer count is parsed
    # (and so still validated as an integer) but not used further here.
    trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    cpu_count = int(os.getenv("CPU_NUM", 10))

    model = GraphsageModel(args)
    loss = model.forward()
    pyreader = fake_py_reader(
        reader.get_iter(args, model.graph_wrapper, 'train'), cpu_count)

    # Initialise fleet, then attach the optimizer to the loss.
    init_role()
    optimization(args.lr, loss, args.optimizer)

    # Server role: initialise from the warm-start directory and serve.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if not fleet.is_worker():
        return

    # Worker role only from here on.
    log.info("start init worker done")
    fleet.init_worker()
    log.info("init worker done")

    executor = F.Executor(F.CPUPlace())
    executor.run(fleet.startup_program)
    log.info("Startup done")

    compiled = build_complied_prog(fleet.main_program, loss)
    train_prog(executor, compiled, model, pyreader, args)
Exemple #3
0
    def run_pserver(self, args):
        """Start this process as a fleet parameter server.

        Args:
            args: run arguments. Reads ``role``, ``current_id``, ``trainers``,
                ``endpoints`` (comma-separated), ``sync_mode``,
                ``geo_sgd_mode`` and ``geo_sgd_need_push_nums``.

        Raises:
            ValueError: if ``args.role`` is not "PSERVER" (case-insensitive).
        """
        if args.role.upper() != "PSERVER":
            raise ValueError("args role must be PSERVER")

        server_role = role_maker.UserDefinedRoleMaker(
            current_id=args.current_id,
            role=role_maker.Role.SERVER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(","),
        )
        fleet.init(server_role)

        # Transpiler settings are taken straight from the run arguments.
        dist_config = DistributeTranspilerConfig()
        dist_config.sync_mode = args.sync_mode
        dist_config.geo_sgd_mode = args.geo_sgd_mode
        dist_config.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

        loss = self.net()

        sgd = fluid.optimizer.SGD(LEARNING_RATE)
        fleet.distributed_optimizer(sgd, dist_config).minimize(loss)

        fleet.init_server()
        fleet.run_server()
Exemple #4
0
    def run_pserver(self, args):
        """Bring this process up as a fleet parameter server.

        The role, strategy, network and optimizer all come from this
        object's own ``build_role``, ``build_strategy``, ``net`` and
        ``build_optimizer`` methods; afterwards the server is initialised
        and run.

        Args:
            args: configuration handed to ``build_role``, ``build_strategy``
                and ``net``.
        """
        server_role = self.build_role(args)
        fleet.init(server_role)

        dist_strategy = self.build_strategy(args)
        loss = self.net(args)
        self.build_optimizer(loss, dist_strategy)

        fleet.init_server()
        fleet.run_server()
Exemple #5
0
    def run_pserver(self, role, strategy):
        """Initialise fleet with ``role`` and run this process as a parameter server.

        Args:
            role: role object passed to ``fleet.init``.
            strategy: distributed strategy passed to
                ``fleet.distributed_optimizer``.
        """
        fleet.init(role)

        # net() yields three values; only the loss is needed here.
        loss, _x, _y = self.net()

        sgd = fluid.optimizer.SGD(0.01)
        fleet.distributed_optimizer(sgd, strategy).minimize(loss)

        fleet.init_server()
        fleet.run_server()
Exemple #6
0
 def server(self, context):
     """Load the initial TDM model into the parameter server and run it.

     The model directory is read from the global env key
     ``cluster.init_model_path`` in the ``train.startup`` namespace and
     must not be the empty string.

     Args:
         context: mutable mapping; ``context['is_exit']`` is set to True
             once ``fleet.run_server()`` returns.
     """
     model_dir = envs.get_global_env("cluster.init_model_path", "",
                                     "train.startup")
     assert model_dir != "", "Cluster train must has init_model for TDM"

     fleet.init_server(model_dir)
     logger.info("TDM: load model from {}".format(model_dir))
     fleet.run_server()

     context['is_exit'] = True
Exemple #7
0
def main(args):
    """Build the metapath2vec model and run this process as a fleet server or worker.

    Args:
        args: run configuration. Reads ``num_nodes``, ``epochs``,
            ``batch_size``, ``walk_len``, ``win_size``, ``optimizer``, ``lr``
            and ``warm_start_from_dir``. ``args.lr`` is scaled in place when
            the optimizer is "sgd".
    """
    log.info("start")

    # NOTE(review): the trainer count defaults to 0 when the variable is
    # unset, which would make the step computation below divide by zero;
    # confirm the launcher always exports PADDLE_TRAINERS_NUM.
    trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    cpu_count = int(os.getenv("CPU_NUM", 10))

    model = Metapath2vecModel(config=args)
    pyreader = model.pyreader
    loss = model.forward()

    # Initialise fleet.
    fleet.init(role_maker.PaddleCloudRoleMaker())

    train_steps = math.ceil(args.num_nodes * args.epochs / args.batch_size /
                            cpu_count / trainer_count)
    log.info("Train step: %s" % train_steps)

    # For SGD the learning rate is multiplied by the effective batch size.
    effective_batch = args.batch_size * args.walk_len * args.win_size
    if args.optimizer == "sgd":
        args.lr *= effective_batch
    optimization(args.lr, loss, train_steps, args.optimizer)

    # Server role: initialise from the warm-start directory and serve.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if not fleet.is_worker():
        return

    # Worker role only from here on.
    log.info("start init worker done")
    fleet.init_worker()
    log.info("init worker done")

    executor = F.Executor(F.CPUPlace())
    executor.run(fleet.startup_program)
    log.info("Startup done")

    graph = m2vGraph(args)
    log.info("Build graph done.")

    data_generator = multiprocess_data_generator(args, graph)

    # Pull the first 101 batches from a fresh generator and log how long
    # each one took, before the generator is handed to the reader.
    last_tick = time.time()
    for step, _batch in enumerate(data_generator()):
        log.info("iter %s: %s s" % (step, time.time() - last_tick))
        last_tick = time.time()
        if step == 100:
            break

    pyreader.decorate_tensor_provider(data_generator)
    pyreader.start()

    compiled = build_complied_prog(fleet.main_program, loss)
    train_prog(executor, compiled, loss, pyreader, args, train_steps)
Exemple #8
0
 def run_server(self, FLAGS):
     """
     Initialise and run the fleet parameter server.

     Args:
         FLAGS: run flags. ``init_pretrain_model`` is passed to
             ``fleet.init_server``; when ``init_train_params`` is not None
             the net's ``init_params`` is invoked on a CPU place.

     Returns:
         True, once ``fleet.run_server()`` has returned.
     """
     # TODO: load pre model
     fleet.init_server(FLAGS.init_pretrain_model)

     if FLAGS.init_train_params is not None:
         cpu_place = fluid.CPUPlace()
         net = self.paddle_env['factory']['net']
         net.init_params(cpu_place)

     logging.info("PServer init success!")
     fleet.run_server()
     return True
Exemple #9
0
    def init_and_run_ps_worker(self, ckpt_path):
        """Create the CPU executor, then act as parameter server or worker.

        A server process warm-starts from ``ckpt_path``, serves, and exits
        the interpreter when serving ends. A worker process initialises
        itself and runs the fleet startup program on ``self.exe``.

        Args:
            ckpt_path: checkpoint path forwarded to ``self.warmstart``.
        """
        self.exe = F.Executor(F.CPUPlace())

        if tfleet.is_server():
            tfleet.init_server()
            self.warmstart(tfleet.startup_program, path=ckpt_path)
            tfleet.run_server()
            # The server never continues into the worker path.
            exit()

        if not tfleet.is_worker():
            return

        log.info("start init worker done")
        tfleet.init_worker()
        self.exe.run(tfleet.startup_program)
Exemple #10
0
 def run_pserver(self, args):
     """
     Run the parameter-server process; subclasses do not need to override it.

     Args:
         args (ArgumentParser): run arguments used to configure dist fleet.
             Reads ``role``, ``current_id``, ``trainers``, ``endpoints``
             (comma-separated) and the ``run_params`` mapping (optional
             key ``model_dir``).

     Raises:
         ValueError: if ``args.role`` is not "PSERVER" (case-insensitive).
     """
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet

     if args.role.upper() != "PSERVER":
         raise ValueError("args role must be PSERVER")

     server_role = role_maker.UserDefinedRoleMaker(
         current_id=args.current_id,
         role=role_maker.Role.SERVER,
         worker_num=args.trainers,
         server_endpoints=args.endpoints.split(","),
     )
     fleet.init(server_role)

     # _set_strategy is expected to populate self.strategy, read below.
     self._set_strategy(args)
     loss = self.net(args)

     sgd = fluid.optimizer.SGD(LEARNING_RATE)
     fleet.distributed_optimizer(sgd, self.strategy).minimize(loss)

     # An empty model_dir is passed when the run params do not name one.
     fleet.init_server(model_dir=args.run_params.get("model_dir", ""))
     fleet.run_server()
 def run_pserver(self, args):
     """Configure the transpiler from ``args.run_params`` and run as a pserver.

     Args:
         args: run arguments. Reads ``role``, ``current_id``, ``trainers``,
             ``endpoints`` (comma-separated) and the ``run_params`` mapping
             with keys ``sync_mode``, ``async_mode``, ``slice_var_up``,
             ``enable_dc_asgd``, ``split_method``, ``wait_port``,
             ``runtime_split_send_recv``, ``use_hierarchical_allreduce``,
             ``geo_sgd`` and ``push_nums``.

     Raises:
         ValueError: if ``args.role`` is not "PSERVER" (case-insensitive).
     """
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
     import paddle.fluid as fluid
     from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
     from paddle.fluid.transpiler.ps_dispatcher import HashName

     # Fix the seed on both default programs.
     fluid.default_startup_program().random_seed = 1
     fluid.default_main_program().random_seed = 1

     if args.role.upper() != "PSERVER":
         raise ValueError("args role must be PSERVER")

     role = role_maker.UserDefinedRoleMaker(
         current_id=args.current_id,
         role=role_maker.Role.SERVER,
         worker_num=args.trainers,
         server_endpoints=args.endpoints.split(","))
     fleet.init(role)

     strategy = DistributeTranspilerConfig()
     strategy.sync_mode = args.run_params["sync_mode"]
     strategy.async_mode = args.run_params["async_mode"]
     strategy.mode = "pserver"
     strategy.slice_var_up = args.run_params['slice_var_up']
     strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
     # A truthy 'split_method' selects HashName, anything else RoundRobin.
     # Previously RoundRobin was assigned unconditionally after the `if`,
     # so the HashName choice never took effect.
     if args.run_params['split_method']:
         strategy.split_method = HashName
     else:
         strategy.split_method = RoundRobin
     strategy.wait_port = args.run_params['wait_port']
     strategy.runtime_split_send_recv = args.run_params['runtime_split_send_recv']
     strategy.use_hierarchical_allreduce = args.run_params['use_hierarchical_allreduce']
     #strategy.hierarchical_allreduce_exter_nranks = args.run_params['hierarchical_allreduce_exter_nranks']
     #strategy.hierarchical_allreduce_inter_nranks = args.run_params['hierarchical_allreduce_inter_nranks']
     strategy.geo_sgd_mode = args.run_params['geo_sgd']
     strategy.geo_sgd_need_push_nums = args.run_params['push_nums']

     avg_cost = self.net(args)
     optimizer = fluid.optimizer.SGD(LEARNING_RATE)
     optimizer = fleet.distributed_optimizer(optimizer, strategy)
     optimizer.minimize(avg_cost)

     fleet.init_server()
     fleet.run_server()
def train(use_cuda, train_sample_dir, test_sample_dir, old_model, output_model,
          is_local, is_increment):
    """Train the three-head model either locally or through fleet.

    Args:
        use_cuda: run on ``CUDAPlace(0)`` when true, otherwise on CPU.
        train_sample_dir: training data location; only read in the
            distributed branch (the local branch uses
            ``streaming_data_reader()``).
        test_sample_dir: passed to ``infer`` after each periodic save.
        old_model: existing model to continue from when ``is_increment``
            is true.
        output_model: directory for periodic saves; nothing is saved or
            evaluated when it is None.
        is_local: true for single-machine training, false for the fleet
            parameter-server path.
        is_increment: whether to load ``old_model`` before training.
    """
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']

    role = role_maker.PaddleCloudRoleMaker()
    # Fully asynchronous training.
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)

        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(
            paddle.batch(
                paddle.reader.shuffle(streaming_data_reader(), buf_size=8192),
                batch_size=BATCH_SIZE),
            multi_devices=False,
            drop_last=True)
        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_program = fluid.default_main_program()
        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)

        # NOTE(review): exec_strategy and build_strategy are configured here
        # but not passed to the ParallelExecutor below; confirm whether
        # that is intentional.
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM
        main_program.num_threads = CPU_NUM
        build_strategy = fluid.BuildStrategy()
        build_strategy.async_mode = True

        # Parallel training, which is faster.
        train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                          main_program=main_program,
                                          loss_name=avg_cost.name)

        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data,
                                          fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))

                # Report the mean cost of the last window, then reset it.
                if batch_id % 100 == 0 and batch_id != 0:
                    # Single-argument print() behaves the same on Python 2
                    # and 3; the former print statement was Python 2 only.
                    print("Pass %d, batch %d, cost %s" %
                          (pass_id, batch_id, np.array(cost_list).mean()))
                    cost_list = []
                # Periodic save + evaluation (also fires on batch 0).
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model, feed_order, [
                                navi_predict, voice_navi_predict,
                                speed_navi_predict, avg_cost
                            ], exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)

    else:
        # Initialise the fleet environment.
        fleet.init(role)
        # Wrap the optimizer with the distributed strategy configuration.
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            # Initialise the worker configuration.
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(
                    paddle.reader.shuffle(data_reader(train_sample_dir),
                                          buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False,
                drop_last=True)
            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog,
                                         feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))

                    # Report the mean cost of the last window, then reset it.
                    if batch_id % 100 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s" %
                              (pass_id, batch_id, np.array(cost_list).mean()))
                        cost_list = []
                    # Only the first worker saves and evaluates.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe, output_model, feed_order, [
                                    navi_predict, voice_navi_predict,
                                    speed_navi_predict, avg_cost
                                ])
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
        # NOTE(review): this runs for every process that reaches the end of
        # the distributed branch, not only for workers; confirm intended.
        fleet.stop_worker()
Exemple #13
0
def train(args):
    """Run distributed TDM training as a fleet parameter server or worker.

    Args:
        args: run configuration. Reads ``random_seed``, ``sync_mode``
            ("sync", "half_async" or "async"), ``learning_rate``,
            ``init_model_files_path``, ``epoch_num`` and
            ``model_files_path``; the object is also forwarded to
            ``TdmTrainNet``, ``tdm_sampler_prepare``, ``tdm_child_prepare``
            and ``get_dataset``.

    Raises:
        ValueError: if ``args.sync_mode`` is not one of the three supported
            modes.
    """
    # set random
    program = fluid.default_main_program()
    program.random_seed = args.random_seed

    # The role maker determines which part this machine/process plays in the
    # distributed job (per the original note: from environment variables);
    # fleet is then initialised with it.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # Pick the distributed run mode requested by the caller.
    if args.sync_mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif args.sync_mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()
    elif args.sync_mode == "async":
        strategy = StrategyFactory.create_async_strategy()
    else:
        # Without this branch `strategy` stayed unbound and the failure only
        # surfaced later as a NameError when the optimizer was wrapped.
        raise ValueError(
            "unsupported sync_mode {!r}; expected 'sync', 'half_async' "
            "or 'async'".format(args.sync_mode))

    # set model
    logger.info("TDM Begin build network.")
    tdm_model = TdmTrainNet(args)
    inputs = tdm_model.input_data()

    logger.info("TDM Begin load tree travel & layer.")
    avg_cost, acc = tdm_model.tdm(inputs)
    logger.info("TDM End build network.")
    # Configure the distributed optimizer with the chosen strategy and build
    # the program.
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=args.learning_rate,
                                              lazy_mode=True)

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    logger.info("TDM End append backward.")

    # Run different logic depending on the node's role.
    if fleet.is_server():
        logger.info("TDM Run server ...")
        # Initialise and run the parameter-server node.
        logger.info("TDM init model path: {}".format(
            args.init_model_files_path))
        # Every model variable except the TDM tree-structure ones should be
        # initialised here.
        fleet.init_server(args.init_model_files_path)
        # Overwrite the learning-rate variable, if present, with the
        # configured value.
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'),
                fluid.CPUPlace())
            logger.info("TDM Set learning rate {}".format(args.learning_rate))
        else:
            logger.info("TDM Didn't find learning_rate_0 param")
        logger.info("TDM load End")

        fleet.run_server()
        logger.info("TDM Run server success!")
    elif fleet.is_worker():
        logger.info("TDM Run worker ...")
        # Initialise the worker node.
        fleet.init_worker()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        logger.info("TDM Run Startup Begin")
        # Run the fleet startup program, which contains the distributed
        # set-up steps.
        exe.run(fleet.startup_program)

        # Set Learning Rate
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'), place)
            logger.info("TDM Set learning rate {}".format(args.learning_rate))

        # Set TDM Variable
        logger.info("TDM Begin load parameter.")
        # Set TDM_Tree_Info
        # Tree-structure variables take no part in network updates and are
        # not stored on the parameter server, so they are set locally here.
        tdm_param_prepare_dict = tdm_sampler_prepare(args)
        tdm_param_prepare_dict['info_array'] = tdm_child_prepare(args)
        Numpy_model = {}
        Numpy_model['TDM_Tree_Travel'] = tdm_param_prepare_dict['travel_array']
        Numpy_model['TDM_Tree_Layer'] = tdm_param_prepare_dict['layer_array']
        Numpy_model['TDM_Tree_Info'] = tdm_param_prepare_dict['info_array']
        # In distributed training the embedding lives on the parameter
        # server and does not need to be set locally.
        for param_name in Numpy_model:
            param_t = fluid.global_scope().find_var(param_name).get_tensor()
            param_t.set(Numpy_model[str(param_name)].astype('int32'), place)

        logger.info("TDM Run Startup End")

        # Train loop
        dataset, file_list, example_num = get_dataset(inputs, args)
        logger.info("TDM Distributed training begin ...")
        for epoch in range(args.epoch_num):
            # local shuffle
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Workers train on fleet.main_program, the program fleet
            # prepared for distributed execution.
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[acc, avg_cost],
                                   fetch_info=[
                                       "Epoch {} acc ".format(epoch),
                                       "Epoch {} loss ".format(epoch)
                                   ],
                                   print_period=1,
                                   debug=False)
            end_time = time.time()
            logger.info(
                "Epoch {} finished, use time {} second, speed {} example/s".
                format(epoch, end_time - start_time,
                       example_num * 1.0 / (end_time - start_time)))

            # Only the first worker saves the model.
            if fleet.is_first_worker():
                model_path = os.path.join(args.model_files_path,
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
                logger.info("Begin upload files")
                # upload_files(model_path, warm_up=False)
                # In a distributed environment the model can be uploaded to
                # HDFS at this point.
        logger.info("TDM Before stop worker")
        fleet.stop_worker()
        logger.info("TDM Distributed training success!")
# Script fragment: runs asynchronous fleet training at module level.
# `input_folder`, `output`, `optimizer`, `avg_cost`, `dataset` and `auc_var`
# are expected to be defined earlier in the script (not visible here).

# One path per line of `output` (bytes decoded as ASCII), each prefixed
# with `input_folder`.
train_filelist = [
    "{}{}".format(input_folder, f)
    for f in output.decode('ascii').strip().split('\n')
]
# Initialise fleet with the cloud role maker.
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

# Asynchronous mode.
config = DistributeTranspilerConfig()
config.sync_mode = False

# Rebinds `optimizer` to its distributed wrapper before minimising.
optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init_worker()
    # NOTE(review): this uses the fluid default startup/main programs
    # rather than fleet.startup_program / fleet.main_program — confirm
    # this is intended.
    exe.run(fluid.default_startup_program())
    print("startup program done.")
    # Each worker trains on the subset of files fleet assigns to it.
    fleet_filelist = fleet.split_files(train_filelist)
    dataset.set_filelist(fleet_filelist)
    exe.train_from_dataset(program=fluid.default_main_program(),
                           dataset=dataset,
                           fetch_list=[auc_var],
                           fetch_info=["auc"],
                           debug=True)
    print("end .... ")
    # NOTE(review): fleet.stop_worker() is not called in this branch —
    # confirm whether it happens elsewhere.
# save model here
Exemple #15
0
def train(args):
    """Run CTR training through fleet with a user-defined role.

    Args:
        args: run arguments. Reads ``endpoints`` (comma-separated),
            ``role`` ("PSERVER" or "TRAINER", case-insensitive),
            ``current_endpoint`` (for a pserver) and ``trainers``.
    """
    datas, avg_cost, predict, train_file_path = model()

    endpoints = args.endpoints.split(",")

    # A pserver's id is its position in the endpoint list; every other
    # role uses id 0.
    if args.role.upper() == "PSERVER":
        node_id = endpoints.index(args.current_endpoint)
    else:
        node_id = 0

    # Anything that is not "TRAINER" is treated as a server.
    if args.role.upper() == "TRAINER":
        node_role = role_maker.Role.WORKER
    else:
        node_role = role_maker.Role.SERVER

    role = role_maker.UserDefinedRoleMaker(
        current_id=node_id,
        role=node_role,
        worker_num=args.trainers,
        server_endpoints=endpoints,
    )

    executor = fluid.Executor(fluid.CPUPlace())
    fleet.init(role)

    # Asynchronous mode.
    dist_config = DistributeTranspilerConfig()
    dist_config.sync_mode = False

    sgd = fluid.optimizer.SGD(learning_rate=0.0001)
    fleet.distributed_optimizer(sgd, dist_config).minimize(avg_cost)

    if fleet.is_server():
        logger.info("run pserver")

        fleet.init_server()
        fleet.run_server()
        return

    if not fleet.is_worker():
        return

    logger.info("run trainer")

    fleet.init_worker()
    executor.run(fleet.startup_program)

    # The same training file is listed once per dataset thread.
    thread_num = 2
    filelist = [train_file_path for _ in range(thread_num)]

    # Configure the dataset.
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_batch_size(128)
    dataset.set_use_var(datas)
    dataset.set_pipe_command('python ctr_dataset_reader.py')

    dataset.set_filelist(filelist)
    dataset.set_thread(thread_num)

    for epoch_id in range(10):
        logger.info("epoch {} start".format(epoch_id))
        epoch_started = time.time()
        dataset.set_filelist(filelist)
        executor.train_from_dataset(
            program=fleet.main_program,
            dataset=dataset,
            fetch_list=[avg_cost],
            fetch_info=["cost"],
            print_period=100,
            debug=False)
        pass_time = time.time() - epoch_started
        logger.info("epoch {} finished, pass_time {}".format(epoch_id,
                                                             pass_time))

    fleet.stop_worker()
Exemple #16
0
def train(use_cuda, save_dirname, is_local, is_increment):
    """Train the model either in a single process or as a fleet job.

    Args:
        use_cuda: If true, execute on ``fluid.CUDAPlace(0)``; otherwise on
            ``fluid.CPUPlace()``.
        save_dirname: Directory that receives the periodic inference model
            and persistables. ``None`` disables saving and the ``infer`` run
            that follows each save.
        is_local: If true, train locally with a data-parallel
            ``CompiledProgram``. Otherwise act as a fleet server or worker,
            with the role supplied by ``PaddleCloudRoleMaker``.
        is_increment: Local mode only. If true, load parameters from
            ``old_model`` and zero the AUC state variables before training.
    """
    # Imported here (as the original did) because the module-level imports
    # are outside this block.
    import time

    # NOTE(review): nothing in this function ever sets ``old_model``, so the
    # ``is_increment`` path below has no directory to load from - confirm how
    # the checkpoint path is meant to be supplied.
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    # Despite the variable name this is Adam. In distributed mode it is
    # wrapped by fleet.distributed_optimizer further down.
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)

        # Sixteen reader instances over the same directory, merged by
        # multiprocess_reader and fed through the model's loader.
        readers = [data_reader(cluster_train_dir) for _ in range(16)]
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers,
            batch_size=BATCH_SIZE,
            places=fluid.cpu_places(CPU_NUM))

        exe.run(fluid.default_startup_program())
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        # Original author's note: the Reduce strategy is faster on CPU.
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce
        build_strategy.fuse_broadcast_ops = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

        if is_increment:
            # Fine-tune from an earlier model. load_params is given the raw
            # Program (not the CompiledProgram wrapper built above).
            fluid.io.load_params(exe, old_model, main_prog)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        cost_list = []
        auc_list = []
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                # r_time: time spent waiting for this batch from the loader.
                r_time = time.time() - s_time
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                # t_time: time spent executing the training step.
                t_time = time.time() - st_time
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))

                if batch_id % 10 == 0 and batch_id != 0:
                    # Single-argument print(...) behaves the same on
                    # Python 2 and Python 3.
                    print("Pass %d, batch %d, cost %s auc %s readtime %f "
                          "traintime %f" %
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time))
                    cost_list = []
                    auc_list = []
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            feed_order,
                            [predict, avg_cost, auc_batch], exe
                        )
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                s_time = time.time()
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)

    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully asynchronous training.
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # Initialise the fleet environment for this node.
        fleet.init(role)

        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        if fleet.is_worker():
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(
                    paddle.reader.shuffle(
                        data_reader(cluster_train_dir), buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    # Time the training step from its own start stamp; the
                    # original subtracted the read *duration* from the wall
                    # clock and so reported a meaningless value.
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog, feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))

                    if batch_id % 10 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s auc %s readtime %f "
                              "traintime %f" %
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time))
                        cost_list = []
                        auc_list = []
                    # Only the first worker writes checkpoints.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe,
                                save_dirname,
                                feed_order,
                                [predict, avg_cost, auc_batch]
                            )
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
        fleet.stop_worker()
    def test_pslib_2(self):
        """Exercise the fleet role makers and a dummy ``Fleet`` subclass.

        The test configures a local one-trainer / one-pserver cluster through
        environment variables, builds a tiny network, and then calls the
        public and private helpers of ``GeneralRoleMaker``, ``RoleMakerBase``
        and ``MPISymetricRoleMaker``. It returns early, printing a skip
        message, if ``fleet.init`` or the optimizer/server setup raises.
        """
        import paddle.fluid as fluid
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
        from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase

        # Describe the local cluster: this process is trainer 0 on port 36001
        # and the single parameter server is listed on port 36002.
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_PORT"] = "36001"
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINERS_NUM"] = "1"
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        # Any failure of fleet.init is treated as "environment not available"
        # and skips the rest of the test (the message blames missing mpi4py).
        try:
            fleet.init(None)
        except:
            print("no mpi4py, skip test_pslib_2")
            return
        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
        # Minimal network: one fc layer over `show`, log loss against `click`.
        with fluid.program_guard(train_program, startup_program):
            show = fluid.layers.data(name="show", shape=[-1, 1], \
                                     dtype="float32", lod_level=1, append_batch_size=False)
            fc = fluid.layers.fc(input=show, size=1, act=None)
            label = fluid.layers.data(name="click", shape=[-1, 1], \
                                      dtype="int64", lod_level=1, append_batch_size=False)
            label_cast = fluid.layers.cast(label, dtype='float32')
            cost = fluid.layers.log_loss(fc, label_cast)
        # Any failure while wrapping the optimizer or starting the server is
        # also treated as "not supported here" and skips the test.
        try:
            adam = fluid.optimizer.Adam(learning_rate=0.000005)
            adam = fleet.distributed_optimizer(adam)
            adam.minimize([cost], [scope])
            fleet.run_server()
        except:
            print("do not support pslib test, skip")
            return
        # An invalid TRAINING_ROLE is expected to make generate_role() raise;
        # the exception is swallowed on purpose.
        os.environ["TRAINING_ROLE"] = "wrong"
        try:
            role1 = GeneralRoleMaker(path="./test_gloo_1")
            role1.generate_role()
        except:
            print("catch expected error of wrong TRAINING_ROLE")
        # Switch to the PSERVER role, with the pserver list pointing at this
        # process's own port.
        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
        role2 = GeneralRoleMaker(path="./test_gloo_2")
        role2._finalize()
        role2._all_gather(1)
        role2._all_gather(1)
        role2._barrier_server()
        role2._all_gather(1)
        role3 = GeneralRoleMaker(path="./test_gloo_3")
        role3._worker_gather(1)
        role3._worker_gather(1)
        # Back to the TRAINER role for the remaining role makers.
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
        role4 = GeneralRoleMaker(path="./test_gloo_4")
        role4._worker_gather(1)
        role4._get_rank()
        role4._get_size()
        role4._all_comm.init()
        # Each role maker below gets its own path and most methods are called
        # twice - presumably to cover both the first call and the
        # already-initialised path; TODO confirm.
        role5 = GeneralRoleMaker(path="./test_gloo_5")
        role5.get_local_endpoint()
        role5.get_local_endpoint()
        role6 = GeneralRoleMaker(path="./test_gloo_6")
        role6.get_trainer_endpoints()
        role6.get_trainer_endpoints()
        role7 = GeneralRoleMaker(path="./test_gloo_7")
        role7.get_pserver_endpoints()
        role7.get_pserver_endpoints()
        role8 = GeneralRoleMaker(path="./test_gloo_8")
        role8.is_worker()
        role8.is_worker()
        role9 = GeneralRoleMaker(path="./test_gloo_9")
        role9.is_server()
        role9.is_server()
        role10 = GeneralRoleMaker(path="./test_gloo_10")
        role10.is_first_worker()
        role10.is_first_worker()
        role11 = GeneralRoleMaker(path="./test_gloo_11")
        role11.worker_index()
        role11.worker_index()
        role12 = GeneralRoleMaker(path="./test_gloo_12")
        role12.server_index()
        role12.server_index()
        role13 = GeneralRoleMaker(path="./test_gloo_13")
        role13.worker_num()
        role13.worker_num()
        role14 = GeneralRoleMaker(path="./test_gloo_14")
        role14.server_num()
        role14.server_num()
        role15 = GeneralRoleMaker(path="./test_gloo_15")
        role15._barrier_worker()
        role15._barrier_worker()
        role16 = GeneralRoleMaker(path="./test_gloo_16")
        role16._barrier_all()
        role16._barrier_all()
        role17 = GeneralRoleMaker(path="./test_gloo_17")
        role17._barrier_server()
        role17._barrier_server()
        role18 = GeneralRoleMaker(path="./test_gloo_18")
        role18._worker_num()
        role18._worker_num()
        role19 = GeneralRoleMaker(path="./test_gloo_19")
        role19._server_num()
        role19._server_num()
        role20 = GeneralRoleMaker(path="./test_gloo_20")
        a = [1]
        b = [0]
        role20._all_reduce(a, b)
        role21 = GeneralRoleMaker(path="./test_gloo_21")
        role21.all_reduce_worker([], [])
        role21.all_reduce_worker([], [])
        role21.barrier_worker()
        role21.barrier_all()
        role22 = GeneralRoleMaker(path="./test_gloo_22")
        role22._get_rank()
        role22._get_rank()
        os.environ["PADDLE_PSERVER_ID"] = "0"
        role23 = GeneralRoleMaker(path="./test_gloo_23")
        role23._get_size()
        role23._get_size()
        # Write a one-line data file, load it through an in-memory dataset,
        # query its sizes via fleet, then delete the file again.
        with open("test_fleet_gloo_role_maker_1.txt", "w") as f:
            data = "1 1 1 1\n"
            f.write(data)

        # NOTE(review): `paddle` is not imported inside this method; this
        # relies on an import outside the visible block - confirm.
        dataset = paddle.distributed.InMemoryDataset()
        dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
        dataset._set_use_var([show, label])
        dataset.load_into_memory()
        dataset.get_memory_data_size(fleet)
        dataset.get_shuffle_data_size(fleet)
        os.remove("./test_fleet_gloo_role_maker_1.txt")

        class TmpClass():
            """
            Dummy role maker whose collective methods do nothing.
            """
            def __init__(self):
                pass

            def all_reduce_worker(self, input, output):
                """
                dummy all reduce worker

                Args:
                    input(None): fake input
                    output(None): fake output
                """
                pass

            def barrier_worker(self):
                """
                dummy barrier worker
                """
                pass

        from paddle.fluid.incubate.fleet.base.fleet_base import Fleet

        class TmpFleet(Fleet):
            """
            Dummy Fleet subclass: every overridden method is a no-op.
            """
            def __init__(self):
                super(TmpFleet, self).__init__()
                self._role_maker = None

            def init_worker(self):
                """
                dummy init worker
                """
                pass

            def init_server(self, model_dir=None):
                """
                dummy init server

                Args:
                    model_dir(None): fake model_dir
                """
                pass

            def run_server(self):
                """
                dummy run server
                """
                pass

            def stop_worker(self):
                """
                dummy stop worker
                """
                pass

            def distributed_optimizer(self, optimizer, strategy=None):
                """
                dummy distributed optimizer

                Args:
                    optimizer(None): fake optimizer
                    strategy(None): fake strategy
                """
                pass

            def save_inference_model(self):
                """
                dummy save inference model
                """
                pass

            def save_persistables(self):
                """
                dummy save persistables
                """
                pass

        # Call the Fleet base-class helpers with the dummy role maker
        # installed as the fleet's _role_maker.
        os.environ["TRAINING_ROLE"] = "TRAINER"
        tmp = TmpFleet()
        tmp._role_maker = TmpClass()
        tmp.all_reduce_worker([], [])
        tmp.barrier_worker()
        # Redundant: GeneralRoleMaker was already imported at the top of
        # this method.
        from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
        # Call the collective methods directly on the RoleMakerBase class.
        tmp = RoleMakerBase()
        tmp.all_gather(1)
        tmp.all_reduce_worker([], [])
        tmp.barrier_worker()
        tmp.barrier_all()
        from paddle.fluid.incubate.fleet.base.role_maker import \
            MPISymetricRoleMaker
        # Same collective calls on fresh MPISymetricRoleMaker instances.
        tmp1 = MPISymetricRoleMaker()
        tmp1.all_gather(1)
        tmp1.all_gather(1)
        tmp2 = MPISymetricRoleMaker()
        tmp2.all_reduce_worker([], [])
        tmp3 = MPISymetricRoleMaker()
        tmp3.barrier_worker()
        tmp3.barrier_worker()
        tmp4 = MPISymetricRoleMaker()
        tmp4.barrier_all()
        tmp4.barrier_all()
def fit():
    """Train and evaluate the image classifier as one node of a fleet job.

    The node's role is built with ``UserDefinedRoleMaker`` from the
    module-level ``current_id`` and ``roles`` values (``roles == 1`` selects
    the worker role, anything else the server role). A server node
    initialises and runs the parameter server. A worker node trains for
    ``EPOCH_NUM`` passes, evaluates on the test reader after each pass, saves
    the model through ``save`` and finally stops the worker.

    Besides ``current_id`` and ``roles`` this relies on the module-level
    names ``in_file_path``, ``EPOCH_NUM`` and ``model_file_path`` and on the
    helpers ``createDataList``, ``dataReader``, ``networkConfiguration`` and
    ``save``.
    """
    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        role=(role_maker.Role.WORKER
              if int(roles) == 1 else role_maker.Role.SERVER),
        worker_num=2,
        server_endpoints=["127.0.0.1:36011"])
    fleet.init(role)

    batch_size = 128
    type_size = createDataList(in_file_path, in_file_path + '.data' + "/")

    # Shuffled, batched readers for the training and test lists.
    train_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=dataReader(in_file_path + ".data/trainer.list"),
            buf_size=batch_size * 100),
        batch_size=batch_size)
    test_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=dataReader(in_file_path + ".data/test.list"),
            buf_size=batch_size * 100),
        batch_size=batch_size)

    data_shape = [3, 32, 32]
    images = fluid.layers.data(
        name='images', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Classifier network.
    predict = networkConfiguration(images, type_size)

    # Loss (mean cross entropy) and accuracy.
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    acc = fluid.layers.accuracy(input=predict, label=label)

    # Clone the evaluation program before the optimizer is applied.
    test_program = fluid.default_main_program().clone(for_test=True)

    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = True
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        print("启动server")
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        print("启动worker")
        fleet.init_worker()
        print(fleet.worker_endpoints())

        # CPU-only execution; set use_cuda to True to run on GPU 0.
        use_cuda = False
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        print("cpu")
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
        print("数据映射")
        exe = fluid.Executor(place)
        # Workers run the programs exposed by fleet, matching the other
        # fleet examples in this file, rather than the fluid defaults.
        exe.run(fleet.startup_program)
        for pass_id in range(EPOCH_NUM):
            print(pass_id)
            # Training pass.
            for batch_id, data in enumerate(train_reader()):
                train_cost, train_acc = exe.run(
                    program=fleet.main_program,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost, acc])
                # Report progress every 20 batches.
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %(pass_id, batch_id, train_cost[0], train_acc[0]))

            # Evaluation pass on the cloned test program.
            test_costs = []
            test_accs = []
            for batch_id, data in enumerate(test_reader()):
                test_cost, test_acc = exe.run(
                    program=test_program,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost, acc])
                test_costs.append(test_cost[0])
                test_accs.append(test_acc[0])

            # An empty test reader would otherwise divide by zero here.
            if test_costs:
                mean_cost = sum(test_costs) / len(test_costs)
                mean_acc = sum(test_accs) / len(test_accs)
                print('Test:%d, Cost:%0.5f, ACC:%0.5f' % (pass_id, mean_cost, mean_acc))
            else:
                print('Test:%d, no test batches' % pass_id)
        save(predict, model_file_path, exe)
        fleet.stop_worker()
Exemple #19
0
def train(args):
    """Build the DeepWalk model and run this node as a fleet server or worker.

    Args:
        args: Parsed options. This function reads ``num_nodes``,
            ``hidden_size``, ``neg_num``, ``is_sparse``, ``is_distributed``,
            ``epoch``, ``batch_size``, ``optimizer``, ``lr``, ``walk_len``,
            ``win_size``, ``warm_start_from_dir``, ``dataset``,
            ``walkpath_files`` and ``edge_path``. ``args.lr`` is rescaled in
            place when the optimizer is ``"sgd"``.

    Raises:
        ValueError: If ``args.dataset`` is set to a name other than
            ``"BlogCatalog"`` or ``"ArXiv"``.
    """
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    if worker_num <= 0:
        # The step count below divides by worker_num; an unset or zero
        # PADDLE_TRAINERS_NUM used to raise ZeroDivisionError.
        log.warning("PADDLE_TRAINERS_NUM is unset or non-positive, "
                    "assuming 1 worker")
        worker_num = 1
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = DeepwalkModel(args.num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    init_role()

    # Steps for this worker: total samples over all epochs, split across
    # batches, devices and workers.
    train_steps = math.ceil(1. * args.num_nodes * args.epoch /
                            args.batch_size / num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    if args.optimizer == "sgd":
        args.lr *= args.batch_size * args.walk_len * args.win_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker")
        fleet.init_worker()
        # Only workers load the training samples.
        log.info("init worker done")

        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")

        if args.dataset is not None:
            if args.dataset == "BlogCatalog":
                graph = data_loader.BlogCatalogDataset().graph
            elif args.dataset == "ArXiv":
                graph = data_loader.ArXivDataset().graph
            else:
                raise ValueError(args.dataset + " dataset doesn't exists")
            # Report the dataset that was actually loaded.
            log.info("Load buildin %s dataset done." % args.dataset)
        elif args.walkpath_files is None or args.walkpath_files == "None":
            graph = build_graph(args.num_nodes, args.edge_path)
            log.info("Load graph from '%s' done." % args.edge_path)
        else:
            # Walk paths come from files, so only a placeholder graph is built.
            graph = build_fake_graph(args.num_nodes)
            log.info("Load fake graph done.")

        # Bind the sample generator to the model's reader and start it.
        gen_func = build_gen_func(args, graph)

        pyreader.decorate_tensor_provider(gen_func)
        pyreader.start()

        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
0
 def server(self, context):
     """Initialise and run the fleet server, then flag the context for exit.

     Args:
         context: Mapping that receives ``context['is_exit'] = True`` once
             ``fleet.run_server()`` has returned.
     """
     fleet.init_server()
     fleet.run_server()
     context['is_exit'] = True