Esempio n. 1
0
def distribute_train(args):
    """Train the CTR model with fleet in asynchronous parameter-server mode.

    The role of the current process is resolved by ``PaddleCloudRoleMaker``
    (from environment variables, per the original notes). Server nodes
    initialise and run the parameter server. Worker nodes train for
    ``args.epochs`` epochs and, when ``args.save_model`` is set, the first
    worker saves the persistables after every epoch under
    ``args.model_path``.

    Args:
        args: options object. This function reads ``learning_rate``,
            ``epochs``, ``save_model`` and ``model_path`` from it and also
            forwards the whole object to the model and dataset builders.
    """
    # Initialise this node through the fleet API.
    fleet.init(role_maker.PaddleCloudRoleMaker())

    # Distributed run mode: asynchronous (sync_mode off) with
    # runtime_split_send_recv enabled, so parameters are split and
    # assigned to different nodes.
    transpiler_config = DistributeTranspilerConfig()
    transpiler_config.sync_mode = False
    transpiler_config.runtime_split_send_recv = True

    model = CTR()
    feeds = model.input_data(args)
    avg_cost, auc_var = model.net(feeds, args)

    # Wrap the optimizer with the distributed strategy and build the program.
    base_optimizer = fluid.optimizer.Adam(args.learning_rate)
    dist_optimizer = fleet.distributed_optimizer(base_optimizer,
                                                 transpiler_config)
    dist_optimizer.minimize(avg_cost)

    # Parameter-server node: initialise, serve, and do nothing else.
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
        return

    if not fleet.is_worker():
        return

    # Worker node.
    fleet.init_worker()
    executor = fluid.Executor(fluid.CPUPlace())
    # Run the fleet startup program that contains the distributed setup.
    executor.run(fleet.startup_program)
    dataset, file_list = get_dataset(feeds, args)

    for epoch in range(args.epochs):
        # Shuffle at file granularity.
        random.shuffle(file_list)
        dataset.set_filelist(file_list)

        # Workers run fleet.main_program, the distributed version of the
        # training program.
        began = time.time()
        executor.train_from_dataset(
            program=fleet.main_program,
            dataset=dataset,
            fetch_list=[auc_var],
            fetch_info=["Epoch {} auc ".format(epoch)],
            print_period=100,
            debug=False)
        elapsed = time.time() - began
        logger.info("epoch %d finished, use time=%d\n" % (epoch, elapsed))

        # Only the first worker saves the model.
        if args.save_model and fleet.is_first_worker():
            epoch_dir = os.path.join(str(args.model_path),
                                     "epoch_" + str(epoch))
            fleet.save_persistables(executor=executor, dirname=epoch_dir)

    fleet.stop_worker()
    logger.info("Distribute Train Success!")
Esempio n. 2
0
    def run_pserver(self, args):
        """Configure fleet as a parameter server and start serving.

        Args:
            args: options object providing ``role``, ``current_id``,
                ``trainers``, ``endpoints`` (comma-separated string),
                ``sync_mode``, ``geo_sgd_mode`` and
                ``geo_sgd_need_push_nums``.

        Raises:
            ValueError: if ``args.role`` is not "PSERVER" (case-insensitive).
        """
        is_pserver = args.role.upper() == "PSERVER"
        if not is_pserver:
            raise ValueError("args role must be PSERVER")

        # Describe this process as a server node of the cluster.
        role_kwargs = dict(
            current_id=args.current_id,
            role=role_maker.Role.SERVER,
            worker_num=args.trainers,
            server_endpoints=args.endpoints.split(","))
        fleet.init(role_maker.UserDefinedRoleMaker(**role_kwargs))

        # Copy the distributed-mode switches from the command-line options.
        config = DistributeTranspilerConfig()
        config.sync_mode = args.sync_mode
        config.geo_sgd_mode = args.geo_sgd_mode
        config.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

        loss = self.net()

        # Build the distributed program from a plain SGD optimizer.
        sgd = fluid.optimizer.SGD(LEARNING_RATE)
        fleet.distributed_optimizer(sgd, config).minimize(loss)

        fleet.init_server()
        fleet.run_server()
Esempio n. 3
0
def main(args):
    """Build the GraphSAGE model and run this node as server and/or worker.

    Args:
        args: options object. This function reads ``lr``, ``optimizer`` and
            ``warm_start_from_dir`` and forwards the whole object to the
            model, the reader and the training loop.
    """
    log.info("start")

    # worker_num is read from the environment but not used further here.
    worker_num = int(os.environ.get("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.environ.get("CPU_NUM", 10))

    model = GraphsageModel(args)
    loss = model.forward()
    pyreader = fake_py_reader(
        reader.get_iter(args, model.graph_wrapper, 'train'), num_devices)

    # Initialise fleet, then attach the optimizer to the loss.
    init_role()
    optimization(args.lr, loss, args.optimizer)

    # Server side: initialise from the warm-start directory and serve.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if not fleet.is_worker():
        return

    # Worker side: only workers load samples and train.
    log.info("start init worker done")
    fleet.init_worker()
    log.info("init worker done")

    executor = F.Executor(F.CPUPlace())
    executor.run(fleet.startup_program)
    log.info("Startup done")

    program = build_complied_prog(fleet.main_program, loss)
    train_prog(executor, program, model, pyreader, args)
Esempio n. 4
0
    def run_pserver(self, args):
        """Initialise fleet, build the distributed program and run the server.

        Args:
            args: options object forwarded to ``build_role``,
                ``build_strategy`` and ``net``.
        """
        # Fleet must be initialised with this node's role first.
        node_role = self.build_role(args)
        fleet.init(node_role)

        # Strategy, then network, then the optimizer that ties them together.
        dist_strategy = self.build_strategy(args)
        loss = self.net(args)
        self.build_optimizer(loss, dist_strategy)

        fleet.init_server()
        fleet.run_server()
Esempio n. 5
0
    def init_and_run_ps_worker(self, ckpt_path):
        """Start this node as a parameter server or initialise it as a worker.

        A CPU executor is always stored on ``self.exe``. On a server node the
        server is initialised, ``self.warmstart`` is called with
        ``ckpt_path``, the server is run and the process then exits. On a
        worker node the worker is initialised and the startup program is run.

        Args:
            ckpt_path: checkpoint path handed to ``self.warmstart`` on servers.
        """
        cpu_place = F.CPUPlace()
        self.exe = F.Executor(cpu_place)

        if tfleet.is_server():
            tfleet.init_server()
            self.warmstart(tfleet.startup_program, path=ckpt_path)
            tfleet.run_server()
            # Terminate the process once run_server() returns.
            exit()

        if not tfleet.is_worker():
            return

        log.info("start init worker done")
        tfleet.init_worker()
        self.exe.run(tfleet.startup_program)
 def run_pserver(self, args):
     """Configure fleet as a parameter server (pserver) and start serving.

     The distributed strategy is filled from ``args.run_params``, the
     network is built with ``self.net(args)`` and optimised with SGD, and
     the server is then initialised and run.

     Args:
         args: options object providing ``role``, ``current_id``,
             ``trainers``, ``endpoints`` (comma-separated string) and the
             ``run_params`` mapping with the strategy switches read below.

     Raises:
         ValueError: if ``args.role`` is not "PSERVER" (case-insensitive).
     """
     # Validate first so bad arguments fail before any import or any
     # mutation of the global default programs.
     if args.role.upper() != "PSERVER":
         raise ValueError("args role must be PSERVER")

     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
     import paddle.fluid as fluid
     from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
     from paddle.fluid.transpiler.ps_dispatcher import HashName

     # Fixed seeds on both default programs.
     fluid.default_startup_program().random_seed = 1
     fluid.default_main_program().random_seed = 1

     role = role_maker.UserDefinedRoleMaker(
         current_id=args.current_id,
         role=role_maker.Role.SERVER,
         worker_num=args.trainers,
         server_endpoints=args.endpoints.split(","))
     fleet.init(role)

     run_params = args.run_params
     strategy = DistributeTranspilerConfig()
     strategy.sync_mode = run_params["sync_mode"]
     strategy.async_mode = run_params["async_mode"]
     strategy.mode = "pserver"
     strategy.slice_var_up = run_params['slice_var_up']
     strategy.enable_dc_asgd = run_params['enable_dc_asgd']
     # A truthy 'split_method' selects HashName; otherwise RoundRobin is
     # used. (Previously RoundRobin was assigned unconditionally afterwards,
     # which silently discarded the HashName choice.)
     if run_params['split_method']:
         strategy.split_method = HashName
     else:
         strategy.split_method = RoundRobin
     strategy.wait_port = run_params['wait_port']
     strategy.runtime_split_send_recv = run_params['runtime_split_send_recv']
     strategy.use_hierarchical_allreduce = run_params['use_hierarchical_allreduce']
     # hierarchical_allreduce_exter_nranks / inter_nranks are not set here.
     strategy.geo_sgd_mode = run_params['geo_sgd']
     strategy.geo_sgd_need_push_nums = run_params['push_nums']

     avg_cost = self.net(args)
     optimizer = fluid.optimizer.SGD(LEARNING_RATE)
     optimizer = fleet.distributed_optimizer(optimizer, strategy)
     optimizer.minimize(avg_cost)

     fleet.init_server()
     fleet.run_server()
Esempio n. 7
0
def train(args):
    """Build the DeepWalk model and run this node as server and/or worker.

    Args:
        args: options object. This function reads ``num_nodes``,
            ``hidden_size``, ``neg_num``, ``is_sparse``, ``is_distributed``,
            ``epoch``, ``batch_size``, ``optimizer``, ``lr``, ``walk_len``,
            ``win_size``, ``warm_start_from_dir``, ``dataset``,
            ``walkpath_files`` and ``edge_path``. When ``args.optimizer`` is
            "sgd", ``args.lr`` is scaled in place.

    Raises:
        ValueError: if ``args.dataset`` is set to a name other than
            "BlogCatalog" or "ArXiv".
    """
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    # PADDLE_TRAINERS_NUM falls back to "0" when unset; clamp to 1 so the
    # step computation below cannot divide by zero.
    worker_num = max(1, int(os.getenv("PADDLE_TRAINERS_NUM", "0")))
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = DeepwalkModel(args.num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    init_role()

    # Steps per worker per device for the requested number of epochs.
    train_steps = math.ceil(1. * args.num_nodes * args.epoch /
                            args.batch_size / num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    if args.optimizer == "sgd":
        # For SGD the learning rate is scaled by batch size, walk length
        # and window size.
        args.lr *= args.batch_size * args.walk_len * args.win_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        # Only workers load samples.
        log.info("init worker done")

        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")

        if args.dataset is not None:
            if args.dataset == "BlogCatalog":
                graph = data_loader.BlogCatalogDataset().graph
            elif args.dataset == "ArXiv":
                graph = data_loader.ArXivDataset().graph
            else:
                raise ValueError(args.dataset + " dataset doesn't exists")
            # Report the dataset that was actually loaded (the message used
            # to say "BlogCatalog" even for ArXiv).
            log.info("Load buildin %s dataset done." % args.dataset)
        elif args.walkpath_files is None or args.walkpath_files == "None":
            graph = build_graph(args.num_nodes, args.edge_path)
            log.info("Load graph from '%s' done." % args.edge_path)
        else:
            graph = build_fake_graph(args.num_nodes)
            log.info("Load fake graph done.")

        # Bind the sample generator to the reader and start feeding.
        gen_func = build_gen_func(args, graph)

        pyreader.decorate_tensor_provider(gen_func)
        pyreader.start()

        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
Esempio n. 8
0
 def server(self, context):
     """Initialise and run the fleet server, then flag the context for exit.

     Args:
         context: mutable mapping; its ``is_exit`` entry is set to True
             after ``fleet.run_server()`` returns.
     """
     fleet.init_server()
     fleet.run_server()
     # Signal the caller that this node has finished.
     context["is_exit"] = True
Esempio n. 9
0
def fit():
    """Train and evaluate an image classifier as one node of a fleet job.

    The node acts as a worker when the module-level ``roles`` converts to
    the integer 1 and as a server otherwise. Servers initialise and run the
    parameter server. Workers train for ``EPOCH_NUM`` passes, evaluate on
    the test list after each pass, then save the model and stop the worker.
    """
    fleet.init(
        role_maker.UserDefinedRoleMaker(
            current_id=current_id,
            role=(role_maker.Role.WORKER
                  if int(roles) == 1 else role_maker.Role.SERVER),
            worker_num=2,
            server_endpoints=["127.0.0.1:36011"]))

    batch_size = 128
    type_size = createDataList(in_file_path, in_file_path + '.data' + "/")

    def shuffled_batches(list_file):
        # Shuffle with a buffer of 100 batches, then group into batches.
        samples = paddle.reader.shuffle(reader=dataReader(list_file),
                                        buf_size=batch_size * 100)
        return paddle.batch(reader=samples, batch_size=batch_size)

    # Data providers for training and testing.
    train_reader = shuffled_batches(in_file_path + ".data/trainer.list")
    test_reader = shuffled_batches(in_file_path + ".data/test.list")

    images = fluid.layers.data(name='images', shape=[3, 32, 32],
                               dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Build the classifier.
    predict = networkConfiguration(images, type_size)

    # Loss (cross entropy, averaged) and accuracy.
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    acc = fluid.layers.accuracy(input=predict, label=label)

    # Clone the test program before the optimizer is attached.
    test_program = fluid.default_main_program().clone(for_test=True)

    # Distributed Adam optimizer in synchronous mode.
    adam = fluid.optimizer.Adam(learning_rate=0.001)
    config = DistributeTranspilerConfig()
    config.sync_mode = True
    fleet.distributed_optimizer(adam, config).minimize(avg_cost)

    if fleet.is_server():
        print("启动server")
        fleet.init_server()
        fleet.run_server()
        return

    if not fleet.is_worker():
        return

    print("启动worker")
    fleet.init_worker()
    print(fleet.worker_endpoints())

    # ---------- training and evaluation ----------
    # use_cuda selects GPU vs CPU; it is fixed to CPU here.
    use_cuda = False
    if use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    print("cpu")
    # Maps reader output onto the two input layers.
    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
    print("数据映射")
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    for pass_id in range(EPOCH_NUM):
        print(pass_id)
        # Training.
        for batch_id, batch in enumerate(train_reader()):
            train_cost, train_acc = exe.run(
                program=fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[avg_cost, acc])
            # Report progress every 20 batches.
            if batch_id % 20 != 0:
                continue
            print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                  (pass_id, batch_id, train_cost[0], train_acc[0]))

        # Evaluation: collect per-batch loss and accuracy.
        losses = []
        accuracies = []
        for batch in test_reader():
            batch_cost, batch_acc = exe.run(program=test_program,
                                            feed=feeder.feed(batch),
                                            fetch_list=[avg_cost, acc])
            losses.append(batch_cost[0])
            accuracies.append(batch_acc[0])

        # Average over all test batches.
        mean_cost = sum(losses) / len(losses)
        mean_acc = sum(accuracies) / len(accuracies)
        print('Test:%d, Cost:%0.5f, ACC:%0.5f' % (pass_id, mean_cost, mean_acc))

    save(predict, model_file_path, exe)
    fleet.stop_worker()
Esempio n. 10
0
def train(use_cuda, save_dirname, is_local, is_increment):
    """Train the model locally or as a fleet parameter-server job.

    Args:
        use_cuda: run on ``fluid.CUDAPlace(0)`` when true, otherwise on CPU.
        save_dirname: directory for the inference model and persistables;
            saving (and the follow-up ``infer`` call) is skipped when None.
        is_local: true for single-machine data-parallel training, false for
            distributed training through fleet.
        is_increment: local mode only; load parameters from ``old_model``
            and zero the AUC state variables before training.
    """
    import time

    # NOTE(review): old_model is hard-coded to None, so the is_increment
    # path has no checkpoint directory to load from - confirm how it is
    # meant to be supplied.
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    # Base optimizer; in distributed mode it is wrapped by fleet below.
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)
        # 16 readers over the same directory, combined by multiprocess_reader.
        readers = [data_reader(cluster_train_dir) for _ in range(16)]
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers, batch_size=BATCH_SIZE,
            places=fluid.cpu_places(CPU_NUM))

        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        # Reduce strategy is faster on CPU (per the original note).
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        build_strategy.fuse_broadcast_ops = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

        if is_increment:
            # Load the previous model to fine-tune. load_params needs the
            # underlying Program, not the CompiledProgram wrapper.
            fluid.io.load_params(exe, old_model, main_prog)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        cost_list = []
        auc_list = []
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                # Time spent waiting for the batch.
                r_time = time.time() - s_time
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                # Time spent in the training step.
                t_time = time.time() - st_time
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))

                if batch_id % 10 == 0 and batch_id != 0:
                    # Parenthesised single-argument form works on both
                    # Python 2 and Python 3.
                    print("Pass %d, batch %d, cost %s auc %s readtime %f triantime %f" %
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time))
                    cost_list = []
                    auc_list = []
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            feed_order,
                            [predict, avg_cost, auc_batch], exe
                        )
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                s_time = time.time()
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)

    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully asynchronous training.
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # Initialise the fleet environment.
        fleet.init(role)

        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(paddle.reader.shuffle(
                    data_reader(cluster_train_dir), buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    # Measure the step from its own start timestamp; the
                    # old code subtracted the read duration from the
                    # current time, which is not a duration.
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog, feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))

                    if batch_id % 10 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time))
                        cost_list = []
                        auc_list = []
                    # Only the first worker saves and evaluates the model.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe,
                                save_dirname,
                                feed_order,
                                [predict, avg_cost, auc_batch]
                            )
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
        fleet.stop_worker()
Esempio n. 11
0
def train(args):
    """Run distributed TDM training on this node (server or worker).

    Args:
        args: options object. This function reads ``sync_mode`` ("sync",
            "half_async" or "async"), ``random_seed``, ``learning_rate``,
            ``init_model_files_path``, ``epoch_num`` and
            ``model_files_path``, and forwards the whole object to the
            model, tree-preparation and dataset helpers.

    Raises:
        ValueError: if ``args.sync_mode`` is not one of the supported modes.
    """
    # Reject unknown modes up front. Previously an unknown value left
    # ``strategy`` unbound and failed later with an unrelated name error,
    # after fleet had already been initialised.
    sync_mode = args.sync_mode
    if sync_mode not in ("sync", "half_async", "async"):
        raise ValueError(
            "Unsupported sync_mode {!r}; expected 'sync', 'half_async' "
            "or 'async'".format(sync_mode))

    # set random
    program = fluid.default_main_program()
    program.random_seed = args.random_seed

    # Resolve this process's role in the distributed job (from environment
    # variables, per the original notes) and initialise the node with it.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # Pick the distributed strategy that matches the requested mode.
    if sync_mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif sync_mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()
    else:  # "async", guaranteed by the validation above
        strategy = StrategyFactory.create_async_strategy()

    # set model
    logger.info("TDM Begin build network.")
    tdm_model = TdmTrainNet(args)
    inputs = tdm_model.input_data()

    logger.info("TDM Begin load tree travel & layer.")
    avg_cost, acc = tdm_model.tdm(inputs)
    logger.info("TDM End build network.")
    # Wrap the optimizer with the chosen strategy and build the program.
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=args.learning_rate,
                                              lazy_mode=True)

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    logger.info("TDM End append backward.")

    # Run the logic that matches this node's role.
    if fleet.is_server():
        logger.info("TDM Run server ...")
        logger.info("TDM init model path: {}".format(
            args.init_model_files_path))
        # Everything in the model except the TDM tree-structure variables
        # is expected to be initialised here.
        fleet.init_server(args.init_model_files_path)
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'),
                fluid.CPUPlace())
            logger.info("TDM Set learning rate {}".format(args.learning_rate))
        else:
            logger.info("TDM Didn't find learning_rate_0 param")
        logger.info("TDM load End")

        fleet.run_server()
        logger.info("TDM Run server success!")
    elif fleet.is_worker():
        logger.info("TDM Run worker ...")
        fleet.init_worker()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        logger.info("TDM Run Startup Begin")
        # Run the fleet startup program that contains the distributed setup.
        exe.run(fleet.startup_program)

        # Set Learning Rate
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'), place)
            logger.info("TDM Set learning rate {}".format(args.learning_rate))

        # Set TDM Variable
        logger.info("TDM Begin load parameter.")
        # The tree-structure variables do not take part in network updates
        # and are not stored on the parameter server, so they are set
        # locally by hand.
        tdm_param_prepare_dict = tdm_sampler_prepare(args)
        tdm_param_prepare_dict['info_array'] = tdm_child_prepare(args)
        numpy_model = {
            'TDM_Tree_Travel': tdm_param_prepare_dict['travel_array'],
            'TDM_Tree_Layer': tdm_param_prepare_dict['layer_array'],
            'TDM_Tree_Info': tdm_param_prepare_dict['info_array'],
        }
        # In distributed training the embedding lives on the parameter
        # server and does not need to be set locally.
        for param_name, param_value in numpy_model.items():
            param_t = fluid.global_scope().find_var(param_name).get_tensor()
            param_t.set(param_value.astype('int32'), place)

        logger.info("TDM Run Startup End")

        # Train loop
        dataset, file_list, example_num = get_dataset(inputs, args)
        logger.info("TDM Distributed training begin ...")
        for epoch in range(args.epoch_num):
            # local shuffle
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Workers run fleet.main_program, the distributed version of
            # the training program.
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[acc, avg_cost],
                                   fetch_info=[
                                       "Epoch {} acc ".format(epoch),
                                       "Epoch {} loss ".format(epoch)
                                   ],
                                   print_period=1,
                                   debug=False)
            end_time = time.time()
            logger.info(
                "Epoch {} finished, use time {} second, speed {} example/s".
                format(epoch, end_time - start_time,
                       example_num * 1.0 / (end_time - start_time)))

            # Only the first worker saves the model.
            if fleet.is_first_worker():
                model_path = os.path.join(args.model_files_path,
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
                logger.info("Begin upload files")
                # Uploading the saved model (e.g. to HDFS) is not performed
                # here.
        logger.info("TDM Before stop worker")
        fleet.stop_worker()
        logger.info("TDM Distributed training success!")
def train(use_cuda, train_sample_dir, test_sample_dir, old_model, output_model,
          is_local, is_increment):
    """Train the three-head navigation model locally or through fleet.

    Args:
        use_cuda: run on ``fluid.CUDAPlace(0)`` when true, otherwise on CPU.
        train_sample_dir: training data directory (distributed mode only;
            local mode reads from ``streaming_data_reader()``).
        test_sample_dir: data directory passed to ``infer`` after each save.
        old_model: directory of a previous model, used when ``is_increment``
            is true.
        output_model: directory for the inference model and persistables;
            saving is skipped when None.
        is_local: true for single-machine training, false for distributed
            training through fleet.
        is_increment: load ``old_model`` before training (fine-tuning).
    """
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']

    role = role_maker.PaddleCloudRoleMaker()
    # Fully asynchronous training.
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)

        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(
            paddle.batch(
                paddle.reader.shuffle(streaming_data_reader(), buf_size=8192),
                batch_size=BATCH_SIZE),
            multi_devices=False,
            drop_last=True)
        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_program = fluid.default_main_program()
        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)

        main_program.num_threads = CPU_NUM

        # Parallel training. The executor is created with its default
        # execution and build strategies.
        train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                          main_program=main_program,
                                          loss_name=avg_cost.name)

        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data,
                                          fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))

                if batch_id % 100 == 0 and batch_id != 0:
                    # Parenthesised single-argument form works on both
                    # Python 2 and Python 3.
                    print("Pass %d, batch %d, cost %s" %
                          (pass_id, batch_id, np.array(cost_list).mean()))
                    cost_list = []
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model, feed_order, [
                                navi_predict, voice_navi_predict,
                                speed_navi_predict, avg_cost
                            ], exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)

    else:
        # Initialise the fleet environment.
        fleet.init(role)
        # Wrap the optimizer with the distributed strategy.
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(
                    paddle.reader.shuffle(data_reader(train_sample_dir),
                                          buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False,
                drop_last=True)
            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog,
                                         feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))

                    if batch_id % 100 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s" %
                              (pass_id, batch_id, np.array(cost_list).mean()))
                        cost_list = []
                    # Only the first worker saves and evaluates the model.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe, output_model, feed_order, [
                                    navi_predict, voice_navi_predict,
                                    speed_navi_predict, avg_cost
                                ])
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
        fleet.stop_worker()