Example #1
        print("kpis\ttrain_loss_%s_card%s\t%f" %
              (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" % (task_name, card_num, ce_acc))

    # evaluate on test set
    if not args.do_train and args.do_val:
        print("Final test result:")
        evaluate(test_exe, test_prog, test_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")

    # infer
    if args.do_infer:
        print("Final infer result:")
        infer(test_exe, test_prog, infer_loader, [probs.name], "infer")


def get_cards():
    num = 0
    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    if cards != '':
        num = len(cards.split(","))
    return num
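# A quick illustration of get_cards() with assumed environment values:
#   CUDA_VISIBLE_DEVICES="0,2,3"      -> returns 3 (three visible cards)
#   CUDA_VISIBLE_DEVICES unset or ""  -> returns 0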


if __name__ == "__main__":
    args = PDConfig('config.json')
    args.build()
    args.print_arguments()
    check_cuda(args.use_cuda)
    check_version()
    main(args)
Example #2
def fast_infer(args):
    """
    Inference by beam search decoder based solely on Fluid operators.
    """
    out_ids, out_scores, pyreader = fast_decoder(
        ModelHyperParams.src_vocab_size,
        ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer,
        ModelHyperParams.n_head,
        ModelHyperParams.d_key,
        ModelHyperParams.d_value,
        ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid,
        ModelHyperParams.prepostprocess_dropout,
        ModelHyperParams.attention_dropout,
        ModelHyperParams.relu_dropout,
        ModelHyperParams.preprocess_cmd,
        ModelHyperParams.postprocess_cmd,
        ModelHyperParams.weight_sharing,
        InferTaskConfig.beam_size,
        InferTaskConfig.max_out_len,
        ModelHyperParams.bos_idx,
        ModelHyperParams.eos_idx,
        use_py_reader=args.use_py_reader)

    # This is used here to set dropout to the test mode.
    infer_program = fluid.default_main_program().clone(for_test=True)

    if args.use_mem_opt:
        fluid.memory_optimize(infer_program)

    if InferTaskConfig.use_gpu:
        check_cuda(InferTaskConfig.use_gpu)
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    fluid.io.load_vars(exe,
                       InferTaskConfig.model_path,
                       vars=[
                           var for var in infer_program.list_vars()
                           if isinstance(var, fluid.framework.Parameter)
                       ])

    exec_strategy = fluid.ExecutionStrategy()
    # For faster executor
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 1
    build_strategy = fluid.BuildStrategy()
    infer_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       main_program=infer_program,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)

    # data reader settings for inference
    args.train_file_pattern = args.test_file_pattern
    args.use_token_batch = False
    args.sort_type = reader.SortType.NONE
    args.shuffle = False
    args.shuffle_batch = False
    test_data = prepare_data_generator(
        args,
        is_test=False,
        count=dev_count,
        pyreader=pyreader,
        py_reader_provider_wrapper=py_reader_provider_wrapper,
        place=place)
    if args.use_py_reader:
        pyreader.start()
        data_generator = None
    else:
        data_generator = test_data()
    trg_idx2word = reader.DataReader.load_dict(dict_path=args.trg_vocab_fpath,
                                               reverse=True)

    while True:
        try:
            feed_dict_list = prepare_feed_dict_list(data_generator, dev_count,
                                                    place)
            if args.use_parallel_exe:
                seq_ids, seq_scores = infer_exe.run(
                    fetch_list=[out_ids.name, out_scores.name],
                    feed=feed_dict_list,
                    return_numpy=False)
            else:
                seq_ids, seq_scores = exe.run(
                    program=infer_program,
                    fetch_list=[out_ids.name, out_scores.name],
                    feed=feed_dict_list[0]
                    if feed_dict_list is not None else None,
                    return_numpy=False,
                    use_program_cache=False)
            # Normalize the fetch results so the single-tensor and the
            # list-of-tensors cases can be iterated uniformly below.
            if isinstance(seq_ids, fluid.LoDTensor):
                seq_ids_list, seq_scores_list = [seq_ids], [seq_scores]
            else:
                seq_ids_list, seq_scores_list = seq_ids, seq_scores
            for seq_ids, seq_scores in zip(seq_ids_list, seq_scores_list):
                # How to parse the results:
                #   Suppose the lod of seq_ids is:
                #     [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
                #   then from lod[0]:
                #     there are 2 source sentences, beam width is 3.
                #   from lod[1]:
                #     the first source sentence has 3 hyps; the lengths are 12, 12, 16
                #     the second source sentence has 3 hyps; the lengths are 14, 13, 15
                hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)]
                scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)]
                for i in range(len(seq_ids.lod()[0]) -
                               1):  # for each source sentence
                    start = seq_ids.lod()[0][i]
                    end = seq_ids.lod()[0][i + 1]
                    for j in range(end - start):  # for each candidate
                        sub_start = seq_ids.lod()[1][start + j]
                        sub_end = seq_ids.lod()[1][start + j + 1]
                        hyps[i].append(" ".join([
                            trg_idx2word[idx] for idx in post_process_seq(
                                np.array(seq_ids)[sub_start:sub_end])
                        ]))
                        scores[i].append(np.array(seq_scores)[sub_end - 1])
                        print(hyps[i][-1])
                        if len(hyps[i]) >= InferTaskConfig.n_best:
                            break
        except (StopIteration, fluid.core.EOFException):
            # The data pass is over.
            if args.use_py_reader:
                pyreader.reset()
            break
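# A minimal, framework-free sketch of the two-level LoD bookkeeping used in
# fast_infer above. The offsets are the illustrative values from its comment;
# flat_ids is a hypothetical stand-in for np.array(seq_ids).
lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
flat_ids = list(range(82))

for i in range(len(lod[0]) - 1):          # each source sentence
    start, end = lod[0][i], lod[0][i + 1]
    for j in range(end - start):          # each beam hypothesis
        sub_start, sub_end = lod[1][start + j], lod[1][start + j + 1]
        token_ids = flat_ids[sub_start:sub_end]
        print("sentence %d, hyp %d: %d tokens" % (i, j, len(token_ids)))
        # prints lengths 12, 12, 16 for sentence 0 and 14, 13, 15 for sentence 1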
Example #3
def main():
    args = parse_args()

    # check that use_gpu=True is not set with a CPU-only PaddlePaddle build
    check_cuda(args.use_gpu)
    # check that the installed PaddlePaddle version is recent enough
    check_version()

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    logger.info('Running with args : {}'.format(args))

    config = RNNConfig(args)

    # define train program
    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        startup_program.random_seed = SEED
    with fluid.program_guard(main_program, startup_program):
        with fluid.unique_name.guard():
            res_vars = lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=args.use_dataloader)

            if args.use_dataloader:
                dataloader = res_vars[-1]
                res_vars = res_vars[:-1]
            loss, last_hidden, last_cell, feed_order = res_vars

            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=config.max_grad_norm))

            learning_rate = fluid.layers.create_global_var(
                name="learning_rate",
                shape=[1],
                value=1.0,
                dtype='float32',
                persistable=True)

            optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
            optimizer.minimize(loss)

    # define inference program
    inference_program = fluid.Program()
    inference_startup_program = fluid.Program()
    with fluid.program_guard(inference_program, inference_startup_program):
        with fluid.unique_name.guard():
            lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=False)
    # Some ops behave differently during training and inference; clone the
    # program with for_test=True so every op is configured for inference.
    inference_program = inference_program.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    if args.init_from_pretrain_model:
        if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
            raise IOError("Pretrained params not found: %s.pdparams" %
                          args.init_from_pretrain_model)
        fluid.load(main_program, args.init_from_pretrain_model)
        print("Finished initializing model from pretrained params at %s" %
              args.init_from_pretrain_model)

    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
        fluid.cpu_places())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    ptb_data = reader.get_ptb_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data = ptb_data

    def generate_init_data():
        if args.rnn_model == "lod":
            init_hidden = np.zeros(
                (config.batch_size, config.num_layers, config.hidden_size),
                dtype='float32')
            init_cell = np.zeros(
                (config.batch_size, config.num_layers, config.hidden_size),
                dtype='float32')
        else:
            init_hidden = np.zeros(
                (config.num_layers, config.batch_size, config.hidden_size),
                dtype='float32')
            init_cell = np.zeros(
                (config.num_layers, config.batch_size, config.hidden_size),
                dtype='float32')
        return init_hidden, init_cell

    def generate_new_lr(epoch_id=0, device_count=1):
        new_lr = config.base_learning_rate * (config.lr_decay**max(
            epoch_id + 1 - config.epoch_start_decay, 0.0))
        lr = np.ones((device_count), dtype='float32') * new_lr
        return lr
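    # Worked example with assumed config values: base_learning_rate=1.0,
    # lr_decay=0.5, epoch_start_decay=3 gives new_lr=1.0 for epoch_id 0 and 1
    # (the exponent is clamped to 0) and new_lr=0.125 at epoch_id=5, since
    # 0.5 ** max(5 + 1 - 3, 0) = 0.5 ** 3. The scalar is replicated into one
    # learning-rate entry per device.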

    def prepare_input(batch,
                      init_hidden=None,
                      init_cell=None,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
        x, y = batch
        batch_size = x.shape[0]
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))
        if args.rnn_model == "lod":
            x = to_lodtensor(x.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps, config.num_steps)
            ])
            y = to_lodtensor(y.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps, config.num_steps)
            ])
        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = generate_new_lr(epoch_id, device_count)

        return res
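    # For the "lod" model the flattened x/y tensors carry sequence boundaries
    # as LoD offsets. Assumed example: batch_size=2 and num_steps=20 make
    # range(0, (2 + 1) * 20, 20) -> [0, 20, 40], i.e. two sequences of 20
    # steps each in the flattened (-1, 1) tensor.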

    def eval(data):
        # evaluation reuses the batch_size and num_steps from config
        eval_data_iter = reader.get_data_iter(data, config.batch_size,
                                              config.num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(
                batch, init_hidden, init_cell, epoch_id=0, with_lr=False)
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=False)

            cost_eval = np.array(fetch_outs[0])
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_eval
            iters += config.num_steps

        ppl = np.exp(total_loss / iters)
        return ppl

    def get_log_interval(data_len):
        num_batches = data_len // config.batch_size
        epoch_size = (num_batches - 1) // config.num_steps
        log_interval = max(1, epoch_size // 10)
        return log_interval
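    # Assumed example: with data_len=1000000, batch_size=20 and num_steps=35,
    # num_batches=50000, epoch_size=(50000 - 1)//35=1428, so progress is
    # logged every 142 batches.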

    def train_an_epoch(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))
        train_data_iter = reader.get_data_iter(train_data, config.batch_size,
                                               config.num_steps)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=device_count)
            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += config.num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
        ppl = np.exp(total_loss / iters)
        return ppl

    def train_an_epoch_dataloader(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))

        init_hidden, init_cell = generate_init_data()

        total_loss = 0
        iters = 0

        dataloader.start()
        batch_id = 0
        try:
            while True:
                data_feeds = {}
                if batch_id == 0:
                    batch_time = 0
                    batch_start_time = time.time()
                else:
                    batch_time = time.time() - batch_start_time
                    batch_times.append(batch_time)
                    batch_start_time = time.time()

                new_lr = generate_new_lr(epoch_id, device_count)
                data_feeds['learning_rate'] = new_lr
                data_feeds["init_hidden"] = init_hidden
                data_feeds["init_cell"] = init_cell

                fetch_outs = exe.run(train_program,
                                     feed=data_feeds,
                                     fetch_list=[
                                         loss.name, "learning_rate",
                                         last_hidden.name, last_cell.name
                                     ],
                                     use_program_cache=True)

                cost_train = np.array(fetch_outs[0])
                lr = np.array(fetch_outs[1])
                init_hidden = np.array(fetch_outs[2])
                init_cell = np.array(fetch_outs[3])

                total_loss += cost_train
                iters += config.num_steps
                if batch_id > 0 and (log_interval == 0 or
                                     batch_id % log_interval == 0):
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

                batch_id += 1
        except fluid.core.EOFException:
            dataloader.reset()

        batch_times.append(time.time() - batch_start_time)
        ppl = np.exp(total_loss / iters)
        return ppl

    def train():
        if args.use_dataloader:
            def data_gen():
                data_iter_size = config.batch_size // device_count
                train_batches = reader.get_data_iter(train_data, data_iter_size,
                                                     config.num_steps)
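                # Each generated mini-batch is sized per device: with an
                # assumed batch_size=20 and device_count=4, every yield holds
                # batch_size // device_count = 5 sequences.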
                for batch in train_batches:
                    x, y = batch
                    x = x.reshape((-1, config.num_steps, 1))
                    y = y.reshape((-1, 1))
                    if args.rnn_model == "lod":
                        x = to_lodtensor(x.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])
                        y = to_lodtensor(y.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])

                    yield x, y

            dataloader.set_batch_generator(data_gen)

        total_time = 0.0
        for epoch_id in range(config.max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            if args.use_dataloader:
                train_ppl = train_an_epoch_dataloader(epoch_id, batch_times)
            else:
                train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; speed: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0],
                   len(batch_times) / sum(batch_times)))

            # FIXME(zjl): ppl[0] increases as batch_size increases.
            # We should find a better way to calculate ppl by normalizing batch_size.
            if device_count == 1 and config.batch_size <= 20 and epoch_id == 0 and train_ppl[
                    0] > 1000:
                # With a bad initialization the loss can still exceed 1000
                # after the first epoch, so there is no point in continuing.
                print(
                    "Parameters were badly initialized this time: the loss is still over 1000 after the first epoch."
                )
                print("Aborting this training run; please start it again.")
                return

            if epoch_id == config.max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                      (args.rnn_model, device_count,
                       total_time / config.max_epoch))
                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                      (args.rnn_model, device_count, train_ppl[0]))

            # NOTE(zjl): sometimes there is not enough data for eval when batch_size is large, e.g., 2100.
            # Just skip evaluation in that case to avoid an error.
            def is_valid_data(data, batch_size, num_steps):
                data_len = len(data)
                batch_len = data_len // batch_size
                epoch_size = (batch_len - 1) // num_steps
                return epoch_size >= 1
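            # Assumed example: with len(valid_data)=4000, batch_size=2100 and
            # num_steps=35, batch_len=1 and epoch_size=(1 - 1)//35=0, so the
            # check fails and evaluation is skipped for that epoch.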

            valid_data_valid = is_valid_data(valid_data, config.batch_size,
                                             config.num_steps)
            if valid_data_valid:
                valid_ppl = eval(valid_data)
                print("Valid ppl: %.5f" % valid_ppl[0])
            else:
                print(
                    'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'.
                    format(
                        len(valid_data), config.batch_size, config.num_steps))

            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), "params")
            fluid.save(main_program, save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

    with profile_context(args.profile):
        train()

    test_ppl = eval(test_data)
    print("Test ppl:", test_ppl[0])
Example #4
def train(args):
    # priority: ENV > args > config
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    if is_local == '0':
        args.local = False
    logging.info(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
        place = fluid.CPUPlace()
        # the default setting of CPU_NUM in paddle framework is 1
        dev_count = int(os.environ.get('CPU_NUM', 1))
    else:
        check_cuda(TrainTaskConfig.use_gpu)
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = get_device_num()

    exe = fluid.Executor(place)

    train_prog = fluid.Program()
    startup_prog = fluid.Program()

    if args.enable_ce:
        train_prog.random_seed = 1000
        startup_prog.random_seed = 1000

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = None
            if args.sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
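                # noam_decay follows the Transformer schedule (an assumption
                # about its exact formula): d_model**-0.5 * min(step**-0.5,
                # step * warmup_steps**-1.5), i.e. linear warm-up for
                # warmup_steps steps, then inverse-square-root decay; the
                # result is scaled by TrainTaskConfig.learning_rate below.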
                logging.info("before adam")

                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate

                optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                                 beta1=TrainTaskConfig.beta1,
                                                 beta2=TrainTaskConfig.beta2,
                                                 epsilon=TrainTaskConfig.eps)
            else:
                optimizer = fluid.optimizer.SGD(0.003)
            optimizer.minimize(avg_cost)

    if args.local:
        logging.info("local start_up:")
        train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                   avg_cost, token_num, predict, pyreader)
    else:
        if args.update_method == "nccl2":
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
            port = os.getenv("PADDLE_PORT")
            worker_ips = os.getenv("PADDLE_TRAINERS")
            worker_endpoints = []
            for ip in worker_ips.split(","):
                worker_endpoints.append(':'.join([ip, port]))
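            # Assumed example env: PADDLE_TRAINERS="10.0.0.1,10.0.0.2" and
            # PADDLE_PORT="6174" yield worker_endpoints
            # ["10.0.0.1:6174", "10.0.0.2:6174"] and trainers_num=2.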
            trainers_num = len(worker_endpoints)
            current_endpoint = os.getenv("POD_IP") + ":" + port
            if trainer_id == 0:
                logging.info("train_id == 0, sleep 60s")
                time.sleep(60)
            logging.info("trainers_num:{}".format(trainers_num))
            logging.info("worker_endpoints:{}".format(worker_endpoints))
            logging.info("current_endpoint:{}".format(current_endpoint))
            append_nccl2_prepare(startup_prog, trainer_id, worker_endpoints,
                                 current_endpoint)
            train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                       avg_cost, token_num, predict, pyreader, trainers_num,
                       trainer_id)
            return

        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

        logging.info("pserver_endpoints:{}".format(pserver_endpoints))
        logging.info("current_endpoint:{}".format(current_endpoint))
        logging.info("trainer_id:{}".format(trainer_id))
        logging.info("pserver_ips:{}".format(pserver_ips))
        logging.info("port:{}".format(port))

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id,
                    pservers=pserver_endpoints,
                    trainers=trainers,
                    program=train_prog,
                    startup_program=startup_prog)

        if training_role == "PSERVER":
            logging.info("distributed: pserver started")
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                "PADDLE_PORT")
            if not current_endpoint:
                logging.critical("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)

            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            logging.info("distributed: trainer started")
            trainer_prog = t.get_trainer_program()

            train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                       avg_cost, token_num, predict, pyreader)
        else:
            logging.critical(
                "environment var TRAINER_ROLE should be TRAINER os PSERVER")
            exit(1)