Example #1
    def run_trainer(self, args):
        self.lr = args.lr
        if args.nccl2_reduce_layer_local_run:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, single_device=True)
        elif args.use_dgc:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
        else:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size)

        if args.update_method == "pserver":
            print_to_err(
                type(self).__name__,
                "begin to run transpile on trainer with pserver mode")
            t = self.get_transpiler(
                trainer_id=args.trainer_id,
                main_program=fluid.default_main_program(),
                pserver_endpoints=args.endpoints,
                trainers=args.trainers,
                sync_mode=args.sync_mode,
                dc_asgd=args.dc_asgd,
                hogwild_mode=args.hogwild)

            trainer_prog = t.get_trainer_program()
            print_to_err(
                type(self).__name__,
                "get trainer program done with pserver mode.")
        elif args.update_method in ("nccl2", "nccl2_reduce_layer"):
            # transpile for nccl2
            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            config.nccl_comm_num = args.nccl_comm_num
            if args.use_hallreduce:
                config.use_hierarchical_allreduce = True
                config.hierarchical_allreduce_inter_nranks = args.hallreduce_inter_nranks
            print_to_err(
                type(self).__name__,
                "begin to run transpile on trainer with nccl2 mode")
            nccl2_t = fluid.DistributeTranspiler(config=config)
            nccl2_t.transpile(
                args.trainer_id,
                program=fluid.default_main_program(),
                startup_program=fluid.default_startup_program(),
                trainers=args.endpoints,
                current_endpoint=args.current_endpoint)
            print_to_err(
                type(self).__name__,
                "get trainer program done. with nccl2 mode")
            trainer_prog = fluid.default_main_program()
        else:
            print_to_err(
                type(self).__name__,
                "do nothing about main program, just use it")
            trainer_prog = fluid.default_main_program()
            print_to_err(type(self).__name__, "use main program done.")

        if args.use_cuda:
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        else:
            place = fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        print_to_err(type(self).__name__, "run worker startup program done.")

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        build_stra = fluid.BuildStrategy()
        # FIXME force disable enable_inplace and memory_optimize
        build_stra.enable_inplace = False
        build_stra.memory_optimize = False

        if args.hogwild:
            build_stra.async_mode = True

        if args.enable_backward_deps:
            build_stra.enable_backward_optimizer_op_deps = True

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        pass_builder = None
        if args.batch_merge_repeat > 1:
            pass_builder = build_stra._finalize_strategy_and_create_passes()
            mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
            mypass.set("num_repeats", args.batch_merge_repeat)

        if args.update_method in ("nccl2", "nccl2_reduce_layer"):
            build_stra.num_trainers = len(args.endpoints.split(","))
            build_stra.trainer_id = args.trainer_id
        else:
            # local mode: single trainer
            build_stra.num_trainers = 1
            build_stra.trainer_id = 0

        print_to_err(type(self).__name__, "begin to compile with data parallel")
        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
            loss_name=avg_cost.name,
            build_strategy=build_stra,
            exec_strategy=exec_strategy)
        print_to_err(type(self).__name__, "program compiled with data parallel")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(binary,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        print_to_out(out_losses)
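
A note on the pattern shared by the examples on this page: NCCL2 setup is always the same three steps — build a DistributeTranspilerConfig with mode = "nccl2", transpile the main/startup programs against the worker endpoint list, then compile with with_data_parallel. Below is a minimal sketch of that pattern, assuming the usual PADDLE_* environment variables are exported by the launcher; the function name and loss argument are placeholders, not part of any example above.

import os
import paddle.fluid as fluid
from paddle.fluid import compiler

def compile_nccl2(main_prog, startup_prog, loss_name):
    # Endpoint list and rank identity come from the distributed launcher.
    endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")  # "ip1:port,ip2:port,..."
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))

    # Rewrite the programs in place so gradients are allreduced over NCCL.
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id,
                program=main_prog,
                startup_program=startup_prog,
                trainers=endpoints,
                current_endpoint=current_endpoint)

    # The build strategy needs the world size and this process' rank.
    build_stra = fluid.BuildStrategy()
    build_stra.num_trainers = len(endpoints.split(","))
    build_stra.trainer_id = trainer_id

    return compiler.CompiledProgram(main_prog).with_data_parallel(
        loss_name=loss_name, build_strategy=build_stra)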
Example #2
def main():
    env = os.environ
    FLAGS.local_rank = int(env.get('PADDLE_TRAINER_ID', 0))
    FLAGS.world_size = int(env.get('PADDLE_TRAINERS_NUM', 1))
    FLAGS.device_id = int(env['FLAGS_selected_gpus'])
    FLAGS.whole_batch_size = FLAGS.world_size * FLAGS.batch_size

    pipe = HybridTrainPipe()
    pipe.build()
    sample_per_shard = len(pipe) // FLAGS.world_size
    train_loader = DALIClassificationIterator(pipe, reader_name="Reader",
                                              fill_last_batch=False)

    if FLAGS.local_rank == 0:
        pipe = HybridValPipe()
        pipe.build()
        val_loader = DALIClassificationIterator(pipe, reader_name="Reader",
                                                fill_last_batch=False)

    place = fluid.CUDAPlace(FLAGS.device_id)
    exe = fluid.Executor(place)
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    eval_prog = fluid.Program()

    step_per_epoch = int(math.ceil(sample_per_shard / FLAGS.batch_size))
    milestones = [step_per_epoch * e for e in (30, 60, 80)]
    values = [FLAGS.lr * (0.1**i) for i in range(len(milestones) + 1)]

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_fetch_list = build()
            learning_rate = fluid.layers.piecewise_decay(
                boundaries=milestones, values=values)
            learning_rate = fluid.layers.linear_lr_warmup(
                learning_rate=learning_rate,
                warmup_steps=5 * step_per_epoch,
                start_lr=0.,
                end_lr=FLAGS.lr)
            decay = FLAGS.weight_decay
            optimizer = fluid.optimizer.Momentum(
                learning_rate=learning_rate,
                momentum=FLAGS.momentum,
                regularization=fluid.regularizer.L2Decay(decay))
            avg_loss = train_fetch_list[0]
            optimizer.minimize(avg_loss)

    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_fetch_list = build()
        eval_prog = eval_prog.clone(True)

    build_strategy = fluid.BuildStrategy()
    build_strategy.trainer_id = FLAGS.local_rank
    build_strategy.num_trainers = FLAGS.world_size
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        FLAGS.local_rank,
        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
        startup_program=startup_prog,
        program=train_prog)

    exec_strategy = fluid.ExecutionStrategy()

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    total_time = AverageMeter()

    for epoch in range(FLAGS.epochs):
        if FLAGS.local_rank == 0:
            print("==== train epoch {:02d} ====".format(epoch + 1))
        avg_time, _, _ = run(
            exe, compiled_train_prog, train_fetch_list, train_loader, epoch)
        total_time.update(avg_time)
        # reset DALI iterators
        train_loader.reset()

        if FLAGS.local_rank == 0:
            print("==== validation epoch {:02d} ====".format(epoch + 1))
            _, prec1, prec5 = run(
                exe, compiled_eval_prog, eval_fetch_list, val_loader, epoch)

            val_loader.reset()

            ckpt_path = os.path.join('checkpoint', "{:02d}".format(epoch + 1))
            if os.path.isdir(ckpt_path):
                shutil.rmtree(ckpt_path)
            print('Save model to {}.'.format(ckpt_path))
            fluid.io.save_persistables(exe, ckpt_path, train_prog)

            time_per_sample = FLAGS.whole_batch_size / total_time.avg
            if epoch == FLAGS.epochs - 1:
                print('##Top-1 {0}\n'
                      '##Top-5 {1}\n'
                      '##Perf  {2}'.format(
                          prec1 * 100, prec5 * 100, time_per_sample))
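
The learning-rate schedule in Example #2 is plain step decay (×0.1 at epochs 30, 60, 80) preceded by 5 epochs of linear warmup, all expressed in steps. Here is a pure-Python sketch of the schedule those two fluid layers implement, with made-up values for the base rate and steps per epoch:

def lr_at_step(step, base_lr=0.1, step_per_epoch=100):
    # linear_lr_warmup: ramp from 0 to base_lr over 5 epochs...
    warmup = 5 * step_per_epoch
    if step < warmup:
        return base_lr * step / warmup
    # ...then piecewise_decay: x0.1 at each epoch boundary.
    boundaries = [step_per_epoch * e for e in (30, 60, 80)]
    values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

assert lr_at_step(0) == 0.0                    # start of warmup
assert lr_at_step(500) == 0.1                  # warmup finished
assert abs(lr_at_step(7000) - 0.001) < 1e-12   # past the epoch-60 boundary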
Example #3
def main(args):
    """main"""
    model_config = UNIMOConfig(args.unimo_config_path)
    model_config.print_config()

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=args.do_lower_case)

    if not (args.do_train or args.do_val or args.do_test or args.do_test_hard):
        raise ValueError(
            "For args `do_train`, `do_val`, `do_test`, `do_test_hard`, at "
            "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

    if args.do_train:
        train_data_reader = ClassifyReader(args.train_filelist,
                                           args.max_seq_len, tokenizer)
        train_data_generator = train_data_reader.data_generator(
            batch_size=args.batch_size, epoch=args.epoch, phase="train")

        if args.num_train_examples:
            num_train_examples = args.num_train_examples
        else:
            num_train_examples = train_data_reader.get_num_examples()
        step_num_per_epoch = num_train_examples // args.batch_size // trainers_num
        max_train_steps = args.epoch * step_num_per_epoch

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    config=model_config,
                    pyreader_name="train_reader",
                    is_train=True)

                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

    if args.do_val or args.do_test or args.do_test_hard:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = create_model(
                    args,
                    config=model_config,
                    pyreader_name="dev_reader",
                    is_train=False)
        test_prog = test_prog.clone(for_test=True)
        if args.do_val:
            dev_data_reader = ClassifyReader(args.dev_filelist,
                                             args.max_seq_len, tokenizer)
            dev_data_generator = dev_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="dev")

        if args.do_test:
            test_data_reader = ClassifyReader(args.test_filelist,
                                              args.max_seq_len, tokenizer)
            test_data_generator = test_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="test")

        if args.do_test_hard:
            test_hard_data_reader = ClassifyReader(args.test_hard_filelist,
                                                   args.max_seq_len, tokenizer)
            test_hard_data_generator = test_hard_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="test_hard")

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            # integer division: the transpiler config expects an int rank count
            config.hierarchical_allreduce_exter_nranks = \
                trainers_num // config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=train_program)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=train_program)
    elif args.do_val or args.do_test or args.do_test_hard:
        args.init_checkpoint = args.init_pretraining_params
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = 4 if args.use_fp16 else 2
        exec_strategy.num_iteration_per_drop_scope = min(
            args.num_iteration_per_drop_scope, args.skip_steps)

        build_strategy = fluid.BuildStrategy()
        build_strategy.remove_unnecessary_lock = False

        if args.use_fuse:
            build_strategy.fuse_all_reduce_ops = True

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test or args.do_test_hard:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    dev_ret_history = []  # (steps, key_eval, eval)
    test_ret_history = []  # (steps, key_eval, eval)
    test_hard_ret_history = []  # (steps, key_eval, eval)
    steps = 0

    if args.do_train:
        train_pyreader.start()
        time_begin = time.time()
        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                if steps % skip_steps == 0:
                    train_fetch_list = [
                        graph_vars["loss"].name, scheduled_lr.name
                    ]
                    res = train_exe.run(fetch_list=train_fetch_list)
                    outputs = {
                        "loss": np.mean(res[0]),
                        'learning_rate': float(res[1][0])
                    }
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, learning_rate: %.10f" % \
                                (train_pyreader.queue.size(), outputs['learning_rate'])
                        print(verbose)
                    current_epoch, current_example, current_file_index, total_file, current_file = \
                            train_data_reader.get_progress()

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("%s - epoch: %d, progress: %d/%d, %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \
                          (get_time(), current_epoch, current_example, num_train_examples, current_file_index, \
                          total_file, steps, outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    train_exe.run(fetch_list=[])

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0 and args.save_checkpoints:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path,
                                                   train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            dev_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                "dev", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            dev_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            test_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                "test", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            test_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                    # evaluate test_hard set
                    if args.do_test_hard:
                        test_pyreader.decorate_tensor_provider(
                            test_hard_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                "test_hard", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            test_hard_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

            except fluid.core.EOFException:
                if args.save_checkpoints:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(dev_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "dev", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            dev_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(test_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "test", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            test_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    # final eval on test_hard set
    if args.do_test_hard:
        test_pyreader.decorate_tensor_provider(test_hard_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "test_hard", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            test_hard_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    if nccl2_trainer_id == 0:
        if args.do_val:
            dev_ret_history = sorted(dev_ret_history,
                                     key=lambda a: a[2],
                                     reverse=True)
            print("Best validation result: step %d %s %f" % \
                    (dev_ret_history[0][0], dev_ret_history[0][1], dev_ret_history[0][2]))
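
The hierarchical-allreduce branch above splits the world into intra-node groups of hierarchical_allreduce_inter_nranks ranks and allreduces across groups afterwards, so the group count must divide the world size exactly. A small standalone sketch of that arithmetic with the same asserts (the sample numbers are invented):

def split_hallreduce_ranks(trainers_num, inter_nranks):
    # inter_nranks: ranks that allreduce within one group (typically one node);
    # exter_nranks: number of groups that then allreduce with each other.
    assert inter_nranks > 1
    assert trainers_num % inter_nranks == 0
    exter_nranks = trainers_num // inter_nranks  # must stay an integer
    return inter_nranks, exter_nranks

# 32 trainers with 8 GPUs per node -> 4 inter-node groups.
assert split_hallreduce_ranks(32, 8) == (8, 4)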
Example #4
def train():
    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    args = parse_args()
    logger.info('Running with args : {}'.format(args))
    logger.info('Running paddle : {}'.format(paddle.version.commit))

    hidden_size = args.hidden_size
    batch_size = args.batch_size
    data_path = args.data_path
    logger.info("begin to load vocab")
    vocab = data.Vocabulary(args.vocab_path, validate_file=True)
    vocab_size = vocab.size
    logger.info("finished load vocab")

    if args.enable_ce:
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    logger.info('build the model...')
    # build model
    train_prog = fluid.Program()
    train_startup_prog = fluid.Program()
    if args.enable_ce:
        train_prog.random_seed = args.random_seed
        train_startup_prog.random_seed = args.random_seed
    # build infer model
    infer_prog = fluid.Program()
    infer_startup_prog = fluid.Program()
    with fluid.program_guard(infer_prog, infer_startup_prog):
        with fluid.unique_name.guard():
            # Infer process
            infer_model = lm_model.LanguageModel(
                args, vocab_size, test_mode=True)
            infer_model.build()
    infer_progs = infer_prog, infer_startup_prog, infer_model

    with fluid.program_guard(train_prog, train_startup_prog):
        with fluid.unique_name.guard():
            # Training process
            train_model = lm_model.LanguageModel(
                args, vocab_size, test_mode=False)
            train_model.build()
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=args.max_grad_norm))

            # build optimizer
            if args.optim == 'adagrad':
                optimizer = fluid.optimizer.Adagrad(
                    learning_rate=args.learning_rate,
                    epsilon=0.0,
                    initial_accumulator_value=1.0)
            elif args.optim == 'sgd':
                optimizer = fluid.optimizer.SGD(
                    learning_rate=args.learning_rate)
            elif args.optim == 'adam':
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
            elif args.optim == 'rprop':
                optimizer = fluid.optimizer.RMSPropOptimizer(
                    learning_rate=args.learning_rate)
            else:
                logger.error('Unsupported optimizer: {}'.format(args.optim))
                exit(-1)
            optimizer.minimize(train_model.loss * args.num_steps)
            # initialize parameters
            place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
            exe = Executor(place)
    train_progs = train_prog, train_startup_prog, train_model

    if args.local:
        logger.info("local start_up:")
        train_loop(args, logger, vocab, train_progs, infer_progs, optimizer)
    else:
        if args.update_method == "nccl2":
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
            if args.test_nccl:
                worker_endpoints_env = os.getenv("PADDLE_WORK_ENDPOINTS")
                worker_endpoints = worker_endpoints_env.split(',')
                trainers_num = len(worker_endpoints)
                current_endpoint = worker_endpoints[trainer_id]
            else:
                port = os.getenv("PADDLE_PORT")
                worker_ips = os.getenv("PADDLE_TRAINERS")
                worker_endpoints = []
                for ip in worker_ips.split(","):
                    worker_endpoints.append(':'.join([ip, port]))
                worker_endpoints_env = ','.join(worker_endpoints)
                trainers_num = len(worker_endpoints)
                current_endpoint = os.getenv("POD_IP") + ":" + port
            if trainer_id == 0:
                logger.info("train_id == 0, sleep 60s")
                time.sleep(60)

            logger.info("trainers_num:{}".format(trainers_num))
            logger.info("worker_endpoints:{}".format(worker_endpoints))
            logger.info("current_endpoint:{}".format(current_endpoint))
            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            t = fluid.DistributeTranspiler(config=config)
            t.transpile(
                trainer_id,
                trainers=worker_endpoints_env,
                current_endpoint=current_endpoint,
                program=train_prog,
                startup_program=train_startup_prog)
            train_progs = train_prog, train_startup_prog, train_model
            train_loop(args, logger, vocab, train_progs, infer_progs, optimizer,
                       trainers_num, trainer_id, worker_endpoints)
        else:
            port = os.getenv("PADDLE_PORT", "6174")
            pserver_ips = os.getenv("PADDLE_PSERVERS")
            eplist = []
            for ip in pserver_ips.split(","):
                eplist.append(':'.join([ip, port]))
            pserver_endpoints = ",".join(eplist)
            trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
            current_endpoint = os.getenv("POD_IP") + ":" + port
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

            logger.info("pserver_endpoints:{}".format(pserver_endpoints))
            logger.info("current_endpoint:{}".format(current_endpoint))
            logger.info("trainer_id:{}".format(trainer_id))
            logger.info("pserver_ips:{}".format(pserver_ips))
            logger.info("port:{}".format(port))

            t = fluid.DistributeTranspiler()
            t.transpile(
                trainer_id,
                pservers=pserver_endpoints,
                trainers=trainers,
                program=train_prog,
                startup_program=train_startup_prog)

            if training_role == "PSERVER":
                logger.info("distributed: pserver started")
                current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                    "PADDLE_PORT")
                if not current_endpoint:
                    logger.critical("need env SERVER_ENDPOINT")
                    exit(1)
                pserver_prog = t.get_pserver_program(current_endpoint)
                pserver_startup = t.get_startup_program(current_endpoint,
                                                        pserver_prog)

                exe.run(pserver_startup)
                exe.run(pserver_prog)
            elif training_role == "TRAINER":
                logger.info("distributed: trainer started")
                trainer_prog = t.get_trainer_program()
                train_loop(args, logger, vocab, train_progs, infer_progs,
                           optimizer)
            else:
                logger.critical(
                    "environment var TRAINER_ROLE should be TRAINER os PSERVER")
                exit(1)
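
Both branches of Example #4 assemble "ip:port" endpoint strings from environment variables exported by the cluster launcher. A standalone sketch of the pserver-branch assembly (the sample addresses are hypothetical):

import os

# Values a launcher might export; hypothetical.
os.environ.setdefault("PADDLE_PORT", "6174")
os.environ.setdefault("PADDLE_PSERVERS", "192.168.0.2,192.168.0.3")
os.environ.setdefault("POD_IP", "192.168.0.4")

port = os.getenv("PADDLE_PORT")
pserver_ips = os.getenv("PADDLE_PSERVERS")
# One endpoint per pserver IP, all listening on the same port.
pserver_endpoints = ",".join(
    ":".join([ip, port]) for ip in pserver_ips.split(","))
current_endpoint = os.getenv("POD_IP") + ":" + port

assert pserver_endpoints == "192.168.0.2:6174,192.168.0.3:6174"
assert current_endpoint == "192.168.0.4:6174"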
Example #5
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than '
                    'max_seq_len, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        log.info(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                        % (current_epoch, current_example, num_train_examples,
                           steps, loss, f1, precision, recall,
                           args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0 and steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if nccl2_trainer_id == 0 and steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(reader, exe, test_prog, test_pyreader,
                                         graph_vars, current_epoch, steps)
                    # evaluate test set
                    if args.do_test:
                        predict_wrapper(reader, exe, test_prog, test_pyreader,
                                        graph_vars, current_epoch, steps)

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')

    if nccl2_trainer_id == 0 and args.do_test:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, 'final')
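
Example #5 sizes its schedule differently depending on in_tokens: when batches are counted in tokens, the effective examples-per-batch is batch_size // max_seq_len. A quick sketch of both formulas with invented numbers:

def max_train_steps(epochs, num_examples, batch_size, dev_count,
                    in_tokens=False, max_seq_len=None):
    if in_tokens:
        # batch_size counts tokens, so each step consumes roughly
        # batch_size // max_seq_len examples per device.
        examples_per_batch = batch_size // max_seq_len
    else:
        examples_per_batch = batch_size
    return epochs * num_examples // examples_per_batch // dev_count

# 3 epochs, 10k examples, 4 GPUs, 8192-token batches, 512-token sequences:
assert max_train_steps(3, 10000, 8192, 4, in_tokens=True, max_seq_len=512) == 468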
Example #6
def main(args):
    """main"""
    model_config = UNIMOConfig(args.unimo_config_path)
    model_config.print_config()

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=args.do_lower_case)

    data_reader = ClassifyReader(tokenizer, args)

    if not (args.do_train or args.do_val or args.do_val_hard \
            or args.do_test or args.do_test_hard or args.do_diagnostic):
        raise ValueError("For args `do_train`, `do_val`, `do_val_hard`, `do_test`," \
                " `do_test_hard` and `do_diagnostic`, at least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        train_data_generator = data_reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = data_reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // trainers_num
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args, pyreader_name='train_reader', config=model_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_val_hard or args.do_test or args.do_test_hard \
            or args.do_pred or args.do_pred_hard or args.do_diagnostic:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args, pyreader_name='test_reader', config=model_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            # integer division: the transpiler config expects an int rank count
            config.hierarchical_allreduce_exter_nranks = \
                trainers_num // config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=train_program)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=train_program)
    elif args.do_val or args.do_val_hard or args.do_test or args.do_test_hard \
            or args.do_pred or args.do_pred_hard or args.do_diagnostic:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_val_hard or args.do_test or args.do_test_hard \
            or args.do_pred or args.do_pred_hard or args.do_diagnostic:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                              main_program=test_prog,
                                              share_vars_from=train_exe)

    dev_ret_history = []  # (steps, key_eval, eval)
    dev_hard_ret_history = []  # (steps, key_eval, eval)
    test_ret_history = []  # (steps, key_eval, eval)
    test_hard_ret_history = []  # (steps, key_eval, eval)
    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                if steps % skip_steps == 0:
                    train_fetch_list = [
                        graph_vars["loss"].name, graph_vars["accuracy"].name,
                        graph_vars["num_seqs"].name
                    ]
                    if "learning_rate" in graph_vars:
                        train_fetch_list.append(
                            graph_vars["learning_rate"].name)
                    res = train_exe.run(fetch_list=train_fetch_list)

                    outputs = {"loss": np.mean(res[0])}
                    if "learning_rate" in graph_vars:
                        outputs["learning_rate"] = float(res[3][0])

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = data_reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("%s - epoch: %d, progress: %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \
                          (get_time(), current_epoch, current_example, num_train_examples, steps, \
                          outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    train_exe.run(fetch_list=[])

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0 and args.save_checkpoints:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path,
                                                   train_program)

                    if steps % args.validation_steps == 0:
                        # evaluate dev set
                        if args.do_val:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.dev_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog,
                                               test_pyreader, graph_vars,
                                               "dev")
                            dev_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                        # evaluate dev_hard set
                        if args.do_val_hard:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.dev_hard_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog,
                                               test_pyreader, graph_vars,
                                               "dev_hard")
                            dev_hard_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                        # evaluate test set
                        if args.do_test:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog,
                                               test_pyreader, graph_vars,
                                               "test")
                            test_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                        # evaluate test_hard set
                        if args.do_test_hard:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_hard_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog,
                                               test_pyreader, graph_vars,
                                               "test_hard")
                            test_hard_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                        # pred diagnostic set
                        if args.do_diagnostic:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.diagnostic_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            qids, preds, probs = predict(test_exe,
                                                         test_prog,
                                                         test_pyreader,
                                                         graph_vars,
                                                         dev_count=1)
                            save_path = args.pred_save + '.diagnostic.' + str(
                                steps) + '.txt'
                            print("testing {}, save to {}".format(
                                args.diagnostic_set, save_path))
                            with open(save_path, 'w') as f:
                                for id, s, p in zip(qids, preds, probs):
                                    f.write('{}\t{}\t{}\n'.format(id, s, p))

                        # pred test set
                        if args.do_pred:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            qids, preds, probs = predict(test_exe,
                                                         test_prog,
                                                         test_pyreader,
                                                         graph_vars,
                                                         dev_count=1)
                            save_path = args.pred_save + '.test.' + str(
                                steps) + '.txt'
                            print("testing {}, save to {}".format(
                                args.test_set, save_path))
                            with open(save_path, 'w') as f:
                                for id, s, p in zip(qids, preds, probs):
                                    f.write('{}\t{}\t{}\n'.format(id, s, p))

                        # pred test hard set
                        if args.do_pred_hard:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_hard_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            qids, preds, probs = predict(test_exe,
                                                         test_prog,
                                                         test_pyreader,
                                                         graph_vars,
                                                         dev_count=1)
                            save_path = args.pred_save + '.test_hard.' + str(
                                steps) + '.txt'
                            print("testing {}, save to {}".format(
                                args.test_hard_set, save_path))
                            with open(save_path, 'w') as f:
                                for id, s, p in zip(qids, preds, probs):
                                    f.write('{}\t{}\t{}\n'.format(id, s, p))

            except fluid.core.EOFException:
                if args.save_checkpoints:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if nccl2_trainer_id == 0:
        # final pred on diagnostic set
        if args.do_diagnostic:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(args.diagnostic_set,
                                           batch_size=args.batch_size,
                                           epoch=1,
                                           dev_count=1,
                                           shuffle=False))
            qids, preds, probs = predict(test_exe,
                                         test_prog,
                                         test_pyreader,
                                         graph_vars,
                                         dev_count=1)
            save_path = args.pred_save + '.diagnostic.' + str(steps) + '.txt'
            print("testing {}, save to {}".format(args.diagnostic_set,
                                                  save_path))
            with open(save_path, 'w') as f:
                for id, s, p in zip(qids, preds, probs):
                    f.write('{}\t{}\t{}\n'.format(id, s, p))

        # final pred on test set
        if args.do_pred:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(args.test_set,
                                           batch_size=args.batch_size,
                                           epoch=1,
                                           dev_count=1,
                                           shuffle=False))
            qids, preds, probs = predict(test_exe,
                                         test_prog,
                                         test_pyreader,
                                         graph_vars,
                                         dev_count=1)
            save_path = args.pred_save + '.test.' + str(steps) + '.txt'
            print("testing {}, save to {}".format(args.test_set, save_path))
            with open(save_path, 'w') as f:
                for id, s, p in zip(qids, preds, probs):
                    f.write('{}\t{}\t{}\n'.format(id, s, p))

        # final pred on test_hard set
        if args.do_pred_hard:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(args.test_hard_set,
                                           batch_size=args.batch_size,
                                           epoch=1,
                                           dev_count=1,
                                           shuffle=False))
            qids, preds, probs = predict(test_exe,
                                         test_prog,
                                         test_pyreader,
                                         graph_vars,
                                         dev_count=1)
            save_path = args.pred_save + '.test_hard.' + str(steps) + '.txt'
            print("testing {}, save to {}".format(args.test_hard_set,
                                                  save_path))
            with open(save_path, 'w') as f:
                for id, s, p in zip(qids, preds, probs):
                    f.write('{}\t{}\t{}\n'.format(id, s, p))

        # final eval on test set
        if args.do_test:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(args.test_set,
                                           batch_size=args.batch_size,
                                           epoch=1,
                                           dev_count=1,
                                           shuffle=False))
            print("Final test result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader,
                               graph_vars, "test")
            test_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))
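            # History entries are (step, metric_name, metric_value); sorting
            # by value in descending order puts the best checkpoint first.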
            test_ret_history = sorted(test_ret_history,
                                      key=lambda a: a[2],
                                      reverse=True)
            print("Best testing result: step %d %s %f" %
                  (test_ret_history[0][0], test_ret_history[0][1],
                   test_ret_history[0][2]))

        # final eval on test hard set
        if args.do_test_hard:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(args.test_hard_set,
                                           batch_size=args.batch_size,
                                           epoch=1,
                                           dev_count=1,
                                           shuffle=False))
            print("Final test_hard result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader,
                               graph_vars, "test_hard")
            test_hard_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))
            test_hard_ret_history = sorted(test_hard_ret_history,
                                           key=lambda a: a[2],
                                           reverse=True)
            print("Best testing hard result: step %d %s %f" %
                  (test_hard_ret_history[0][0], test_hard_ret_history[0][1],
                   test_hard_ret_history[0][2]))

        # final eval on dev set
        if args.do_val:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(args.dev_set,
                                           batch_size=args.batch_size,
                                           epoch=1,
                                           dev_count=1,
                                           shuffle=False))
            print("Final validation result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader,
                               graph_vars, "dev")
            dev_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))
            dev_ret_history = sorted(dev_ret_history,
                                     key=lambda a: a[2],
                                     reverse=True)
            print("Best validation result: step %d %s %f" %
                  (dev_ret_history[0][0], dev_ret_history[0][1],
                   dev_ret_history[0][2]))

        # final eval on dev hard set
        if args.do_val_hard:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(args.dev_hard_set,
                                           batch_size=args.batch_size,
                                           epoch=1,
                                           dev_count=1,
                                           shuffle=False))
            print("Final validation_hard result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader,
                               graph_vars, "dev_hard")
            dev_hard_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))
            dev_hard_ret_history = sorted(dev_hard_ret_history,
                                          key=lambda a: a[2],
                                          reverse=True)
            print("Best validation_hard result: step %d %s %f" %
                  (dev_hard_ret_history[0][0], dev_hard_ret_history[0][1],
                   dev_hard_ret_history[0][2]))
Example #7
def train(args):
    """
        train
    """
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    if is_local == '0':
        args.local = False
    print(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    gpus = os.getenv("FLAGS_selected_gpus", "0").split(",")
    gpu_id = int(gpus[0])

    if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = len(gpus)

    exe = fluid.Executor(place)

    train_prog = fluid.Program()
    startup_prog = fluid.Program()

    if args.enable_ce:
        train_prog.random_seed = 1000
        startup_prog.random_seed = 1000

    with fluid.program_guard(train_prog, startup_prog):
        logits_list = []

        data_input_names = encoder_data_input_fields + \
                decoder_data_input_fields[:-1] + label_data_input_fields + dense_bias_input_fields

        all_data_inputs, pyreader = make_all_py_reader_inputs(data_input_names)
        with fluid.unique_name.guard("new_forward"):
            (new_forward_sum_cost, new_forward_avg_cost,
             new_forward_token_num, new_forward_logits, new_forward_xent,
             new_forward_loss, new_forward_label,
             new_forward_non_zeros) = forward_transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 50,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                ModelHyperParams.embedding_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=True,
                is_test=False,
                params_type="new",
                all_data_inputs=all_data_inputs)

        with fluid.unique_name.guard("new_relative_position"):
            (new_relative_position_sum_cost, new_relative_position_avg_cost,
             new_relative_position_token_num, new_relative_position_logits,
             new_relative_position_xent, new_relative_position_loss,
             new_relative_position_label,
             new_relative_position_non_zeros) = relative_transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 50,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                ModelHyperParams.embedding_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False,
                params_type="new",
                all_data_inputs=all_data_inputs)

        DenseModelHyperParams.src_vocab_size = ModelHyperParams.src_vocab_size
        DenseModelHyperParams.trg_vocab_size = ModelHyperParams.trg_vocab_size
        DenseModelHyperParams.weight_sharing = ModelHyperParams.weight_sharing
        DenseModelHyperParams.embedding_sharing = ModelHyperParams.embedding_sharing

        with fluid.unique_name.guard("new_dense"):
            (new_dense_sum_cost, new_dense_avg_cost, new_dense_token_num,
             new_dense_logits, new_dense_xent, new_dense_loss,
             new_dense_label, _) = dense_transformer(
                DenseModelHyperParams.src_vocab_size,
                DenseModelHyperParams.trg_vocab_size,
                DenseModelHyperParams.max_length + 50,
                DenseModelHyperParams.n_layer,
                DenseModelHyperParams.enc_n_layer,
                DenseModelHyperParams.n_head,
                DenseModelHyperParams.d_key,
                DenseModelHyperParams.d_value,
                DenseModelHyperParams.d_model,
                DenseModelHyperParams.d_inner_hid,
                DenseModelHyperParams.prepostprocess_dropout,
                DenseModelHyperParams.attention_dropout,
                DenseModelHyperParams.relu_dropout,
                DenseModelHyperParams.preprocess_cmd,
                DenseModelHyperParams.postprocess_cmd,
                DenseModelHyperParams.weight_sharing,
                DenseModelHyperParams.embedding_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False,
                params_type="new",
                all_data_inputs=all_data_inputs)

        with fluid.unique_name.guard("fixed_forward"):
            (fixed_forward_sum_cost, fixed_forward_avg_cost,
             fixed_forward_token_num, fixed_forward_logits,
             fixed_forward_xent, fixed_forward_loss, fixed_forward_label,
             fixed_forward_non_zeros) = forward_transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 50,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                ModelHyperParams.embedding_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False,
                params_type="fixed",
                all_data_inputs=all_data_inputs)
            logits_list.append(fixed_forward_logits)

        DenseModelHyperParams.src_vocab_size = ModelHyperParams.src_vocab_size
        DenseModelHyperParams.trg_vocab_size = ModelHyperParams.trg_vocab_size
        DenseModelHyperParams.weight_sharing = ModelHyperParams.weight_sharing
        DenseModelHyperParams.embedding_sharing = ModelHyperParams.embedding_sharing

        with fluid.unique_name.guard("fixed_dense"):
            (fixed_dense_sum_cost, fixed_dense_avg_cost,
             fixed_dense_token_num, fixed_dense_logits, fixed_dense_xent,
             fixed_dense_loss, fixed_dense_label, _) = dense_transformer(
                DenseModelHyperParams.src_vocab_size,
                DenseModelHyperParams.trg_vocab_size,
                DenseModelHyperParams.max_length + 50,
                DenseModelHyperParams.n_layer,
                DenseModelHyperParams.enc_n_layer,
                DenseModelHyperParams.n_head,
                DenseModelHyperParams.d_key,
                DenseModelHyperParams.d_value,
                DenseModelHyperParams.d_model,
                DenseModelHyperParams.d_inner_hid,
                DenseModelHyperParams.prepostprocess_dropout,
                DenseModelHyperParams.attention_dropout,
                DenseModelHyperParams.relu_dropout,
                DenseModelHyperParams.preprocess_cmd,
                DenseModelHyperParams.postprocess_cmd,
                DenseModelHyperParams.weight_sharing,
                DenseModelHyperParams.embedding_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False,
                params_type="fixed",
                all_data_inputs=all_data_inputs)
            logits_list.append(fixed_dense_logits)

        with fluid.unique_name.guard("fixed_relative_position"):
            (fixed_relative_sum_cost, fixed_relative_avg_cost,
             fixed_relative_token_num, fixed_relative_logits,
             fixed_relative_xent, fixed_relative_loss,
             fixed_relative_label, _) = relative_transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 50,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                ModelHyperParams.embedding_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False,
                params_type="fixed",
                all_data_inputs=all_data_inputs)
            logits_list.append(fixed_relative_logits)

        # normalizing
        confidence = 1.0 - TrainTaskConfig.label_smooth_eps
        low_confidence = (1.0 -
                          confidence) / (ModelHyperParams.trg_vocab_size - 1)
        normalizing = -(confidence * math.log(confidence) +
                        (ModelHyperParams.trg_vocab_size - 1) *
                        low_confidence * math.log(low_confidence + 1e-20))
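        # Constant offset: the entropy of the label-smoothed target
        # distribution. Subtracting it from the soft-label cross entropies
        # below shifts the distillation terms so a prediction that matches
        # the target exactly scores (near) zero.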

        batch_size = layers.shape(new_forward_logits)[0]
        seq_length = layers.shape(new_forward_logits)[1]
        trg_voc_size = layers.shape(new_forward_logits)[2]

        # ensemble
        teacher_logits = logits_list[0]
        for index in range(1, len(logits_list)):
            teacher_logits += logits_list[index]

        teacher_logits = teacher_logits / len(logits_list)

        # new_target
        new_target = layers.softmax(teacher_logits)
        new_target.stop_gradient = True
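        # The teacher signal is the averaged softmax over the fixed
        # branches; stop_gradient prevents the distillation losses from
        # backpropagating into the teacher logits.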

        # agent_1: forward
        fdistill_xent = layers.softmax_with_cross_entropy(
            logits=new_forward_logits, label=new_target, soft_label=True)
        fdistill_xent -= normalizing
        fdistill_loss = layers.reduce_sum(
            fdistill_xent * new_forward_non_zeros) / new_forward_token_num

        # agent_2: relative
        rdistill_xent = layers.softmax_with_cross_entropy(
            logits=new_relative_position_logits,
            label=new_target,
            soft_label=True)
        rdistill_xent -= normalizing
        rdistill_loss = layers.reduce_sum(
            rdistill_xent * new_forward_non_zeros) / new_forward_token_num

        # agent_3: dense
        ddistill_xent = layers.softmax_with_cross_entropy(
            logits=new_dense_logits, label=new_target, soft_label=True)
        ddistill_xent -= normalizing
        ddistill_loss = layers.reduce_sum(
            ddistill_xent * new_forward_non_zeros) / new_forward_token_num

        teacher_loss = fixed_forward_avg_cost + fixed_dense_avg_cost + fixed_relative_avg_cost
        beta = TrainTaskConfig.beta
        avg_cost = beta * (new_forward_avg_cost +
                           new_relative_position_avg_cost +
                           new_dense_avg_cost) \
            + (1.0 - beta) * (fdistill_loss + rdistill_loss + ddistill_loss) \
            + teacher_loss
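        # Each student mixes its hard-label cross entropy with its
        # distillation term via beta (the usual knowledge-distillation
        # interpolation); the fixed-branch losses are added to the
        # objective on top.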

        avg_cost.persistable = True
        teacher_loss.persistable = True

        optimizer = None
        if args.sync:
            lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
            logging.info("before adam")

            with fluid.default_main_program()._lr_schedule_guard():
                learning_rate = lr_decay * TrainTaskConfig.learning_rate
            optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                             beta1=TrainTaskConfig.beta1,
                                             beta2=TrainTaskConfig.beta2,
                                             epsilon=TrainTaskConfig.eps)
        else:
            optimizer = fluid.optimizer.SGD(0.003)
        if args.use_fp16:
            #black_varnames={"src_slf_attn_bias", "trg_slf_attn_bias", "trg_src_attn_bias", "dense_src_slf_attn_bias", "dense_trg_slf_attn_bias", "dense_trg_src_attn_bias"}
            #amp_lists=fluid.contrib.mixed_precision.AutoMixedPrecisionLists(custom_black_varnames=black_varnames,
            #        custom_black_list=["dropout"])
            #optimizer = fluid.contrib.mixed_precision.decorate(optimizer, amp_lists=amp_lists,
            optimizer = fluid.contrib.mixed_precision.decorate(
                optimizer,
                init_loss_scaling=32768,
                incr_every_n_steps=2000,
                use_dynamic_loss_scaling=True)
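            # decorate() wraps the optimizer for mixed-precision training;
            # with dynamic loss scaling the scale is increased after every
            # incr_every_n_steps overflow-free steps and reduced on
            # overflow, starting from init_loss_scaling.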

        optimizer.minimize(avg_cost)

        loss_scaling = None
        scaled_cost = None
        if args.use_fp16:
            scaled_cost = optimizer.get_scaled_loss()
            loss_scaling = optimizer.get_loss_scaling()

    if args.local:
        logging.info("local start_up:")
        train_loop(exe, train_prog, startup_prog, args, dev_count, avg_cost,
                   teacher_loss, new_relative_position_sum_cost,
                   new_relative_position_avg_cost,
                   new_relative_position_token_num, pyreader, place)
    else:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        logging.info(
            "worker_endpoints:{} trainers_num:{} current_endpoint:{} "
            "trainer_id:{}".format(worker_endpoints, trainers_num,
                                   current_endpoint, trainer_id))

        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            logging.info("use_hierarchical_allreduce")
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce

            config.hierarchical_allreduce_inter_nranks = \
                args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            config.hierarchical_allreduce_exter_nranks = \
                trainers_num // config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_prog,
                    startup_program=startup_prog)

        train_loop(exe,
                   train_prog,
                   startup_prog,
                   args,
                   dev_count,
                   avg_cost,
                   teacher_loss,
                   new_relative_position_sum_cost,
                   new_relative_position_avg_cost,
                   new_relative_position_token_num,
                   pyreader,
                   place,
                   trainers_num,
                   trainer_id,
                   scaled_cost=scaled_cost,
                   loss_scaling=loss_scaling)
Example #8
def main(args):
    """
       Main func for downstream tasks
    """
    print("finetuning tasks start")
    ernie_config = ErnieVilConfig(args.ernie_config_path)
    ernie_config.print_config()

    with open(args.task_group_json) as f:
        task_group = json.load(f)
        print('task: ', task_group)

    startup_prog = fluid.Program()
    if args.do_train and args.do_test:
        print("can not set both do_train and do_test as True")
        return

    model_name = MODELS[args.task_name]
    if args.do_train:
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, model_outputs = model_name(
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    task_group=task_group)

                total_loss = model_outputs[0]
                scheduled_lr = get_optimizer(total_loss, train_program,
                                             startup_prog, args)

    if args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, model_outputs = model_name(
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    task_group=task_group,
                    is_prediction=True)
                total_loss = model_outputs[0]

        test_prog = test_prog.clone(for_test=True)

    if args.use_gpu:
        gpu_id = 0
        if os.getenv("FLAGS_selected_gpus"):
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()

    print("theoretical memory usage: ")
    if args.do_train:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size))
    if args.do_test:
        print(
            fluid.contrib.memory_usage(program=test_prog,
                                       batch_size=args.batch_size))

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            config.hierarchical_allreduce_exter_nranks = \
                trainers_num / config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)

        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_checkpoint != "":
            sys.stderr.write(
                '############################WARNING############################\n'
            )
            sys.stderr.write(
                '####### using init_pretraining_params, not init_checkpoint ####\n'
            )
            sys.stderr.write(
                '## meaning hyper param e.g. lr won\'t inherit from checkpoint##\n'
            )
            sys.stderr.write(
                '###############################################################\n'
            )
            init_pretraining_params(exe, args.init_checkpoint, train_program)

        reader_name = READERS[args.task_name]
        data_reader = reader_name(
            task_group,
            split=args.split,
            vocab_path=args.vocab_path,
            batch_size=args.batch_size,
            epoch=args.epoch,
        )

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 2

    exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)

    build_strategy = fluid.compiler.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False

    if args.use_fuse:
        build_strategy.fuse_all_reduce_ops = True

    if args.do_train:
        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=total_loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

    if args.do_test:
        predict = predict_wrapper(args,
                                  exe,
                                  ernie_config,
                                  task_group,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  graph_vars=model_outputs)
        result = predict()

    if args.do_train:
        train_pyreader.decorate_tensor_provider(data_reader.data_generator())
        train_pyreader.start()

        # For testing purposes
        preds = []
        targets = []

        steps = 0
        time_begin = time.time()
        node_nums = 1  #int(os.getenv("PADDLE_NODES_NUM"))
        used_time_all = 0
        while steps < args.num_train_steps:
            try:
                steps += node_nums
                skip_steps = args.skip_steps * node_nums
                fetch_list = []
                if nccl2_trainer_id == 0 and steps % skip_steps == 0:
                    task_name_list = [v.name for v in model_outputs]
                    fetch_list = task_name_list
                    fetch_list.append(scheduled_lr.name)

                time_begin = time.time()
                outputs = train_exe.run(fetch_list=fetch_list)
                if outputs:
                    print("feed_queue size", train_pyreader.queue.size())
                    progress_file = data_reader.get_progress()
                    epoch = progress_file["current_epoch"]
                    current_file_index = progress_file["current_file_index"]
                    total_file = progress_file["total_file"]
                    current_file = progress_file["current_file"]
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "acc: %f" % (epoch, current_file_index, total_file,
                                       steps, outputs[0][0], outputs[1][0]))
                    print("steps:", steps)
                    print("save_steps:", args.save_steps)

                    # For Validation & testing purposes
                    preds.append(outputs[2][0])
                    targets.append(outputs[3][0])

                    if steps % 500 == 0:
                        print("Train-RCAC", roc_auc_score(targets, preds))
                        preds = []
                        targets = []

                    np_lr = outputs[-1:]

                    date_str = datetime.datetime.now().strftime(
                        "%Y%m%d %H:%M:%S")

                    np_lr = float(np.mean(np_lr[0]))
                    print("%s current learning_rate:%.8f" % (date_str, np_lr))

                    if steps % args.save_steps == 0:
                        save_path = os.path.join(
                            args.checkpoints,
                            "step_" + str(steps) + str(args.split))
                        print("save_path:", save_path)
                        fluid.io.save_persistables(exe, save_path,
                                                   train_program)
                    time_end = time.time()
                    used_time = time_end - time_begin
                    time_begin = time_end
                    print("used_time:", used_time)

                if steps == args.stop_steps:
                    break

            except fluid.core.EOFException:
                train_pyreader.reset()
                break
Example #9
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    if args.task_type == "dialog":
        ernie_config["role_type_size"] = args.role_type_size
        ernie_config["turn_type_size"] = args.turn_type_size
    if args.hidden_dropout_prob >= 0:
        ernie_config["hidden_dropout_prob"] = args.hidden_dropout_prob
    if args.attention_probs_dropout_prob >= 0:
        ernie_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
    ernie_config.print_config()

    if args.pred_batch_size <= 0:
        args.pred_batch_size = args.batch_size

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed:
        gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_id = int(gpus[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = len(gpus) if args.is_distributed else gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = Seq2SeqReader(args)
    ernie_gen = ErnieGenFinetune(args, ernie_config, reader.tokenizer)

    if not (args.do_train or args.do_val or args.do_test or args.do_pred):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM"))
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // trainers_num
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num
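        # Note: with in_tokens, batch_size counts tokens rather than
        # examples, hence the extra division by max_seq_len above when
        # estimating the total number of training steps.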

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = ernie_gen.create_model()
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test or args.do_pred:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = ernie_gen.create_model(
                    decoding=args.do_decode)
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))
        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    init_model(args, exe, startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.set_batch_generator(train_data_generator)
        train_resource = {
            "exe": train_exe,
            "program": train_program,
            "pyreader": train_pyreader
        }
        save_model = partial(save_checkpoint, program=train_program, exe=exe)

    test_dev_count = 1
    if args.do_val or args.do_test or args.do_pred:
        test_exe = exe
        if args.use_multi_gpu_test:
            test_dev_count = min(trainers_num,
                                 int(os.getenv("PADDLE_PROC_PER_NODE", "1")))
        test_resource = {
            "exe": test_exe,
            "program": test_prog,
            "pyreader": test_pyreader
        }
        eval_data_generator = partial(reader.data_generator,
                                      batch_size=args.pred_batch_size,
                                      epoch=1,
                                      dev_count=test_dev_count,
                                      shuffle=False,
                                      do_decode=args.do_decode,
                                      place=place)
        eval_func = partial(ernie_gen.evaluate,
                            resource=test_resource,
                            graph_vars=test_graph_vars,
                            dev_count=test_dev_count,
                            output_path=args.checkpoints,
                            gpu_id=trainer_id)
        evaluate = partial(evaluate_datasets,
                           pyreader=test_pyreader,
                           reader=reader,
                           eval_func=eval_func,
                           data_generator=eval_data_generator)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        current_epoch = 0
        last_epoch = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()

        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                if args.save_and_valid_by_epoch:
                    suffix = "epoch_" + str(last_epoch)
                else:
                    suffix = "step_" + str(steps)
                if steps % skip_steps == 0:
                    outputs = ernie_gen.evaluate(train_resource, "train",
                                                 graph_vars)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    if args.in_tokens:
                        current_example, current_epoch = reader.get_train_progress(
                        )
                    else:
                        current_epoch = steps * args.batch_size * trainers_num // num_train_examples
                        current_example = steps * args.batch_size * trainers_num % num_train_examples

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "ppl: %f, speed: %f steps/s" %
                          (current_epoch, current_example, num_train_examples,
                           steps, outputs["loss"], outputs["ppl"],
                           args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    train_exe.run(fetch_list=[])

                # Only the first test_dev_count trainers take part in the
                # save/eval logic below; the rest go straight to the next
                # step.
                if nccl2_trainer_id >= test_dev_count:
                    continue

                do_save = False
                do_eval = False
                if not args.save_and_valid_by_epoch:
                    if steps % args.save_steps == 0 and nccl2_trainer_id == 0:
                        do_save = True
                    if steps % args.validation_steps == 0:
                        do_eval = True
                else:
                    if args.in_tokens:
                        current_example, current_epoch = reader.get_train_progress(
                        )
                    else:
                        current_epoch = steps * args.batch_size * trainers_num // num_train_examples
                    if current_epoch != last_epoch:
                        if nccl2_trainer_id == 0:
                            do_save = True
                        do_eval = True

                if do_save:
                    save_model(suffix=suffix)
                if do_eval:
                    evaluate(suffix=suffix)

                last_epoch = current_epoch

            except fluid.core.EOFException:
                save_model(suffix=suffix)
                train_pyreader.reset()
                break

    if nccl2_trainer_id >= test_dev_count:
        return

    if args.do_val or args.do_test or args.do_pred:
        suffix = "output"
        if args.do_train:
            if not args.save_and_valid_by_epoch:
                suffix = "step_" + str(steps)
            else:
                suffix = "epoch_" + str(last_epoch)

        evaluate(suffix=suffix, do_pred=True)
Example #10
    def run_trainer(self, args):
        self.lr = args.lr
        if args.nccl2_reduce_layer_local_run:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, single_device=True)
        elif args.use_dgc:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
        else:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size)

        if args.mem_opt:
            fluid.memory_optimize(fluid.default_main_program(),
                                  skip_grads=True)
        if args.update_method == "pserver":
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
                                    args.endpoints, args.trainers,
                                    args.sync_mode, args.dc_asgd)
            trainer_prog = t.get_trainer_program()
        elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            # transpile for nccl2
            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            config.nccl_comm_num = args.nccl_comm_num
            nccl2_t = fluid.DistributeTranspiler(config=config)
            nccl2_t.transpile(args.trainer_id,
                              program=fluid.default_main_program(),
                              startup_program=fluid.default_startup_program(),
                              trainers=args.endpoints,
                              current_endpoint=args.current_endpoint)

            trainer_prog = fluid.default_main_program()
        else:
            trainer_prog = fluid.default_main_program()

        if args.use_cuda:
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        else:
            place = fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1
        exec_strategy.allow_op_delay = False

        build_stra = fluid.BuildStrategy()
        # FIXME force disable enable_inplace and memory_optimize
        build_stra.enable_inplace = False
        build_stra.memory_optimize = False

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        pass_builder = None
        if args.batch_merge_repeat > 1:
            pass_builder = build_stra._finalize_strategy_and_create_passes()
            mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
            mypass.set("num_repeats", args.batch_merge_repeat)

        if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            build_stra.num_trainers = len(args.endpoints.split(","))
            build_stra.trainer_id = args.trainer_id
        else:
            # local or pserver mode: single-trainer build strategy
            build_stra.num_trainers = 1
            build_stra.trainer_id = 0

        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
            loss_name=avg_cost.name,
            build_strategy=build_stra,
            exec_strategy=exec_strategy)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            # When running distributed with use_reader_alloc, keep every
            # other item of the batch (offset % 2 == trainer_id, which
            # assumes two trainers) so the trainers consume disjoint halves.
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        out_losses = []
        for _ in six.moves.xrange(RUN_STEP):
            loss, = exe.run(binary,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))
Example #11
    def get_transpile(self, mode, trainers="127.0.0.1:6174"):
        config = fluid.DistributeTranspilerConfig()
        config.mode = 'collective'
        config.collective_mode = mode
        t = fluid.DistributeTranspiler(config=config)
        return t
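    # Minimal usage sketch (hypothetical caller, mirroring the transpile
    # calls shown in the other examples):
    #   t = self.get_transpile("grad_allreduce")
    #   t.transpile(trainer_id, trainers=trainers,
    #               current_endpoint=current_endpoint,
    #               startup_program=startup_prog)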
Example #12
def train(args):

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_path)
    word2vec_reader = None
    if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
        word2vec_reader = reader.Word2VecReader(
            args.dict_path, args.train_data_path, filelist, 0, 1)
    else:
        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
        trainers = int(os.environ["PADDLE_TRAINERS"])
        word2vec_reader = reader.Word2VecReader(args.dict_path,
                                                args.train_data_path, filelist,
                                                trainer_id, trainers)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        word2vec_reader.word_frequencys,
        args.embedding_size,
        args.max_code_length,
        args.with_hs,
        args.with_nce,
        is_sparse=args.is_sparse)

    optimizer = None
    if args.with_Adam:
        optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    else:
        optimizer = fluid.optimizer.SGD(learning_rate=1e-4)

    optimizer.minimize(loss)

    # do local training 
    if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
        logger.info("run local training")
        main_program = fluid.default_main_program()

        with open("local.main.proto", "w") as f:
            f.write(str(main_program))

        train_loop(args, main_program, word2vec_reader, py_reader, loss, 0)
    # do distribute training
    else:
        logger.info("run dist training")

        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
        trainers = int(os.environ["PADDLE_TRAINERS"])
        training_role = os.environ["PADDLE_TRAINING_ROLE"]

        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
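        # Illustrative expansion: PADDLE_PSERVER_IPS="10.0.0.1,10.0.0.2" and
        # port "6174" give pserver_endpoints="10.0.0.1:6174,10.0.0.2:6174";
        # with PADDLE_CURRENT_IP="10.0.0.1", current_endpoint="10.0.0.1:6174".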

        config = fluid.DistributeTranspilerConfig()
        config.slice_var_up = False
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=True)

        if training_role == "PSERVER":
            logger.info("run pserver")
            prog = t.get_pserver_program(current_endpoint)
            startup = t.get_startup_program(
                current_endpoint, pserver_program=prog)

            with open("pserver.main.proto.{}".format(os.getenv("CUR_PORT")),
                      "w") as f:
                f.write(str(prog))

            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif training_role == "TRAINER":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()

            with open("trainer.main.proto.{}".format(trainer_id), "w") as f:
                f.write(str(train_prog))

            train_loop(args, train_prog, word2vec_reader, py_reader, loss,
                       trainer_id)
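
The PSERVER/TRAINER split above is driven entirely by environment variables; a
launcher would set something like the following before calling train(args).
All values are illustrative, not taken from the original code:

import os

os.environ.update({
    "PADDLE_IS_LOCAL": "0",
    "PADDLE_TRAINING_ROLE": "TRAINER",  # or "PSERVER"
    "PADDLE_TRAINER_ID": "0",
    "PADDLE_TRAINERS": "2",
    "PADDLE_PSERVER_IPS": "10.0.0.1,10.0.0.2",
    "PADDLE_PSERVER_PORT": "6174",
    "PADDLE_CURRENT_IP": "10.0.0.1",
})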
Example #13
def main(args):
    """Run GraphSum model."""

    model_config = GraphSumConfig(args.config_path)
    model_config.print_config()

    gpu_id = 0
    gpu_count = fluid.core.get_cuda_device_count()
    if args.is_distributed:
        selected_gpus = os.getenv("FLAGS_selected_gpus").split(",")
        gpu_count = len(selected_gpus)
        gpu_id = int(selected_gpus[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpu_count
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    """load vocabulary"""
    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(args.vocab_path)
    symbols = {'BOS': spm.PieceToId('<S>'), 'EOS': spm.PieceToId('</S>'), 'PAD': spm.PieceToId('<PAD>'),
               'EOT': spm.PieceToId('<T>'), 'EOP': spm.PieceToId('<P>'), 'EOQ': spm.PieceToId('<Q>'),
               'UNK': spm.PieceToId('<UNK>')}
    logger.info(symbols)
    vocab_size = len(spm)

    """create transformer model"""
    graphsum = GraphSumModel(args=args, config=model_config,
                             padding_idx=symbols['PAD'],
                             bos_idx=symbols['BOS'],
                             eos_idx=symbols['EOS'],
                             tokenizer=spm)

    reader = task_reader.GraphSumReader(
        max_para_num=args.max_para_num,
        max_para_len=args.max_para_len,
        max_tgt_len=args.max_tgt_len,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        bos_idx=symbols['BOS'],
        eos_idx=symbols['EOS'],
        pad_idx=symbols['PAD'],
        n_head=model_config['num_attention_heads'])

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
        train_data_generator = reader.data_generator_with_buffer(
            data_path=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                    args.batch_size // args.max_tgt_len) // trainers_num
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num
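        # With in_tokens=True the batch size is counted in tokens, so one
        # step is assumed to cover batch_size // max_tgt_len examples,
        # e.g. 4096 // 256 = 16 examples per trainer per step.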

        if args.lr_scheduler == 'linear_warmup_decay':
            warmup_steps = int(max_train_steps * args.warmup_proportion)
        else:
            warmup_steps = args.warmup_steps

        logger.info("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Max train steps: %d" % max_train_steps)
        logger.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = graphsum.create_model(
                    pyreader_name='train_reader')
                scheduled_lr, _ = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    d_model=model_config['hidden_size'],
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    grad_norm=args.grad_norm,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=float(args.eps))
                """
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name
                    ])
                """

        # if args.verbose:
        #     if args.in_tokens:
        #         lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
        #             program=train_program,
        #             batch_size=args.batch_size // args.max_tgt_len)
        #     else:
        #         lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
        #             program=train_program, batch_size=args.batch_size)
        #     logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
        #                 (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = graphsum.create_model(
                    pyreader_name='test_reader',
                    is_prediction=args.do_dec)

        test_prog = test_prog.clone(for_test=True)
        print_model_params(test_prog)

    if args.do_dec:
        if not os.path.exists(args.decode_path):
            os.mkdir(args.decode_path)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    logger.info("args.is_distributed: %s" % str(args.is_distributed))
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        logger.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id
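        # Illustrative two-trainer environment (example values only):
        #   PADDLE_TRAINER_ENDPOINTS=10.0.0.1:6170,10.0.0.2:6170
        #   PADDLE_CURRENT_ENDPOINT=10.0.0.1:6170
        #   PADDLE_TRAINER_ID=0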

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        # init position_encoding
        enc_word_pos_emb_param = fluid.global_scope().find_var(
            model_config['enc_word_pos_embedding_name']).get_tensor()
        enc_word_pos_emb_param.set(
            position_encoding_init(model_config['max_position_embeddings'],
                                   model_config['hidden_size'] // 2), place)

        enc_sent_pos_emb_param = fluid.global_scope().find_var(
            model_config['enc_sen_pos_embedding_name']).get_tensor()
        enc_sent_pos_emb_param.set(
            position_encoding_init(model_config['max_position_embeddings'],
                                   model_config['hidden_size'] // 2), place)

        dec_word_pos_emb_param = fluid.global_scope().find_var(
            model_config['dec_word_pos_embedding_name']).get_tensor()
        dec_word_pos_emb_param.set(
            position_encoding_init(model_config['max_position_embeddings'],
                                   model_config['hidden_size']), place)
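        # The two encoder position tables are initialized at half width
        # (hidden_size // 2) while the decoder table gets the full
        # hidden_size; presumably the encoder concatenates word- and
        # sentence-level position encodings into one hidden vector.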

        if args.init_checkpoint and args.init_pretraining_params:
            logger.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)

    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()

        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1

                if steps % skip_steps == 0:
                    outputs = evaluate(args=args, exe=train_exe, program=train_program,
                                       pyreader=train_pyreader, graph_vars=graph_vars,
                                       eval_phase="train", vocab_size=vocab_size)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        logger.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                "ppl: %f, acc: %f, learning rate: %.8f, speed: %f steps/s"
                                % (current_epoch, current_example, num_train_examples,
                                   steps, outputs["loss"], outputs["ppl"], outputs["acc"],
                                   outputs["learning_rate"] if warmup_steps > 0 else args.learning_rate,
                                   args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    train_exe.run(fetch_list=[])

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path, train_program)

                    if steps % args.validation_steps == 0:
                        # evaluate dev set
                        if args.do_val:
                            test_pyreader.decorate_tensor_provider(
                                reader.data_generator(
                                    args.dev_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False,
                                    phase='dev',
                                    do_dec=args.do_dec,
                                    place=place))
                            evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader,
                                     graph_vars=test_graph_vars, eval_phase="dev",
                                     vocab_size=vocab_size, do_dec=args.do_dec,
                                     vocab_path=args.vocab_path, features=reader.get_features("dev"),
                                     decode_path=args.decode_path + "/valid_" + str(steps) + "_preds")
                        # evaluate test set
                        if args.do_test:
                            test_pyreader.decorate_tensor_provider(
                                reader.data_generator(
                                    args.test_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False,
                                    phase='test',
                                    do_dec=args.do_dec,
                                    place=place))
                            evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader,
                                     graph_vars=test_graph_vars, eval_phase="test",
                                     vocab_size=vocab_size, do_dec=args.do_dec,
                                     vocab_path=args.vocab_path, features=reader.get_features("test"),
                                     decode_path=args.decode_path + "/test_" + str(steps) + "_preds")

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if nccl2_trainer_id == 0:
        # final eval on dev set
        if args.do_val:
            test_pyreader.decorate_tensor_provider(
                reader.data_generator(
                    args.dev_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False,
                    phase='dev',
                    do_dec=args.do_dec,
                    place=place))
            logger.info("Final validation result:")
            evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader,
                     graph_vars=test_graph_vars, eval_phase="dev",
                     vocab_size=vocab_size, do_dec=args.do_dec,
                     vocab_path=args.vocab_path, features=reader.get_features("dev"),
                     decode_path=args.decode_path + "/valid_final_preds")

        # final eval on test set
        if args.do_test:
            test_pyreader.decorate_tensor_provider(
                reader.data_generator(
                    args.test_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False,
                    phase='test',
                    do_dec=args.do_dec,
                    place=place))
            logger.info("Final test result:")
            evaluate(args=args, exe=test_exe, program=test_prog, pyreader=test_pyreader,
                     graph_vars=test_graph_vars, eval_phase="test",
                     vocab_size=vocab_size, do_dec=args.do_dec,
                     vocab_path=args.vocab_path, features=reader.get_features("test"),
                     decode_path=args.decode_path + "/test_final_preds")
Example #14
def train(args):
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config)
            scheduled_lr = optimization(loss=total_loss,
                                        warmup_steps=args.warmup_steps,
                                        num_train_steps=args.num_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=args.use_fp16,
                                        loss_scaling=args.loss_scaling)

            fluid.memory_optimize(input_program=train_program,
                                  skip_opt_set=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)
    print("theoretical memory usage: ")
    if args.in_tokens:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size //
                                       args.max_seq_len))
    else:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size))

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = ErnieDataReader(filelist=args.train_filelist,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample,
                                  in_tokens=args.in_tokens,
                                  is_bidirection=args.is_bidirection)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)

    build_strategy = fluid.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=total_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy,
                                       main_program=train_program,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

    if args.valid_filelist and args.valid_filelist != "":
        predict = predict_wrapper(args,
                                  exe,
                                  ernie_config,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers
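            # Both counters are scaled by the trainer count, so steps tracks
            # global (cluster-wide) progress rather than local iterations;
            # presumably this keeps save/validation intervals comparable
            # across different world sizes.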

            if nccl2_trainer_id != 0:
                train_exe.run(fetch_list=[])
                continue

            if steps % skip_steps != 0:
                train_exe.run(fetch_list=[])
            else:
                each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                    fetch_list=[
                        next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                        scheduled_lr.name
                    ])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file, mask_type = \
                    data_reader.get_progress()
                print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    %
                    (epoch, current_file_index, total_file, steps,
                     np.mean(np.array(cost)), np.mean(np.exp(
                         np.array(lm_cost))), np.mean(np.array(acc)),
                     skip_steps / used_time, current_file, mask_type))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)

            if args.valid_filelist and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = \
                    predict()
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
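                # "global ppl" is exp(mean loss) while "batch-averaged ppl" is
                # mean(exp(loss)); by Jensen's inequality the former is never
                # larger, so a gap between the two indicates high variance in
                # the per-step LM loss.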

        except fluid.core.EOFException:
            train_pyreader.reset()
            break
Example #15
def train(args):
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = DataReader(data_dir=args.data_dir,
                             batch_size=args.batch_size,
                             in_tokens=args.in_tokens,
                             vocab_path=args.vocab_path,
                             voc_size=bert_config['vocab_size'],
                             epoch=args.epoch,
                             max_seq_len=args.max_seq_len,
                             generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    build_strategy = fluid.BuildStrategy()
    if not sys.platform == "win32":
        build_strategy.num_trainers = nccl2_num_trainers
    elif nccl2_num_trainers > 1:
        raise ValueError(
            "Windows platform doesn't support distributed training!")
    build_strategy.trainer_id = nccl2_trainer_id
    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=total_loss.name,
                                              exec_strategy=exec_strategy,
                                              build_strategy=build_strategy)
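    # build_strategy.num_trainers / trainer_id make this single-process
    # data-parallel graph act as one rank of the multi-process NCCL2 job;
    # this CompiledProgram path plays the role the ParallelExecutor fills
    # in the earlier examples.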

    if args.validation_set_dir and args.validation_set_dir != "":
        predict = predict_wrapper(args,
                                  exe,
                                  bert_config,
                                  test_prog=test_prog,
                                  data_loader=test_data_loader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_data_loader.set_batch_generator(data_reader.data_generator())
    train_data_loader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += 1
            skip_steps = args.skip_steps * nccl2_num_trainers

            if nccl2_trainer_id != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)
                continue

            if steps % args.skip_steps != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)

            else:
                fetch_list = [
                    next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                    scheduled_lr.name
                ]
                if args.use_fp16:
                    fetch_list.append(loss_scaling.name)

                if use_ngraph:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_program)
                else:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_compiled_program)

                if args.use_fp16:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs
                else:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs

                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = \
                    data_reader.get_progress()
                if args.verbose:
                    verbose = "feed_queue size: %d, " % train_data_loader.queue.size(
                    )
                    verbose += "current learning_rate: %f, " % np_lr[0]
                    if args.use_fp16:
                        verbose += "loss scaling: %f" % np_scaling[0]
                    print(verbose)

                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s" %
                    (epoch, current_file_index, total_file, steps,
                     np.mean(np.array(cost)), np.mean(np.exp(
                         np.array(lm_cost))), np.mean(np.array(acc)),
                     skip_steps / used_time, current_file))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)

            if args.validation_set_dir and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = \
                    predict()
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            train_data_loader.reset()
            break
Example #16
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        is_classify=args.is_classify,
        is_regression=args.is_regression,
        for_cn=args.for_cn,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)

        test_prog = test_prog.clone(for_test=True)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        
        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(
                        train_exe,
                        train_program,
                        train_pyreader,
                        graph_vars,
                        "train",
                        metric=args.metric,
                        is_classify=args.is_classify,
                        is_regression=args.is_regression)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"], outputs["accuracy"],
                             args.skip_steps / used_time))
                        ce_info.append(
                            [outputs["loss"], outputs["accuracy"], used_time])
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            " speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"],
                             args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path, train_program)

                    if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                        # evaluate dev set
                        if args.do_val:
                            evaluate_wrapper(args, reader, exe, test_prog,
                                             test_pyreader, graph_vars,
                                             current_epoch, steps)

                        if args.do_test:
                            predict_wrapper(args, reader, exe, test_prog,
                                            test_pyreader, graph_vars,
                                            current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
        if args.enable_ce:
            card_num = get_cards()
            ce_loss = 0
            ce_acc = 0
            ce_time = 0
            try:
                ce_loss = ce_info[-2][0]
                ce_acc = ce_info[-2][1]
                ce_time = ce_info[-2][2]
            except Exception:
                log.info("ce info error")
            log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
            log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
            log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, steps)

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.diagnostic,
                batch_size=args.batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(
            test_exe,
            test_prog,
            test_pyreader,
            graph_vars,
            is_classify=args.is_classify,
            is_regression=args.is_regression)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for qid, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(qid, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
Example #17
def main(args):
    """main func"""
    unimo_config = UNIMOConfig(args.unimo_config_path)
    if args.hidden_dropout_prob >= 0:
        unimo_config["hidden_dropout_prob"] = args.hidden_dropout_prob
    if args.attention_probs_dropout_prob >= 0:
        unimo_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
    unimo_config.print_config()

    if args.pred_batch_size <= 0:
        args.pred_batch_size = args.batch_size

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    """load vocabulary"""
    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=True)

    reader = Img2TxtReader(tokenizer, args)
    img2txt = Img2Txt(args, unimo_config, tokenizer)

    if not (args.do_train or args.do_val or args.do_test or args.do_pred):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
        train_data_generator = reader.data_generator(
            filelist=args.train_filelist,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(
            args.train_filelist)  # 566747
        max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                print("using adv_type is ", args.adv_type)
                if args.adv_type == "freelb_text":
                    train_pyreader, graph_vars = img2txt.create_model_freelb_text()
                elif args.adv_type == "freelb_image":
                    train_pyreader, graph_vars = img2txt.create_model_freelb_image()
                elif args.adv_type == "villa":
                    train_pyreader, graph_vars = img2txt.create_model_villa()
                else:
                    print(
                        "Unsupported adv_type, run model without adversial training"
                    )
                    train_pyreader, graph_vars = img2txt.create_model()

                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

    if args.do_val or args.do_test or args.do_pred:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = img2txt.create_model(
                    decoding=args.do_decode)
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))
        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            config.hierarchical_allreduce_exter_nranks = \
                trainers_num // config.hierarchical_allreduce_inter_nranks
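            # inter_nranks is the size of the inner allreduce group
            # (presumably the GPUs of one node), so exter_nranks counts the
            # groups; the asserts above guarantee the integer division is
            # exact.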

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    init_model(args, exe, train_program if args.do_train else test_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = 4 if args.use_fp16 else 2  # 2 for fp32 4 for fp16
        exec_strategy.num_iteration_per_drop_scope = min(
            args.num_iteration_per_drop_scope, args.skip_steps)

        build_strategy = fluid.BuildStrategy()
        build_strategy.remove_unnecessary_lock = False

        if args.use_fuse:
            build_strategy.fuse_all_reduce_ops = True

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.set_batch_generator(train_data_generator)
        train_resource = {
            "exe": train_exe,
            "program": train_program,
            "pyreader": train_pyreader
        }
        save_model = partial(save_checkpoint, program=train_program, exe=exe)

    test_dev_count = 1
    if args.do_val or args.do_test or args.do_pred:
        test_exe = exe
        if args.use_multi_gpu_test:
            test_dev_count = nccl2_num_trainers
        test_resource = {
            "exe": test_exe,
            "program": test_prog,
            "pyreader": test_pyreader
        }
        eval_data_generator = partial(reader.data_generator,
                                      batch_size=args.pred_batch_size,
                                      epoch=1,
                                      dev_count=test_dev_count,
                                      shuffle=False,
                                      do_decode=args.do_decode,
                                      place=place)
        eval_func = partial(img2txt.evaluate,
                            resource=test_resource,
                            graph_vars=test_graph_vars,
                            dev_count=test_dev_count,
                            output_path=args.checkpoints,
                            gpu_id=nccl2_trainer_id)
        evaluate = partial(evaluate_datasets,
                           pyreader=test_pyreader,
                           reader=reader,
                           eval_func=eval_func,
                           data_generator=eval_data_generator)

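    # Training loop: run until the data reader signals EOF, logging metrics
    # every skip_steps steps and saving/validating on the configured schedule.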
    if args.do_train:
        train_pyreader.start()
        steps = 0
        last_epoch = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()

        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                if args.save_and_valid_by_epoch:
                    suffix = "epoch_" + str(last_epoch)
                else:
                    suffix = "step_" + str(steps)
                if steps % skip_steps == 0:
                    outputs = img2txt.evaluate(train_resource, "train",
                                               graph_vars)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %.8f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

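                    # Translate the global step into an epoch number and the
                    # progress within that epoch, across all trainers.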
                    current_epoch = steps * args.batch_size * trainers_num // num_train_examples
                    current_example = steps * args.batch_size * trainers_num % num_train_examples

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "%s - epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "ppl: %f, speed: %f steps/s" %
                        (get_time(), current_epoch, current_example,
                         num_train_examples, steps, outputs["loss"],
                         outputs["ppl"], args.skip_steps / used_time))
                    time_begin = time.time()

                    if args.visualdl_log and nccl2_trainer_id == 0:
                        visuallog_dict = OrderedDict()
                        visuallog_dict["ppl"] = outputs["ppl"]
                        visualdl_log(visuallog_dict,
                                     outputs["ppl"],
                                     steps,
                                     phase='train')
                else:
                    train_exe.run(fetch_list=[])

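                # Ranks beyond the evaluation device count skip all
                # checkpoint/validation bookkeeping.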
                if nccl2_trainer_id >= test_dev_count:
                    continue

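                # Checkpoint/validation cadence: fixed step intervals, or epoch
                # boundaries when save_and_valid_by_epoch is set.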
                do_save = False
                do_eval = False
                if not args.save_and_valid_by_epoch:
                    if steps % args.save_steps == 0 and nccl2_trainer_id == 0:
                        do_save = True
                    if steps % args.validation_steps == 0:
                        do_eval = True
                else:
                    current_epoch = steps * args.batch_size * trainers_num // num_train_examples
                    if current_epoch != last_epoch:
                        if nccl2_trainer_id == 0:
                            do_save = True
                        do_eval = True

                if do_save:
                    save_model(suffix=suffix)
                if do_eval:
                    if args.do_val or args.do_test or args.do_pred:
                        evaluate(suffix=suffix)

                if args.save_and_valid_by_epoch:
                    last_epoch = current_epoch

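            # The py_reader signals end-of-data with EOFException: save a final
            # checkpoint and leave the loop.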
            except fluid.core.EOFException:
                save_model(suffix=suffix)
                train_pyreader.reset()
                break

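    # Only ranks that own a test device run the final evaluation/prediction.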
    if nccl2_trainer_id >= test_dev_count:
        return

    if args.do_val or args.do_test or args.do_pred:
        suffix = "output"
        if args.do_train:
            if not args.save_and_valid_by_epoch:
                suffix = "step_" + str(steps)
            else:
                suffix = "epoch_" + str(last_epoch)

        evaluate(suffix=suffix, do_pred=True)