コード例 #1
0
def init_predict_checkpoint(args, exe, startup_prog):
    if args.dataset in ["pathQueryWN", "pathQueryFB"]:
        assert args.sen_candli_file is not None and args.sen_trivial_file is not None, "during test, pathQuery sen_candli_file and path_trivial_file must be set "
    if not args.init_checkpoint:
        raise ValueError("args 'init_checkpoint' should be set if"
                         "only doing prediction!")
    init_checkpoint(exe,
                    args.init_checkpoint,
                    main_program=startup_prog,
                    use_fp16=args.use_fp16)
コード例 #2
0
ファイル: convert_params.py プロジェクト: wykdg/Erine_To_Bert
def convert(args):
    tf_fluid_param_name_map, tf_param_name_shape_map = parse(
        args.init_tf_checkpoint)
    program = fluid.Program()
    global_block = program.global_block()
    for param in tf_fluid_param_name_map:
        global_block.create_parameter(
            name=tf_fluid_param_name_map[param],
            shape=tf_param_name_shape_map[param],
            dtype='float32',
            initializer=fluid.initializer.Constant(value=0.0))

    place = fluid.core.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(program)

    #load the fluid model
    init_checkpoint(exe, "params", program)

    print(
        '---------------------- Converted Parameters -----------------------')
    print(
        '###### [Fluid param name] --> [TF param name]  [param shape] ######')
    print(
        '-------------------------------------------------------------------')

    nodes_value = []
    var_list = []
    for param in tf_fluid_param_name_map:

        init_value = fluid.global_scope().find_var(
            tf_fluid_param_name_map[param]).get_tensor()
        init_value = np.array(init_value)

        if param == 'cls/seq_relationship/output_weights':
            init_value = np.transpose(init_value)
        if param == 'cls/squad/output_weights':
            init_value = np.transpose(init_value)
        fluid_shape = init_value.shape
        # tf.Variable(init_value,name=param)
        n = tf.get_variable(name=param, dtype=tf.float32, shape=fluid_shape)
        var_list.append(n)
        nodes_value.append((n, init_value))
        print(tf_fluid_param_name_map[param], ' --> ', param, '  ',
              fluid_shape)
    saver = tf.train.Saver(var_list=var_list)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    ops = []
    for n, v in nodes_value:
        ops.append(tf.assign(n, v))
    sess.run(ops)

    saver.save(sess, "erine.ckpt", write_meta_graph=False, write_state=False)
コード例 #3
0
def predict_trigger(trigger_dict):
    """
    trigger predict
    """
    log = trigger_dict['log']
    args = trigger_dict['args']
    labels_map = trigger_dict['labels_map']
    ernie_config = trigger_dict['ernie_config']
    place = trigger_dict['place']
    dev_count = trigger_dict['dev_count']
    reader = trigger_dict['reader']
    startup_prog = trigger_dict['startup_prog']
    test_prog = trigger_dict['test_prog']
    test_pyreader = trigger_dict['test_pyreader']
    graph_vars = trigger_dict['graph_vars']
    nccl2_num_trainers = trigger_dict['nccl2_num_trainers']
    nccl2_trainer_id = trigger_dict['nccl2_trainer_id']
    if 'exe' in trigger_dict:
        exe = trigger_dict['exe']
    else:
        exe = None
    trigger_pred_save_path = args.trigger_pred_save_path
    if os.path.exists(trigger_pred_save_path):
        print('delete old file : %s' % trigger_pred_save_path)
        os.remove(trigger_pred_save_path)

    if args.do_test:
        # TODO 2020-11-24
        # TODO 2021-01-20 取消if not exe --  use_fp16=args.use_fp16) del exe
        if not exe:
            exe = fluid.Executor(place)
            exe.run(startup_prog)

            if args.do_val or args.do_test:
                if not args.init_checkpoint:
                    raise ValueError("args 'init_checkpoint' should be set if"
                                     "only doing validation or testing!")
                init_checkpoint(exe,
                                args.init_checkpoint,
                                main_program=startup_prog,
                                use_fp16=args.use_fp16)

        log.error(
            "********************** trigger predict begin **********************"
        )
        test_ret = predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)
        del exe
コード例 #4
0
def init_train_checkpoint(args, exe, startup_prog):
    if args.init_checkpoint and args.init_pretraining_params:
        logger.info(
            "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
            "both are set! Only arg 'init_checkpoint' is made valid.")
    if args.init_checkpoint:
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16,
                        print_var_verbose=False)
    elif args.init_pretraining_params:
        init_pretraining_params(exe,
                                args.init_pretraining_params,
                                main_program=startup_prog,
                                use_fp16=args.use_fp16)
コード例 #5
0
def re_use_exe(tri_dict):
    startup_prog = tri_dict["startup_prog"]
    args = tri_dict["args"]
    place = tri_dict["place"]
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)
    return exe
コード例 #6
0
def re_use_some(tri_dict, suf):
    ernie_config = tri_dict["ernie_config"]
    startup_prog = tri_dict["startup_prog"]
    args = tri_dict["args"]
    labels_map = tri_dict["labels_map"]
    place = tri_dict["place"]
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # TODO pyreader_name 再次调整为不同
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader_role' + suf,
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)
    return reader, startup_prog, test_prog, test_pyreader, graph_vars, exe
コード例 #7
0
def main(args):
    if not (args.do_train or args.do_eval or args.do_predict):
        raise ValueError("For args `do_train`, `do_eval` and `do_predict`, at "
                         "least one of them must be True.")
    if args.do_predict and not args.predict_dir:
        raise ValueError("args 'predict_dir' should be given when doing predict")

    if not os.path.exists(args.predict_dir):
        os.makedirs(args.predict_dir)

    xlnet_config = XLNetConfig(args.model_config_path)
    xlnet_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
      "mnli_matched": reader.MnliMatchedProcessor,
      "mnli_mismatched": reader.MnliMismatchedProcessor,
      'sts-b': reader.StsbProcessor,
      'imdb': reader.ImdbProcessor,
      "yelp5": reader.Yelp5Processor
    }

    processor = processors[task_name](args)

    label_list = processor.get_labels() if not args.is_regression else None
    num_labels = len(label_list) if label_list is not None else None
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.train_batch_size,
            is_regression=args.is_regression,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle)

        num_train_examples = processor.get_num_examples(phase='train')
        print("Device count: %d" % dev_count)
        print("Max num of epoches: %d" % args.epoch)
        print("Num of train examples: %d" % num_train_examples)
        print("Num of train steps: %d" % args.train_steps)
        print("Num of warmup steps: %d" % args.warmup_steps)

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args,
                    xlnet_config=xlnet_config,
                    n_class=num_labels)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=args.warmup_steps,
                    num_train_steps=args.train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    lr_layer_decay_rate=args.lr_layer_decay_rate,
                    scheduler=args.lr_scheduler)

    if args.do_eval:
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args,
                    xlnet_config=xlnet_config,
                    n_class=num_labels)

        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.eval_batch_size,
                is_regression=args.is_regression,
                phase=args.eval_split,
                epoch=1,
                dev_count=1,
                shuffle=False), place)

    if args.do_predict:
        predict_prog = fluid.Program()
        with fluid.program_guard(predict_prog, startup_prog):
            with fluid.unique_name.guard():
                predict_data_loader, loss, logits, num_seqs, label_ids = create_model(
                    args,
                    xlnet_config=xlnet_config,
                    n_class=num_labels)

        predict_prog = predict_prog.clone(for_test=True)
        predict_data_loader.set_batch_generator(
            processor.data_generator(
                batch_size=args.predict_batch_size,
                is_regression=args.is_regression,
                phase=args.eval_split,
                epoch=1,
                dev_count=1,
                shuffle=False), place)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog)
    elif args.do_eval or args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        build_strategy = fluid.BuildStrategy()

        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy, train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                  train_data_generator)

        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
                 loss_name=loss.name, build_strategy=build_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)


    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_num_seqs, total_time = [], [], 0.0
        throughput = []
        ce_info = []
        while steps < args.train_steps:
            try:
                time_begin = time.time()
                steps += 1
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program, fetch_list=fetch_list)

                time_end = time.time()
                used_time = time_end - time_begin
                total_time += used_time

                if steps % args.skip_steps == 0:
                    np_loss, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f" % np_lr[0]
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress(
                    )

                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}".format(
                           current_epoch, current_example, num_train_examples,
                           steps, np.sum(total_cost) / np.sum(total_num_seqs))
                    ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), used_time])
                    if steps > 0 :
                        throughput.append( args.skip_steps / total_time)
                        log_record = log_record + ", speed: %f steps/s" % (args.skip_steps / total_time)
                        print(log_record)
                    else:
                        print(log_record)
                    total_cost, total_num_seqs, total_time = [], [], 0.0

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    print("Average throughtput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_eval:
                        evaluate(exe, dev_prog, dev_data_loader,
                                 [loss.name,  num_seqs.name, logits.name, label_ids.name],
                                 args.eval_split, processor.get_num_examples(phase=args.eval_split))
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break
        if args.enable_ce:
            card_num = get_cards()
            ce_cost = 0
            ce_time = 0
            try:
                ce_cost = ce_info[-2][0]
                ce_time = ce_info[-2][1]
            except:
                print("ce info error")
            print("kpis\ttrain_duration_%s_card%s\t%s" %
                (args.task_name.replace("-", "_"), card_num, ce_time))
            print("kpis\ttrain_cost_%s_card%s\t%f" %
                (args.task_name.replace("-", "_"), card_num, ce_cost))


    # final eval on dev set
    if args.do_eval:
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, num_seqs.name, logits.name, label_ids], args.eval_split,
                 processor.get_num_examples(phase=args.eval_split))

    # final eval on test set
    if args.do_predict:
        predict(exe, predict_prog, predict_data_loader, task_name, label_list, [logits.name])
コード例 #8
0
ファイル: mtl_run.py プロジェクト: 0YuanZhang0/PALM
def train(multitask_config):

    # load task config
    print("Loading multi_task configure...................")
    args = PDConfig(yaml_file=[multitask_config])
    args.build()

    index = 0
    reader_map_task = dict()
    task_args_list = list()
    reader_args_list = list()
    id_map_task = {index: args.main_task}
    print("Loading main task configure....................")
    main_task_name = args.main_task
    task_config_files = [
        i for i in os.listdir(TASKSET_PATH) if i.endswith('.yaml')
    ]
    main_config_list = [
        config for config in task_config_files
        if config.split('.')[0] == main_task_name
    ]
    main_args = None
    for config in main_config_list:
        main_yaml = os.path.join(TASKSET_PATH, config)
        main_args = PDConfig(yaml_file=[multitask_config, main_yaml])
        main_args.build()
        main_args.Print()
        if not task_args_list or main_task_name != task_args_list[-1][0]:
            task_args_list.append((main_task_name, main_args))
        reader_args_list.append((config.strip('.yaml'), main_args))
        reader_map_task[config.strip('.yaml')] = main_task_name

    print("Loading auxiliary tasks configure...................")
    aux_task_name_list = args.auxiliary_task.strip().split()
    for aux_task_name in aux_task_name_list:
        index += 1
        id_map_task[index] = aux_task_name
        print("Loading %s auxiliary tasks configure......." % aux_task_name)
        aux_config_list = [
            config for config in task_config_files
            if config.split('.')[0] == aux_task_name
        ]
        for aux_yaml in aux_config_list:
            aux_yaml = os.path.join(TASKSET_PATH, aux_yaml)
            aux_args = PDConfig(yaml_file=[multitask_config, aux_yaml])
            aux_args.build()
            aux_args.Print()
            if aux_task_name != task_args_list[-1][0]:
                task_args_list.append((aux_task_name, aux_args))
            reader_args_list.append((aux_yaml.strip('.yaml'), aux_args))
            reader_map_task[aux_yaml.strip('.yaml')] = aux_task_name

    # import tasks reader module and build joint_input_shape
    input_shape_list = []
    reader_module_dict = {}
    input_shape_dict = {}
    for params in task_args_list:
        task_reader_mdl = "%s_reader" % params[0]
        reader_module = importlib.import_module(task_reader_mdl)
        reader_servlet_cls = getattr(reader_module, "get_input_shape")
        reader_input_shape = reader_servlet_cls(params[1])
        reader_module_dict[params[0]] = reader_module
        input_shape_list.append(reader_input_shape)
        input_shape_dict[params[0]] = reader_input_shape
    train_input_shape, test_input_shape, task_map_id = joint_reader.joint_input_shape(
        input_shape_list)

    # import backbone model
    backbone_mdl = args.backbone_model
    backbone_cls = "Model"
    backbone_module = importlib.import_module(backbone_mdl)
    backbone_servlet = getattr(backbone_module, backbone_cls)

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    startup_prog = fluid.default_startup_program()

    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        #create joint pyreader
        print('creating readers...')
        gens = []
        main_generator = ""
        for params in reader_args_list:
            generator_cls = getattr(
                reader_module_dict[reader_map_task[params[0]]],
                "DataProcessor")
            generator_inst = generator_cls(params[1])
            reader_generator = generator_inst.data_generator(
                phase='train', shuffle=True, dev_count=dev_count)
            if not main_generator:
                main_generator = generator_inst
            gens.append((reader_generator, params[1].mix_ratio,
                         reader_map_task[params[0]]))
        joint_generator, train_pyreader, model_inputs = create_reader(
            "train_reader", train_input_shape, True, task_map_id, gens)

        train_pyreader.decorate_tensor_provider(joint_generator)

        # build task inputs
        task_inputs_list = []
        main_test_input = []
        task_id = model_inputs[0]
        backbone_inputs = model_inputs[task_map_id[0][0]:task_map_id[0][1]]
        for i in range(1, len(task_map_id)):
            task_inputs = backbone_inputs + model_inputs[
                task_map_id[i][0]:task_map_id[i][1]]
            task_inputs_list.append(task_inputs)

        # build backbone model
        print('building model backbone...')
        conf = vars(args)
        if args.pretrain_config_path is not None:
            model_conf = JsonConfig(args.pretrain_config_path).asdict()
            for k, v in model_conf.items():
                if k in conf:
                    assert k == conf[
                        k], "ERROR: argument {} in pretrain_model_config is NOT consistent with which in main.yaml"
            conf.update(model_conf)

        backbone_inst = backbone_servlet(conf, is_training=True)

        print('building task models...')
        num_train_examples = main_generator.get_num_examples()
        if main_args.in_tokens:
            max_train_steps = int(main_args.epoch * num_train_examples) // (
                main_args.batch_size // main_args.max_seq_len) // dev_count
        else:
            max_train_steps = int(main_args.epoch * num_train_examples) // (
                main_args.batch_size) // dev_count
        mix_ratio_list = [
            task_args[1].mix_ratio for task_args in task_args_list
        ]
        args.max_train_steps = int(max_train_steps *
                                   (sum(mix_ratio_list) / main_args.mix_ratio))
        print("Max train steps: %d" % max_train_steps)

        build_strategy = fluid.BuildStrategy()
        train_program = fluid.default_main_program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():

                backbone_inst.build_model(backbone_inputs)
                all_loss_list = []

                for i in range(len(task_args_list)):
                    task_name = task_args_list[i][0]
                    task_args = task_args_list[i][1]

                    if hasattr(task_args, 'paradigm'):
                        task_net = task_args.paradigm
                    else:
                        task_net = task_name

                    task_net_mdl = importlib.import_module(task_net)
                    task_net_cls = getattr(task_net_mdl, "create_model")
                    output_tensor = task_net_cls(task_inputs_list[i],
                                                 base_model=backbone_inst,
                                                 is_training=True,
                                                 args=task_args)
                    loss_cls = getattr(task_net_mdl, "compute_loss")
                    task_loss = loss_cls(output_tensor, task_args)
                    all_loss_list.append(task_loss)
                    num_seqs = output_tensor['num_seqs']

                task_one_hot = fluid.layers.one_hot(task_id,
                                                    len(task_args_list))
                all_loss = fluid.layers.concat(all_loss_list, axis=0)
                loss = fluid.layers.reduce_sum(task_one_hot * all_loss)

                programs = [train_program, startup_prog]
                optimizer_mdl = importlib.import_module(args.optimizer)
                optimizer_inst = getattr(optimizer_mdl, "optimization")
                optimizer_inst(loss, programs, args=args)

                loss.persistable = True
                num_seqs.persistable = True

                ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
                ema.update()

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)

    if args.do_predict:
        conf = vars(args)
        if args.pretrain_config_path is not None:
            model_conf = JsonConfig(args.pretrain_config_path).asdict()
            for k, v in model_conf.items():
                if k in conf:
                    assert v == conf[
                        k], "ERROR: argument {} in pretrain_model_config is NOT consistent with which in main.yaml".format(
                            k)
            conf.update(model_conf)
        mod = reader_module_dict[main_task_name]
        DataProcessor = getattr(mod, 'DataProcessor')
        predict_processor = DataProcessor(main_args)
        test_generator = predict_processor.data_generator(phase='predict',
                                                          shuffle=False,
                                                          dev_count=dev_count)

        new_test_input_shape = input_shape_dict[main_task_name][1][
            'backbone'] + input_shape_dict[main_task_name][1]['task']
        assert new_test_input_shape == test_input_shape
        build_strategy = fluid.BuildStrategy()
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                placeholder = Placeholder(test_input_shape)
                test_pyreader, model_inputs = placeholder.build(
                    capacity=100, reader_name="test_reader")

                test_pyreader.decorate_tensor_provider(test_generator)

                # create model
                backbone_inst = backbone_servlet(conf, is_training=False)

                backbone_inst.build_model(model_inputs)

                task_net_mdl = importlib.import_module(main_task_name)
                task_net_cls = getattr(task_net_mdl, "create_model")
                postprocess = getattr(task_net_mdl, "postprocess")
                global_postprocess = getattr(task_net_mdl,
                                             "global_postprocess")
                output_tensor = task_net_cls(model_inputs,
                                             base_model=backbone_inst,
                                             is_training=False,
                                             args=main_args)

                if 'ema' not in dir():
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)

                pred_fetch_names = []
                fetch_vars = []
                for i, j in output_tensor.items():
                    pred_fetch_names.append(i)
                    fetch_vars.append(j)
                for var in fetch_vars:
                    var.persistable = True
                pred_fetch_list = [i.name for i in fetch_vars]

        test_prog = test_prog.clone(for_test=True)
        test_compiled_program = fluid.CompiledProgram(
            test_prog).with_data_parallel(build_strategy=build_strategy)

    exe.run(startup_prog)

    if args.do_train:
        if args.pretrain_model_path:
            init_pretraining_params(exe,
                                    args.pretrain_model_path,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
        if args.checkpoint_path:
            if os.path.exists(args.checkpoint_path):
                init_checkpoint(exe,
                                args.checkpoint_path,
                                main_program=startup_prog,
                                use_fp16=args.use_fp16)
            else:
                os.makedirs(args.checkpoint_path)

    elif args.do_predict:
        if not args.checkpoint_path:
            raise ValueError("args 'checkpoint_path' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.checkpoint_path,
                        main_program=test_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        print('start training...')
        train_pyreader.start()

        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, num_seqs.name, task_id.name]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    np_loss, np_num_seqs, np_task_id = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = main_generator.get_train_progress(
                    )

                    cur_task_name = id_map_task[np_task_id[0][0]]
                    print(
                        "epoch: %d, task_name: %s, progress: %d/%d, step: %d, loss: %f, "
                        "speed: %f steps/s" %
                        (epoch, cur_task_name, current_example,
                         num_train_examples, steps, np.sum(total_cost) /
                         np.sum(total_num_seqs), args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoint_path,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                if steps == max_train_steps:
                    save_path = os.path.join(args.checkpoint_path,
                                             "step_" + str(steps) + "_final")
                    fluid.io.save_persistables(exe, save_path, train_program)
                    break
            except paddle.fluid.core.EOFException as err:
                save_path = os.path.join(args.checkpoint_path,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        print('start predicting...')
        cnt = 0
        if args.use_ema:
            with ema.apply(exe):
                test_pyreader.start()
                pred_buf = []
                while True:
                    try:
                        fetch_res = exe.run(fetch_list=pred_fetch_list,
                                            program=test_compiled_program)
                        cnt += 1
                        if cnt % 200 == 0:
                            print('predicting {}th batch...'.format(cnt))
                        fetch_dict = {}
                        for key, val in zip(pred_fetch_names, fetch_res):
                            fetch_dict[key] = val
                        res = postprocess(fetch_dict)
                        if res is not None:
                            pred_buf.extend(res)
                    except fluid.core.EOFException:
                        test_pyreader.reset()
                        break
                global_postprocess(pred_buf, predict_processor, args,
                                   main_args)
        else:
            test_pyreader.start()
            pred_buf = []
            while True:
                try:
                    fetch_res = exe.run(fetch_list=pred_fetch_list,
                                        program=test_compiled_program)
                    cnt += 1
                    if cnt % 200 == 0:
                        print('predicting {}th batch...'.format(cnt))
                    fetch_dict = {}
                    for key, val in zip(pred_fetch_names, fetch_res):
                        fetch_dict[key] = val
                    res = postprocess(fetch_dict)
                    if res is not None:
                        pred_buf.extend(res)
                except fluid.core.EOFException:
                    test_pyreader.reset()
                    break
            global_postprocess(pred_buf, predict_processor, args, main_args)
コード例 #9
0
ファイル: server.py プロジェクト: Zrachel/ERNIE
ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config()
test_prog = fluid.Program()

with fluid.program_guard(test_prog, startup_prog):
    with fluid.unique_name.guard():
        _, graph_vars = create_model_predict(args,
                                             ernie_config=ernie_config,
                                             is_prediction=True)
LABELMODE = [str(x) for x in range(args.num_labels - 1)]

test_prog = test_prog.clone(for_test=True)
exe.run(startup_prog)

init_checkpoint(exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)

reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                    label_map_config=args.label_map_config,
                                    max_seq_len=args.max_seq_len,
                                    do_lower_case=args.do_lower_case,
                                    in_tokens=args.in_tokens,
                                    random_seed=args.random_seed)

#print(datetime.datetime.now())


def rm_space(line):
    line = re.sub(ur'(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])', '', line)
    return line
コード例 #10
0
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = task_reader.MisspellingReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        tokenizer=args.tokenizer,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        log.info(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                        % (current_epoch, current_example, num_train_examples,
                           steps, loss, f1, precision, recall,
                           args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0 and steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    latest_path = os.path.join(
                        args.checkpoints, "latest"
                    )  # Always save the current copy and cover with the latest copy
                    fluid.io.save_persistables(exe, save_path, train_program)
                    fluid.io.save_persistables(exe, latest_path, train_program)

                if nccl2_trainer_id == 0 and steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(reader, exe, test_prog, test_pyreader,
                                         graph_vars, current_epoch, steps)
                    # evaluate test set
                    if args.do_test:
                        predict_wrapper(reader, exe, test_prog, test_pyreader,
                                        graph_vars, current_epoch, steps)

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')

    if nccl2_trainer_id == 0 and args.do_test:
        current_example, current_epoch = reader.get_train_progress()
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, 'final')
コード例 #11
0
def train(args):
    ernie_config = ErnieConfig(args.ernie_config)
    ernie_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, num_seqs = create_model(
                    ernie_config=ernie_config, is_training=True)

                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    ernie_config=ernie_config, is_training=False)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              exec_strategy=exec_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

        train_data_loader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name,
                            loss_scaling.name
                        ]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        np_loss, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f " % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break

    if args.do_predict:
        input_files = []
        for input_pattern in args.predict_file:
            input_files.extend(glob.glob(input_pattern))
        assert len(input_files) > 0, 'Can not find predict_file {}'.format(
            args.predict_file)
        for input_file in input_files:
            print('Run prediction on {}'.format(input_file))
            prefix = os.path.basename(input_file)
            prefix = re.sub('.json', '', prefix)

            test_data_loader.set_batch_generator(
                processor.data_generator(data_path=input_file,
                                         batch_size=args.batch_size,
                                         phase='predict',
                                         shuffle=False,
                                         dev_count=1,
                                         epoch=1), place)

            predict(exe,
                    test_prog,
                    test_data_loader, [
                        unique_ids.name, start_logits.name, end_logits.name,
                        num_seqs.name
                    ],
                    processor,
                    prefix=prefix)
コード例 #12
0
def main(args): 
    """main function"""
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda: 
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else: 
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    paradigm_inst = define_paradigm.Paradigm(task_name)

    processors = {
        'udc': reader.UDCProcessor,
        'swda': reader.SWDAProcessor,
        'mrda': reader.MRDAProcessor,
        'atis_slot': reader.ATISSlotProcessor,
        'atis_intent': reader.ATISIntentProcessor,
        'dstc2': reader.DSTC2Processor,
    }
    in_tokens = {
        'udc': True,
        'swda': True,
        'mrda': True,
        'atis_slot': False,
        'atis_intent': True,
        'dstc2': True,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case, 
                                      in_tokens=in_tokens[task_name],
                                      task_name=task_name, 
                                      random_seed=args.random_seed)

    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test): 
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train: 
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)
        num_train_examples = processor.get_num_examples(phase='train')

        if in_tokens[task_name]: 
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else: 
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None:
            train_program.random_seed = args.random_seed
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                results = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels,
                    paradigm_inst=paradigm_inst)
                train_pyreader = results.get("pyreader", None)
                loss = results.get("loss", None)
                probs = results.get("probs", None)
                accuracy = results.get("accuracy", None)
                num_seqs = results.get("num_seqs", None)
                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                if accuracy is not None: 
                    skip_opt_set = [loss.name, probs.name, accuracy.name, num_seqs.name]
                else: 
                    skip_opt_set = [loss.name, probs.name, num_seqs.name]
                fluid.memory_optimize(
                    input_program=train_program, 
                    skip_opt_set=skip_opt_set)

        if args.verbose: 
            if in_tokens[task_name]: 
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else: 
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_results = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels,
                    paradigm_inst=paradigm_inst)
                test_pyreader = test_results.get("pyreader", None)
                loss = test_results.get("loss", None)
                probs = test_results.get("probs", None)
                accuracy = test_results.get("accuracy", None)
                num_seqs = test_results.get("num_seqs", None)
        test_prog = test_prog.clone(for_test=True)
    
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                  "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                  "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe, 
                args.init_checkpoint, 
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe, 
                args.init_pretraining_params, 
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test: 
        if not args.init_checkpoint: 
            raise ValueError("args 'init_checkpoint' should be set if"
                    "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train: 
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else: 
        train_exe = None

    if args.do_val or args.do_test:  
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)
   
    if args.do_train: 
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        ce_info = []
        while True:
            try: 
                steps += 1
                if steps % args.skip_steps == 0: 
                    if warmup_steps <= 0: 
                        if accuracy is not None: 
                            fetch_list = [loss.name, accuracy.name, num_seqs.name]
                        else: 
                            fetch_list = [loss.name, num_seqs.name]
                    else: 
                        if accuracy is not None:
                            fetch_list = [
                                loss.name, accuracy.name, scheduled_lr.name,
                                num_seqs.name
                            ]
                        else: 
                            fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else: 
                    fetch_list = []
                if accuracy is not None: 
                    fetch_test_list = [loss.name, accuracy.name, num_seqs.name]
                else: 
                    fetch_test_list = [loss.name, num_seqs.name]

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0: 
                    if warmup_steps <= 0: 
                        if accuracy is not None: 
                            np_loss, np_acc, np_num_seqs = outputs
                        else: 
                            np_loss, np_num_seqs = outputs
                    else: 
                        if accuracy is not None:
                            np_loss, np_acc, np_lr, np_num_seqs = outputs
                        else: 
                            np_loss, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs) 
                    total_num_seqs.extend(np_num_seqs)
                    if accuracy is not None: 
                        total_acc.extend(np_acc * np_num_seqs)
                    
                    if args.verbose: 
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose) 

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    if accuracy is not None: 
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (current_time, current_epoch, current_example, num_train_examples,
                               steps, np.sum(total_cost) / np.sum(total_num_seqs),
                               np.sum(total_acc) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), args.skip_steps / used_time])
                    else: 
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "speed: %f steps/s" %
                            (current_time, current_epoch, current_example, num_train_examples,
                            steps, np.sum(total_cost) / np.sum(total_num_seqs),
                            args.skip_steps / used_time))
                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), args.skip_steps / used_time])
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                if steps % args.validation_steps == 0: 
                    #evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(  
                                batch_size=args.batch_size,
                                phase='dev',
                                epoch=1,
                                shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "dev")
                    #evaluate test set
                    if args.do_test: 
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='test',
                                epoch=1,
                                shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
    if args.do_train and args.enable_ce:
        card_num = get_cards()
        print("zytest_card_num", card_num)
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except:
            print("ce info error")
        print("kpis\teach_step_duration_%s_card%s\t%s" %
                (task_name, card_num, ce_time))
        print("kpis\ttrain_loss_%s_card%s\t%f" %
            (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" %
            (task_name, card_num, ce_acc))

    #final eval on dev set
    if args.do_val: 
        test_pyreader.decorate_tensor_provider( 
            processor.data_generator( 
                batch_size=args.batch_size, phase='dev', epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "dev")

    #final eval on test set
    if args.do_test: 
        test_pyreader.decorate_tensor_provider( 
            processor.data_generator(
                batch_size=args.batch_size,
                phase='test',
                epoch=1,
                shuffle=False)) 
        print("Final test result:") 
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "test")
コード例 #13
0
def main(args):
    """main function"""
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                        label_map_config=args.label_map_config,
                                        max_seq_len=args.max_seq_len,
                                        do_lower_case=args.do_lower_case,
                                        in_tokens=args.in_tokens,
                                        random_seed=args.random_seed,
                                        tokenizer=args.tokenizer,
                                        is_classify=args.is_classify,
                                        is_regression=args.is_regression,
                                        for_cn=args.for_cn,
                                        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size
    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        """
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed
        """
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)

        test_prog = test_prog.clone(for_test=True)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                              main_program=test_prog,
                                              share_vars_from=train_exe)

    steps = 10000
    current_epoch = 1
    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        previous_eval_acc = 0.80
        previous_train_acc = 0.90
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric,
                                       is_classify=args.is_classify,
                                       is_regression=args.is_regression)
                    acc = outputs["accuracy"]
                    if acc > previous_train_acc or acc > 0.95:
                        print(
                            "previous train accuracy is %f and current train accuracy is %f "
                            % (previous_train_acc, acc))
                        previous_train_acc = acc
                        eval_acc = evaluate_wrapper(args, reader, exe,
                                                    test_prog, test_pyreader,
                                                    graph_vars, current_epoch,
                                                    steps)
                        print(
                            "previous evaluate accuracy is %f and current evaluate accuracy is %f "
                            % (previous_eval_acc, eval_acc))
                        if eval_acc > previous_eval_acc:
                            previous_eval_acc = eval_acc
                            save_path = os.path.join(
                                args.checkpoints,
                                "evalacc_" + str(eval_acc).split('.')[1])
                            fluid.io.save_persistables(exe, save_path,
                                                       train_program)
                            predict_wrapper(args,
                                            reader,
                                            exe,
                                            test_prog,
                                            test_pyreader,
                                            graph_vars,
                                            current_epoch,
                                            steps="evalacc_" +
                                            str(eval_acc).split('.')[1])
                            print(
                                "predict and save model!!!!!!!!!!!!!!!!!!!!!!!!!! in %s"
                                % (save_path))
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, outputs["loss"], outputs["accuracy"],
                         args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])

                    time_begin = time.time()

                # if steps % args.save_steps == 0:
                #     save_path = os.path.join(args.checkpoints,
                #                              "step_" + str(steps))
                #     fluid.io.save_persistables(exe, save_path, train_program)

                # if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                #     # evaluate dev set
                #     if args.do_val:
                #         ret=evaluate_wrapper(args, reader, exe, test_prog,
                #                          test_pyreader, graph_vars,
                #                          current_epoch, steps)

                #     if args.do_test:
                #         predict_wrapper(args, reader, exe,
                #                 test_prog, test_pyreader, graph_vars,
                #                 current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    # if args.do_val:
    #     evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
    #                      graph_vars, current_epoch, steps)

    # final eval on test set
    steps = 0
    # if args.do_test:
    #     current_epoch = 0
    #     predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
    #                     current_epoch, steps)

    # final eval on dianostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        print("Final diagnostic")
        qids, preds, probs = predict(test_exe,
                                     test_prog,
                                     test_pyreader,
                                     graph_vars,
                                     is_classify=args.is_classify,
                                     is_regression=args.is_regression)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        print("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
コード例 #14
0
def train(args):
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = DataReader(data_dir=args.data_dir,
                             batch_size=args.batch_size,
                             in_tokens=args.in_tokens,
                             vocab_path=args.vocab_path,
                             voc_size=bert_config['vocab_size'],
                             epoch=args.epoch,
                             max_seq_len=args.max_seq_len,
                             generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    build_strategy = fluid.BuildStrategy()
    if not sys.platform == "win32":
        build_strategy.num_trainers = nccl2_num_trainers
    elif nccl2_num_trainers > 1:
        raise ValueError(
            "Windows platform doesn't support distributed training!")
    build_strategy.trainer_id = nccl2_trainer_id
    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=total_loss.name,
                                              exec_strategy=exec_strategy,
                                              build_strategy=build_strategy)

    if args.validation_set_dir and args.validation_set_dir != "":
        predict = predict_wrapper(args,
                                  exe,
                                  bert_config,
                                  test_prog=test_prog,
                                  data_loader=test_data_loader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_data_loader.set_batch_generator(data_reader.data_generator())
    train_data_loader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += 1
            skip_steps = args.skip_steps * nccl2_num_trainers

            if nccl2_trainer_id != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)
                continue

            if steps % args.skip_steps != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)

            else:
                fetch_list = [
                    next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                    scheduled_lr.name
                ]
                if args.use_fp16:
                    fetch_list.append(loss_scaling.name)

                if use_ngraph:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_program)
                else:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_compiled_program)

                if args.use_fp16:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs
                else:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs

                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = data_reader.get_progress(
                )
                if args.verbose:
                    verbose = "feed_queue size: %d, " % train_data_loader.queue.size(
                    )
                    verbose += "current learning_rate: %f, " % np_lr[0]
                    if args.use_fp16:
                        verbose += "loss scaling: %f" % np_scaling[0]
                    print(verbose)

                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s" %
                    (epoch, current_file_index, total_file, steps,
                     np.mean(np.array(cost)), np.mean(np.exp(
                         np.array(lm_cost))), np.mean(np.array(acc)),
                     skip_steps / used_time, current_file))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)

            if args.validation_set_dir and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            train_data_loader.reset()
            break
コード例 #15
0
def train(args):
    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    xlnet_config = XLNetConfig(args.model_config_path)
    xlnet_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(spiece_model_file=args.spiece_model_file,
                              uncased=args.uncased,
                              max_seq_length=args.max_seq_length,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.train_batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        print("Device count: %d" % dev_count)
        print("Max num of epoches: %d" % args.epoch)
        print("Num of train examples: %d" % num_train_examples)
        print("Num of train steps: %d" % args.train_steps)
        print("Num of warmup steps: %d" % args.warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss = create_model(
                    xlnet_config=xlnet_config, is_training=True)

                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=args.warmup_steps,
                    num_train_steps=args.train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    lr_layer_decay_rate=args.lr_layer_decay_rate,
                    scheduler=args.lr_scheduler)

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, predictions = create_model(
                    xlnet_config=xlnet_config, is_training=False)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog)

    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        build_strategy = fluid.BuildStrategy()
        # These two flags must be set in this model for correctness
        build_strategy.fuse_all_optimizer_ops = True
        build_strategy.enable_inplace = False
        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           build_strategy=build_strategy,
                                           main_program=train_program)

        train_data_loader.set_batch_generator(train_data_generator, place)

        train_data_loader.start()
        steps = 0
        total_cost = []
        time_begin = time.time()
        print("Begin to train model  ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        while steps < args.train_steps:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, scheduled_lr.name]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    np_loss, np_lr = outputs
                    total_cost.extend(np_loss)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f " % np_lr[0]
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.mean(total_cost), args.skip_steps / used_time))
                    total_cost = []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == args.train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_data_loader.reset()
                break
        print("Finish model training ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    if args.do_predict:
        print("Begin to do prediction  ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        test_data_loader.set_batch_generator(
            processor.data_generator(data_path=args.predict_file,
                                     batch_size=args.predict_batch_size,
                                     phase='predict',
                                     shuffle=False,
                                     dev_count=1,
                                     epoch=1), place)

        predict(exe,
                test_prog,
                test_data_loader, [
                    predictions['unique_ids'].name,
                    predictions['start_top_log_probs'].name,
                    predictions['start_top_index'].name,
                    predictions['end_top_log_probs'].name,
                    predictions['end_top_index'].name,
                    predictions['cls_logits'].name
                ],
                processor,
                name='')

        print("Finish prediction ...")
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
コード例 #16
0
ファイル: base_squad.py プロジェクト: DeepLearnXMU/IMM
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(
        vocab_path=args.vocab_path,
        do_lower_case=args.do_lower_case,
        max_seq_length=args.max_seq_len,
        in_tokens=args.in_tokens,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        adv_text_path=args.adv_text_path)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)

                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training:  %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    is_training=False)

                fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
                    start_logits.name, end_logits.name, num_seqs.name])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()

        best_f1 = -1
        while steps < max_train_steps:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if (steps % args.save_steps == 0 or steps == max_train_steps) and steps > int(max_train_steps/3.0):
                #if (steps % args.save_steps == 0 or steps == max_train_steps):
                    if args.do_predict:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1))
                        adv_f1 = predict(exe, test_prog, test_pyreader, [
                            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
                        ], processor)
                        # print(adv_f1)
                        # continue

                    # if steps != max_train_steps:
                    if adv_f1 > best_f1:
                        best_f1 = adv_f1
                        save_path = os.path.join(args.checkpoints,
                                                "step_best")
                        print("best adv model saved")
                    # else:
                    #     save_path = os.path.join(args.checkpoints,
                    #                             "step_last")
                        fluid.io.save_persistables(exe, save_path, train_program)
                    test_pyreader.decorate_tensor_provider(
                                processor.data_generator(
                                    data_path=args.predict_file.replace("dev", "test"),
                                    batch_size=args.batch_size,
                                    phase='predict',
                                    shuffle=False,
                                    dev_count=1,
                                    epoch=1))
                    test_f1 = predict(exe, test_prog, test_pyreader, [
                        unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
                    ], processor, args.predict_file.replace("dev", "test"))
                    print("This is the test score.")

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict and not args.do_train:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                data_path=args.predict_file,
                batch_size=args.batch_size,
                phase='predict',
                shuffle=False,
                dev_count=1,
                epoch=1))

        predict(exe, test_prog, test_pyreader, [
            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
        ], processor)
コード例 #17
0
ファイル: inference_de.py プロジェクト: quyingqi/Research
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = reader_de_infer.ClassifyReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        q_max_seq_len=args.q_max_seq_len,
        p_max_seq_len=args.p_max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        for_cn=args.for_cn,
        task_id=args.task_id)

    assert args.test_save is not None
    startup_prog = fluid.Program()

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, graph_vars = create_model(
                args,
                pyreader_name='test_reader',
                ernie_config=ernie_config,
                batch_size=args.batch_size,
                is_prediction=True)

    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if not args.init_checkpoint:
        raise ValueError("args 'init_checkpoint' should be set if"
                         "only doing validation or testing!")
    init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    test_sets = args.test_set.split(',')
    save_dirs = args.test_save.split(',')
    assert len(test_sets) == len(save_dirs)
    batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size

    for test_f, save_f in zip(test_sets, save_dirs):
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(test_f,
                                  batch_size=batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        save_path = save_f
        log.info("testing {}, save to {}".format(test_f, save_path))
        predict(args,
                exe,
                test_prog,
                test_pyreader,
                graph_vars,
                output_item=args.output_item,
                output_file_name=args.output_file_name,
                hidden_size=ernie_config['hidden_size'])
コード例 #18
0
def main(args):
    """main"""
    model_config = UNIMOConfig(args.unimo_config_path)
    model_config.print_config()

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=args.do_lower_case)

    if not (args.do_train or args.do_val or args.do_test or args.do_test_hard):
        raise ValueError(
            "For args `do_train`, `do_val`, `do_test`, `do_test_hard`, at "
            "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

    if args.do_train:
        train_data_reader = ClassifyReader(args.train_filelist,
                                           args.max_seq_len, tokenizer)
        train_data_generator = train_data_reader.data_generator(
            batch_size=args.batch_size, epoch=args.epoch, phase="train")

        if args.num_train_examples:
            num_train_examples = args.num_train_examples
        else:
            num_train_examples = train_data_reader.get_num_examples()
        step_num_per_epoch = num_train_examples // args.batch_size // trainers_num
        max_train_steps = args.epoch * step_num_per_epoch

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    config=model_config,
                    pyreader_name="train_reader",
                    is_train=True)

                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

    if args.do_val or args.do_test or args.do_test_hard:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = create_model(
                    args,
                    config=model_config,
                    pyreader_name="dev_reader",
                    is_train=False)
        test_prog = test_prog.clone(for_test=True)
        if args.do_val:
            dev_data_reader = ClassifyReader(args.dev_filelist,
                                             args.max_seq_len, tokenizer)
            dev_data_generator = dev_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="dev")

        if args.do_test:
            test_data_reader = ClassifyReader(args.test_filelist,
                                              args.max_seq_len, tokenizer)
            test_data_generator = test_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="test")

        if args.do_test_hard:
            test_hard_data_reader = ClassifyReader(args.test_hard_filelist,
                                                   args.max_seq_len, tokenizer)
            test_hard_data_generator = test_hard_data_reader.data_generator(
                batch_size=args.test_batch_size, epoch=1, phase="test_hard")

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            config.hierarchical_allreduce_exter_nranks = \
                trainers_num / config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=train_program)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=train_program)
    elif args.do_val or args.do_test or args.do_test_hard:
        args.init_checkpoint = args.init_pretraining_params
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = 4 if args.use_fp16 else 2
        exec_strategy.num_iteration_per_drop_scope = min(
            args.num_iteration_per_drop_scope, args.skip_steps)

        build_strategy = fluid.BuildStrategy()
        build_strategy.remove_unnecessary_lock = False

        if args.use_fuse:
            build_strategy.fuse_all_reduce_ops = True

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test or args.do_test_hard:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    dev_ret_history = []  # (steps, key_eval, eval)
    test_ret_history = []  # (steps, key_eval, eval)
    test_hard_ret_history = []  # (steps, key_eval, eval)
    steps = 0

    if args.do_train:
        train_pyreader.start()
        time_begin = time.time()
        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                if steps % skip_steps == 0:
                    train_fetch_list = [
                        graph_vars["loss"].name, scheduled_lr.name
                    ]
                    res = train_exe.run(fetch_list=train_fetch_list)
                    outputs = {
                        "loss": np.mean(res[0]),
                        'learning_rate': float(res[1][0])
                    }
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, learning_rate: %.10f" % \
                                (train_pyreader.queue.size(), outputs['learning_rate'])
                        print(verbose)
                    current_epoch, current_example, current_file_index, total_file, current_file = \
                            train_data_reader.get_progress()

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("%s - epoch: %d, progress: %d/%d, %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \
                          (get_time(), current_epoch, current_example, num_train_examples, current_file_index, \
                          total_file, steps, outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    train_exe.run(fetch_list=[])

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0 and args.save_checkpoints:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path,
                                                   train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            dev_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                "dev", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            dev_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            test_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                "test", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            test_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

                    # evaluate test set
                    if args.do_test_hard:
                        test_pyreader.decorate_tensor_provider(
                            test_hard_data_generator)
                        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars, \
                                "test_hard", trainers_num, nccl2_trainer_id)
                        if nccl2_trainer_id == 0:
                            test_hard_ret_history.append(
                                (steps, outputs['key_eval'],
                                 outputs[outputs['key_eval']]))

            except fluid.core.EOFException:
                if args.save_checkpoints:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(dev_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "dev", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            dev_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(test_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "test", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            test_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    # final eval on test_hard set
    if args.do_test_hard:
        test_pyreader.decorate_tensor_provider(test_hard_data_generator)
        outputs = evaluate(args, test_exe, test_pyreader, test_graph_vars,
                           "test_hard", trainers_num, nccl2_trainer_id)
        if nccl2_trainer_id == 0:
            test_hard_ret_history.append(
                (steps, outputs['key_eval'], outputs[outputs['key_eval']]))

    if nccl2_trainer_id == 0:
        if args.do_val:
            dev_ret_history = sorted(dev_ret_history,
                                     key=lambda a: a[2],
                                     reverse=True)
            print("Best validation result: step %d %s %f" % \
                    (dev_ret_history[0][0], dev_ret_history[0][1], dev_ret_history[0][2]))
コード例 #19
0
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = reader_ce.ClassifyReader(vocab_path=args.vocab_path,
                                      label_map_config=args.label_map_config,
                                      max_seq_len=args.max_seq_len,
                                      total_num=args.train_data_size,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed,
                                      tokenizer=args.tokenizer,
                                      for_cn=args.for_cn,
                                      task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size == None:
        args.predict_batch_size = args.batch_size

    if args.do_train:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        dev_count = fleet.worker_num()

        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=1,
            trainer_id=fleet.worker_index(),
            trainer_num=fleet.worker_num(),
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # use fleet api
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        if args.is_distributed:
            exec_strategy.num_threads = 3

        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 1
        if args.is_distributed:
            dist_strategy.nccl_comm_num = 2
        dist_strategy.use_hierarchical_allreduce = True

        if args.use_mix_precision:
            dist_strategy.use_amp = True

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                    dist_strategy=dist_strategy)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_prediction=True)

        test_prog = test_prog.clone(for_test=True)

    train_program = fleet.main_program

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    #    if args.do_val or args.do_test:
    #        if args.use_multi_gpu_test:
    #            test_exe = fluid.ParallelExecutor(
    #                use_cuda=args.use_cuda,
    #                main_program=test_prog,
    #                share_vars_from=train_exe)

    current_epoch = 0
    steps = 0
    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        while True:
            try:
                steps += 1
                #                log.info("step: %d" % steps)

                if fleet.worker_index() != 0:
                    train_exe.run(fetch_list=[], program=train_program)
                    continue

                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[], program=train_program)

                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example * dev_count,
                         num_train_examples, steps, outputs["loss"],
                         outputs["accuracy"], args.skip_steps / used_time))
                    ce_info.append(
                        [outputs["loss"], outputs["accuracy"], used_time])

                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path,
                                               fleet._origin_program)


#                if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(args, reader, exe, test_prog,
                                         test_pyreader, graph_vars,
                                         current_epoch, steps)

                    if args.do_test:
                        predict_wrapper(args, reader, exe, test_prog,
                                        test_pyreader, graph_vars,
                                        current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path,
                                           fleet._origin_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on dianostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe, test_prog, test_pyreader,
                                     graph_vars)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
コード例 #20
0
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict or args.do_val):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    wn_id2concept, wn_concept2id, wn_concept_embedding_mat = read_concept_embedding(
        args.wn_concept_embedding_path)
    nell_id2concept, nell_concept2id, nell_concept_embedding_mat = read_concept_embedding(
        args.nell_concept_embedding_path)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    if args.do_train:
        train_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/train.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id':
            wn_concept2id,
            'nell_concept2id':
            nell_concept2id,
            'use_wordnet':
            args.use_wordnet,
            'retrieved_synset_path':
            args.retrieved_synset_path,
            'use_nell':
            args.use_nell,
            'retrieved_nell_concept_path':
            args.train_retrieved_nell_concept_path,
        }
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch,
            **train_concept_settings)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        logger.info("Device count: %d" % dev_count)
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Max train steps: %d" % max_train_steps)
        logger.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        # if args.random_seed is not None:
        #     train_program.random_seed = args.random_seed
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.
                    train_wn_max_concept_length,
                    max_nell_concept_length=processor.
                    train_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=True,
                    freeze=args.freeze)

                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                if args.use_ema:
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)
                    ema.update()

                fluid.memory_optimize(train_program,
                                      skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            logger.info(
                "Theoretical memory usage in training:  %.3f - %.3f %s" %
                (lower_mem, upper_mem, unit))

    if args.do_predict or args.do_val:
        eval_concept_settings = {
            'tokenization_path':
            '../retrieve_concepts/tokenization_squad/tokens/dev.tokenization.{}.data'
            .format('uncased' if args.do_lower_case else 'cased'),
            'wn_concept2id':
            wn_concept2id,
            'nell_concept2id':
            nell_concept2id,
            'use_wordnet':
            args.use_wordnet,
            'retrieved_synset_path':
            args.retrieved_synset_path,
            'use_nell':
            args.use_nell,
            'retrieved_nell_concept_path':
            args.dev_retrieved_nell_concept_path,
        }
        eval_data_generator = processor.data_generator(
            data_path=args.predict_file,
            batch_size=args.batch_size,
            phase='predict',
            shuffle=False,
            dev_count=1,
            epoch=1,
            **eval_concept_settings)

        test_prog = fluid.Program()
        # if args.random_seed is not None:
        #     test_prog.random_seed = args.random_seed
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    max_wn_concept_length=processor.
                    predict_wn_max_concept_length,
                    max_nell_concept_length=processor.
                    predict_nell_max_concept_length,
                    wn_concept_embedding_mat=wn_concept_embedding_mat,
                    nell_concept_embedding_mat=nell_concept_embedding_mat,
                    is_training=False)

                if args.use_ema and 'ema' not in dir():
                    ema = fluid.optimizer.ExponentialMovingAverage(
                        args.ema_decay)

                fluid.memory_optimize(test_prog,
                                      skip_opt_set=[
                                          unique_ids.name, start_logits.name,
                                          end_logits.name, num_seqs.name
                                      ])

        test_prog = test_prog.clone(for_test=True)
        # if args.random_seed is not None:
        #     test_prog.random_seed = args.random_seed

    exe.run(startup_prog)

    if args.do_train:
        logger.info('load pretrained concept embedding')
        fluid.global_scope().find_var('wn_concept_emb_mat').get_tensor().set(
            wn_concept_embedding_mat, place)
        fluid.global_scope().find_var('nell_concept_emb_mat').get_tensor().set(
            nell_concept_embedding_mat, place)

        if args.init_checkpoint and args.init_pretraining_params:
            logger.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict or args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while steps < max_train_steps:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (np_lr[0] if
                                                          warmup_steps > 0 else
                                                          args.learning_rate)
                        logger.info(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    logger.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "speed: %f steps/s" %
                        (epoch, current_example, num_train_examples, steps,
                         np.sum(total_cost) / np.sum(total_num_seqs),
                         args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0 or steps == max_train_steps:
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1,
                                **eval_concept_settings))
                        val_performance = predict(
                            exe, test_prog, test_pyreader, [
                                unique_ids.name, start_logits.name,
                                end_logits.name, num_seqs.name
                            ], processor, eval_concept_settings,
                            'validate_result_step_{}.json'.format(steps))
                        logger.info(
                            "Validation performance after step {}:\n* Exact_match: {}\n* F1: {}"
                            .format(steps, val_performance['exact_match'],
                                    val_performance['f1']))

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_pyreader.decorate_tensor_provider(eval_data_generator)

        if args.use_ema:
            with ema.apply(exe):
                eval_performance = predict(exe, test_prog, test_pyreader, [
                    unique_ids.name, start_logits.name, end_logits.name,
                    num_seqs.name
                ], processor, eval_concept_settings)
        else:
            eval_performance = predict(exe, test_prog, test_pyreader, [
                unique_ids.name, start_logits.name, end_logits.name,
                num_seqs.name
            ], processor, eval_concept_settings)

        logger.info("Eval performance:\n* Exact_match: {}\n* F1: {}".format(
            eval_performance['exact_match'], eval_performance['f1']))
コード例 #21
0
def main(args):
    """main"""
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        print(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        u"【train】epoch: {}, step: {}, loss: {:.6f}, "
                        "f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, speed: {:.3f} steps/s"
                        .format(current_epoch, steps, float(loss), float(f1),
                                float(precision), float(recall),
                                args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))
                    # evaluate test set
                    if args.do_test:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【test】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "final_model")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        precision, recall, f1 = evaluate_wrapper(reader, exe, test_prog,
                                                 test_pyreader, graph_vars, 1,
                                                 'final')
        print(u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}".format(
            float(precision), float(recall), float(f1)))

    if args.do_test:
        test_ret = predict_wrapper(reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)
コード例 #22
0
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)

        num_train_examples = processor.get_num_examples(phase='train')

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)
                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                fluid.memory_optimize(input_program=train_program,
                                      skip_opt_set=[
                                          loss.name, probs.name, accuracy.name,
                                          num_seqs.name
                                      ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, accuracy.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_acc, np_num_seqs = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (np_lr[0] if
                                                          warmup_steps > 0 else
                                                          args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, np.sum(total_cost) / np.sum(total_num_seqs),
                         np.sum(total_acc) / np.sum(total_num_seqs),
                         args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='dev',
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='test',
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
コード例 #23
0
ファイル: run_regression.py プロジェクト: RonDen/Research
def main(args):
    """main"""
    model_config = UNIMOConfig(args.unimo_config_path)
    model_config.print_config()

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=args.do_lower_case)

    data_reader = RegressionReader(tokenizer, args)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        train_data_generator = data_reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = data_reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                    args.batch_size // args.max_seq_len) // trainers_num
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    config=model_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test or args.do_pred:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    config=model_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            config.hierarchical_allreduce_exter_nranks = \
                trainers_num / config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=train_program)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=train_program)
    elif args.do_val or args.do_test or args.do_pred:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test or args.do_pred:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe)

    dev_ret_history = [] # (steps, key_eval, eval)
    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                if steps % skip_steps == 0:
                    train_fetch_list = [
                        graph_vars["loss"].name,
                    ]
                    if "learning_rate" in graph_vars:
                        train_fetch_list.append(graph_vars["learning_rate"].name)
                    res = train_exe.run(fetch_list=train_fetch_list)

                    outputs = {"loss": np.mean(res[0])}
                    if "learning_rate" in graph_vars:
                        outputs["learning_rate"] = float(res[1][0])

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = data_reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("%s - epoch: %d, progress: %d/%d, step: %d, ave loss: %f, speed: %f steps/s" % \
                          (get_time(), current_epoch, current_example, num_train_examples, steps, \
                          outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()
                else:
                    train_exe.run(fetch_list=[])

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0 and args.save_checkpoints:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path, train_program)

                    if steps % args.validation_steps == 0:
                        # evaluate dev set
                        if args.do_val:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.dev_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "dev")
                            dev_ret_history.append((steps, outputs['key_eval'], outputs[outputs['key_eval']]))

                        # evaluate test set
                        if args.do_test:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "test")

                        if args.do_pred:
                            test_pyreader.decorate_tensor_provider(
                                data_reader.data_generator(
                                    args.test_set,
                                    batch_size=args.batch_size,
                                    epoch=1,
                                    dev_count=1,
                                    shuffle=False))
                            qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1)
                            save_path = args.pred_save + '.test.' + str(steps) + '.txt'
                            print("testing {}, save to {}".format(args.test_set, save_path))
                            with open(save_path, 'w') as f:
                                for id, s, p in zip(qids, preds, probs):
                                    f.write('{}\t{}\t{}\n'.format(id, s, p))

            except fluid.core.EOFException:
                if args.save_checkpoints:
                    save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if nccl2_trainer_id == 0:
        # final eval on dev set
        if args.do_val:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(
                    args.dev_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False))
            print("Final validation result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "dev")
            dev_ret_history.append((steps, outputs['key_eval'], outputs[outputs['key_eval']]))
            dev_ret_history = sorted(dev_ret_history, key=lambda a: a[2], reverse=True)
            print("Best validation result: step %d %s %f" \
                    % (dev_ret_history[0][0], dev_ret_history[0][1], dev_ret_history[0][2]))
 
        # final eval on test set
        if args.do_test:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(
                    args.test_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False))
            print("Final test result:")
            outputs = evaluate(args, test_exe, test_prog, test_pyreader, graph_vars, "test")

       # final eval on test set
        if args.do_pred:
            test_pyreader.decorate_tensor_provider(
                data_reader.data_generator(
                    args.test_set,
                    batch_size=args.batch_size,
                    epoch=1,
                    dev_count=1,
                    shuffle=False))
            qids, preds, probs = predict(test_exe, test_prog, test_pyreader, graph_vars, dev_count=1)
            save_path = args.pred_save + '.' + str(steps) + '.txt'
            print("testing {}, save to {}".format(args.test_set, save_path))
            with open(save_path, 'w') as f:
                for id, s, p in zip(qids, preds, probs):
                    f.write('{}\t{}\t{}\n'.format(id, s, p))
コード例 #24
0
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name, graph_vars["labels"].name,
                        graph_vars["infers"].name, graph_vars["seq_lens"].name
                    ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        if args.save_log and args.log_path:
            if os.path.exists(args.log_path):
                raise FileExistsError("Logging file already exists!")
            with open(args.log_path, 'w') as logfile:
                logfile.write('%s\n' % time.asctime())
            print('Writing logs into %s' % args.log_path)

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program, train_pyreader,
                                       graph_vars, args.num_labels, "train",
                                       dev_count)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["lr"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                          % (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"], outputs["f1"],
                             outputs["precision"], outputs["recall"],
                             args.skip_steps / used_time))

                    if args.save_log and args.log_path:
                        with open(args.log_path, 'a') as logfile:
                            logfile.write("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                          "f1: %f, precision: %f, recall: %f\n" % (
                                              current_epoch, current_example, num_train_examples,
                                              steps, outputs["loss"], outputs["f1"],
                                              outputs["precision"], outputs["recall"]))

                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.dev_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.test_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "test")

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "dev")
        if args.do_predict:
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars, args.label_map_config,
                    "test", output_dir="./predicted_results")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.test_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "test")
        if args.do_predict:
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars, args.label_map_config,
                    "test", output_dir="./predicted_results")
def main(args):
    """main"""
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.RankReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        is_classify=args.is_classify,
        is_regression=args.is_regression,
        for_cn=args.for_cn,
        task_id=args.task_id,
    )

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError(
            "For args `do_train`, `do_val` and `do_test`, at "
            "least one of them must be True.", )

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train",
        )

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, \
                        got batch_size:%d seqlen:%d' %
                    (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression,
                )
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio,
                )

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len,
                )
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size,
                )
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression,
                )

        test_prog = test_prog.clone(for_test=True)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog,
        )
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.", )
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16,
            )
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16,
            )
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError(
                "args 'init_checkpoint' should be set if"
                "only doing validation or testing!", )
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16,
        )

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id,
        )

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe,
            )

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(
                        train_exe,
                        train_program,
                        train_pyreader,
                        graph_vars,
                        "train",
                        metric=args.metric,
                        is_classify=args.is_classify,
                        is_regression=args.is_regression,
                    )

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" % (
                                current_epoch,
                                current_example,
                                num_train_examples,
                                steps,
                                outputs["loss"],
                                outputs['acc'],
                                args.skip_steps / used_time,
                            ), )
                        ce_info.append([outputs["loss"], used_time], )
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            " speed: %f steps/s" % (
                                current_epoch,
                                current_example,
                                num_train_examples,
                                steps,
                                outputs["loss"],
                                args.skip_steps / used_time,
                            ), )
                    time_begin = time.time()

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(
                            args.checkpoints,
                            "step_" + str(steps),
                        )
                        fluid.io.save_persistables(
                            exe,
                            save_path,
                            train_program,
                        )

                    if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                        # evaluate dev set
                        if args.do_val:
                            evaluate_wrapper(
                                args,
                                reader,
                                exe,
                                test_prog,
                                test_pyreader,
                                graph_vars,
                                current_epoch,
                                steps,
                            )

                        if args.do_test:
                            predict_wrapper(
                                args,
                                reader,
                                exe,
                                test_prog,
                                test_pyreader,
                                graph_vars,
                                current_epoch,
                                steps,
                            )

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(
                    args.checkpoints,
                    "step_" + str(steps),
                )
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
        if args.enable_ce:
            card_num = get_cards()
            ce_loss = 0
            ce_acc = 0
            ce_time = 0
            try:
                ce_loss = ce_info[-2][0]
                ce_acc = ce_info[-2][1]
                ce_time = ce_info[-2][2]
            except:
                log.info("ce info error")
            log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
            log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
            log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(
            args,
            reader,
            exe,
            test_prog,
            test_pyreader,
            graph_vars,
            current_epoch,
            steps,
        )

    # final eval on test set
    if args.do_test:
        predict_wrapper(
            args,
            reader,
            exe,
            test_prog,
            test_pyreader,
            graph_vars,
            current_epoch,
            steps,
        )

    # final eval on dianostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.diagnostic,
                batch_size=args.batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False,
            ), )

        log.info("Final diagnostic")
        qids, preds, probs = predict(
            test_exe,
            test_prog,
            test_pyreader,
            graph_vars,
            is_classify=args.is_classify,
            is_regression=args.is_regression,
        )
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids),
            len(preds),
        )
        with open(args.diagnostic_save, 'w') as f:
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save, ))
コード例 #26
0
def train(args):
    log.info("pretraining start")
    profile = False

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(rank=fleet.worker_index(),
                    world_size=fleet.worker_num(),
                    dp=args.num_dp,
                    pp=args.num_pp,
                    sharding=args.num_sharding,
                    mp=args.num_mp)

    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    assert args.global_bsz % micro_bsz == 0, f"cannot do gradient accumulate, globa_bsz: {args.bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding == True
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using globa_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay':
        0.01,
        'exclude_from_weight_decay':
        ['layer_norm_bias', 'layer_norm_scale', '.b_0']
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    op._set_attr(
                        'op_device', "gpu:0"
                    )  # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376

            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")

            optimizer.minimize(graph_vars['total_loss'])

            final_strategy = fleet._final_strategy()
            applied_meta_list = fleet._get_applied_meta_list()
            log.info("final strategy: {}".format(final_strategy))
            log.info("applied_meta_list: {}".format(applied_meta_list))

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(
            program_desc_dir + "/main_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(train_program))

    with open(
            program_desc_dir + "/startup_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)

    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint and args.init_checkpoint != "":
        log.info(' ')
        log.info(
            '############################WARNING############################')
        log.info(
            '####### using ini_checkpoint, not init_pretraining_params ####')
        log.info(
            '## meaning hyper param e.g. lr will inherit from checkpoint ##')
        log.info(
            '###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    save_steps = args.save_steps
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()
    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'], graph_vars['mean_mask_lm_loss'],
                    scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block(
                    ).vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)

            ret = exe.run(train_program, fetch_list=fetch_list
                          )  # run one mini-batch(=acc_steps micro-batch)
            #use_program_cache=True)

            steps += 1

            if is_last:
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)

                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0

                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]

                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))

                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                log.debug("end of training, total steps: {}".format(steps))
コード例 #27
0
def get_role_init_dict(args, suf):
    """main"""
    # log = logging.getLogger()
    # prepare_logger(log)
    log = logging.getLogger(__name__)
    check_cuda(args.use_cuda)
    labels_map = {}  # label

    for line in utils.read_by_lines(args.label_map_config):
        arr = line.split("\t")
        labels_map[arr[0]] = int(arr[1])
    args.num_labels = len(labels_map)

    print("=========ERNIE CONFIG============")
    ernie_config = ErnieConfig(args.ernie_config_path)
    # ernie_config.print_config()
    print("=========ERNIE CONFIG============")
    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        print("==============place==================", place)
        # place = dev_list[1]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    print("==============place, dev_count==================", place, dev_count)
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # TODO pyreader_name 再次调整为不同
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader_role' + suf,
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)
    trigger_dict = dict()
    trigger_dict['log'] = log
    trigger_dict['args'] = args
    trigger_dict['labels_map'] = labels_map
    trigger_dict['ernie_config'] = ernie_config
    trigger_dict['place'] = place
    trigger_dict['dev_count'] = dev_count
    trigger_dict['reader'] = reader
    trigger_dict['startup_prog'] = startup_prog
    trigger_dict['test_prog'] = test_prog
    trigger_dict['test_pyreader'] = test_pyreader
    trigger_dict['graph_vars'] = graph_vars
    trigger_dict['nccl2_num_trainers'] = nccl2_num_trainers
    trigger_dict['nccl2_trainer_id'] = nccl2_trainer_id
    trigger_dict['exe'] = exe
    return trigger_dict
コード例 #28
0
def train(args):
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config)
            scheduled_lr = optimization(loss=total_loss,
                                        warmup_steps=args.warmup_steps,
                                        num_train_steps=args.num_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=args.use_fp16,
                                        loss_scaling=args.loss_scaling)

            fluid.memory_optimize(input_program=train_program,
                                  skip_opt_set=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)
    print("theoretical memory usage: ")
    if args.in_tokens:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size //
                                       args.max_seq_len))
    else:
        print(
            fluid.contrib.memory_usage(program=train_program,
                                       batch_size=args.batch_size))

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = ErnieDataReader(filelist=args.train_filelist,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample,
                                  in_tokens=args.in_tokens,
                                  is_bidirection=args.is_bidirection)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)

    build_strategy = fluid.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=total_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy,
                                       main_program=train_program,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

    if args.valid_filelist and args.valid_filelist != "":
        predict = predict_wrapper(args,
                                  exe,
                                  ernie_config,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers

            if nccl2_trainer_id != 0:
                train_exe.run(fetch_list=[])
                continue

            if steps % skip_steps != 0:
                train_exe.run(fetch_list=[])
            else:
                each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                    fetch_list=[
                        next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                        scheduled_lr.name
                    ])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    %
                    (epoch, current_file_index, total_file, steps,
                     np.mean(np.array(cost)), np.mean(np.exp(
                         np.array(lm_cost))), np.mean(np.array(acc)),
                     skip_steps / used_time, current_file, mask_type))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)

            if args.valid_filelist and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            train_pyreader.reset()
            break
コード例 #29
0
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.MRCReader(vocab_path=args.vocab_path,
                                   label_map_config=args.label_map_config,
                                   max_seq_len=args.max_seq_len,
                                   do_lower_case=args.do_lower_case,
                                   in_tokens=args.in_tokens,
                                   random_seed=args.random_seed,
                                   tokenizer=args.tokenizer,
                                   is_classify=args.is_classify,
                                   is_regression=args.is_regression,
                                   for_cn=args.for_cn,
                                   task_id=args.task_id,
                                   doc_stride=args.doc_stride,
                                   max_query_length=args.max_query_length)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size == None:
        args.predict_batch_size = args.batch_size
    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples("train")

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_training=True)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16)
                """
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name,
                        graph_vars["num_seqs"].name,
                    ])
                """

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_training=False)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program,
                                       train_pyreader, graph_vars, "train")

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, outputs["loss"], args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(args.dev_set,
                                                  batch_size=args.batch_size,
                                                  epoch=1,
                                                  dev_count=1,
                                                  shuffle=False,
                                                  phase="dev"))
                        evaluate(exe,
                                 test_prog,
                                 test_pyreader,
                                 test_graph_vars,
                                 str(steps) + "_dev",
                                 examples=reader.get_examples("dev"),
                                 features=reader.get_features("dev"),
                                 args=args)

                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(args.test_set,
                                                  batch_size=args.batch_size,
                                                  epoch=1,
                                                  dev_count=1,
                                                  shuffle=False,
                                                  phase="test"))
                        evaluate(exe,
                                 test_prog,
                                 test_pyreader,
                                 test_graph_vars,
                                 str(steps) + "_test",
                                 examples=reader.get_examples("test"),
                                 features=reader.get_features("test"),
                                 args=args)

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.dev_set,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False,
                                  phase="dev"))
        evaluate(exe,
                 test_prog,
                 test_pyreader,
                 test_graph_vars,
                 "dev",
                 examples=reader.get_examples("dev"),
                 features=reader.get_features("dev"),
                 args=args)

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(args.test_set,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False,
                                  phase="test"))
        evaluate(exe,
                 test_prog,
                 test_pyreader,
                 test_graph_vars,
                 "test",
                 examples=reader.get_examples("test"),
                 features=reader.get_features("test"),
                 args=args)
コード例 #30
0
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle,
            shuffle_seed=shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_val:
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    if args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)
        test_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
        build_strategy = fluid.BuildStrategy()

        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        throughput = []
        ce_info = []

        total_batch_num = 0  # used for benchmark

        while True:
            try:
                steps += 1

                total_batch_num += 1  # used for benchmark
                if args.max_iter and total_batch_num == args.max_iter:  # used for benchmark
                    return

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name, loss_scaling.name
                        ]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        np_loss, np_acc, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                        )
                        verbose += "learning rate: %f" % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin

                    # profiler tools
                    if args.is_profiler and current_epoch == 0 and steps == args.skip_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and current_epoch == 0 and steps == args.skip_steps * 2:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}, ave acc: {}".format(
                        current_epoch, current_example, num_train_examples,
                        steps,
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs))
                    ce_info.append([
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs), used_time
                    ])
                    if steps > 0:
                        throughput.append(args.skip_steps / used_time)
                        log_record = log_record + ", speed: %f steps/s" % (
                            args.skip_steps / used_time)
                        print(log_record)
                    else:
                        print(log_record)
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.save(program=train_program, model_path=save_path)

                if steps % args.validation_steps == 0:
                    print("Average throughtput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_val:
                        evaluate(exe, dev_prog, dev_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        evaluate(exe, test_prog, test_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)
                train_data_loader.reset()
                break
        if args.enable_ce:
            card_num = get_cards()
            ce_cost = 0
            ce_acc = 0
            ce_time = 0
            try:
                ce_cost = ce_info[-2][0]
                ce_acc = ce_info[-2][1]
                ce_time = ce_info[-2][2]
            except:
                print("ce info error")
            print("kpis\ttrain_duration_%s_card%s\t%s" %
                  (args.task_name, card_num, ce_time))
            print("kpis\ttrain_cost_%s_card%s\t%f" %
                  (args.task_name, card_num, ce_cost))
            print("kpis\ttrain_acc_%s_card%s\t%f" %
                  (args.task_name, card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        evaluate(exe, test_prog, test_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")