Example #1
0
def origin_events_process4test():
    """Convert raw test sentences into placeholder event records.

    Reads the input path from ``sys.argv[2]`` and the output directory
    from ``sys.argv[3]``. Each input line is JSON like
    {"text": "...", "id": "..."}; every sentence gets exactly one empty
    event (and one empty argument) so downstream predictors see a
    uniform schema. Writes the shuffled records to ``<save_dir>/test.json``.
    """
    origin_events_path = sys.argv[2]
    save_dir = sys.argv[3]
    if not origin_events_path or not save_dir:
        raise Exception("set origin_events_path and save_dir first")

    source_lines = utils.read_by_lines(origin_events_path)
    records = []
    for raw in source_lines:
        sentence = json.loads(raw)
        # Placeholder argument / event — key order matters because
        # json.dumps preserves dict insertion order.
        placeholder_argument = {
            "argument_start_index": 0,
            "role": "",
            "argument": "",
            "alias": [],
        }
        placeholder_event = {
            "trigger": "",
            "trigger_start_index": 0,
            "class": "",
            "event_type": "",
            "arguments": [placeholder_argument],
            "event_id": u"{}_{}".format(sentence["id"], "no_event"),
            "text": sentence["text"],
            "id": sentence["id"],
        }
        records.append(json.dumps(placeholder_event, ensure_ascii=False))
    random.shuffle(records)  # randomize line order

    print(u"include sentences {}, events {}, test datas {}".format(
        len(source_lines), len(records), len(records)))
    utils.write_by_lines(u"{}/test.json".format(save_dir), records)
def test_data_2_eval(test_file_path, save_path):
    """Group per-event test records into per-sentence eval records.

    Each input line is one predicted/labelled event carrying its source
    ``text``. Events sharing the same sentence (keyed by the MD5 of the
    UTF-8 text) are merged into a single record with an ``event_list``,
    then written one JSON object per line to ``save_path``.
    """
    if not test_file_path or not save_path:
        raise Exception("must set test_data_path and save_path")

    datas = utils.read_by_lines(test_file_path)
    grouped = {}
    for raw in datas:
        record = json.loads(raw)
        sentence = record["text"]
        # Sentence identity = MD5 of the UTF-8 encoded text.
        sent_id = utils.cal_md5(sentence.encode("utf-8"))
        if sent_id not in grouped:
            grouped[sent_id] = {
                u"id": sent_id,
                u"text": sentence,
                u"event_list": []
            }
        grouped[sent_id][u"event_list"].append({
            "trigger": record["trigger"],
            "trigger_start_index": record["trigger_start_index"],
            "event_type": record["event_type"],
            "class": record["class"],
            "arguments": record["arguments"],
        })
    outputs = [json.dumps(entry, ensure_ascii=False) for entry in grouped.values()]
    utils.write_by_lines(save_path, outputs)
    print(u"test data 2 eval data, inputs {} outputs {}".format(
        len(datas), len(outputs)))
Example #3
0
def predict_trigger(trigger_dict):
    """Run trigger prediction with a pre-built Paddle program and save results.

    Args:
        trigger_dict: bundle of everything needed for inference. Required
            keys: log, args, labels_map, ernie_config, place, dev_count,
            reader, startup_prog, test_prog, test_pyreader, graph_vars,
            nccl2_num_trainers, nccl2_trainer_id. Optional key 'exe': a
            ready fluid.Executor to reuse; when absent one is created
            lazily and initialized from args.init_checkpoint.

    Side effects:
        Deletes any existing file at args.trigger_pred_save_path, then
        (only when args.do_test is set) writes one prediction per line
        to that path via utils.write_by_lines.
    """
    # Unpack the bundle into locals.
    log = trigger_dict['log']
    args = trigger_dict['args']
    labels_map = trigger_dict['labels_map']  # NOTE(review): not used below — kept for interface parity? confirm
    ernie_config = trigger_dict['ernie_config']  # NOTE(review): not used below either
    place = trigger_dict['place']
    dev_count = trigger_dict['dev_count']
    reader = trigger_dict['reader']
    startup_prog = trigger_dict['startup_prog']
    test_prog = trigger_dict['test_prog']
    test_pyreader = trigger_dict['test_pyreader']
    graph_vars = trigger_dict['graph_vars']
    nccl2_num_trainers = trigger_dict['nccl2_num_trainers']
    nccl2_trainer_id = trigger_dict['nccl2_trainer_id']
    # Reuse the caller's executor when provided; otherwise build one lazily below.
    if 'exe' in trigger_dict:
        exe = trigger_dict['exe']
    else:
        exe = None
    trigger_pred_save_path = args.trigger_pred_save_path
    # Start from a clean slate so stale predictions cannot be mistaken for new ones.
    if os.path.exists(trigger_pred_save_path):
        print('delete old file : %s' % trigger_pred_save_path)
        os.remove(trigger_pred_save_path)

    if args.do_test:
        # TODO 2020-11-24
        # TODO 2021-01-20: executor creation was made conditional on `if not exe`.
        # (Original note was garbled; translated: "cancel if not exe --
        # use_fp16=args.use_fp16) del exe".)
        if not exe:
            # No executor supplied: create one and run parameter initialization.
            exe = fluid.Executor(place)
            exe.run(startup_prog)

            # args.do_test is already True here, so this branch always runs;
            # a checkpoint is mandatory for a freshly created executor.
            if args.do_val or args.do_test:
                if not args.init_checkpoint:
                    raise ValueError("args 'init_checkpoint' should be set if"
                                     "only doing validation or testing!")
                init_checkpoint(exe,
                                args.init_checkpoint,
                                main_program=startup_prog,
                                use_fp16=args.use_fp16)

        # NOTE(review): error level is used here as a high-visibility progress
        # banner, not an actual error — confirm intent before changing.
        log.error(
            "********************** trigger predict begin **********************"
        )
        test_ret = predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)
        del exe
Example #4
0
def schema_role_process():
    """Build BIO role labels from an event schema file.

    Reads the schema path from ``sys.argv[2]`` and the output path from
    ``sys.argv[3]``. Every distinct role gets a ``B-`` and an ``I-`` tag
    with consecutive integer ids, followed by a final ``O`` tag, one
    ``label\\tid`` pair per output line.
    """
    schema_path = sys.argv[2]
    save_path = sys.argv[3]
    if not schema_path or not save_path:
        raise Exception("set schema_path and save_path first")

    # Collect the distinct role names across every schema entry.
    roles = set()
    for raw in utils.read_by_lines(schema_path):
        schema = json.loads(raw)
        for role_info in schema["role_list"]:
            roles.add(role_info["role"])

    # Each role consumes two consecutive ids (B then I); O comes last.
    outputs = []
    for offset, role in enumerate(roles):
        outputs.append(u"B-{}\t{}".format(role, 2 * offset))
        outputs.append(u"I-{}\t{}".format(role, 2 * offset + 1))
    outputs.append(u"O\t{}".format(2 * len(roles)))
    print(u"include roles {},create label {}".format(len(roles), len(outputs)))
    utils.write_by_lines(save_path, outputs)
Example #5
0
def origin_events_process():
    """Flatten labelled events and split them into train/dev/test files.

    Reads the source path from ``sys.argv[2]`` and the output directory
    from ``sys.argv[3]``. Each input line holds one sentence with its
    ``event_list``; every event becomes a standalone record carrying the
    sentence text and ids. After shuffling, the first 80% go to
    ``train.json``; the remaining 20% are written to BOTH ``dev.json``
    and ``test.json``.
    """
    origin_events_path = sys.argv[2]
    save_dir = sys.argv[3]
    if not origin_events_path or not save_dir:
        raise Exception("set origin_events_path and save_dir first")

    lines = utils.read_by_lines(origin_events_path)
    flattened = []
    for raw in lines:
        sentence = json.loads(raw)
        for event in sentence["event_list"]:
            # Enrich each event in place so it can stand alone as a record.
            event["event_id"] = u"{}_{}".format(sentence["id"],
                                                event["trigger"])
            event["text"] = sentence["text"]
            event["id"] = sentence["id"]
            flattened.append(json.dumps(event, ensure_ascii=False))
    random.shuffle(flattened)  # shuffle before splitting

    # 80 / 20 split; the 20% slice serves as both dev and test.
    split_at = int(len(flattened) * 0.8)
    train_data, eval_data = flattened[:split_at], flattened[split_at:]
    print(
        u"include sentences {}, events {}, train datas {}, dev datas {}, test datas {}"
        .format(len(lines), len(flattened), len(train_data), len(eval_data),
                len(eval_data)))
    utils.write_by_lines(u"{}/train.json".format(save_dir), train_data)
    utils.write_by_lines(u"{}/dev.json".format(save_dir), eval_data)
    utils.write_by_lines(u"{}/test.json".format(save_dir), eval_data)
Example #6
0
def schema_event_type_process():
    """Build BIO event-type labels from an event schema file.

    Reads the schema path from ``sys.argv[2]`` and the output path from
    ``sys.argv[3]``. Every distinct event type gets a ``B-`` and an
    ``I-`` tag with consecutive integer ids, followed by a final ``O``
    tag, one ``label\\tid`` pair per output line.
    """
    schema_path = sys.argv[2]
    save_path = sys.argv[3]
    if not schema_path or not save_path:
        raise Exception("set schema_path and save_path first")

    # Collect the distinct event types across the schema.
    event_types = set()
    for raw in utils.read_by_lines(schema_path):
        event_types.add(json.loads(raw)["event_type"])

    # Each type consumes two consecutive ids (B then I); O comes last.
    outputs = []
    for offset, event_type in enumerate(event_types):
        outputs.append(u"B-{}\t{}".format(event_type, 2 * offset))
        outputs.append(u"I-{}\t{}".format(event_type, 2 * offset + 1))
    outputs.append(u"O\t{}".format(2 * len(event_types)))
    print(u"include event type {},  create label {}".format(
        len(event_types), len(outputs)))
    utils.write_by_lines(save_path, outputs)
def predict_data_2_eval(pred_trigger_path, pred_role_path, schema_path,
                        save_path):
    """Merge trigger and role predictions into per-sentence eval records.

    For every predicted trigger, role predictions for the same event_id
    are filtered by the schema (only roles valid for the trigger's
    event type survive). Events with at least one surviving argument are
    grouped by sentence (MD5 of the UTF-8 text) and written as one JSON
    object per line to ``save_path``. Duplicate event types within one
    sentence's trigger list are processed only once.
    """
    if not pred_trigger_path or not pred_role_path or not schema_path or not save_path:
        raise Exception(
            "must set pred_trigger_path and pred_role_path and schema_path and save_path"
        )
    print(u"predict data 2 eval data start")

    def _index_by(path, key_field):
        """Load JSON lines from *path* into a dict keyed by *key_field*."""
        indexed = {}
        for raw in utils.read_by_lines(path):
            parsed = json.loads(raw)
            indexed[parsed[key_field]] = parsed
        return indexed

    trigger_datas = _index_by(pred_trigger_path, "event_id")
    print(u"load trigger predict datas {} from {}".format(
        len(trigger_datas), pred_trigger_path))

    role_datas = _index_by(pred_role_path, "event_id")
    print(u"load role predict datas {} from {}".format(len(role_datas),
                                                       pred_role_path))

    # Schema: event_type -> list of valid role names.
    schema_data_list = utils.read_by_lines(schema_path)
    schema_datas = {}
    for raw in schema_data_list:
        parsed = json.loads(raw)
        schema_datas[parsed["event_type"]] = [
            r["role"] for r in parsed["role_list"]
        ]
    print(u"load schema datas {} from {}".format(len(schema_data_list),
                                                 schema_path))

    all_events = {}
    for t_json in trigger_datas.values():
        text = t_json["sentence"]
        sent_id = utils.cal_md5(text.encode("utf-8"))
        seen_types = set()
        for tri_info in t_json["trigger_ret"]:
            event_type = tri_info["event_type"]
            if event_type in seen_types:
                continue  # keep only the first trigger per event type
            seen_types.add(event_type)
            allowed_roles = set(schema_datas[event_type])

            # Keep only the predicted roles the schema allows for this type.
            role_pred = role_datas[t_json["event_id"]]
            arguments = [{
                u"role": p_r["role_type"],
                u"argument": p_r["text"]
            } for p_r in role_pred["roles_ret"]
                         if p_r["role_type"] in allowed_roles]
            if arguments:
                sentence_entry = all_events.setdefault(
                    sent_id, {
                        u"id": sent_id,
                        u"text": text,
                        u"event_list": []
                    })
                sentence_entry[u"event_list"].append({
                    u"trigger": tri_info["text"],
                    u"event_type": event_type,
                    u"arguments": arguments
                })
    outputs = [json.dumps(x, ensure_ascii=False) for x in all_events.values()]
    utils.write_by_lines(save_path, outputs)
    print(u"predict data 2 eval data is finished, outputs {}".format(
        len(outputs)))
def main(args):
    """End-to-end driver for the role sequence-labeling task.

    Builds the data reader and the Paddle static-graph programs,
    optionally trains (with periodic checkpointing and dev/test
    evaluation), then runs a final evaluation and/or prediction pass
    depending on ``args.do_val`` / ``args.do_test``.

    NOTE(review): relies on module-level globals not visible in this
    chunk (``labels_map``, ``ernie_config``, ``dev_count``, ``place``) —
    confirm they are defined before ``main`` is called.
    """
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    # startup_prog holds the initialization ops shared by train and test.
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # With in_tokens=True, batch_size is a token budget, so the
        # effective sentence batch is batch_size // max_seq_len.
        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # Build the training graph and attach the LR schedule / optimizer.
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        # Separate test program sharing startup_prog; note graph_vars is
        # rebound here when training is also enabled.
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    # Single-trainer defaults (no NCCL2 distributed setup in this path).
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # Parameter initialization: checkpoint wins over pretraining params.
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # NOTE(review): test_exe is never used below (evaluate_wrapper /
        # predict_wrapper receive `exe` instead) — confirm whether this
        # ParallelExecutor is needed at all.
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        # Training loop: runs until the pyreader raises EOFException.
        while True:
            try:
                steps += 1
                # Only fetch metrics every skip_steps; other steps run fetch-free.
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        print(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        u"【train】epoch: {}, step: {}, loss: {:.6f}, "
                        "f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, speed: {:.3f} steps/s"
                        .format(current_epoch, steps, float(loss), float(f1),
                                float(precision), float(recall),
                                args.skip_steps / used_time))
                    time_begin = time.time()

                # Periodic checkpoint.
                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))
                    # evaluate test set
                    if args.do_test:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【test】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))

            except fluid.core.EOFException:
                # Data exhausted: save the final model and leave the loop.
                save_path = os.path.join(args.checkpoints, "final_model")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        precision, recall, f1 = evaluate_wrapper(reader, exe, test_prog,
                                                 test_pyreader, graph_vars, 1,
                                                 'final')
        print(u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}".format(
            float(precision), float(recall), float(f1)))

    if args.do_test:
        # NOTE(review): predict_wrapper is called here WITHOUT `args` as the
        # first argument, unlike the call in predict_trigger — confirm which
        # signature is correct.
        test_ret = predict_wrapper(reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)