Example #1
0
def one_autofinetune(args, schema_labels, predict_data, predict_sents, id):
    """Fine-tune/evaluate one sequence-labeling task, optionally predict,
    then report the best score to PaddleHub AutoDL.

    Args:
        args: namespace carrying do_train/do_predict/do_model/add_crf/
            max_seq_len flags plus predict_data, checkpoint_dir and
            saved_params_dir paths.
        schema_labels: label schema forwarded to get_task.
        predict_data: list of raw input texts to predict on.
        predict_sents: list of dicts parallel to predict_data; each gets a
            "labels" entry and is dumped as a JSON line.
        id: string identifier used in output file names and log lines.
    """
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Loads a PaddleHub pretrained model (ERNIE Tiny / RoBERTa large, ...).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel

    # PaddleHub Finetune API: trains, evaluates and saves the model.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  str(seq_label_task.best_score))

    if args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        # NOTE(review): the first example is skipped here ([1:]) — presumably
        # a header row in predict_data; confirm against the data file format.
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the special tokens at both ends ([CLS]/[SEP] positions).
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without CRF, each example occupies max_seq_len positions
                # in the flat prediction buffer; with CRF, exactly `length`.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(args.predict_data, args.do_model, id), ret)

    # Move ckpt/best_model into the configured saved-parameters directory,
    # then drop the checkpoint directory.
    best_model_dir = os.path.join(args.checkpoint_dir, "best_model")
    if is_path_valid(args.saved_params_dir) and os.path.exists(best_model_dir):
        shutil.copytree(best_model_dir, args.saved_params_dir)
        shutil.rmtree(args.checkpoint_dir)
    write_log('./work/log/' + args.do_model + '.txt', args,
              id + ',' + str(seq_label_task.best_score))
    print(seq_label_task.best_score)
    # Report the final metric back to the AutoDL tuner.
    hub.report_final_result(seq_label_task.best_score)
Example #2
0
def one(args, schema_labels, id):
    """Fine-tune and evaluate one sequence-labeling task (train-only variant).

    Args:
        args: namespace with do_train and do_model flags.
        schema_labels: label schema forwarded to get_task.
        id: string identifier recorded in the log line.
    """
    # The reader returned by get_task is unused in this train-only variant.
    seq_label_task, _ = get_task(args, schema_labels, id)
    # Loads a PaddleHub pretrained model (ERNIE Tiny / RoBERTa large, ...).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel

    # PaddleHub Finetune API: trains, evaluates and saves the model.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  id + ',' + str(seq_label_task.best_score))
def one(args, schema_labels, predict_data, predict_sents, id):
    """Fine-tune/evaluate one sequence-labeling task and optionally predict.

    Predictions are written as JSON lines to
    "{output_predict_data_path}.{do_model}.{id}.pred" and then post-processed
    into submission files via get_submit_postprocess.

    Args:
        args: namespace with do_train/do_predict/do_model/add_crf/max_seq_len.
        schema_labels: label schema forwarded to get_task.
        predict_data: list of raw input texts to predict on.
        predict_sents: list of dicts parallel to predict_data; each gets
            "input" and "labels" entries and is dumped as a JSON line.
        id: string identifier used in output file names and log lines.
    """
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Loads a PaddleHub pretrained model (ERNIE Tiny / RoBERTa large, ...).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel

    # PaddleHub Finetune API: trains, evaluates and saves the model.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  id + ',' + str(seq_label_task.best_score))

    if args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data)
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the special tokens at both ends ([CLS]/[SEP] positions).
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without CRF, each example occupies max_seq_len positions
                # in the flat prediction buffer; with CRF, exactly `length`.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        # `inp` avoids shadowing the builtin `input`.
        for sent, inp, r_label in zip(predict_sents, input_data, results):
            sent["input"] = inp
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(output_predict_data_path, args.do_model,
                                   id), ret)
        # Emit the submission file twice: plain and with the check pass.
        get_submit_postprocess(args, id)
        get_submit_postprocess(args, id, check=True)
def one(args, schema_labels, predict_data, predict_sents, id):
    """Fine-tune/evaluate one task; predict for 'role' (sequence labeling)
    or 'mcls'/'mcls_onlysentence' (multi-label classification) models.

    Args:
        args: namespace with do_train/do_predict/do_model/add_crf/max_seq_len.
        schema_labels: label schema forwarded to get_task.
        predict_data: list of raw input texts to predict on.
        predict_sents: list of dicts parallel to predict_data; augmented
            with prediction results and dumped as JSON lines.
        id: string identifier used in output file names and log lines.
    """
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Loads a PaddleHub pretrained model (ERNIE Tiny / RoBERTa large, ...).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel

    # PaddleHub Finetune API: trains, evaluates and saves the model.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  id + ',' + str(seq_label_task.best_score))

    if args.do_model == 'role' and args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}

        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data)
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the special tokens at both ends ([CLS]/[SEP] positions).
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without CRF, each example occupies max_seq_len positions
                # in the flat prediction buffer; with CRF, exactly `length`.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        # `inp` avoids shadowing the builtin `input`.
        for sent, inp, r_label in zip(predict_sents, input_data, results):
            sent["input"] = inp
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(output_predict_data_path, args.do_model,
                                   id), ret)
        # Emit the submission file twice: plain and with the check pass.
        get_submit_postprocess(args, id)
        get_submit_postprocess(args, id, check=True)

    if args.do_model in ['mcls', "mcls_onlysentence"] and args.do_predict:
        input_data = predict_data
        result = seq_label_task.predict(data=input_data, return_result=True)
        ret = []
        submit = []
        for s, r in zip(predict_sents, result):
            s['labels'] = []
            # Each r is a list of {label: 0/1} dicts; collect positive labels.
            for r0 in r:
                for k, v in r0.items():
                    if v == 1:
                        s['labels'].append(k)
                        if args.do_model == 'mcls_onlysentence':
                            submit.append('\t'.join([str(s["id"]), k]))
                        else:
                            submit.append('\t'.join(
                                [str(s["id"]), k, s["entity"]]))
            ret.append(json.dumps(s, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.40-55.pred".format(output_predict_data_path,
                                         args.do_model, id), ret)
        write_by_lines(
            "{}{}.{}.40-55.ucas_valid_result.csv".format(
                output_path, args.do_model, id), submit)