Example 1
def predict_by_model_path(args, model_path, schema_labels, predict_data,
                          predict_sents, id):
    seq_label_task, reader = get_task(args, schema_labels, predict_data,
                                      predict_sents, id)
    seq_label_task.init_if_necessary()
    seq_label_task.load_parameters(model_path)
    logger.info("PaddleHub has loaded model from %s" % model_path)
    if args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        # Skip the first row, which is likely a header line in the predict data.
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the [CLS]/[SEP] positions before mapping ids to labels.
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without a CRF head, outputs are padded to max_seq_len.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(output_predict_data_path, args.do_model,
                                   id), ret)
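The decode loop above turns the flat prediction tensor back into per-sequence label lists. Below is a minimal standalone sketch of that step, with toy data standing in for run_states; the label ids and the id2label map are made up for illustration.

max_seq_len = 8
add_crf = False
id2label = {0: 'O', 1: 'B-PER', 2: 'I-PER'}  # hypothetical label map

# Two sequences, each padded to max_seq_len; the first and last real
# positions of each sequence correspond to [CLS]/[SEP].
batch_infers = [0, 1, 2, 0, 0, 0, 0, 0,
                0, 0, 1, 0, 0, 0, 0, 0]
seq_lens = [8, 8]

results, current_id = [], 0
for length in seq_lens:
    seq_infers = batch_infers[current_id:current_id + length]
    # Strip the [CLS]/[SEP] positions before the id -> label lookup.
    results.append(list(map(id2label.get, seq_infers[1:-1])))
    # Without a CRF head the outputs are padded, so step by max_seq_len.
    current_id += length if add_crf else max_seq_len

print(results)
# [['B-PER', 'I-PER', 'O', 'O', 'O', 'O'], ['O', 'B-PER', 'O', 'O', 'O', 'O']]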
Example 2
def get_submit_postprocess(args, id):
    results = read_by_lines("{}.{}.{}.pred".format(output_predict_data_path,
                                                   args.do_model, id))
    submit = []
    for line in results:
        json_result = json.loads(line)
        text = json_result['text']
        label = json_result["labels"]
        now_label = ''
        now_entity = ''
        for i, l in enumerate(label):
            if l == 'O':
                if now_label != '':
                    submit.append('\t'.join(
                        [str(json_result['id']), now_label, now_entity]))
                    now_label = ''
                    now_entity = ''
            else:
                if l.startswith('B-'):
                    # A B- tag opens a new entity span.
                    now_label = l[2:]
                    now_entity = text[i]
                else:
                    # An I- tag extends the current span.
                    now_entity += text[i]
        # Flush an entity span that reaches the end of the sequence.
        if now_label != '':
            submit.append('\t'.join(
                [str(json_result['id']), now_label, now_entity]))
    write_by_lines("{}/{}ucas_valid_result.csv".format(output_path, id),
                   submit)
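At its core, get_submit_postprocess is a BIO span decoder over parallel character/label sequences. Here is a self-contained sketch of that decoding step; the function name and sample data are illustrative, not from the source, and unlike the loop above it also closes a span when a new B- tag immediately follows another entity.

def decode_bio(text, labels):
    """Collect (label, entity) spans from parallel text/BIO sequences."""
    spans, cur_label, cur_entity = [], '', ''
    for ch, tag in zip(text, labels):
        if tag == 'O':
            if cur_label:
                spans.append((cur_label, cur_entity))
                cur_label, cur_entity = '', ''
        elif tag.startswith('B-'):
            if cur_label:  # close the previous span before opening a new one
                spans.append((cur_label, cur_entity))
            cur_label, cur_entity = tag[2:], ch
        else:  # an I- tag extends the current span
            cur_entity += ch
    if cur_label:  # flush a span that reaches the end of the sequence
        spans.append((cur_label, cur_entity))
    return spans

print(decode_bio('张三去北京', ['B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']))
# [('PER', '张三'), ('LOC', '北京')]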
Example 3
def one_autofinetune(args, schema_labels, predict_data, predict_sents, id):
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Load a PaddleHub pretrained model (ERNIE Tiny / RoBERTa Large).
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"

    # PaddleHub Finetune API
    # Automatically trains, evaluates, and saves the model.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  str(seq_label_task.best_score))

    if args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        # Skip the first row, which is likely a header line in the predict data.
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the [CLS]/[SEP] positions before mapping ids to labels.
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without a CRF head, outputs are padded to max_seq_len.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(args.predict_data, args.do_model, id), ret)
    # Load the model from the defined model path or not.

    # seq_label_task.finetune_and_eval()
    # run_states = seq_label_task.eval()
    # eval_avg_score, eval_avg_loss, eval_run_speed = seq_label_task._calculate_metrics(
    #     run_states)
    # Move ckpt/best_model to the defined saved parameters directory
    best_model_dir = os.path.join(args.checkpoint_dir, "best_model")
    if is_path_valid(args.saved_params_dir) and os.path.exists(best_model_dir):
        shutil.copytree(best_model_dir, args.saved_params_dir)
        shutil.rmtree(args.checkpoint_dir)
    write_log('./work/log/' + args.do_model + '.txt', args,
              id + ',' + str(seq_label_task.best_score))
    print(seq_label_task.best_score)
    hub.report_final_result(seq_label_task.best_score)
Example 4
def process_data(args):
    # get_train_dev()
    predict_data, predict_sents = get_predict()

    # write_by_lines("{}/{}_train.tsv".format(args.data_dir, args.do_model), train_data)
    # write_by_lines("{}/{}_dev.tsv".format(args.data_dir, args.do_model), dev_data)
    # write_by_lines("{}/{}_test.tsv".format(args.data_dir, args.do_model), test_data)
    write_by_lines("{}/predict.txt".format(args.data_dir), predict_data)

    schema_labels = read_label('{}/entity2id.txt'.format(args.data_dir))
    return schema_labels, predict_data, predict_sents
Example 5
def one(args, schema_labels, predict_data, predict_sents, id):
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Load a PaddleHub pretrained model (ERNIE Tiny / RoBERTa Large).
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"

    # PaddleHub Finetune API
    # Automatically trains, evaluates, and saves the model.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  id + ',' + str(seq_label_task.best_score))

    if args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data)
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the [CLS]/[SEP] positions before mapping ids to labels.
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without a CRF head, outputs are padded to max_seq_len.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, inp, r_label in zip(predict_sents, input_data, results):
            sent["input"] = inp
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(output_predict_data_path, args.do_model,
                                   id), ret)
        get_submit_postprocess(args, id)
        get_submit_postprocess(args, id, check=True)
Example 6
def get_check_postprocess(args, id):
    import json
    results = read_by_lines("{}.{}.{}.pred".format(output_predict_data_path,
                                                   args.do_model, id))
    print(results[0])
    submit = []
    for line in results:
        json_result = json.loads(line)
        text = json_result['text']
        label = json_result["labels"]
        now_label = ''
        now_entity = -1
        for i, l in enumerate(label):
            if l == 'O':
                if now_label != '':
                    # Emit the span plus the raw model input and the full
                    # label sequence so predictions can be checked by hand.
                    submit.append('\t'.join([
                        str(json_result['id']), now_label, text[now_entity:i],
                        str(json_result['input'][0][now_entity * 2:i * 2]),
                        str(json_result['input'][0]), text,
                        str(label)
                    ]))
                    now_label = ''
            else:
                if l.startswith('B-'):
                    now_label = l[2:]
                    now_entity = i
                elif now_label != '':
                    # On an I- tag, adopt its type in case it differs from
                    # the B- tag that opened the span.
                    now_label = label[i][2:]
        # Flush a span that reaches the end of the sequence.
        if now_label != '':
            submit.append('\t'.join([
                str(json_result['id']), now_label, text[now_entity:],
                str(json_result['input'][0][now_entity * 2:]),
                str(json_result['input'][0]), text,
                str(label)
            ]))
    write_by_lines("{}/{}ucas_valid_result.csv".format(output_path, id),
                   submit)
Example 7
def process_data(args, i=4):
    if args.do_model == 'mcls':
        train1, dev1 = get_mcls_train_dev(args, i)
        predict_data, predict_sents = get_mcls_predict(args)
        # write_by_lines("{}/predict_mcls.txt".format(args.data_dir), predict_data)
        schema_labels = read_label('{}/event2id.txt'.format(args.data_dir))[1:]
    elif args.do_model == 'role':
        train1, dev1 = get_train_dev(args, i)
        predict_data, predict_sents = get_predict(args)

        write_by_lines("{}/train.txt".format(args.data_dir), train1)
        write_by_lines("{}/dev.txt".format(args.data_dir), dev1)
        # write_by_lines("{}/{}_test.tsv".format(args.data_dir, args.do_model), test_data)
        write_by_lines("{}/predict.txt".format(args.data_dir), predict_data)
        if args.change_event == 'BIO_event':
            schema_labels = read_label('{}/entity2id.txt'.format(
                args.data_dir))
        elif args.change_event == 'no':
            schema_labels = read_label('{}/event2id.txt'.format(args.data_dir))
        else:
            schema_labels = ['O', 'B', 'I']
    elif args.do_model == 'mrc_relation':
        train1, dev1 = get_mrc_relation_train_dev(args, i)
        predict_data, predict_sents = get_mrc_relation_predict(args)
        schema_labels = ['0', '1']
    else:  # args.do_model == 'mcls_onlysentence'
        train1, dev1 = get_mcls_onlysentence_train_dev(args, i)
        predict_data, predict_sents = get_mcls_onlysentence_predict(args)
        # write_by_lines("{}/predict_mcls.txt".format(args.data_dir), predict_data)
        schema_labels = read_label('{}/event2id.txt'.format(args.data_dir))[1:]
    return schema_labels, predict_data, predict_sents
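The helpers read_by_lines, write_by_lines, and read_label are used throughout these examples but never shown. Below is a minimal sketch of plausible implementations, assuming UTF-8 line-oriented files and a label<TAB>id format for entity2id.txt / event2id.txt; both assumptions, not confirmed by the snippets.

def read_by_lines(path):
    """Read a UTF-8 text file into a list of lines without newlines."""
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

def write_by_lines(path, lines):
    """Write an iterable of strings to a UTF-8 text file, one per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')

def read_label(path):
    """Return the label column of a label<TAB>id mapping file."""
    return [line.split('\t')[0] for line in read_by_lines(path)]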
Example 8
def get_submit_postprocess(args, id, check=False, mcls=False):
    results = read_by_lines("{}.{}.{}.pred".format(output_predict_data_path,
                                                   args.do_model, id))
    submit = []
    for line in results:
        json_result = json.loads(line)
        text = json_result['text']
        label = json_result["labels"]
        now_label = ''
        now_entity = -1
        for i, l in enumerate(label):
            if l == 'O' or l == '<NA>':
                if now_label != '':
                    if check:
                        # Debug row: span plus the raw input and label sequence.
                        submit.append('\t'.join([
                            str(json_result['id']), now_label,
                            text[now_entity:i],
                            str(json_result['input'][0][now_entity * 2:i * 2]),
                            str(json_result['input'][0]), text,
                            str(label)
                        ]))
                    elif mcls:
                        submit.append('\t'.join([
                            str(json_result['id']), text, text[now_entity:i],
                            now_label
                        ]))
                    else:
                        submit.append('\t'.join([
                            str(json_result['id']), now_label,
                            text[now_entity:i]
                        ]))
                    now_label = ''
            else:
                if now_label == '':
                    # First non-O tag opens a span.
                    now_label = l
                    now_entity = i
                elif l.startswith('B'):
                    # A B tag starts a fresh span; its type depends on the
                    # tagging scheme selected via args.change_event.
                    if args.change_event == 'BIO_event':
                        now_label = l[2:]
                    else:
                        now_label = l
                    now_entity = i
                elif args.add_rule and now_label == '':
                    # Note: unreachable as written; the first branch already
                    # handles now_label == ''.
                    if (args.change_event == 'BIO_event'
                            and label[i][2:] == label[now_entity][2:]):
                        now_label = label[i][2:]
                        submit.pop(-1)
        # Flush a span that reaches the end of the sequence.
        if now_label != '':
            if check:
                submit.append('\t'.join([
                    str(json_result['id']), now_label, text[now_entity:],
                    str(json_result['input'][0][now_entity * 2:]),
                    str(json_result['input'][0]), text,
                    str(label)
                ]))
            elif mcls:
                submit.append('\t'.join([
                    str(json_result['id']), text, text[now_entity:], now_label
                ]))
            else:
                submit.append('\t'.join(
                    [str(json_result['id']), now_label, text[now_entity:]]))
    if check:
        write_by_lines(
            "{}/{}ucas_valid_result_check.csv".format(output_path, id), submit)
    else:
        write_by_lines("{}/{}ucas_valid_result.csv".format(output_path, id),
                       submit)
def one(args, schema_labels, predict_data, predict_sents, id):
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Load a PaddleHub pretrained model (ERNIE Tiny / RoBERTa Large).
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"

    # PaddleHub Finetune API
    # Automatically trains, evaluates, and saves the model.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  id + ',' + str(seq_label_task.best_score))

    if args.do_model == 'role' and args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}

        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data)
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the [CLS]/[SEP] positions before mapping ids to labels.
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without a CRF head, outputs are padded to max_seq_len.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, inp, r_label in zip(predict_sents, input_data, results):
            sent["input"] = inp
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(output_predict_data_path, args.do_model,
                                   id), ret)
        get_submit_postprocess(args, id)
        get_submit_postprocess(args, id, check=True)

    if args.do_model in ['mcls', "mcls_onlysentence"] and args.do_predict:
        input_data = predict_data
        result = seq_label_task.predict(data=input_data, return_result=True)
        ret = []
        submit = []
        for s, r in zip(predict_sents, result):
            s['labels'] = []
            for r0 in r:
                # Keep the labels whose predicted flag is 1.
                for k, v in r0.items():
                    if v == 1:
                        s['labels'].append(k)
                        if args.do_model == 'mcls_onlysentence':
                            submit.append('\t'.join([str(s["id"]), k]))
                        else:
                            submit.append('\t'.join(
                                [str(s["id"]), k, s["entity"]]))
            ret.append(json.dumps(s, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.40-55.pred".format(output_predict_data_path,
                                         args.do_model, id), ret)
        write_by_lines(
            "{}{}.{}.40-55.ucas_valid_result.csv".format(
                output_path, args.do_model, id), submit)
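A typical driver would wire process_data into one. The sketch below reconstructs the argparse flags from the attribute accesses in the snippets above; the flag names and defaults are assumptions, not taken from the source.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--do_model', default='role')
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--data_dir', default='./work/data')
    parser.add_argument('--change_event', default='no')
    parser.add_argument('--add_crf', action='store_true')
    parser.add_argument('--add_rule', action='store_true')
    parser.add_argument('--max_seq_len', type=int, default=128)
    args = parser.parse_args()

    schema_labels, predict_data, predict_sents = process_data(args)
    # id tags the run and is embedded in the output file names.
    one(args, schema_labels, predict_data, predict_sents, '0')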