コード例 #1
0
def predict_data_process(trigger_file, role_file, schema_file, save_path):
    """Merge trigger and role predictions into per-sentence event records.

    Every input file is read with ``read_by_lines`` and each line is parsed
    as one JSON object.  One output record is produced per trigger line and
    the records are written to ``save_path`` as JSON lines.

    Args:
        trigger_file: Trigger predictions; each line provides "id", "text"
            and "pred" -> "labels".
        role_file: Role predictions with the same keys as ``trigger_file``.
        schema_file: Event schema; each line provides "event_type" and
            "role_list" (a list of objects carrying a "role" key).
        save_path: Destination of the merged predictions.

    Raises:
        KeyError: If a predicted event type is missing from the schema, or
            a sentence with predicted events has no entry in ``role_file``.
    """
    trigger_datas = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    schema_datas = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(len(trigger_datas),
                                                   trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("schema {} load from {}".format(len(schema_datas), schema_file))

    # event_type -> list of role names allowed for that event type
    schema = {}
    for s in schema_datas:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # sentence id -> {role_type: [argument text, ...]}
    sent_role_mapping = {}
    for d in role_data:
        d_json = json.loads(d)
        role_ret = {}
        for r in extract_result(d_json["text"], d_json["pred"]["labels"]):
            role_ret.setdefault(r["type"], []).append("".join(r["text"]))
        sent_role_mapping[d_json["id"]] = role_ret

    pred_ret = []
    for d in trigger_datas:
        d_json = json.loads(d)
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        # De-duplicate while keeping first-seen order.  list(set(...)) made
        # the event order depend on string hashing, so the saved file could
        # differ from one run to the next.
        pred_event_types = list(dict.fromkeys(t["type"] for t in t_ret))
        event_list = []
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = [
                {"role": role_type, "argument": arg}
                for role_type, ags in sent_role_mapping[d_json["id"]].items()
                # keep only roles the schema allows for this event type
                if role_type in role_list
                for arg in ags
                # single-character arguments are discarded
                if len(arg) != 1
            ]
            event_list.append({"event_type": event_type,
                               "arguments": arguments})
        pred_ret.append({
            "id": d_json["id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    pred_ret = [json.dumps(r, ensure_ascii=False) for r in pred_ret]
    print("submit data {} save to {}".format(len(pred_ret), save_path))
    write_by_lines(save_path, pred_ret)
コード例 #2
0
def schema_process(path, model="trigger"):
    """Build the tag-vocabulary lines for one model from a schema file.

    Args:
        path: Schema file; every line is a JSON object.
        model: "trigger" collects B-/I- tags per "event_type"; "role"
            collects B-/I- tags per role in "role_list", skipping
            ``enum_role``; "enum" takes the "enum_items" of the last
            ``enum_role`` entry encountered.

    Returns:
        A list of "<index>\\t<label>" strings.  For "trigger" and "role" the
        final entry is the "O" tag; for "enum" that trailing "O" is dropped.
    """
    def _register(bio_labels, name):
        """Append the B-/I- pair for ``name`` unless B-``name`` is present."""
        begin_tag = "B-{}".format(name)
        if begin_tag in bio_labels:
            return bio_labels
        bio_labels.extend([begin_tag, "I-{}".format(name)])
        return bio_labels

    labels = []
    for raw in read_by_lines(path):
        entry = json.loads(raw.strip())
        if model == "trigger":
            labels = _register(labels, entry["event_type"])
        elif model == "role":
            for item in entry["role_list"]:
                if item["role"] != enum_role:
                    labels = _register(labels, item["role"])
        elif model == "enum":
            for item in entry["role_list"]:
                if item["role"] == enum_role:
                    labels = item["enum_items"]

    labels.append("O")
    tags = ["{}\t{}".format(position, name)
            for position, name in enumerate(labels)]
    if model == "enum":
        # the enum label set has no "O" class, so drop the entry just added
        tags = tags[:-1]
    return tags
コード例 #3
0
def do_predict():
    """Run token-classification inference and save per-token predictions.

    Loads the checkpoint at ``args.init_ckpt``, encodes the "text" field of
    every JSON line in ``args.predict_data``, predicts in batches of
    ``args.batch_size`` and writes each input record, extended with a "pred"
    entry ({"probs": [...], "labels": [...]}), to ``args.predict_save_path``.

    Raises:
        Exception: If ``args.init_ckpt`` is empty or is not an existing file.
    """
    paddle.set_device(args.device)

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    state_dict = paddle.load(args.init_ckpt)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        # spaces are replaced by "\002" before the text is split into chars
        text = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature([list(text), []], tokenizer,
                    max_seq_len=args.max_seq_len, is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'), # token_type_ids
        Stack(dtype='int64') # sequence lens
    ): fn(samples)
    # Separates data into some batches.
    batch_encoded_inputs = [encoded_inputs_list[i: i + args.batch_size]
                            for i in range(0, len(encoded_inputs_list), args.batch_size)]
    results = []
    model.eval()
    # Inference only: no gradient bookkeeping is needed.
    with paddle.no_grad():
        for batch in batch_encoded_inputs:
            input_ids, token_type_ids, seq_lens = batchify_fn(batch)
            input_ids = paddle.to_tensor(input_ids)
            token_type_ids = paddle.to_tensor(token_type_ids)
            logits = model(input_ids, token_type_ids)
            probs = F.softmax(logits, axis=-1)
            probs_ids = paddle.argmax(probs, -1).numpy()
            probs = probs.numpy()
            for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(), seq_lens.tolist()):
                # Drop the first and last positions of the sequence
                # (presumably [CLS]/[SEP] -- confirm against
                # convert_example_to_feature).  The probability rows must be
                # sliced with the same offset as the ids; indexing the rows
                # from 0 paired every label with the previous token's
                # distribution.
                token_ids = p_ids[1: seq_len - 1]
                token_probs = p_list[1: seq_len - 1]
                prob_one = [row[pid] for row, pid in zip(token_probs, token_ids)]
                label_one = [id2label[pid] for pid in token_ids]
                results.append({"probs": prob_one, "labels": label_one})
    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
コード例 #4
0
def docs_data_process(path):
    """Split every document in ``path`` into sentences, returned as JSON text.

    Each line of the file is parsed as a JSON document and passed to
    ``marked_doc_2_sentence``; every item it yields is serialized with
    ``ensure_ascii=False``.

    Returns:
        A flat list of JSON strings, in document order.
    """
    documents = (json.loads(raw) for raw in read_by_lines(path))
    sentences = [
        sentence
        for document in documents
        for sentence in marked_doc_2_sentence(document)
    ]
    return [json.dumps(sentence, ensure_ascii=False) for sentence in sentences]
コード例 #5
0
def data_process(path, model="trigger", is_predict=False):
    """Convert annotated JSON lines into tab-separated sequence-labeling rows.

    Args:
        path: Input file; each line is a JSON object with "id", "text" and,
            for training data, "event_list".
        model: "trigger" emits one row per sentence labeled with event types
            at the trigger spans (sentences without events are skipped);
            "role" emits one row per event labeled with its argument roles,
            ignoring ``enum_role`` arguments.
        is_predict: When true only the text column is emitted.

    Returns:
        A list whose first element is the header row, followed by the data
        rows.  Characters and labels inside a row are joined with "\\002".
    """
    def _tag_span(tags, begin, length, label):
        """Write B-/I- tags for ``label`` over ``length`` slots from ``begin``."""
        for pos in range(begin, begin + length):
            prefix = "B-" if pos == begin else "I-"
            tags[pos] = "{}{}".format(prefix, label)
        return tags

    # collected in predict mode only; not part of the return value
    sentences = []
    output = ["text_a"] if is_predict else ["text_a\tlabel"]
    blanks = (" ", "\n", "\t")

    for raw in read_by_lines(path):
        record = json.loads(raw)
        _id = record["id"]
        # lower-case the text and replace whitespace characters
        text_a = ["," if ch in blanks else ch
                  for ch in record["text"].lower()]
        text_column = '\002'.join(text_a)

        if is_predict:
            sentences.append({"text": record["text"], "id": _id})
            output.append(text_column)
            continue

        events = record.get("event_list", [])
        if model == u"trigger":
            if len(events) == 0:
                continue
            labels = ["O"] * len(text_a)
            for event in events:
                labels = _tag_span(labels, event["trigger_start_index"],
                                   len(event["trigger"]), event["event_type"])
            output.append("{}\t{}".format(text_column, '\002'.join(labels)))
        elif model == u"role":
            for event in events:
                labels = ["O"] * len(text_a)
                for arg in event["arguments"]:
                    role_type = arg["role"]
                    if role_type != enum_role:
                        labels = _tag_span(labels,
                                           arg["argument_start_index"],
                                           len(arg["argument"]), role_type)
                output.append("{}\t{}".format(text_column,
                                              '\002'.join(labels)))
    return output
コード例 #6
0
    def __init__(self,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 in_tokens=False,
                 is_inference=False,
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 is_classify=True,
                 is_regression=False,
                 for_cn=True,
                 task_id=0):
        """Set up the tokenizer, the sequence limit and the label map.

        Only ``max_seq_len`` is read here; the remaining parameters are
        accepted but not used by this constructor.

        The tokenizer is loaded from ``config.model_name``.  The label map is
        read from ``config.trigger_label_map``: each line is split on tabs,
        the first field becomes the key and the second field, converted to
        ``int``, the value.
        """
        self.tokenizer = BertTokenizer.from_pretrained(config.model_name)
        self.max_seq_len = max_seq_len

        rows = (entry.split("\t")
                for entry in utils.read_by_lines(config.trigger_label_map))
        self.label_map = {fields[0]: int(fields[1]) for fields in rows}
コード例 #7
0
def enum_data_process(path, is_predict=False):
    """Build the rows for the enum-role classification data set.

    Args:
        path: Input file; each line is a JSON object with "text" and, for
            training data, "event_list".
        is_predict: When true every text is emitted without a label.

    Returns:
        A list starting with the header row.  In training mode a
        "<label>\\t<text>" row is added only for records whose "公司上市"
        events carry a non-empty ``enum_role`` argument; when several match,
        the last one is used.
    """
    header = "text_a" if is_predict else "label\ttext_a"
    output = [header]
    for raw in read_by_lines(path):
        record = json.loads(raw)
        # lower-case and strip tabs so the text cannot break the TSV layout
        text = record["text"].lower().replace("\t", " ")
        if is_predict:
            output.append(text)
        elif len(record.get("event_list", [])) != 0:
            label = None
            for event in record["event_list"]:
                if event["event_type"] == "公司上市":
                    for argument in event["arguments"]:
                        if argument["role"] == enum_role:
                            label = argument["argument"]
            if label:
                output.append("{}\t{}".format(label, text))
    return output
コード例 #8
0
def predict_data_process(trigger_file, role_file, schema_file, save_path):
    """Combine trigger and role predictions into submission records.

    All three inputs are read with ``read_by_lines``; every line is one JSON
    object.  For each trigger line a record with "id", "text" and
    "event_list" is built and the records are written to ``save_path`` as
    JSON lines.

    Args:
        trigger_file: Trigger predictions ("id", "text", "pred" -> "labels").
        role_file: Role predictions with the same keys.
        schema_file: Schema lines with "event_type" and "role_list".
        save_path: Output file for the merged predictions.

    Raises:
        KeyError: If a predicted event type is not in the schema, or a
            sentence with predicted events is absent from ``role_file``.
    """
    pred_ret = []
    trigger_datas = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    schema_datas = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(len(trigger_datas),
                                                   trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("schema {} load from {}".format(len(schema_datas), schema_file))

    # Mapping {event_type: [role1, role2, ...]}
    schema = {}
    for s in schema_datas:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [
            r["role"] for r in d_json["role_list"]
        ]

    # process the role data
    sent_role_mapping = {}
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        # each extracted span is read through its "type" and "text" keys
        for r in r_ret:
            role_type = r["type"]
            if role_type not in role_ret:
                role_ret[role_type] = []
            # gather the argument strings: {role_type: [arg1, arg2, ...]}
            role_ret[role_type].append("".join(r["text"]))
        # {sentence id: {role_type: [arg1, arg2, ...]}}
        sent_role_mapping[d_json["id"]] = role_ret

    for d in trigger_datas:
        d_json = json.loads(d)
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        # The trigger types are the predicted event types; one sentence may
        # hold several.  Duplicates are removed in first-seen order because
        # list(set(...)) yields a hash-dependent order and made the saved
        # file vary between runs.
        pred_event_types = []
        for t in t_ret:
            if t["type"] not in pred_event_types:
                pred_event_types.append(t["type"])
        event_list = []
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[d_json["id"]].items():
                # Skip roles that do not belong to this event type; only
                # roles the schema lists for the event are reported.
                if role_type not in role_list:
                    continue
                for arg in ags:
                    # single-character arguments are discarded
                    if len(arg) == 1:
                        continue
                    arguments.append({"role": role_type, "argument": arg})
            event = {"event_type": event_type, "arguments": arguments}
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    # The saved events carry no indices: only event_type and the
    # {role, argument} pairs.
    pred_ret = [json.dumps(r, ensure_ascii=False) for r in pred_ret]
    print("submit data {} save to {}".format(len(pred_ret), save_path))
    write_by_lines(save_path, pred_ret)
コード例 #9
0
def do_predict():
    """Run sequence-classification inference and save per-sentence results.

    Loads the checkpoint at ``args.init_ckpt``, encodes every JSON line of
    ``args.predict_data`` (its "text" and, when present, "text_b"), predicts
    in batches of ``args.batch_size`` and writes each input record, extended
    with a "pred" entry ({"probs": {label: prob}, "label": best label}), to
    ``args.predict_save_path``.

    Raises:
        Exception: If ``args.init_ckpt`` is empty or is not an existing file.
    """
    set_seed(args.seed)
    device = args.device

    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}

    model = ErnieForSequenceClassification(num_classes=len(label_map))
    model.to(torch.device(device))
    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0")

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    # map_location lets a checkpoint saved on another device be loaded here
    state_dict = torch.load(args.init_ckpt, map_location=torch.device(device))
    model.load_state_dict(state_dict)
    print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for record in sentences:
        text = record["text"]
        input_sent = [text]  # only text_a
        # Look for "text_b" on the record itself.  Rebinding the loop
        # variable to the text string turned this check into a substring
        # test and made the lookup below fail on a str.
        if "text_b" in record:
            input_sent = [[text, record["text_b"]]]  # add text_b
        example = data_2_examples(input_sent)[0]
        input_ids, token_type_ids = convert_example(
            example, tokenizer, max_seq_len=args.max_seq_len, is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
    ): fn(samples)
    # Separates data into some batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]
    results = []
    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in batch_encoded_inputs:
            input_ids, token_type_ids = batchify_fn(batch)
            input_ids = to_var(input_ids, device=device).long()
            token_type_ids = to_var(token_type_ids, device=device).long()

            logits = model(input_ids, token_type_ids)
            probs = torch.softmax(logits, dim=1)
            # .cpu() is a no-op for CPU tensors, so one path serves every
            # device string (including forms such as "cuda:0").
            probs_ids = torch.argmax(probs, -1).detach().cpu().numpy()
            probs = probs.detach().cpu().numpy()
            for prob_one, p_id in zip(probs.tolist(), probs_ids.tolist()):
                label_probs = {
                    id2label[idx]: p for idx, p in enumerate(prob_one)
                }
                results.append({
                    "probs": label_probs,
                    "label": id2label[p_id]
                })

    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
コード例 #10
0
def predict_data_process(trigger_file, role_file, enum_file, schema_file,
                         save_path):
    """Merge trigger, role and enum predictions into document-level events.

    Every input is read with ``read_by_lines``; each line is one JSON object.
    Sentence-level predictions are matched through the pair ("id",
    "sent_id"), grouped by "id", passed through ``event_normalization`` and
    written to ``save_path`` as JSON lines.

    Args:
        trigger_file: Trigger predictions ("id", "sent_id", "text",
            "pred" -> "labels").
        role_file: Role predictions with the same keys.
        enum_file: Enum predictions ("id", "sent_id", "pred" -> "label").
        schema_file: Schema lines with "event_type" and "role_list".
        save_path: Output file for the document-level predictions.

    Raises:
        KeyError: If a predicted event type is missing from the schema, or a
            sentence needing role/enum predictions has none.
    """
    trigger_data = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    enum_data = read_by_lines(enum_file)
    schema_data = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(len(trigger_data),
                                                   trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("enum predict {} load from {}".format(len(enum_data), enum_file))
    print("schema {} load from {}".format(len(schema_data), schema_file))

    schema, sent_role_mapping, sent_enum_mapping = {}, {}, {}
    for s in schema_data:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # role depends on id and sent_id
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        for r in r_ret:
            role_ret.setdefault(r["type"], []).append("".join(r["text"]))
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        sent_role_mapping[_id] = role_ret

    # process the enum_role data
    for d in enum_data:
        d_json = json.loads(d)
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        sent_enum_mapping[_id] = d_json["pred"]["label"]

    # process trigger data
    pred_ret = []
    for d in trigger_data:
        d_json = json.loads(d)
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        # De-duplicate in first-seen order; list(set(...)) produced a
        # hash-dependent order, so the events handed to event_normalization
        # could be ordered differently from run to run.
        pred_event_types = list(dict.fromkeys(t["type"] for t in t_ret))
        event_list = []
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[_id].items():
                # keep only roles the schema allows for this event type
                if role_type not in role_list:
                    continue
                for arg in ags:
                    arguments.append({"role": role_type, "argument": arg})
            # special handling: the enum event type also receives the
            # sentence's classified enum label as an argument
            if event_type == enum_event_type:
                arguments.append({
                    "role": enum_role,
                    "argument": sent_enum_mapping[_id]
                })
            event_list.append({
                "event_type": event_type,
                "arguments": arguments,
                "text": d_json["text"]
            })
        pred_ret.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": d_json["text"],
            "event_list": event_list
        })

    # group the sentence-level events by document id
    doc_pred = {}
    for d in pred_ret:
        if d["id"] not in doc_pred:
            doc_pred[d["id"]] = {"id": d["id"], "event_list": []}
        doc_pred[d["id"]]["event_list"].extend(d["event_list"])

    # unify all the prediction results and save them
    doc_pred = [
        json.dumps(event_normalization(r), ensure_ascii=False)
        for r in doc_pred.values()
    ]
    print("submit data {} save to {}".format(len(doc_pred), save_path))
    write_by_lines(save_path, doc_pred)