def test_data_2_eval(test_file_path, save_path):
    """Regroup labelled test events by sentence text for evaluation.

    Every line read from ``test_file_path`` is parsed as a JSON object that
    must provide the keys ``text``, ``trigger``, ``trigger_start_index``,
    ``event_type``, ``class`` and ``arguments``.  Events sharing the same id
    (``utils.cal_md5`` of the UTF-8 encoded ``text``) are collected into one
    record with the keys ``id``, ``text`` and ``event_list``.  Each record is
    serialised with ``json.dumps(..., ensure_ascii=False)`` and the resulting
    strings are handed to ``utils.write_by_lines`` together with
    ``save_path``.

    Args:
        test_file_path: Path of the labelled input file (one JSON object per
            line, as returned by ``utils.read_by_lines``).
        save_path: Destination path passed to ``utils.write_by_lines``.

    Raises:
        ValueError: If ``test_file_path`` or ``save_path`` is empty or None.
        KeyError: If a parsed line lacks one of the required keys.
    """
    if not test_file_path or not save_path:
        raise ValueError("must set test_file_path and save_path")

    # Fields copied verbatim from each input line into its event entry.
    event_fields = (
        "trigger",
        "trigger_start_index",
        "event_type",
        "class",
        "arguments",
    )

    datas = utils.read_by_lines(test_file_path)
    all_events = {}
    for data in datas:
        d_json = json.loads(data)
        text = d_json["text"]
        # The id is derived from the text, so identical sentences end up in
        # the same output record.
        _id = utils.cal_md5(text.encode("utf-8"))
        event = {field: d_json[field] for field in event_fields}
        record = all_events.setdefault(
            _id, {u"id": _id, u"text": text, u"event_list": []})
        record[u"event_list"].append(event)

    outputs = [json.dumps(x, ensure_ascii=False) for x in all_events.values()]
    utils.write_by_lines(save_path, outputs)
    print(u"test data 2 eval data, inputs {} outputs {}".format(
        len(datas), len(outputs)))
def predict_data_2_eval(pred_trigger_path, pred_role_path, schema_path,
                        save_path):
    """Merge trigger and role prediction files into the evaluation format.

    Inputs (each read with ``utils.read_by_lines``, one JSON object per line):

    * ``pred_trigger_path``: objects with ``event_id``, ``sentence`` and
      ``trigger_ret`` (a list of dicts with ``event_type`` and ``text``).
    * ``pred_role_path``: objects with ``event_id`` and ``roles_ret`` (a list
      of dicts with ``role_type`` and ``text``).
    * ``schema_path``: objects with ``event_type`` and ``role_list`` (a list
      of dicts with ``role``).

    For every trigger record, only the first trigger of each event type is
    used.  Its arguments are the role predictions stored under the same
    ``event_id`` whose ``role_type`` is listed in the schema for that event
    type.  Events without any argument are dropped.  Events are grouped by
    the id ``utils.cal_md5`` returns for the UTF-8 encoded sentence, and each
    group (``id``, ``text``, ``event_list``) is serialised with
    ``json.dumps(..., ensure_ascii=False)`` and passed to
    ``utils.write_by_lines`` together with ``save_path``.

    Args:
        pred_trigger_path: Path of the trigger prediction file.
        pred_role_path: Path of the role prediction file.
        schema_path: Path of the event schema file.
        save_path: Destination path passed to ``utils.write_by_lines``.

    Raises:
        ValueError: If any of the four paths is empty or None.
        KeyError: If a record lacks an expected key, if a predicted event
            type is absent from the schema, or if a used trigger record has
            no role record with the same ``event_id``.
    """
    if not (pred_trigger_path and pred_role_path and schema_path
            and save_path):
        raise ValueError(
            "must set pred_trigger_path and pred_role_path and schema_path and save_path"
        )
    print(u"predict data 2 eval data start")

    # Index both prediction files by event_id; when an event_id occurs more
    # than once, the last line wins.
    trigger_datas = {}
    for line in utils.read_by_lines(pred_trigger_path):
        record = json.loads(line)
        trigger_datas[record["event_id"]] = record
    print(u"load trigger predict datas {} from {}".format(
        len(trigger_datas), pred_trigger_path))

    role_datas = {}
    for line in utils.read_by_lines(pred_role_path):
        record = json.loads(line)
        role_datas[record["event_id"]] = record
    print(u"load role predict datas {} from {}".format(len(role_datas),
                                                       pred_role_path))

    # The roles of each event type are only used for membership tests, so
    # they are stored as sets once instead of being converted per trigger.
    schema_data_list = utils.read_by_lines(schema_path)
    schema_roles = {}
    for line in schema_data_list:
        schema = json.loads(line)
        schema_roles[schema["event_type"]] = set(
            r["role"] for r in schema["role_list"])
    print(u"load schema datas {} from {}".format(len(schema_data_list),
                                                 schema_path))

    all_events = {}
    for t_json in trigger_datas.values():
        text = t_json["sentence"]
        _id = utils.cal_md5(text.encode("utf-8"))
        seen_event_types = set()
        for tri_info in t_json["trigger_ret"]:
            event_type = tri_info["event_type"]
            # Only the first trigger of each event type in a record is kept.
            if event_type in seen_event_types:
                continue
            seen_event_types.add(event_type)

            trigger = tri_info["text"]
            allowed_roles = schema_roles[event_type]
            role_preds = role_datas[t_json["event_id"]]["roles_ret"]
            arguments = [{
                u"role": p_r["role_type"],
                u"argument": p_r["text"]
            } for p_r in role_preds if p_r["role_type"] in allowed_roles]
            if not arguments:
                # An event without any schema-conformant argument is dropped.
                continue

            record = all_events.setdefault(_id, {
                u"id": _id,
                u"text": text,
                u"event_list": []
            })
            record[u"event_list"].append({
                u"trigger": trigger,
                u"event_type": event_type,
                u"arguments": arguments
            })

    outputs = [json.dumps(x, ensure_ascii=False) for x in all_events.values()]
    utils.write_by_lines(save_path, outputs)
    print(u"predict data 2 eval data is finished, outputs {}".format(
        len(outputs)))
def predict_data_2_eval_1126(test_trigger_list, test_role_list, schema_path):
    """Merge in-memory trigger and role predictions into evaluation records.

    Unlike the file-based variant, the predictions are passed as already
    parsed dicts and the merged records are returned instead of written.

    For every trigger record, only the first trigger of each event type is
    used.  Its arguments are the role predictions stored under the same
    ``event_id`` whose ``role_type`` is listed in the schema for that event
    type.  Events without any argument are dropped.  Events are grouped by
    the id ``utils.cal_md5`` returns for the UTF-8 encoded sentence.

    Args:
        test_trigger_list: Iterable of dicts with ``event_id``, ``sentence``
            and ``trigger_ret`` (a list of dicts with ``event_type`` and
            ``text``).  When an ``event_id`` repeats, the last dict wins.
        test_role_list: Iterable of dicts with ``event_id`` and ``roles_ret``
            (a list of dicts with ``role_type`` and ``text``).  When an
            ``event_id`` repeats, the last dict wins.
        schema_path: Path of the schema file read with
            ``utils.read_by_lines``; each line is a JSON object with
            ``event_type`` and ``role_list`` (a list of dicts with ``role``).

    Returns:
        A list of dicts, one per distinct sentence id, with the keys ``id``,
        ``text`` and ``event_list``; every event holds ``trigger``,
        ``event_type`` and ``arguments`` (dicts with ``role`` and
        ``argument``).

    Raises:
        ValueError: If any argument is falsy.  Note that an empty prediction
            list is rejected as well.
        KeyError: If a record lacks an expected key, if a predicted event
            type is absent from the schema, or if a used trigger record has
            no role record with the same ``event_id``.
    """
    if not (test_trigger_list and test_role_list and schema_path):
        raise ValueError(
            "must set test_trigger_list and test_role_list and schema_path")
    print(u"predict data 2 eval data start")

    # Index both prediction lists by event_id.
    trigger_datas = {rec["event_id"]: rec for rec in test_trigger_list}
    print(u"load trigger predict datas {} ".format(len(trigger_datas)))

    role_datas = {rec["event_id"]: rec for rec in test_role_list}
    print(u"load role predict datas {}".format(len(role_datas)))

    # The roles of each event type are only used for membership tests, so
    # they are stored as sets once instead of being converted per trigger.
    schema_data_list = utils.read_by_lines(schema_path)
    schema_roles = {}
    for line in schema_data_list:
        schema = json.loads(line)
        schema_roles[schema["event_type"]] = set(
            r["role"] for r in schema["role_list"])
    print(u"load schema datas {} from {}".format(len(schema_data_list),
                                                 schema_path))

    all_events = {}
    for t_json in trigger_datas.values():
        text = t_json["sentence"]
        _id = utils.cal_md5(text.encode("utf-8"))
        seen_event_types = set()
        for tri_info in t_json["trigger_ret"]:
            event_type = tri_info["event_type"]
            # Only the first trigger of each event type in a record is kept.
            if event_type in seen_event_types:
                continue
            seen_event_types.add(event_type)

            trigger = tri_info["text"]
            allowed_roles = schema_roles[event_type]
            role_preds = role_datas[t_json["event_id"]]["roles_ret"]
            arguments = [{
                u"role": p_r["role_type"],
                u"argument": p_r["text"]
            } for p_r in role_preds if p_r["role_type"] in allowed_roles]
            if not arguments:
                # An event without any schema-conformant argument is dropped.
                continue

            record = all_events.setdefault(_id, {
                u"id": _id,
                u"text": text,
                u"event_list": []
            })
            record[u"event_list"].append({
                u"trigger": trigger,
                u"event_type": event_type,
                u"arguments": arguments
            })

    outputs = list(all_events.values())
    print(u"predict data 2 eval data is finished, outputs {}".format(
        len(outputs)))
    return outputs