Example #1
def get_bracket_words():
    """Collect the distinct words that appear inside square brackets in the predicted chat file."""
    lines = file_op.read_lines(conf['chat_pred'])
    logger.info("read %s success!" % conf['chat_pred'])
    bracket_pat = re.compile(r"\[(.*?)\]")
    bracket_values = []
    for line in lines:
        values = bracket_pat.findall(line)
        bracket_values.extend(values)
    bracket_values = list(set(bracket_values))
    # order_ids = [value for value in bracket_values if "ORDERID_" in value]
    values = [
        value for value in bracket_values
        if "ORDERID_" not in value and "USERID_" not in value
    ]
    file_op.write_lines("temp.txt", values)
Example #2
def merge_78():
    """合并多余的字段"""
    lines = file_op.read_lines(conf['chat'])
    logger.info("read chat.txt success!")
    for i in range(len(lines)):
        line = lines[i]
        line_ = line.strip("\r\n").split('\t')
        if len(line_) > 7:
            line_pred = line_[:6]
            text = " ".join(line_[6:])
            line_pred.append(text)
            lines[i] = '\t'.join(line_pred)
            # print('\t'.join(line_pred))
    file_op.write_lines(conf['chat_pred'], lines)
    logger.info("write results to  %s success!" % conf['chat_pred'])
Example #3
def chat_parse():
    """逐行解析chat数据,按照session进行统计,归集"""
    file_chat = base_conf.file_chat
    logger.info('reading chat from %s' % file_chat)
    lines = file_op.read_lines(file_chat)
    chat_parsed = []

    # initialize session info
    sess_info = {
        "session_id": lines[0].split('\t')[0],
        "q_nums": 0,
        "a_nums": 0,
        "lines": []
    }

    for line in lines:
        line = line.strip('\r\n').replace("\t", '|')  # strip the newline, use '|' as the inner separator
        try:
            cols = line.split("|")
            line_cols = {
                "id": cols[0],
                "user": cols[1],
                "waiter_send": cols[2],
                "transfer": cols[3],
                "repeat": cols[4],
                "sku": cols[5],
                "content": "|".join(cols[6:])
            }
            # assert len(cols) == 7, "expected 7 fields in total, the current line has %i" % len(cols)
            if sess_info['session_id'] == line_cols['id']:
                sess_info = _update_nums(sess_info, line_cols)
                sess_info['lines'].append(line)
            else:
                chat_parsed.append(sess_info)
                sess_info = {
                    "session_id": line_cols['id'],
                    "q_nums": 0,
                    "a_nums": 0,
                    "lines": [line]
                }
                sess_info = _update_nums(sess_info, line_cols)
        except Exception as e:
            logger.error('line error: %s' % line)
            logger.exception(e)
    # flush the last session, which the loop above never appends
    chat_parsed.append(sess_info)
    file_op.write_lines(base_conf.file_chat_parsed, chat_parsed)
    logger.info("chat parse result saved in %s" % base_conf.file_chat_parsed)
    return chat_parsed
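
_update_nums is called above but not shown; a minimal sketch of what it plausibly does, assuming waiter_send == '1' marks a line sent by the waiter (an answer) and everything else a customer question:

def _update_nums(sess_info, line_cols):
    """Hypothetical sketch: count questions vs. answers inside one session."""
    if line_cols['waiter_send'] == '1':  # assumption: '1' means the waiter sent the line
        sess_info['a_nums'] += 1
    else:
        sess_info['q_nums'] += 1
    return sess_info
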
Example #4
def chat_session_parse():
    """输入chat_parse()返回的结果,将连续的q、a进行合并,并标记顺序"""
    logger.info("reading chat parsed from %s" % base_conf.file_chat_parsed)
    chat_parsed = file_op.read_lines(base_conf.file_chat_parsed)
    chat_parsed = [eval(x) for x in chat_parsed]  # each line stores the repr of one session dict
    session_parsed = []
    for sess_info in chat_parsed:
        try:
            sess_parsed = _parse_session(sess_info)
            session_parsed.append(sess_parsed)
        except Exception as e:
            logger.error("sess info parse error, sess_id: %s" % sess_info['session_id'])
            logger.exception(e)
    file_session_parsed = base_conf.file_session_parsed
    logger.info("save session parse result to %s" % file_session_parsed)
    file_op.write_lines(file_session_parsed, session_parsed)
    logger.info("save success!")
Example #5
def async_batch_tokenize(chat_batch):
    """批量处理"""
    inner = conf.inner
    url_outer = "http://jdialog-lexeme.jd.com/lexeme?token={token}&text={text}"
    url_inner = "http://jdialog-lexeme-stage.jd.com/lexeme?token={token}&text={text}"
    if inner:
        base_url = url_inner
    else:
        base_url = url_outer
    urls = map(lambda x: base_url.format(token=conf.api_token,
                                         text=x.strip('\r\n').split('\t')[6]), chat_batch)

    def exception_handler(request, e):
        logger.error(request)
        logger.exception(e)

    tasks = [grequests.get(url, timeout=3) for url in urls]
    results = grequests.map(tasks, size=20, exception_handler=exception_handler)

    logger.info("finish batch api request, start collect res ...")
    chat_tokenize = []
    for i, res in enumerate(results):
        line = chat_batch[i]
        try:
            res_json = res.json()
            if res_json['status'] == 0:
                line += "\t" + str(res_json['tokenizedText'])
            else:
                line += "\t" + "fail_tokenize"
        except Exception as e:
            line += "\t" + "fail_tokenize"
            logger.exception(e)
        chat_tokenize.append(line)

    # save
    file_chat_tokenize = conf.file_chat_tokenize
    logger.info("saving chat batch tokenize to %s" % file_chat_tokenize)
    file_op.write_lines(file_chat_tokenize, chat_tokenize)
    logger.info("save success!")
Example #6
def tokenize_modify():
    """对部分返回失败的句子重新分词"""
    logger.info("modify tokenize results!")
    file_chat_tokenize = conf.file_chat_tokenize
    chat_tokenize = file_op.read_lines(file_chat_tokenize)
    logger.info("read %s success!" % chat_tokenize)
    for i in range(len(chat_tokenize)):
        line = chat_tokenize[i]
        try:
            if not inspect_tokenize(line):
                line = "\t".join(line.split('\t')[:-1])
                # handle special characters
                line = line_pre(line)
                text = line.split('\t')[6]
                logger.info("current line: %i, text: %s" % (i, text))
                res = get_text_tokenize(text)
                text_tokens = res.get('tokenizedText', "fail_tokenize")
                line += "\t" + str(text_tokens)  # keep the tab so the token field stays a separate column
                chat_tokenize[i] = line
        except Exception as e:
            logger.exception(e)
            print(line)
    file_op.write_lines(file_chat_tokenize, chat_tokenize, mode='w')
    logger.info("write %s success!" % chat_tokenize)