def gen_train_data(in_path, out_path):
    # Each tab-separated field looks like "<term>:<weight>".
    matchObj = re.compile(r'(.+):([0-9\.]+)', re.M | re.I)
    res, qid = [], 0
    qw = query_weight()
    text = [e.strip().split("\t") for e in open(in_path, encoding="utf8").readlines() if e.strip()]
    for ele in tqdm(text):
        line_ele, sen2terms = [], []
        for e in ele:
            matchRes = matchObj.match(e)
            term, weight = matchRes.group(1), matchRes.group(2)
            line_ele.append((term, weight))
            sen2terms.append(term)
        qw.run_step(" ".join(sen2terms))
        weight_attn, weight_idf, weight_lm = qw.weight_attn, qw.weight_idf, qw.weight_lm
        # Sort terms by labelled weight; cast to float, since the regex
        # captures the weight as a string and string order is not numeric.
        sorted_line_ele = sorted(line_ele, key=lambda d: float(d[1]), reverse=True)
        for rank in range(len(sorted_line_ele)):
            feature_vector, fmap = get_feature(sorted_line_ele[rank][0], sen2terms, weight_attn, weight_idf, weight_lm)
            # Emit "label qid:n 1:v1 2:v2 ...", higher labels for more important terms.
            res.append(" ".join([str(len(sorted_line_ele) - rank - 1), "qid:" + str(qid)] + [str(i + 1) + ":" + str(e) for i, e in enumerate(feature_vector)]))
        qid += 1
    print("train data length: %d" % (len(res)))
    with open(out_path + "train.txt", "w", encoding="utf8") as fin:
        fin.write("\n".join(res[:int(len(res) * 0.9)]))
    with open(out_path + "test.txt", "w", encoding="utf8") as fin:
        fin.write("\n".join(res[int(len(res) * 0.9):]))
    with open(out_path + "valid.txt", "w", encoding="utf8") as fin:
        fin.write("\n".join(res[int(len(res) * 0.9):]))
    with open(out_path + "feature.fmap", "w", encoding="utf8") as fin:
        fin.write("\n".join(fmap))
Example #2
def cal_ndcg_manual_data(topk=1):
    qw = query_weight()
    ndcg_sum = 0.0
    text = [
        e.strip().split("\t") for e in open("get_jdcv_data/querytrue.txt",
                                            encoding="utf8").readlines()[1:159]
        if e.strip()
    ]
    for (query, label) in tqdm(text, total=len(text)):
        seg_label = label.split()
        # Terms appear in the label most-important-first; assign graded
        # relevance len-1, len-2, ..., 0 in that order.
        rel = {e: len(seg_label) - i - 1 for i, e in enumerate(seg_label)}
        dcg, idcg, ndcg = get_one_query_ndcg(qw, query, rel, topk)
        ndcg_sum += ndcg
    ndcg_avg = ndcg_sum / len(text)
    print("ndcg_avg@%d: %.3f" % (topk, ndcg_avg))
Example #3
def test():
    qw = query_weight()
    pred_num, total_num = 0, 0
    text = [
        e.strip().split("\t") for e in open("get_jdcv_data/querytrue.txt",
                                            encoding="utf8").readlines()[1:159]
    ]
    for (i, (query, label)) in enumerate(text):
        #query = "移动医疗"
        res = qw.run_step(query)
        pred = sorted(res, key=lambda d: d[1], reverse=True)[0]
        if pred[0] == label.split()[0]: pred_num += 1
        else:
            print("%d\t%s\t%s\t%s_%s" % (
                i + 1, query,
                " ".join(k + ":" + str(v) for k, v in res),
                pred[0], label.split()[0]))
        total_num += 1
    print("acc: %.3f" % (pred_num / total_num))
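Both evaluation helpers above read the same labelled file; the layout below is inferred from the parsing code, not taken from the file itself:

# Inferred layout of get_jdcv_data/querytrue.txt, one record per line:
#   <query>\t<terms, space-separated, most important first>
# The slice readlines()[1:159] skips a header row and anything past
# line 159, so both helpers evaluate the same 158 queries.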
Example #4
def cal_ndcg_train_data(topk=1):
    ndcg_sum = 0.0
    qw = query_weight()
    text = [
        e.strip().split("\t")
        for e in open("get_jdcv_data/label.data", encoding="utf8").readlines()
        if e.strip()
    ]
    for line in tqdm(text, total=len(text)):
        seg_line = [(preprocess_text(e.split(":")[0]), e.split(":")[1])
                    for e in line]
        # Cast the weight to float before sorting; it is parsed as a string.
        sorted_seg_line = sorted(seg_line, key=lambda d: float(d[1]), reverse=True)
        rel = {
            k: len(sorted_seg_line) - i - 1
            for i, (k, v) in enumerate(sorted_seg_line)
        }
        query = " ".join([e[0] for e in seg_line])
        dcg, idcg, ndcg = get_one_query_ndcg(qw, query, rel, topk)
        ndcg_sum += ndcg
    ndcg_avg = ndcg_sum / len(text)
    print("ndcg_avg@%d: %.3f" % (topk, ndcg_avg))
Example #5
def rank_query(query="产品策划"):
    qw = query_weight()
    # Two views of the same trained ranker: the binary booster and a
    # parsed plain-text dump (used below as a consistency check).
    booster = xgb.Booster(model_file=MODEL_FILE)
    xgb_dict = parse_xgb_dict(MODEL_FILE + '.txt')
    qw.run_step(query)  # populates qw.weight_attn / weight_idf / weight_lm
    weight_attn, weight_idf, weight_lm = qw.weight_attn, qw.weight_idf, qw.weight_lm
    sen2terms = [k for k, v in weight_attn]
    tmp, score_sum = [], 1e-8
    for term in sen2terms:
        feature_vector, _ = get_feature(term, sen2terms, weight_attn,
                                        weight_idf, weight_lm)
        feature_csr = sparse.csr_matrix(np.array(feature_vector))
        dmat = DMatrix(feature_csr)  # renamed from `input`, which shadows the builtin
        score = booster.predict(dmat)[0]
        # Consistency check: scoring via the parsed text dump should
        # closely match the booster's own raw score.
        check = predict_proba(xgb_dict, [feature_vector])
        # Squash the raw margin through a sigmoid; renormalised below so
        # the per-term weights sum to ~1 over the query.
        prob = 1.0 / (1 + math.exp(-score))
        tmp.append((term, prob))
        score_sum += prob
    res = [(k, round(v / score_sum, 3)) for k, v in tmp]
    # Return the terms ranked by normalised weight, highest first.
    return sorted(res, key=lambda d: d[1], reverse=True)
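A quick usage sketch (the query string comes from the default argument; the output shape is inferred from the code above):

# Hypothetical call: yields (term, weight) pairs with weights
# normalised to sum to ~1, in descending order of weight.
for term, weight in rank_query("产品策划"):
    print(term, weight)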
Example #6
# -*- coding: UTF-8 -*-
from tornado.web import RequestHandler, Application
from tornado.ioloop import IOLoop
from tornado.httpserver import HTTPServer
import json, logging, logging.config, re, chardet, tornado
from qw import query_weight

qw = query_weight()

log_conf_file = 'log4ic.conf'
logging.config.fileConfig(log_conf_file)
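
A client call to this service might look like the sketch below (the nested request layout, host, and port are assumptions, not taken from this page):

import json, requests

payload = {"request": {"p": {"query": "产品策划"}}}
resp = requests.post("http://localhost:8888/",
                     data=json.dumps(payload).encode("utf8"))
print(resp.json())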


class Handler(RequestHandler):
    def post(self):
        try:
            # Guess the request body's encoding; chardet may report
            # {"encoding": None}, so fall back to UTF-8 explicitly.
            detected = chardet.detect(self.request.body)
            encode_type = detected.get("encoding") or "utf-8"
            req_body = self.request.body.decode(encode_type)
            req_dict = json.loads(req_body)
            self.set_header('Content-Type', 'application/json')
            r = qw.run(req_dict)
            res = json.dumps(
                {
                    "header": {},
                    "response": {
                        "err_no": "0",