def gen_train_data(in_path, out_path):
    matchObj = re.compile(r'(.+):([0-9\.]+)', re.M | re.I)
    res, qid = [], 0
    qw = query_weight()
    text = [e.strip().split("\t") for e in open(in_path, encoding="utf8").readlines() if e.strip()]
    for ele in tqdm(text, total=len(text)):
        line_ele, sen2terms = [], []
        for e in ele:
            matchRes = matchObj.match(e)
            term, weight = matchRes.group(1), matchRes.group(2)
            line_ele.append((term, weight))
            sen2terms.append(term)
        qw.run_step(" ".join(sen2terms))
        weight_attn, weight_idf, weight_lm = qw.weight_attn, qw.weight_idf, qw.weight_lm
        # sort numerically: the weights are still strings at this point
        sorted_line_ele = sorted(line_ele, key=lambda d: float(d[1]), reverse=True)
        for rank, (term, _) in enumerate(sorted_line_ele):
            feature_vector, fmap = get_feature(term, sen2terms, weight_attn, weight_idf, weight_lm)
            # graded label: the highest-weighted term gets the largest relevance
            res.append(" ".join([str(len(sorted_line_ele) - rank - 1), "qid:" + str(qid)]
                                + [str(i + 1) + ":" + str(e) for i, e in enumerate(feature_vector)]))
        qid += 1
    print("train data length: %d" % len(res))
    with open(out_path + "train.txt", "w", encoding="utf8") as fout:
        fout.write("\n".join(res[:int(len(res) * 0.9)]))
    # test and valid share the same 10% tail split
    with open(out_path + "test.txt", "w", encoding="utf8") as fout:
        fout.write("\n".join(res[int(len(res) * 0.9):]))
    with open(out_path + "valid.txt", "w", encoding="utf8") as fout:
        fout.write("\n".join(res[int(len(res) * 0.9):]))
    with open(out_path + "feature.fmap", "w", encoding="utf8") as fout:
        fout.write("\n".join(fmap))
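# The files written above are in SVMLight-with-qid format ("label qid:N 1:v1 2:v2 ...").
# A minimal sketch of training a ranker on them, assuming xgboost and scikit-learn
# are available; the hyper-parameters here are illustrative assumptions, not the
# settings behind MODEL_FILE.
def train_ranker_sketch(out_path, model_path):
    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import load_svmlight_file

    X, y, qid = load_svmlight_file(out_path + "train.txt", query_id=True)
    dtrain = xgb.DMatrix(X, label=y)
    # rows with the same qid are consecutive, so group sizes are the qid run lengths
    _, counts = np.unique(qid, return_counts=True)
    dtrain.set_group(counts)
    bst = xgb.train({"objective": "rank:pairwise", "eta": 0.1, "max_depth": 6},
                    dtrain, num_boost_round=100)
    bst.save_model(model_path)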
def cal_ndcg_manual_data(topk=1):
    qw = query_weight()
    ndcg_sum = 0.0
    text = [e.strip().split("\t")
            for e in open("get_jdcv_data/querytrue.txt", encoding="utf8").readlines()[1:159]
            if e.strip()]
    for (query, label) in tqdm(text, total=len(text)):
        seg_label = label.split()
        rel = {e: len(seg_label) - i - 1 for i, e in enumerate(seg_label)}
        dcg, idcg, ndcg = get_one_query_ndcg(qw, query, rel, topk)
        ndcg_sum += ndcg
    ndcg_avg = ndcg_sum / len(text)
    print("ndcg_avg@%d: %.3f" % (topk, ndcg_avg))
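# get_one_query_ndcg is defined elsewhere in the repo; the sketch below shows the
# computation it presumably performs: rank the terms by the model's weights, then
# score that order against the gold relevance dict `rel` with one common
# DCG / IDCG variant (linear gain, log2 position discount).
def get_one_query_ndcg_sketch(qw, query, rel, topk):
    import math
    pred = sorted(qw.run_step(query), key=lambda d: d[1], reverse=True)[:topk]
    dcg = sum(rel.get(term, 0) / math.log2(i + 2) for i, (term, _) in enumerate(pred))
    ideal = sorted(rel.values(), reverse=True)[:topk]
    idcg = sum(r / math.log2(i + 2) for i, r in enumerate(ideal))
    ndcg = dcg / idcg if idcg > 0 else 0.0
    return dcg, idcg, ndcg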
def test():
    qw = query_weight()
    pred_num, total_num = 0, 0
    text = [e.strip().split("\t")
            for e in open("get_jdcv_data/querytrue.txt", encoding="utf8").readlines()[1:159]]
    for i, (query, label) in enumerate(text):
        res = qw.run_step(query)
        pred = sorted(res, key=lambda d: d[1], reverse=True)[0]
        if pred[0] == label.split()[0]:
            pred_num += 1
        else:
            # log mispredictions: index, query, all term weights, predicted vs gold top term
            print(str(i + 1) + "\t" + query + "\t"
                  + " ".join([k + ":" + str(v) for k, v in res])
                  + "\t" + pred[0] + "_" + label.split()[0])
        total_num += 1
    print("acc: %.3f" % (pred_num / total_num))
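# querytrue.txt is presumably a tab-separated file with a header row (hence the
# [1:159] slice): column 1 is the raw query, column 2 the space-separated terms
# ordered from most to least important, e.g. (hypothetical line):
#
#   移动医疗	医疗 移动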
def cal_ndcg_train_data(topk=1):
    ndcg_sum = 0.0
    qw = query_weight()
    text = [e.strip().split("\t")
            for e in open("get_jdcv_data/label.data", encoding="utf8").readlines()
            if e.strip()]
    for line in tqdm(text, total=len(text)):
        seg_line = [(preprocess_text(e.split(":")[0]), e.split(":")[1]) for e in line]
        # sort numerically: the labels are parsed as strings
        sorted_seg_line = sorted(seg_line, key=lambda d: float(d[1]), reverse=True)
        rel = {k: len(sorted_seg_line) - i - 1 for i, (k, v) in enumerate(sorted_seg_line)}
        query = " ".join([e[0] for e in seg_line])
        dcg, idcg, ndcg = get_one_query_ndcg(qw, query, rel, topk)
        ndcg_sum += ndcg
    ndcg_avg = ndcg_sum / len(text)
    print("ndcg_avg@%d: %.3f" % (topk, ndcg_avg))
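# label.data presumably holds one query per line, each tab-separated field a
# "term:label" pair whose numeric label encodes graded relevance, e.g.
# (hypothetical line):
#
#   产品:2	策划:1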
def rank_query(query="产品策划"):
    qw = query_weight()
    booster = xgb.Booster(model_file=MODEL_FILE)
    xgb_dict = parse_xgb_dict(MODEL_FILE + '.txt')
    res0 = qw.run_step(query)  # unsupervised weights, kept for comparison
    weight_attn, weight_idf, weight_lm = qw.weight_attn, qw.weight_idf, qw.weight_lm
    sen2terms = [k for k, v in weight_attn]
    tmp, score_sum = [], 1e-8
    for term in sen2terms:
        feature_vector, _ = get_feature(term, sen2terms, weight_attn, weight_idf, weight_lm)
        feature_csr = sparse.csr_matrix(np.array(feature_vector))
        dmat = DMatrix(feature_csr)  # avoid shadowing the builtin `input`
        score = booster.predict(dmat)[0]
        s2 = predict_proba(xgb_dict, [feature_vector])  # hand-rolled scorer, sanity check against the booster
        prob = 1.0 / (1 + math.exp(-score))  # squash the raw margin to (0, 1)
        tmp.append((term, prob))
        score_sum += prob
    # normalize so the term weights form a distribution over the query
    res = [(k, round(v / score_sum, 3)) for k, v in tmp]
    # sorted views of the unsupervised and model-based weights, handy in a debugger
    sorted_res0 = sorted(res0, key=lambda d: d[1], reverse=True)
    sorted_res = sorted(res, key=lambda d: d[1], reverse=True)
    return res
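# parse_xgb_dict / predict_proba come from elsewhere in the repo; a minimal sketch
# of the underlying idea, assuming MODEL_FILE + '.txt' was written by
# Booster.dump_model, whose node lines look like "0:[f2<0.5] yes=1,no=2,missing=1"
# or "1:leaf=0.123". The feature indexing (fN maps to feature_vector[N]) is an
# assumption, not signature-exact.
def predict_margin_sketch(dump_path, feature_vector):
    import re
    trees, nodes = [], {}
    node_pat = re.compile(r'(\d+):\[f(\d+)<([^\]]+)\] yes=(\d+),no=(\d+),missing=(\d+)')
    leaf_pat = re.compile(r'(\d+):leaf=([-\d.eE+]+)')
    for line in open(dump_path, encoding="utf8"):
        line = line.strip()
        if line.startswith("booster["):
            nodes = {}
            trees.append(nodes)  # start a new tree
            continue
        m = node_pat.match(line)
        if m:
            nid, feat, thr, yes, no, miss = m.groups()
            nodes[int(nid)] = ("split", int(feat), float(thr), int(yes), int(no), int(miss))
            continue
        m = leaf_pat.match(line)
        if m:
            nodes[int(m.group(1))] = ("leaf", float(m.group(2)))
    margin = 0.0
    for tree in trees:
        nid = 0
        while tree[nid][0] == "split":
            _, feat, thr, yes, no, miss = tree[nid]
            val = feature_vector[feat] if feat < len(feature_vector) else None
            nid = miss if val is None else (yes if val < thr else no)
        margin += tree[nid][1]
    return margin  # raw margin; rank_query squashes it with a sigmoid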
# -*- coding: UTF-8 -*-
from tornado.web import RequestHandler, Application
from tornado.ioloop import IOLoop
from tornado.httpserver import HTTPServer
import json, logging, logging.config, re, chardet, tornado

from qw import query_weight

qw = query_weight()
log_conf_file = 'log4ic.conf'
logging.config.fileConfig(log_conf_file)


class Handler(RequestHandler):
    def post(self):
        try:
            # detect the request encoding rather than assuming utf-8
            encoding = chardet.detect(self.request.body)
            encode_type = encoding.get("encoding") or "utf-8"  # chardet may return None
            req_body = self.request.body.decode(encode_type)
            req_dict = json.loads(req_body)
            self.set_header('Content-Type', 'application/json')
            r = qw.run(req_dict)
            res = json.dumps({
                "header": {},
                "response": {
                    "err_no": "0",