def delete_file(request):
    """Delete an uploaded segmentation/annotation file.

    Removes the database record for the file code posted as ``file``,
    deletes the stored upload from ``utils.DIR_UPLOADS`` and responds with
    the refreshed file listing.

    :param request: Django request; the file code is read from
        ``request.POST["file"]`` and the database name from the session.
    :return: JSON ``HttpResponse`` with the output of ``get_file`` for POST
        requests; an empty 405 response for any other method.
    """
    if request.method != "POST":
        # The listing is only built for POST; without this guard other
        # methods reached the final ``return`` with ``dic`` unbound.
        return HttpResponse(status=405)

    db_name = request.session.get(utils.SESSION_DB, "")
    filename = request.POST.get("file", "")

    # Original (user-facing) file name, only needed for the log entry.
    origin_file = ""
    for record in utils.get_database(db_name).get_file_by_filecode(filename):
        origin_file = record["file"]
    utils.logger_file_info(request.session.get(utils.SESSION_USER, ""),
                           "删除分词标注文件", db_name, origin_file)
    utils.get_database(db_name).delete_file(filename)

    # ``filename`` comes straight from the client: resolve it and refuse to
    # unlink anything that is not strictly inside the uploads directory
    # (e.g. "../settings.py", an absolute path, or the directory itself).
    uploads_root = os.path.abspath(utils.DIR_UPLOADS)
    target = os.path.abspath(os.path.join(uploads_root, filename))
    if target.startswith(uploads_root + os.sep) and os.path.exists(target):
        os.remove(target)

    files = utils.get_database(db_name).get_files()
    return HttpResponse(json.dumps(get_file(files)),
                        content_type='application/json')
def add_category(request):
    """Add an annotation category and return every category with its usage count.

    :param request: Django request; the new category is read from
        ``request.POST["newctg"]`` and the database name from the session.
    :return: JSON ``HttpResponse`` of ``{"category": <get_categories()>,
        "counts": {category: number of annotations}}`` for POST requests;
        an empty 405 response for any other method.
    """
    if request.method != "POST":
        # Previously fell through to the final ``return`` with
        # ``sug_category`` unbound.
        return HttpResponse(status=405)

    db_name = request.session.get(utils.SESSION_DB, "")
    new_ctg = request.POST.get("newctg", "")
    utils.get_database(db_name).update_categories(new_ctg)

    categories = utils.get_database(db_name).get_categories()
    counts = dict.fromkeys(categories, 0)
    for line in utils.get_database(db_name).get_suggests():
        # Annotations may reference a category that is not in the list;
        # skip them instead of raising KeyError.
        if line["sug"] in counts:
            counts[line["sug"]] += 1
    sug_category = {"category": categories, "counts": counts}

    utils.log_category_info(request.session.get(utils.SESSION_USER, ""),
                            "添加标注类型", db_name, new_ctg)
    # Tell the annotation service to reload its data.
    requests.post(utils.update_sug_url(db_name), data=db_name,
                  headers=utils.headers)
    return HttpResponse(json.dumps(sug_category),
                        content_type='application/json')
def save_suggest(request):
    """Persist submitted annotations to the database.

    The POST field ``sugs`` is a dict literal mapping each original text
    (segments joined by "/") to a dict of ``{segment: annotation}``.
    Segments whose annotation equals ``utils.UNKNOWN`` are not stored. One
    log entry is written per original text.

    :param request: Django request carrying ``sugs`` in ``request.POST``.
    :return: empty ``HttpResponse`` with content type ``text``.
    """
    if request.method == "POST":
        raw = request.POST.get("sugs", "")
        # SECURITY(review): eval() on client-supplied data executes arbitrary
        # code. Switch to ast.literal_eval()/json.loads() once the payload
        # format sent by the front end is confirmed.
        # An absent/empty field is treated as "nothing to save" instead of
        # letting eval("") raise SyntaxError.
        utility = eval(raw) if raw else {}

        db_name = request.session.get(utils.SESSION_DB, "")
        user = request.session.get(utils.SESSION_USER, "")
        origin_file = request.session.get(utils.SESSION_ORIGIN_FILE, "")
        method = "手动" if origin_file == "" else "从文件 " + origin_file

        # origin: original text, seg_sugs: {segment: annotation}
        for origin, seg_sugs in utility.items():
            msg = []
            for seg, sug in seg_sugs.items():
                if sug != utils.UNKNOWN:  # unknown annotations are not stored
                    source = utils.seperate_term(seg, origin.split("/"))
                    utils.get_database(db_name).update_sug_source(
                        seg, sug, source)
                    msg.append(seg + ":" + sug)
            utils.log_sug_info(user, method, db_name, "添加标注",
                               " || ".join(msg))
    return HttpResponse("", content_type="text")
def send_suggest(request):
    """Entry point of the annotation page: fetch annotations for posted segments.

    For manual input (no file in the session) the annotations come from
    ``get_sug_from_disk``; otherwise from ``sort_sugs_by_category``. Any
    exception is logged to ``error_logger`` and whatever was collected so
    far is still returned.

    :param request: Django request; reads ``new_segs`` and ``origin`` from
        ``request.POST`` and the file/database/user from the session.
    :return: JSON ``HttpResponse`` with the keys ``sug``, ``source``,
        ``msg``, ``all`` and ``page_count`` (fewer if an error occurred)
        for POST requests; an empty 405 response for any other method.
    """
    if request.method != "POST":
        return HttpResponse(status=405)

    # Bound before the ``try`` so the response can always be serialised,
    # even when the very first call below raises.
    info = {}
    try:
        utils.update_db(request)
        new_sugs = request.POST.get("new_segs", "")[:-1]  # drop the trailing ";"
        origin_msg = request.POST.get("origin", "")  # original text
        if origin_msg:
            origin_msg = origin_msg.split(",")
        filename = request.session.get(utils.SESSION_FILE, "")
        db_name = request.session.get(utils.SESSION_DB, "")

        if filename == "":
            # Manual input: annotation and source for each segment.
            info['sug'], info['source'], info['msg'] = get_sug_from_disk(
                new_sugs, db_name)
        else:
            # Read from file. Counting the edited entries is disabled, so
            # the stored "checked" count is always written as 0.
            edit_count = 0
            utils.get_database(db_name).update_file_checked_seg(
                filename, edit_count)
            info['sug'], info['source'], info['msg'] = sort_sugs_by_category(
                origin_msg,
                "从文件 " + request.session.get(utils.SESSION_ORIGIN_FILE, ""),
                new_sugs,
                request.session.get(utils.SESSION_USER, ""),
                db_name)

        # Every annotation with the value the UI associates with it.
        info['all'] = {}
        info['all'].update(utils.get_suggests_dic(db_name))
        info["page_count"] = utils.SUGS_PER_PAGE
    except Exception as e:
        # Top-level boundary: record message, file and line, then answer
        # with the partial result.
        exc_tb = sys.exc_info()[2]
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        output = ",".join([str(e), fname, str(exc_tb.tb_lineno)])
        error_logger.info(request.session.get("username", "") +
                          " - 标注服务错误 - " + output)
    return HttpResponse(json.dumps(info), content_type='application/json')
def delete_selected_sugs(sugs, db):
    """Delete annotated segments from the database, then refresh the service.

    :param sugs: segments/annotations to delete; passed to
        ``delete_suggests`` unchanged.
    :param db: name of the terminology set (database).
    :return: None
    """
    database = utils.get_database(db)
    database.delete_suggests(sugs)

    # Notify the annotation service so it reloads its data.
    refresh_url = utils.update_sug_url(db)
    requests.post(refresh_url, data=db, headers=utils.headers)
def upload_file(request):
    """Store an uploaded file for segmentation/annotation and register it.

    The upload is saved under a random name in ``utils.DIR_UPLOADS``, its
    number of lines (txt/csv) or rows over all sheets (xls/xlsx) is counted,
    and a file record is inserted into the session's database.

    :param request: Django request; the upload is read from
        ``request.FILES["myfile"]``.
    :return: empty ``HttpResponse``, or a response carrying a
        "please upload a file" message when no file was sent.
    """
    if request.method == "POST":
        my_file = request.FILES.get("myfile", None)
        if not my_file:
            # Checked before touching ``my_file.name`` (the original
            # dereferenced it first) and answered with a real response
            # object instead of a bare string.
            return HttpResponse("请上传文件!")

        ext = my_file.name.split(".")[-1]
        upload_filename = utils.random_string() + "." + ext
        upload_path = os.path.join(utils.DIR_UPLOADS, upload_filename)
        utils.write_to_file(my_file, upload_path, ext)

        # Stays 0 for extensions that are neither text nor spreadsheet, so
        # the record below can always be built.
        total = 0
        if ext in ("txt", "csv"):
            with open(upload_path) as uploaded:
                total = sum(1 for _ in uploaded)
        elif ext in ("xls", "xlsx"):
            workbook = xlrd.open_workbook(upload_path)
            total = sum(sheet.nrows for sheet in workbook.sheets())

        db_name = request.session.get(utils.SESSION_DB, "")
        request.session[utils.SESSION_FILE] = upload_filename
        date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        # Insert the file information into the database.
        file_dict = {
            dbinfo.FILE_FILE: my_file.name,
            dbinfo.FILE_CODE: upload_filename,
            dbinfo.FILE_TOTAL: total,
            dbinfo.FILE_DATE: date,
            dbinfo.FILE_CHECKED: 0,
        }
        utils.get_database(db_name).insert_file(file_dict)
        # Record the upload in the log.
        utils.logger_file_info(request.session.get(utils.SESSION_USER, ""),
                               "上传分词标注文件", db_name, my_file.name)
        request.session[utils.SESSION_ORIGIN_FILE] = my_file.name
    return HttpResponse("")
def check_file(request):
    """Validate an uploaded ``segment<TAB>annotation`` data file.

    The upload is written to ``tmp.csv`` and every stripped line is
    classified:

    * ``error``: not exactly two tab-separated fields, or at most one
      character long (this includes blank lines);
    * ``types``: the annotation is not one of the database's categories;
    * ``duplicate``: ``is_sug_exist`` reports the segment as already stored;
      maps segment to ``[new annotation, stored annotation]``.

    If anything raises, the message, file name and line number are written
    to ``exp.txt`` and the results collected so far are returned.

    :param request: Django request; the upload is read from
        ``request.FILES["myfile"]``.
    :return: JSON ``HttpResponse`` with the keys ``error``, ``types`` and
        ``duplicate`` (the original built this dict but never returned it).
    """
    data = {'error': [], 'types': [], 'duplicate': {}}
    try:
        upload = request.FILES.get("myfile", None)
        ext = upload.name.split(".")[-1]
        utils.write_to_file(upload, "tmp.csv", ext)

        db_name = request.session.get(utils.SESSION_DB, "")
        all_categories = utils.get_database(db_name).get_categories()

        with open("tmp.csv") as uploaded:
            lines = uploaded.readlines()
        for line in lines:
            line = line.strip()
            if len(line) <= 1:
                data['error'].append(line)
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                data['error'].append(line)  # malformed line
                continue
            seg, sug = fields
            if sug not in all_categories:
                data['types'].append(line)  # annotation type does not exist
                continue
            res, old_sug = utils.get_database(db_name).is_sug_exist(seg, sug)
            if res:
                data['duplicate'][seg] = [sug, old_sug]
    except Exception as e:
        exc_tb = sys.exc_info()[2]
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        output = ",".join([str(e), fname, str(exc_tb.tb_lineno)])
        # ``with`` guarantees the report is flushed and closed.
        with open("exp.txt", "w") as report:
            report.write(output)
    return HttpResponse(json.dumps(data), content_type='application/json')
def update_duplicate_data(seg, sug, dbname):
    """Overwrite the stored annotation of a segment that already exists.

    Used while uploading segment/annotation data: the record is inserted
    with ``cover=True``, empty sources, state "已存" and a count of 1.

    :param seg: the segment text.
    :param sug: the annotation to store for ``seg``.
    :param dbname: name of the database to write to.
    :return: None
    """
    database = utils.get_database(dbname)
    record = {
        dbinfo.SUG_SEG: seg,
        dbinfo.SUG_SUG: sug,
        dbinfo.SEG_SOURCE: "",
        dbinfo.SUG_SOURCE: "",
        dbinfo.SUG_STATE: "已存",
        dbinfo.SUG_COUNT: 1,
    }
    database.insert_suggests(record, cover=True)
def get_sug_from_disk(new_sugs, dbname):
    """Fetch annotations for manually entered segments.

    ``new_sugs`` holds several segmented sentences separated by ";"; inside
    a sentence every segment is followed by ",". The sentences are posted to
    the annotation service, empty annotations in the reply are replaced by
    the "unknown" label, and the source of each non-empty sentence is looked
    up in the database.

    :param new_sugs: segmented sentences as one string, as described above.
    :param dbname: name of the database / terminology set.
    :return: tuple ``(annotations, sources, "")``; ``annotations`` has one
        list of ``[segment, annotation]`` pairs per sentence and ``sources``
        is index-aligned with it.
    """
    # Splitting "a,b," on "," leaves an empty last item, hence the [:-1].
    sug_data = [
        [cut.replace(",", ""), cut.split(",")[:-1]]
        for cut in new_sugs.split(";")
    ]

    service_url = utils.sug_service_url(dbname)
    payload = json.dumps({"terms": sug_data, "auto_match": True})
    response = requests.post(service_url, data=payload, headers=utils.headers)
    # NOTE(review): eval() runs whatever the service sends back; consider
    # ast.literal_eval()/json.loads() once the reply format is confirmed.
    # Reply shape: [sentence, [[segment, annotation], ...]] per sentence.
    sug_list = eval(response.content.decode('utf8'))

    info_sug, res_source = [], []
    for entry in sug_list:
        pairs = []
        for term in entry[1]:
            if term[1] == "":
                term[1] = u"未知"
            pairs.append([term[0], term[1]])
        if pairs:
            info_sug.append(pairs)
            database = utils.get_database(dbname)
            res_source.append(
                database.get_sug_source([pairs], utils.MAX_TERMS)[0])
    return info_sug, res_source, ""
class ReclameAquiReclamation(Model):
    """Model exposing the ``reclamations`` collection as ``collection``.

    All connection set-up runs in the class body, i.e. once, when this class
    statement is executed. The double-underscore names are name-mangled by
    Python; ``collection`` is the only plainly named attribute.
    """
    # get_database() yields a pair: the value handed to MongoClient and the
    # key used to pick the database from the client.
    # NOTE(review): presumably a MongoDB URI and a database name - confirm.
    __uri_conn, __db_conn = get_database()
    __uri = __uri_conn
    __client = MongoClient(__uri)
    __db = __client[__db_conn]
    collection = __db.get_collection('reclamations')
def delete_sug_category(request):
    """Delete an annotation category together with the segments filed under it.

    On POST the category named in ``request.POST["category"]`` is removed,
    its items are deleted, the annotation service is refreshed and the
    action is logged. Other methods do nothing.

    :param request: Django request.
    :return: empty ``HttpResponse``.
    """
    if request.method != "POST":
        return HttpResponse("", content_type='application/test')

    session = request.session
    db_name = session.get(utils.SESSION_DB, "")
    category = request.POST.get("category", "")

    utils.get_database(db_name).delete_category(category)
    utils.get_database(db_name).delete_items_by_sug(category)

    # Refresh the annotation service.
    refresh_url = utils.update_sug_url(db_name)
    requests.post(refresh_url, data=db_name, headers=utils.headers)

    user = session.get(utils.SESSION_USER, "")
    utils.log_category_info(user, "删除标注类型", db_name, category)
    return HttpResponse("", content_type='application/test')
def get_sug_source(dbname, terms):
    """Load the annotation sources for one page of segments.

    :param dbname: database name taken from the session.
    :param terms: the segment groups of the current page, one list per
        sentence.
    :return: whatever the database's ``get_sug_source`` returns for
        ``terms`` with the limit ``utils.MAX_TERMS``.
    """
    database = utils.get_database(dbname)
    return database.get_sug_source(terms, utils.MAX_TERMS)
def download_sug(request):
    """Return the newly added annotations as plain text.

    Each record from ``get_new_suggests`` becomes one ``segment-annotation``
    line terminated by a newline.

    :param request: Django request; the database name is read from the
        session.
    :return: text ``HttpResponse`` for POST requests; an empty 405 response
        for any other method (``data`` used to be unbound in that case).
    """
    if request.method != "POST":
        return HttpResponse(status=405)

    db_name = request.session.get(utils.SESSION_DB, "")
    segs = utils.get_database(db_name).get_new_suggests()
    # One join instead of growing the string with ``+=`` per record.
    data = "".join(f["seg"] + "-" + f["sug"] + "\n" for f in segs)
    return HttpResponse(data, content_type='text')
def get_all_suggests(request):
    """Send every annotation category and its usage count to the edit page.

    :param request: Django request; the database name is read from the
        session after ``utils.update_db`` has run.
    :return: JSON ``HttpResponse`` of ``{"category": <get_categories()>,
        "counts": {category: number of annotations}}``, the same payload
        ``add_category`` returns. The original computed these values but
        never returned them.
    """
    utils.update_db(request)
    db_name = request.session.get(utils.SESSION_DB, "")

    sug_category = {
        "category": utils.get_database(db_name).get_categories(),
    }
    count = dict.fromkeys(sug_category["category"], 0)
    for line in utils.get_database(db_name).get_suggests():
        sug = line["sug"]
        if sug in count:
            count[sug] += 1
        else:
            # Annotation whose category is not in the list: report it, as
            # the original did, and leave it out of the counts.
            print(sug)
    sug_category["counts"] = count
    return HttpResponse(json.dumps(sug_category),
                        content_type='application/json')
def update_segs_sugs(request):
    """Change the annotation of existing (original) segments.

    For every ``segment: annotation`` pair in the POST field ``msg`` the
    previous annotation is looked up and logged first; then all pairs are
    written in one call and both services are refreshed.

    :param request: Django request; ``msg`` is a dict literal such as
        ``{"segment": "annotation"}``.
    :return: JSON ``HttpResponse`` with the output of ``init_origin_data``
        for POST requests; an empty 405 response for any other method
        (``data`` used to be unbound in that case).
    """
    if request.method != "POST":
        return HttpResponse(status=405)

    db_name = request.session.get(utils.SESSION_DB, "")
    user = request.session.get(utils.SESSION_USER, "")

    raw = request.POST.get("msg", "")
    # SECURITY(review): eval() on client-supplied data executes arbitrary
    # code. Switch to ast.literal_eval()/json.loads() once the payload
    # format sent by the front end is confirmed.
    # An absent/empty field means "no changes" instead of a SyntaxError.
    msgs = eval(raw) if raw else {}

    logger = logging.getLogger(utils.SUGGEST_LOG)
    # Log old => new before the database is touched, while the previous
    # annotation is still available.
    for seg, sug in msgs.items():
        old_sug = utils.get_database(db_name).get_suggest_from_seg(
            seg)[0]["sug"]
        utils.log_data_info(logger, user, "手动", db_name, "更新原始数据",
                            seg + ":" + old_sug + "=>" + seg + ":" + sug)
    utils.get_database(db_name).update_single_sug_category(msgs)

    requests.post(utils.update_sug_url(db_name), data="",
                  headers=utils.headers)
    requests.post(utils.update_seg_url(db_name), data="",
                  headers=utils.headers)

    data = init_origin_data({}, db_name)
    return HttpResponse(json.dumps(data), content_type='application/json')
def add_sugs(sugs, db):
    """Mark annotations as stored and refresh the matching annotation service.

    :param sugs: annotations passed to the database's ``update_sug_state``.
    :param db: database name; a service URL is configured for "zhenduan"
        and "shoushu" only.
    :return: the result of ``update_sug_state`` (whether existing
        annotations were overwritten; used by callers for logging).
    """
    res = utils.get_database(db).update_sug_state(sugs)

    if db == "zhenduan":
        url = utils.sug_service_url_zd
    elif db == "shoushu":
        url = utils.sug_service_url_ss
    else:
        # No service is known for this database. The original left ``url``
        # unbound here and raised after the database had been updated.
        url = None

    if url is not None:
        requests.post(url, data=db, headers=utils.headers)
    return res
def get_files(request):
    """Return all uploaded files plus the page size for the file list.

    :param request: Django request; the database name is read from the
        session after ``utils.update_db`` has run.
    :return: JSON ``HttpResponse`` with the output of ``get_file`` extended
        by ``page_count`` (``basic.ITEM_PER_PAGE_FILE`` from
        ``config.json``) for POST requests; an empty 405 response for any
        other method (``dic`` used to be unbound in that case).
    """
    if request.method != "POST":
        return HttpResponse(status=405)

    utils.update_db(request)
    db_name = request.session.get(utils.SESSION_DB, "")
    dic = get_file(utils.get_database(db_name).get_files())

    # Closed deterministically; the original never closed the handle.
    with open("config.json") as config_file:
        config = json.load(config_file)
    dic["page_count"] = config['basic']['ITEM_PER_PAGE_FILE']
    return HttpResponse(json.dumps(dic), content_type='application/json')
def select_file(request):
    """Make a previously uploaded file the session's current file.

    On POST the posted file code is stored in the session together with the
    matching original file name (empty when no record is found) and the
    ``segfile`` flag is set to 1. Other methods change nothing.

    :param request: Django request; the file code is read from
        ``request.POST["file"]``.
    :return: empty ``HttpResponse`` with content type ``text``.
    """
    if request.method != "POST":
        return HttpResponse("", content_type='text')

    session = request.session
    file_code = request.POST.get("file", "")  # randomly generated file name
    session[utils.SESSION_FILE] = file_code

    database = utils.get_database(session.get(utils.SESSION_DB, ""))
    matches = database.get_file_by_filecode(file_code)

    # Original file name; the last matching record wins.
    original_name = ""
    for record in matches:
        original_name = record["file"]

    session[utils.SESSION_ORIGIN_FILE] = original_name
    session["segfile"] = 1
    return HttpResponse("", content_type='text')
def refresh_datafile_sug(data, db):
    """Attach the newly added annotation records to ``data``.

    Each record from ``get_new_suggests`` is copied without its ``_id``
    field (which cannot be serialised to JSON) and stored under a running
    number starting at 1.

    :param data: dict to extend; it receives the key ``sugs``.
    :param db: name of the database.
    :return: the same ``data`` dict.
    """
    fresh = utils.get_database(db).get_new_suggests()
    data["sugs"] = {
        position: {
            key: value
            for key, value in record.items()
            if not key == u'_id'
        }
        for position, record in enumerate(fresh, 1)
    }
    return data
def init_origin_data(data, db):
    """Load every segment/annotation record and the list of annotations.

    :param data: dict to fill; ``items`` and ``all_sug`` are (re)written.
    :param db: name of the current terminology set (database).
    :return: the same ``data`` dict with ``items`` mapping a running index
        (from 0) to each record stripped of ``_id``, and ``all_sug`` set to
        ``utils.get_suggestions(db)``.
    """
    data["items"] = {}
    records = utils.get_database(db).get_suggests()
    for index, record in enumerate(records):
        # The record is modified in place, as before; ``_id`` is removed
        # from the payload.
        del record[u"_id"]
        data["items"][index] = record
    data["all_sug"] = utils.get_suggestions(db)
    return data
def update_sug_category(request):
    """Rename annotation categories and re-label the affected segments.

    ``origin_ctg`` and ``new_ctg`` are index-aligned lists of category
    names. For every position where the names differ, the new name is
    registered if needed, existing annotations are moved to it, the old
    category is deleted and the rename is logged.

    :param request: Django request carrying both lists as list literals in
        ``request.POST``.
    :return: empty ``HttpResponse``.
    """
    if request.method == "POST":
        raw_origin = request.POST.get("origin_ctg", "")
        raw_new = request.POST.get("new_ctg", "")
        # SECURITY(review): eval() on client-supplied data executes
        # arbitrary code. Switch to ast.literal_eval()/json.loads() once
        # the payload format sent by the front end is confirmed.
        # Absent/empty fields mean "no renames" instead of a SyntaxError.
        origin_ctg = eval(raw_origin) if raw_origin else []
        new_ctg = eval(raw_new) if raw_new else []

        db_name = request.session.get(utils.SESSION_DB, "")
        user = request.session.get(utils.SESSION_USER, "")

        # NOTE(review): ``updates`` accumulates, so renames from earlier
        # positions are sent again with every later call - kept as in the
        # original; confirm whether a per-rename dict is intended.
        updates = {}
        # zip() pairs the names and simply stops at the shorter list.
        for old_name, new_name in zip(origin_ctg, new_ctg):
            if old_name == new_name:
                continue
            if new_name not in origin_ctg:
                utils.get_database(db_name).update_categories(new_name)
            updates[old_name.decode('utf8')] = new_name.decode('utf8')
            utils.get_database(db_name).update_sug_category(updates)
            utils.get_database(db_name).delete_category(old_name)
            # The original concatenated the two *lists* with "=>", which
            # raises TypeError; log the pair that actually changed.
            utils.log_category_info(user, "更新标注类型", db_name,
                                    old_name + "=>" + new_name)
    return HttpResponse("", content_type='application/text')
def sort_sugs_by_category(origin_msg, origin_file, new_segs, username, dbname):
    '''
    Annotate segments read from a file and group the sentences by kind.

    Intended priority of the result: sentences containing an "unknown"
    annotation, then sentences with predicted annotations, then "other".
    :param origin_msg: original text (not used in this function)
    :param origin_file: description of the input origin, used for logging
    :param new_segs: segmented sentences joined by utils.SEP; inside a
        sentence every segment is followed by ","
    :param username: user name, used for logging
    :param dbname: name of the database / terminology set
    :return: (annotations, sources, originals); annotations holds one list
        of [segment, annotation] pairs per sentence, originals the sentence
        text returned by the service, sources the result of get_sug_source
    '''
    new_segs = new_segs.split(utils.SEP)
    sug_data = []
    for origin_cut in new_segs:
        # Rebound on every pass, so only the list created for the last
        # sentence is still referenced after this loop.
        terms = []
        utils.log_sug_info(username, origin_file, dbname, "添加分词", origin_cut[:-1].replace(",", "/"))
        sugs = [c1 for c1 in origin_cut.split(",")]
        sentence = origin_cut.replace(",", "")
        sug_data.append([sentence, sugs[:-1]])  # the last element of sugs is empty
    url = utils.sug_service_url(dbname)
    # NOTE(review): eval() executes whatever the service returns; consider
    # ast.literal_eval()/json.loads() once the reply format is confirmed.
    sug_list = eval(requests.post(url, data=json.dumps({"terms": sug_data, "auto_match": True}), headers=utils.headers).content.decode('utf8'))
    # Reply shape: [sentence, [[segment, annotation], ...]] per sentence.
    unknown, unknown_source, unknown_msg = [], [], []
    for sugs in sug_list:
        is_unknown, is_predict, is_others = False, False, False
        # These buckets are created anew for each sentence and merged into
        # the unknown* lists at the end of the same iteration, so the merged
        # lists keep the order in which the service returned the sentences.
        # NOTE(review): the documented priority ordering would need buckets
        # that live across iterations - confirm the intended behaviour.
        unknown_predict, others, rest = [], [], []
        unknown_predict_source, others_source, rest_source = [], [], []
        unknown_predict_msg, others_msg, rest_msg = [], [], []
        origin = sugs[0]
        tmp = []
        for sug in sugs[1]:  # sug: [segment, annotation]
            if sug[1] == "":
                # An empty annotation is replaced by the "unknown" label.
                sug[1] = u"未知"
                is_unknown = True
            else:
                is_predict = True  # sources can be shown for non-unknown terms
            if sug[1] == u"其他" or sug[1] == '其他':
                is_others = True
            tmp.append(sug)
            # ``terms`` is the single list left over from the first loop:
            # every sentence appends to it and the same object is what gets
            # appended to the *_source lists below.
            terms.append(sug)
        # Every term sets is_unknown or is_predict, so the is_others branch
        # cannot be reached and the final else only runs for a sentence
        # without terms.
        if is_unknown:
            unknown.append(tmp)
            unknown_source.append(terms)
            unknown_msg.append(origin)
        elif is_predict:
            unknown_predict.append(tmp)
            unknown_predict_source.append(terms)
            unknown_predict_msg.append(origin)
        elif is_others:
            others.append(tmp)
            others_source.append(terms)
            others_msg.append(origin)
        else:
            rest.append(tmp)
            rest_source.append(terms)
            rest_msg.append(origin)
        unknown.extend(unknown_predict)
        unknown.extend(others)
        unknown.extend(rest)
        unknown_source.extend(unknown_predict_source)
        unknown_source.extend(others_source)
        unknown_source.extend(rest_source)
        unknown_msg.extend(unknown_predict_msg)
        unknown_msg.extend(others_msg)
        unknown_msg.extend(rest_msg)
    source = utils.get_database(dbname).get_sug_source(unknown_source, utils.MAX_TERMS)
    return unknown, source, unknown_msg
def upload_data_file(request):
    """Import segment/annotation pairs from the previously checked upload.

    Reads ``tmp.csv`` (one ``segment<TAB>annotation`` per line). Lines that
    are at most one character long, do not have exactly two fields, or name
    an annotation outside the database's categories are skipped. Every
    accepted pair is passed to ``insert_suggests`` with state "已存" and
    logged; afterwards both services are refreshed and the session's cached
    data is rebuilt.

    :param request: Django request; ``request.POST["checked"]`` decides
        whether an existing segment's annotation is overwritten ("0" means
        no, anything else means yes).
    :return: JSON ``HttpResponse`` with the output of ``init_origin_data``.
    """
    db_name = request.session.get(utils.SESSION_DB, "")
    user = request.session.get(utils.SESSION_USER, "")

    # Same segment, different annotation: overwrite the stored data or not.
    cover = request.POST.get("checked", "") != '0'
    all_categories = utils.get_database(db_name).get_categories()

    # Read inside ``with`` so the handle is closed; the original leaked it.
    with open("tmp.csv") as uploaded:
        lines = uploaded.readlines()

    for line in lines:
        line = line.strip()
        if len(line) <= 1:
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            continue
        seg, sug = fields
        if sug not in all_categories:
            # Only annotations of an existing category are accepted.
            continue
        utils.get_database(db_name).insert_suggests(
            {
                dbinfo.SUG_SEG: seg,
                dbinfo.SUG_SUG: sug,
                dbinfo.SEG_SOURCE: "",
                dbinfo.SUG_SOURCE: "",
                dbinfo.SUG_STATE: "已存",
                dbinfo.SUG_COUNT: 1,
            },
            cover=cover)
        utils.log_sug_info(user, "从数据文件", db_name, "添加标注",
                           seg + ":" + sug)

    requests.post(utils.update_seg_url(db_name), data="",
                  headers=utils.headers)
    requests.post(utils.update_sug_url(db_name), data="",
                  headers=utils.headers)

    # Reload the data and cache it in the session, grouped by annotation.
    data = init_origin_data({}, db_name)
    request.session[utils.SESSION_ALLDATA] = build_sug_dict(data["items"])
    return HttpResponse(json.dumps(data), content_type='application/json')
def delete_segs_sugs(request):
    """Delete segment/annotation records, newly added or original.

    The POST field ``msg`` is a dict literal in one of two shapes:

    * ``{"sug": "seg-sug,seg-sug,..."}`` deletes newly added data; the last
      "-" of each item separates segment and annotation, because a segment
      may itself contain "-".
    * ``{0: {"seg": ..., "sug": ...}, 1: {...}}`` deletes original data.

    Every deletion is logged. Afterwards the data is reloaded, cached in the
    session and the services are refreshed.

    :param request: Django request carrying ``msg`` in ``request.POST``.
    :return: JSON ``HttpResponse`` with the refreshed data for POST
        requests; an empty 405 response for any other method (``data`` used
        to be unbound in that case).
    """
    if request.method != "POST":
        return HttpResponse(status=405)

    db_name = request.session.get(utils.SESSION_DB, "")
    user = request.session.get(utils.SESSION_USER, "")
    logger = logging.getLogger(utils.SUGGEST_LOG)

    raw = request.POST.get("msg", "")
    # SECURITY(review): eval() on client-supplied data executes arbitrary
    # code. Switch to ast.literal_eval()/json.loads() once the payload
    # format sent by the front end is confirmed.
    # An absent/empty field means "nothing to delete", not a SyntaxError.
    msgs = eval(raw) if raw else {}

    if "sug" in msgs:
        # Delete newly added data.
        for item in msgs["sug"].split(","):
            if not item:
                continue
            idx = item.rfind("-")
            # A fresh dict per item instead of one shared, mutated dict.
            entry = {'seg': item[:idx], 'sug': item[idx + 1:]}
            utils.get_database(db_name).delete_suggests(entry)
            utils.log_data_info(logger, user, "手动", db_name, "删除新增数据",
                                entry['seg'] + ":" + entry['sug'])
    else:
        # Delete original data.
        for entry in msgs.values():
            utils.get_database(db_name).delete_suggests(entry)
            utils.log_data_info(logger, user, "手动", db_name, "删除原始数据",
                                entry["seg"] + ":" + entry["sug"])

    data = init_origin_data({}, db_name)
    data = refresh_datafile_sug(data, db_name)
    # Cache the data in the session, grouped by annotation.
    request.session[utils.SESSION_ALLDATA] = build_sug_dict(data["items"])

    # Refresh the services.
    requests.post(utils.update_seg_url(db_name), data="",
                  headers=utils.headers)
    requests.post(utils.update_sug_url(db_name), data="",
                  headers=utils.headers)

    # Per-database service endpoints exist for these two names only; for
    # any other name the original left ``url`` unbound and raised.
    if db_name == "zhenduan":
        service_urls = (utils.seg_service_url_zd, utils.sug_service_url_zd)
    elif db_name == "shoushu":
        service_urls = (utils.seg_service_url_ss, utils.sug_service_url_ss)
    else:
        service_urls = ()
    for url in service_urls:
        requests.post(url, data=db_name, headers=utils.headers)

    return HttpResponse(json.dumps(data), content_type='application/json')