def get_mark_job_data_by_ids(self, mark_job_ids, args, doc_type_key="doc_type", prefix='NER'):
    items = []
    for mark_job_id in mark_job_ids:
        doc_type = DocTypeModel().get_by_mark_job_id(mark_job_id)
        result = {
            "prefix": prefix,
            # TODO: confirm with MQ whether this parameter shape is compatible
            doc_type_key: DocTypeSchema().dump(doc_type),
            "docs": [],
            "tasks": [],
            "mark_job_id": mark_job_id,
        }
        data = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids([mark_job_id])
        for task, doc in data:
            # the filter branches are mutually exclusive; the original chained a
            # second `if` here, which appended each doc twice when only
            # doc_term_ids was given
            # extraction filter
            if args.get('doc_term_ids'):
                if isinstance(task.mark_task_result, list) \
                        and Common.check_doc_term_include(task.mark_task_result, 'doc_term_id',
                                                          args['doc_term_ids']):
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            # entity-relation filter
            elif args.get('doc_relation_ids'):
                if isinstance(task.mark_task_result, list) and Common.check_doc_relation_include(
                        task.mark_task_result, 'relation_id', args['doc_relation_ids']):
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            else:
                result['docs'].append(DocSchema().dump(doc))
                result['tasks'].append(MarkTaskSchema().dump(task))
        items.append(result)
    return items
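
# A minimal sketch of the contract assumed for Common.check_doc_term_include
# above: mark_task_result is taken to be a list of dicts, and the helper reports
# whether any item carries one of the requested ids under the given key. This
# hypothetical version is for illustration only; the real helper lives in Common.
def check_doc_term_include_sketch(mark_task_result, key, wanted_ids):
    wanted = {str(i) for i in wanted_ids}  # normalize ids for comparison
    return any(str(item.get(key)) in wanted for item in mark_task_result)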
def add_file_handler(*loggers: logging.Logger) -> None:
    log_dir_path = os.path.join(BASE_PATH, 'logs')
    Common.make_dirs(log_dir_path)
    log_file_path = os.path.join(
        log_dir_path,
        time.strftime('%Y-%m-%d', time.localtime(time.time())) + '.log')
    # rotate the file daily, keeping 7 backups
    file_handler = TimedRotatingFileHandler(
        log_file_path, when='D', interval=1, backupCount=7,
        encoding=None, delay=False, utc=False)
    file_handler.setFormatter(logging.Formatter(
        '%(request_id)s - %(asctime)s - %(levelname)s - %(filename)s'
        ' - %(funcName)s - %(lineno)s - %(message)s'))
    file_handler.addFilter(RequestIdFilter())
    file_handler.setLevel(logging.INFO)
    for logger_ in loggers:
        logger_.addHandler(file_handler)
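
# Usage sketch for add_file_handler, assuming RequestIdFilter (as above) injects
# a request_id attribute on each record so the formatter's %(request_id)s
# placeholder resolves; the logger names here are hypothetical.
import logging

app_logger = logging.getLogger('app')
task_logger = logging.getLogger('tasks')
add_file_handler(app_logger, task_logger)
app_logger.info('file handler attached')  # written to logs/<YYYY-MM-DD>.log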
def get_user_task_with_doc_and_doc_type(nlp_task_id, current_user: CurrentUser, args):
    q = session.query(UserTask, DocType, Doc) \
        .join(MarkTask, MarkTask.mark_task_id == UserTask.mark_task_id) \
        .join(MarkJob, MarkJob.mark_job_id == MarkTask.mark_job_id) \
        .join(DocType, DocType.doc_type_id == MarkJob.doc_type_id) \
        .join(Doc, Doc.doc_id == MarkTask.doc_id) \
        .filter(
            DocType.nlp_task_id == nlp_task_id,
            ~UserTask.is_deleted,
            ~MarkTask.is_deleted,
            ~Doc.is_deleted
        )
    # TODO: permission filtering
    if current_user.user_role in [RoleEnum.manager.value, RoleEnum.guest.value]:
        q = q.filter(DocType.group_id.in_(current_user.user_groups))
    elif current_user.user_role in [RoleEnum.reviewer.value]:
        q = q.filter(func.json_contains(MarkJob.reviewer_ids, str(current_user.user_id)))
    elif current_user.user_role in [RoleEnum.annotator.value]:
        # q = q.filter(func.json_contains(MarkJob.annotator_ids, str(current_user.user_id)))
        q = q.filter(UserTask.annotator_id == current_user.user_id)
    if args.get('job_id'):
        q = q.filter(MarkTask.mark_job_id == args['job_id'])
    if args.get('doc_type_id'):
        q = q.filter(MarkJob.doc_type_id == args['doc_type_id'])
    if args['task_state']:
        q = q.filter(MarkTask.mark_task_status == status_str2int_mapper().get(args['task_state']))
    if args['query']:
        q = q.filter(Doc.doc_raw_name.like(f'%{args["query"]}%'))
    q = q.group_by(UserTask)
    count = q.count()
    processing_count = q.filter(MarkTask.mark_task_status == int(StatusEnum.processing)).count()
    if args['order_by'] and isinstance(args['order_by'], str):
        if args['order_by'][1:] == 'task_id':
            args['order_by'] = args['order_by'][0] + 'mark_task_id'
        q = Common().order_by_model_fields(q, UserTask, [args['order_by']])
    items = []
    for user_task, doc_type, doc in q.offset(args['offset']).limit(args['limit']).all():
        user_task.doc = doc
        user_task.doc_type = doc_type
        items.append(user_task)
    return count, count - processing_count, items
def get(self):
    nlp_task_id = Common().get_nlp_task_id_by_route()
    result = DocTypeService().get_doc_type_info_by_nlp_task_by_user(
        nlp_task_id=nlp_task_id, current_user=self.get_current_user())
    return {
        "message": "请求成功",
        "result": result,
    }, 200
def get(self: Resource) -> typing.Tuple[typing.Dict, int]:
    """
    Get all terms, paginated; exclude_terms_ids optionally filters terms out.
    """
    result = Common().get_wordseg_doc_terms()
    return {
        "message": "请求成功",
        "result": result
    }, 200
def get(self: Resource, job_id: int) -> typing.Tuple[typing.Dict, int]:
    nlp_task_id = Common().get_nlp_task_id_by_route()
    # get predict job; get_current_user must be called, not passed as a bound method
    predict_job = PredictService().get_predict_job_by_id(
        nlp_task_id=nlp_task_id, predict_job_id=job_id,
        current_user=self.get_current_user())
    result = PredictJobSchema().dump(predict_job)
    return {"message": "请求成功", "result": result}, 200
def get(self: Resource, args: typing.Dict, job_id: int) -> typing.Tuple[typing.Dict, int]:
    nlp_task_id = Common().get_nlp_task_id_by_route()
    file_path = PredictService().export_predict_file(
        nlp_task_id=nlp_task_id, predict_job_id=job_id, offset=args["offset"])
    return {"message": "请求成功", "file_path": file_path}, 200
def post(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
    """
    Create a doc type together with its terms.
    """
    args.update({'nlp_task_id': Common().get_nlp_task_id_by_route()})
    args.update({"group_id": self.get_current_user().user_groups[0]})
    result = DocTypeService().create_doc_type(self.get_current_user(), args)
    return {
        "message": "创建成功",
        "result": result,
    }, 201
def post(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
    files = args['files']
    job_type = Common().check_job_type_by_files(files)
    if not job_type:
        abort(400, message='请上传全部纯文本文档(txt/csv)或者全部电子文档(pdf/word文档)')
    args['mark_job_type'] = job_type
    result = MarkJobService().create_mark_job(files, NlpTaskEnum.relation, args)
    return {"message": "创建成功", "result": result}, 201
def get(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
    """
    Get all terms, paginated; exclude_terms_ids optionally filters terms out.
    """
    nlp_task_id = Common().get_nlp_task_id_by_route()
    args.update({'nlp_task_id': nlp_task_id})
    result, count = DocTermService().get_doc_term_list(args)
    return {
        "message": "请求成功",
        "result": result,
        "count": count,
    }, 200
def get(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
    """
    Get all doc types for the current task.
    :param args:
    :return:
    """
    nlp_task_id = Common().get_nlp_task_id_by_route()
    args.update({'nlp_task_id': nlp_task_id})
    result, count = DocTypeService().get_doc_type(self.get_current_user(), args)
    return {
        "message": "请求成功",
        "result": result,
        "count": count,
    }, 200
def post(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
    files = args["files"]
    assign_mode = args["assign_mode"]
    if assign_mode == AssignModeEnum.together:
        abort(400, message="不支持共同标注")
    job_type = Common().check_job_type_by_files(files)
    if job_type != "text":
        abort(400, message="请上传纯文本文档(txt/csv)")
    args['mark_job_type'] = job_type
    try:
        result = MarkJobService().create_mark_job(files, NlpTaskEnum.wordseg, args)
        return {"message": "创建成功", "result": result}, 201
    except TypeError:
        abort(400, message="上传文件类型错误")
def export_multi_mark_file(nlp_task_id, mark_job_id_list):
    mark_job_list = MarkJobModel().get_by_mark_job_id_list(mark_job_id_list=mark_job_id_list)
    # name the export directory after the job ids and a timestamp
    export_dir_path = os.path.join(
        'upload/export',
        'classify_mark_job_{}_{}'.format(
            ','.join([str(job_id) for job_id in mark_job_id_list]),
            datetime.now().strftime("%Y%m%d%H%M%S")))
    os.mkdir(export_dir_path)
    # get all (count, status, mark_job_id) tuples
    all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=mark_job_id_list)
    # convert to a nested dict
    all_status_dict = Common().tuple_list2dict(all_count)
    for mark_job in mark_job_list:  # iterate over all jobs
        if mark_job.mark_job_status not in (StatusEnum.success, StatusEnum.approved):
            # skip unsuccessful jobs
            continue
        # skip jobs whose tasks are not all approved; the original condition was
        # missing the negation and skipped exactly the fully approved jobs
        if not (len(all_status_dict[mark_job.mark_job_id]) == 1 and (
                int(StatusEnum.approved) in all_status_dict[mark_job.mark_job_id])):
            continue
        export_file_path = os.path.join(
            'upload/export',
            '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job.mark_job_id))
        # check the previous export; if nothing has changed since, reuse it directly
        last_exported_file = export_sync.get_last_export_file(
            job=mark_job, export_file_path=export_file_path)
        if last_exported_file:
            shutil.copy(
                last_exported_file,
                os.path.join(export_dir_path, '标注任务{}.csv'.format(mark_job.mark_job_id)))
            continue
        # otherwise rebuild the export file
        export_fileset = FileSet(folder=export_file_path)
        mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(
            mark_job_ids=[mark_job.mark_job_id])
        file_path = export_sync.generate_classify_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset)
        shutil.copy(file_path,
                    os.path.join(export_dir_path, '标注任务{}.csv'.format(mark_job.mark_job_id)))
    if not os.listdir(export_dir_path):
        raise ValueError("所选标注任务中没有完成审核的任务,请重新选择")
    shutil.make_archive(export_dir_path, 'zip', export_dir_path)  # pack the directory
    return export_dir_path + ".zip"
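
# A minimal sketch of the shape Common().tuple_list2dict is assumed to produce:
# (value, inner_key, outer_key) rows become {outer_key: {inner_key: value}}. For
# the export checks above, (count, status, mark_job_id) rows become
# {mark_job_id: {status: count}}, so "all tasks approved" shows up as an inner
# dict whose only key is the approved status. Hypothetical helper for
# illustration only; the real one lives in Common.
def tuple_list2dict_sketch(rows):
    nested = {}
    for value, inner_key, outer_key in rows:
        nested.setdefault(outer_key, {})[inner_key] = value
    return nested

# e.g. [(3, 5, 101), (1, 2, 102), (2, 5, 102)] ->
# {101: {5: 3}, 102: {2: 1, 5: 2}}: job 101 has a single (approved) status,
# job 102 has two statuses and is therefore not fully approved.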
def get(self: Resource, args: Dict[str, Any]) -> Tuple[Dict[str, Any], int]:
    """
    Get model training records, paginated.
    """
    nlp_task_id = Common().get_nlp_task_id_by_route()
    count, train_job_list = ModelService().get_train_job_list_by_nlp_task_id(
        nlp_task_id=nlp_task_id,
        doc_type_id=args['doc_type_id'],
        search=args['query'],
        offset=args['offset'],
        limit=args['limit'],
        current_user=self.get_current_user())
    result = TrainJobSchema().dump(train_job_list, many=True)
    return {
        "message": "请求成功",
        "result": result,
        "count": count,
    }, 200
def get(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
    nlp_task_id = Common().get_nlp_task_id_by_route()
    # args["order_by"] is a signed field name, e.g. "-created_time"
    order_by = args["order_by"][1:]
    order_by_desc = args["order_by"][0] == "-"
    count, predict_job_list = PredictService().get_predict_job_list_by_nlp_task_id(
        nlp_task_id=nlp_task_id,
        doc_type_id=args['doc_type_id'],
        search=args['query'],
        order_by=order_by,
        order_by_desc=order_by_desc,
        offset=args['offset'],
        limit=args['limit'],
        current_user=self.get_current_user())
    # get the serialized result
    result = PredictJobSchema().dump(predict_job_list, many=True)
    return {
        "message": "请求成功",
        "result": result,
        "count": count,
    }, 200
def get_preview_and_next_mark_task_id(current_user, nlp_task_id, task_id, args):
    q = session.query(MarkTask.mark_task_id) \
        .outerjoin(UserTask, UserTask.mark_task_id == MarkTask.mark_task_id) \
        .join(MarkJob, MarkJob.mark_job_id == MarkTask.mark_job_id) \
        .join(DocType, DocType.doc_type_id == MarkJob.doc_type_id) \
        .join(Doc, Doc.doc_id == MarkTask.doc_id) \
        .filter(
            DocType.nlp_task_id == nlp_task_id,
            MarkTask.mark_task_status != int(StatusEnum.processing),
            ~MarkTask.is_deleted,
            or_(~UserTask.is_deleted, UserTask.is_deleted.is_(None)),
            ~MarkJob.is_deleted,
            ~DocType.is_deleted
        )
    # Doc is joined above so the doc_raw_name filter below does not cross-join;
    # the original query filtered on Doc without joining it
    if args.get('job_id'):
        q = q.filter(MarkJob.mark_job_id == args['job_id'])
    if args.get("task_state"):
        q = q.filter(MarkTask.mark_task_status == args.get("task_state"))
    if args.get("query"):
        q = q.filter(Doc.doc_raw_name.contains(args.get("query")))
    if current_user.user_role in [RoleEnum.manager.value, RoleEnum.guest.value]:
        q = q.filter(DocType.group_id.in_(current_user.user_groups))
    elif current_user.user_role in [RoleEnum.reviewer.value]:
        q = q.filter(func.json_contains(MarkJob.reviewer_ids, str(current_user.user_id)))
    elif current_user.user_role in [RoleEnum.annotator.value]:
        q = q.filter(func.json_contains(MarkJob.annotator_ids, str(current_user.user_id)))
    # tasks are assumed to be listed in descending id order, so the "preview"
    # (previous) neighbour is the nearest larger id and "next" the nearest smaller
    q1 = Common().order_by_model_fields(
        q.filter(MarkTask.mark_task_id < task_id), MarkTask, ['-mark_task_id'])
    q2 = Common().order_by_model_fields(
        q.filter(MarkTask.mark_task_id > task_id), MarkTask, ['+mark_task_id'])
    next_task_id = q1.limit(1).first()
    preview_task_id = q2.limit(1).first()
    return (preview_task_id[0] if preview_task_id else None,
            next_task_id[0] if next_task_id else None)
def export_mark_file(nlp_task_id, mark_job_id, offset=50):
    mark_job = MarkJobModel().get_by_id(mark_job_id)
    if mark_job.mark_job_status not in (StatusEnum.approved, StatusEnum.success):
        abort(400, message="有失败或未完成任务,不能导出")
    all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=[mark_job_id])
    # convert (count, status, mark_job_id) tuples to a nested dict
    all_status_dict = Common().tuple_list2dict(all_count)
    if not (len(all_status_dict[mark_job_id]) == 1
            and int(StatusEnum.approved) in all_status_dict[mark_job_id]):
        abort(400, message="有未标注或未审核任务,不能导出")
    export_file_path = os.path.join(
        'upload/export', '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job_id))
    # check the previous export; if nothing has changed since, reuse it directly
    last_exported_file = export_sync.get_last_export_file(
        job=mark_job, export_file_path=export_file_path)
    if last_exported_file:
        return last_exported_file
    # otherwise rebuild the export file
    export_fileset = FileSet(folder=export_file_path)
    mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(
        mark_job_ids=[mark_job_id])
    if nlp_task_id == int(NlpTaskEnum.extract):
        doc_terms = DocTermModel().get_by_filter(limit=99999, doc_type_id=mark_job.doc_type_id)
        file_path = export_sync.generate_extract_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset,
            doc_terms=doc_terms, offset=offset)
    elif nlp_task_id == int(NlpTaskEnum.classify):
        file_path = export_sync.generate_classify_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset)
    elif nlp_task_id == int(NlpTaskEnum.wordseg):
        file_path = export_sync.generate_wordseg_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset)
    else:
        abort(400, message="该任务无法导出")
    return file_path
def import_labeled_wordseg_files(self, f, mark_job: MarkJob):
    # Step 1. Save temp file
    # Step 2. Pre-process labeled file, generate raw content and labeled results
    # Step 3. Save labeled information into database
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    corpus_doc_unique_name_list = []
    # each line of the labeled txt is one sample and is stored as one doc row
    try:
        labeled_corpus_list = []
        with open(doc_relative_path, encoding='utf-8-sig') as fr:
            lines = fr.readlines()
        for line in lines:
            line = line.replace("\n", "").strip()
            if len(line) < 2:
                continue
            ws_raw_content = Common().restore_sentence(line)
            doc_unique_name, _ = upload_fileset.save_file(f.filename, ws_raw_content)
            corpus_doc_unique_name_list.append(doc_unique_name)
            # collapse doubled spaces so split(" ") yields clean word/tag tokens
            labeled_corpus_list.append(
                [lc.rsplit("/", maxsplit=1) for lc in line.replace("  ", " ").split(" ")])
    except Exception as e:
        logger.exception(e)
        raise ValueError("分词标注数据格式有误")
    # bulk insert docs
    doc_list = [dict(doc_raw_name=f.filename, doc_unique_name=d)
                for d in corpus_doc_unique_name_list]
    doc_entity_list = DocModel().bulk_create(doc_list)
    task_list = []
    for i in range(len(doc_entity_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job.mark_job_id,
            mark_task_result=labeled_corpus_list[i],
            mark_task_status=int(StatusEnum.approved)
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)
    return task_entity_list
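
# A minimal sketch of the labeled wordseg line format assumed above: tokens are
# space-separated "word/tag" pairs, and Common().restore_sentence is assumed to
# strip the tags back off to recover the raw sentence. Hypothetical helper for
# illustration only.
def restore_sentence_sketch(line):
    # "今天/t 天气/n 很好/a" -> "今天天气很好"
    return "".join(token.rsplit("/", maxsplit=1)[0] for token in line.split(" "))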
def get_doc_type_info_by_nlp_task_by_user(nlp_task_id, current_user: CurrentUser):
    """
    Get the doc_type info shown on the management-hall home page.
    """
    result = []
    # get doc_type list by user
    _, doc_type_list = DocTypeModel().get_by_nlp_task_id_by_user(
        nlp_task_id=nlp_task_id, current_user=current_user)
    for doc_type, terms in doc_type_list:
        doc_type.doc_terms = [int(t) for t in terms.split(",")] if terms is not None else []
    doc_type_list = [d[0] for d in doc_type_list]
    doc_type_list = [{"doc_type": DocTypeSchema().dump(doc_type)} for doc_type in doc_type_list]
    # get all job count and approved job count
    all_status, all_marked_status = MarkTaskModel().count_status_by_user(
        nlp_task_id=nlp_task_id, current_user=current_user)
    # calculate the marked mark_job count and total mark_job count for each doc_type
    all_status_dict = Common().tuple_list2dict(all_status)
    all_marked_status_dict = Common().tuple_list2dict(all_marked_status)
    for doc_type in doc_type_list:
        doc_type_id = doc_type["doc_type"]["doc_type_id"]
        mark_job_count = len(all_status_dict.get(doc_type_id, {}))
        marked_mark_job_count = 0
        # a job counts as fully marked when its total task count equals its approved count
        for _mark_job_id, _count_sum in all_status_dict.get(doc_type_id, {}).items():
            if _count_sum == all_marked_status_dict.get(doc_type_id, {}).get(_mark_job_id, 0):
                marked_mark_job_count += 1
        doc_type.update(progress_state={
            "job_num": mark_job_count,
            "labeled_job_number": marked_mark_job_count,
            "progress_rate": round(marked_mark_job_count / mark_job_count, 2)
            if mark_job_count > 0 else 0,
        })
        # attach the latest evaluation result if one exists
        latest_evaluate = EvaluateTaskModel().get_latest_evaluate_by_doc_type_id(
            nlp_task_id=nlp_task_id, doc_type_id=doc_type_id)
        if latest_evaluate:
            doc_type.update(evaluate=EvaluateTaskSchema().dump(latest_evaluate))
        result.append(doc_type)
    return result
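
# Worked example of the progress computation above, on assumed sample data in
# the nested-dict shape doc_type_id -> mark_job_id -> task count. Job 11 has
# all 10 tasks approved, job 12 has 5 of 8, so one of two jobs is fully
# labeled and progress_rate comes out to 0.5.
all_status_dict = {7: {11: 10, 12: 8}}
all_marked_status_dict = {7: {11: 10, 12: 5}}
marked = sum(
    1 for job_id, total in all_status_dict[7].items()
    if total == all_marked_status_dict[7].get(job_id, 0))
progress_rate = round(marked / len(all_status_dict[7]), 2)  # -> 0.5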
def get_mark_task_with_doc_and_doc_type(self, nlp_task_id, current_user: CurrentUser, args):
    q = session.query(MarkTask, DocType, Doc) \
        .join(MarkJob, MarkJob.mark_job_id == MarkTask.mark_job_id) \
        .join(DocType, DocType.doc_type_id == MarkJob.doc_type_id) \
        .join(Doc, Doc.doc_id == MarkTask.doc_id) \
        .filter(
            DocType.nlp_task_id == nlp_task_id,
            ~DocType.is_deleted,
            ~MarkTask.is_deleted,
            ~Doc.is_deleted
        )
    # TODO: permission filtering
    if current_user.user_role in [RoleEnum.manager.value, RoleEnum.guest.value]:
        q = q.filter(DocType.group_id.in_(current_user.user_groups))
    elif current_user.user_role in [RoleEnum.reviewer.value]:
        q = q.filter(func.json_contains(MarkJob.reviewer_ids, str(current_user.user_id)))
    elif current_user.user_role in [RoleEnum.annotator.value]:
        q = q.filter(func.json_contains(MarkJob.annotator_ids, str(current_user.user_id)))
    if args.get('job_id'):
        q = q.filter(MarkTask.mark_job_id == args['job_id'])
    if args.get('doc_type_id'):
        q = q.filter(MarkJob.doc_type_id == args['doc_type_id'])
    if args['task_state']:
        q = q.filter(MarkTask.mark_task_status == status_str2int_mapper().get(args['task_state']))
    if args['query']:
        q = q.filter(Doc.doc_raw_name.like(f'%{args["query"]}%'))
    q = q.group_by(MarkTask)
    count = q.count()
    processing_count = q.filter(MarkTask.mark_task_status == int(StatusEnum.processing)).count()
    if args['order_by'] and isinstance(args['order_by'], str):
        if args['order_by'][1:] == 'task_id':
            args['order_by'] = args['order_by'][0] + 'mark_task_id'
        q = Common().order_by_model_fields(q, MarkTask, [args['order_by']])
    items = []
    results = q.offset(args['offset']).limit(args['limit']).all()
    mark_task_ids = [mark_task.mark_task_id for mark_task, _, _ in results]
    user_task_map = self._get_user_task_map(
        mark_task_ids, select_keys=(UserTask))  # .annotator_id, UserTask.mark_task_id))
    for mark_task, doc_type, doc in results:
        # build a fresh placeholder per task; the original reused one shared
        # instance, which leaked the last task's result into every placeholder
        placeholder = UserTask(
            annotator_id=0, is_deleted=False,
            user_task_status=StatusEnum.labeled.value)
        placeholder.user_task_result = mark_task.mark_task_result
        user_task_list = user_task_map.get(str(mark_task.mark_task_id), [placeholder])
        mark_task.user_task_list = user_task_list
        mark_task.doc = doc
        mark_task.doc_type = doc_type
        items.append(mark_task)
    return count, count - processing_count, items