# Imports for the helpers below. Standard-library and third-party imports are
# spelled out here; app-internal names (MarkTaskModel, MarkJobModel, DocModel,
# DocTypeModel, UserTaskModel, the schemas, session, upload_fileset,
# export_sync, Common, the enums, logger, r, ...) come from the application's
# own packages, whose exact import paths are not shown in this file.
import os
import shutil
from datetime import datetime
from typing import List

import pandas as pd
from pandas.errors import EmptyDataError
from flask_restful import abort  # assumed: abort(code, message=...) matches flask_restful's signature
from werkzeug.datastructures import FileStorage


def delete_task(task_id):
    """Delete a mark task, then recheck and refresh its parent job's status."""
    mark_job_id = MarkTaskModel().get_mark_job_id_by_id(task_id)
    MarkTaskModel().delete(task_id)
    session.commit()
    mark_job_status = MarkJobModel().check_mark_job_status(mark_job_id)
    mark_job = MarkJobModel().update(mark_job_id, mark_job_status=mark_job_status)
    session.commit()
    return mark_job

def get_preview_and_next_task_id(current_user: CurrentUser, task_id, args):
    # NOTE: "preview" in the model-layer method names denotes the previous task in the sequence.
    nlp_task_id = nlp_task_mapper.get(args['job_type'])
    if current_user.user_role in [RoleEnum.annotator.value]:
        preview_task_id, next_task_id = UserTaskModel().get_preview_and_next_user_task_id(
            current_user, nlp_task_id, task_id, args)
    else:
        preview_task_id, next_task_id = MarkTaskModel().get_preview_and_next_mark_task_id(
            current_user, nlp_task_id, task_id, args)
    return preview_task_id, next_task_id

def upload_batch_files(self, f: FileStorage, mark_job: MarkJob, nlp_task) -> List[MarkTask]:
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    csv_doc = DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
    content_list = upload_fileset.read_csv(doc_relative_path)

    # bulk create docs, one per row of the uploaded CSV
    doc_name_list = []
    for txt_content in content_list:
        doc_unique_name, _ = upload_fileset.save_file('format.txt', txt_content)
        doc_name_list.append(doc_unique_name)
    doc_list = [dict(doc_raw_name=csv_doc.doc_raw_name, doc_unique_name=d) for d in doc_name_list]
    doc_list = DocModel().bulk_create(doc_list)

    # bulk create mark tasks
    task_list = [dict(doc_id=doc.doc_id, mark_job_id=mark_job.mark_job_id) for doc in doc_list]
    task_list = MarkTaskModel().bulk_create(task_list)

    # push a message to redis for each new task
    business = self.get_business_by_nlp_task(nlp_task)
    for doc, task in zip(doc_list, task_list):
        self.push_mark_task_message(mark_job=mark_job, mark_task=task, doc=doc, business=business)
    return task_list

def get_mark_job_data_by_ids(self, mark_job_ids, args, doc_type_key="doc_type", prefix='NER'):
    items = []
    for mark_job_id in mark_job_ids:
        doc_type = DocTypeModel().get_by_mark_job_id(mark_job_id)
        result = {
            "prefix": prefix,
            # TODO: confirm with the MQ side whether this parameter shape is compatible
            doc_type_key: DocTypeSchema().dump(doc_type),
            "docs": [],
            "tasks": [],
            "mark_job_id": mark_job_id,
        }
        data = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids([mark_job_id])
        for task, doc in data:
            if args.get('doc_term_ids'):
                # extraction: keep tasks whose results reference the requested doc terms
                if isinstance(task.mark_task_result, list) \
                        and Common.check_doc_term_include(task.mark_task_result, 'doc_term_id',
                                                          args['doc_term_ids']):
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            elif args.get('doc_relation_ids'):
                # entity relations: keep tasks whose results reference the requested relations
                if isinstance(task.mark_task_result, list) and Common.check_doc_relation_include(
                        task.mark_task_result, 'relation_id', args['doc_relation_ids']):
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            else:
                # no filter: keep every task and doc
                result['docs'].append(DocSchema().dump(doc))
                result['tasks'].append(MarkTaskSchema().dump(task))
        items.append(result)
    return items

def get_mark_job_list_by_nlp_task(self, args, nlp_task: NlpTaskEnum):
    user_role = self.get_current_role()
    nlp_task_id = int(nlp_task)
    count, result = MarkJobModel().get_by_nlp_task_id(
        nlp_task_id=nlp_task_id, doc_type_id=args['doc_type_id'], user_role=user_role,
        search=args['query'], limit=args['limit'], offset=args['offset'])
    mark_job_ids = [mark_job.mark_job_id for mark_job, _ in result]
    status_count = MarkTaskModel().count_mark_task_status(mark_job_ids=mark_job_ids)

    # aggregate per-job task counts: total, labeled (labeled..approved), audited (approved)
    cache = {}
    for mark_job_id, task_status, task_status_count in status_count:
        stats = cache.setdefault(mark_job_id, {'all': 0, 'labeled': 0, 'audited': 0})
        stats['all'] += task_status_count
        if StatusEnum.labeled <= task_status <= StatusEnum.approved:
            stats['labeled'] += task_status_count
        if task_status == StatusEnum.approved:
            stats['audited'] += task_status_count

    items = []
    for mark_job, doc_type in result:
        mark_job.stats = cache.get(mark_job.mark_job_id, {'all': 0, 'labeled': 0, 'audited': 0})
        mark_job.doc_type = doc_type
        items.append(mark_job)
    result = MarkJobSchema(many=True).dump(items)
    return count, result

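# A sketch of the aggregation above with made-up numbers (assumes StatusEnum
# orders labeled < approved, as the range check relies on). Given
# count_mark_task_status rows of (mark_job_id, task_status, count):
#     [(1, StatusEnum.labeled, 3), (1, StatusEnum.approved, 2)]
# the cache entry for job 1 becomes:
#     {'all': 5, 'labeled': 5, 'audited': 2}
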
def upload_single_file(self, f: FileStorage, mark_job: MarkJob, nlp_task) -> List[MarkTask]:
    doc_unique_name, _ = upload_fileset.save_file(f.filename, f.stream.read())
    doc = DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
    mark_task = MarkTaskModel().create(doc_id=doc.doc_id, mark_job_id=mark_job.mark_job_id)
    business = self.get_business_by_nlp_task(nlp_task)
    self.push_mark_task_message(mark_job=mark_job, mark_task=mark_task, doc=doc, business=business)
    return [mark_task]

def export_multi_mark_file(nlp_task_id, mark_job_id_list):
    mark_job_list = MarkJobModel().get_by_mark_job_id_list(mark_job_id_list=mark_job_id_list)
    # name the export directory after the selected job ids plus a timestamp
    export_dir_path = os.path.join(
        'upload/export',
        'classify_mark_job_{}_{}'.format(','.join(str(job_id) for job_id in mark_job_id_list),
                                         datetime.now().strftime("%Y%m%d%H%M%S")))
    os.mkdir(export_dir_path)
    # get all (count, status, mark_job_id) tuples and convert them to a nested dict
    all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=mark_job_id_list)
    all_status_dict = Common().tuple_list2dict(all_count)
    for mark_job in mark_job_list:
        # skip jobs that did not finish successfully
        if mark_job.mark_job_status not in (StatusEnum.success, StatusEnum.approved):
            continue
        # skip the job unless every one of its tasks has been approved
        if not (len(all_status_dict[mark_job.mark_job_id]) == 1
                and int(StatusEnum.approved) in all_status_dict[mark_job.mark_job_id]):
            continue
        export_file_path = os.path.join(
            'upload/export',
            '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job.mark_job_id))
        # reuse the previous export if nothing has changed since it was generated
        last_exported_file = export_sync.get_last_export_file(job=mark_job,
                                                              export_file_path=export_file_path)
        if last_exported_file:
            shutil.copy(last_exported_file,
                        os.path.join(export_dir_path, 'mark_job_{}.csv'.format(mark_job.mark_job_id)))
            continue
        # otherwise regenerate the export file
        export_fileset = FileSet(folder=export_file_path)
        mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(
            mark_job_ids=[mark_job.mark_job_id])
        file_path = export_sync.generate_classify_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset)
        shutil.copy(file_path,
                    os.path.join(export_dir_path, 'mark_job_{}.csv'.format(mark_job.mark_job_id)))
    if not os.listdir(export_dir_path):
        raise ValueError("None of the selected mark jobs contain fully approved tasks; please select again")
    # pack the directory into a zip archive
    shutil.make_archive(export_dir_path, 'zip', export_dir_path)
    return export_dir_path + ".zip"

def import_labeled_classify_files(f, mark_job):
    doc_type_id = mark_job.doc_type_id
    mark_job_id = mark_job.mark_job_id
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    csv_doc = DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
    try:
        df = pd.read_csv(doc_relative_path, skiprows=0, na_values='')
    except EmptyDataError:
        raise EmptyDataError('Uploaded file is empty, please check the file: {}'.format(f.filename))
    except Exception:
        raise EmptyDataError('Failed to process uploaded file, please check the file: {}'.format(f.filename))
    if 'text' not in df.columns or 'label' not in df.columns:
        raise KeyError("Uploaded CSV must contain 'text' and 'label' columns")

    doc_terms, _ = DocTermModel.get_doc_term_by_doctype(doc_type_id, offset=0, limit=9999)
    doc_term_name2id_map = {m.doc_term_name: m.doc_term_id for m in doc_terms}

    content_list = []
    task_results = []
    for row_num, row in df.iterrows():
        content = row.get('text')
        label = row.get('label')
        try:
            label_id = doc_term_name2id_map[label]
        except KeyError as ke:
            # row_num is 0-based and the CSV has a header row, hence the +2
            raise ValueError(f"Label on line {row_num + 2} does not exist in this project: {ke.args[0]}, please check")
        task_result = [{'prob': 1, 'marked': 1, 'label_id': label_id, 'label_name': label}]
        if content and label:
            content_list.append(content)
            task_results.append(task_result)

    # bulk insert docs, one per labeled row
    unique_name_list = []
    for txt_content in content_list:
        doc_unique_name, _ = upload_fileset.save_file('format.txt', txt_content)
        unique_name_list.append(doc_unique_name)
    doc_list = [
        dict(doc_raw_name=csv_doc.doc_raw_name, doc_unique_name=unique_name)
        for unique_name in unique_name_list
    ]
    doc_entity_list = DocModel().bulk_create(doc_list)

    # bulk insert tasks; imported labels go straight to approved
    task_list = []
    for i in range(len(doc_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job_id,
            mark_task_result=task_results[i] if task_results else {},
            mark_task_status=int(StatusEnum.approved),
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)
    return task_entity_list

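# Expected upload format for import_labeled_classify_files above: a UTF-8 CSV
# with a header row containing at least 'text' and 'label' columns. The rows
# below are illustrative only:
#     text,label
#     The service at this restaurant was excellent,positive
#     The delivery arrived two weeks late,negative
# Every label value must already exist as a doc term of the job's doc type,
# otherwise the import raises a ValueError naming the offending line.
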
def update_mark_task_status(current_user: CurrentUser, task_id, args):
    # map request keys onto model columns
    if args.get('task_result'):
        args['mark_task_result'] = args.pop('task_result')
    if args.get('task_state'):
        args['mark_task_status'] = status_str2int_mapper()[args.pop('task_state')]
    item = MarkTaskModel().update(task_id, **args)
    session.commit()
    return MarkTaskSchema().dump(item)

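# Illustration of the key mapping in update_mark_task_status above, with a
# made-up payload:
#     {'task_result': [...], 'task_state': 'labeled'}
# is translated into the model-level update
#     {'mark_task_result': [...], 'mark_task_status': status_str2int_mapper()['labeled']}
# update_user_task_status below performs the same mapping onto user_task_* columns.
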
def update_user_task_status(current_user: CurrentUser, task_id, args):
    # map request keys onto model columns
    if args.get('task_result'):
        args['user_task_result'] = args.pop('task_result')
    if args.get('task_state'):
        args['user_task_status'] = status_str2int_mapper()[args.pop('task_state')]
    item = UserTaskModel().update_by_annotator_id(current_user, task_id, **args)
    # a user task update may complete the parent mark task; recheck and update it
    MarkTaskModel().check_user_task_and_update_mark_task(task_id)
    session.commit()
    return UserTaskSchema().dump(item)

def export_mark_file(nlp_task_id, mark_job_id, offset=50):
    mark_job = MarkJobModel().get_by_id(mark_job_id)
    if mark_job.mark_job_status not in (StatusEnum.approved, StatusEnum.success):
        abort(400, message="Job has failed or unfinished tasks and cannot be exported")

    all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=[mark_job_id])
    # convert the (count, status, mark_job_id) tuples to a nested dict
    all_status_dict = Common().tuple_list2dict(all_count)
    if not (len(all_status_dict[mark_job_id]) == 1
            and int(StatusEnum.approved) in all_status_dict[mark_job_id]):
        abort(400, message="Job has unlabeled or unreviewed tasks and cannot be exported")

    export_file_path = os.path.join(
        'upload/export', '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job_id))
    # reuse the previous export if nothing has changed since it was generated
    last_exported_file = export_sync.get_last_export_file(job=mark_job, export_file_path=export_file_path)
    if last_exported_file:
        return last_exported_file

    # otherwise regenerate the export file
    export_fileset = FileSet(folder=export_file_path)
    mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(mark_job_ids=[mark_job_id])
    if nlp_task_id == int(NlpTaskEnum.extract):
        doc_terms = DocTermModel().get_by_filter(limit=99999, doc_type_id=mark_job.doc_type_id)
        file_path = export_sync.generate_extract_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset,
            doc_terms=doc_terms, offset=offset)
    elif nlp_task_id == int(NlpTaskEnum.classify):
        file_path = export_sync.generate_classify_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset)
    elif nlp_task_id == int(NlpTaskEnum.wordseg):
        file_path = export_sync.generate_wordseg_file(
            task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset)
    else:
        abort(400, message="This task type cannot be exported")
    return file_path

def update_mark_task_and_user_task_by_mark_task_id(mark_task_id, args):
    # update the mark task itself
    mark_task = MarkTaskModel().update(mark_task_id, **args)
    # propagate the new status and result to every user task under it
    user_task_list = UserTaskModel().get_by_filter(limit=99999, mark_task_id=mark_task_id)
    user_update_params = {
        "user_task_status": mark_task.mark_task_status,
        "user_task_result": mark_task.mark_task_result,
    }
    user_task_list = UserTaskModel().bulk_update(
        [user_task.user_task_id for user_task in user_task_list], **user_update_params)
    session.commit()
    return mark_task, user_task_list

def get_mark_task_or_user_task(current_user: CurrentUser, task_id: int):
    """
    :param current_user:
    :param task_id: a user task id for annotators, a mark task id for reviewers
    :return: the dumped task together with its doc and user task list
    """
    if current_user.user_role in [RoleEnum.annotator.value]:
        item = UserTaskModel().get_user_task_with_doc_and_user_task_list_by_id(task_id)
        schema = UserTaskSchema
    else:
        item = MarkTaskModel().get_mark_task_with_doc_and_user_task_list_by_id(task_id)
        schema = MarkTaskSchema
    return schema().dump(item)

def update_mark_task_or_user_task_status(current_user: CurrentUser, task_id, args):
    if current_user.user_role in [RoleEnum.annotator.value]:
        # annotators update their own user task
        if args.get('task_result'):
            args['user_task_result'] = args.pop('task_result')
        if args.get('task_state'):
            args['user_task_status'] = int(getattr(StatusEnum, args.pop('task_state')))
        item = UserTaskModel().update_by_annotator_id(current_user, task_id, **args)
        # a user task update may complete the parent mark task; recheck and update it
        MarkTaskModel().check_user_task_and_update_mark_task(task_id)
        schema = UserTaskSchema
    else:
        # reviewers update the mark task directly
        if args.get('task_result'):
            args['mark_task_result'] = args.pop('task_result')
        if args.get('task_state'):
            args['mark_task_status'] = int(getattr(StatusEnum, args.pop('task_state')))
        item = MarkTaskModel().update(task_id, **args)
        schema = MarkTaskSchema
    session.commit()
    return schema().dump(item)

def update_mark_job_status_by_mark_task(mark_task: MarkTask):
    # Recompute the parent job's status from its tasks: if any task failed, the
    # job fails; otherwise the job takes the minimum (least advanced) task status.
    status_list = MarkTaskModel().get_distinct_status_by_mark_job(mark_task.mark_job_id)
    if int(StatusEnum.fail) in status_list:
        new_job_status = int(StatusEnum.fail)
    else:
        new_job_status = min(status_list)
    mark_job = MarkJobModel().get_by_id(mark_task.mark_job_id)
    if mark_job.mark_job_status != new_job_status:
        MarkJobModel().update(
            mark_task.mark_job_id,
            mark_job_status=new_job_status,
            updated_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        )
        session.commit()

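# Worked example of the aggregation rule above (assumes StatusEnum orders
# labeled < approved; the status sets are illustrative):
#     task statuses {labeled, approved, fail} -> job status fail
#     task statuses {labeled, approved}       -> job status min(...) = labeled
#     task statuses {approved}                -> job status approved
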
def get_user_task_or_mark_task_result_by_role(current_user: CurrentUser, args):
    nlp_task_id = nlp_task_mapper.get(args['job_type'])
    if current_user.user_role in [RoleEnum.annotator.value]:
        count, processed, items = UserTaskModel().get_user_task_with_doc_and_doc_type(
            nlp_task_id=nlp_task_id, current_user=current_user, args=args)
        schema = UserTaskSchema
    else:
        count, processed, items = MarkTaskModel().get_mark_task_with_doc_and_doc_type(
            nlp_task_id=nlp_task_id, current_user=current_user, args=args)
        schema = MarkTaskSchema
    if args['job_type'] == 'classify_mark':
        # TODO: convert the returned data format
        result = schema(many=True).dump(items)
    else:
        result = schema(many=True, exclude=('task_result',)).dump(items)
    return count, processed, result

def re_pre_label_mark_job(self, mark_job_ids, nlp_task):
    pipe = r.pipeline()
    # resolve the doc type ids of the selected mark jobs
    mark_jobs = MarkJobModel().get_by_ids(mark_job_ids)
    doc_type_ids = set(item.doc_type_id for item in mark_jobs)
    # find which of those doc types have a model online
    online_doc_type_ids = DocTypeModel().get_online_ids_by_ids(doc_type_ids)
    # abort if any doc type to be re-labeled has no online model
    if doc_type_ids - online_doc_type_ids:
        doc_types = DocTypeModel().get_by_ids(doc_type_ids - online_doc_type_ids)
        abort(400, message='Project(s) {} have no online model'.format(
            ', '.join(item.doc_type_name for item in doc_types)))
    # collect every unlabeled task generated from the selected jobs
    unlabel_tasks = MarkTaskModel().get_unlabel_tasks_by_mark_job_ids(mark_job_ids)
    # re-push a pre-labeling message for each task
    for task in unlabel_tasks:
        self.push_mark_task_message(task, task, task, business=f"{nlp_task.name}_label")
    pipe.execute()

def get_doc_type_info_by_nlp_task_by_user(nlp_task_id, current_user: CurrentUser):
    """Fetch the doc_type overview shown on the admin-hall home page."""
    result = []
    # get the doc_type list visible to this user
    _, doc_type_list = DocTypeModel().get_by_nlp_task_id_by_user(nlp_task_id=nlp_task_id,
                                                                 current_user=current_user)
    for doc_type, terms in doc_type_list:
        doc_type.doc_terms = [int(t) for t in terms.split(",")] if terms is not None else []
    doc_type_list = [d[0] for d in doc_type_list]
    doc_type_list = [{"doc_type": DocTypeSchema().dump(doc_type)} for doc_type in doc_type_list]
    # get all task counts and approved task counts
    all_status, all_marked_status = MarkTaskModel().count_status_by_user(nlp_task_id=nlp_task_id,
                                                                         current_user=current_user)
    # calculate the marked mark_job count and total mark_job count per doc_type
    all_status_dict = Common().tuple_list2dict(all_status)
    all_marked_status_dict = Common().tuple_list2dict(all_marked_status)
    for doc_type in doc_type_list:
        doc_type_id = doc_type["doc_type"]["doc_type_id"]
        mark_job_count = len(all_status_dict.get(doc_type_id, {}))
        marked_mark_job_count = 0
        for _mark_job_id, _count_sum in all_status_dict.get(doc_type_id, {}).items():
            # a job counts as marked when all of its tasks are marked
            if _count_sum == all_marked_status_dict.get(doc_type_id, {}).get(_mark_job_id, 0):
                marked_mark_job_count += 1
        doc_type.update(progress_state={
            "job_num": mark_job_count,
            "labeled_job_number": marked_mark_job_count,
            "progress_rate": round(marked_mark_job_count / mark_job_count, 2) if mark_job_count > 0 else 0,
        })
        # attach the latest evaluation result if one exists
        latest_evaluate = EvaluateTaskModel().get_latest_evaluate_by_doc_type_id(
            nlp_task_id=nlp_task_id, doc_type_id=doc_type_id)
        if latest_evaluate:
            doc_type.update(evaluate=EvaluateTaskSchema().dump(latest_evaluate))
        result.append(doc_type)
    return result

def import_labeled_extract_files(self, f, mark_job: MarkJob):
    doc_type_id = mark_job.doc_type_id
    mark_job_id = mark_job.mark_job_id
    alias_id_mapping = DocTermModel().get_doc_term_alias_mapping(doc_type_id)

    # the uploaded txt file contains one labeled sample per line
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)

    sample_docs = []
    task_results = []
    with open(doc_relative_path, encoding="utf-8-sig") as fr:
        samples = fr.readlines()
    for sample in samples:
        sample = sample.replace("\n", "").strip()
        if len(sample) < 2:
            continue
        # parse the tagged content into database format
        raw_content, task_result_list = self.parse_sample(sample, alias_id_mapping)
        doc_unique_name, _ = upload_fileset.save_file(f.filename, raw_content)
        sample_docs.append(doc_unique_name)
        task_results.append(task_result_list)

    # bulk insert docs
    doc_list = [dict(doc_raw_name=f.filename, doc_unique_name=d) for d in sample_docs]
    doc_entity_list = DocModel().bulk_create(doc_list)

    # bulk insert tasks; imported labels go straight to approved
    task_list = []
    for i in range(len(doc_entity_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job_id,
            mark_task_result=task_results[i] if task_results else {},
            mark_task_status=int(StatusEnum.approved) if task_results else int(StatusEnum.processing),
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)

    # push a message to redis for each new task
    for i in range(len(doc_list)):
        self.push_mark_task_message(mark_job, task_entity_list[i], doc_entity_list[i], business="dp")
    return task_entity_list

def import_labeled_wordseg_files(self, f, mark_job: MarkJob):
    # Step 1. Save the uploaded file
    # Step 2. Pre-process the labeled file into raw content plus labeled results
    # Step 3. Save the labeled information into the database
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    corpus_doc_unique_name_list = []  # each line of the labeled txt is one sample, stored as one doc
    try:
        labeled_corpus_list = []
        with open(doc_relative_path, encoding='utf-8-sig') as fr:
            lines = fr.readlines()
        for line in lines:
            line = line.replace("\n", "").strip()
            if len(line) < 2:
                continue
            ws_raw_content = Common().restore_sentence(line)
            doc_unique_name, _ = upload_fileset.save_file(f.filename, ws_raw_content)
            corpus_doc_unique_name_list.append(doc_unique_name)
            # collapse doubled spaces (assumed: the original replace target is
            # ambiguous), then split into "token/tag" pairs
            labeled_corpus_list.append(
                [lc.rsplit("/", maxsplit=1) for lc in line.replace("  ", " ").split(" ")])
    except Exception as e:
        logger.exception(e)
        raise ValueError("Word segmentation annotation data is malformed")

    # bulk insert docs
    doc_list = [dict(doc_raw_name=f.filename, doc_unique_name=d) for d in corpus_doc_unique_name_list]
    doc_entity_list = DocModel().bulk_create(doc_list)

    # bulk insert tasks; imported labels go straight to approved
    task_list = []
    for i in range(len(doc_entity_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job.mark_job_id,
            mark_task_result=labeled_corpus_list[i],
            mark_task_status=int(StatusEnum.approved),
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)
    return task_entity_list

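# Expected upload format for import_labeled_wordseg_files above: one sample per
# line, whitespace-separated "token/tag" pairs (the tag set shown is
# illustrative):
#     今天/t 天气/n 很/d 好/a
# Each line is restored to its raw sentence and stored as one doc, and the
# token/tag pairs become that task's mark_task_result.
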
def create_mark_task():
    from app.model import MarkTaskModel
    if MarkTaskModel().is_empty_table():
        # seed data: jobs 1, 8 and 9 spread over docs 1-5
        mark_tasks = [
            dict(app_id=1, created_by=1, mark_job_id=1, mark_task_id=1, doc_id=1,
                 mark_task_status=int(StatusEnum.labeled)),
            dict(app_id=1, created_by=1, mark_job_id=1, mark_task_id=2, doc_id=2,
                 mark_task_status=int(StatusEnum.labeled)),
            dict(app_id=1, created_by=1, mark_job_id=1, mark_task_id=3, doc_id=3,
                 mark_task_status=int(StatusEnum.labeled)),
            dict(app_id=1, created_by=1, mark_job_id=1, mark_task_id=4, doc_id=4,
                 mark_task_status=int(StatusEnum.labeled)),
            dict(app_id=1, created_by=1, mark_job_id=1, mark_task_id=5, doc_id=5,
                 mark_task_status=int(StatusEnum.labeled)),
            dict(app_id=1, created_by=1, mark_job_id=8, mark_task_id=6, doc_id=1,
                 mark_task_status=int(StatusEnum.approved)),
            dict(app_id=1, created_by=1, mark_job_id=8, mark_task_id=7, doc_id=2,
                 mark_task_status=int(StatusEnum.approved)),
            dict(app_id=1, created_by=1, mark_job_id=9, mark_task_id=8, doc_id=3,
                 mark_task_status=int(StatusEnum.approved)),
            dict(app_id=1, created_by=1, mark_job_id=9, mark_task_id=9, doc_id=4,
                 mark_task_status=int(StatusEnum.labeled)),
            dict(app_id=1, created_by=1, mark_job_id=9, mark_task_id=10, doc_id=5,
                 mark_task_status=int(StatusEnum.labeled)),
        ]
        MarkTaskModel().bulk_create(mark_tasks)
        session.commit()

def reject_mark_task(mark_task_id):
    # reset both the user tasks and the mark task back to unlabeled
    UserTaskModel().update_status_to_unlabel_by_mark_task_id(mark_task_id)
    MarkTaskModel().update_status_to_unlabel_by_mark_task_id(mark_task_id)
    session.commit()

def export_pdf(task_id):
    doc_unique_name, doc_raw_name, labels = MarkTaskModel().get_doc_and_lable(task_id)
    return doc_unique_name, doc_raw_name, labels