def create_evaluate_task_by_train_job_id(train_job_id, evaluate_task_name, evaluate_task_desc, mark_job_ids, doc_term_ids, doc_relation_ids, use_rule=0): """ 如果后面要加重新训练的逻辑,这部分要改,不能根据train_job_id去创建评估任务,而是根据train_task_id, 目前先保留,因为目前train_job和train_task是一一对应,不会有影响 """ # get correspondent train_job, doc_type, train_task, nlp_task by train_job_id train_job = TrainJobModel().get_by_id(train_job_id) doc_type = DocTypeModel().get_by_id(train_job.doc_type_id) doc_term_list = DocTermModel().get_by_filter(limit=99999, doc_type_id=doc_type.doc_type_id) doc_type.doc_term_list = doc_term_list nlp_task = NlpTaskEnum(doc_type.nlp_task_id) _, train_task_list = TrainTaskModel().get_by_filter(train_job_id=train_job_id) train_task = train_task_list[0] # create evaluate_task evaluate_task = EvaluateTaskModel().create(evaluate_task_name=evaluate_task_name, evaluate_task_desc=evaluate_task_desc, train_task_id=train_task.train_task_id, evaluate_task_status=int(StatusEnum.processing)) # bulk create evaluate m2m mark evaluate_m2m_mark_list = [{"evaluate_task_id": evaluate_task.evaluate_task_id, "mark_job_id": _id} for _id in mark_job_ids] EvaluateM2mMarkModel().bulk_create(evaluate_m2m_mark_list) # push to evaluate redis queue doc_term_ids = [str(t.doc_term_id) for t in RelationM2mTermModel().get_by_filter(limit=99999, doc_relation_ids=[int(rl) for rl in doc_relation_ids])] push_evaluate_task_to_redis(nlp_task, evaluate_task, train_task, doc_type, mark_job_ids, doc_term_ids, doc_relation_ids, use_rule) session.commit() return evaluate_task
def get_mark_job_data_by_ids(self, mark_job_ids, args, doc_type_key="doc_type", prefix='NER'):
    """
    Collect per-mark-job payloads (doc type, docs, tasks) for the given mark jobs,
    optionally filtered by doc_term_ids (extraction) or doc_relation_ids (relation).

    Returns a list with one dict per mark_job_id, each containing the dumped
    doc type (under `doc_type_key`), the matching docs/tasks, and the job id.
    """
    items = []
    for mark_job_id in mark_job_ids:
        doc_type = DocTypeModel().get_by_mark_job_id(mark_job_id)
        result = {
            "prefix": prefix,
            # TODO: confirm with the MQ side whether this parameter shape fits
            doc_type_key: DocTypeSchema().dump(doc_type),
            "docs": [],
            "tasks": [],
            "mark_job_id": mark_job_id,
        }
        data = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids([mark_job_id])
        # NOTE(review): when both doc_term_ids and doc_relation_ids are supplied
        # and a task matches both, the doc/task pair is appended twice — confirm
        # whether callers ever pass both filters together.
        for task, doc in data:
            # extraction filter: keep tasks whose result includes any requested term id
            if args.get('doc_term_ids'):
                if isinstance(task.mark_task_result, list) \
                        and Common.check_doc_term_include(task.mark_task_result, 'doc_term_id', args['doc_term_ids']):
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            # entity-relation filter: keep tasks whose result includes any requested relation id;
            # with no relation filter at all, every task/doc pair is included
            if args.get('doc_relation_ids'):
                if isinstance(task.mark_task_result, list) and Common.check_doc_relation_include(
                        task.mark_task_result, 'relation_id', args['doc_relation_ids']):
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            else:
                result['docs'].append(DocSchema().dump(doc))
                result['tasks'].append(MarkTaskSchema().dump(task))
        items.append(result)
    return items
def import_mark_job(self, files, args, nlp_task):
    """
    Create an already-approved mark job from pre-labeled files.

    :param files: iterable of uploaded label files to import
    :param args: dict with mark_job_name/type/desc and doc_type_id
    :param nlp_task: NlpTaskEnum member selecting the per-file importer
    :return: the created mark job dumped through MarkJobSchema
    :raises TypeError: if nlp_task is not classify/extract/wordseg
    """
    # Validates the doc type exists (expected to raise if not); result unused.
    DocTypeModel().get_by_id(args['doc_type_id'])
    job = MarkJobModel().create(
        mark_job_name=args['mark_job_name'],
        mark_job_type=args['mark_job_type'],
        mark_job_desc=args.get('mark_job_desc'),
        doc_type_id=args['doc_type_id'],
        mark_job_status=int(StatusEnum.approved),
        assign_mode='average',
    )
    # Dispatch table instead of an if/elif chain; the original also collected
    # the per-file task lists into an accumulator that was never used, so the
    # accumulator has been removed (the importers' side effects are kept).
    importers = {
        NlpTaskEnum.classify: self.import_labeled_classify_files,
        NlpTaskEnum.extract: self.import_labeled_extract_files,
        NlpTaskEnum.wordseg: self.import_labeled_wordseg_files,
    }
    for f in files:
        # Checked per file (as in the original) so an empty `files` never raises.
        importer = importers.get(nlp_task)
        if importer is None:
            raise TypeError('nlp_task illegal')
        importer(f, job)
    session.commit()
    return MarkJobSchema().dump(job)
def get_doc_type(current_user: CurrentUser, args):
    """
    Fetch a paginated list of doc types visible to the current user,
    optionally restricted to specific mark jobs.

    :return: (dumped doc type list, total count)
    """
    count, items = DocTypeModel().get_by_mark_job_ids(
        mark_job_ids=args.get('mark_job_ids', []),
        nlp_task_id=args["nlp_task_id"],
        current_user=current_user,
        offset=args["offset"],
        limit=args["limit"],
    )
    return DocTypeSchema(many=True).dump(items), count
def re_pre_label_mark_job(self, mark_job_ids, nlp_task): pipe = r.pipeline() # 通过标注任务获取 doctype id mark_jobs = MarkJobModel().get_by_ids(mark_job_ids) doc_type_ids = set(item.doc_type_id for item in mark_jobs) # 获取其中拥有上线模型的doctype ids online_doc_type_ids = DocTypeModel().get_online_ids_by_ids(doc_type_ids) # 如果重新预标注的doc type在上线模型中没有 则abort if doc_type_ids - online_doc_type_ids: doc_types = DocTypeModel().get_by_ids(doc_type_ids - online_doc_type_ids) abort(400, message='项目:{},没有上线模型'.format('、'.join(item.doc_type_name for item in doc_types))) # 获取所有标注任务所有文件生成的标注任务 unlabel_tasks = MarkTaskModel().get_unlabel_tasks_by_mark_job_ids(mark_job_ids) # 按标注任务发送重新预标注任务 for task in unlabel_tasks: self.push_mark_task_message(task, task, task, business=f"{nlp_task.name}_label") pipe.execute()
def create_relation(doc_type_id: int, doc_term_ids: typing.List, doc_relation_name: str):
    """
    Create a named relation between exactly two doc terms of a doc type.

    :raises ValueError: if the doc type is missing, or the two terms
        cannot both be found (deleted or nonexistent).
    :return: dict with the relation name and its new id
    """
    doc_type = DocTypeModel().get_by_id(doc_type_id)
    if not doc_type:
        raise ValueError(f"DocType {doc_type_id} 不存在")
    matched_terms = DocTermModel().get_by_filter(doc_term_ids=doc_term_ids)
    if len(matched_terms) != 2:
        raise ValueError(f"DocTerm 不存在或已被删除")
    relation = DocTermModel().create_relation(doc_relation_name, doc_term_ids, doc_type_id=doc_type_id)
    session.commit()
    return {
        "doc_relation_name": doc_relation_name,
        "doc_relation_id": relation.doc_relation_id,
    }
def create_doc_type(current_user: CurrentUser, args):
    """
    Create a doc type together with its doc terms.

    Falls back to the user's first group when no valid group_id is supplied;
    aborts with 403 if the user belongs to no group.
    """
    term_specs = args.pop('doc_term_list')
    if 'group_id' not in args or args['group_id'] < 1:
        if current_user.user_groups:
            args['group_id'] = current_user.user_groups[0]
        else:
            abort(403, message="当前角色禁止创建项目,请切换角色操作")
    doc_type = DocTypeModel().create(**args)
    # attach the new doc_type_id to every term spec before bulk insert
    for spec in term_specs:
        spec.update({'doc_type_id': doc_type.doc_type_id})
    doc_type.doc_term_list = DocTermModel().bulk_create(term_specs)
    session.commit()
    return DocTypeSchema().dumps(doc_type)
def update_doc_type(args, doc_type_id):
    """
    Update a doc type and synchronize its doc terms: terms present in the
    payload are bulk-updated; previously existing terms missing from the
    payload are deleted.
    """
    item = DocTypeModel().update(doc_type_id, **args)
    # snapshot of term ids that exist before this update
    existed_doc_term_ids = [dt.doc_term_id for dt in DocTermModel().get_by_filter(doc_type_id=doc_type_id)]
    updated_doc_term_ids = []
    if args.get("doc_term_list"):
        for i in args.get("doc_term_list"):
            i.update({"doc_type_id": doc_type_id})
            # new terms without an id contribute 0, which never matches an existing id
            updated_doc_term_ids.append(i.get("doc_term_id", 0))
        DocTermModel().bulk_update(args.get("doc_term_list"))
        session.commit()
    # Remove doc terms that were not part of the update payload
    # NOTE(review): commit placement reconstructed as once-after-loop; confirm
    # the original did not commit per deletion.
    for i in existed_doc_term_ids:
        if i not in updated_doc_term_ids:
            DocTermModel().delete(i)
    session.commit()
    return DocTypeSchema().dump(item)
def get_doc_type_info_by_nlp_task_by_user(nlp_task_id, current_user: CurrentUser):
    """
    Get the doc_type info for the management-hall home page: per doc type,
    its dumped schema, mark-job progress stats, and (if any) the latest
    evaluation result.
    """
    result = []
    # get doc_type list by user
    _, doc_type_list = DocTypeModel().get_by_nlp_task_id_by_user(nlp_task_id=nlp_task_id, current_user=current_user)
    # rows come back as (doc_type, comma-separated term ids); attach parsed term ids
    # NOTE(review): doc_terms is assigned onto the model object but never read
    # below — presumably consumed by DocTypeSchema().dump; verify.
    for doc_type, terms in doc_type_list:
        doc_type.doc_terms = [int(t) for t in terms.split(",")] if terms is not None else []
    doc_type_list = [d[0] for d in doc_type_list]
    doc_type_list = [{"doc_type": DocTypeSchema().dump(doc_type)} for doc_type in doc_type_list]
    # get all job count and approved job count
    all_status, all_marked_status = MarkTaskModel().count_status_by_user(nlp_task_id=nlp_task_id,
                                                                         current_user=current_user)
    # calculate marked mark_job count and all mark_job for each doc_type
    all_status_dict = Common().tuple_list2dict(all_status)
    all_marked_status_dict = Common().tuple_list2dict(all_marked_status)
    for doc_type in doc_type_list:
        doc_type_id = doc_type["doc_type"]["doc_type_id"]
        mark_job_count = len(all_status_dict.get(doc_type_id, {}))
        marked_mark_job_count = 0
        # a mark job counts as fully marked when its total task count equals
        # its marked task count
        for _mark_job_id, _count_sum in all_status_dict.get(doc_type_id, {}).items():
            if _count_sum == all_marked_status_dict.get(doc_type_id, {}).get(_mark_job_id, 0):
                marked_mark_job_count += 1
        doc_type.update(progress_state={"job_num": mark_job_count,
                                        "labeled_job_number": marked_mark_job_count,
                                        "progress_rate": round(marked_mark_job_count / mark_job_count, 2)
                                        if mark_job_count > 0 else 0})
        # get latest evaluation result if exists
        latest_evaluate = EvaluateTaskModel().get_latest_evaluate_by_doc_type_id(nlp_task_id=nlp_task_id,
                                                                                doc_type_id=doc_type_id)
        if latest_evaluate:
            doc_type.update(evaluate=EvaluateTaskSchema().dump(latest_evaluate))
        result.append(doc_type)
    return result
def create_doc_type():
    """Seed the doc_type table with fixture rows, but only when it is empty."""
    from app.model import DocTypeModel
    if len(DocTypeModel().get_all()) == 0:
        # (name, nlp task) pairs; doc_type_id is the 1-based position
        specs = [
            ("测试抽取项目1", NlpTaskEnum.extract),
            ("测试抽取项目2", NlpTaskEnum.extract),
            ("测试抽取项目3", NlpTaskEnum.extract),
            ("测试抽取项目4", NlpTaskEnum.extract),
            ("测试分类项目1", NlpTaskEnum.classify),
            ("测试分类项目2", NlpTaskEnum.classify),
            ("测试分类项目3", NlpTaskEnum.classify),
            ("测试关系项目1", NlpTaskEnum.relation),
            ("测试关系项目2", NlpTaskEnum.relation),
            ("测试分词项目1", NlpTaskEnum.wordseg),
            ("测试分词项目2", NlpTaskEnum.wordseg),
        ]
        doc_types = [
            dict(app_id=1, created_by=1, doc_type_id=idx, doc_type_name=name, nlp_task_id=int(task))
            for idx, (name, task) in enumerate(specs, start=1)
        ]
        DocTypeModel().bulk_create(doc_types)
        session.commit()
def get(self):
    """Return every doc type as a JSON response."""
    all_doc_types = DocTypeModel().get_all()
    return jsonify(all_doc_types)
def set_favoriate_doc_type(doc_type_id, is_favorite: bool):
    """Set or clear the favorite flag on a doc type and return its dump."""
    updated = DocTypeModel().update(doc_type_id=doc_type_id, is_favorite=is_favorite)
    return DocTypeSchema().dump(updated)
def get_doc_type_items(doc_type_id: int):
    """Fetch a doc type with its doc terms attached and return its dump."""
    doc_type = DocTypeModel().get_by_id(doc_type_id)
    doc_type.doc_term_list = DocTermModel().get_by_filter(doc_type_id=doc_type_id)
    return DocTypeSchema().dump(doc_type)
def get_by_id_and_user_group(doc_type_id, group_id):
    """Fetch a doc type by id, scoped to the given user group."""
    return DocTypeModel().get_by_id_by_user_group(_id=doc_type_id, group_id=group_id)
def get_by_id(doc_type_id):
    """Fetch a doc type by its primary id."""
    return DocTypeModel().get_by_id(doc_type_id)
def create_relation_doc_type(args):
    """Create a relation-type doc type from the given fields and return its dump."""
    new_doc_type = DocTypeModel().create(**args)
    session.commit()
    return DocTypeSchema().dump(new_doc_type)
def delete_doc_type(doc_type_id):
    """Delete the doc type with the given id and commit the session."""
    DocTypeModel().delete(doc_type_id)
    session.commit()
def check_doc_type_name_exists(doc_type_name):
    """Return whether a doc type with the given name already exists."""
    name_taken = DocTypeModel().if_exists_by_name(doc_type_name)
    return name_taken
def update_relation_doc_type(args, doc_type_id):
    """Update a relation-type doc type and return its dump."""
    updated = DocTypeModel().update(doc_type_id, **args)
    session.commit()
    return DocTypeSchema().dump(updated)