def create_new_rule(args):
    is_existed = DocTermModel().check_exists_rule(args.get("doc_term_id"))
    if is_existed:
        raise ValueError("A rule already exists for this label; do not create a duplicate")
    classify_rule = DocTermModel().create_classify_rule(**args)
    session.commit()
    result = ClassifyDocRuleSchema().dump(classify_rule)
    return result
def create_classify_doc_term(args, doc_type_id, doc_rule_list):
    doc_term = DocTermModel().create(**args, doc_type_id=doc_type_id)
    doc_term.flush()
    for doc_rule_dict in doc_rule_list:
        ClassifyRuleModel().create(doc_term_id=doc_term.doc_term_id, **doc_rule_dict)
    session.commit()
    result = DocTermSchema().dump(doc_term)
    return result
def create_evaluate_task_by_train_job_id(train_job_id, evaluate_task_name, evaluate_task_desc,
                                         mark_job_ids, doc_term_ids, doc_relation_ids, use_rule=0):
    """
    If retraining logic is added later, this part must change: the evaluate task should be
    created from a train_task_id rather than a train_job_id. Kept as-is for now because
    train_job and train_task are currently one-to-one, so there is no impact.
    """
    # get corresponding train_job, doc_type, train_task, nlp_task by train_job_id
    train_job = TrainJobModel().get_by_id(train_job_id)
    doc_type = DocTypeModel().get_by_id(train_job.doc_type_id)
    doc_term_list = DocTermModel().get_by_filter(limit=99999, doc_type_id=doc_type.doc_type_id)
    doc_type.doc_term_list = doc_term_list
    nlp_task = NlpTaskEnum(doc_type.nlp_task_id)
    _, train_task_list = TrainTaskModel().get_by_filter(train_job_id=train_job_id)
    train_task = train_task_list[0]
    # create evaluate_task
    evaluate_task = EvaluateTaskModel().create(evaluate_task_name=evaluate_task_name,
                                               evaluate_task_desc=evaluate_task_desc,
                                               train_task_id=train_task.train_task_id,
                                               evaluate_task_status=int(StatusEnum.processing))
    # bulk create evaluate m2m mark
    evaluate_m2m_mark_list = [{"evaluate_task_id": evaluate_task.evaluate_task_id, "mark_job_id": _id}
                              for _id in mark_job_ids]
    EvaluateM2mMarkModel().bulk_create(evaluate_m2m_mark_list)
    # push to evaluate redis queue
    doc_term_ids = [str(t.doc_term_id) for t in RelationM2mTermModel().get_by_filter(
        limit=99999, doc_relation_ids=[int(rl) for rl in doc_relation_ids])]
    push_evaluate_task_to_redis(nlp_task, evaluate_task, train_task, doc_type, mark_job_ids,
                                doc_term_ids, doc_relation_ids, use_rule)
    session.commit()
    return evaluate_task
def get_doc_term_by_doctype(doc_type_id, offset=0, limit=10, doc_term_ids=None):
    items, count = DocTermModel().get_doc_term_by_doctype(doc_type_id, offset, limit, doc_term_ids)
    result = DocTermSchema(many=True).dump(items)
    return result, count
def get_doc_term_list(args):
    exclude_terms_ids = args.get('exclude_terms_ids', [])
    offset = args.get('offset', 0)
    limit = args.get('limit', 10)
    items, count = DocTermModel().get_by_exclude_terms(
        exclude_terms_ids=exclude_terms_ids, offset=offset, limit=limit)
    result = DocTermSchema(many=True).dump(items)
    return result, count
def import_labeled_classify_files(f, mark_job):
    doc_type_id = mark_job.doc_type_id
    mark_job_id = mark_job.mark_job_id
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    csv_doc = DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
    try:
        df = pd.read_csv(doc_relative_path, skiprows=0, na_values='')
    except EmptyDataError:
        raise EmptyDataError('Uploaded data is empty, please check the uploaded file: {}'.format(f.filename))
    except Exception:
        raise EmptyDataError('Failed to process uploaded data, please check the uploaded file: {}'.format(f.filename))
    if 'text' not in df.columns or 'label' not in df.columns:
        raise KeyError("the uploaded file must contain 'text' and 'label' columns")
    doc_terms, _ = DocTermModel().get_doc_term_by_doctype(doc_type_id, offset=0, limit=9999)
    doc_term_name2id_map = {m.doc_term_name: m.doc_term_id for m in doc_terms}
    content_list = []
    task_results = []
    for row_num, row in df.iterrows():
        content = row.get('text')
        label = row.get('label')
        try:
            label_id = doc_term_name2id_map[label]
        except KeyError as ke:
            raise ValueError(f"Label {ke.args[0]} on line {row_num + 2} of the file does not exist in the current project, please check")
        task_result = [{'prob': 1, 'marked': 1, 'label_id': label_id, 'label_name': label}]
        if content and label:
            content_list.append(content)
            task_results.append(task_result)

    # bulk insert doc
    unique_name_list = []
    for txt_content in content_list:
        doc_unique_name, _ = upload_fileset.save_file('format.txt', txt_content)
        unique_name_list.append(doc_unique_name)
    doc_list = [
        dict(
            doc_raw_name=csv_doc.doc_raw_name,
            doc_unique_name=unique_name,
        )
        for unique_name in unique_name_list
    ]
    doc_entity_list = DocModel().bulk_create(doc_list)

    # bulk insert task
    task_list = []
    for i in range(len(doc_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job_id,
            mark_task_result=task_results[i] if task_results else {},
            mark_task_status=int(StatusEnum.approved)
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)
    return task_entity_list
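# Illustrative sketch only (not part of the service code): the CSV that
# import_labeled_classify_files expects has a 'text' column and a 'label'
# column, and every label value must match an existing doc_term_name under the
# mark job's doc type, otherwise the ValueError above is raised. For example,
# assuming doc terms named "sports" and "finance" exist:
#
#   text,label
#   "The home team won the final 3-1.",sports
#   "Quarterly revenue grew by 12%.",finance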
def create_doc_term():
    from app.model import DocTermModel
    # seed default extraction terms (人名/person, 地名/location, 机构名/organization)
    if len(DocTermModel().get_all()) == 0:
        DocTermModel().create(app_id=1, created_by=1, doc_term_id=1, doc_term_name="人名",
                              doc_term_alias="nr", doc_type_id=1)
        DocTermModel().create(app_id=1, created_by=1, doc_term_id=2, doc_term_name="地名",
                              doc_term_alias="ns", doc_type_id=1)
        DocTermModel().create(app_id=1, created_by=1, doc_term_id=3, doc_term_name="机构名",
                              doc_term_alias="nt", doc_type_id=1)
        session.commit()
def create_export_task(current_user: CurrentUser, mark_job_ids, mark_type, export_type):
    # raise no result found exception
    redis_message = {}
    doc_type_id = MarkJobModel().get_by_id(int(mark_job_ids.split(',')[0])).doc_type_id
    doc_terms = [str(row.doc_term_id) for row in DocTermModel().get_by_filter(doc_type_id=doc_type_id)]
    if mark_type == 'wordseg':
        doc_terms = ['10086']
    elif mark_type == 'relation':
        relation_2_entity_mapping = [{i[0]: [d for d in i[1].split(",")]}
                                     for i in RelationM2mTermModel.get_relation_term_mapping(doc_type_id)]
        redis_message.update({
            'relation_2_entity_mapping': relation_2_entity_mapping,
        })
    version = '{}{}_{}_{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"), str(uuid.uuid4())[:4],
                                  doc_type_id, mark_job_ids)
    file_path = 'upload/export/{}.zip'.format(version)
    new_export_job = ExportJobModel().create(**{
        "export_file_path": file_path,
        "doc_type_id": doc_type_id,
        "created_by": current_user.user_id,
        "export_job_status": StatusEnum.processing.value,
        "export_mark_job_ids": [int(i) for i in mark_job_ids.split(',')]
    })
    export_id = new_export_job.export_job_id
    session.commit()
    # send to the offline NLP service
    redis_message.update({
        'export_id': export_id,
        'export_type': export_type,
        'file_path': file_path,
        'version': version,
        'doc_type': doc_type_id,
        'fields': ','.join(doc_terms),
        'mark_job_ids': mark_job_ids,
        'task_type': mark_type,
    })
    r.lpush(_get('DATA_EXPORT_QUEUE_KEY'), json.dumps(redis_message))
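# Illustrative sketch only: the shape of the payload pushed to the export queue
# above. Key names mirror those assembled in create_export_task; the values are
# hypothetical, and 'relation_2_entity_mapping' appears only for relation exports.
#
#   {
#       "export_id": 42,
#       "export_type": "csv",
#       "file_path": "upload/export/20240101120000ab12_3_7,8.zip",
#       "version": "20240101120000ab12_3_7,8",
#       "doc_type": 3,
#       "fields": "11,12,13",
#       "mark_job_ids": "7,8",
#       "task_type": "classify"
#   }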
def export_mark_file(nlp_task_id, mark_job_id, offset=50):
    mark_job = MarkJobModel().get_by_id(mark_job_id)
    if mark_job.mark_job_status not in (StatusEnum.approved, StatusEnum.success):
        abort(400, message="Some tasks failed or are unfinished; the job cannot be exported")
    all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=[mark_job_id])
    # convert 3 element tuple to a nested dict
    all_status_dict = Common().tuple_list2dict(all_count)
    if not (len(all_status_dict[mark_job_id]) == 1 and int(StatusEnum.approved) in all_status_dict[mark_job_id]):
        abort(400, message="Some tasks are unlabeled or unreviewed; the job cannot be exported")
    export_file_path = os.path.join(
        'upload/export', '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job_id))
    # check the previous export result; if nothing has changed since then, return it directly
    last_exported_file = export_sync.get_last_export_file(job=mark_job, export_file_path=export_file_path)
    if last_exported_file:
        return last_exported_file
    # otherwise regenerate the export
    export_fileset = FileSet(folder=export_file_path)
    mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(mark_job_ids=[mark_job_id])
    if nlp_task_id == int(NlpTaskEnum.extract):
        doc_terms = DocTermModel().get_by_filter(limit=99999, doc_type_id=mark_job.doc_type_id)
        file_path = export_sync.generate_extract_file(task_and_doc_list=mark_task_and_doc_list,
                                                      export_fileset=export_fileset,
                                                      doc_terms=doc_terms, offset=offset)
    elif nlp_task_id == int(NlpTaskEnum.classify):
        file_path = export_sync.generate_classify_file(task_and_doc_list=mark_task_and_doc_list,
                                                       export_fileset=export_fileset)
    elif nlp_task_id == int(NlpTaskEnum.wordseg):
        file_path = export_sync.generate_wordseg_file(task_and_doc_list=mark_task_and_doc_list,
                                                      export_fileset=export_fileset)
    else:
        abort(400, message="This task type cannot be exported")
    return file_path
def import_labeled_extract_files(self, f, mark_job: MarkJob):
    doc_type_id = mark_job.doc_type_id
    mark_job_id = mark_job.mark_job_id
    alias_id_mapping = DocTermModel().get_doc_term_alias_mapping(doc_type_id)

    # txt file contains multiple lines
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)

    sample_docs = []
    task_results = []
    with open(doc_relative_path, encoding="utf-8-sig") as fr:
        samples = fr.readlines()
        for sample in samples:
            sample = sample.replace("\n", "").strip()
            if len(sample) < 2:
                continue
            # parse tagged content into database format
            raw_content, task_result_list = self.parse_sample(sample, alias_id_mapping)
            doc_unique_name, _ = upload_fileset.save_file(f.filename, raw_content)
            sample_docs.append(doc_unique_name)
            task_results.append(task_result_list)

    # bulk insert docs
    doc_list = [dict(doc_raw_name=f.filename, doc_unique_name=d) for d in sample_docs]
    doc_entity_list = DocModel().bulk_create(doc_list)

    task_list = []
    for i in range(len(doc_entity_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job_id,
            mark_task_result=task_results[i] if task_results else {},
            mark_task_status=int(StatusEnum.approved) if task_results else int(StatusEnum.processing)
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)

    # push redis
    for i in range(len(doc_list)):
        self.push_mark_task_message(mark_job, task_entity_list[i], doc_entity_list[i], business="dp")
    return task_entity_list
def create_classify_train_job_by_doc_type_id(doc_type_id, train_job_name, train_job_desc,
                                             train_config, mark_job_ids, custom_id):
    # verify doc_type
    doc_type = DocTypeModel().get_by_id(doc_type_id)
    # get nlp_task name
    nlp_task = NlpTaskEnum.classify
    # generate model version by nlp task
    model_version = generate_model_version_by_nlp_task(doc_type_id, mark_job_ids, nlp_task)
    # create TrainJob record
    train_job = TrainJobModel().create(train_job_name=train_job_name,
                                       train_job_desc=train_job_desc,
                                       doc_type_id=doc_type_id,
                                       train_job_status=int(StatusEnum.training),
                                       preprocess={})
    # create TrainM2mMark records
    train_m2m_mark_list = [{"train_job_id": train_job.train_job_id, "mark_job_id": _id}
                           for _id in mark_job_ids]
    TrainM2mMarkbModel().bulk_create(train_m2m_mark_list)
    # create TrainTask record
    train_task = TrainTaskModel().create(train_job_id=train_job.train_job_id,
                                         train_model_name=train_job_name,
                                         train_model_desc=train_job_desc,
                                         train_config=train_config,
                                         train_status=int(StatusEnum.training),
                                         model_version=model_version)
    # bulk create train term
    doc_term_list = DocTermModel().get_by_filter(limit=99999, doc_type_id=doc_type_id)
    TrainTermTaskModel().bulk_create([
        dict(train_task_id=train_task.train_task_id,
             doc_term_id=doc_term.doc_term_id,
             train_term_status=int(StatusEnum.training))
        for doc_term in doc_term_list
    ])
    # assign doc term list to doc type
    doc_type.doc_term_list = doc_term_list
    if custom_id:
        custom_item = CustomAlgorithmModel().get_by_id(custom_id)
        custom = CustomAlgorithmSchema(only=("custom_id_name", "custom_ip", "custom_port")).dump(custom_item)
    else:
        custom = {}
    # push to redis
    push_train_task_to_redis(nlp_task, doc_type, train_task.train_task_id, model_version,
                             train_config, mark_job_ids, custom)
    session.commit()
    # add some attributes for dumping
    train_task.mark_job_ids = mark_job_ids
    train_job.train_list = [train_task]
    train_job.doc_type = doc_type
    train_job.model_version = model_version
    return train_job
def update_doc_term(doc_term_id, args):
    item = DocTermModel().update(doc_term_id, **args)
    session.commit()
    result = DocTermSchema().dump(item)
    return result
def remove_doc_term(doc_term_id):
    DocTermModel().delete(doc_term_id)
def check_term_in_relation(doc_term_id):
    return DocTermModel().check_term_in_relation(doc_term_id)
def create_doc_term(args, doc_type_id):
    item = DocTermModel().create(**args, doc_type_id=doc_type_id)
    session.commit()
    result = DocTermSchema().dump(item)
    return result
def get_classify_doc_rule(doc_type_id, offset, limit):
    items, count = DocTermModel().get_classify_doc_rule(doc_type_id, offset, limit)
    result = ClassifyDocRuleSchema(many=True).dump(items)
    timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    return result, count, timestamp