Example No. 1
def create_new_rule(args):
    is_existed = DocTermModel().check_exists_rule(args.get("doc_term_id"))
    if is_existed:
        raise ValueError("A rule for this label already exists; do not create a duplicate")
    classify_rule = DocTermModel().create_classify_rule(**args)
    session.commit()
    result = ClassifyDocRuleSchema().dump(classify_rule)
    return result
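A minimal usage sketch: only the "doc_term_id" key is confirmed by the body above; any other field (here "rule_content") is a hypothetical example, since the remaining kwargs are passed straight into create_classify_rule:

    # "rule_content" is an assumed field name, not confirmed by the source
    args = {"doc_term_id": 1, "rule_content": "invoice|receipt"}
    rule = create_new_rule(args)  # raises ValueError if a rule for this term already exists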
Example No. 2
def create_classify_doc_term(args, doc_type_id, doc_rule_list):
    doc_term = DocTermModel().create(**args, doc_type_id=doc_type_id)
    # flush so doc_term.doc_term_id is populated before the rules reference it
    doc_term.flush()
    for doc_rule_dict in doc_rule_list:
        ClassifyRuleModel().create(doc_term_id=doc_term.doc_term_id,
                                   **doc_rule_dict)
    session.commit()
    result = DocTermSchema().dump(doc_term)
    return result
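A hedged usage sketch; the keys in args and in each rule dict are hypothetical, since they are unpacked directly into DocTermModel.create and ClassifyRuleModel.create:

    # field names below are assumptions for illustration only
    args = {"doc_term_name": "sports", "doc_term_alias": "sp"}
    doc_rule_list = [{"rule_content": "football|basketball"}]
    doc_term = create_classify_doc_term(args, doc_type_id=1, doc_rule_list=doc_rule_list)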
Example No. 3
def create_evaluate_task_by_train_job_id(train_job_id, evaluate_task_name, evaluate_task_desc, mark_job_ids, doc_term_ids, doc_relation_ids, use_rule=0):
    """
    If retraining logic is added later, this part must change: the evaluate task should be
    created from train_task_id instead of train_job_id. Kept as-is for now because train_job
    and train_task are currently one-to-one, so there is no impact.
    """
    # get corresponding train_job, doc_type, train_task, nlp_task by train_job_id
    train_job = TrainJobModel().get_by_id(train_job_id)
    doc_type = DocTypeModel().get_by_id(train_job.doc_type_id)
    doc_term_list = DocTermModel().get_by_filter(limit=99999, doc_type_id=doc_type.doc_type_id)
    doc_type.doc_term_list = doc_term_list

    nlp_task = NlpTaskEnum(doc_type.nlp_task_id)
    _, train_task_list = TrainTaskModel().get_by_filter(train_job_id=train_job_id)
    train_task = train_task_list[0]

    # create evaluate_task
    evaluate_task = EvaluateTaskModel().create(evaluate_task_name=evaluate_task_name,
                                               evaluate_task_desc=evaluate_task_desc,
                                               train_task_id=train_task.train_task_id,
                                               evaluate_task_status=int(StatusEnum.processing))
    # bulk create evaluate m2m mark
    evaluate_m2m_mark_list = [{"evaluate_task_id": evaluate_task.evaluate_task_id, "mark_job_id": _id} for _id in mark_job_ids]
    EvaluateM2mMarkModel().bulk_create(evaluate_m2m_mark_list)

    # push to evaluate redis queue
    # note: this overwrites the doc_term_ids argument with the term ids derived from the relations
    doc_term_ids = [str(t.doc_term_id) for t in RelationM2mTermModel().get_by_filter(limit=99999, doc_relation_ids=[int(rl) for rl in doc_relation_ids])]
    push_evaluate_task_to_redis(nlp_task, evaluate_task, train_task, doc_type, mark_job_ids, doc_term_ids, doc_relation_ids, use_rule)
    session.commit()
    return evaluate_task
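A call sketch with made-up ids, following the signature above; note that for relation tasks the doc_term_ids argument is recomputed inside the function before being pushed to Redis:

    evaluate_task = create_evaluate_task_by_train_job_id(
        train_job_id=12,                  # hypothetical id
        evaluate_task_name="weekly eval",
        evaluate_task_desc="evaluate on last week's marked data",
        mark_job_ids=[3, 4],              # hypothetical mark job ids
        doc_term_ids=[],                  # overwritten inside from the relations
        doc_relation_ids=[7, 8],
        use_rule=0)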
Example No. 4
def get_doc_term_by_doctype(doc_type_id,
                            offset=0,
                            limit=10,
                            doc_term_ids=None):
    items, count = DocTermModel().get_doc_term_by_doctype(
        doc_type_id, offset, limit, doc_term_ids)
    result = DocTermSchema(many=True).dump(items)
    return result, count
Example No. 5
def get_doc_term_list(args):
    exclude_terms_ids = args.get('exclude_terms_ids', [])
    offset = args.get('offset', 0)
    limit = args.get('limit', 10)
    items, count = DocTermModel().get_by_exclude_terms(
        exclude_terms_ids=exclude_terms_ids, offset=offset, limit=limit)
    result = DocTermSchema(many=True).dump(items)
    return result, count
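A usage sketch, assuming args arrives as a plain dict of parsed query parameters:

    result, count = get_doc_term_list({"exclude_terms_ids": [2, 3], "offset": 0, "limit": 20})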
Example No. 6
def import_labeled_classify_files(f, mark_job):
    doc_type_id = mark_job.doc_type_id
    mark_job_id = mark_job.mark_job_id
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    csv_doc = DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
    try:
        df = pd.read_csv(doc_relative_path, skiprows=0, na_values='')
    except EmptyDataError:
        raise EmptyDataError('The uploaded data is empty, please check the upload: {}'.format(f.filename))
    except Exception:
        raise EmptyDataError('Error while processing the uploaded data, please check the upload: {}'.format(f.filename))
    if 'text' not in df.columns or 'label' not in df.columns:
        raise KeyError("the uploaded CSV must contain both a 'text' and a 'label' column")
    doc_terms, _ = DocTermModel.get_doc_term_by_doctype(doc_type_id, offset=0, limit=9999)
    doc_term_name2id_map = {m.doc_term_name: m.doc_term_id for m in doc_terms}
    content_list = []
    task_results = []
    for row_num, row in df.iterrows():
        content = row.get('text')
        label = row.get('label')
        try:
            label_id = doc_term_name2id_map[label]
        except KeyError as ke:
            raise ValueError(f"The label {ke.args[0]} on line {row_num + 2} of the file does not exist in the current project, please check")
        task_result = [{'prob': 1, 'marked': 1, 'label_id': label_id, 'label_name': label}]
        if content and label:
            content_list.append(content)
            task_results.append(task_result)

    # bulk insert doc
    unique_name_list = []
    for txt_content in content_list:
        doc_unique_name, _ = upload_fileset.save_file('format.txt', txt_content)
        unique_name_list.append(doc_unique_name)
    doc_list = [
        dict(
            doc_raw_name=csv_doc.doc_raw_name,
            doc_unique_name=unique_name,
        ) for unique_name in unique_name_list
    ]
    doc_entity_list = DocModel().bulk_create(doc_list)

    # bulk insert task
    task_list = []
    for i in range(len(doc_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job_id,
            mark_task_result=task_results[i] if task_results else {},
            mark_task_status=int(StatusEnum.approved)
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)

    return task_entity_list
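The function expects a CSV with 'text' and 'label' columns, and every label must match an existing doc_term_name in the project. A minimal input sketch (the label values are hypothetical):

    text,label
    "The team won the final last night",sports
    "Central bank raises interest rates",finance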
Example No. 7
def create_doc_term():
    from app.model import DocTermModel
    # seed three default NER terms if none exist:
    # person name (人名/nr), place name (地名/ns), organization name (机构名/nt)
    if len(DocTermModel().get_all()) == 0:
        DocTermModel().create(app_id=1,
                              created_by=1,
                              doc_term_id=1,
                              doc_term_name="人名",
                              doc_term_alias="nr",
                              doc_type_id=1)
        DocTermModel().create(app_id=1,
                              created_by=1,
                              doc_term_id=2,
                              doc_term_name="地名",
                              doc_term_alias="ns",
                              doc_type_id=1)
        DocTermModel().create(app_id=1,
                              created_by=1,
                              doc_term_id=3,
                              doc_term_name="机构名",
                              doc_term_alias="nt",
                              doc_type_id=1)
        session.commit()
Example No. 8
def create_export_task(current_user: CurrentUser, mark_job_ids, mark_type, export_type):
    # get_by_id raises a no-result-found exception if the mark job does not exist
    redis_message = {}
    doc_type_id = MarkJobModel().get_by_id(int(mark_job_ids.split(',')[0])).doc_type_id
    doc_terms = [str(row.doc_term_id) for row in DocTermModel().get_by_filter(doc_type_id=doc_type_id)]
    if mark_type == 'wordseg':
        doc_terms = ['10086']
    elif mark_type == 'relation':
        relation_2_entity_mapping = [{i[0]: i[1].split(",")} for i in RelationM2mTermModel.get_relation_term_mapping(doc_type_id)]
        redis_message.update({
            'relation_2_entity_mapping': relation_2_entity_mapping,
        })
    version = '{}{}_{}_{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"), str(uuid.uuid4())[:4], doc_type_id,
                                  mark_job_ids)
    file_path = 'upload/export/{}.zip'.format(version)

    new_export_job = ExportJobModel().create(**{
        "export_file_path": file_path,
        "doc_type_id": doc_type_id,
        "created_by": current_user.user_id,
        "export_job_status": StatusEnum.processing.value,
        "export_mark_job_ids": [int(i) for i in mark_job_ids.split(',')]
    })
    export_id = new_export_job.export_job_id

    session.commit()
    # send to the offline NLP service
    redis_message.update({
        'export_id': export_id,
        'export_type': export_type,
        'file_path': file_path,
        'version': version,
        'doc_type': doc_type_id,
        'fields': ','.join(doc_terms),
        'mark_job_ids': mark_job_ids,
        'task_type': mark_type,
    })
    r.lpush(_get('DATA_EXPORT_QUEUE_KEY'), json.dumps(redis_message))
Example No. 9
def export_mark_file(nlp_task_id, mark_job_id, offset=50):
    mark_job = MarkJobModel().get_by_id(mark_job_id)

    if mark_job.mark_job_status not in (StatusEnum.approved, StatusEnum.success):
        abort(400, message="Cannot export: the job has failed or unfinished tasks")

    all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=[mark_job_id])
    # convert 3-element tuples to a nested dict
    all_status_dict = Common().tuple_list2dict(all_count)

    if not (len(all_status_dict[mark_job_id]) == 1 and int(StatusEnum.approved) in all_status_dict[mark_job_id]):
        abort(400, message="Cannot export: there are unmarked or unreviewed tasks")

    export_file_path = os.path.join(
        'upload/export', '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job_id))
    # check the previous export result; if nothing has been updated since, return it directly
    last_exported_file = export_sync.get_last_export_file(job=mark_job, export_file_path=export_file_path)
    if last_exported_file:
        return last_exported_file

    # rebuild the export
    export_fileset = FileSet(folder=export_file_path)
    mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(mark_job_ids=[mark_job_id])

    if nlp_task_id == int(NlpTaskEnum.extract):
        doc_terms = DocTermModel().get_by_filter(limit=99999, doc_type_id=mark_job.doc_type_id)
        file_path = export_sync.generate_extract_file(task_and_doc_list=mark_task_and_doc_list,
                                                      export_fileset=export_fileset, doc_terms=doc_terms,
                                                      offset=offset)
    elif nlp_task_id == int(NlpTaskEnum.classify):
        file_path = export_sync.generate_classify_file(task_and_doc_list=mark_task_and_doc_list,
                                                       export_fileset=export_fileset)
    elif nlp_task_id == int(NlpTaskEnum.wordseg):
        file_path = export_sync.generate_wordseg_file(task_and_doc_list=mark_task_and_doc_list,
                                                      export_fileset=export_fileset)
    else:
        abort(400, message="This task type cannot be exported")
    return file_path
Example No. 10
def import_labeled_extract_files(self, f, mark_job: MarkJob):
    doc_type_id = mark_job.doc_type_id
    mark_job_id = mark_job.mark_job_id
    alias_id_mapping = DocTermModel().get_doc_term_alias_mapping(doc_type_id)
    # the txt file contains multiple lines, one tagged sample per line
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
    sample_docs = []
    task_results = []
    with open(doc_relative_path, encoding="utf-8-sig") as fr:
        samples = fr.readlines()
        for sample in samples:
            sample = sample.replace("\n", "").strip()
            if len(sample) < 2:
                continue
            # parse tagged content into database format
            raw_content, task_result_list = self.parse_sample(sample, alias_id_mapping)
            doc_unique_name, _ = upload_fileset.save_file(f.filename, raw_content)
            sample_docs.append(doc_unique_name)
            task_results.append(task_result_list)
    # bulk insert docs
    doc_list = [dict(doc_raw_name=f.filename, doc_unique_name=d) for d in sample_docs]
    doc_entity_list = DocModel().bulk_create(doc_list)
    task_list = []
    for i in range(len(doc_entity_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job_id,
            mark_task_result=task_results[i] if task_results else {},
            mark_task_status=int(StatusEnum.approved) if task_results else int(StatusEnum.processing)
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)
    # push redis
    for i in range(len(doc_list)):
        self.push_mark_task_message(mark_job, task_entity_list[i], doc_entity_list[i], business="dp")
    return task_entity_list
Example No. 11
def create_classify_train_job_by_doc_type_id(doc_type_id, train_job_name,
                                             train_job_desc, train_config,
                                             mark_job_ids, custom_id):
    # verify doc_type
    doc_type = DocTypeModel().get_by_id(doc_type_id)
    # get nlp_task name
    nlp_task = NlpTaskEnum.classify
    # generate model version by nlp task
    model_version = generate_model_version_by_nlp_task(
        doc_type_id, mark_job_ids, nlp_task)

    # create TrainJob table
    train_job = TrainJobModel().create(train_job_name=train_job_name,
                                       train_job_desc=train_job_desc,
                                       doc_type_id=doc_type_id,
                                       train_job_status=int(
                                           StatusEnum.training),
                                       preprocess={})
    # create TrainM2mMark table
    train_m2m_mark_list = [{
        "train_job_id": train_job.train_job_id,
        "mark_job_id": _id
    } for _id in mark_job_ids]
    TrainM2mMarkbModel().bulk_create(train_m2m_mark_list)

    # create TrainTask table
    train_task = TrainTaskModel().create(
        train_job_id=train_job.train_job_id,
        train_model_name=train_job_name,
        train_model_desc=train_job_desc,
        train_config=train_config,
        train_status=int(StatusEnum.training),
        model_version=model_version)
    # bulk create train term
    doc_term_list = DocTermModel().get_by_filter(limit=99999,
                                                 doc_type_id=doc_type_id)
    TrainTermTaskModel().bulk_create([
        dict(train_task_id=train_task.train_task_id,
             doc_term_id=doc_term.doc_term_id,
             train_term_status=int(StatusEnum.training))
        for doc_term in doc_term_list
    ])
    # assign doc term list to doc type
    doc_type.doc_term_list = doc_term_list

    if custom_id:
        custom_item = CustomAlgorithmModel().get_by_id(custom_id)
        custom = CustomAlgorithmSchema(
            only=("custom_id_name", "custom_ip",
                  "custom_port")).dump(custom_item)
    else:
        custom = {}

    # push to redis
    push_train_task_to_redis(nlp_task, doc_type, train_task.train_task_id,
                             model_version, train_config, mark_job_ids,
                             custom)
    session.commit()

    # add some attributes for dumping
    train_task.mark_job_ids = mark_job_ids
    train_job.train_list = [train_task]
    train_job.doc_type = doc_type
    train_job.model_version = model_version
    return train_job
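A call sketch with hypothetical values; train_config is passed through to the train task and the Redis message unchanged, so its structure is not shown here:

    train_job = create_classify_train_job_by_doc_type_id(
        doc_type_id=1,                     # hypothetical ids and names
        train_job_name="news classifier v1",
        train_job_desc="baseline classify run",
        train_config={},
        mark_job_ids=[3, 4],
        custom_id=None)                    # falsy -> no custom algorithm service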
Example No. 12
def update_doc_term(doc_term_id, args):
    item = DocTermModel().update(doc_term_id, **args)
    session.commit()
    result = DocTermSchema().dump(item)
    return result
Example No. 13
def remove_doc_term(doc_term_id):
    DocTermModel().delete(doc_term_id)
Example No. 14
def check_term_in_relation(doc_term_id):
    return DocTermModel().check_term_in_relation(doc_term_id)
Example No. 15
def create_doc_term(args, doc_type_id):
    item = DocTermModel().create(**args, doc_type_id=doc_type_id)
    session.commit()
    result = DocTermSchema().dump(item)
    return result
Example No. 16
def get_classify_doc_rule(doc_type_id, offset, limit):
    items, count = DocTermModel().get_classify_doc_rule(
        doc_type_id, offset, limit)
    result = ClassifyDocRuleSchema(many=True).dump(items)
    timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    return result, count, timestamp