def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """
    Create one "naive" DocumentFieldDetector per row of a CSV config.

    For every parsed row the first cell becomes the detector's detected value,
    and all non-empty cells of the row (the first one included), joined with
    newlines and lower-cased, become its include regexps.

    :param log: receives info messages and progress steps.
    :param document_field: field the detectors are attached to; its ``choices``
        are overwritten when ``update_field_choice_values`` is set.
    :param csv: raw CSV content, parsed with ``pandas.read_csv`` (all cells as str).
    :param drop_previous_field_detectors: delete the detectors previously imported
        for this field (category FD_CATEGORY_IMPORTED_SIMPLE_CONFIG) first.
    :param update_field_choice_values: replace the field's choices with the sorted
        unique non-empty values of the first CSV column.
    :raises ValueError: if the parsed CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        choices = df[df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info('Creating {2} naive field detectors for document field {0} and document type {1}...'
             .format(document_field, document_field.document_type, row_num))
    log.set_progress_steps_number(row_num // 10 + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    # Count rows with enumerate() instead of trusting the DataFrame index:
    # read_csv may implicitly promote the first column to a (string) index,
    # in which case ``index % 10`` would fail.
    for position, (_, row) in enumerate(df.iterrows()):
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # Explicit positional access: ``row[0]`` on a label-indexed Series is
        # a deprecated positional fallback in pandas.
        # NOTE(review): a row with an empty first cell yields NaN here -
        # confirm whether such rows should be skipped instead.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if position % 10 == 0:
            log.step_progress()

    log.info('Done.')
def make_doc_field_detector(self) -> DocumentFieldDetector:
    """Build an unsaved DocumentFieldDetector populated with fixed sample settings."""
    preset = (
        ('exclude_regexps', 'cushion'),
        ('include_regexps', r'(?<=\D{3,3}\s\D{5,5}\s)\D+'),
        ('detected_value', 'shall'),
        ('extraction_hint', None),
    )
    field_detector = DocumentFieldDetector()
    # Attributes are applied in the listed order.
    for attr_name, attr_value in preset:
        setattr(field_detector, attr_name, attr_value)
    return field_detector
def make_doc_field_detector(self,
                            exclude_regexps: Optional[str] = None,
                            include_regexps: Optional[str] = None,
                            detected_value: Optional[str] = None) -> DocumentFieldDetector:
    """
    Build an unsaved DocumentFieldDetector, falling back to sample settings.

    Any argument left as None is replaced by its built-in default:
    'cushion' for the exclude regexps, a fixed look-behind pattern for the
    include regexps and 'shall' for the detected value.
    The extraction hint is always set to 'detected'.
    """
    if exclude_regexps is None:
        exclude_regexps = 'cushion'
    if include_regexps is None:
        include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'
    if detected_value is None:
        detected_value = 'shall'

    field_detector = DocumentFieldDetector()
    field_detector.exclude_regexps = exclude_regexps
    field_detector.include_regexps = include_regexps
    field_detector.detected_value = detected_value
    field_detector.extraction_hint = 'detected'
    return field_detector
def make_doc_field_detector(exclude_regexps: Optional[str] = None,
                            include_regexps: Optional[str] = None,
                            detected_value: Optional[str] = None,
                            regexps_pre_process_lower: bool = True,
                            definition_words: Optional[str] = None) -> DocumentFieldDetector:
    """
    Build an unsaved DocumentFieldDetector, falling back to sample settings.

    ``exclude_regexps`` and ``include_regexps`` get built-in defaults when None.
    ``detected_value`` is only assigned when it is not None; otherwise the
    attribute keeps whatever a fresh DocumentFieldDetector provides.
    The extraction hint is fixed to 'TAKE_FIRST' and the text part to
    'INSIDE_REGEXP'; the remaining arguments are copied as given.
    """
    if exclude_regexps is None:
        exclude_regexps = 'cushion'
    if include_regexps is None:
        include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'

    field_detector = DocumentFieldDetector()
    field_detector.exclude_regexps = exclude_regexps
    field_detector.include_regexps = include_regexps
    if detected_value is not None:
        field_detector.detected_value = detected_value
    # An earlier revision used the hint 'detected' here.
    field_detector.extraction_hint = 'TAKE_FIRST'
    field_detector.text_part = 'INSIDE_REGEXP'
    field_detector.regexps_pre_process_lower = regexps_pre_process_lower
    field_detector.definition_words = definition_words
    return field_detector
def save_detector_settings(self,
                           detectors_by_value: Dict[str, List[str]]) -> None:
    """
    Persist detector patterns grouped by the value they detect.

    When ``self.save_in_csv_format`` is set, the whole mapping is handed to
    ``self.save_detector_settings_csv`` and nothing else happens. Otherwise
    one DocumentFieldDetector is saved per mapping key, with that key's
    patterns joined by newlines as the include regexps.

    :param detectors_by_value: detected value -> list of include patterns.
    """
    if self.save_in_csv_format:
        # CSV mode: delegate the complete mapping and stop here.
        self.save_detector_settings_csv(detectors_by_value)
        return

    if self.drop_previous_field_detectors:
        # Optionally clear detectors left over from a previous import.
        previous = DocumentFieldDetector.objects.filter(
            field=self.document_field,
            category=self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG)
        previous.delete()

    for detected_value, patterns in detectors_by_value.items():
        record = DocumentFieldDetector()
        record.category = self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        record.field = self.document_field
        record.regexps_pre_process_lower = True
        record.detected_value = detected_value
        record.include_regexps = '\n'.join(patterns)
        record.save()