def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """
    Create one field detector per data row of a "simple config" CSV.

    The CSV is parsed with pandas, reading every cell as a string. For each
    row the first cell becomes the detector's detected value; all non-empty
    cells of the row (the first one included) are joined with newlines,
    lower-cased and stored as the detector's include regexps.

    :param log: logger used for info messages and progress steps.
    :param document_field: field the detectors are attached to. Its
        ``choices`` are overwritten and saved when
        ``update_field_choice_values`` is true.
    :param csv: raw CSV content.
    :param drop_previous_field_detectors: when true, delete this field's
        detectors of category ``FD_CATEGORY_IMPORTED_SIMPLE_CONFIG`` before
        creating the new ones.
    :param update_field_choice_values: when true, replace the field's choices
        with the sorted, de-duplicated, non-empty values of the first column.
    :raises ValueError: if the parsed CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        choices = df.iloc[:, 0].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    # One progress step is reported for every tenth row (see the loop below).
    log.set_progress_steps_number(row_num // 10 + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    for index, row in df.iterrows():
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # The row is indexed by the CSV column names, so the first cell has to
        # be addressed by position explicitly: a bare integer key on a
        # label-indexed Series is deprecated in pandas.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if index % 10 == 0:
            log.step_progress()

    log.info('Done.')
def detect_values_in_document(self,
                              text_units: List[MockTextUnit],
                              detector: DocumentFieldDetector,
                              **doc_field_kwargs):
    """
    Run ``RegexpsOnlyFieldDetectionStrategy.detect_field_value`` against mock repositories.

    A document field is built from ``doc_field_kwargs`` and attached to
    ``detector``; every text unit is bound to a freshly set up document and to
    the field's text unit type. The strategy's class-level ``text_unit_repo``
    and ``field_detector_repo`` are replaced by mocks for the duration of the
    call and always restored afterwards.

    :param text_units: text units the detection runs over (modified in place).
    :param detector: the only detector exposed to the strategy.
    :param doc_field_kwargs: forwarded to ``self.make_doc_field``.
    :return: whatever ``detect_field_value`` returns.
    """
    init_field_type_registry()

    doc_field = self.make_doc_field(**doc_field_kwargs)
    detector.field = doc_field
    document = self.setup_document(text_units)

    detector_repo = MockFieldDetectorRepository()
    detector_repo.detectors = [detector]

    unit_repo = MockTextUnitRepository()
    unit_repo.units = text_units
    for unit in unit_repo.units:
        unit.document = document
        unit.unit_type = doc_field.text_unit_type

    strategy = RegexpsOnlyFieldDetectionStrategy
    saved_unit_repo = strategy.text_unit_repo
    strategy.text_unit_repo = unit_repo
    saved_detector_repo = strategy.field_detector_repo
    strategy.field_detector_repo = detector_repo
    try:
        return strategy.detect_field_value(None, document, doc_field, {})
    finally:
        # Put the original repositories back even if detection raised.
        strategy.text_unit_repo = saved_unit_repo
        strategy.field_detector_repo = saved_detector_repo
def __init__(self, text: str, field_type: str):
    """
    Build the document, field, text unit, detector and matcher used by the tests.

    :param text: text stored in the text unit; its length also determines
        the unit's ``location_end``.
    :param field_type: value assigned to the document field's ``type``.
    """
    self.document = Document()

    doc_field = DocumentField()
    doc_field.type = field_type
    self.field = doc_field

    unit = TextUnit()
    unit.document = self.document
    unit.textunittext = TextUnitText()
    unit.textunittext.text = text
    unit.location_start = 1001
    unit.location_end = unit.location_start + len(text)
    self.text_unit = unit

    fd = DocumentFieldDetector()
    fd.regexps_pre_process_lower = True
    # Newline-separated regular expressions.
    fd.include_regexps = (
        'at\\s{1,5}least\\s{1,5}(two|2).{1,15}unaffiliated.{1,15}lenders\n'
        '(two|2).{1,30}lenders.{1,200}(not.{1,50}affiliate|affiliate.{1,100}(one|1|single))'
    )
    # Newline-separated definition words.
    fd.definition_words = (
        'required lenders\nrequired revolving lenders\n'
        'required revolving credit lenders\nrequired term lenders\n'
        'requisite lenders\nrequisite revolving lenders\n'
        'required class lenders\nrequired ddtl lenders'
    )
    fd.detected_value = 'AFFILIATED'
    fd.text_part = TextParts.FULL.value
    fd.extraction_hint = ValueExtractionHint.TAKE_FIRST
    self.detector = fd

    self.matcher = DetectorFieldMatcher(fd)
def make_doc_field_detector(exclude_regexps: Optional[str] = None,
                            include_regexps: Optional[str] = None,
                            detected_value: Optional[str] = None,
                            regexps_pre_process_lower: bool = True,
                            definition_words: Optional[str] = None) -> DocumentFieldDetector:
    """
    Build an unsaved ``DocumentFieldDetector`` with test defaults.

    :param exclude_regexps: exclude patterns; ``'cushion'`` when ``None``.
    :param include_regexps: include patterns; a look-behind pattern when ``None``.
    :param detected_value: assigned only when it is not ``None``.
    :param regexps_pre_process_lower: copied to the detector as is.
    :param definition_words: copied to the detector as is.
    :return: the configured detector (``save()`` is not called).
    """
    if exclude_regexps is None:
        exclude_regexps = 'cushion'
    if include_regexps is None:
        include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'

    fd = DocumentFieldDetector()
    fd.exclude_regexps = exclude_regexps
    fd.include_regexps = include_regexps
    if detected_value is not None:
        fd.detected_value = detected_value
    fd.extraction_hint = 'TAKE_FIRST'  # 'detected'
    fd.text_part = 'INSIDE_REGEXP'
    fd.regexps_pre_process_lower = regexps_pre_process_lower
    fd.definition_words = definition_words
    return fd
def clean(self):
    """
    Validate the regexp fields and the detected value of a field detector form.

    ``exclude_regexps`` and ``include_regexps`` are passed to
    ``DocumentFieldDetector.compile_regexps_string``; ``detected_value`` is
    checked against the type of the selected ``field`` with
    ``DetectorFieldMatcher.validate_detected_value``. Any exception raised by
    these helpers is attached to the corresponding form field.

    Values that are absent from ``cleaned_data`` (their own field validation
    already failed) are skipped, so that a ``KeyError`` is not reported to the
    user as if it were a validation message.

    :return: ``self.cleaned_data``.
    """
    cleaned = self.cleaned_data

    for regexps_field in ('exclude_regexps', 'include_regexps'):
        if regexps_field not in cleaned:
            continue
        try:
            DocumentFieldDetector.compile_regexps_string(cleaned[regexps_field])
        except Exception as exc:
            # Whatever the compiler raises is shown next to the offending field.
            self.add_error(regexps_field, exc)

    field = cleaned.get('field')
    # Without a valid field there is no type to validate the value against.
    if field is not None and 'detected_value' in cleaned:
        try:
            DetectorFieldMatcher.validate_detected_value(
                field.type, cleaned['detected_value'])
        except Exception as exc:
            self.add_error('detected_value', exc)

    return cleaned
def make_doc_field_detector(self) -> DocumentFieldDetector:
    """
    Build an unsaved ``DocumentFieldDetector`` with fixed test values.

    :return: detector excluding ``'cushion'``, including a look-behind
        pattern, with detected value ``'shall'`` and no extraction hint.
    """
    fd = DocumentFieldDetector()
    preset = (
        ('exclude_regexps', 'cushion'),
        ('include_regexps', r'(?<=\D{3,3}\s\D{5,5}\s)\D+'),
        ('detected_value', 'shall'),
        ('extraction_hint', None),
    )
    for attr_name, attr_value in preset:
        setattr(fd, attr_name, attr_value)
    return fd
def make_doc_field_detector(self,
                            exclude_regexps: Optional[str] = None,
                            include_regexps: Optional[str] = None,
                            detected_value: Optional[str] = None) -> DocumentFieldDetector:
    """
    Build an unsaved ``DocumentFieldDetector``, falling back to test defaults.

    :param exclude_regexps: exclude patterns; ``'cushion'`` when ``None``.
    :param include_regexps: include patterns; a look-behind pattern when ``None``.
    :param detected_value: detected value; ``'shall'`` when ``None``.
    :return: the configured detector with extraction hint ``'detected'``.
    """
    if exclude_regexps is None:
        exclude_regexps = 'cushion'
    if include_regexps is None:
        include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'
    if detected_value is None:
        detected_value = 'shall'

    fd = DocumentFieldDetector()
    fd.exclude_regexps = exclude_regexps
    fd.include_regexps = include_regexps
    fd.detected_value = detected_value
    fd.extraction_hint = 'detected'
    return fd
def save_detector_settings(self, detectors_by_value: Dict[str, List[str]]) -> None:
    """
    Persist the detector patterns grouped by detected value.

    When ``self.save_in_csv_format`` is set the whole mapping is handed to
    ``self.save_detector_settings_csv`` and nothing else happens. Otherwise
    one ``DocumentFieldDetector`` is saved per mapping entry, optionally after
    deleting this field's previously imported detectors.

    :param detectors_by_value: detected value -> include patterns for it.
    """
    # CSV mode: all "pattern: value" records are stored by the CSV saver.
    if self.save_in_csv_format:
        self.save_detector_settings_csv(detectors_by_value)
        return

    # Record mode: optionally remove the old settings first ...
    if self.drop_previous_field_detectors:
        previous = DocumentFieldDetector.objects.filter(
            field=self.document_field,
            category=self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG)
        previous.delete()

    # ... then store each value's patterns as a separate detector record.
    for detected_value, patterns in detectors_by_value.items():
        record = DocumentFieldDetector()
        record.category = self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        record.field = self.document_field
        record.regexps_pre_process_lower = True
        record.detected_value = detected_value
        record.include_regexps = '\n'.join(patterns)
        record.save()
def clean(self):
    """
    Run the form-level validation of a document field.

    Problems are reported through ``self.add_error``. The checks, in order:
    field type is registered; "unsure" choice value and thresholds agree with
    the choice values; stop words compile; classifier init script initializes;
    field dependencies can be ordered; Python-coded field matches the type;
    ``formula`` and ``hide_until_python`` evaluate against example values;
    default value fits the type; value regexp compiles; field code is valid
    (``self.validate_field_code``); and, when the type of an existing field
    changes, the detected values of its detectors still fit the new type.

    NOTE(review): ``classifier_init_script``, ``hide_until_python``, the
    "unsure" choice value and ``choices`` are read with ``[]`` and raise
    ``KeyError`` when absent from ``cleaned_data`` - confirm these fields can
    never fail their own validation.

    :return: ``self.cleaned_data``.
    """
    field_code = self.cleaned_data.get('code')
    formula = self.cleaned_data.get('formula')
    type_code = self.cleaned_data.get('type')
    depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
    document_type = self.cleaned_data.get('document_type')
    depends_on_fields = list(depends_on_fields)
    classifier_init_script = self.cleaned_data['classifier_init_script']
    stop_words = self.cleaned_data.get('stop_words')
    hide_until_python = self.cleaned_data['hide_until_python']
    default_value = self.cleaned_data.get('default_value')
    unsure_choice_value = self.cleaned_data[self.UNSURE_CHOICE_VALUE]
    choice_values = DocumentField.parse_choice_values(self.cleaned_data['choices'])
    unsure_thresholds_by_value = self.cleaned_data.get(self.UNSURE_THRESHOLDS)

    # Stays None for an unknown type so that the later type-dependent checks
    # can be skipped instead of failing on an unbound name.
    field_type = None
    try:
        field_type = FIELD_TYPE_REGISTRY[type_code]
    except KeyError:
        self.add_error('type', 'Unknown field type "{}".'.format(type_code))

    if unsure_choice_value and (not choice_values or unsure_choice_value not in choice_values):
        self.add_error(self.UNSURE_CHOICE_VALUE,
                       '"Unsure choice value" must be listed in the choice values.')

    if unsure_thresholds_by_value is not None:
        if not hasattr(unsure_thresholds_by_value, 'items'):
            self.add_error(self.UNSURE_THRESHOLDS,
                           'Must be a dict of choice values to float thresholds [0..1]')
        else:
            if not choice_values:
                self.add_error(self.UNSURE_THRESHOLDS,
                               '"Unsure" thresholds are set but choice values are not.')
            if not unsure_choice_value:
                self.add_error(self.UNSURE_THRESHOLDS,
                               '"Unsure" thresholds are set but '
                               '"unsure" choice value is not.')

            if choice_values and unsure_choice_value:
                for k, v in unsure_thresholds_by_value.items():
                    if k == unsure_choice_value:
                        self.add_error(self.UNSURE_THRESHOLDS,
                                       'Please set thresholds only for "sure" choice '
                                       'values and not for ' + k)
                    elif k not in choice_values:
                        self.add_error(self.UNSURE_THRESHOLDS,
                                       'Value not in choice values: ' + k)
                    if not isinstance(v, (int, float)) or v < 0 or v > 1:
                        self.add_error(self.UNSURE_THRESHOLDS,
                                       'Threshold should be a float value between 0 and 1: ' + k)

    try:
        stop_words = compile_stop_words(stop_words)
        # Dry run on a dummy text to surface errors of the compiled stop words.
        detect_value_with_stop_words(stop_words, 'dummy text')
    except Exception as err:
        self.add_error('stop_words', str(err))

    try:
        init_classifier_impl(field_code, classifier_init_script)
    except ScriptError as err:
        self.add_error('classifier_init_script', str(err).split('\n'))

    # Dependency graph of this field plus the dependencies of its dependencies;
    # order_field_detection() raises ValueError when it cannot order them.
    fields_and_deps = {self.cleaned_data.get('code') or 'xxx': {f.code for f in depends_on_fields}}
    fields_and_deps = self._extract_field_and_deps(depends_on_fields, fields_and_deps)
    fields_and_deps = [(code, deps) for code, deps in fields_and_deps.items()]
    try:
        order_field_detection(fields_and_deps)
    except ValueError as ve:
        self.add_error(None, str(ve))

    # Example values of the dependencies, used to evaluate the formula.
    fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                        for field in depends_on_fields}

    python_coded_field_code = self.cleaned_data.get('python_coded_field')
    if python_coded_field_code:
        python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(python_coded_field_code)
        if not python_coded_field:
            self.add_error('python_coded_field',
                           'Unknown Python-coded field: {0}'.format(python_coded_field_code))
        else:
            if type_code != python_coded_field.type:
                self.add_error('type',
                               'Python-coded field {0} is of type {1} but {2} is specified'
                               ' as the field type'.format(python_coded_field.title,
                                                           python_coded_field.type,
                                                           type_code))

    if formula and formula.strip() and type_code:
        self.calc_formula(field_code, type_code, formula, fields_to_values, 'formula')

    hide_until_python = hide_until_python.strip() if hide_until_python else None
    if hide_until_python:
        # The "hide until" expression is evaluated against example values of
        # all fields of the document type, with this field's own entry
        # replaced by an example value of the type currently being saved.
        fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                            for field in list(document_type.fields.all())}
        if field_code and field_code in fields_to_values:
            del fields_to_values[field_code]
        # An unknown type was already reported above; looking it up again
        # would raise an unhandled KeyError.
        if field_type is not None:
            fields_to_values[field_code] = field_type.example_python_value(self.instance)

        self.calc_formula(field_code,
                          None,
                          hide_until_python,
                          fields_to_values,
                          'hide_until_python',
                          formula_name='hide until python')

    if default_value is not None:
        if type_code == RelatedInfoField.code:
            self.add_error('default_value', 'Related info field can\'t have default value')
        elif field_type is not None \
                and field_type.extract_from_possible_value(self.instance, default_value) != default_value:
            self.add_error('default_value',
                           'Wrong value for type {0}. Example: {1}'
                           .format(type_code,
                                   json.dumps(field_type.example_python_value(self.instance))))

    try:
        DocumentField.compile_value_regexp(self.cleaned_data['value_regexp'])
    except Exception as exc:
        self.add_error('value_regexp', exc)

    self.validate_field_code()

    if self.initial and 'type' in self.changed_data:
        wrong_field_detector_pks = []
        for field_detector in DocumentFieldDetector.objects.filter(field=self.instance):
            try:
                DocumentFieldDetector.validate_detected_value(type_code,
                                                              field_detector.detected_value)
            except Exception:
                # str(): the primary key is not necessarily a string.
                wrong_field_detector_pks.append('#' + str(field_detector.pk))
        if wrong_field_detector_pks:
            self.add_error('type',
                           'Detected value is not allowed for this field type, please unset detected value '
                           'for this field detectors: {0}'.format(', '.join(wrong_field_detector_pks)))

    return self.cleaned_data
def clean(self):
    """
    Run the form-level validation of a document field.

    Problems are reported through ``self.add_error``. The checks, in order:
    stop words compile; classifier init script initializes; field
    dependencies can be ordered; Python-coded field matches the type;
    ``formula`` and ``hide_until_python`` evaluate against example values;
    a related-info field has no default value; value regexp compiles; the
    escaped field code is short enough and contains a latin letter; and, when
    the type of an existing field changes, the detected values of its
    detectors still fit the new type.

    :return: ``self.cleaned_data``.
    """
    field_code = self.cleaned_data.get('code')
    formula = self.cleaned_data.get('formula')
    type_code = self.cleaned_data.get('type')
    depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
    document_type = self.cleaned_data.get('document_type')
    depends_on_fields = list(depends_on_fields)
    classifier_init_script = self.cleaned_data['classifier_init_script']
    stop_words = self.cleaned_data['stop_words']
    hide_until_python = self.cleaned_data['hide_until_python']
    default_value = self.cleaned_data['default_value']

    try:
        stop_words = compile_stop_words(stop_words)
        # Dry run on a dummy text to surface errors of the compiled stop words.
        detect_value_with_stop_words(stop_words, 'dummy text')
    except Exception as err:
        self.add_error('stop_words', str(err))

    try:
        FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl(
            field_code, classifier_init_script)
    except ScriptError as err:
        self.add_error('classifier_init_script', str(err).split('\n'))

    # Dependency graph of this field plus the dependencies of its dependencies;
    # order_field_detection() raises ValueError when it cannot order them.
    fields_and_deps = {
        self.cleaned_data.get('code') or 'xxx': {f.code for f in depends_on_fields}
    }
    fields_and_deps = self._extract_field_and_deps(depends_on_fields, fields_and_deps)
    fields_and_deps = [(code, deps) for code, deps in fields_and_deps.items()]
    try:
        order_field_detection(fields_and_deps)
    except ValueError as ve:
        self.add_error(None, str(ve))

    # Example values of the dependencies, used to evaluate the formula.
    fields_to_values = {
        field.code: FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
        for field in depends_on_fields
    }

    python_coded_field_code = self.cleaned_data.get('python_coded_field')
    if python_coded_field_code:
        python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(python_coded_field_code)
        if not python_coded_field:
            self.add_error('python_coded_field',
                           'Unknown Python-coded field: {0}'.format(python_coded_field_code))
        else:
            if type_code != python_coded_field.type:
                self.add_error('type',
                               'Python-coded field {0} is of type {1} but {2} is specified'
                               ' as the field type'.format(python_coded_field.title,
                                                           python_coded_field.type,
                                                           type_code))

    if formula and formula.strip() and type_code:
        self.calc_formula(field_code, type_code, formula, fields_to_values, 'formula')

    hide_until_python = hide_until_python.strip() if hide_until_python else None
    if hide_until_python:
        # The "hide until" expression is evaluated against example values of
        # all fields of the document type, with the stored field's own entry
        # replaced by an example value built from the submitted form data.
        fields_to_values = {
            field.code: FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
            for field in list(document_type.fields.all())
        }
        code = self.instance.code if self.instance else None
        if code and code in fields_to_values:
            del fields_to_values[code]
        if type_code:
            # NOTE(review): builds a throw-away DocumentField from all cleaned
            # form values - confirm the model constructor accepts every key.
            fields_to_values[field_code] = FIELD_TYPES_REGISTRY[type_code] \
                .example_python_value(DocumentField(**self.cleaned_data))

        self.calc_formula(field_code,
                          None,
                          hide_until_python,
                          fields_to_values,
                          'hide_until_python',
                          formula_name='hide until python')

    if default_value and type_code == RelatedInfoField.code:
        self.add_error('default_value', 'Related info field can\'t have default value')

    try:
        DocumentField.compile_value_regexp(self.cleaned_data['value_regexp'])
    except Exception as exc:
        self.add_error('value_regexp', exc)

    # Ensure field code is not too long for Postgres column names
    # We use field codes to build column names for Postgres tables.
    # Max length of column name is 63. We escape them to snake case and sometimes add postfixes to them.
    # Lets assume that we should have max 23 chars for postfixes and max 40 chars for the field code.
    field_code_escaped = escape_column_name(field_code)
    if len(field_code_escaped) > self.MAX_ESCAPED_FIELD_CODE_LEN:
        self.add_error(
            'code',
            ('Field code is too long. Field codes are used to build column names of DB tables. '
             'Escaped version should have max {max_length} chars but it is {length} chars long. '
             'Current escaped version of the specified field code is: "{field_code_escaped}"').format(
                max_length=self.MAX_ESCAPED_FIELD_CODE_LEN,
                length=len(field_code_escaped),
                field_code_escaped=field_code_escaped))
    if not self.R_AZ.search(field_code_escaped):
        self.add_error(
            'code',
            ('Field codes are used to build column names of DB tables. '
             'Escaped version of the specified field code should contain at least one latin letter. \n'
             'Current escaped version of the specified field code is: "{0}"').format(field_code_escaped))

    if self.initial and 'type' in self.changed_data:
        wrong_field_detector_pks = []
        for field_detector in DocumentFieldDetector.objects.filter(field=self.instance):
            try:
                DocumentFieldDetector.validate_detected_value(type_code,
                                                              field_detector.detected_value)
            except Exception:
                # str(): the primary key is not necessarily a string.
                wrong_field_detector_pks.append('#' + str(field_detector.pk))
        if wrong_field_detector_pks:
            self.add_error('type',
                           'Detected value is not allowed for this field type, please unset detected value '
                           'for this field detectors: {0}'.format(', '.join(wrong_field_detector_pks)))

    return self.cleaned_data