예제 #1
0
def apply_simple_config(log: ProcessLogger, document_field: DocumentField,
                        csv: bytes, drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create one regexp field detector per row of a "simple config" CSV.

    The CSV is parsed with pandas (all cells as strings). For every row of
    the resulting frame a ``DocumentFieldDetector`` is saved whose detected
    value is the row's first cell and whose include regexps are all
    non-empty cells of the row, newline-joined and lower-cased.

    Args:
        log: receives info messages and progress steps.
        document_field: field the new detectors are attached to.
        csv: raw CSV content.
        drop_previous_field_detectors: when true, detectors of this field
            in the FD_CATEGORY_IMPORTED_SIMPLE_CONFIG category are deleted
            before the new ones are created.
        update_field_choice_values: when true, the field's ``choices`` is
            overwritten with the sorted distinct non-empty values of the
            first column and the field is saved.

    Raises:
        ValueError: if the parsed frame has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        # Select the first column by position, independent of its label.
        first_column = df.iloc[:, 0]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    log.set_progress_steps_number(int(row_num / 10) + 1)
    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()
    for index, row in df.iterrows():
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # row is labelled by the CSV header names, so row[0] would depend on
        # the deprecated positional fallback of Series.__getitem__; use
        # explicit positional access instead.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        # Report progress once per ten rows.
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
    def detect_values_in_document(self,
                                  text_units: List[MockTextUnit],
                                  detector: DocumentFieldDetector,
                                  **doc_field_kwargs):
        """Run regexp-only field detection over the given mock text units.

        A field is built from ``doc_field_kwargs`` and attached to
        ``detector``; every text unit is bound to a freshly set up document
        and given the field's text unit type. The class-level repositories
        of ``RegexpsOnlyFieldDetectionStrategy`` are swapped for mock
        repositories for the duration of the call and restored afterwards,
        even if detection raises.

        Returns:
            Whatever ``RegexpsOnlyFieldDetectionStrategy.detect_field_value``
            returns for the document and field.
        """
        init_field_type_registry()
        doc_field = self.make_doc_field(**doc_field_kwargs)
        detector.field = doc_field
        document = self.setup_document(text_units)

        detector_repo = MockFieldDetectorRepository()
        detector_repo.detectors = [detector]

        unit_repo = MockTextUnitRepository()
        unit_repo.units = text_units
        for unit in unit_repo.units:
            unit.document = document
            unit.unit_type = doc_field.text_unit_type

        # Swap in the mock repositories, remembering the originals.
        strategy = RegexpsOnlyFieldDetectionStrategy
        saved_unit_repo = strategy.text_unit_repo
        strategy.text_unit_repo = unit_repo
        saved_detector_repo = strategy.field_detector_repo
        strategy.field_detector_repo = detector_repo

        try:
            return strategy.detect_field_value(None, document, doc_field, {})
        finally:
            strategy.text_unit_repo = saved_unit_repo
            strategy.field_detector_repo = saved_detector_repo
예제 #3
0
    def __init__(self, text: str, field_type: str):
        """Assemble a document, field, text unit, detector and matcher.

        Args:
            text: text stored on the text unit; its length determines the
                unit's end location.
            field_type: value assigned to the field's ``type``.
        """
        self.document = Document()

        self.field = DocumentField()
        self.field.type = field_type

        # Text unit carrying the given text, starting at offset 1001.
        unit = self.text_unit = TextUnit()
        unit.document = self.document
        unit.textunittext = TextUnitText()
        unit.textunittext.text = text
        unit.location_start = 1001
        unit.location_end = unit.location_start + len(text)

        include_patterns = [
            r'at\s{1,5}least\s{1,5}(two|2).{1,15}unaffiliated.{1,15}lenders',
            '(two|2).{1,30}lenders.{1,200}(not.{1,50}affiliate|affiliate.{1,100}(one|1|single))',
        ]
        definition_terms = [
            'required lenders',
            'required revolving lenders',
            'required revolving credit lenders',
            'required term lenders',
            'requisite lenders',
            'requisite revolving lenders',
            'required class lenders',
            'required ddtl lenders',
        ]

        field_detector = self.detector = DocumentFieldDetector()
        field_detector.regexps_pre_process_lower = True
        field_detector.include_regexps = '\n'.join(include_patterns)
        field_detector.definition_words = '\n'.join(definition_terms)
        field_detector.detected_value = 'AFFILIATED'
        field_detector.text_part = TextParts.FULL.value
        field_detector.extraction_hint = ValueExtractionHint.TAKE_FIRST

        # The matcher is built last, from the fully configured detector.
        self.matcher = DetectorFieldMatcher(field_detector)
 def make_doc_field_detector(exclude_regexps: Optional[str] = None,
                             include_regexps: Optional[str] = None,
                             detected_value: Optional[str] = None,
                             regexps_pre_process_lower: bool = True,
                             definition_words: Optional[str] = None) -> DocumentFieldDetector:
     """Build a ``DocumentFieldDetector`` with defaults for omitted arguments.

     ``exclude_regexps`` defaults to ``'cushion'`` and ``include_regexps`` to
     a look-behind pattern when passed as None. ``detected_value`` is only
     assigned when it is not None. The extraction hint and text part are
     fixed to ``'TAKE_FIRST'`` and ``'INSIDE_REGEXP'``.
     """
     if exclude_regexps is None:
         exclude_regexps = 'cushion'
     if include_regexps is None:
         include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'

     result = DocumentFieldDetector()
     result.exclude_regexps = exclude_regexps
     result.include_regexps = include_regexps
     if detected_value is not None:
         result.detected_value = detected_value
     result.extraction_hint = 'TAKE_FIRST'  # 'detected'
     result.text_part = 'INSIDE_REGEXP'
     result.regexps_pre_process_lower = regexps_pre_process_lower
     result.definition_words = definition_words
     return result
예제 #5
0
    def clean(self):
        """Validate the detector's regexps and detected value.

        Each check runs independently; any exception it raises is attached
        to the corresponding form field as a validation error rather than
        propagated.

        Returns:
            The form's ``cleaned_data``.
        """
        # Both regexp fields must compile.
        for regexp_field in ('exclude_regexps', 'include_regexps'):
            try:
                DocumentFieldDetector.compile_regexps_string(
                    self.cleaned_data[regexp_field])
            except Exception as exc:
                self.add_error(regexp_field, exc)

        # The detected value is validated against the type of the selected field.
        try:
            selected_field = self.cleaned_data['field']
            DetectorFieldMatcher.validate_detected_value(
                selected_field.type, self.cleaned_data['detected_value'])
        except Exception as exc:
            self.add_error('detected_value', exc)

        return self.cleaned_data
 def make_doc_field_detector(self) -> DocumentFieldDetector:
     """Return a detector with fixed regexps, value 'shall' and no extraction hint."""
     result = DocumentFieldDetector()
     result.exclude_regexps = 'cushion'
     result.include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'
     result.detected_value = 'shall'
     result.extraction_hint = None
     return result
예제 #7
0
 def make_doc_field_detector(
         self,
         exclude_regexps: Optional[str] = None,
         include_regexps: Optional[str] = None,
         detected_value: Optional[str] = None) -> DocumentFieldDetector:
     """Build a detector, substituting defaults for arguments passed as None.

     Defaults: ``'cushion'`` for the exclude regexps, a look-behind pattern
     for the include regexps and ``'shall'`` for the detected value. The
     extraction hint is always ``'detected'``.
     """
     if exclude_regexps is None:
         exclude_regexps = 'cushion'
     if include_regexps is None:
         include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'
     if detected_value is None:
         detected_value = 'shall'

     result = DocumentFieldDetector()
     result.exclude_regexps = exclude_regexps
     result.include_regexps = include_regexps
     result.detected_value = detected_value
     result.extraction_hint = 'detected'
     return result
예제 #8
0
    def save_detector_settings(
            self, detectors_by_value: Dict[str, List[str]]) -> None:
        """Persist the detected-value -> patterns mapping.

        When ``self.save_in_csv_format`` is set, the work is delegated to
        ``save_detector_settings_csv`` and nothing else happens. Otherwise
        one ``DocumentFieldDetector`` is saved per mapping entry, with the
        entry's patterns newline-joined as include regexps; previously
        imported detectors of the field are deleted first when
        ``self.drop_previous_field_detectors`` is set.

        Args:
            detectors_by_value: include patterns keyed by detected value.
        """
        # CSV mode: all "pattern: value" records are stored by the CSV saver.
        if self.save_in_csv_format:
            self.save_detector_settings_csv(detectors_by_value)
            return

        category = self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG

        # Optionally remove the old imported settings before writing new ones.
        if self.drop_previous_field_detectors:
            previous = DocumentFieldDetector.objects.filter(
                field=self.document_field,
                category=category)
            previous.delete()

        for detected_value, patterns in detectors_by_value.items():
            new_detector = DocumentFieldDetector()
            new_detector.category = category
            new_detector.field = self.document_field
            new_detector.regexps_pre_process_lower = True
            new_detector.detected_value = detected_value
            new_detector.include_regexps = '\n'.join(patterns)
            new_detector.save()
예제 #9
0
    def clean(self):
        """Run the cross-field validation of the document field form.

        Checks, in order: the field type is registered; the "unsure" choice
        value and thresholds are consistent with the choice values; stop
        words compile; the classifier init script initialises; the field
        dependencies can be ordered; the Python-coded field exists and has
        the same type; the formula and the "hide until" expression evaluate
        against example values; the default value fits the type; the value
        regexp compiles; the field code is valid; and, when the type of an
        existing field changes, the detected values of its detectors are
        still allowed.

        Problems are reported through ``add_error``. An unknown field type
        is reported once on ``type``, and the checks that need the resolved
        type are then skipped instead of failing on an unbound variable.

        Returns:
            The form's ``cleaned_data``.
        """
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
        document_type = self.cleaned_data.get('document_type')
        depends_on_fields = list(depends_on_fields)
        classifier_init_script = self.cleaned_data['classifier_init_script']
        stop_words = self.cleaned_data.get('stop_words')
        hide_until_python = self.cleaned_data['hide_until_python']
        default_value = self.cleaned_data.get('default_value')
        unsure_choice_value = self.cleaned_data[self.UNSURE_CHOICE_VALUE]
        choice_values = DocumentField.parse_choice_values(self.cleaned_data['choices'])
        unsure_thresholds_by_value = self.cleaned_data.get(self.UNSURE_THRESHOLDS)

        try:
            field_type = FIELD_TYPE_REGISTRY[type_code]
        except KeyError:
            # Keep the name bound so the later type-dependent checks can be skipped.
            field_type = None
            self.add_error('type', 'Unknown field type "{}".'.format(type_code))

        if unsure_choice_value and (not choice_values or unsure_choice_value not in choice_values):
            self.add_error(self.UNSURE_CHOICE_VALUE, '"Unsure choice value" must be listed in the choice values.')

        if unsure_thresholds_by_value is not None:
            if not hasattr(unsure_thresholds_by_value, 'items'):
                self.add_error(self.UNSURE_THRESHOLDS, 'Must be a dict of choice values to float thresholds [0..1]')
            else:
                if not choice_values:
                    self.add_error(self.UNSURE_THRESHOLDS, '"Unsure" thresholds are set but choice values are not.')
                if not unsure_choice_value:
                    self.add_error(self.UNSURE_THRESHOLDS, '"Unsure" thresholds are set but '
                                                           '"unsure" choice value is not.')

                if choice_values and unsure_choice_value:
                    for k, v in unsure_thresholds_by_value.items():
                        if k == unsure_choice_value:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Please set thresholds only for "sure" choice '
                                                                   'values and not for ' + k)
                        elif k not in choice_values:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Value not in choice values: ' + k)
                        # The type test short-circuits, so non-numeric values are never compared.
                        if (not isinstance(v, int) and not isinstance(v, float)) or v < 0 or v > 1:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Threshold should be a float value between 0 and 1: '
                                           + k)

        try:
            stop_words = compile_stop_words(stop_words)
            # Dry run on a dummy text so that broken stop words fail here.
            detect_value_with_stop_words(stop_words, 'dummy text')
        except Exception as err:
            self.add_error('stop_words', str(err))

        try:
            init_classifier_impl(field_code, classifier_init_script)
        except ScriptError as err:
            self.add_error('classifier_init_script', str(err).split('\n'))

        # 'xxx' stands in for a missing code so the dependency ordering can still run.
        fields_and_deps = {self.cleaned_data.get('code') or 'xxx': {f.code for f in depends_on_fields}}
        fields_and_deps = self._extract_field_and_deps(depends_on_fields, fields_and_deps)
        fields_and_deps = [(code, deps) for code, deps in fields_and_deps.items()]
        try:
            order_field_detection(fields_and_deps)
        except ValueError as ve:
            self.add_error(None, str(ve))

        fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                            for field in depends_on_fields}

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(python_coded_field_code)
            if not python_coded_field:
                self.add_error('python_coded_field', 'Unknown Python-coded field: {0}'.format(python_coded_field_code))
            else:
                if type_code != python_coded_field.type:
                    self.add_error('type', 'Python-coded field {0} is of type {1} but {2} is specified'
                                           ' as the field type'.format(python_coded_field.title,
                                                                       python_coded_field.type,
                                                                       type_code))

        if formula and formula.strip() and type_code:
            self.calc_formula(field_code, type_code, formula, fields_to_values, 'formula')

        hide_until_python = hide_until_python.strip() if hide_until_python else None
        if hide_until_python:
            # document_type is None when it is absent from cleaned_data; treat
            # that as "no sibling fields" instead of dereferencing None.
            sibling_fields = list(document_type.fields.all()) if document_type is not None else []
            fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                                for field in sibling_fields}
            if field_code and field_code in fields_to_values:
                del fields_to_values[field_code]
            # Use the already resolved type; an unknown type was reported above
            # and must not trigger a second registry lookup that raises KeyError.
            if type_code and field_type is not None:
                fields_to_values[field_code] = field_type.example_python_value(self.instance)

            self.calc_formula(field_code,
                              None,
                              hide_until_python,
                              fields_to_values,
                              'hide_until_python',
                              formula_name='hide until python')

        if default_value is not None:
            if type_code == RelatedInfoField.code:
                self.add_error('default_value', 'Related info field can\'t have default value')
            elif field_type is not None \
                    and field_type.extract_from_possible_value(self.instance, default_value) != default_value:
                self.add_error('default_value', 'Wrong value for type {0}. Example: {1}'
                               .format(type_code, json.dumps(field_type.example_python_value(self.instance))))

        try:
            DocumentField.compile_value_regexp(self.cleaned_data['value_regexp'])
        except Exception as exc:
            self.add_error('value_regexp', exc)

        self.validate_field_code()

        # On a type change of an existing field, every detector's detected
        # value must still be valid for the new type.
        if self.initial and 'type' in self.changed_data:
            wrong_field_detector_pks = []
            for field_detector in DocumentFieldDetector.objects.filter(field=self.instance):
                try:
                    DocumentFieldDetector.validate_detected_value(type_code, field_detector.detected_value)
                except Exception:
                    # str() keeps the concatenation valid for non-string primary keys.
                    wrong_field_detector_pks.append('#' + str(field_detector.pk))
            if wrong_field_detector_pks:
                self.add_error('type', 'Detected value is not allowed for this field type, please unset detected value '
                                       'for this field detectors: {0}'.format(', '.join(wrong_field_detector_pks)))

        return self.cleaned_data
예제 #10
0
    def clean(self):
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
        document_type = self.cleaned_data.get('document_type')
        depends_on_fields = list(depends_on_fields)
        classifier_init_script = self.cleaned_data['classifier_init_script']
        stop_words = self.cleaned_data['stop_words']
        hide_until_python = self.cleaned_data['hide_until_python']
        default_value = self.cleaned_data['default_value']

        try:
            stop_words = compile_stop_words(stop_words)
            detect_value_with_stop_words(stop_words, 'dummy text')
        except Exception as err:
            self.add_error('stop_words', str(err))

        try:
            FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl(
                field_code, classifier_init_script)
        except ScriptError as err:
            self.add_error('classifier_init_script', str(err).split('\n'))

        fields_and_deps = {
            self.cleaned_data.get('code') or 'xxx':
            {f.code
             for f in depends_on_fields}
        }
        fields_and_deps = self._extract_field_and_deps(depends_on_fields,
                                                       fields_and_deps)
        fields_and_deps = [(code, deps)
                           for code, deps in fields_and_deps.items()]
        try:
            order_field_detection(fields_and_deps)
        except ValueError as ve:
            self.add_error(None, str(ve))

        fields_to_values = {
            field.code:
            FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
            for field in depends_on_fields
        }

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
                python_coded_field_code)
            if not python_coded_field:
                self.add_error(
                    'python_coded_field',
                    'Unknown Python-coded field: {0}'.format(
                        python_coded_field_code))
            else:
                if type_code != python_coded_field.type:
                    self.add_error(
                        'type',
                        'Python-coded field {0} is of type {1} but {2} is specified'
                        ' as the field type'.format(python_coded_field.title,
                                                    python_coded_field.type,
                                                    type_code))

        if formula and formula.strip() and type_code:
            self.calc_formula(field_code, type_code, formula, fields_to_values,
                              'formula')

        hide_until_python = hide_until_python.strip(
        ) if hide_until_python else None
        if hide_until_python:
            fields_to_values = {
                field.code:
                FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
                for field in list(document_type.fields.all())
            }
            code = self.instance.code if self.instance else None
            if code and code in fields_to_values:
                del fields_to_values[code]
            if type_code:
                fields_to_values[field_code] = FIELD_TYPES_REGISTRY[type_code] \
                    .example_python_value(DocumentField(**self.cleaned_data))

            self.calc_formula(field_code,
                              None,
                              hide_until_python,
                              fields_to_values,
                              'hide_until_python',
                              formula_name='hide until python')

        if default_value and type_code == RelatedInfoField.code:
            self.add_error('default_value',
                           'Related info field can\'t have default value')

        try:
            DocumentField.compile_value_regexp(
                self.cleaned_data['value_regexp'])
        except Exception as exc:
            self.add_error('value_regexp', exc)

        # Ensure field code is not too long for Postgres column names
        # We use field codes to build column names for Postgres tables.
        # Max length of column name is 63. We escape them to snake case and sometimes add postfixes to them.
        # Lets assume that we should have max 23 chars for postfixes and max 40 chars for the field code.
        field_code_escaped = escape_column_name(field_code)
        if len(field_code_escaped) > self.MAX_ESCAPED_FIELD_CODE_LEN:
            self.add_error(
                'code',
                '''Field code is too long. Field codes are used to build column names of DB tables.
Escaped version should have max {max_length} chars but it is {length} chars long. Current escaped version of the 
specified field code is: "{field_code_escaped}"'''.format(
                    max_length=self.MAX_ESCAPED_FIELD_CODE_LEN,
                    length=len(field_code_escaped),
                    field_code_escaped=field_code_escaped))
        if not self.R_AZ.search(field_code_escaped):
            self.add_error(
                'code',
                '''Field codes are used to build column names of DB tables. Escaped version of 
the specified field code should contain at least one latin letter. Current escaped version of the specified field 
code is: "{0}"'''.format(field_code_escaped))

        if self.initial and 'type' in self.changed_data:
            wrong_field_detector_pks = []
            for field_detector in DocumentFieldDetector.objects.filter(
                    field=self.instance):
                try:
                    DocumentFieldDetector.validate_detected_value(
                        type_code, field_detector.detected_value)
                except Exception:
                    wrong_field_detector_pks.append('#' + field_detector.pk)
            if wrong_field_detector_pks:
                self.add_error(
                    'type',
                    'Detected value is not allowed for this field type, please unset detected value '
                    'for this field detectors: {0}'.format(
                        ', '.join(wrong_field_detector_pks)))

        return self.cleaned_data