def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Compute the value of a python-coded field for the given document.

        The implementation is looked up in PYTHON_CODED_FIELDS_REGISTRY by
        ``field.python_coded_field``; its declared type must match the type
        code of the typed field built for ``field``. The value it returns is
        checked with ``TypedField.is_json_field_value_ok`` before being
        handed back.

        :param log: logger, also forwarded to the python-coded field.
        :param doc: document the value is computed for.
        :param field: document field configured with a python-coded field.
        :param field_code_to_value: forwarded to ``get_value`` as
            ``cur_field_code_to_value``.
        :return: the DTO produced by the python-coded field, or None when it
            produced nothing.
        :raises RuntimeError: the python-coded field is not registered or its
            type differs from the field type.
        :raises ValueError: the returned value is not acceptable for the
            field type.
        """
        python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not python_coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        typed_field = TypedField.by(field)  # type: TypedField

        if python_coded_field.type != typed_field.type_code:
            raise RuntimeError(
                f'Python-coded field {python_coded_field.__class__.__name__} is '
                f'for fields of type {python_coded_field.type} and field {field.code} '
                f'is of type {typed_field.type_code}')

        log.debug('detect_field_value: python_coded_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        field_value_dto = python_coded_field.get_value(
            log=log,
            field=field,
            doc=doc,
            cur_field_code_to_value=field_code_to_value)
        if field_value_dto is None:
            # The declared return type is Optional: with no DTO there is
            # nothing to validate, so avoid dereferencing None below.
            return None
        if not typed_field.is_json_field_value_ok(field_value_dto.field_value):
            raise ValueError(
                f'Python coded field class {field.python_coded_field} returned value not suitable for '
                f'field {field.code} ({typed_field.type_code})')
        return field_value_dto
    def uses_cached_document_field_values(cls, field: DocumentField):
        """
        Report the ``uses_cached_document_field_values`` attribute of the
        python-coded field registered for ``field``.

        :param field: document field whose ``python_coded_field`` is the
            registry key.
        :return: the attribute value taken from the registered implementation.
        :raises RuntimeError: nothing is registered under that key.
        """
        registry_key = field.python_coded_field
        implementation = PYTHON_CODED_FIELDS_REGISTRY.get(registry_key)  # type: PythonCodedField
        if implementation:
            return implementation.uses_cached_document_field_values
        raise RuntimeError('Unknown python-coded field: {0}'.format(registry_key))
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False) -> Optional[ClassifierModel]:
        """
        Delegate model training to the python-coded field registered for
        ``field``.

        :param log: accepted for interface compatibility; not forwarded to
            the python-coded field.
        :param field: document field whose ``python_coded_field`` is the
            registry key.
        :param train_data_project_ids: forwarded unchanged.
        :param use_only_confirmed_field_values: forwarded unchanged.
        :return: whatever the registered implementation returns.
        :raises RuntimeError: nothing is registered under that key.
        """
        registry_key = field.python_coded_field
        implementation = PYTHON_CODED_FIELDS_REGISTRY.get(registry_key)  # type: PythonCodedField
        if implementation:
            return implementation.train_document_field_detector_model(
                field, train_data_project_ids, use_only_confirmed_field_values)
        raise RuntimeError('Unknown python-coded field: {0}'.format(registry_key))
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """
        Run the python-coded field registered for ``field`` and wrap what it
        finds into DetectedFieldValue objects.

        When the implementation has ``detect_per_text_unit`` set, it is run
        on the text of every text unit of the document having the field's
        unit type, ordered by ``location_start`` then ``pk``. Otherwise it is
        run once on ``doc.full_text`` and every located value is attached to
        the text unit containing its start offset, with the location
        converted to be relative to that unit.

        Detection stops after the first value unless the field type is
        multi-value or the field is a choice field.

        :param log: logger forwarded to the python-coded field.
        :param doc: document to run the detection on.
        :param field: document field configured with a python-coded field.
        :param cached_fields: not used by this method.
        :return: detected values in detection order.
        :raises RuntimeError: the python-coded field is unknown, a value has
            no location although the field requires text annotations, or the
            location start does not fall into any text unit.
        """
        implementation = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not implementation:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        type_adapter = field.get_field_type()  # type: FieldType

        results = []  # type: List[DetectedFieldValue]

        if implementation.detect_per_text_unit:
            units = TextUnit.objects.filter(document=doc)
            units = units.filter(unit_type=field.text_unit_type)
            units = units.order_by('location_start', 'pk')

            for unit in units.iterator():
                found = implementation.get_values(log, field, doc, unit.text) or []
                for value, start, end in found:
                    results.append(
                        DetectedFieldValue(field, value, unit, None, start, end))
                    # Single-value, non-choice fields keep the first hit only.
                    if not type_adapter.multi_value and not field.is_choice_field():
                        return results
            return results

        found = implementation.get_values(log, field, doc, doc.full_text) or []
        for value, start, end in found:
            has_location = start is not None and end is not None

            if field.requires_text_annotations and not has_location:
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'undefined location but the field requires text annotation (and location).\n'
                    'This should not happen. Something is broken.'.format(
                        field.python_coded_field, doc))

            if has_location:
                # Find the text unit that contains the start of the location.
                unit = TextUnit.objects.filter(
                    document=doc,
                    unit_type=field.text_unit_type,
                    location_start__lte=start,
                    location_end__gte=start).first()  # type: TextUnit
                if not unit:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'location [{2};{3}] but the start of location does not belong to any '
                        'text unit object in DB.\n'
                        'This should not happen. Something is broken.'.
                        format(field.python_coded_field, doc, start, end))
                # Re-base the location so it is relative to the unit start.
                span = end - start
                start = start - unit.location_start
                end = start + span
            else:
                unit, start, end = None, None, None

            results.append(
                DetectedFieldValue(field, value, unit, None, start, end))
            # Single-value, non-choice fields keep the first hit only.
            if not type_adapter.multi_value and not field.is_choice_field():
                return results

        return results
    def clean(self):
        """
        Cross-field validation of the document field form.

        Checks, in order: that the stop words compile and can be applied to a
        sample text; that the classifier init script can be initialised; that
        the field dependency graph can be ordered; that the selected
        python-coded field exists and matches the field type; and, when a
        formula and a type are set, that the formula can be evaluated on
        example values of the fields it depends on.

        :return: ``self.cleaned_data``.
        :raises forms.ValidationError: the formula evaluation failed with an
            error other than DocumentFieldFormulaError.
        """
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = list(self.cleaned_data.get('depends_on_fields') or [])
        # Fields that failed their own validation are absent from
        # cleaned_data, so indexing with [] would raise KeyError here.
        classifier_init_script = self.cleaned_data.get('classifier_init_script')
        stop_words = self.cleaned_data.get('stop_words')

        try:
            stop_words = compile_stop_words(stop_words)
            # Dry run on a sample text to make sure the compiled stop words are usable.
            detect_value_with_stop_words(stop_words, 'dummy text')
        except Exception as err:
            self.add_error('stop_words', str(err))

        try:
            FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl(
                field_code, classifier_init_script)
        except ScriptError as err:
            self.add_error('classifier_init_script', str(err).split('\n'))

        # 'xxx' stands in for the code of a field which has no valid code yet.
        fields_and_deps = {
            field_code or 'xxx': {f.code for f in depends_on_fields}
        }
        fields_and_deps = self._extract_field_and_deps(depends_on_fields,
                                                       fields_and_deps)
        fields_and_deps = list(fields_and_deps.items())
        try:
            order_field_detection(fields_and_deps)
        except ValueError as ve:
            self.add_error(None, str(ve))

        fields_to_values = {
            field.code:
            FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
            for field in depends_on_fields
        }

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
                python_coded_field_code)
            if not python_coded_field:
                self.add_error(
                    'python_coded_field',
                    'Unknown Python-coded field: {0}'.format(
                        python_coded_field_code))
            elif type_code != python_coded_field.type:
                self.add_error(
                    'type',
                    'Python-coded field {0} is of type {1} but {2} is specified'
                    ' as the field type'.format(python_coded_field.title,
                                                python_coded_field.type,
                                                type_code))

        # Nothing to evaluate without both a formula and a field type.
        if not formula or not formula.strip() or not type_code:
            return self.cleaned_data

        try:
            FormulaBasedFieldDetectionStrategy.calc_formula(
                field_code, type_code, formula, fields_to_values)
        except DocumentFieldFormulaError as ex:
            base_error_class = type(ex.base_error).__name__
            base_error_msg = str(ex.base_error)
            lines = [
                "Error caught while trying to execute formula on example values:"
            ]
            for field_name in ex.field_values:
                lines.append('{0}={1}'.format(field_name,
                                              ex.field_values[field_name]))
            lines.append(
                "{0}. {1} in formula of field '{2}' at line {3}".format(
                    base_error_class, base_error_msg, ex.field_code,
                    ex.line_number))
            self.add_error('formula', lines)
        except Exception:
            trace = traceback.format_exc()
            raise forms.ValidationError(
                'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'
                .format(str(fields_to_values), trace))

        return self.cleaned_data
Exemple #6
0
    def clean(self):
        """
        Cross-field validation of the document field form.

        Checks, in order: the field type is registered; the "unsure" choice
        value and thresholds agree with the choice values; the stop words
        compile; the classifier init script initialises; the dependency graph
        can be ordered; the python-coded field exists and matches the type;
        the formula and the "hide until" expression evaluate on example
        values; the default value fits the field type; the value regexp
        compiles; the field code is valid; and, when the type of an existing
        field changes, the detected values of its field detectors are still
        allowed.

        Problems are reported via ``add_error``.

        :return: ``self.cleaned_data``.
        """
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
        document_type = self.cleaned_data.get('document_type')
        depends_on_fields = list(depends_on_fields)
        # Fields that failed their own validation are absent from
        # cleaned_data; .get avoids a KeyError for them.
        classifier_init_script = self.cleaned_data.get('classifier_init_script')
        stop_words = self.cleaned_data.get('stop_words')
        hide_until_python = self.cleaned_data.get('hide_until_python')
        default_value = self.cleaned_data.get('default_value')
        unsure_choice_value = self.cleaned_data.get(self.UNSURE_CHOICE_VALUE)
        choice_values = DocumentField.parse_choice_values(self.cleaned_data['choices'])
        unsure_thresholds_by_value = self.cleaned_data.get(self.UNSURE_THRESHOLDS)

        try:
            field_type = FIELD_TYPE_REGISTRY[type_code]
        except KeyError:
            # Keep the name bound so the later checks can skip type-dependent
            # validation instead of failing with UnboundLocalError.
            field_type = None
            self.add_error('type', 'Unknown field type "{}".'.format(type_code))

        if unsure_choice_value and (not choice_values or unsure_choice_value not in choice_values):
            self.add_error(self.UNSURE_CHOICE_VALUE, '"Unsure choice value" must be listed in the choice values.')

        if unsure_thresholds_by_value is not None:
            if not hasattr(unsure_thresholds_by_value, 'items'):
                self.add_error(self.UNSURE_THRESHOLDS, 'Must be a dict of choice values to float thresholds [0..1]')
            else:
                if not choice_values:
                    self.add_error(self.UNSURE_THRESHOLDS, '"Unsure" thresholds are set but choice values are not.')
                if not unsure_choice_value:
                    self.add_error(self.UNSURE_THRESHOLDS, '"Unsure" thresholds are set but '
                                                           '"unsure" choice value is not.')

                if choice_values and unsure_choice_value:
                    for k, v in unsure_thresholds_by_value.items():
                        # str(k) keeps message building safe for non-string keys.
                        if k == unsure_choice_value:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Please set thresholds only for "sure" choice '
                                                                   'values and not for ' + str(k))
                        elif k not in choice_values:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Value not in choice values: ' + str(k))
                        if (not isinstance(v, int) and not isinstance(v, float)) or v < 0 or v > 1:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Threshold should be a float value between 0 and 1: '
                                           + str(k))

        try:
            stop_words = compile_stop_words(stop_words)
            # Dry run on a sample text to make sure the compiled stop words are usable.
            detect_value_with_stop_words(stop_words, 'dummy text')
        except Exception as err:
            self.add_error('stop_words', str(err))

        try:
            init_classifier_impl(field_code, classifier_init_script)
        except ScriptError as err:
            self.add_error('classifier_init_script', str(err).split('\n'))

        # 'xxx' stands in for the code of a field which has no valid code yet.
        fields_and_deps = {field_code or 'xxx': {f.code for f in depends_on_fields}}
        fields_and_deps = self._extract_field_and_deps(depends_on_fields, fields_and_deps)
        fields_and_deps = list(fields_and_deps.items())
        try:
            order_field_detection(fields_and_deps)
        except ValueError as ve:
            self.add_error(None, str(ve))

        fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                            for field in depends_on_fields}

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(python_coded_field_code)
            if not python_coded_field:
                self.add_error('python_coded_field', 'Unknown Python-coded field: {0}'.format(python_coded_field_code))
            elif type_code != python_coded_field.type:
                self.add_error('type', 'Python-coded field {0} is of type {1} but {2} is specified'
                                       ' as the field type'.format(python_coded_field.title,
                                                                   python_coded_field.type,
                                                                   type_code))

        if formula and formula.strip() and type_code:
            self.calc_formula(field_code, type_code, formula, fields_to_values, 'formula')

        hide_until_python = hide_until_python.strip() if hide_until_python else None
        if hide_until_python:
            # The "hide until" expression is evaluated against example values
            # of all fields of the document type, including this field.
            fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                                for field in list(document_type.fields.all())}
            if field_code and field_code in fields_to_values:
                del fields_to_values[field_code]
            if type_code and field_type is not None:
                # Reuse the already resolved type; an unknown type was
                # reported above and must not raise KeyError here.
                fields_to_values[field_code] = field_type.example_python_value(self.instance)

            self.calc_formula(field_code,
                              None,
                              hide_until_python,
                              fields_to_values,
                              'hide_until_python',
                              formula_name='hide until python')

        if default_value is not None:
            if type_code == RelatedInfoField.code:
                self.add_error('default_value', 'Related info field can\'t have default value')
            elif field_type is not None \
                    and field_type.extract_from_possible_value(self.instance, default_value) != default_value:
                self.add_error('default_value', 'Wrong value for type {0}. Example: {1}'
                               .format(type_code, json.dumps(field_type.example_python_value(self.instance))))

        try:
            DocumentField.compile_value_regexp(self.cleaned_data['value_regexp'])
        except Exception as exc:
            self.add_error('value_regexp', exc)

        self.validate_field_code()

        if self.initial and 'type' in self.changed_data:
            wrong_field_detector_pks = []
            for field_detector in DocumentFieldDetector.objects.filter(field=self.instance):
                try:
                    DocumentFieldDetector.validate_detected_value(type_code, field_detector.detected_value)
                except Exception:
                    # pk is not a string; convert it before concatenating.
                    wrong_field_detector_pks.append('#' + str(field_detector.pk))
            if wrong_field_detector_pks:
                self.add_error('type', 'Detected value is not allowed for this field type, please unset detected value '
                                       'for this field detectors: {0}'.format(', '.join(wrong_field_detector_pks)))

        return self.cleaned_data
Exemple #7
0
    def clean(self):
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
        document_type = self.cleaned_data.get('document_type')
        depends_on_fields = list(depends_on_fields)
        classifier_init_script = self.cleaned_data['classifier_init_script']
        stop_words = self.cleaned_data['stop_words']
        hide_until_python = self.cleaned_data['hide_until_python']
        default_value = self.cleaned_data['default_value']

        try:
            stop_words = compile_stop_words(stop_words)
            detect_value_with_stop_words(stop_words, 'dummy text')
        except Exception as err:
            self.add_error('stop_words', str(err))

        try:
            FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl(
                field_code, classifier_init_script)
        except ScriptError as err:
            self.add_error('classifier_init_script', str(err).split('\n'))

        fields_and_deps = {
            self.cleaned_data.get('code') or 'xxx':
            {f.code
             for f in depends_on_fields}
        }
        fields_and_deps = self._extract_field_and_deps(depends_on_fields,
                                                       fields_and_deps)
        fields_and_deps = [(code, deps)
                           for code, deps in fields_and_deps.items()]
        try:
            order_field_detection(fields_and_deps)
        except ValueError as ve:
            self.add_error(None, str(ve))

        fields_to_values = {
            field.code:
            FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
            for field in depends_on_fields
        }

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
                python_coded_field_code)
            if not python_coded_field:
                self.add_error(
                    'python_coded_field',
                    'Unknown Python-coded field: {0}'.format(
                        python_coded_field_code))
            else:
                if type_code != python_coded_field.type:
                    self.add_error(
                        'type',
                        'Python-coded field {0} is of type {1} but {2} is specified'
                        ' as the field type'.format(python_coded_field.title,
                                                    python_coded_field.type,
                                                    type_code))

        if formula and formula.strip() and type_code:
            self.calc_formula(field_code, type_code, formula, fields_to_values,
                              'formula')

        hide_until_python = hide_until_python.strip(
        ) if hide_until_python else None
        if hide_until_python:
            fields_to_values = {
                field.code:
                FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
                for field in list(document_type.fields.all())
            }
            code = self.instance.code if self.instance else None
            if code and code in fields_to_values:
                del fields_to_values[code]
            if type_code:
                fields_to_values[field_code] = FIELD_TYPES_REGISTRY[type_code] \
                    .example_python_value(DocumentField(**self.cleaned_data))

            self.calc_formula(field_code,
                              None,
                              hide_until_python,
                              fields_to_values,
                              'hide_until_python',
                              formula_name='hide until python')

        if default_value and type_code == RelatedInfoField.code:
            self.add_error('default_value',
                           'Related info field can\'t have default value')

        try:
            DocumentField.compile_value_regexp(
                self.cleaned_data['value_regexp'])
        except Exception as exc:
            self.add_error('value_regexp', exc)

        # Ensure field code is not too long for Postgres column names
        # We use field codes to build column names for Postgres tables.
        # Max length of column name is 63. We escape them to snake case and sometimes add postfixes to them.
        # Lets assume that we should have max 23 chars for postfixes and max 40 chars for the field code.
        field_code_escaped = escape_column_name(field_code)
        if len(field_code_escaped) > self.MAX_ESCAPED_FIELD_CODE_LEN:
            self.add_error(
                'code',
                '''Field code is too long. Field codes are used to build column names of DB tables.
Escaped version should have max {max_length} chars but it is {length} chars long. Current escaped version of the 
specified field code is: "{field_code_escaped}"'''.format(
                    max_length=self.MAX_ESCAPED_FIELD_CODE_LEN,
                    length=len(field_code_escaped),
                    field_code_escaped=field_code_escaped))
        if not self.R_AZ.search(field_code_escaped):
            self.add_error(
                'code',
                '''Field codes are used to build column names of DB tables. Escaped version of 
the specified field code should contain at least one latin letter. Current escaped version of the specified field 
code is: "{0}"'''.format(field_code_escaped))

        if self.initial and 'type' in self.changed_data:
            wrong_field_detector_pks = []
            for field_detector in DocumentFieldDetector.objects.filter(
                    field=self.instance):
                try:
                    DocumentFieldDetector.validate_detected_value(
                        type_code, field_detector.detected_value)
                except Exception:
                    wrong_field_detector_pks.append('#' + field_detector.pk)
            if wrong_field_detector_pks:
                self.add_error(
                    'type',
                    'Detected value is not allowed for this field type, please unset detected value '
                    'for this field detectors: {0}'.format(
                        ', '.join(wrong_field_detector_pks)))

        return self.cleaned_data