예제 #1
0
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True):
    """
    Run field value detection for every field of the document's type and refresh the field value cache.

    Fields are processed in the order returned by order_field_detection() for their dependency lists.
    :param log: logger passed to the detection strategies and to the cache routine
    :param document: document to detect field values for
    :param save: when True the detected values are stored and the cache is saved; detected values are
                 not stored for a document whose status is set and not active
    :return: list of all values detected for the document
    """

    store_detected = save
    if save and document.status and not document.status.is_active:
        log.info(
            'Forbidden storing detected field values for document with "completed"'
            ' status, document #{} ({})'.format(document.id, document.name))
        store_detected = False

    document_type = document.document_type  # type: DocumentType

    # Only the uid of each dependency is loaded by the prefetch.
    dependency_prefetch = Prefetch('depends_on_fields',
                                   queryset=DocumentField.objects.only('uid').all())
    fields = list(document_type.fields.all().prefetch_related(dependency_prefetch))

    dependencies_by_code = [(f.code, f.get_depends_on_codes() or set()) for f in fields]
    detection_order = order_field_detection(dependencies_by_code)
    field_by_code = {f.code: f for f in fields}  # type: Dict[str, DocumentField]

    cache_primed = False
    detected_total = []
    for code in detection_order:
        field = field_by_code[code]  # type: DocumentField
        strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        # Build Document.field_values once, the first time a strategy reports it uses them.
        if not cache_primed and strategy.uses_cached_document_field_values(field):
            document.field_values = field_value_cache.cache_field_values(
                document, None, save=False)
            cache_primed = True

        found = strategy.detect_field_values(log, document, field)  # type: List[DetectedFieldValue]
        if not found:
            continue

        detected_total.extend(found)
        if store_detected:
            save_detected_values(document, field, found)

    if save:
        field_value_cache.cache_field_values(document, detected_total, save=True, log=log)

    return detected_total
예제 #2
0
 def test_order_field_detection(self) -> None:
     """Every field must be ordered after the fields it depends on: a < b < d < c."""
     dependencies = [('a', set()), ('b', {'a'}), ('c', {'d'}), ('d', {'b'}), ('e', set())]
     result = order_field_detection(dependencies)
     # Map each field code to its index in the resulting order.
     position_of = {code: index for index, code in enumerate(result)}
     self.assertEqual(len(dependencies), len(result))
     self.assertGreater(position_of['b'], position_of['a'])
     self.assertGreater(position_of['c'], position_of['d'])
     self.assertGreater(position_of['d'], position_of['b'])
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None):
    """
    Run field value detection for the fields of the document's type and refresh the field value cache.

    Fields are processed in the order returned by order_field_detection() for their dependency lists.
    :param log: logger used for messages; also passed to the strategies and the cache routine
    :param document: document to detect field values for
    :param save: when True the detected values are stored and the cache is saved; detected values are
                 not stored for a document whose status is set and not active
    :param clear_old_values: when storing, first delete the field's existing values that have no
                             creator/modifier and are not removed by user
    :param changed_by_user: forwarded to the final cache_field_values() call
    :param system_fields_changed: forwarded to the final cache_field_values() call
    :param generic_fields_changed: forwarded to the final cache_field_values() call
    :param document_initial_load: forwarded to the final cache_field_values() call
    :param ignore_field_codes: codes of the fields to skip
    :param updated_field_codes: if set, only the fields returned by get_dependent_fields() for these
                                codes are processed
    :return: list of all values detected for the document
    """

    store_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Forbidden storing detected field values for document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        store_detected = False

    document_type = document.document_type  # type: DocumentType

    # Only the uid of each dependency is loaded by the prefetch.
    dependency_prefetch = Prefetch('depends_on_fields',
                                   queryset=DocumentField.objects.only('uid').all())
    fields = list(document_type.fields.all().prefetch_related(dependency_prefetch))

    dependencies_by_code = [(f.code, f.get_depends_on_codes() or set()) for f in fields]

    required_fields = None
    if updated_field_codes:
        required_fields = get_dependent_fields(dependencies_by_code, set(updated_field_codes))

    detection_order = order_field_detection(dependencies_by_code)
    field_by_code = {f.code: f for f in fields}  # type: Dict[str, DocumentField]

    detected_total = []
    for code in detection_order:
        if (ignore_field_codes and code in ignore_field_codes) \
                or (required_fields and code not in required_fields):
            continue

        field = field_by_code[code]  # type: DocumentField
        strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            # The field values structure is rebuilt before each detection call.
            current_values = field_value_cache.cache_field_values(document, None, save=False)
            found = strategy.detect_field_values(
                log, document, field, current_values)  # type: List[DetectedFieldValue]
        except Exception as e:
            message = ('Unable to detect field value. \n'
                       '            Document type: {0} \n'
                       '            Document: {1} \n'
                       '            Field: {2}').format(document_type.code, document.pk, field.code)
            log.error(render_error(message, e))
            raise e

        if store_detected and clear_old_values:
            # Drop previously detected values so that they do not pile up on each run.
            stale_values = DocumentFieldValue.objects.filter(document=document,
                                                             field=field,
                                                             removed_by_user=False,
                                                             created_by__isnull=True,
                                                             modified_by__isnull=True)
            stale_values = stale_values.exclude(
                field__value_detection_strategy=DocumentField.VD_DISABLED)
            stale_values.delete()

        if not found:
            continue

        detected_total.extend(found)
        if store_detected:
            save_detected_values(document, field, found)

    if save:
        field_value_cache.cache_field_values(document,
                                             suggested_field_values=detected_total,
                                             save=True,
                                             log=log,
                                             changed_by_user=changed_by_user,
                                             system_fields_changed=system_fields_changed,
                                             generic_fields_changed=generic_fields_changed,
                                             document_initial_load=document_initial_load)

    return detected_total
예제 #4
0
    def clean(self):
        """
        Cross-field validation of the document field form.

        Checks, in this order: stop words, classifier init script, dependency ordering,
        the Python-coded field and the formula (executed on example values of the dependencies).
        Problems are reported through add_error(); an unexpected formula failure raises
        forms.ValidationError.
        :return: self.cleaned_data
        """
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        dependencies = list(self.cleaned_data.get('depends_on_fields') or [])
        classifier_init_script = self.cleaned_data['classifier_init_script']
        raw_stop_words = self.cleaned_data['stop_words']

        # Stop words must compile and be applicable to a text.
        try:
            detect_value_with_stop_words(compile_stop_words(raw_stop_words), 'dummy text')
        except Exception as stop_words_error:
            self.add_error('stop_words', str(stop_words_error))

        try:
            FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl(
                field_code, classifier_init_script)
        except ScriptError as script_error:
            self.add_error('classifier_init_script', str(script_error).split('\n'))

        # Dependency ordering is checked for this field (placeholder code 'xxx' if the code
        # is missing) together with whatever _extract_field_and_deps() adds.
        own_dependencies = {field_code or 'xxx': {dep.code for dep in dependencies}}
        dependency_map = self._extract_field_and_deps(dependencies, own_dependencies)
        dependency_pairs = list(dependency_map.items())
        try:
            order_field_detection(dependency_pairs)
        except ValueError as order_error:
            self.add_error(None, str(order_error))

        example_values = {
            dep.code: FIELD_TYPES_REGISTRY[dep.type].example_python_value(dep)
            for dep in dependencies
        }

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(python_coded_field_code)
            if not python_coded_field:
                self.add_error(
                    'python_coded_field',
                    'Unknown Python-coded field: {0}'.format(python_coded_field_code))
            elif type_code != python_coded_field.type:
                self.add_error(
                    'type',
                    'Python-coded field {0} is of type {1} but {2} is specified'
                    ' as the field type'.format(python_coded_field.title,
                                                python_coded_field.type,
                                                type_code))

        # The formula is only tried when both a non-blank formula and a type are given.
        if formula and formula.strip() and type_code:
            try:
                FormulaBasedFieldDetectionStrategy.calc_formula(
                    field_code, type_code, formula, example_values)
            except DocumentFieldFormulaError as ex:
                base_error_class = type(ex.base_error).__name__
                base_error_msg = str(ex.base_error)
                report = ["Error caught while trying to execute formula on example values:"]
                report.extend('{0}={1}'.format(name, ex.field_values[name])
                              for name in ex.field_values)
                report.append(
                    "{0}. {1} in formula of field '{2}' at line {3}".format(
                        base_error_class, base_error_msg, ex.field_code,
                        ex.line_number))
                self.add_error('formula', report)
            except Exception:
                trace = traceback.format_exc()
                raise forms.ValidationError(
                    'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'
                    .format(str(example_values), trace))

        return self.cleaned_data
예제 #5
0
    def clean(self):
        """
        Cross-field validation of the document field form.

        Checks, in this order: field type, "unsure" choice value and thresholds, stop words,
        classifier init script, dependency ordering, Python-coded field, formula,
        "hide until" expression, default value, value regexp, field code and - when the type
        of an existing field changes - detected values of its field detectors.
        Problems are reported through add_error().
        :return: self.cleaned_data
        """
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
        document_type = self.cleaned_data.get('document_type')
        depends_on_fields = list(depends_on_fields)
        classifier_init_script = self.cleaned_data['classifier_init_script']
        stop_words = self.cleaned_data.get('stop_words')
        hide_until_python = self.cleaned_data['hide_until_python']
        default_value = self.cleaned_data.get('default_value')
        unsure_choice_value = self.cleaned_data[self.UNSURE_CHOICE_VALUE]
        choice_values = DocumentField.parse_choice_values(self.cleaned_data['choices'])
        unsure_thresholds_by_value = self.cleaned_data.get(self.UNSURE_THRESHOLDS)

        try:
            field_type = FIELD_TYPE_REGISTRY[type_code]
        except KeyError:
            # Keep the name bound: the checks below must skip type-dependent validation
            # instead of crashing when the type is unknown.
            field_type = None
            self.add_error('type', 'Unknown field type "{}".'.format(type_code))

        if unsure_choice_value and (not choice_values or unsure_choice_value not in choice_values):
            self.add_error(self.UNSURE_CHOICE_VALUE, '"Unsure choice value" must be listed in the choice values.')

        if unsure_thresholds_by_value is not None:
            if not hasattr(unsure_thresholds_by_value, 'items'):
                self.add_error(self.UNSURE_THRESHOLDS, 'Must be a dict of choice values to float thresholds [0..1]')
            else:
                if not choice_values:
                    self.add_error(self.UNSURE_THRESHOLDS, '"Unsure" thresholds are set but choice values are not.')
                if not unsure_choice_value:
                    self.add_error(self.UNSURE_THRESHOLDS, '"Unsure" thresholds are set but '
                                                           '"unsure" choice value is not.')

                if choice_values and unsure_choice_value:
                    # Messages are built with format() so that a non-string key is
                    # reported instead of raising TypeError.
                    for k, v in unsure_thresholds_by_value.items():
                        if k == unsure_choice_value:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Please set thresholds only for "sure" choice '
                                                                   'values and not for {0}'.format(k))
                        elif k not in choice_values:
                            self.add_error(self.UNSURE_THRESHOLDS, 'Value not in choice values: {0}'.format(k))
                        if not isinstance(v, (int, float)) or v < 0 or v > 1:
                            self.add_error(self.UNSURE_THRESHOLDS,
                                           'Threshold should be a float value between 0 and 1: {0}'.format(k))

        # Stop words must compile and be applicable to a text.
        try:
            stop_words = compile_stop_words(stop_words)
            detect_value_with_stop_words(stop_words, 'dummy text')
        except Exception as err:
            self.add_error('stop_words', str(err))

        try:
            init_classifier_impl(field_code, classifier_init_script)
        except ScriptError as err:
            self.add_error('classifier_init_script', str(err).split('\n'))

        # Dependency ordering is checked for this field (placeholder code 'xxx' if the code
        # is missing) together with whatever _extract_field_and_deps() adds.
        fields_and_deps = {field_code or 'xxx': {f.code for f in depends_on_fields}}
        fields_and_deps = self._extract_field_and_deps(depends_on_fields, fields_and_deps)
        fields_and_deps = [(code, deps) for code, deps in fields_and_deps.items()]
        try:
            order_field_detection(fields_and_deps)
        except ValueError as ve:
            self.add_error(None, str(ve))

        fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                            for field in depends_on_fields}

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(python_coded_field_code)
            if not python_coded_field:
                self.add_error('python_coded_field', 'Unknown Python-coded field: {0}'.format(python_coded_field_code))
            else:
                if type_code != python_coded_field.type:
                    self.add_error('type', 'Python-coded field {0} is of type {1} but {2} is specified'
                                           ' as the field type'.format(python_coded_field.title,
                                                                       python_coded_field.type,
                                                                       type_code))

        if formula and formula.strip() and type_code:
            self.calc_formula(field_code, type_code, formula, fields_to_values, 'formula')

        hide_until_python = hide_until_python.strip() if hide_until_python else None
        if hide_until_python:
            # The "hide until" expression is tried on example values of all fields of the
            # document type. document_type is None when its own validation failed.
            type_fields = list(document_type.fields.all()) if document_type is not None else []
            fields_to_values = {field.code: FIELD_TYPE_REGISTRY[field.type].example_python_value(field)
                                for field in type_fields}
            if field_code and field_code in fields_to_values:
                del fields_to_values[field_code]
            # The edited field itself is represented by an example value of its new type;
            # skipped when the type is unknown (already reported above).
            if type_code and field_type is not None:
                fields_to_values[field_code] = field_type.example_python_value(self.instance)

            self.calc_formula(field_code,
                              None,
                              hide_until_python,
                              fields_to_values,
                              'hide_until_python',
                              formula_name='hide until python')

        if default_value is not None:
            if type_code == RelatedInfoField.code:
                self.add_error('default_value', 'Related info field can\'t have default value')
            elif field_type is not None \
                    and field_type.extract_from_possible_value(self.instance, default_value) != default_value:
                self.add_error('default_value', 'Wrong value for type {0}. Example: {1}'
                               .format(type_code, json.dumps(field_type.example_python_value(self.instance))))

        try:
            DocumentField.compile_value_regexp(self.cleaned_data['value_regexp'])
        except Exception as exc:
            self.add_error('value_regexp', exc)

        self.validate_field_code()

        # When the type of an existing field is changed, detected values of its field
        # detectors must be valid for the new type.
        if self.initial and 'type' in self.changed_data:
            wrong_field_detector_pks = []
            for field_detector in DocumentFieldDetector.objects.filter(field=self.instance):
                try:
                    DocumentFieldDetector.validate_detected_value(type_code, field_detector.detected_value)
                except Exception:
                    # pk is formatted, not concatenated: it is not necessarily a string.
                    wrong_field_detector_pks.append('#{0}'.format(field_detector.pk))
            if wrong_field_detector_pks:
                self.add_error('type', 'Detected value is not allowed for this field type, please unset detected value '
                                       'for this field detectors: {0}'.format(', '.join(wrong_field_detector_pks)))

        return self.cleaned_data
예제 #6
0
    def clean(self):
        """
        Cross-field validation of the document field form.

        Checks, in this order: stop words, classifier init script, dependency ordering,
        Python-coded field, formula, "hide until" expression, default value, value regexp,
        escaped field code and - when the type of an existing field changes - detected values
        of its field detectors. Problems are reported through add_error().
        :return: self.cleaned_data
        """
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
        document_type = self.cleaned_data.get('document_type')
        depends_on_fields = list(depends_on_fields)
        classifier_init_script = self.cleaned_data['classifier_init_script']
        stop_words = self.cleaned_data['stop_words']
        hide_until_python = self.cleaned_data['hide_until_python']
        default_value = self.cleaned_data['default_value']

        # Stop words must compile and be applicable to a text.
        try:
            stop_words = compile_stop_words(stop_words)
            detect_value_with_stop_words(stop_words, 'dummy text')
        except Exception as err:
            self.add_error('stop_words', str(err))

        try:
            FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl(
                field_code, classifier_init_script)
        except ScriptError as err:
            self.add_error('classifier_init_script', str(err).split('\n'))

        # Dependency ordering is checked for this field (placeholder code 'xxx' if the code
        # is missing) together with whatever _extract_field_and_deps() adds.
        fields_and_deps = {
            field_code or 'xxx':
            {f.code
             for f in depends_on_fields}
        }
        fields_and_deps = self._extract_field_and_deps(depends_on_fields,
                                                       fields_and_deps)
        fields_and_deps = [(code, deps)
                           for code, deps in fields_and_deps.items()]
        try:
            order_field_detection(fields_and_deps)
        except ValueError as ve:
            self.add_error(None, str(ve))

        fields_to_values = {
            field.code:
            FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
            for field in depends_on_fields
        }

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
                python_coded_field_code)
            if not python_coded_field:
                self.add_error(
                    'python_coded_field',
                    'Unknown Python-coded field: {0}'.format(
                        python_coded_field_code))
            else:
                if type_code != python_coded_field.type:
                    self.add_error(
                        'type',
                        'Python-coded field {0} is of type {1} but {2} is specified'
                        ' as the field type'.format(python_coded_field.title,
                                                    python_coded_field.type,
                                                    type_code))

        if formula and formula.strip() and type_code:
            self.calc_formula(field_code, type_code, formula, fields_to_values,
                              'formula')

        hide_until_python = hide_until_python.strip(
        ) if hide_until_python else None
        if hide_until_python:
            # The "hide until" expression is tried on example values of all fields of the
            # document type, with the edited field represented by its new type.
            fields_to_values = {
                field.code:
                FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
                for field in list(document_type.fields.all())
            }
            code = self.instance.code if self.instance else None
            if code and code in fields_to_values:
                del fields_to_values[code]
            if type_code:
                fields_to_values[field_code] = FIELD_TYPES_REGISTRY[type_code] \
                    .example_python_value(DocumentField(**self.cleaned_data))

            self.calc_formula(field_code,
                              None,
                              hide_until_python,
                              fields_to_values,
                              'hide_until_python',
                              formula_name='hide until python')

        if default_value and type_code == RelatedInfoField.code:
            self.add_error('default_value',
                           'Related info field can\'t have default value')

        try:
            DocumentField.compile_value_regexp(
                self.cleaned_data['value_regexp'])
        except Exception as exc:
            self.add_error('value_regexp', exc)

        # Ensure field code is not too long for Postgres column names
        # We use field codes to build column names for Postgres tables.
        # Max length of column name is 63. We escape them to snake case and sometimes add postfixes to them.
        # Lets assume that we should have max 23 chars for postfixes and max 40 chars for the field code.
        field_code_escaped = escape_column_name(field_code)
        if len(field_code_escaped) > self.MAX_ESCAPED_FIELD_CODE_LEN:
            self.add_error(
                'code',
                ('Field code is too long. Field codes are used to build column names of DB tables.\n'
                 'Escaped version should have max {max_length} chars but it is {length} chars long. '
                 'Current escaped version of the \n'
                 'specified field code is: "{field_code_escaped}"').format(
                    max_length=self.MAX_ESCAPED_FIELD_CODE_LEN,
                    length=len(field_code_escaped),
                    field_code_escaped=field_code_escaped))
        if not self.R_AZ.search(field_code_escaped):
            self.add_error(
                'code',
                ('Field codes are used to build column names of DB tables. Escaped version of \n'
                 'the specified field code should contain at least one latin letter. '
                 'Current escaped version of the specified field \n'
                 'code is: "{0}"').format(field_code_escaped))

        # When the type of an existing field is changed, detected values of its field
        # detectors must be valid for the new type.
        if self.initial and 'type' in self.changed_data:
            wrong_field_detector_pks = []
            for field_detector in DocumentFieldDetector.objects.filter(
                    field=self.instance):
                try:
                    DocumentFieldDetector.validate_detected_value(
                        type_code, field_detector.detected_value)
                except Exception:
                    # pk is formatted, not concatenated: it is not necessarily a string.
                    wrong_field_detector_pks.append(
                        '#{0}'.format(field_detector.pk))
            if wrong_field_detector_pks:
                self.add_error(
                    'type',
                    'Detected value is not allowed for this field type, please unset detected value '
                    'for this field detectors: {0}'.format(
                        ', '.join(wrong_field_detector_pks)))

        return self.cleaned_data
예제 #7
0
 def test_order_field_detection_empty(self) -> None:
     """Ordering an empty list of fields must give an empty result."""
     result = order_field_detection([])
     self.assertEqual(0, len(result))