Example #1
0
    def predict_and_extract_value(cls, sklearn_model: SkLearnClassifierModel,
                                  typed_field: TypedField,
                                  document: Document,
                                  field: DocumentField,
                                  text_unit: TextUnit) -> Optional[AnnotationDTO]:
        """Classify a text unit with the sklearn model and build an annotation.

        Returns None when the model predicts a different field code. Otherwise
        returns an AnnotationDTO spanning the text unit: with an extracted and
        validated value when the field type requires one, or a bare presence
        annotation when it does not.

        :raises ValueError: if the extracted value does not fit the field type.
        """
        field_code, value, hint_name = cls.predict_value(sklearn_model, text_unit)
        if field_code != field.code:
            return None

        if not typed_field.requires_value:
            # Field is detected by mere presence - no value to extract.
            return AnnotationDTO(annotation_value=None,
                                 location_in_doc_start=text_unit.location_start,
                                 location_in_doc_end=text_unit.location_end,
                                 extraction_hint_name=None)

        hint_name = hint_name or ValueExtractionHint.TAKE_FIRST.name
        value, hint_name = typed_field.get_or_extract_value(
            document, value, hint_name, text_unit.text)
        if not typed_field.is_python_annotation_value_ok(value):
            raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                             f'annotation value not suitable for this field:\n'
                             f'{value}')

        return AnnotationDTO(annotation_value=typed_field.annotation_value_python_to_json(value),
                             location_in_doc_start=text_unit.location_start,
                             location_in_doc_end=text_unit.location_end,
                             extraction_hint_name=hint_name)
    def _validate_choice_values_removed(self, context: dict) -> None:
        """Validate that an imported field config does not drop choice values in use.

        Compares the saved (current) field with the one being imported. When the
        import removes choice values, or unsets the "allow values not specified
        in choices" flag, and annotations with now-invalid values exist, raises
        a ValidationError asking the user to enable force auto-fixes or update
        the values manually.

        :raises ValidationError: when the import would leave invalid choice values.
        """
        saved_field = self._get_saved_field(context)
        # Only applies when both the current and the imported field are choice fields.
        if not saved_field or not TypedField.by(saved_field).is_choice_field \
                or not TypedField.by(self.object).is_choice_field:
            return
        err_msg = ''
        invalid_choices = self._get_invalid_choices(saved_field)
        if self._is_allow_values_not_specified_in_choices_was_unset(
                saved_field):
            # Fix: message previously contained a duplicated word ("the the config").
            err_msg += '"Allow values not specified in choices" flag is unset in the config being imported. '
        if invalid_choices:
            quoted_choices = ['"{0}"'.format(invalid_choice)
                              for invalid_choice in invalid_choices]
            err_msg += 'The following choice values are missing in the config being imported: {0}. ' \
                .format(', '.join(quoted_choices))

        if err_msg:
            invalid_values_count = self.object.get_invalid_choice_annotations(
            ).count()
            user_values_count = 0
            detected_values_count = 0
            if invalid_values_count > 0:
                field_repo = DocumentFieldRepository()
                user_values_count = field_repo.get_invalid_choice_vals_count(
                    self.object)
                detected_values_count = self._get_detected_values_count(
                    invalid_values_count, user_values_count)
            err_msg += 'Number of invalid values: user entered values {0}, automatically detected values {1}.' \
                       ' You need to set force auto-fixes option to continue (this option will remove all invalid' \
                       ' values) or make manual updates.'.format(user_values_count, detected_values_count)
            err_msg = 'Unable to update field #{0} "{1}". {2}'.format(
                self.pk, self.object.code, err_msg)
            raise ValidationError(err_msg)
Example #3
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect the field value by classifying the dependency field values.

        Tries stop-word detection first; otherwise loads the field's trained
        classifier and maps the predicted category index to a category name.

        :raises ClassifierModel.DoesNotExist: if no model was trained for the field.
        """
        dto = cls.maybe_detect_with_stop_words(field, field_code_to_value)
        if dto is not None:
            return dto

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
            obj = classifier_model.get_trained_model_obj(
            )  # type: Dict[str, Any]

            model = obj['model']
            categories = obj['categories']

            predicted = model.predict([field_code_to_value])
            target_index = predicted[0]

            # Guard against a stale model predicting an out-of-range category.
            target_name = categories[target_index] if 0 <= target_index < len(
                categories) else None

            return FieldValueDTO(field_value=TypedField.by(
                field).field_value_python_to_json(target_name))

        except ClassifierModel.DoesNotExist as e:
            # Consistency fix: sibling strategies log before propagating;
            # previously this was a silent catch-and-reraise.
            log.info(f'Classifier model does not exist for field: {field.code}')
            raise e
    def maybe_detect_with_stop_words(
            cls, field: DocumentField, doc: Document,
            cached_fields: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Attempt stop-word based detection over the dependency fields' values.

        Joins the string-rendered values of the fields this field depends on
        and runs the field's stop-word detection over the joined text. Returns
        a FieldValueDTO on a match, otherwise None.
        """
        if not field.stop_words:
            return None

        depends_on_field_codes = list(
            field.depends_on_fields.all().values_list(
                'code', flat=True))  # type: List[str]

        if not any(cached_fields):
            return None

        text_parts = []
        for code in depends_on_field_codes:
            cached_value = cached_fields.get(code)
            if cached_value:
                text_parts.append(str(cached_value))

        detected, detected_field_value = detect_with_stop_words_by_field_and_full_text(
            field=field,
            doc=doc,
            full_text='\n'.join(text_parts))
        if not detected:
            return None
        return FieldValueDTO(field_value=TypedField.by(
            field).field_value_python_to_json(detected_field_value))
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Delegate detection to the registered python-coded field implementation.

        Looks up the implementation in PYTHON_CODED_FIELDS_REGISTRY, checks its
        declared type against the field's type, runs it, and validates the
        returned JSON value against the field type.

        :raises RuntimeError: if the implementation is unknown or its type mismatches.
        :raises ValueError: if the returned value does not fit the field type.
        """
        python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not python_coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))

        typed_field = TypedField.by(field)  # type: TypedField
        if python_coded_field.type != typed_field.type_code:
            raise RuntimeError(
                f'Python-coded field {python_coded_field.__class__.__name__} is '
                f'for fields of type {python_coded_field.type} and field {field.code} '
                f'is of type {typed_field.type_code}')

        log.debug('detect_field_value: python_coded_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')

        field_value_dto = python_coded_field.get_value(
            log=log,
            field=field,
            doc=doc,
            cur_field_code_to_value=field_code_to_value)

        if not typed_field.is_json_field_value_ok(field_value_dto.field_value):
            raise ValueError(
                f'Python coded field class {field.python_coded_field} returned value not suitable for '
                f'field {field.code} ({typed_field.type_code})')
        return field_value_dto
Example #6
0
    def build_pipeline(
        cls, field: DocumentField, depends_on_fields: List[Tuple[str, str]]
    ) -> Tuple[Pipeline, List[Callable[[], List[str]]]]:
        """Assemble the sklearn pipeline vectorizing each dependency field.

        Builds one sub-pipeline per dependency (value extraction followed by the
        type-specific vectorization steps), unions them, and appends the
        classifier. Also returns one feature-name function per dependency, each
        wrapped so its names are prefixed with the field code.
        """
        transformer_list = []
        feature_names_funcs = []

        dep_codes = {code for code, _type in depends_on_fields}
        fields_by_code = {f.code: f
                          for f in DocumentField.objects.filter(code__in=dep_codes)}

        # Sorting by declared type keeps the feature layout deterministic.
        # (Renamed from the original's `field_type`, which shadowed the loop var.)
        for field_code, _dep_type in sorted(depends_on_fields, key=lambda t: t[1]):
            typed_field = TypedField.by(fields_by_code[field_code])  # type: TypedField

            vect_steps = [('sel', FieldValueExtractor(field_code))]
            type_steps, field_feature_names_func = typed_field.build_vectorization_pipeline()
            vect_steps.extend(type_steps)

            transformer_list.append((field_code, Pipeline(vect_steps)))
            feature_names_funcs.append(
                wrap_feature_names_with_field_code(field_feature_names_func,
                                                   field_code))

        classifier = cls.init_classifier(field)

        return Pipeline([('vect', FeatureUnion(transformer_list)),
                         ('clf', classifier)]), feature_names_funcs
Example #7
0
    def detect_field_value(cls,
                           log: ProcessLogger,
                           doc: Document,
                           field: DocumentField,
                           field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect a field value by classifying the document's text units.

        Tries stop-word detection on the full document text first; otherwise
        runs the field's trained sklearn model over each text unit in document
        order, collecting annotations until the detection limit (if configured)
        is reached. For single-value fields the first annotation wins.

        :raises ClassifierModel.DoesNotExist: if no trained model exists for the field.
        """

        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                                 depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        try:
            classifier_model = ClassifierModel.objects.get(document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()
            typed_field = TypedField.by(field)  # type: TypedField

            ants = list()  # type: List[AnnotationDTO]

            qs_text_units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            units_counted = 0
            for text_unit in qs_text_units.iterator():
                if field.detect_limit_count:
                    # Stop scanning once the configured unit budget is exceeded.
                    units_counted = FieldDetectionStrategy.update_units_counted(
                        field, units_counted, text_unit)
                    if units_counted > field.detect_limit_count:
                        break

                ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                    typed_field=typed_field,
                                                    document=doc,
                                                    field=field,
                                                    text_unit=text_unit)
                if ant is None:
                    continue
                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    # Character-based limit: stop if the match starts past the limit.
                    if ant.location_in_doc_start > field.detect_limit_count:
                        break

                ants.append(ant)
                # Single-value fields: the first matching annotation is the value.
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    # NOTE(review): chars are accumulated here only after a match,
                    # in addition to update_units_counted above - presumably
                    # intentional double accounting for char limits; confirm.
                    units_counted += len(text_unit.text)

            if not ants:
                return None

            return FieldValueDTO(field_value=typed_field.build_json_field_value_from_json_ant_values([a.annotation_value
                                                                                                      for a in ants]),
                                 annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(f'Classifier model does not exist for field: {field.code}')
            raise e
 def pack_parsed_value(cls, typed_field: TypedField, value: Any,
                       loc_start: int, loc_end: int):
     """Convert a parsed python value to JSON and wrap it into a FieldValueDTO
     carrying a single annotation spanning [loc_start, loc_end)."""
     json_value = typed_field.field_value_python_to_json(value)
     annotation = AnnotationDTO(annotation_value=json_value,
                                location_in_doc_start=loc_start,
                                location_in_doc_end=loc_end)
     return FieldValueDTO(field_value=json_value, annotations=[annotation])
Example #9
0
    def join_field_detector_model_tests(task: ExtendedTask,
                                        field_uid,
                                        classifier_model_id):
        """Aggregate the results of the test_field_detector_model sub-tasks.

        Computes overall accuracy (and, for choice fields, accuracy per value),
        stores the accuracy on the classifier model when one is given, and logs
        a summary.

        :param task: parent task whose sub-task results are collected.
        :param field_uid: pk of the tested DocumentField.
        :param classifier_model_id: pk of the ClassifierModel to update, or falsy.
        """
        results = list(Task.objects
                       .filter(main_task_id=task.request.parent_id,
                               name=TrainAndTest.test_field_detector_model.name)
                       .values_list('result', flat=True))

        test_text_units_number = 0
        match_number = 0
        test_doc_number = 0

        matches_per_value = dict()
        total_per_value = dict()

        for res in results:
            actual_field_value = res.get('actual_field_value')

            if actual_field_value:
                total_per_value[actual_field_value] = \
                    total_per_value.get(actual_field_value, 0) + 1

            test_doc_number += 1
            test_text_units_number += (res.get('text_units_number') or 0)
            if res.get('value_matches_expected'):
                match_number += 1
                if actual_field_value:
                    matches_per_value[actual_field_value] = \
                        matches_per_value.get(actual_field_value, 0) + 1

        # Robustness fix: avoid ZeroDivisionError when no sub-task results exist.
        accuracy = match_number / test_doc_number if test_doc_number else 0

        if classifier_model_id:
            classifier_model = ClassifierModel.objects.get(pk=classifier_model_id)
            classifier_model.field_detection_accuracy = accuracy
            classifier_model.save()

        field = DocumentField.objects.get(pk=field_uid)

        task.log_info('Testing finished.\n'
                      'Document type: {0}.\n'
                      'Field: {1}.\n'
                      'Text unit type: {2}.\n'
                      'Test documents number: {3}.\n'
                      'Test text units number: {4}.\n'
                      'Accuracy: {5}.\n'
                      .format(field.document_type.code,
                              field.code,
                              field.text_unit_type,
                              test_doc_number,
                              test_text_units_number,
                              accuracy))

        if TypedField.by(field).is_choice_field:
            accuracy_per_value = {actual_field_value: (matches_per_value.get(actual_field_value) or 0) / total
                                  for actual_field_value, total in total_per_value.items()}
            task.log_info('Accuracy per value:\n{0}'.format(json.dumps(accuracy_per_value, sort_keys=True, indent=2)))
Example #10
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
        """Detect the field value by evaluating the field's formula over the
        values of the fields it depends on.

        Checks stop words first (when configured); then evaluates the formula
        and validates/converts the result against the field type.

        :raises ValueError: if the field has no formula or the formula result
            does not fit the field type.
        """
        formula = field.formula

        if not formula:
            raise ValueError(
                f'No formula specified for field {field.code} (#{field.uid})')

        depends_on_field_codes = field.get_depends_on_codes() or set()

        # Keep only the dependency values the formula may reference.
        field_code_to_value = {
            c: v
            for c, v in field_code_to_value.items()
            if c in depends_on_field_codes
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            log.debug(
                'detect_field_value: formula_based_field_detection, checking stop words, '
                + f'field {field.code}({field.pk}), document #{doc.pk}')
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field,
                                                                doc,
                                                                depends_on_full_text)
            if detected_with_stop_words:
                # Bug fix: previously returned the bare detected value (or [])
                # although the declared return type is FieldValueDTO.
                return FieldValueDTO(field_value=detected_values)
        else:
            log.debug('detect_field_value: formula_based_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')

        v = cls.calc_formula(field_code=field.code,
                             formula=formula,
                             depends_on_field_to_value=field_code_to_value,
                             convert_decimals_to_floats=field.
                             convert_decimals_to_floats_in_formula_args)
        typed_field = TypedField.by(field)

        # We don't accept formulas returning values of wrong type to avoid further confusion and
        # creating wrong formulas in future.
        # For example for multi-choice fields the formula should return a list and not a string
        # to ensure the admin understands that this value will replace the whole set/list of strings and not
        # just add one more string to the value.
        if typed_field.is_choice_field and typed_field.multi_value:
            if v and isinstance(v, str):
                # "outdated" formula is incorrect and returns string instead of
                # set / list, but we don't warn user: when he updates this formula
                # (or other detection method) he'll be forced to write code, returning
                # list or set.
                v = [v]

        if not typed_field.is_python_field_value_ok(v):
            raise ValueError(
                f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
            )
        v = typed_field.field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
Example #11
0
 def get_value(
         self, log, field: DocumentField, doc: Document,
         cur_field_code_to_value: Dict[str,
                                       Any]) -> Optional[FieldValueDTO]:
     """Aggregate the document's distinct party names into one comma-separated value.

     Returns None when the document has no party usages. Bug fix: the previous
     None-check tested the aggregate result dict, which Django always returns
     (e.g. {'value': None}), so the method never returned None.
     """
     v = PartyUsage.objects.filter(text_unit__document_id=doc.id) \
         .aggregate(value=StringAgg('party__name', delimiter=', ', distinct=True))
     if v['value'] is None:
         return None
     value = TypedField.by(field).field_value_python_to_json(v['value'])
     return FieldValueDTO(field_value=value)
Example #12
0
 def get_value(self, log, field: DocumentField, doc: Document,
               cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
     """Return the document's largest currency usage as a field value, or None
     when the document mentions no currencies."""
     usages = CurrencyUsage.objects \
         .filter(text_unit__document_id=doc.id) \
         .order_by('-amount') \
         .values('currency', 'amount')
     # Only the top row (largest amount) is needed.
     for usage in usages:
         json_value = TypedField.by(field).field_value_python_to_json(usage)
         return FieldValueDTO(field_value=json_value)
     return None
Example #13
0
 def get_value(self, log, field: DocumentField, doc: Document,
               cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
     """Return the latest date mentioned in the document as a field value, or
     None when the document mentions no dates."""
     dates = DateUsage.objects \
         .filter(text_unit__document_id=doc.id) \
         .order_by('-date') \
         .values_list('date', flat=True)
     # Only the top row (latest date) is needed.
     for found_date in dates:
         json_value = TypedField.by(field).field_value_python_to_json(found_date)
         return FieldValueDTO(field_value=json_value)
     return None
Example #14
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect the field value by running regexp field detectors over the
        values of the fields this field depends on.

        Tries stop-word detection on the joined dependency values first; then
        matches each dependency value (string-rendered) against the field's
        detectors. For single-value fields the first match wins; for
        multi-value fields all matches are collected.
        """

        depends_on_fields = field.get_depends_on_codes()
        # Keep only the dependency values relevant to this field.
        field_code_to_value = {
            c: v
            for c, v in field_code_to_value.items() if c in depends_on_fields
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            detected_with_stop_words, detected_value \
                = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
            if detected_with_stop_words:
                return FieldValueDTO(field_value=detected_value)

        field_detectors = DocumentFieldDetector.objects.filter(field=field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]
        typed_field = TypedField.by(field)  # type: TypedField

        values = list()  # type: List

        for depends_on_value in field_code_to_value.values():
            if not depends_on_value:
                continue
            depends_on_value = str(depends_on_value)
            for detector_field_matcher in detectors:  # type: DetectorFieldMatcher
                matching_piece = detector_field_matcher.matching_string(
                    depends_on_value, text_is_sentence=False)
                if matching_piece is not None:
                    matching_string = matching_piece[0]
                    value = detector_field_matcher.get_validated_detected_value(
                        field)
                    if typed_field.requires_value:
                        # Extract the concrete value from the matched string,
                        # defaulting to the TAKE_FIRST extraction hint.
                        hint_name = detector_field_matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        value, hint_name = typed_field \
                            .get_or_extract_value(doc,
                                                  value,
                                                  hint_name,
                                                  matching_string)
                        if value is None:
                            continue

                    value = typed_field.annotation_value_python_to_json(value)
                    # Single-value field: first detected value wins.
                    if not isinstance(typed_field, MultiValueField):
                        return FieldValueDTO(field_value=value)
                    else:
                        values.append(value)

        if isinstance(typed_field, MultiValueField):
            # May produce an empty multi-value when nothing matched.
            return FieldValueDTO(
                field_value=typed_field.
                build_json_field_value_from_json_ant_values(values))
        else:
            return None
Example #15
0
 def check_field_values(task: ExtendedTask,
                        field_value_ids: List[int],
                        delete_broken: bool = False):
     """Validate stored field annotation values against their field types,
     dispatching each invalid one to FindBrokenDocumentFieldValues.process_broken
     (which deletes it when delete_broken is set)."""
     import apps.document.repository.document_field_repository as dfr
     field_repo = dfr.DocumentFieldRepository()
     for fv in field_repo.get_field_values_by_ids(field_value_ids):  # FieldAnnotation
         field = fv.field  # type: DocumentField
         value_ok = TypedField.by(field).is_json_field_value_ok(fv.value)
         if not value_ok:
             FindBrokenDocumentFieldValues.process_broken(task, fv, delete_broken)
Example #16
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect the field value with the field's trained text classifier.

        Tries stop-word detection on the full document text first; otherwise
        classifies each text unit (bounded by the field's detection limit) and
        collects annotations. For single-value fields the first annotation wins.

        :raises ClassifierModel.DoesNotExist: if no trained model exists for the field.
        """

        log.debug(
            'detect_field_value: regexps_and_text_based_ml_field_value, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')

        ants: List[AnnotationDTO] = []
        text_unit_repo = cls.text_unit_repo
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        # Restrict the candidate text units by the field's detection limit.
        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()

            for text_unit in qs_text_units.iterator():  # type: TextUnit
                ant = cls.predict_and_extract_value(
                    sklearn_model=sklearn_model,
                    typed_field=typed_field,
                    document=doc,
                    field=field,
                    text=text_unit.text,
                    location_start=text_unit.location_start,
                    location_end=text_unit.location_end)
                if ant is None:
                    continue
                ants.append(ant)
                # Single-value field: the first annotation is the value.
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=ants)
            if not ants:
                return None

            return FieldValueDTO(field_value=typed_field.
                                 build_json_field_value_from_json_ant_values(
                                     [a.annotation_value for a in ants]),
                                 annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(
                f'Classifier model does not exist for field: {field.code}')
            raise e
    def validate(dst_field: DocumentField, date_constraint_field: DocumentField, date_constraint_days: int) \
            -> List[Tuple[str, str]]:
        """Validate the configuration of a document-linking field.

        Checks the date constraint settings, the destination field type and the
        feature vector fields' ownership.

        :return: list of (attribute name, error message) tuples, or None when valid.
        """
        document_type = dst_field.document_type
        feature_vector_fields = dst_field.depends_on_fields.all()
        res = list()

        if date_constraint_days is not None and date_constraint_days < 1:
            res.append((ATTR_DATE_CONSTRAINT_DAYS,
                        'Date constraint days should be either empty or a '
                        'positive integer.'))

        if date_constraint_field is not None and date_constraint_field.document_type_id != document_type.pk:
            # Fix: a space was missing between the adjacent string literals,
            # producing "...document typeas the destination field."
            res.append((
                ATTR_DATE_CONSTRAINT_FIELD,
                'Date constraint field should be owned by the same document type '
                'as the destination field.'))

        if date_constraint_field is not None and date_constraint_days is None:
            res.append(
                (ATTR_DATE_CONSTRAINT_DAYS,
                 'Date constraint days number should not be empty if the date '
                 'constraint field is assigned.'))

        if date_constraint_field is not None:
            example_value = TypedField.by(
                date_constraint_field).example_python_value()
            if not isinstance(example_value, (date, datetime)):
                res.append((
                    ATTR_DATE_CONSTRAINT_FIELD,
                    'Type of the date constraint field should be date or datetime'
                ))

        if dst_field.type != LinkedDocumentsField.code:
            res.append((ATTR_DST_FIELD,
                        'Destination field should be of type {0}'.format(
                            LinkedDocumentsField.code)))

        if not feature_vector_fields:
            res.append((ATTR_FEATURE_VECTOR_FIELDS,
                        'Feature vector fields list can not be empty.'))

        wrong_doc_type_fields = [f.code for f in feature_vector_fields
                                 if f.document_type_id != dst_field.document_type_id]
        if wrong_doc_type_fields:
            res.append(
                (ATTR_FEATURE_VECTOR_FIELDS,
                 mark_safe('''All feature vector fields should be owned by
                    the same document type as the destination field: {dst_field_type}.<br />
                    The following fields are owned by different document type(s):<br />{bad_fields}'''
                           .format(
                               bad_fields='<br />'.join(wrong_doc_type_fields),
                               dst_field_type=document_type.code))))
        return res or None
 def get_value(
         self, log, field: DocumentField, doc: Document,
         cur_field_code_to_value: Dict[str,
                                       Any]) -> Optional[FieldValueDTO]:
     """Return the latest date mentioned in the document as a field value, or
     None when the document mentions no dates."""
     rows = DateUsage.objects.filter(text_unit__document_id=doc.pk) \
         .order_by('-date') \
         .values_list('date',
                      'text_unit__location_start',
                      'text_unit__location_end')
     # Only the top row (latest date) is needed; the locations are unused here.
     for found_date, _start, _end in rows:
         json_value = TypedField.by(field).field_value_python_to_json(found_date)
         return FieldValueDTO(field_value=json_value)
     return None
 def get_value(
         self, log, field: DocumentField, doc: Document,
         cur_field_code_to_value: Dict[str,
                                       Any]) -> Optional[FieldValueDTO]:
     """Return the document's largest currency usage as a (currency, amount)
     field value, or None when the document mentions no currencies.

     Bug fix: the query used .values(), which yields dicts; tuple-unpacking a
     dict iterates its KEYS, so curx/amt were the literal column names instead
     of the row values. .values_list() yields tuples of the actual values
     (matching the sibling DateUsage implementation).
     """
     for curx, amt, start, end in \
         CurrencyUsage.objects.filter(text_unit__document_id=doc.pk) \
             .order_by('-amount') \
             .values_list('currency', 'amount',
                          'text_unit__location_start', 'text_unit__location_end'):
         v = TypedField.by(field).field_value_python_to_json((
             curx,
             amt,
         ))
         return FieldValueDTO(field_value=v)
     return None
Example #20
0
def detect_with_stop_words_by_field_and_full_text(field: DocumentField, full_text: str) -> Tuple[bool, Any]:
    """Try detecting a value for the field via its configured stop words.

    :param field: field whose stop words are compiled and applied
    :param full_text: full document text to scan
    :return: (detected, value) — detected is False when the field requires
        text annotations, has no stop words, or nothing matched; value may be
        None even when detected is True (a "detected but valueless" match)
    """
    # fields requiring text annotations cannot be detected from full text alone
    if field.requires_text_annotations:
        return False, None
    compiled = compile_stop_words(field.stop_words)
    if not compiled:
        return False, None
    detected, raw_value = detect_value_with_stop_words(compiled, full_text)
    if not detected:
        return False, None
    if raw_value is None:
        return True, None
    # narrow the matched text down to a typed value
    return True, TypedField.by(field).extract_from_possible_value_text(raw_value)
Example #21
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect a field value via stop words first, then regexp field detectors.

        Single-value fields return on the first text unit that yields annotations;
        multi-value fields accumulate annotations from all text units and merge
        them into one JSON value at the end.

        :param log: process logger; .debug may be missing, hence the guard below
        :param doc: document whose text units are scanned
        :param field: field whose detectors/limits are applied
        :param field_code_to_value: dependency field values (unused here)
        :return: FieldValueDTO with annotations, or None when nothing matched
        """
        try:
            log.debug('detect_field_value: regexps_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
        except AttributeError:
            pass

        ants: List[AnnotationDTO] = []
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)
        text_unit_repo: TextUnitRepository = cls.text_unit_repo
        field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

        # BUG FIX: detect_with_stop_words_by_field_and_full_text() takes
        # (field, full_text) — see its definition and the 2-arg call site in
        # the other detect_field_value in this module; the extra `doc`
        # argument here would raise TypeError at runtime.
        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        # honor the field's detection limit (if configured) at the query level
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        for text_unit in qs_text_units:
            unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
            if not unit_ants:
                continue
            if not isinstance(typed_field, MultiValueField):
                # single-value field: first matching text unit wins
                return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                     annotations=unit_ants)
            else:
                ants += unit_ants

        if not ants:
            return None

        if isinstance(typed_field, MultiValueField):
            field_value = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants])
        else:
            field_value = typed_field.annotation_value_python_to_json(
                ants[0].annotation_value)
        return FieldValueDTO(field_value=field_value, annotations=ants)
Example #22
0
 def has_problems_with_field(cls, field: DocumentField) -> Optional[str]:
     """Smoke-test the field's MLFlow model and describe any problem found.

     :param field: field whose mlflow_model_uri is validated and test-invoked
     :return: human-readable problem description, or None when the model looks OK
     """
     if not field.mlflow_model_uri:
         return f'MLFlow model uri is not set for field {field.code}'
     try:
         output = cls.test_model(field.mlflow_model_uri)
         # the model returns a 1-row result for the 1-row test input
         output = output[0] if output is not None else None
         tf = TypedField.by(field)
         if not tf.is_python_field_value_ok(output):
             return f'MLFlow model returned value which does not match the field type.\n' \
                 f'Returned value (shortened up to 100 chars): {str(output)[:100]}.\n ' \
                 f'Example value: {tf.example_python_value()}.'
         return None
     except Exception as e:
         from apps.common.errors import render_error
         # BUG FIX: the rendered error was discarded and the function fell
         # through returning None — i.e. "no problems" — although the model
         # threw. Return the rendered message instead (render_error is
         # presumed to build and return the message string — confirm).
         return render_error('MLFlow model has thrown exception when testing '
                             '(input = 1-row DataFrame with text = "Hello World")', e)
    def get_value(
            self, log, field: DocumentField, doc: Document,
            cur_field_code_to_value: Dict[str,
                                          Any]) -> Optional[FieldValueDTO]:
        """Join the distinct party names found in the document into one value.

        :param log: process logger (unused by this strategy)
        :param field: field to convert the joined names for
        :param doc: document whose party usages are queried
        :param cur_field_code_to_value: dependency field values (unused)
        :return: FieldValueDTO with the joined names, or None when no parties
        """
        rows = PartyUsage.objects \
            .filter(text_unit__document_id=doc.pk) \
            .values_list('party__name',
                         'text_unit__location_start',
                         'text_unit__location_end')
        # deduplicate names; location columns are fetched but not used here
        distinct_names = {name for name, _start, _end in rows}
        joined = ', '.join(distinct_names)
        value = TypedField.by(field).field_value_python_to_json(joined)
        return FieldValueDTO(field_value=value) if joined else None
Example #24
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect a choice-field value with the trained classifier model.

        Stop words are tried first; otherwise the stored classifier predicts a
        category from the dependency field values, falling back to the field's
        "unsure" choice when the top probability is below the per-value threshold.

        :param log: process logger (unused here)
        :param doc: document the suggestion may be stored for
        :param field: field whose ClassifierModel is loaded
        :param field_code_to_value: dependency field values used as model input
        :return: FieldValueDTO with the chosen category as JSON value
        :raises ClassifierModel.DoesNotExist: when no model is trained for the field
        """
        # If changing this code make sure you update similar code in notebooks/demo/Train and Debug Decision Tree...
        detected_value = cls.maybe_detect_with_stop_words(
            field, field_code_to_value)
        if detected_value is not None:
            return detected_value

        # CLEANUP: the original wrapped everything below in
        # `try: ... except ClassifierModel.DoesNotExist as e: raise e`,
        # a no-op handler; the exception still propagates to the caller.
        classifier_model = ClassifierModel.objects.get(
            document_field=field)  # type: ClassifierModel
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

        model = obj['model']
        categories = obj['categories']

        # single-sample prediction: probabilities per category
        category_probabilities = model.predict_proba([field_code_to_value])[0]

        target_index = max(range(len(category_probabilities)),
                           key=category_probabilities.__getitem__)
        target_probability = category_probabilities[target_index]

        predicted_value = categories[
            target_index] if 0 <= target_index < len(categories) else None

        if predicted_value is None:
            target_name = field.unsure_choice_value
        else:
            # per-value threshold, falling back to the global default
            threshold = (field.unsure_thresholds_by_value or {}).get(predicted_value) \
                        or DocumentField.DEFAULT_UNSURE_THRESHOLD

            target_name = predicted_value if target_probability >= threshold else field.unsure_choice_value

        value = TypedField.by(field).field_value_python_to_json(target_name)
        if classifier_model.store_suggestion:
            store_classification_suggestion(field, doc, value,
                                            target_probability)

        return FieldValueDTO(field_value=value)
Example #25
0
    def load_field_values(task: ExtendedTask, document: Document, document_fields_alias_to_value: Dict[str, Any]) \
            -> Dict[DocumentField, FieldValueDTO]:
        """Resolve alias->value pairs into typed field values for the document.

        Field lookup is case-insensitive by field code and by the document
        type's configured aliases. For list values, the first list element
        that yields a (truthy) extracted value wins.

        :param task: task used for warning output on unknown aliases
        :param document: document whose type defines fields and aliases
        :param document_fields_alias_to_value: raw alias -> value-text mapping
        :return: mapping of resolved fields to their FieldValueDTOs
        """
        document_type = document.document_type
        fields_to_values = dict()  # type: Dict[DocumentField, FieldValueDTO]

        if not document_type:
            return fields_to_values

        # case-insensitive lookup: field codes first, then configured aliases
        alias_to_field = {f.code.lower(): f for f in document_type.fields.all()}
        configured_aliases = document_type.field_code_aliases
        if configured_aliases:
            alias_to_field.update({alias.lower(): alias_to_field.get(code.lower())
                                   for alias, code in configured_aliases.items()
                                   if alias and code})

        for alias, value_text in document_fields_alias_to_value.items():
            if value_text is None:
                continue

            field = alias_to_field.get(alias.lower())  # type: DocumentField
            if not field:
                task.log_warn(
                    'Field alias "{0}" not found for document type {1}'.format(alias, document_type.code))
                continue
            typed_field = TypedField.by(field)  # type: TypedField

            # a scalar value is handled like a one-element candidate list
            candidates = list(value_text) if type(value_text) is list else [value_text]
            for candidate in candidates:
                maybe_value = typed_field.extract_from_possible_value_text(candidate)
                if maybe_value:
                    maybe_value = typed_field.field_value_python_to_json(maybe_value)
                    fields_to_values[field] = FieldValueDTO(field_value=maybe_value)
                    break

        return fields_to_values
Example #26
0
def document_feature_vector_pipeline(feature_vector_fields: List[DocumentField], use_field_codes: bool = False) \
        -> FeatureUnion:
    """Build a FeatureUnion vectorizing each feature-vector field in pk order.

    :param feature_vector_fields: fields whose values are vectorized
    :param use_field_codes: select values by field code instead of field pk
    :return: FeatureUnion of one per-field Pipeline each named by field code
    """
    transformers = []
    # deterministic transformer order: sort by primary key
    for fld in sorted(feature_vector_fields, key=lambda f: f.pk):  # type: DocumentField
        typed = TypedField.by(fld)  # type: TypedField
        selector_key = fld.code if use_field_codes else fld.pk
        steps = [('sel', FieldValueExtractor(selector_key))]
        vect_steps, _feature_names_func = typed.build_vectorization_pipeline()
        steps.extend(vect_steps)
        transformers.append((fld.code, Pipeline(steps)))
    return FeatureUnion(transformers)
Example #27
0
    def get_value(self, log: ProcessLogger, field: DocumentField, doc: Document,
                  cur_field_code_to_value: Dict[str, Any]) \
            -> Optional[FieldValueDTO]:
        """Scan the document's text units in order and return the first found value.

        :param log: process logger passed to find_value_in_text_unit
        :param field: field being detected (must be single-value)
        :param doc: document whose text units are scanned
        :param cur_field_code_to_value: dependency field values (unused here)
        :return: FieldValueDTO annotated at the matching unit, or None
        :raises Exception: if the field type is multi-value (unsupported here)
        """
        typed_field = TypedField.by(field)  # type: TypedField
        if typed_field.multi_value:
            raise Exception(f'Python coded field {self.__class__.__name__} supports only single-value field types and '
                            f'{typed_field.type_code} is multi-value')

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():  # type: TextUnit
            found, value = self.find_value_in_text_unit(log, field, doc, text_unit)
            if found:
                value = typed_field.field_value_python_to_json(value)
                ant = AnnotationDTO(annotation_value=value,
                                    location_in_doc_start=text_unit.location_start,
                                    location_in_doc_end=text_unit.location_end)
                # FIX: annotations was passed as a set ({ant}); all other call
                # sites in this module pass lists, and a set would require
                # AnnotationDTO to be hashable.
                return FieldValueDTO(field_value=value, annotations=[ant])
        return None
Example #28
0
    def calc_formula(cls,
                     field_code: str,
                     formula: str,
                     depends_on_field_to_value: Dict[str, Any],
                     convert_decimals_to_floats: bool = False) -> Any:
        """Evaluate a field formula with dependency values as locals.

        :param field_code: code of the field, used in the script title and errors
        :param formula: Python source of the formula; blank formulas yield None
        :param depends_on_field_to_value: field code -> value mapping used as locals
        :param convert_decimals_to_floats: replace Decimal values with floats first
        :return: whatever the formula evaluates to, or None for a blank formula
        :raises DocumentFieldFormulaError: chained from any ScriptError in eval
        """
        if not formula or not formula.strip():
            return None
        if convert_decimals_to_floats:
            if depends_on_field_to_value:
                depends_on_field_to_value = {
                    code: TypedField.replace_decimals_with_floats_in_python_value_of_any_type(val)
                    for code, val in depends_on_field_to_value.items()}
            else:
                depends_on_field_to_value = {}
        try:
            # optional artificial delay for debugging slow-formula handling
            if settings.DEBUG_SLOW_DOWN_FIELD_FORMULAS_SEC:
                import time
                time.sleep(settings.DEBUG_SLOW_DOWN_FIELD_FORMULAS_SEC)

            return eval_script(script_title=f'{field_code} formula',
                               script_code=formula,
                               eval_locals=depends_on_field_to_value)
        except ScriptError as se:
            raise DocumentFieldFormulaError(field_code, formula,
                                            depends_on_field_to_value) from se
Example #29
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect a field value by running the field's regexp detectors over the document's text units.

        Stop-word detection on the full document text is tried first and wins
        outright if it fires. Otherwise each text unit of the field's configured
        type is matched against every detector, honoring the field's detection
        limit (unit- or character-based) when configured. A single-value field
        returns on the first annotation; a multi-value field accumulates all
        annotations and merges them into one JSON value at the end.

        :param log: process logger used for debug output
        :param doc: document whose text units are scanned
        :param field: field whose detectors and limits are applied
        :param field_code_to_value: dependency field values (unused by this strategy)
        :return: FieldValueDTO with annotations, or None when nothing matched
        :raises CaughtErrorWhileApplyingFieldDetector: wraps any error raised while
            applying a detector — including the inner
            ValueExtractionFunctionThrownException, which is re-caught by the
            outer except below
        """
        text_unit_repo = cls.text_unit_repo
        field_detector_repo = cls.field_detector_repo
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            # stop words short-circuit the per-unit detector machinery
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        typed_field = TypedField.by(field)  # type: TypedField
        ants = list()  # type: List[AnnotationDTO]
        units_counted = 0

        for text_unit in qs_text_units:  # type: TextUnit
            if field.detect_limit_count:
                # stop scanning entirely once the configured limit is exceeded
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            for field_detector in detectors:
                try:
                    matching_piece = field_detector.matching_string(
                        text_unit.textunittext.text,
                        text_is_sentence=text_unit.is_sentence())
                    if matching_piece is not None:
                        if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                            # skip matches whose offset lies past the char limit.
                            # NOTE(review): units_counted is also advanced by
                            # update_units_counted() above and by the += below —
                            # confirm characters are not double-counted for
                            # DETECT_LIMIT_CHAR.
                            if field.detect_limit_count < units_counted + matching_piece[
                                    1]:
                                continue
                        matching_string = matching_piece[0]
                        value = field_detector.get_validated_detected_value(
                            field)
                        hint_name = None
                        if typed_field.requires_value:
                            # the hint tells the typed field which candidate value
                            # inside the matching string to extract
                            hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                            try:
                                value, hint_name = typed_field \
                                    .get_or_extract_value(doc,
                                                          value,
                                                          hint_name,
                                                          matching_string)
                            except Exception as e:
                                raise ValueExtractionFunctionThrownException(
                                    f'Value extraction function has thrown an exception.\n'
                                    f'Document: {doc.name} (#{doc.pk})\n'
                                    f'Value: {value}\n'
                                    f'Extraction hint: {hint_name}\n'
                                    f'Matching string:\n'
                                    f'{matching_string}') from e
                            if value is None:
                                continue

                        annotation_value = typed_field.annotation_value_python_to_json(
                            value)
                        ant = AnnotationDTO(
                            annotation_value=annotation_value,
                            location_in_doc_start=text_unit.location_start,
                            location_in_doc_end=text_unit.location_end,
                            extraction_hint_name=hint_name)

                        if not isinstance(typed_field, MultiValueField):
                            # single-value field: first annotation wins
                            return FieldValueDTO(
                                field_value=ant.annotation_value,
                                annotations=[ant])
                        else:
                            ants.append(ant)
                except Exception as e:
                    raise CaughtErrorWhileApplyingFieldDetector(
                        f'Exception caught while trying to apply field detector.\n'
                        f'Document: {doc.name} (#{doc.pk})\n'
                        f'Field detector: #{field_detector.detector.pk}\n'
                        f'{field_detector.detector.include_regexps}\n'
                        f'Text unit: #{text_unit.pk}\n'
                        f'{text_unit.text[:300]}') from e

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        # multi-value field: merge all collected annotation values into one JSON value
        return FieldValueDTO(field_value=typed_field.
                             build_json_field_value_from_json_ant_values(
                                 [a.annotation_value for a in ants]),
                             annotations=ants)
Example #30
0
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log: process logger for info/error output
    :param document: document to detect field values for
    :param save: persist detected values and fire the document-changed event
    :param clear_old_values: delete existing field values before detection
    :param changed_by_user: user attributed as the change author in the fired event
    :param system_fields_changed: forwarded to fire_document_changed
    :param generic_fields_changed: forwarded to fire_document_changed
    :param ignore_field_codes: field codes excluded from detection
    :param document_initial_load: forwarded to fire_document_changed
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :param skip_modified_values: don't overwrite field values overwritten by user
    :return: list of detection results (see NOTE at `res` below)
    :raises RuntimeError: when save is requested for a completed document
    :raises FieldDetectionError: compound error if any field detection failed
    """
    # local import — presumably avoids a circular dependency at module load; confirm
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    # completed (inactive-status) documents are read-only for detection
    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    # (field code, set of codes it depends on) pairs drive ordering and dependency tracking
    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    # detection order respects inter-field dependencies
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    # narrow the detection set to changed + dependent fields, minus ignored ones
    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding
    # further dependencies
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    # NOTE(review): res is never appended to below — callers always receive an
    # empty list; confirm this is intentional.
    res = list()

    detecting_field_status = []  # type:List[str]
    detection_errors = []  # type:List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        # detection strategy is selected per-field by its configured strategy code
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only few requested fields and trying to comply the dependency
            # tree makes no big sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            # errors are collected and raised as one FieldDetectionError at the end
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            # presumably True means "all user fields changed" to event consumers — confirm
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        # build one compound error message covering all failed fields
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res