Example #1
    def detect_field_value(cls,
                           log: ProcessLogger,
                           doc: Document,
                           field: DocumentField,
                           field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                                 depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        try:
            classifier_model = ClassifierModel.objects.get(document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()
            typed_field = TypedField.by(field)  # type: TypedField

            ants = list()  # type: List[AnnotationDTO]

            qs_text_units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            units_counted = 0
            for text_unit in qs_text_units.iterator():
                if field.detect_limit_count:
                    units_counted = FieldDetectionStrategy.update_units_counted(
                        field, units_counted, text_unit)
                    if units_counted > field.detect_limit_count:
                        break

                ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                    typed_field=typed_field,
                                                    document=doc,
                                                    field=field,
                                                    text_unit=text_unit)
                if ant is None:
                    continue
                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    if ant.location_in_doc_start > field.detect_limit_count:
                        break

                ants.append(ant)
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    units_counted += len(text_unit.text)

            if not ants:
                return None

            return FieldValueDTO(
                field_value=typed_field.build_json_field_value_from_json_ant_values(
                    [a.annotation_value for a in ants]),
                annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(f'Classifier model does not exist for field: {field.code}')
            raise e
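Most of the examples below share one control flow: annotations are collected per text unit, a single-value field returns on the first hit, and a multi-value field keeps accumulating. A minimal, framework-free sketch of that pattern, using a hypothetical Hit class in place of AnnotationDTO and plain tuples in place of TextUnit:

from dataclasses import dataclass
from typing import Any, Callable, List, Optional, Tuple


@dataclass
class Hit:  # hypothetical stand-in for AnnotationDTO
    value: Any
    start: int
    end: int


def collect_hits(units: List[Tuple[int, int, str]],
                 predict: Callable[[str], Optional[Any]],
                 multi_value: bool) -> Optional[Tuple[Any, List[Hit]]]:
    # first hit wins for single-value fields, everything is aggregated otherwise
    hits: List[Hit] = []
    for start, end, text in units:
        value = predict(text)
        if value is None:
            continue
        hits.append(Hit(value, start, end))
        if not multi_value:
            return hits[0].value, hits
    if not hits:
        return None
    return [h.value for h in hits], hits


def predict(text: str) -> Optional[int]:
    return int(text.split(':')[1]) if 'price:' in text else None


units = [(0, 10, 'no match'), (11, 25, 'price: 100'), (26, 40, 'price: 7')]
print(collect_hits(units, predict, multi_value=False))  # (100, [Hit(...)])
print(collect_hits(units, predict, multi_value=True))   # ([100, 7], [...])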
Example #2
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        depends_on_fields = field.get_depends_on_codes()
        field_code_to_value = {
            c: v
            for c, v in field_code_to_value.items() if c in depends_on_fields
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            detected_with_stop_words, detected_value \
                = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
            if detected_with_stop_words:
                return FieldValueDTO(field_value=detected_value)

        field_detectors = DocumentFieldDetector.objects.filter(field=field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]
        typed_field = TypedField.by(field)  # type: TypedField

        values = list()  # type: List

        for depends_on_value in field_code_to_value.values():
            if not depends_on_value:
                continue
            depends_on_value = str(depends_on_value)
            for detector_field_matcher in detectors:  # type: DetectorFieldMatcher
                matching_piece = detector_field_matcher.matching_string(
                    depends_on_value, text_is_sentence=False)
                if matching_piece is not None:
                    matching_string = matching_piece[0]
                    value = detector_field_matcher.get_validated_detected_value(
                        field)
                    if typed_field.requires_value:
                        hint_name = detector_field_matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        value, hint_name = typed_field \
                            .get_or_extract_value(doc,
                                                  value,
                                                  hint_name,
                                                  matching_string)
                        if value is None:
                            continue

                    value = typed_field.annotation_value_python_to_json(value)
                    if not isinstance(typed_field, MultiValueField):
                        return FieldValueDTO(field_value=value)
                    else:
                        values.append(value)

        if isinstance(typed_field, MultiValueField):
            return FieldValueDTO(
                field_value=typed_field.build_json_field_value_from_json_ant_values(values))
        else:
            return None
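Example #2 feeds each dependent field value through DetectorFieldMatcher objects and stops at the first match for single-value fields. A rough, self-contained sketch of that matching loop, where RegexDetector and detect are hypothetical stand-ins for DetectorFieldMatcher.matching_string and get_validated_detected_value:

import re
from typing import List, Optional, Tuple


class RegexDetector:  # hypothetical stand-in for DetectorFieldMatcher
    def __init__(self, pattern: str, detected_value: Optional[str] = None):
        self.regex = re.compile(pattern, re.IGNORECASE)
        self.detected_value = detected_value  # configured value, or None to use the match

    def matching_string(self, text: str) -> Optional[Tuple[str, int, int]]:
        m = self.regex.search(text)
        return (m.group(0), m.start(), m.end()) if m else None


def detect(depends_on_values: List[str], detectors: List[RegexDetector]) -> Optional[str]:
    # the first detector matching any dependent value wins (single-value case)
    for value in depends_on_values:
        if not value:
            continue
        for detector in detectors:
            piece = detector.matching_string(value)
            if piece is not None:
                return detector.detected_value or piece[0]
    return None


print(detect(['', 'Governing law: New York'],
             [RegexDetector(r'governing\s+law:\s*.+', 'has_governing_law')]))
# has_governing_law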
Example #3
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        log.debug(
            'detect_field_value: regexps_and_text_based_ml_field_value, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')

        ants: List[AnnotationDTO] = []
        text_unit_repo = cls.text_unit_repo
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()

            for text_unit in qs_text_units.iterator():  # type: TextUnit
                ant = cls.predict_and_extract_value(
                    sklearn_model=sklearn_model,
                    typed_field=typed_field,
                    document=doc,
                    field=field,
                    text=text_unit.text,
                    location_start=text_unit.location_start,
                    location_end=text_unit.location_end)
                if ant is None:
                    continue
                ants.append(ant)
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=ants)
            if not ants:
                return None

            return FieldValueDTO(
                field_value=typed_field.build_json_field_value_from_json_ant_values(
                    [a.annotation_value for a in ants]),
                annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(
                f'Classifier model does not exist for field: {field.code}')
            raise e
Example #4
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        log.debug('detect_field_value: csv_regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        detectors = cls.detecting_cache.get_detectors(
            field.pk,
            lambda msg, er: log.error(msg, field_code=field.code, exc_info=er))
        if not detectors:
            return None

        is_multichoice = field.type == MultiChoiceField.type_code
        doc_text = cls.get_document_text(doc)

        annotations = []

        for detector in detectors:
            found_item = detector.find_value(doc_text)
            if not found_item:
                continue

            # TODO: implement reading values from full text (TextParts.FULL.value)
            # as it is done now, or from text units - paragraphs or sentences
            # based on field.text_unit_type - for other detector.text_part options
            """            
            if detector.text_part == TextParts.BEFORE_REGEXP.value:
                return matching_string[:begin], 0, begin
            elif detector.text_part == TextParts.AFTER_REGEXP.value:
                return matching_string[end:], end, len(text)
            elif detector.text_part == TextParts.INSIDE_REGEXP.value:
                return matching_string[begin:end], begin, end
            else:
                return text, 0, len(text)
            """

            # starting position has to be shifted backward by 1 symbol for FE
            ant = AnnotationDTO(annotation_value=found_item[0],
                                location_in_doc_start=max(
                                    found_item[1] - 1, 0),
                                location_in_doc_end=found_item[2],
                                extraction_hint_name='')
            if not is_multichoice:
                return FieldValueDTO(field_value=found_item[0],
                                     annotations=[ant])
            else:
                annotations.append(ant)

        if annotations:
            f_val = [a.annotation_value for a in annotations]
            return FieldValueDTO(field_value=f_val, annotations=annotations)
        return None
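The commented-out block in the example above sketches how a detector's text_part setting would slice the matched text. A runnable version of that slicing, assuming begin and end are the match boundaries inside text and the constants are simplified stand-ins for the TextParts choices:

BEFORE_REGEXP, AFTER_REGEXP, INSIDE_REGEXP, FULL = 'before', 'after', 'inside', 'full'


def slice_by_text_part(text: str, begin: int, end: int, text_part: str):
    # return (piece, piece_start, piece_end) for the configured text_part
    if text_part == BEFORE_REGEXP:
        return text[:begin], 0, begin
    if text_part == AFTER_REGEXP:
        return text[end:], end, len(text)
    if text_part == INSIDE_REGEXP:
        return text[begin:end], begin, end
    return text, 0, len(text)


print(slice_by_text_part('Term: 5 years', 0, 5, AFTER_REGEXP))  # (' 5 years', 5, 13)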
Example #5
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        try:
            log.debug('detect_field_value: regexps_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
        except AttributeError:
            pass

        ants: List[AnnotationDTO] = []
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)
        text_unit_repo: TextUnitRepository = cls.text_unit_repo
        field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        for text_unit in qs_text_units:
            unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
            if not unit_ants:
                continue
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                     annotations=unit_ants)
            else:
                ants += unit_ants

        if not ants:
            return None

        if isinstance(typed_field, MultiValueField):
            field_value = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants])
        else:
            field_value = typed_field.annotation_value_python_to_json(
                ants[0].annotation_value)
        return FieldValueDTO(field_value=field_value, annotations=ants)
Example #6
    def maybe_detect_with_stop_words(
            cls, field: DocumentField, doc: Document,
            cached_fields: Dict[str, Any]) -> Optional[FieldValueDTO]:
        if field.stop_words:
            depends_on_field_codes = list(
                field.depends_on_fields.all().values_list(
                    'code', flat=True))  # type: List[str]
            depends_on_full_text = []

            if not any(cached_fields):
                return None

            for field_code in depends_on_field_codes:
                v = cached_fields.get(field_code)
                if v:
                    depends_on_full_text.append(str(v))

            detected_with_stop_words, detected_field_value = \
                detect_with_stop_words_by_field_and_full_text(field=field,
                                                              doc=doc,
                                                              full_text='\n'.join(depends_on_full_text))
            if detected_with_stop_words:
                return FieldValueDTO(field_value=TypedField.by(
                    field).field_value_python_to_json(detected_field_value))
        return None
Example #7
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        dto = cls.maybe_detect_with_stop_words(field, field_code_to_value)
        if dto is not None:
            return dto

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
            obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

            model = obj['model']
            categories = obj['categories']

            doc_field_vals = field_code_to_value
            predicted = model.predict([doc_field_vals])

            target_index = predicted[0]

            target_name = categories[target_index] \
                if 0 <= target_index < len(categories) else None

            dto = FieldValueDTO(
                field_value=TypedField.by(field).field_value_python_to_json(target_name))

            return dto

        except ClassifierModel.DoesNotExist as e:
            raise e
Example #8
 def pack_parsed_value(cls, typed_field: TypedField, value: Any,
                       loc_start: int, loc_end: int):
     value = typed_field.field_value_python_to_json(value)
     ant = AnnotationDTO(annotation_value=value,
                         location_in_doc_start=loc_start,
                         location_in_doc_end=loc_end)
     return FieldValueDTO(field_value=value, annotations=[ant])
Example #9
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
        formula = field.formula

        if not formula:
            raise ValueError(
                f'No formula specified for field {field.code} (#{field.uid})')

        depends_on_field_codes = field.get_depends_on_codes() or set()

        field_code_to_value = {
            c: v
            for c, v in field_code_to_value.items()
            if c in depends_on_field_codes
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            log.debug(
                'detect_field_value: formula_based_field_detection, checking stop words, '
                + f'field {field.code}({field.pk}), document #{doc.pk}')
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field,
                                                                doc,
                                                                depends_on_full_text)
            if detected_with_stop_words:
                return FieldValueDTO(field_value=detected_values or [])
        else:
            log.debug('detect_field_value: formula_based_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')

        v = cls.calc_formula(field_code=field.code,
                             formula=formula,
                             depends_on_field_to_value=field_code_to_value,
                             convert_decimals_to_floats=field.convert_decimals_to_floats_in_formula_args)
        typed_field = TypedField.by(field)

        # We don't accept formulas returning values of wrong type to avoid further confusion and
        # creating wrong formulas in future.
        # For example for multi-choice fields the formula should return a list and not a string
        # to ensure the admin understands that this value will replace the whole set/list of strings and not
        # just add one more string to the value.
        if typed_field.is_choice_field and typed_field.multi_value:
            if v and isinstance(v, str):
                # "outdated" formula is incorrect and returns string instead of
                # set / list, but we don't warn user: when he updates this formula
                # (or other detection method) he'll be forced to write code, returning
                # list or set.
                v = [v]

        if not typed_field.is_python_field_value_ok(v):
            raise ValueError(
                f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
            )
        v = typed_field.field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
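Example #9 tolerates an outdated formula that returns a bare string for a multi-value choice field by wrapping it into a list, and rejects any other unsuitable type. A framework-free sketch of that normalization step (the isinstance check stands in for TypedField.is_python_field_value_ok):

from typing import Any, Optional


def normalize_formula_result(value: Any, multi_value_choice: bool,
                             field_code: str = 'my_field') -> Optional[Any]:
    # tolerate an "outdated" formula returning a bare string for a
    # multi-value choice field by wrapping it into a list
    if multi_value_choice and value and isinstance(value, str):
        value = [value]
    # reject anything else that is not suitable for a multi-value field
    if multi_value_choice and value is not None and not isinstance(value, (list, set, tuple)):
        raise ValueError(f'Formula of field {field_code} returned value '
                         f'not suitable for this field:\n{value}')
    return value


print(normalize_formula_result('confidential', multi_value_choice=True))  # ['confidential']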
Example #10
 def get_value(
         self, log, field: DocumentField, doc: Document,
         cur_field_code_to_value: Dict[str,
                                       Any]) -> Optional[FieldValueDTO]:
     v = PartyUsage.objects.filter(text_unit__document_id=doc.id) \
         .aggregate(value=StringAgg('party__name', delimiter=', ', distinct=True))
     value = TypedField.by(field).field_value_python_to_json(v['value'])
     # the aggregate() dict is always truthy; check the aggregated value instead
     return FieldValueDTO(field_value=value) if value else None
Example #11
 def get_value(self, log, field: DocumentField, doc: Document,
               cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
     for v in CurrencyUsage.objects.filter(text_unit__document_id=doc.id) \
             .order_by('-amount') \
             .values('currency', 'amount'):
         v = TypedField.by(field).field_value_python_to_json(v)
         return FieldValueDTO(field_value=v)
     return None
Example #12
 def get_value(self, log, field: DocumentField, doc: Document,
               cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
     for v in DateUsage.objects.filter(text_unit__document_id=doc.id) \
             .order_by('-date') \
             .values_list('date', flat=True):
         v = TypedField.by(field).field_value_python_to_json(v)
         return FieldValueDTO(field_value=v)
     return None
Example #13
    def load_field_values(task: ExtendedTask, document: Document, document_fields_alias_to_value: Dict[str, Any]) \
            -> Dict[DocumentField, FieldValueDTO]:
        document_type = document.document_type
        fields_to_values = dict()  # type: Dict[DocumentField, FieldValueDTO]

        if not document_type:
            return fields_to_values

        field_code_aliases = document_type.field_code_aliases

        field_codes_to_fields = {f.code.lower(): f for f in document_type.fields.all()}

        if field_code_aliases:
            field_codes_to_fields.update({field_alias.lower(): field_codes_to_fields.get(field_code.lower())
                                          for field_alias, field_code in field_code_aliases.items() if
                                          field_alias and field_code})

        for field_alias, field_value_text in document_fields_alias_to_value.items():
            if field_value_text is None:
                continue

            field = field_codes_to_fields.get(field_alias.lower())  # type: DocumentField
            if not field:
                task.log_warn(
                    'Field alias "{0}" not found for document type {1}'.format(field_alias, document_type.code))
                continue
            typed_field = TypedField.by(field)  # type: TypedField

            if type(field_value_text) is list:
                for possible_value_text in list(field_value_text):
                    maybe_value = typed_field.extract_from_possible_value_text(possible_value_text)
                    if maybe_value:
                        maybe_value = typed_field.field_value_python_to_json(maybe_value)
                        fields_to_values[field] = FieldValueDTO(field_value=maybe_value)
                        break
            else:
                maybe_value = typed_field.extract_from_possible_value_text(field_value_text)
                if maybe_value:
                    maybe_value = typed_field.field_value_python_to_json(maybe_value)
                    fields_to_values[field] = FieldValueDTO(field_value=maybe_value)

        return fields_to_values
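Example #13 resolves incoming keys case-insensitively, first against the document type's field codes and then against the configured aliases. A small sketch of just that lookup table:

from typing import Dict, List, Optional


def build_alias_map(field_codes: List[str],
                    aliases: Optional[Dict[str, str]]) -> Dict[str, str]:
    # map lowercased code or alias -> canonical field code
    mapping = {code.lower(): code for code in field_codes}
    for alias, code in (aliases or {}).items():
        if alias and code and code.lower() in mapping:
            mapping[alias.lower()] = mapping[code.lower()]
    return mapping


aliases = build_alias_map(['effective_date', 'party'], {'Eff. Date': 'effective_date'})
print(aliases.get('eff. date'))  # effective_date
print(aliases.get('unknown'))    # None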
Example #14
 def get_value(
         self, log, field: DocumentField, doc: Document,
         cur_field_code_to_value: Dict[str,
                                       Any]) -> Optional[FieldValueDTO]:
     for date, start, end in DateUsage.objects.filter(text_unit__document_id=doc.pk) \
             .order_by('-date') \
             .values_list('date',
                          'text_unit__location_start',
                          'text_unit__location_end'):
         v = TypedField.by(field).field_value_python_to_json(date)
         return FieldValueDTO(field_value=v)
     return None
Example #15
def is_unit_limit_exceeded(fval_dto: FieldValueDTO,
                           field: DocumentField,
                           _: Document) -> bool:
    if not fval_dto.annotations or not field.detect_limit_count:
        return False
    # "filter" annotations by detect_limit_count
    if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
        fval_dto.annotations = [d for d in fval_dto.annotations
                                if d.location_in_doc_start <= field.detect_limit_count]
        return not (not field.requires_text_annotations or fval_dto.annotations)
    # TODO: we can't calculate other text measuring units (sentence, paragraph, page)
    # with reasonable effort
    return False
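is_unit_limit_exceeded above keeps only the annotations that start within the first detect_limit_count characters and reports a problem when a field that requires text annotations ends up with none. The same filter, sketched on plain (value, start) tuples:

from typing import List, Tuple


def filter_by_char_limit(annotations: List[Tuple[str, int]],
                         detect_limit_count: int,
                         requires_text_annotations: bool) -> Tuple[List[Tuple[str, int]], bool]:
    # drop annotations starting beyond the character limit and report whether
    # a field that requires text annotations was left with none
    kept = [a for a in annotations if a[1] <= detect_limit_count]
    return kept, requires_text_annotations and not kept


print(filter_by_char_limit([('a', 10), ('b', 5000)], 1000, True))  # ([('a', 10)], False)
print(filter_by_char_limit([('b', 5000)], 1000, True))             # ([], True)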
Example #16
 def get_value(
         self, log, field: DocumentField, doc: Document,
         cur_field_code_to_value: Dict[str,
                                       Any]) -> Optional[FieldValueDTO]:
     for curx, amt, start, end in \
         CurrencyUsage.objects.filter(text_unit__document_id=doc.pk) \
             .order_by('-amount') \
             .values_list('currency', 'amount',
                          'text_unit__location_start', 'text_unit__location_end'):
         # values_list() yields tuples for unpacking; values() would yield dicts
         v = TypedField.by(field).field_value_python_to_json((curx, amt))
         return FieldValueDTO(field_value=v)
     return None
Example #17
    def get_value(
            self, log, field: DocumentField, doc: Document,
            cur_field_code_to_value: Dict[str,
                                          Any]) -> Optional[FieldValueDTO]:

        party_query = PartyUsage.objects.filter(text_unit__document_id=doc.pk)
        party_values = party_query.values_list('party__name',
                                               'text_unit__location_start',
                                               'text_unit__location_end')
        party_names = set()
        for name, start, end in party_values:
            party_names.add(name)

        names = ', '.join(party_names)
        value = TypedField.by(field).field_value_python_to_json(names)
        return FieldValueDTO(field_value=value) if names else None
Example #18
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        # If changing this code make sure you update similar code in notebooks/demo/Train and Debug Decision Tree...
        detected_value = cls.maybe_detect_with_stop_words(
            field, field_code_to_value)
        if detected_value is not None:
            return detected_value

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)  # type: ClassifierModel
            obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

            model = obj['model']
            categories = obj['categories']

            category_probabilities = model.predict_proba([field_code_to_value])[0]

            target_index = max(range(len(category_probabilities)),
                               key=category_probabilities.__getitem__)
            target_probability = category_probabilities[target_index]

            predicted_value = categories[target_index] \
                if 0 <= target_index < len(categories) else None

            if predicted_value is None:
                target_name = field.unsure_choice_value
            else:
                threshold = (field.unsure_thresholds_by_value or {}).get(predicted_value) \
                            or DocumentField.DEFAULT_UNSURE_THRESHOLD

                target_name = predicted_value if target_probability >= threshold else field.unsure_choice_value

            value = TypedField.by(field).field_value_python_to_json(
                target_name)
            if classifier_model.store_suggestion:
                store_classification_suggestion(field, doc, value,
                                                target_probability)

            return FieldValueDTO(field_value=value)

        except ClassifierModel.DoesNotExist as e:
            raise e
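Example #18 picks the most probable category and accepts it only when its probability clears the per-value threshold, falling back to the unsure choice otherwise. A self-contained sketch of that decision; the 0.9 default is an assumption standing in for DocumentField.DEFAULT_UNSURE_THRESHOLD:

from typing import Dict, List, Optional

DEFAULT_UNSURE_THRESHOLD = 0.9  # assumed default value


def choose_category(probabilities: List[float],
                    categories: List[Optional[str]],
                    unsure_choice_value: str = 'unsure',
                    thresholds_by_value: Optional[Dict[str, float]] = None) -> str:
    target_index = max(range(len(probabilities)), key=probabilities.__getitem__)
    predicted = categories[target_index] if 0 <= target_index < len(categories) else None
    if predicted is None:
        return unsure_choice_value
    threshold = (thresholds_by_value or {}).get(predicted) or DEFAULT_UNSURE_THRESHOLD
    return predicted if probabilities[target_index] >= threshold else unsure_choice_value


print(choose_category([0.2, 0.75, 0.05], ['a', 'b', 'c'], thresholds_by_value={'b': 0.7}))  # b
print(choose_category([0.2, 0.75, 0.05], ['a', 'b', 'c'], thresholds_by_value={'b': 0.8}))  # unsure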
Example #19
 def _maybe_save_reverse_similarity_value(self,
                                          log: ProcessLogger,
                                          field: DocumentField,
                                          document: Document,
                                          other_doc_id) -> bool:
     field_repo = DocumentFieldRepository()
     if not field_repo.field_value_exists(other_doc_id, field.pk, [document.pk]):
         other_document = Document.all_objects.get(pk=other_doc_id)
         field_repo.update_field_value_with_dto(document=other_document,
                                                field=field,
                                                field_value_dto=FieldValueDTO(field_value=[document.pk]),
                                                merge=True)
         cache_document_fields(log=log,
                               document=other_document,
                               cache_system_fields=False,
                               cache_generic_fields=False,
                               cache_user_fields=[field.code])
Example #20
    def get_value(self, log: ProcessLogger, field: DocumentField, doc: Document,
                  cur_field_code_to_value: Dict[str, Any]) \
            -> Optional[FieldValueDTO]:
        typed_field = TypedField.by(field)  # type: TypedField
        if typed_field.multi_value:
            raise Exception(f'Python coded field {self.__class__.__name__} supports only single-value field types and '
                            f'{typed_field.type_code} is multi-value')

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():  # type: TextUnit
            found, value = self.find_value_in_text_unit(log, field, doc, text_unit)
            if found:
                value = typed_field.field_value_python_to_json(value)
                ant = AnnotationDTO(annotation_value=value,
                                    location_in_doc_start=text_unit.location_start,
                                    location_in_doc_end=text_unit.location_end)
                # use a list for annotations, consistent with the other examples
                return FieldValueDTO(field_value=value, annotations=[ant])
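Example #20 delegates matching to find_value_in_text_unit and wraps the first hit into a FieldValueDTO with a single annotation. A hypothetical keyword-based implementation of that hook, written without the framework classes but keeping the (found, value) contract:

from typing import Any, Optional, Tuple


def find_value_in_text_unit(text: str,
                            keyword: str = 'governed by') -> Tuple[bool, Optional[Any]]:
    # hypothetical matcher: take the remainder of the text unit after the
    # keyword as the detected value, or report that nothing was found
    idx = text.lower().find(keyword)
    if idx < 0:
        return False, None
    return True, text[idx + len(keyword):].strip(' .')


print(find_value_in_text_unit('This Agreement is governed by Delaware law.'))
# (True, 'Delaware law')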
Example #21
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        text_unit_repo = cls.text_unit_repo
        field_detector_repo = cls.field_detector_repo
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        typed_field = TypedField.by(field)  # type: TypedField
        ants = list()  # type: List[AnnotationDTO]
        units_counted = 0

        for text_unit in qs_text_units:  # type: TextUnit
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            for field_detector in detectors:
                try:
                    matching_piece = field_detector.matching_string(
                        text_unit.textunittext.text,
                        text_is_sentence=text_unit.is_sentence())
                    if matching_piece is not None:
                        if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                            if field.detect_limit_count < units_counted + matching_piece[1]:
                                continue
                        matching_string = matching_piece[0]
                        value = field_detector.get_validated_detected_value(
                            field)
                        hint_name = None
                        if typed_field.requires_value:
                            hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                            try:
                                value, hint_name = typed_field \
                                    .get_or_extract_value(doc,
                                                          value,
                                                          hint_name,
                                                          matching_string)
                            except Exception as e:
                                raise ValueExtractionFunctionThrownException(
                                    f'Value extraction function has thrown an exception.\n'
                                    f'Document: {doc.name} (#{doc.pk})\n'
                                    f'Value: {value}\n'
                                    f'Extraction hint: {hint_name}\n'
                                    f'Matching string:\n'
                                    f'{matching_string}') from e
                            if value is None:
                                continue

                        annotation_value = typed_field.annotation_value_python_to_json(
                            value)
                        ant = AnnotationDTO(
                            annotation_value=annotation_value,
                            location_in_doc_start=text_unit.location_start,
                            location_in_doc_end=text_unit.location_end,
                            extraction_hint_name=hint_name)

                        if not isinstance(typed_field, MultiValueField):
                            return FieldValueDTO(
                                field_value=ant.annotation_value,
                                annotations=[ant])
                        else:
                            ants.append(ant)
                except Exception as e:
                    raise CaughtErrorWhileApplyingFieldDetector(
                        f'Exception caught while trying to apply field detector.\n'
                        f'Document: {doc.name} (#{doc.pk})\n'
                        f'Field detector: #{field_detector.detector.pk}\n'
                        f'{field_detector.detector.include_regexps}\n'
                        f'Text unit: #{text_unit.pk}\n'
                        f'{text_unit.text[:300]}') from e

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
Example #22
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        ants: List[AnnotationDTO] = []
        typed_field: TypedField = TypedField.by(field)
        text_unit_repo = cls.text_unit_repo

        if field.mlflow_detect_on_document_level:
            log.debug(
                'detect_field_value: mlflow_field_detection on doc level, ' +
                f'field {field.code}({field.pk}), document #{doc.pk}')
            text = doc.text
            model_input = dict(field_code_to_value)
            model_input['text'] = text
            model_input_df = pd.DataFrame([model_input])
            model_output = MLFlowModelManager().predict(
                field.mlflow_model_uri, model_input_df)
            value = model_output[0]
            if not value:
                return None

            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field \
                .get_or_extract_value(doc, value, hint_name, text)

            if not value:
                return None
            return FieldValueDTO(field_value=value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        log.debug(
            'detect_field_value: mlflow_field_detection on text unit level, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')

        for text_unit in qs_text_units.iterator():
            model_input = dict(field_code_to_value)
            model_input['text'] = text_unit.text
            model_input_df = pd.DataFrame([model_input])
            model_output = MLFlowModelManager().predict(
                field.mlflow_model_uri, model_input_df)
            value = model_output[0]
            if value is None:
                continue
            ant = None

            if typed_field.requires_value:
                # For the field types expecting a value the mlflow model must return either a value or None.
                hint_name = ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = typed_field.get_or_extract_value(
                    doc, value, hint_name, text_unit.text)
                if not typed_field.is_python_annotation_value_ok(value):
                    raise ValueError(
                        f'ML model of field {field.code} ({typed_field.type_code}) returned '
                        f'annotation value not suitable for this field:\n'
                        f'{value}')

                annotation_value = typed_field.annotation_value_python_to_json(
                    value)
                ant = AnnotationDTO(
                    annotation_value=annotation_value,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=hint_name)
            elif value:
                # For the related-info fields the mlflow model must return 0 or 1
                # where 1 means the text unit matches the field.
                ant = AnnotationDTO(
                    annotation_value=None,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=None)

            if ant is None:
                continue
            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value,
                                     annotations=ants)

        if not ants:
            return None

        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
Example #23
    def get_value(self,
                  log: ProcessLogger,
                  field: DocumentField,
                  doc: Document,
                  cur_field_code_to_value: Dict[str, Any],
                  location_text: Optional[str],
                  location_start: int = 0,
                  location_end: int = 0) -> Optional[FieldValueDTO]:

        try:
            conf = getattr(
                field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: Optional[DocumentSimilarityConfig]
        except DocumentSimilarityConfig.DoesNotExist:
            conf = None

        if conf:
            conf.self_validate()

        similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
        feature_vector_fields = field.depends_on_fields.all()
        date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
        date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS
        document_type = doc.document_type

        feature_vector_field_codes = {f.code for f in feature_vector_fields}

        doc_field_values = dict(cur_field_code_to_value)
        doc_field_values[FIELD_CODE_DOC_ID] = doc.pk

        if date_constraint_field_code:
            doc_date = doc_field_values.get(date_constraint_field_code)
            date_start = doc_date - timedelta(days=date_constraint_days)
            date_end = doc_date + timedelta(days=date_constraint_days)

            doc_ids_query = FieldValue.objects \
                .filter(field__code=date_constraint_field_code) \
                .filter(value__gte=date_start) \
                .filter(value__lte=date_end) \
                .filter(document__document_type_id=document_type.pk) \
                .exclude(document_id=doc.pk) \
                .values_list('document_id', flat=True)
        else:
            doc_date = doc.history.last().history_date
            date_start = doc_date - timedelta(days=date_constraint_days)
            date_end = doc_date + timedelta(days=date_constraint_days)

            doc_ids_query = Document.history \
                .filter(history_type='+',
                        history_date__gte=date_start,
                        history_date__lte=date_end,
                        document_type_id=document_type.pk) \
                .exclude(id=doc.pk) \
                .values_list('pk', flat=True)

        try:
            vectorizer = document_feature_vector_pipeline(
                feature_vector_fields, use_field_codes=True)

            field_repo = DocumentFieldRepository()

            field_values_list = list()

            for doc_id, field_values in field_repo \
                    .get_field_code_to_python_value_multiple_docs(document_type_id=document_type.pk,
                                                                  doc_ids=doc_ids_query,
                                                                  field_codes_only=feature_vector_field_codes):
                d = dict(field_values)
                d[FIELD_CODE_DOC_ID] = doc_id
                field_values_list.append(d)

            if not field_values_list:
                return None

            field_values_list = [doc_field_values] + field_values_list
            feature_vectors = vectorizer.fit_transform(field_values_list)
            doc_feature_vectors = feature_vectors[0]
        except ValueError as ve:
            if 'empty vocabulary' in str(ve):
                log.info(
                    f'Similarity: {field.code}: Vectorization got "empty vocabulary" probably no one of the docs '
                    f'contains any value in the feature vector fields.')
                return None
            raise ve

        similarities = cosine_similarity(doc_feature_vectors, feature_vectors)

        # TODO: Think about removing usage of other_field_values_list here and switching it to generator
        # to avoid storing the list of all field values. We only need feature vectors but they have no doc id.
        res = set()  # type: Set[int]
        for y, field_values in enumerate(field_values_list):
            other_doc_pk = field_values[FIELD_CODE_DOC_ID]
            if doc.pk == other_doc_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            res.add(other_doc_pk)
            self._maybe_save_reverse_similarity_value(
                log=log, field=field, document=doc, other_doc_id=other_doc_pk)

        if res:
            field_value = sorted(res)[0]
            return FieldValueDTO(field_value)
        return None
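Example #23 vectorizes the current document together with the candidate documents and keeps the candidates whose cosine similarity to row 0 reaches the threshold. A dependency-free sketch of that comparison on plain feature lists; the 0.75 threshold is only an illustrative stand-in for DEFAULT_SIMILARITY_TRESHOLD:

from math import sqrt
from typing import Dict, List


def cosine(a: List[float], b: List[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = sqrt(sum(x * x for x in a)) * sqrt(sum(x * x for x in b))
    return dot / norm if norm else 0.0


def similar_doc_ids(doc_vector: List[float],
                    other_vectors: Dict[int, List[float]],
                    similarity_threshold: float = 0.75) -> List[int]:
    # keep the documents whose feature vectors are close enough to the current one
    return sorted(pk for pk, vec in other_vectors.items()
                  if cosine(doc_vector, vec) >= similarity_threshold)


print(similar_doc_ids([1.0, 0.0, 1.0], {101: [1.0, 0.1, 0.9], 102: [0.0, 1.0, 0.0]}))  # [101]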
Example #24
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        typed_field = TypedField.by(field)  # type: TypedField

        if field.mlflow_detect_on_document_level:
            text = doc.text
            model_input = dict(field_code_to_value)
            model_input['text'] = text
            model_input_df = pd.DataFrame([model_input])
            model_output = MLFlowModelManager().predict(
                field.mlflow_model_uri, model_input_df)
            value = model_output[0]
            if not value:
                return None

            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field \
                .get_or_extract_value(doc, value, hint_name, text)

            if not value:
                return None
            return FieldValueDTO(field_value=value)

        ants = list()  # type: List[AnnotationDTO]

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .select_related('textunittext') \
            .order_by('location_start', 'pk')

        units_counted = 0
        for text_unit in qs_text_units.iterator():
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            model_input = dict(field_code_to_value)
            model_input['text'] = text_unit.text
            model_input_df = pd.DataFrame([model_input])
            model_output = MLFlowModelManager().predict(
                field.mlflow_model_uri, model_input_df)

            value = model_output[0]

            if value is None:
                continue

            ant = None

            if typed_field.requires_value:
                # For the field types expecting a value the mlflow model must return either a value or None.
                hint_name = ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = typed_field \
                    .get_or_extract_value(doc, value, hint_name, text_unit.text)
                if not typed_field.is_python_annotation_value_ok(value):
                    raise ValueError(
                        f'ML model of field {field.code} ({typed_field.type_code}) returned '
                        f'annotation value not suitable for this field:\n'
                        f'{value}')

                annotation_value = typed_field.annotation_value_python_to_json(
                    value)
                ant = AnnotationDTO(
                    annotation_value=annotation_value,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=hint_name)
            elif value:
                # For the related-info fields the mlflow model must return 0 or 1
                # where 1 means the text unit matches the field.
                ant = AnnotationDTO(
                    annotation_value=None,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=None)

            if ant is None:
                continue
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                if ant.location_in_doc_start > field.detect_limit_count:
                    break

            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value,
                                     annotations=ants)

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
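Both MLflow-based examples build a one-row pandas DataFrame from the dependent field values plus a 'text' column and pass it to the model. A sketch of that input construction with a dummy predictor in place of MLFlowModelManager().predict:

import pandas as pd


class DummyModel:
    # stand-in for whatever MLFlowModelManager().predict() calls under the hood
    def predict(self, df: pd.DataFrame):
        return [1 if 'termination' in t.lower() else None for t in df['text']]


def build_model_input(field_code_to_value: dict, text: str) -> pd.DataFrame:
    model_input = dict(field_code_to_value)
    model_input['text'] = text
    return pd.DataFrame([model_input])  # one row: the dependent values plus 'text'


model = DummyModel()
df = build_model_input({'contract_type': 'NDA'}, 'Termination for convenience...')
print(model.predict(df))  # [1]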