def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value by delegating to the python-coded field registered for ``field``.

        :param log: logger used for the debug trace; also passed to the python-coded field
        :param doc: document the value is detected for
        :param field: field being detected; ``field.python_coded_field`` is the registry key
        :param field_code_to_value: current values of other fields, passed to the python-coded
            field as ``cur_field_code_to_value``
        :return: the DTO returned by the python-coded field, or None if it returned nothing
        :raises RuntimeError: if the python-coded field is not registered or its type differs
            from the type of ``field``
        :raises ValueError: if the returned value does not pass the typed field's JSON value check
        """
        python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not python_coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        typed_field = TypedField.by(field)  # type: TypedField

        if python_coded_field.type != typed_field.type_code:
            raise RuntimeError(
                f'Python-coded field {python_coded_field.__class__.__name__} is '
                f'for fields of type {python_coded_field.type} and field {field.code} '
                f'is of type {typed_field.type_code}')

        log.debug('detect_field_value: python_coded_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        field_value_dto = python_coded_field.get_value(
            log=log,
            field=field,
            doc=doc,
            cur_field_code_to_value=field_code_to_value)

        # The declared return type is Optional: treat "nothing returned" as "nothing detected"
        # instead of failing with AttributeError on `.field_value` below.
        if field_value_dto is None:
            return None

        if not typed_field.is_json_field_value_ok(field_value_dto.field_value):
            raise ValueError(
                f'Python coded field class {field.python_coded_field} returned value not suitable for '
                f'field {field.code} ({typed_field.type_code})')
        return field_value_dto
Exemple #2
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
        """
        Calculate the field value with the formula configured for ``field``.

        Only the values of the fields listed by ``field.get_depends_on_codes()`` are passed to
        the formula. If the field has stop words, those values are joined into one text and
        checked by ``detect_with_stop_words_by_field_and_full_text`` first; when that check
        reports a detection its value is returned and the formula is not evaluated.

        :param log: logger used for the debug trace
        :param doc: document the value is detected for
        :param field: field being detected
        :param field_code_to_value: current values of other fields keyed by field code
        :return: DTO with the JSON form of the detected value
        :raises ValueError: if the field has no formula or the formula result does not pass
            the typed field's python value check
        """
        formula = field.formula

        if not formula:
            raise ValueError(
                f'No formula specified for field {field.code} (#{field.uid})')

        depends_on_field_codes = field.get_depends_on_codes() or set()

        # Narrow the inputs down to the declared dependencies of this field.
        field_code_to_value = {
            c: v
            for c, v in field_code_to_value.items()
            if c in depends_on_field_codes
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            log.debug(
                'detect_field_value: formula_based_field_detection, checking stop words, '
                + f'field {field.code}({field.pk}), document #{doc.pk}')
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field,
                                                                doc,
                                                                depends_on_full_text)
            if detected_with_stop_words:
                # Return the declared DTO type (same as the other detection strategies do)
                # rather than the raw value, which callers cannot read `.field_value` from.
                return FieldValueDTO(field_value=detected_values)
        else:
            log.debug('detect_field_value: formula_based_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')

        v = cls.calc_formula(field_code=field.code,
                             formula=formula,
                             depends_on_field_to_value=field_code_to_value,
                             convert_decimals_to_floats=field.
                             convert_decimals_to_floats_in_formula_args)
        typed_field = TypedField.by(field)

        # We don't accept formulas returning values of wrong type to avoid further confusion and
        # creating wrong formulas in future.
        # For example for multi-choice fields the formula should return a list and not a string
        # to ensure the admin understands that this value will replace the whole set/list of strings and not
        # just add one more string to the value.
        if typed_field.is_choice_field and typed_field.multi_value \
                and v and isinstance(v, str):
            # "outdated" formula is incorrect and returns string instead of
            # set / list, but we don't warn user: when he updates this formula
            # (or other detection method) he'll be forced to write code, returning
            # list or set.
            v = [v]

        if not typed_field.is_python_field_value_ok(v):
            raise ValueError(
                f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
            )
        v = typed_field.field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
Exemple #3
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value by running the field's trained classifier over the text units.

        The stop-words check on the document full text runs first and wins if it reports a
        detection. Otherwise every text unit of the field's unit type (after
        ``reduce_textunits_by_detection_limit``) is passed to ``predict_and_extract_value``.
        For a field that is not a ``MultiValueField`` the first annotation found is returned
        immediately; for multi-value fields all annotations are collected and merged.

        :return: DTO with the value and its annotations, or None if nothing was found
        :raises ClassifierModel.DoesNotExist: logged and re-raised when the field has no model
        """
        log.debug(
            'detect_field_value: regexps_and_text_based_ml_field_value, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')

        found: List[AnnotationDTO] = []
        repo = cls.text_unit_repo
        full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)

        stop_words_hit, stop_words_value = \
            detect_with_stop_words_by_field_and_full_text(field, full_text)
        if stop_words_hit:
            return FieldValueDTO(field_value=stop_words_value)

        text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            repo.get_doc_text_units(doc, field.text_unit_type), field)

        try:
            trained_model = ClassifierModel.objects \
                .get(document_field=field) \
                .get_trained_model_obj()

            for unit in text_units.iterator():  # type: TextUnit
                ant = cls.predict_and_extract_value(
                    sklearn_model=trained_model,
                    typed_field=typed_field,
                    document=doc,
                    field=field,
                    text=unit.text,
                    location_start=unit.location_start,
                    location_end=unit.location_end)
                if ant is not None:
                    found.append(ant)
                    # Single-value fields are satisfied by the first hit.
                    if not isinstance(typed_field, MultiValueField):
                        return FieldValueDTO(field_value=ant.annotation_value,
                                             annotations=found)

            if found:
                json_values = [a.annotation_value for a in found]
                merged = typed_field.build_json_field_value_from_json_ant_values(
                    json_values)
                return FieldValueDTO(field_value=merged, annotations=found)
            return None

        except ClassifierModel.DoesNotExist as e:
            log.info(
                f'Classifier model does not exist for field: {field.code}')
            raise e
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value with the detectors cached for the field in ``detecting_cache``.

        Each detector is applied to the document text returned by ``get_document_text``.
        For a multi-choice field all matches are collected; for any other field type the
        first match is returned right away.

        :return: DTO with the value(s) and annotation(s), or None if there are no detectors
            or none of them matched
        """
        log.debug('detect_field_value: csv_regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')

        def report_error(msg, er):
            # Error callback handed to the detectors cache.
            return log.error(msg, field_code=field.code, exc_info=er)

        detectors = cls.detecting_cache.get_detectors(field.pk, report_error)
        if not detectors:
            return None

        is_multichoice = field.type == MultiChoiceField.type_code
        doc_text = cls.get_document_text(doc)

        collected = []

        for detector in detectors:
            match = detector.find_value(doc_text)
            if not match:
                continue

            # TODO: implement reading values from full text (TextParts.FULL.value)
            # as it is done now, or from text units - paragraphs or sentences
            # based on field.text_unit_type - for other detector.text_part options
            """            
            if detector.text_part == TextParts.BEFORE_REGEXP.value:
                return matching_string[:begin], 0, begin
            elif detector.text_part == TextParts.AFTER_REGEXP.value:
                return matching_string[end:], end, len(text)
            elif detector.text_part == TextParts.INSIDE_REGEXP.value:
                return matching_string[begin:end], begin, end
            else:
                return text, 0, len(text)
            """

            # starting position has to be shifted backward by 1 symbol for FE
            ant = AnnotationDTO(annotation_value=match[0],
                                location_in_doc_start=max(match[1] - 1, 0),
                                location_in_doc_end=match[2],
                                extraction_hint_name='')
            if is_multichoice:
                collected.append(ant)
                continue
            return FieldValueDTO(field_value=match[0], annotations=[ant])

        if not collected:
            return None
        return FieldValueDTO(
            field_value=[a.annotation_value for a in collected],
            annotations=collected)
Exemple #5
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value by applying the field's detectors to the document text units.

        The stop-words check on the document full text runs first and wins if it reports a
        detection. Otherwise the text units of the field's unit type (after
        ``reduce_textunits_by_detection_limit``) are scanned in order with
        ``extract_from_textunit``. For a field that is not a ``MultiValueField`` the first
        text unit producing annotations decides the value; for multi-value fields the
        annotations of all text units are collected and merged into one value.

        :param log: logger used for the debug trace
        :param doc: document the value is detected for
        :param field: field being detected
        :param field_code_to_value: not used by this strategy
        :return: DTO with the value and its annotations, or None if nothing was found
        """
        try:
            log.debug('detect_field_value: regexps_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
        except AttributeError:
            # NOTE(review): looks like deliberate best-effort logging for callers that pass
            # a log object without `debug` (or None) - confirm before tightening.
            pass

        ants: List[AnnotationDTO] = []
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)
        text_unit_repo: TextUnitRepository = cls.text_unit_repo
        field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        is_multi_value = isinstance(typed_field, MultiValueField)

        for text_unit in qs_text_units:
            unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
            if not unit_ants:
                continue
            if not is_multi_value:
                # Single-value field: the first text unit with a match decides the value.
                return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                     annotations=unit_ants)
            ants += unit_ants

        if not ants:
            return None

        # Only multi-value fields can get here: single-value fields either returned from
        # inside the loop or collected nothing.
        field_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants])
        return FieldValueDTO(field_value=field_value, annotations=ants)
Exemple #6
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value by matching the field's detectors against the document text units.

        The stop-words check on the document full text runs first and wins if it reports a
        detection. Otherwise every text unit of the field's unit type is tested against every
        detector of the field, honoring ``field.detect_limit_count`` when it is set.
        For a field that is not a ``MultiValueField`` the first annotation produced is returned
        immediately; for multi-value fields all annotations are collected and merged.

        :param log: logger used for the debug trace
        :param doc: document the value is detected for
        :param field: field being detected
        :param field_code_to_value: not used by this strategy
        :return: DTO with the value and its annotations, or None if nothing was found
        :raises CaughtErrorWhileApplyingFieldDetector: wraps any exception raised while applying
            a detector to a text unit (including ValueExtractionFunctionThrownException, which
            in turn wraps failures of ``typed_field.get_or_extract_value``)
        """
        text_unit_repo = cls.text_unit_repo
        field_detector_repo = cls.field_detector_repo
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        typed_field = TypedField.by(field)  # type: TypedField
        ants = list()  # type: List[AnnotationDTO]
        units_counted = 0

        for text_unit in qs_text_units:  # type: TextUnit
            # Stop scanning once the configured detection limit is exceeded.
            # NOTE(review): how update_units_counted advances the counter per limit unit is
            # defined elsewhere - confirm against FieldDetectionStrategy.
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            for field_detector in detectors:
                try:
                    matching_piece = field_detector.matching_string(
                        text_unit.textunittext.text,
                        text_is_sentence=text_unit.is_sentence())
                    if matching_piece is not None:
                        # With a character-based limit, skip a match when the counted
                        # characters plus matching_piece[1] go beyond the limit
                        # (presumably matching_piece[1] is the match start offset - TODO confirm).
                        if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                            if field.detect_limit_count < units_counted + matching_piece[
                                    1]:
                                continue
                        matching_string = matching_piece[0]
                        value = field_detector.get_validated_detected_value(
                            field)
                        hint_name = None
                        if typed_field.requires_value:
                            # Fall back to TAKE_FIRST when the detector has no extraction hint.
                            hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                            try:
                                value, hint_name = typed_field \
                                    .get_or_extract_value(doc,
                                                          value,
                                                          hint_name,
                                                          matching_string)
                            except Exception as e:
                                # Re-raise with the extraction context attached; the outer
                                # handler below wraps it once more with detector details.
                                raise ValueExtractionFunctionThrownException(
                                    f'Value extraction function has thrown an exception.\n'
                                    f'Document: {doc.name} (#{doc.pk})\n'
                                    f'Value: {value}\n'
                                    f'Extraction hint: {hint_name}\n'
                                    f'Matching string:\n'
                                    f'{matching_string}') from e
                            # A value-requiring field with no extracted value yields no annotation.
                            if value is None:
                                continue

                        annotation_value = typed_field.annotation_value_python_to_json(
                            value)
                        # The annotation spans the whole text unit, not just the matched piece.
                        ant = AnnotationDTO(
                            annotation_value=annotation_value,
                            location_in_doc_start=text_unit.location_start,
                            location_in_doc_end=text_unit.location_end,
                            extraction_hint_name=hint_name)

                        # Single-value fields are satisfied by the first annotation.
                        if not isinstance(typed_field, MultiValueField):
                            return FieldValueDTO(
                                field_value=ant.annotation_value,
                                annotations=[ant])
                        else:
                            ants.append(ant)
                except Exception as e:
                    raise CaughtErrorWhileApplyingFieldDetector(
                        f'Exception caught while trying to apply field detector.\n'
                        f'Document: {doc.name} (#{doc.pk})\n'
                        f'Field detector: #{field_detector.detector.pk}\n'
                        f'{field_detector.detector.include_regexps}\n'
                        f'Text unit: #{text_unit.pk}\n'
                        f'{text_unit.text[:300]}') from e

            # For a character-based limit the length of the processed text unit is added
            # to the counter after all detectors have been applied to it.
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        return FieldValueDTO(field_value=typed_field.
                             build_json_field_value_from_json_ant_values(
                                 [a.annotation_value for a in ants]),
                             annotations=ants)
    def detect_field_value(cls,
                           log: ProcessLogger,
                           doc: Document,
                           field: DocumentField,
                           field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value by running the field's trained classifier over the text units.

        The stop-words check on the document full text runs first and wins if it reports a
        detection. Otherwise the document's text units of the field's unit type are visited
        ordered by location and passed to ``predict_and_extract_value``, honoring
        ``field.detect_limit_count`` when it is set. For a field that is not a
        ``MultiValueField`` the first annotation found is returned immediately.

        :return: DTO with the value and its annotations, or None if nothing was found
        :raises ClassifierModel.DoesNotExist: logged and re-raised when the field has no model
        """
        stop_words_hit, stop_words_value = detect_with_stop_words_by_field_and_full_text(
            field, doc.full_text)
        if stop_words_hit:
            return FieldValueDTO(field_value=stop_words_value)

        try:
            log.debug('detect_field_value: regexps_and_text_based_ml_field_value, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
            trained_model = ClassifierModel.objects \
                .get(document_field=field) \
                .get_trained_model_obj()
            typed_field = TypedField.by(field)  # type: TypedField

            found = list()  # type: List[AnnotationDTO]

            text_units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            counted = 0
            for unit in text_units.iterator():
                if field.detect_limit_count:
                    counted = FieldDetectionStrategy.update_units_counted(
                        field, counted, unit)
                    if counted > field.detect_limit_count:
                        break

                ant = cls.predict_and_extract_value(sklearn_model=trained_model,
                                                    typed_field=typed_field,
                                                    document=doc,
                                                    field=field,
                                                    text_unit=unit)
                if ant is None:
                    continue

                # True only when a limit is set and it is measured in characters.
                char_limited = field.detect_limit_count \
                    and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR
                if char_limited and ant.location_in_doc_start > field.detect_limit_count:
                    break

                found.append(ant)
                # Single-value fields are satisfied by the first hit.
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value, annotations=found)

                if char_limited:
                    counted += len(unit.text)

            if found:
                merged = typed_field.build_json_field_value_from_json_ant_values(
                    [a.annotation_value for a in found])
                return FieldValueDTO(field_value=merged, annotations=found)
            return None

        except ClassifierModel.DoesNotExist as e:
            log.info(f'Classifier model does not exist for field: {field.code}')
            raise e
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value with the MLflow model referenced by ``field.mlflow_model_uri``.

        The model input is a one-row DataFrame built from ``field_code_to_value`` plus a
        ``text`` column. With ``field.mlflow_detect_on_document_level`` set the whole document
        text is sent once; otherwise every text unit of the field's unit type (after
        ``reduce_textunits_by_detection_limit``) is sent separately and annotations are built
        from the outputs. For a field that is not a ``MultiValueField`` the first annotation
        is returned immediately.

        :return: DTO with the detected value (and annotations on text unit level), or None
        :raises ValueError: if the model output is not a suitable annotation value for the field
        """
        found: List[AnnotationDTO] = []
        typed_field: TypedField = TypedField.by(field)
        repo = cls.text_unit_repo

        def run_model(text):
            # Feed one row (current field values + the text) to the model, take the first output.
            features = dict(field_code_to_value)
            features['text'] = text
            features_df = pd.DataFrame([features])
            return MLFlowModelManager().predict(
                field.mlflow_model_uri, features_df)[0]

        if field.mlflow_detect_on_document_level:
            log.debug(
                'detect_field_value: mlflow_field_detection on doc level, ' +
                f'field {field.code}({field.pk}), document #{doc.pk}')
            doc_text = doc.text
            value = run_model(doc_text)
            if not value:
                return None

            value, _hint = typed_field.get_or_extract_value(
                doc, value, ValueExtractionHint.TAKE_FIRST.name, doc_text)
            return FieldValueDTO(field_value=value) if value else None

        text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            repo.get_doc_text_units(doc, field.text_unit_type), field)

        log.debug(
            'detect_field_value: mlflow_field_detection on text unit level, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')

        for unit in text_units.iterator():
            value = run_model(unit.text)
            if value is None:
                continue

            if typed_field.requires_value:
                # For the field types expecting a value the mlflow model must return either a value or None.
                value, hint_name = typed_field.get_or_extract_value(
                    doc, value, ValueExtractionHint.TAKE_FIRST.name, unit.text)
                if not typed_field.is_python_annotation_value_ok(value):
                    raise ValueError(
                        f'ML model of field {field.code} ({typed_field.type_code}) returned '
                        f'annotation value not suitable for this field:\n'
                        f'{value}')

                ant = AnnotationDTO(
                    annotation_value=typed_field.annotation_value_python_to_json(value),
                    location_in_doc_start=unit.location_start,
                    location_in_doc_end=unit.location_end,
                    extraction_hint_name=hint_name)
            elif value:
                # For the related-info fields the mlflow model must return 0 or 1
                # where 1 means the text unit matches the field.
                ant = AnnotationDTO(
                    annotation_value=None,
                    location_in_doc_start=unit.location_start,
                    location_in_doc_end=unit.location_end,
                    extraction_hint_name=None)
            else:
                continue

            found.append(ant)
            # Single-value fields are satisfied by the first hit.
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value,
                                     annotations=found)

        if found:
            merged = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in found])
            return FieldValueDTO(field_value=merged, annotations=found)
        return None
Exemple #9
0
    def detect_field_value(cls,
                           log: ProcessLogger,
                           doc: Document,
                           field: DocumentField,
                           field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the field value with the MLflow model referenced by ``field.mlflow_model_uri``.

        The model input is a one-row DataFrame built from ``field_code_to_value`` plus a
        ``text`` column. With ``field.mlflow_detect_on_document_level`` set the whole document
        text is sent once; otherwise the document's text units of the field's unit type are
        visited ordered by location, honoring ``field.detect_limit_count`` when it is set,
        and annotations are built from the model outputs. For a field that is not a
        ``MultiValueField`` the first annotation is returned immediately.

        :return: DTO with the detected value (and annotations on text unit level), or None
        :raises ValueError: if the model output is not a suitable annotation value for the field
        """
        typed_field = TypedField.by(field)  # type: TypedField

        def run_model(text):
            # Feed one row (current field values + the text) to the model, take the first output.
            features = dict(field_code_to_value)
            features['text'] = text
            features_df = pd.DataFrame([features])
            return MLFlowModelManager().predict(field.mlflow_model_uri, features_df)[0]

        if field.mlflow_detect_on_document_level:
            log.debug('detect_field_value: mlflow_field_detection on doc level, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
            doc_text = doc.text
            value = run_model(doc_text)
            if not value:
                return None

            value, _hint = typed_field.get_or_extract_value(
                doc, value, ValueExtractionHint.TAKE_FIRST.name, doc_text)
            return FieldValueDTO(field_value=value) if value else None

        found = list()  # type: List[AnnotationDTO]

        text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .select_related('textunittext') \
            .order_by('location_start', 'pk') \
            .defer('textunittext__text_tsvector')

        counted = 0
        log.debug('detect_field_value: mlflow_field_detection on text unit level, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        for unit in text_units.iterator():
            if field.detect_limit_count:
                counted = FieldDetectionStrategy.update_units_counted(
                    field, counted, unit)
                if counted > field.detect_limit_count:
                    break

            value = run_model(unit.text)
            if value is None:
                continue

            if typed_field.requires_value:
                # For the field types expecting a value the mlflow model must return either a value or None.
                value, hint_name = typed_field.get_or_extract_value(
                    doc, value, ValueExtractionHint.TAKE_FIRST.name, unit.text)
                if not typed_field.is_python_annotation_value_ok(value):
                    raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                                     f'annotation value not suitable for this field:\n'
                                     f'{value}')

                ant = AnnotationDTO(
                    annotation_value=typed_field.annotation_value_python_to_json(value),
                    location_in_doc_start=unit.location_start,
                    location_in_doc_end=unit.location_end,
                    extraction_hint_name=hint_name)
            elif value:
                # For the related-info fields the mlflow model must return 0 or 1
                # where 1 means the text unit matches the field.
                ant = AnnotationDTO(annotation_value=None,
                                    location_in_doc_start=unit.location_start,
                                    location_in_doc_end=unit.location_end,
                                    extraction_hint_name=None)
            else:
                continue

            # True only when a limit is set and it is measured in characters.
            char_limited = field.detect_limit_count \
                and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR
            if char_limited and ant.location_in_doc_start > field.detect_limit_count:
                break

            found.append(ant)
            # Single-value fields are satisfied by the first hit.
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value, annotations=found)

            if char_limited:
                counted += len(unit.text)

        if found:
            merged = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in found])
            return FieldValueDTO(field_value=merged, annotations=found)
        return None