예제 #1
0
    def predict_and_extract_value(cls, sklearn_model: SkLearnClassifierModel,
                                  typed_field: TypedField,
                                  document: Document,
                                  field: DocumentField,
                                  text_unit: TextUnit) -> Optional[AnnotationDTO]:
        """
        Classify one text unit and build an annotation if it is attributed to ``field``.

        Args:
            sklearn_model: Model handed to ``cls.predict_value``.
            typed_field: Typed wrapper used to extract, validate and serialize the value.
            document: Document passed through to the value extraction.
            field: Field whose ``code`` the predicted field code must equal.
            text_unit: Text unit being classified; its location bounds the annotation.

        Returns:
            An ``AnnotationDTO`` spanning the text unit, or ``None`` when the
            predicted field code differs from ``field.code``.

        Raises:
            ValueError: If the extracted value is rejected by
                ``typed_field.is_python_annotation_value_ok``.
        """
        predicted_code, raw_value, hint = cls.predict_value(sklearn_model, text_unit)

        # The prediction points at another field: nothing to annotate here.
        if predicted_code != field.code:
            return None

        if not typed_field.requires_value:
            # Field types without a value: the annotation only marks a location.
            return AnnotationDTO(annotation_value=None,
                                 location_in_doc_start=text_unit.location_start,
                                 location_in_doc_end=text_unit.location_end,
                                 extraction_hint_name=None)

        # Fall back to TAKE_FIRST when the prediction carries no hint.
        hint = hint or ValueExtractionHint.TAKE_FIRST.name
        extracted, hint = typed_field.get_or_extract_value(
            document, raw_value, hint, text_unit.text)

        if not typed_field.is_python_annotation_value_ok(extracted):
            raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                             f'annotation value not suitable for this field:\n'
                             f'{extracted}')

        return AnnotationDTO(
            annotation_value=typed_field.annotation_value_python_to_json(extracted),
            location_in_doc_start=text_unit.location_start,
            location_in_doc_end=text_unit.location_end,
            extraction_hint_name=hint)
 def pack_parsed_value(cls, typed_field: TypedField, value: Any,
                       loc_start: int, loc_end: int):
     """
     Wrap an already parsed value into a ``FieldValueDTO`` with one annotation.

     The value is converted with ``typed_field.field_value_python_to_json`` and
     the converted form is used both as the field value and as the value of
     the single annotation covering ``loc_start``..``loc_end``.
     """
     json_value = typed_field.field_value_python_to_json(value)
     annotation = AnnotationDTO(
         annotation_value=json_value,
         location_in_doc_start=loc_start,
         location_in_doc_end=loc_end)
     return FieldValueDTO(field_value=json_value, annotations=[annotation])
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect a field value by running the field's cached detectors over the document text.

        For a multi-choice field every detector hit is collected and the field
        value is the list of all annotation values. For any other field type
        the first hit is returned immediately.

        Args:
            log: Logger for the debug trace and for detector loading errors.
            doc: Document whose text (``cls.get_document_text``) is searched.
            field: Field whose detectors are fetched from ``cls.detecting_cache``.
            field_code_to_value: Not used by this method.

        Returns:
            A ``FieldValueDTO`` with its annotations, or ``None`` when the field
            has no detectors or nothing was found.
        """
        log.debug('detect_field_value: csv_regexps_field_detection, '
                  + f'field {field.code}({field.pk}), document #{doc.pk}')

        def report_error(msg, er):
            # Callback given to the cache so loading problems get logged per field.
            log.error(msg, field_code=field.code, exc_info=er)

        matchers = cls.detecting_cache.get_detectors(field.pk, report_error)
        if not matchers:
            return None

        multichoice = field.type == MultiChoiceField.type_code
        full_text = cls.get_document_text(doc)

        collected = []

        for matcher in matchers:
            hit = matcher.find_value(full_text)
            if not hit:
                continue

            # TODO: implement reading values from full text (TextParts.FULL.value)
            # as it is done now, or from text units - paragraphs or sentences
            # based on field.text_unit_type - for other detector.text_part options.
            # Sketch of the intended text_part handling:
            #   BEFORE_REGEXP -> matching_string[:begin], 0, begin
            #   AFTER_REGEXP  -> matching_string[end:], end, len(text)
            #   INSIDE_REGEXP -> matching_string[begin:end], begin, end
            #   otherwise     -> text, 0, len(text)

            # starting position has to be shifted backward by 1 symbol for FE
            shifted_start = max(hit[1] - 1, 0)
            annotation = AnnotationDTO(annotation_value=hit[0],
                                       location_in_doc_start=shifted_start,
                                       location_in_doc_end=hit[2],
                                       extraction_hint_name='')
            if multichoice:
                collected.append(annotation)
                continue

            # Single-value field: the first hit wins.
            return FieldValueDTO(field_value=hit[0],
                                 annotations=[annotation])

        if not collected:
            return None

        return FieldValueDTO(
            field_value=[a.annotation_value for a in collected],
            annotations=collected)
예제 #4
0
    def get_value(self, log: ProcessLogger, field: DocumentField, doc: Document,
                  cur_field_code_to_value: Dict[str, Any]) \
            -> Optional[FieldValueDTO]:
        """
        Return the first value ``find_value_in_text_unit`` finds in the document's text units.

        Text units of type ``field.text_unit_type`` are scanned ordered by
        ``location_start`` (then ``pk``); scanning stops at the first unit for
        which ``find_value_in_text_unit`` reports a hit.

        Args:
            log: Logger forwarded to ``find_value_in_text_unit``.
            field: Field being detected; must be of a single-value type.
            doc: Document whose text units are scanned.
            cur_field_code_to_value: Not used by this method.

        Returns:
            A ``FieldValueDTO`` with one annotation spanning the matching text
            unit, or ``None`` when no text unit yields a value.

        Raises:
            Exception: If the field type is multi-value.
        """
        typed_field = TypedField.by(field)  # type: TypedField
        if typed_field.multi_value:
            raise Exception(f'Python coded field {self.__class__.__name__} supports only single-value field types and '
                            f'{typed_field.type_code} is multi-value')

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():  # type: TextUnit
            found, value = self.find_value_in_text_unit(log, field, doc, text_unit)
            if not found:
                continue

            value = typed_field.field_value_python_to_json(value)
            ant = AnnotationDTO(annotation_value=value,
                                location_in_doc_start=text_unit.location_start,
                                location_in_doc_end=text_unit.location_end)
            # Annotations are passed as a list, like every other FieldValueDTO
            # producer; a set literal would require AnnotationDTO to be hashable.
            return FieldValueDTO(field_value=value, annotations=[ant])

        return None
예제 #5
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect a field value by applying the field's regexp detectors to text units.

        Stop-word detection on the full document text runs first and, when it
        fires, its value is returned without annotations. Otherwise each text
        unit of type ``field.text_unit_type`` is run through every detector.
        For a field type that is not a ``MultiValueField`` the first match is
        returned; otherwise all matches are collected and merged.

        Args:
            log: Logger for the debug trace.
            doc: Document being processed.
            field: Field whose detectors and detection limits are used.
            field_code_to_value: Not used by this method.

        Returns:
            A ``FieldValueDTO`` or ``None`` when nothing was detected.

        Raises:
            CaughtErrorWhileApplyingFieldDetector: Wraps any exception raised
                while applying a detector to a text unit.
        """
        units_repo = cls.text_unit_repo
        detectors_repo = cls.field_detector_repo
        log.debug('detect_field_value: regexps_field_detection, '
                  + f'field {field.code}({field.pk}), document #{doc.pk}')

        stop_word_hit, stop_word_value = detect_with_stop_words_by_field_and_full_text(
            field, doc.full_text)
        if stop_word_hit:
            return FieldValueDTO(field_value=stop_word_value)

        text_units = units_repo.get_doc_text_units(doc, field.text_unit_type)
        matchers = [DetectorFieldMatcher(d)
                    for d in detectors_repo.get_field_detectors(field)]

        typed_field = TypedField.by(field)  # type: TypedField
        single_valued = not isinstance(typed_field, MultiValueField)
        collected = []  # type: List[AnnotationDTO]
        counted = 0

        for text_unit in text_units:  # type: TextUnit
            if field.detect_limit_count:
                counted = FieldDetectionStrategy.update_units_counted(
                    field, counted, text_unit)
                if counted > field.detect_limit_count:
                    break

            for matcher in matchers:
                try:
                    match = matcher.matching_string(
                        text_unit.textunittext.text,
                        text_is_sentence=text_unit.is_sentence())
                    if match is None:
                        continue

                    # With a character limit, skip matches lying beyond it.
                    if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR \
                            and field.detect_limit_count < counted + match[1]:
                        continue

                    matched_text = match[0]
                    detected = matcher.get_validated_detected_value(field)
                    hint = None

                    if typed_field.requires_value:
                        hint = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        try:
                            detected, hint = typed_field.get_or_extract_value(
                                doc, detected, hint, matched_text)
                        except Exception as exc:
                            raise ValueExtractionFunctionThrownException(
                                f'Value extraction function has thrown an exception.\n'
                                f'Document: {doc.name} (#{doc.pk})\n'
                                f'Value: {detected}\n'
                                f'Extraction hint: {hint}\n'
                                f'Matching string:\n'
                                f'{matched_text}') from exc
                        if detected is None:
                            continue

                    annotation = AnnotationDTO(
                        annotation_value=typed_field.annotation_value_python_to_json(detected),
                        location_in_doc_start=text_unit.location_start,
                        location_in_doc_end=text_unit.location_end,
                        extraction_hint_name=hint)

                    if single_valued:
                        return FieldValueDTO(
                            field_value=annotation.annotation_value,
                            annotations=[annotation])
                    collected.append(annotation)
                except Exception as exc:
                    # Note: this also wraps ValueExtractionFunctionThrownException
                    # raised just above.
                    raise CaughtErrorWhileApplyingFieldDetector(
                        f'Exception caught while trying to apply field detector.\n'
                        f'Document: {doc.name} (#{doc.pk})\n'
                        f'Field detector: #{matcher.detector.pk}\n'
                        f'{matcher.detector.include_regexps}\n'
                        f'Text unit: #{text_unit.pk}\n'
                        f'{text_unit.text[:300]}') from exc

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                counted += len(text_unit.text)

        if not collected:
            return None

        merged_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in collected])
        return FieldValueDTO(field_value=merged_value, annotations=collected)
예제 #6
0
    def extract_from_textunit(
            cls, text_unit: TextUnit, field: DocumentField,
            detectors: List[DetectorFieldMatcher]) -> List[AnnotationDTO]:
        """
        Apply every detector to one text unit and turn the matches into annotations.

        For a field type that is not a ``MultiValueField`` the first match ends
        the search and is returned as a one-element list.

        Args:
            text_unit (TextUnit): Unit whose ``textunittext.text`` is searched.
            field (DocumentField): Field the detectors belong to.
            detectors (List[DetectorFieldMatcher]): Matchers to apply, in order.

        Returns:
            List[AnnotationDTO]: Annotations spanning the text unit; empty when
            no detector produced a value.

        Raises:
            ValueExtractionFunctionThrownException: If value extraction fails.
            CaughtErrorWhileApplyingFieldDetector: Wraps any other exception
                raised while applying a detector.
        """
        collected = []
        unit_text = text_unit.textunittext.text
        typed_field: TypedField = TypedField.by(field)

        for matcher in detectors:
            try:
                match = matcher.matching_string(unit_text, text_unit.is_sentence())
                if match is None:
                    continue

                hint = None
                matched_text = match[0]
                detected = matcher.get_validated_detected_value(field)

                if typed_field.requires_value:
                    hint = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    try:
                        detected, hint = typed_field.get_or_extract_value(
                            document=text_unit.document,
                            possible_value=detected,
                            possible_hint=hint,
                            text=matched_text)
                    except Exception as exc:
                        raise ValueExtractionFunctionThrownException(
                            f'Document: {text_unit.document.name} (#{text_unit.document.pk})\n'
                            f'TextUnit: {text_unit.unit_type} (#{text_unit.pk})\n'
                            f'Value: {detected}\n'
                            f'Extraction hint: {hint}\n'
                            f'Matching string:\n'
                            f'{matched_text}', exc) from exc
                    if detected is None:
                        continue

                annotation = AnnotationDTO(
                    annotation_value=typed_field.annotation_value_python_to_json(detected),
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=hint)

                if not isinstance(typed_field, MultiValueField):
                    return [annotation]
                collected.append(annotation)
            except ValueExtractionFunctionThrownException:
                # Already carries full context; let it through unwrapped.
                raise
            except Exception as exc:
                raise CaughtErrorWhileApplyingFieldDetector(
                    f'Document: {text_unit.document.name} (#{text_unit.document.pk})\n'
                    f'TextUnit: {text_unit.unit_type} (#{text_unit.pk})\n'
                    f'Field detector: #{matcher.detector.pk}\n'
                    f'{matcher.detector.include_regexps}\n'
                    f'Text: {unit_text[:300]}', exc) from exc

        return collected
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect a field value with the MLflow model referenced by ``field.mlflow_model_uri``.

        When ``field.mlflow_detect_on_document_level`` is set the model sees the
        whole document text once and the result carries no annotations.
        Otherwise the model is called per text unit; for a field type that is
        not a ``MultiValueField`` the first annotated unit is returned,
        otherwise all annotations are collected and merged.

        Args:
            log: Not used by this method.
            doc: Document being processed.
            field: Field providing the model URI, text unit type and limits.
            field_code_to_value: Current field values; copied into the model
                input next to the ``text`` column.

        Returns:
            A ``FieldValueDTO`` or ``None`` when nothing was detected.

        Raises:
            ValueError: If the model output is not an acceptable annotation
                value for the field type.
        """
        typed_field = TypedField.by(field)  # type: TypedField

        def run_model(text):
            # One-row frame: the known field values plus the text to classify.
            frame = pd.DataFrame([dict(field_code_to_value, text=text)])
            return MLFlowModelManager().predict(field.mlflow_model_uri, frame)[0]

        if field.mlflow_detect_on_document_level:
            doc_text = doc.text
            predicted = run_model(doc_text)
            if not predicted:
                return None

            extracted, _ = typed_field.get_or_extract_value(
                doc, predicted, ValueExtractionHint.TAKE_FIRST.name, doc_text)
            return FieldValueDTO(field_value=extracted) if extracted else None

        collected = []  # type: List[AnnotationDTO]

        text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .select_related('textunittext') \
            .order_by('location_start', 'pk')

        counted = 0
        for text_unit in text_units.iterator():
            if field.detect_limit_count:
                counted = FieldDetectionStrategy.update_units_counted(
                    field, counted, text_unit)
                if counted > field.detect_limit_count:
                    break

            predicted = run_model(text_unit.text)
            if predicted is None:
                continue

            if typed_field.requires_value:
                # For the field types expecting a value the mlflow model must return either a value or None.
                extracted, hint = typed_field.get_or_extract_value(
                    doc, predicted, ValueExtractionHint.TAKE_FIRST.name, text_unit.text)
                if not typed_field.is_python_annotation_value_ok(extracted):
                    raise ValueError(
                        f'ML model of field {field.code} ({typed_field.type_code}) returned '
                        f'annotation value not suitable for this field:\n'
                        f'{extracted}')

                annotation = AnnotationDTO(
                    annotation_value=typed_field.annotation_value_python_to_json(extracted),
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=hint)
            elif predicted:
                # For the related-info fields the mlflow model must return 0 or 1
                # where 1 means the text unit matches the field.
                annotation = AnnotationDTO(
                    annotation_value=None,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=None)
            else:
                continue

            # With a character limit, stop once an annotation starts beyond it.
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR \
                    and annotation.location_in_doc_start > field.detect_limit_count:
                break

            collected.append(annotation)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=annotation.annotation_value,
                                     annotations=collected)

            # NOTE(review): only reached for units that produced an annotation,
            # since the `continue` statements above skip it - confirm intended.
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                counted += len(text_unit.text)

        if not collected:
            return None

        merged_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in collected])
        return FieldValueDTO(field_value=merged_value, annotations=collected)
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect a field value with the MLflow model referenced by ``field.mlflow_model_uri``.

        When ``field.mlflow_detect_on_document_level`` is set the model sees the
        whole document text once and the result carries no annotations.
        Otherwise text units come from ``cls.text_unit_repo``, are narrowed by
        ``FieldDetectionStrategy.reduce_textunits_by_detection_limit`` and are
        passed to the model one by one. For a field type that is not a
        ``MultiValueField`` the first annotated unit is returned; otherwise all
        annotations are collected and merged.

        Args:
            log: Logger for the debug trace.
            doc: Document being processed.
            field: Field providing the model URI, text unit type and limits.
            field_code_to_value: Current field values; copied into the model
                input next to the ``text`` column.

        Returns:
            A ``FieldValueDTO`` or ``None`` when nothing was detected.

        Raises:
            ValueError: If the model output is not an acceptable annotation
                value for the field type.
        """
        collected: List[AnnotationDTO] = []
        typed_field: TypedField = TypedField.by(field)
        units_repo = cls.text_unit_repo

        def run_model(text):
            # One-row frame: the known field values plus the text to classify.
            frame = pd.DataFrame([dict(field_code_to_value, text=text)])
            return MLFlowModelManager().predict(field.mlflow_model_uri, frame)[0]

        if field.mlflow_detect_on_document_level:
            log.debug(
                'detect_field_value: mlflow_field_detection on doc level, '
                + f'field {field.code}({field.pk}), document #{doc.pk}')
            doc_text = doc.text
            predicted = run_model(doc_text)
            if not predicted:
                return None

            extracted, _ = typed_field.get_or_extract_value(
                doc, predicted, ValueExtractionHint.TAKE_FIRST.name, doc_text)
            return FieldValueDTO(field_value=extracted) if extracted else None

        text_units = units_repo.get_doc_text_units(doc, field.text_unit_type)
        text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            text_units, field)

        log.debug(
            'detect_field_value: mlflow_field_detection on text unit level, '
            + f'field {field.code}({field.pk}), document #{doc.pk}')

        for text_unit in text_units.iterator():
            predicted = run_model(text_unit.text)
            if predicted is None:
                continue

            if typed_field.requires_value:
                # For the field types expecting a value the mlflow model must return either a value or None.
                extracted, hint = typed_field.get_or_extract_value(
                    doc, predicted, ValueExtractionHint.TAKE_FIRST.name, text_unit.text)
                if not typed_field.is_python_annotation_value_ok(extracted):
                    raise ValueError(
                        f'ML model of field {field.code} ({typed_field.type_code}) returned '
                        f'annotation value not suitable for this field:\n'
                        f'{extracted}')

                annotation = AnnotationDTO(
                    annotation_value=typed_field.annotation_value_python_to_json(extracted),
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=hint)
            elif predicted:
                # For the related-info fields the mlflow model must return 0 or 1
                # where 1 means the text unit matches the field.
                annotation = AnnotationDTO(
                    annotation_value=None,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=None)
            else:
                continue

            collected.append(annotation)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=annotation.annotation_value,
                                     annotations=collected)

        if not collected:
            return None

        merged_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in collected])
        return FieldValueDTO(field_value=merged_value, annotations=collected)