Example #1
0
    def __init__(self, text: str, field_type: str):
        """
        Build the document, field, text unit, detector and matcher objects.

        Nothing in this constructor calls ``save()``; the objects are only
        created and wired together through attribute assignment.

        :param text: text stored on the text unit's ``textunittext``; its
            length also determines ``location_end``
        :param field_type: value assigned to the field's ``type`` attribute
        """
        self.document = Document()

        self.field = DocumentField()
        self.field.type = field_type

        # The text unit starts at a fixed offset and spans len(text) characters.
        unit_start = 1001
        unit = TextUnit()
        unit.document = self.document
        unit.textunittext = TextUnitText()
        unit.textunittext.text = text
        unit.location_start = unit_start
        unit.location_end = unit_start + len(text)
        self.text_unit = unit

        # Both detector settings are stored as newline-separated strings.
        include_patterns = (
            'at\\s{1,5}least\\s{1,5}(two|2).{1,15}unaffiliated.{1,15}lenders',
            '(two|2).{1,30}lenders.{1,200}(not.{1,50}affiliate|affiliate.{1,100}(one|1|single))',
        )
        definition_terms = (
            'required lenders',
            'required revolving lenders',
            'required revolving credit lenders',
            'required term lenders',
            'requisite lenders',
            'requisite revolving lenders',
            'required class lenders',
            'required ddtl lenders',
        )

        detector = DocumentFieldDetector()
        detector.regexps_pre_process_lower = True
        detector.include_regexps = '\n'.join(include_patterns)
        detector.definition_words = '\n'.join(definition_terms)
        detector.detected_value = 'AFFILIATED'
        detector.text_part = TextParts.FULL.value
        detector.extraction_hint = ValueExtractionHint.TAKE_FIRST
        self.detector = detector

        self.matcher = DetectorFieldMatcher(detector)
Example #2
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the value of ``field`` from the values of the fields it depends on.

        Only entries of ``field_code_to_value`` whose code is listed by
        ``field.get_depends_on_codes()`` are considered. If the field has stop
        words, stop-word detection runs first on those values joined with
        newlines. Otherwise each non-empty value is converted to a string and
        matched against every detector configured for the field.

        :param log: accepted for interface compatibility; not used here
        :param doc: document passed to stop-word detection and value extraction
        :param field: field whose value is being detected
        :param field_code_to_value: mapping of field code to current value
        :return: for single-value fields, a DTO for the first successful match
            or ``None`` when nothing matched; for multi-value fields, always a
            DTO built from all collected values (possibly none)
        """
        allowed_codes = field.get_depends_on_codes()
        source_values = {
            code: val
            for code, val in field_code_to_value.items()
            if code in allowed_codes
        }

        if field.stop_words:
            joined_text = '\n'.join(str(val) for val in source_values.values())
            found_by_stop_words, stop_words_value = \
                detect_with_stop_words_by_field_and_full_text(field, doc, joined_text)
            if found_by_stop_words:
                return FieldValueDTO(field_value=stop_words_value)

        matchers = [
            DetectorFieldMatcher(detector)
            for detector in DocumentFieldDetector.objects.filter(field=field)
        ]
        typed_field = TypedField.by(field)  # type: TypedField
        is_multi_value = isinstance(typed_field, MultiValueField)

        collected = []  # type: List

        for raw_value in source_values.values():
            # Falsy source values (None, '', 0, ...) are never matched.
            if not raw_value:
                continue
            source_text = str(raw_value)

            for matcher in matchers:  # type: DetectorFieldMatcher
                match = matcher.matching_string(source_text, text_is_sentence=False)
                if match is None:
                    continue

                matched_text = match[0]
                value = matcher.get_validated_detected_value(field)

                if typed_field.requires_value:
                    hint = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint = typed_field.get_or_extract_value(
                        doc, value, hint, matched_text)
                    # Nothing could be extracted for this match; try the next detector.
                    if value is None:
                        continue

                json_value = typed_field.annotation_value_python_to_json(value)
                if not is_multi_value:
                    # Single-value field: the first successful match wins.
                    return FieldValueDTO(field_value=json_value)
                collected.append(json_value)

        if not is_multi_value:
            return None

        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(collected))
Example #3
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the value of ``field`` in ``doc`` using the field's detectors.

        Stop-word detection runs first on the document's full text; when it
        reports a detection, its value is returned without annotations.
        Otherwise the document's text units of ``field.text_unit_type``
        (reduced by ``reduce_textunits_by_detection_limit``) are scanned in
        order with ``cls.extract_from_textunit``.

        :param log: logger used for a debug trace; an ``AttributeError`` raised
            while logging is ignored
        :param doc: document whose full text and text units are examined
        :param field: field whose value is being detected
        :param field_code_to_value: accepted for interface compatibility;
            not used here
        :return: ``None`` when no text unit produced annotations. For
            single-value fields, a DTO carrying the first annotation's value
            and the annotations of the first text unit that produced any. For
            multi-value fields, a DTO built from the annotations of all units.
        """
        try:
            log.debug('detect_field_value: regexps_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
        except AttributeError:
            # Logging is best-effort (presumably to tolerate ``log`` being
            # None); a failure here must not abort detection.
            pass

        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)
        is_multi_value = isinstance(typed_field, MultiValueField)
        text_unit_repo: TextUnitRepository = cls.text_unit_repo
        field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        ants: List[AnnotationDTO] = []
        for text_unit in qs_text_units:
            unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
            if not unit_ants:
                continue
            if not is_multi_value:
                # Single-value field: the first text unit with annotations wins.
                return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                     annotations=unit_ants)
            ants += unit_ants

        if not ants:
            return None

        # Only multi-value fields can reach this point with annotations,
        # because single-value fields return from inside the loop above.
        field_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants])
        return FieldValueDTO(field_value=field_value, annotations=ants)
Example #4
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the value of ``field`` in ``doc`` by applying the field's
        detectors to each of the document's text units.

        Stop-word detection runs first on the document's full text; when it
        reports a detection, its value is returned without annotations.
        Otherwise the text units of ``field.text_unit_type`` are scanned in
        order and every detector is tried on each unit. Scanning stops once
        the running count exceeds a truthy ``field.detect_limit_count``.

        :param log: logger used for a debug trace
        :param doc: document whose full text and text units are examined
        :param field: field whose value is being detected
        :param field_code_to_value: accepted for interface compatibility;
            not used here
        :return: ``None`` when nothing was detected. For single-value fields,
            a DTO with the first annotation found. For multi-value fields, a
            DTO built from all collected annotations.
        :raises CaughtErrorWhileApplyingFieldDetector: when applying a detector
            to a text unit raises any exception, including the
            ``ValueExtractionFunctionThrownException`` raised when value
            extraction fails
        """
        text_unit_repo = cls.text_unit_repo
        field_detector_repo = cls.field_detector_repo
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        depends_on_full_text = doc.full_text
        # NOTE(review): this call passes (field, full_text) only; confirm it
        # matches the signature of the stop-words helper in this code base.
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        typed_field = TypedField.by(field)  # type: TypedField
        is_multi_value = isinstance(typed_field, MultiValueField)
        ants = []  # type: List[AnnotationDTO]
        units_counted = 0

        # A falsy limit count means "no limit". The character-based checks
        # below are applied only when a limit is configured, so a count of
        # 0 or None neither drops matches nor compares None with an int.
        limit_count = field.detect_limit_count
        limited_by_chars = bool(limit_count) \
            and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR

        for text_unit in qs_text_units:  # type: TextUnit
            if limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > limit_count:
                    break

            for field_detector in detectors:
                try:
                    matching_piece = field_detector.matching_string(
                        text_unit.textunittext.text,
                        text_is_sentence=text_unit.is_sentence())
                    if matching_piece is None:
                        continue

                    # Skip a match that falls beyond the character limit.
                    # matching_piece[1] is presumably an offset of the match
                    # inside this text unit - TODO confirm.
                    if limited_by_chars and limit_count < units_counted + matching_piece[1]:
                        continue

                    matching_string = matching_piece[0]
                    value = field_detector.get_validated_detected_value(
                        field)
                    hint_name = None
                    if typed_field.requires_value:
                        hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        try:
                            value, hint_name = typed_field \
                                .get_or_extract_value(doc,
                                                      value,
                                                      hint_name,
                                                      matching_string)
                        except Exception as e:
                            raise ValueExtractionFunctionThrownException(
                                f'Value extraction function has thrown an exception.\n'
                                f'Document: {doc.name} (#{doc.pk})\n'
                                f'Value: {value}\n'
                                f'Extraction hint: {hint_name}\n'
                                f'Matching string:\n'
                                f'{matching_string}') from e
                        # Nothing could be extracted; try the next detector.
                        if value is None:
                            continue

                    annotation_value = typed_field.annotation_value_python_to_json(
                        value)
                    ant = AnnotationDTO(
                        annotation_value=annotation_value,
                        location_in_doc_start=text_unit.location_start,
                        location_in_doc_end=text_unit.location_end,
                        extraction_hint_name=hint_name)

                    if not is_multi_value:
                        # Single-value field: the first annotation wins.
                        return FieldValueDTO(
                            field_value=ant.annotation_value,
                            annotations=[ant])
                    ants.append(ant)
                except Exception as e:
                    # Wrap every failure (including the extraction error
                    # raised above) with detector and text unit details.
                    raise CaughtErrorWhileApplyingFieldDetector(
                        f'Exception caught while trying to apply field detector.\n'
                        f'Document: {doc.name} (#{doc.pk})\n'
                        f'Field detector: #{field_detector.detector.pk}\n'
                        f'{field_detector.detector.include_regexps}\n'
                        f'Text unit: #{text_unit.pk}\n'
                        f'{text_unit.textunittext.text[:300]}') from e

            if limited_by_chars:
                units_counted += len(text_unit.textunittext.text)

        if not ants:
            return None

        return FieldValueDTO(field_value=typed_field.
                             build_json_field_value_from_json_ant_values(
                                 [a.annotation_value for a in ants]),
                             annotations=ants)