Example #1
0
    def detect_field_values(cls,
                            log: ProcessLogger,
                            doc: Document,
                            field: DocumentField,
                            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """
        Calculate the value of ``field`` by evaluating its formula over the fields it depends on.

        Field detection is expected to run in dependency order, so the values of the
        fields this one depends on should already be calculated / detected and stored.

        :param log: process logger (not used by this implementation)
        :param doc: document whose stored field values are fed into the formula
        :param field: field to calculate; it must have a formula
        :param cached_fields: not used by this implementation
        :return: the stop-word detection result (possibly an empty list) when stop words
                 decide the outcome, otherwise a single-item list with the formula result
        :raises ValueError: if the field has no formula
        """
        formula = field.formula
        if not formula:
            raise ValueError('No formula specified for field {0} (#{1})'.format(field.code, field.uid))

        dependencies = list(field.depends_on_fields.all())

        # Stored values of the dependency fields, leaving out the ones removed by a user.
        stored_values = doc.documentfieldvalue_set.filter(removed_by_user=False)
        stored_values = stored_values.filter(field__in=dependencies)
        merged_values = merge_document_field_values_to_python_value(list(stored_values))

        # Every dependency gets an entry; those without a merged value map to None.
        values_by_code = dict()
        for dependency in dependencies:
            values_by_code[dependency.code] = merged_values.get(dependency.code)

        if field.stop_words:
            # Stop words are matched against the dependency values joined line by line.
            joined_values = '\n'.join(str(item) for item in values_by_code.values())
            stopped, stop_word_values = detect_with_stop_words_by_field_and_full_text(field, joined_values)
            if stopped:
                return stop_word_values if stop_word_values else []

        calculated = cls.calc_formula(field.code, field.type, formula, values_by_code)
        return [DetectedFieldValue(field, calculated)]
    def detect_field_values(cls,
                            log: ProcessLogger,
                            doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """
        Detect values of ``field`` by running its field detectors over the values of the
        fields it depends on.

        :param log: process logger (not used by this implementation)
        :param doc: document whose stored field values are inspected
        :param field: field to detect values for
        :return: the stop-word detection result (possibly an empty list) when stop words
                 decide the outcome, otherwise the values produced by the matching detectors
        """
        dependencies = list(field.depends_on_fields.all())

        # Stored values of the dependency fields, leaving out the ones removed by a user.
        stored_values = doc.documentfieldvalue_set.filter(removed_by_user=False)
        stored_values = stored_values.filter(field__in=dependencies)
        merged_values = merge_document_field_values_to_python_value(list(stored_values))

        # Every dependency gets an entry; those without a merged value map to None.
        values_by_code = dict()
        for dependency in dependencies:
            values_by_code[dependency.code] = merged_values.get(dependency.code)

        if field.stop_words:
            joined_values = '\n'.join(str(item) for item in values_by_code.values())
            stopped, stop_word_values = detect_with_stop_words_by_field_and_full_text(field, joined_values)
            if stopped:
                return stop_word_values if stop_word_values else []

        detectors = DocumentFieldDetector.objects.filter(field=field)
        type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

        results = []  # type: List[DetectedFieldValue]

        for source_value in values_by_code.values():
            # Empty / missing dependency values give the detectors nothing to match.
            if not source_value:
                continue
            source_text = str(source_value)

            for detector in detectors:
                matched = detector.matching_string(source_text)
                if matched is None:
                    continue

                value = detector.detected_value
                hint_name = None
                if type_adapter.value_aware:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched)
                    if value is None:
                        continue

                results.append(DetectedFieldValue(field, value, None, hint_name))

                # The first hit is enough unless the field is multi-value or a choice field.
                if not type_adapter.multi_value and not field.is_choice_field():
                    break

            # Same rule for the outer loop: stop scanning further dependency values.
            if results and not type_adapter.multi_value and not field.is_choice_field():
                break

        return results
Example #3
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """
        Detect values of ``field`` by running its field detectors over the document's text units.

        :param log: process logger (not used by this implementation)
        :param doc: document to scan
        :param field: field to detect values for
        :return: the stop-word detection result (possibly an empty list) when stop words
                 decide the outcome, otherwise the values produced by the matching detectors
        """
        # Stop words are checked against the full document text before anything else.
        stopped, stop_word_values = detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
        if stopped:
            return stop_word_values if stop_word_values else []

        # Text units of the type configured on the field, in document order.
        text_units = TextUnit.objects.filter(document=doc)
        text_units = text_units.filter(unit_type=field.text_unit_type)
        text_units = text_units.order_by('location_start', 'pk')

        detectors = DocumentFieldDetector.objects.filter(
            document_type=doc.document_type, field=field)

        type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

        results = []  # type: List[DetectedFieldValue]

        for unit in text_units.iterator():
            for detector in detectors:
                if not detector.matches(unit.text):
                    continue

                value = detector.detected_value
                hint_name = None
                if type_adapter.value_aware:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, unit.text)
                    if value is None:
                        continue

                results.append(DetectedFieldValue(field, value, unit, hint_name))

                # The first hit is enough unless the field is multi-value or a choice field.
                if not type_adapter.multi_value and not field.is_choice_field():
                    break

            # Same rule for the outer loop: stop scanning further text units.
            if results and not type_adapter.multi_value and not field.is_choice_field():
                break

        return results
Example #4
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """
        Detect values of ``field`` by running its field detectors over the document's text units.

        Text units and detectors are obtained through the repositories held by
        ``RegexpsOnlyFieldDetectionStrategy``.

        :param log: process logger (not used by this implementation)
        :param doc: document to scan
        :param field: field to detect values for
        :param cached_fields: not used by this implementation
        :return: the stop-word detection result (possibly an empty list) when stop words
                 decide the outcome, otherwise the values produced by the matching detectors
        """
        # Stop words are checked against the full document text before anything else.
        stopped, stop_word_values = detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
        if stopped:
            return stop_word_values if stop_word_values else []

        strategy = RegexpsOnlyFieldDetectionStrategy
        text_units = strategy.text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
        raw_detectors = strategy.field_detector_repo.get_field_detectors(field)
        matchers = list(map(DetectorFieldMatcher, raw_detectors))

        type_adapter = field.get_field_type()  # type: FieldType

        results = []  # type: List[DetectedFieldValue]

        for unit in text_units:  # type: TextUnit
            for matcher in matchers:
                matched = matcher.matching_string(
                    unit.text, text_is_sentence=unit.is_sentence())
                if matched is None:
                    continue

                value = matcher.get_validated_detected_value(field)
                hint_name = None
                if type_adapter.requires_value:
                    hint_name = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched)
                    if value is None:
                        continue

                results.append(DetectedFieldValue(field, value, unit, hint_name))

                # The first hit is enough unless the field is multi-value or a choice field.
                if not type_adapter.multi_value and not field.is_choice_field():
                    break

            # Same rule for the outer loop: stop scanning further text units.
            if results and not type_adapter.multi_value and not field.is_choice_field():
                break

        return results
Example #5
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """
        Detect values of ``field`` by running its field detectors over the document's text units.

        :param log: process logger (not used by this implementation)
        :param doc: document to scan
        :param field: field to detect values for
        :param cached_fields: not used by this implementation
        :return: the stop-word detection result (possibly an empty list) when stop words
                 decide the outcome, otherwise the values produced by the matching detectors
        """
        # Stop words are checked against the full document text before anything else.
        stopped, stop_word_values = detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
        if stopped:
            return stop_word_values if stop_word_values else []

        # Text units of the type configured on the field, in document order.
        text_units = TextUnit.objects.filter(document=doc)
        text_units = text_units.filter(unit_type=field.text_unit_type)
        text_units = text_units.order_by('location_start', 'pk')

        detectors = DocumentFieldDetector.objects.filter(field=field)

        type_adapter = field.get_field_type()  # type: FieldType

        results = []  # type: List[DetectedFieldValue]

        for unit in text_units.iterator():  # type: TextUnit
            for detector in detectors:
                matched = detector.matching_string(
                    unit.text, text_is_sentence=unit.is_sentence())
                if matched is None:
                    continue

                value = detector.get_validated_detected_value(field)
                hint_name = None
                if type_adapter.requires_value:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched)
                    if value is None:
                        continue

                results.append(DetectedFieldValue(field, value, unit, hint_name))

                # The first hit is enough unless the field is multi-value or a choice field.
                if not type_adapter.multi_value and not field.is_choice_field():
                    break

            # Same rule for the outer loop: stop scanning further text units.
            if results and not type_adapter.multi_value and not field.is_choice_field():
                break

        return results
Example #6
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """
        Detect values of ``field`` by applying its trained classifier model to the document's text units.

        :param log: process logger; receives an info message when no classifier model is found
        :param doc: document to scan
        :param field: field to detect values for
        :param cached_fields: not used by this implementation
        :return: the stop-word detection result (possibly an empty list) when stop words
                 decide the outcome, otherwise the values predicted for the text units
        :raises ClassifierModel.DoesNotExist: if no classifier model is stored for the field
        """
        # Stop words are checked against the full document text before anything else.
        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or list()

        # Only the lookup is guarded: it is the call expected to signal a missing model.
        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
        except ClassifierModel.DoesNotExist:
            log.info('Classifier model does not exist for field: {0}'.format(
                field.code))
            # Bare raise re-raises the active exception unchanged.
            raise

        sklearn_model = classifier_model.get_trained_model_obj()
        field_type_adapter = field.get_field_type()

        detected_values = list()  # type: List[DetectedFieldValue]

        # Text units of the type configured on the field, in document order.
        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():
            detected_value = cls.predict_and_extract_value(
                sklearn_model=sklearn_model,
                field_type_adapter=field_type_adapter,
                document=doc,
                field=field,
                text_unit=text_unit)
            if detected_value is None:
                continue
            detected_values.append(detected_value)
            # The first hit is enough unless the field is multi-value or a choice field.
            if not (field_type_adapter.multi_value
                    or field.is_choice_field()):
                break
        return detected_values
    def maybe_detect_with_stop_words(
            cls, log: ProcessLogger, doc: Document,
            field: DocumentField) -> Optional[List[DetectedFieldValue]]:
        """
        Try to settle the value of ``field`` through its stop words, matched against the
        values of the fields it depends on.

        :param log: process logger (not used by this implementation)
        :param doc: document whose ``field_values`` are looked up by dependency field uid
        :param field: field to detect values for
        :return: the detected values (possibly an empty list) when the stop words decide
                 the outcome; None when the field has no stop words or they did not apply
        """
        if not field.stop_words:
            return None

        dependencies = list(field.depends_on_fields.all())

        # Collect the non-empty dependency values as text, one entry per dependency.
        text_parts = []
        for dependency in dependencies:  # type: DocumentField
            adapter = FIELD_TYPES_REGISTRY[dependency.type]  # type: FieldType
            python_value = adapter.merged_db_value_to_python(
                doc.field_values.get(dependency.uid))
            if python_value:
                text_parts.append(str(python_value))

        stopped, stop_word_values = detect_with_stop_words_by_field_and_full_text(
            field, '\n'.join(text_parts))
        if not stopped:
            return None
        return stop_word_values if stop_word_values else []
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """
        Detect the value of ``field`` by applying a trained classifier model to the
        document's field values.

        :param log: process logger (not used by this implementation)
        :param doc: document whose ``field_values`` are passed to the model
        :param field: field to detect a value for
        :return: the stop-word detection result (possibly an empty list) when stop words
                 decide the outcome, otherwise a single-item list holding the predicted
                 category name (None when the predicted index is outside the categories)
        :raises ClassifierModel.DoesNotExist: if no classifier model is stored for the
                 document type and field
        """
        if field.stop_words:
            depends_on_fields = list(field.depends_on_fields.all())
            depends_on_full_text = []

            # Collect the non-empty dependency values as text, one entry per dependency.
            for df in depends_on_fields:  # type: DocumentField
                field_type_adapter = FIELD_TYPES_REGISTRY[
                    df.type]  # type: FieldType
                v = field_type_adapter.merged_db_value_to_python(
                    doc.field_values.get(df.uid))
                if v:
                    depends_on_full_text.append(str(v))

            detected_with_stop_words, detected_values = \
                detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
            if detected_with_stop_words:
                return detected_values or list()

        document_type = doc.document_type  # type: DocumentType

        # A missing model propagates as ClassifierModel.DoesNotExist; catching it only to
        # re-raise added nothing, so the lookup is left unguarded.
        classifier_model = ClassifierModel.objects \
            .get(document_type=document_type, document_field=field)
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

        model = obj['model']
        categories = obj['categories']

        # The model is given a single sample: the document's field values.
        predicted = model.predict([doc.field_values])
        target_index = predicted[0]

        # Guard against an index that does not address any known category.
        target_name = categories[target_index] if 0 <= target_index < len(
            categories) else None

        return [DetectedFieldValue(field, target_name)]
    def maybe_detect_with_stop_words(
            cls, field: DocumentField,
            cached_fields: Dict[str,
                                Any]) -> Optional[List[DetectedFieldValue]]:
        """
        Try to settle the value of ``field`` through its stop words, matched against the
        cached values of the fields it depends on.

        :param field: field to detect values for
        :param cached_fields: cached field values, looked up by dependency field uid;
                              an empty or missing mapping means there is nothing to match
        :return: the detected values (possibly an empty list) when the stop words decide
                 the outcome; None when the field has no stop words, no cached values are
                 available, or the stop words did not apply
        """
        # Bail out before querying the dependencies when there is nothing to work with.
        if not field.stop_words or not cached_fields:
            return None

        # Collect the non-empty dependency values as text, one entry per dependency.
        depends_on_full_text = []
        for df in field.depends_on_fields.all():  # type: DocumentField
            field_type_adapter = FIELD_TYPE_REGISTRY[
                df.type]  # type: FieldType
            v = field_type_adapter.merged_db_value_to_python(
                cached_fields.get(df.uid))
            if v:
                depends_on_full_text.append(str(v))

        detected_with_stop_words, detected_values = \
            detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
        if detected_with_stop_words:
            return detected_values or list()
        return None