def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the value of ``field`` in ``doc`` by querying the field's MLFlow model.

        If ``field.mlflow_detect_on_document_level`` is set, the model is asked once with the
        whole document text. Otherwise it is asked once per text unit of type
        ``field.text_unit_type`` (ordered by ``location_start``, then ``pk``) and each unit
        accepted by the model becomes an annotation.

        :param cls: owning class (not referenced in the body)
        :param log: process logger (not referenced in the body)
        :param doc: document to run the detection on
        :param field: field whose value is being detected
        :param field_code_to_value: already known field values; a copy of it extended with
            a ``text`` entry is the single row passed to the model
        :return: DTO with the detected value (plus annotations for the text unit level)
            or None if nothing was detected
        :raises ValueError: if a value obtained for a text unit is rejected by
            ``typed_field.is_python_annotation_value_ok``
        """
        typed_field = TypedField.by(field)  # type: TypedField

        if field.mlflow_detect_on_document_level:
            doc_text = doc.text
            features = {**field_code_to_value, 'text': doc_text}
            frame = pd.DataFrame([features])
            raw_value = MLFlowModelManager().predict(
                field.mlflow_model_uri, frame)[0]
            if not raw_value:
                return None

            extracted, _ = typed_field.get_or_extract_value(
                doc, raw_value, ValueExtractionHint.TAKE_FIRST.name, doc_text)
            return FieldValueDTO(field_value=extracted) if extracted else None

        collected = []  # type: List[AnnotationDTO]

        text_units = TextUnit.objects \
            .filter(document=doc, unit_type=field.text_unit_type) \
            .select_related('textunittext') \
            .order_by('location_start', 'pk')

        units_counted = 0
        for unit in text_units.iterator():
            # A falsy detect_limit_count means "no detection limit".
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, unit)
                if units_counted > field.detect_limit_count:
                    break

            features = {**field_code_to_value, 'text': unit.text}
            frame = pd.DataFrame([features])
            raw_value = MLFlowModelManager().predict(
                field.mlflow_model_uri, frame)[0]
            if raw_value is None:
                continue

            if typed_field.requires_value:
                # Value-bearing field types: the model output is either a value or None.
                value, hint_name = typed_field.get_or_extract_value(
                    doc, raw_value, ValueExtractionHint.TAKE_FIRST.name, unit.text)
                if not typed_field.is_python_annotation_value_ok(value):
                    raise ValueError(
                        f'ML model of field {field.code} ({typed_field.type_code}) returned '
                        f'annotation value not suitable for this field:\n'
                        f'{value}')
                ant = AnnotationDTO(
                    annotation_value=typed_field.annotation_value_python_to_json(value),
                    location_in_doc_start=unit.location_start,
                    location_in_doc_end=unit.location_end,
                    extraction_hint_name=hint_name)
            elif raw_value:
                # Related-info fields: the model answers 0 or 1, where 1 means
                # that the text unit matches the field.
                ant = AnnotationDTO(
                    annotation_value=None,
                    location_in_doc_start=unit.location_start,
                    location_in_doc_end=unit.location_end,
                    extraction_hint_name=None)
            else:
                continue

            # With a character-based limit, stop at the first annotation starting beyond it.
            if field.detect_limit_count \
                    and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR \
                    and ant.location_in_doc_start > field.detect_limit_count:
                break

            collected.append(ant)
            if not isinstance(typed_field, MultiValueField):
                # Single-value field: the first annotation decides the value.
                return FieldValueDTO(field_value=ant.annotation_value,
                                     annotations=collected)

            # NOTE(review): this counter is only advanced for units that produced an
            # annotation; units skipped by the `continue` statements above never reach
            # this line - confirm that this is intended.
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(unit.text)

        if not collected:
            return None

        merged_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in collected])
        return FieldValueDTO(field_value=merged_value, annotations=collected)
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the value of ``field`` in ``doc`` by querying the field's MLFlow model.

        If ``field.mlflow_detect_on_document_level`` is set, the model is asked once with the
        whole document text. Otherwise it is asked once per text unit returned by
        ``cls.text_unit_repo`` and narrowed by
        ``FieldDetectionStrategy.reduce_textunits_by_detection_limit``.

        :param cls: owning class; provides ``text_unit_repo``
        :param log: process logger, receives one debug message per call
        :param doc: document to run the detection on
        :param field: field whose value is being detected
        :param field_code_to_value: already known field values; a copy of it extended with
            a ``text`` entry is the single row passed to the model
        :return: DTO with the detected value (plus annotations for the text unit level)
            or None if nothing was detected
        :raises ValueError: if a value obtained for a text unit is rejected by
            ``typed_field.is_python_annotation_value_ok``
        """
        found: List[AnnotationDTO] = []
        typed_field: TypedField = TypedField.by(field)
        repo = cls.text_unit_repo

        def predict_for(text):
            # One-row frame: known field values plus the text to be classified.
            row = dict(field_code_to_value)
            row['text'] = text
            frame = pd.DataFrame([row])
            return MLFlowModelManager().predict(field.mlflow_model_uri, frame)[0]

        if field.mlflow_detect_on_document_level:
            log.debug(
                'detect_field_value: mlflow_field_detection on doc level, '
                f'field {field.code}({field.pk}), document #{doc.pk}')
            doc_text = doc.text
            predicted = predict_for(doc_text)
            if not predicted:
                return None

            extracted, _ = typed_field.get_or_extract_value(
                doc, predicted, ValueExtractionHint.TAKE_FIRST.name, doc_text)
            return FieldValueDTO(field_value=extracted) if extracted else None

        units = repo.get_doc_text_units(doc, field.text_unit_type)
        units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            units, field)

        log.debug(
            'detect_field_value: mlflow_field_detection on text unit level, '
            f'field {field.code}({field.pk}), document #{doc.pk}')

        for unit in units.iterator():
            predicted = predict_for(unit.text)
            if predicted is None:
                continue

            if typed_field.requires_value:
                # Value-bearing field types: the model output is either a value or None.
                value, hint_name = typed_field.get_or_extract_value(
                    doc, predicted, ValueExtractionHint.TAKE_FIRST.name, unit.text)
                if not typed_field.is_python_annotation_value_ok(value):
                    raise ValueError(
                        f'ML model of field {field.code} ({typed_field.type_code}) returned '
                        f'annotation value not suitable for this field:\n'
                        f'{value}')
                ant = AnnotationDTO(
                    annotation_value=typed_field.annotation_value_python_to_json(value),
                    location_in_doc_start=unit.location_start,
                    location_in_doc_end=unit.location_end,
                    extraction_hint_name=hint_name)
            elif predicted:
                # Related-info fields: the model answers 0 or 1, where 1 means
                # that the text unit matches the field.
                ant = AnnotationDTO(
                    annotation_value=None,
                    location_in_doc_start=unit.location_start,
                    location_in_doc_end=unit.location_end,
                    extraction_hint_name=None)
            else:
                continue

            found.append(ant)
            if not isinstance(typed_field, MultiValueField):
                # Single-value field: the first annotation decides the value.
                return FieldValueDTO(field_value=ant.annotation_value,
                                     annotations=found)

        if not found:
            return None

        merged_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in found])
        return FieldValueDTO(field_value=merged_value, annotations=found)
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False) \
            -> Optional[ClassifierModel]:
        """
        Train a classifier which predicts the value of a choice field from other field values.

        The train rows come from ``cls.get_train_values``; they are shuffled, the first 20% are
        held out for the out-of-sample report and the pipeline built by ``cls.build_pipeline``
        is fitted on the rest. The returned ClassifierModel is filled in but not saved here.

        :param log: process logger used for progress and info messages
        :param field: field to train the model for; must resolve to a ChoiceField
        :param train_data_project_ids: passed through to ``cls.get_train_values``
        :param use_only_confirmed_field_values: passed through to ``cls.get_train_values``
        :return: ClassifierModel carrying the trained pipeline, the categories, the feature
            names and both accuracy reports
        :raises ValueError: if the field is not a choice field
        :raises RuntimeError: if ``cls.get_train_values`` returned no train data
        """
        typed_field = TypedField.by(field)  # type: TypedField

        # NOTE(review): 7 steps are declared but step_progress() is called 5 times below - confirm.
        log.set_progress_steps_number(7)
        log.info(f'Training model for field #{field.pk} ({field.code})...')

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(typed_field, ChoiceField):
            raise ValueError(f'Field-based ML supports only choice fields but field {field.code} '
                             f'(#{field.uid}) is of type {typed_field.type_code}')

        # Find values of the depends-on fields suitable for using as train data.
        categories = cls.get_categories(field)

        train_data = cls.get_train_values(field,
                                          train_data_project_ids,
                                          set(categories),
                                          use_only_confirmed_field_values)

        if not train_data:
            raise RuntimeError(f'Not enough train data for field {field.code} (#{field.uid}). '
                               f'Need at least {settings.ML_TRAIN_DATA_SET_GROUP_LEN} approved or '
                               f'changed documents of type {field.document_type.code}.')

        depends_on_code_types = cls.get_depends_on_code_type(field)
        depends_on_code_types = cls.remove_empty_fields(depends_on_code_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(field,
                                                           depends_on_code_types)  # type: Pipeline, List[Callable]

        category_names_to_indexes = {c: i for i, c in enumerate(categories)}
        # Rows with an empty or unknown field value go to one extra class after the known ones.
        no_value_idx = len(categories)

        log.step_progress()
        log.info('Collecting feature rows from train and test documents in dict form...')

        #  When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
        random.shuffle(train_data)

        #  TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = []
        train_target_data = []

        for doc_field_values in train_data:
            # pop() with a default removes the target from the feature row and, unlike
            # get() followed by an unconditional del, does not raise KeyError for a row
            # which does not contain the field at all.
            field_value = doc_field_values.pop(field.code, None)

            field_value_idx = category_names_to_indexes.get(field_value) if field_value else None
            if field_value_idx is None:
                field_value_idx = no_value_idx

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        is_index = math.floor(test_size * len(train_data))

        # The first is_index shuffled rows are held out, the rest is used for fitting.
        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        # The in-sample report is calculated on the full fitting set.
        test_is_feature_data = train_feature_data
        test_is_target_data = train_target_data

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()

        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_field = field

        # NOTE(review): target_names has len(categories) entries while the targets may also
        # contain the extra "no value" index; presumably this is sklearn's classification_report,
        # which may reject such a mismatch - verify.
        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(test_oos_target_data,
                                                                            predicted_oos,
                                                                            target_names=categories)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(test_is_target_data,
                                                                        predicted_is,
                                                                        target_names=categories)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({'model': model, 'categories': categories, 'feature_names': feature_names})
        log.step_progress()
        log.info('Finished.')
        return cm
Example #4
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """
        Detect the value of ``field`` in ``doc`` by applying the field detectors of the field.

        The stop-words check on the full document text runs first; if it reports a detection,
        its value is returned at once. Otherwise every field detector is applied to every
        text unit of type ``field.text_unit_type``, subject to the field's detection limit.

        :param cls: owning class (not referenced in the body)
        :param log: process logger (not referenced in the body)
        :param doc: document to run the detection on
        :param field: field whose value is being detected
        :param field_code_to_value: already known field values (not referenced in the body)
        :return: DTO with the detected value and annotations, or None if nothing matched
        :raises CaughtErrorWhileApplyingFieldDetector: wraps any exception raised while a
            detector is applied to a text unit, including value extraction failures
        """
        text_unit_repo = TextUnitRepository()
        field_detector_repo = FieldDetectorRepository()

        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        typed_field = TypedField.by(field)  # type: TypedField
        ants = []  # type: List[AnnotationDTO]
        units_counted = 0

        # A falsy detect_limit_count means "no limit". The per-match check below used to
        # test only the limit unit, so with a CHAR unit and no count it either skipped
        # nearly every match (count 0) or failed on comparing None with an int.
        char_limited = bool(field.detect_limit_count) \
            and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR

        for text_unit in qs_text_units:  # type: TextUnit
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            for field_detector in detectors:
                try:
                    matching_piece = field_detector.matching_string(
                        text_unit.textunittext.text,
                        text_is_sentence=text_unit.is_sentence())
                    if matching_piece is None:
                        continue

                    # presumably matching_piece is (matched string, offset inside the unit) -
                    # verify against DetectorFieldMatcher.matching_string
                    if char_limited and field.detect_limit_count < units_counted + matching_piece[1]:
                        continue

                    matching_string = matching_piece[0]
                    value = field_detector.get_validated_detected_value(field)
                    hint_name = None
                    if typed_field.requires_value:
                        hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        try:
                            value, hint_name = typed_field.get_or_extract_value(
                                doc, value, hint_name, matching_string)
                        except Exception as e:
                            raise ValueExtractionFunctionThrownException(
                                f'Value extraction function has thrown an exception.\n'
                                f'Document: {doc.name} (#{doc.pk})\n'
                                f'Value: {value}\n'
                                f'Extraction hint: {hint_name}\n'
                                f'Matching string:\n'
                                f'{matching_string}') from e
                        if value is None:
                            continue

                    annotation_value = typed_field.annotation_value_python_to_json(
                        value)
                    ant = AnnotationDTO(
                        annotation_value=annotation_value,
                        location_in_doc_start=text_unit.location_start,
                        location_in_doc_end=text_unit.location_end,
                        extraction_hint_name=hint_name)

                    if not isinstance(typed_field, MultiValueField):
                        # Single-value field: the first match decides the value.
                        return FieldValueDTO(
                            field_value=ant.annotation_value,
                            annotations=[ant])
                    ants.append(ant)
                except Exception as e:
                    raise CaughtErrorWhileApplyingFieldDetector(
                        f'Exception caught while trying to apply field detector.\n'
                        f'Document: {doc.name} (#{doc.pk})\n'
                        f'Field detector: #{field_detector.detector.pk}\n'
                        f'{field_detector.detector.include_regexps}\n'
                        f'Text unit: #{text_unit.pk}\n'
                        f'{text_unit.text[:300]}') from e

            if char_limited:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        return FieldValueDTO(field_value=typed_field.
                             build_json_field_value_from_json_ant_values(
                                 [a.annotation_value for a in ants]),
                             annotations=ants)
Example #5
0
def detect_and_cache_field_values_for_document(
        log: ProcessLogger,
        document: Document,
        save: bool = True,
        clear_old_values: bool = True,
        changed_by_user: Optional[User] = None,
        system_fields_changed: bool = False,
        generic_fields_changed: bool = False,
        document_initial_load: bool = False,
        ignore_field_codes: Optional[Set[str]] = None,
        updated_field_codes: Optional[List[str]] = None,
        skip_modified_values: bool = True,
        field_codes_to_detect: Optional[List[str]] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.

    Fields are processed in the order returned by order_field_detection() so that a value
    detected for one field can be used while detecting the fields depending on it.
    Errors of single fields are logged and collected; they are raised together at the end.

    :param log: process logger
    :param document: document to detect the field values for
    :param save: store the detected values and fire the "document changed" signal
    :param clear_old_values: delete the stored field values (except the skipped ones) before detecting
    :param changed_by_user: passed through to fire_document_changed()
    :param system_fields_changed: passed through to fire_document_changed()
    :param generic_fields_changed: passed through to fire_document_changed()
    :param document_initial_load: passed through to fire_document_changed()
    :param ignore_field_codes: codes of the fields to exclude from detection
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :param skip_modified_values: don't overwrite field values overwritten by user
    :param field_codes_to_detect: optional list of fields codes - only these fields are to be detected
    :return: a list which is created empty and never filled by this function
    :raises RuntimeError: if save is requested for a document whose status is not active
    :raises FieldDetectionError: if detection failed for at least one field
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(
            f'Detecting field values for completed documents is not permitted.\n'
            f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set())
                       for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f
                                for f in all_fields
                                }  # type: Dict[str, DocumentField]

    log.info(
        f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
        f'Updated fields: {updated_field_codes or "All"}.\n'
        f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
        f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [
            c for c in sorted_codes if c in dependent_fields and (
                not ignore_field_codes or c not in ignore_field_codes)
        ]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding
    # further dependencies
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(
        document_type_id=document_type.pk,
        doc_id=document.pk,
        field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type:List[str]
    detection_errors = []  # type:List[Tuple[str, str, Exception, Any]]

    # The logging "extra" has to be a mapping; it used to be written as a set literal
    # ({key, value}) which is not a valid "extra" for the standard logging machinery.
    log_extra = {Document.LOG_FIELD_DOC_ID: str(document.pk)}

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(
            FieldValue.objects.filter(modified_by__isnull=False,
                                      document_id=document.pk).values_list(
                                          'field__code', flat=True))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            skip_codes -= set(updated_field_codes)

    if field_codes_to_detect:
        all_codes = {f.code for f in all_fields}
        skip_codes = skip_codes.union(all_codes - set(field_codes_to_detect))

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk, list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(
                log=log,
                doc=document,
                field=field,
                field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(
                    f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'"
            )

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(
                    document=document,
                    field=field,
                    field_value_dto=new_field_value_dto,
                    user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[
                    field.code] = typed_field.field_value_json_to_python(
                        field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only few requested fields and trying to comply the dependency
            # tree makes no big sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(
                f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                exc_info=e,
                extra=log_extra)
            detection_errors.append(
                (field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(
                user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(
            sender=detect_and_cache_field_values_for_document,
            log=log,
            document=document,
            changed_by_user=changed_by_user,
            document_initial_load=document_initial_load,
            system_fields_changed=system_fields_changed,
            generic_fields_changed=generic_fields_changed,
            user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([
                f'"{k}": "{format_value_short_str(v)}"'
                for k, v in current_field_values.items()
            ])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg, extra=log_extra)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, _ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
Example #6
0
    def get(self, request, subscription_id, content_format, **_kwargs):
        """
        Render an example notification for a subscription and return it in the HTTP response.

        The document is taken from the ``self.PARAM_DOCUMENT`` query parameter or, if it is
        not provided, the first document of the subscription's document type is used.
        For "document assigned" / "document changed" subscriptions a random subset of the
        fields gets example "changes". With the ``self.PARAM_SEND`` query parameter set
        the rendered notification is also sent.

        :param request: HTTP request; its GET parameters and user are used
        :param subscription_id: pk of the DocumentNotificationSubscription to render
        :param content_format: ``self.FORMAT_HTML`` for HTML output, anything else for plain text
        :return: response with the rendered content; 400 if no suitable document is found;
            500 if rendering has thrown an exception or sending has reported an error
        """
        should_send = as_bool(request.GET, self.PARAM_SEND, False)

        subscription = DocumentNotificationSubscription.objects.get(
            pk=subscription_id)
        doc_type = subscription.document_type

        requested_id = as_int(request.GET, self.PARAM_DOCUMENT, None)
        if requested_id:
            document = Document.objects \
                .filter(document_type=doc_type, pk=requested_id) \
                .first()
            if not document:
                not_found_msg = 'Document with id = {0} not found or has wrong type.'
                return HttpResponseBadRequest(not_found_msg.format(requested_id))
        else:
            document = Document.objects.filter(document_type=doc_type).first()
            if not document:
                no_docs_msg = 'Document id not provided and ' \
                              'there are no example documents of type {0}.'
                return HttpResponseBadRequest(no_docs_msg.format(doc_type.code))

        handlers = build_field_handlers(doc_type,
                                        include_annotation_fields=False)
        stored_values = get_document_field_values(doc_type,
                                                  document.pk,
                                                  handlers=handlers)

        # Fake "changes": each field is picked when random.random() is not above 0.3.
        fake_changes = dict()
        if subscription.event in {DocumentAssignedEvent.code, DocumentChangedEvent.code} \
                and stored_values:
            for handler in handlers:
                if random.random() > 0.3:
                    continue
                field = DocumentField.objects.filter(code=handler.field_code).first()
                if not field:
                    continue
                sample_value = TypedField.by(field).example_python_value()
                fake_changes[handler.field_code] = (
                    sample_value, stored_values.get(handler.field_code))

        try:
            notification_id = uuid.uuid4().hex
            source = DocumentNotificationSource(document=document,
                                                field_handlers=handlers,
                                                field_values=stored_values,
                                                changes=fake_changes,
                                                changed_by_user=request.user)
            notification = NotificationRenderer.render_notification(
                notification_id, subscription, source)
        except Exception as e:
            error_text = render_error(
                'Exception caught while trying to render notification', e)
            return HttpResponse(error_text,
                                status=500,
                                content_type='text/plain')

        if not notification:
            return HttpResponse('Notification contains no data.', status=200)

        if content_format == self.FORMAT_HTML:
            body, body_type = notification.html, 'text/html'
        else:
            body, body_type = notification.txt, 'text/plain'

        if should_send:
            send_log = ErrorCollectingLogger()
            notification.send(log=send_log)
            send_error = send_log.get_error()
            if send_error:
                return HttpResponseServerError(content=send_error,
                                               content_type='application/json')

        return HttpResponse(content=body,
                            content_type=body_type,
                            status=200)