def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    typed_field = TypedField.by(field)  # type: TypedField

    if field.mlflow_detect_on_document_level:
        text = doc.text
        model_input = dict(field_code_to_value)
        model_input['text'] = text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if not value:
            return None
        hint_name = ValueExtractionHint.TAKE_FIRST.name
        value, hint_name = typed_field.get_or_extract_value(doc, value, hint_name, text)
        if not value:
            return None
        return FieldValueDTO(field_value=value)

    ants = list()  # type: List[AnnotationDTO]

    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .select_related('textunittext') \
        .order_by('location_start', 'pk')

    units_counted = 0
    for text_unit in qs_text_units.iterator():
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break

        model_input = dict(field_code_to_value)
        model_input['text'] = text_unit.text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if value is None:
            continue

        ant = None
        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field.get_or_extract_value(doc, value, hint_name, text_unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                                 f'annotation value not suitable for this field:\n'
                                 f'{value}')
            annotation_value = typed_field.annotation_value_python_to_json(value)
            ant = AnnotationDTO(annotation_value=annotation_value,
                                location_in_doc_start=text_unit.location_start,
                                location_in_doc_end=text_unit.location_end,
                                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(annotation_value=None,
                                location_in_doc_start=text_unit.location_start,
                                location_in_doc_end=text_unit.location_end,
                                extraction_hint_name=None)

        if ant is None:
            continue

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            if ant.location_in_doc_start > field.detect_limit_count:
                break

        ants.append(ant)
        if not isinstance(typed_field, MultiValueField):
            return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)

    if not ants:
        return None

    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values([a.annotation_value for a in ants]),
        annotations=ants)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    ants: List[AnnotationDTO] = []
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo = cls.text_unit_repo

    if field.mlflow_detect_on_document_level:
        log.debug('detect_field_value: mlflow_field_detection on doc level, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        text = doc.text
        model_input = dict(field_code_to_value)
        model_input['text'] = text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if not value:
            return None
        hint_name = ValueExtractionHint.TAKE_FIRST.name
        value, hint_name = typed_field.get_or_extract_value(doc, value, hint_name, text)
        if not value:
            return None
        return FieldValueDTO(field_value=value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)

    log.debug('detect_field_value: mlflow_field_detection on text unit level, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    for text_unit in qs_text_units.iterator():
        model_input = dict(field_code_to_value)
        model_input['text'] = text_unit.text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if value is None:
            continue

        ant = None
        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field.get_or_extract_value(doc, value, hint_name, text_unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                                 f'annotation value not suitable for this field:\n'
                                 f'{value}')
            annotation_value = typed_field.annotation_value_python_to_json(value)
            ant = AnnotationDTO(annotation_value=annotation_value,
                                location_in_doc_start=text_unit.location_start,
                                location_in_doc_end=text_unit.location_end,
                                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(annotation_value=None,
                                location_in_doc_start=text_unit.location_start,
                                location_in_doc_end=text_unit.location_end,
                                extraction_hint_name=None)

        if ant is None:
            continue

        ants.append(ant)
        if not isinstance(typed_field, MultiValueField):
            return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

    if not ants:
        return None

    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values([a.annotation_value for a in ants]),
        annotations=ants)
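# --- Illustrative sketch (assumption, not part of the original code). The MLflow-based
# strategies above build a one-row pandas DataFrame holding the current field values plus
# a 'text' column and expect MLFlowModelManager().predict() to return one value per row
# (a value or None for value-bearing fields, 1/0 for related-info fields). A minimal
# MLflow pyfunc model honoring that contract might look like the sketch below; the
# keyword rule inside predict() is purely hypothetical and only demonstrates the
# expected input/output shape.
import mlflow.pyfunc
import pandas as pd


class ExampleKeywordFieldModel(mlflow.pyfunc.PythonModel):
    """Hypothetical pyfunc model: flags text units mentioning 'termination'."""

    def predict(self, context, model_input: pd.DataFrame):
        # One prediction per row; None means "no value detected in this text unit".
        return [1 if 'termination' in (text or '').lower() else None
                for text in model_input['text']]


# Saving such a model produces a model URI that could be stored in field.mlflow_model_uri:
# mlflow.pyfunc.save_model(path='example_keyword_field_model',
#                          python_model=ExampleKeywordFieldModel())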
def train_document_field_detector_model(cls,
                                        log: ProcessLogger,
                                        field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False) \
        -> Optional[ClassifierModel]:
    typed_field = TypedField.by(field)  # type: TypedField

    log.set_progress_steps_number(7)
    log.info('Training model for field #{0} ({1})...'.format(field.pk, field.code))

    # Classifier: values of dependencies -> value of this field
    # Field types supported: only choice fields
    if not isinstance(typed_field, ChoiceField):
        raise ValueError('Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                         .format(field.code, field.uid, typed_field.type_code))

    # Let's find good values of depends-on fields suitable for use as train data.
    categories = cls.get_categories(field)

    train_data = cls.get_train_values(field, train_data_project_ids, set(categories),
                                      use_only_confirmed_field_values)

    if not train_data:
        raise RuntimeError('Not enough train data for field {0} (#{1}). '
                           'Need at least {2} approved or changed documents of type {3}.'
                           .format(field.code, field.uid, settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                                   field.document_type.code))

    depends_on_code_types = cls.get_depends_on_code_type(field)
    depends_on_code_types = cls.remove_empty_fields(depends_on_code_types, train_data)

    pipeline, feature_names_funcs = cls.build_pipeline(field, depends_on_code_types)  # type: Pipeline, List[Callable]
    category_names_to_indexes = {c: i for i, c in enumerate(categories)}

    log.step_progress()
    log.info('Collecting feature rows from train and test documents in dict form...')

    # Tried to use sklearn shuffling but something went wrong; keeping the manual approach for now.
    random.shuffle(train_data)

    # TODO: use sklearn methods for splitting train/test data and shuffling

    test_size = 0.2

    train_feature_data = list()
    train_target_data = list()

    for doc_field_values in train_data:
        field_value = doc_field_values.get(field.code)
        del doc_field_values[field.code]
        field_value_idx = category_names_to_indexes.get(field_value) if field_value else None
        if field_value_idx is None:
            field_value_idx = len(categories)

        train_feature_data.append(doc_field_values)
        train_target_data.append(field_value_idx)

    is_index = math.floor(test_size * len(train_data))

    test_oos_feature_data = train_feature_data[:is_index]
    test_oos_target_data = train_target_data[:is_index]

    train_feature_data = train_feature_data[is_index:]
    train_target_data = train_target_data[is_index:]

    test_is_feature_data = train_feature_data  # [:is_index]
    test_is_target_data = train_target_data  # [:is_index]

    log.step_progress()
    log.info('Training the model...')
    model = pipeline.fit(train_feature_data, train_target_data)

    log.step_progress()
    log.info('Testing the model...')
    cm = ClassifierModel()
    cm.document_field = field

    predicted_oos = pipeline.predict(test_oos_feature_data)
    cm.classifier_accuracy_report_out_of_sample = classification_report(test_oos_target_data,
                                                                        predicted_oos,
                                                                        target_names=categories)

    predicted_is = pipeline.predict(test_is_feature_data)
    cm.classifier_accuracy_report_in_sample = classification_report(test_is_target_data,
                                                                    predicted_is,
                                                                    target_names=categories)

    log.step_progress()
    log.info('Saving ClassifierModel instance...')

    feature_names = []
    for f in feature_names_funcs:
        feature_names.extend(f())

    cm.set_trained_model_obj({'model': model, 'categories': categories, 'feature_names': feature_names})

    log.step_progress()
    log.info('Finished.')
    return cm
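# --- Illustrative sketch (assumption): build_pipeline() is defined elsewhere in the
# project, but a minimal pipeline consuming the list-of-dict feature rows assembled
# above (values of the depends-on fields keyed by field code, assumed here to be
# strings or numbers) could combine a DictVectorizer with a linear classifier:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

example_pipeline = Pipeline([
    # Turn {'depends_on_field_code': value, ...} rows into a sparse feature matrix.
    ('vect', DictVectorizer()),
    # Predict the index of the target field's choice value.
    ('clf', LogisticRegression(max_iter=1000)),
])
# example_pipeline.fit(train_feature_data, train_target_data) would then mirror
# the pipeline.fit(...) call in the training method above.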
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    text_unit_repo = TextUnitRepository()
    field_detector_repo = FieldDetectorRepository()

    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                             depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)

    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    typed_field = TypedField.by(field)  # type: TypedField
    ants = list()  # type: List[AnnotationDTO]
    units_counted = 0

    for text_unit in qs_text_units:  # type: TextUnit
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break

        for field_detector in detectors:
            try:
                matching_piece = field_detector.matching_string(text_unit.textunittext.text,
                                                                text_is_sentence=text_unit.is_sentence())
                if matching_piece is not None:
                    if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                        if field.detect_limit_count < units_counted + matching_piece[1]:
                            continue
                    matching_string = matching_piece[0]
                    value = field_detector.get_validated_detected_value(field)
                    hint_name = None
                    if typed_field.requires_value:
                        hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        try:
                            value, hint_name = typed_field \
                                .get_or_extract_value(doc, value, hint_name, matching_string)
                        except Exception as e:
                            raise ValueExtractionFunctionThrownException(
                                f'Value extraction function has thrown an exception.\n'
                                f'Document: {doc.name} (#{doc.pk})\n'
                                f'Value: {value}\n'
                                f'Extraction hint: {hint_name}\n'
                                f'Matching string:\n'
                                f'{matching_string}') from e
                        if value is None:
                            continue
                    annotation_value = typed_field.annotation_value_python_to_json(value)
                    ant = AnnotationDTO(annotation_value=annotation_value,
                                        location_in_doc_start=text_unit.location_start,
                                        location_in_doc_end=text_unit.location_end,
                                        extraction_hint_name=hint_name)

                    if not isinstance(typed_field, MultiValueField):
                        return FieldValueDTO(field_value=ant.annotation_value, annotations=[ant])
                    else:
                        ants.append(ant)
            except Exception as e:
                raise CaughtErrorWhileApplyingFieldDetector(
                    f'Exception caught while trying to apply field detector.\n'
                    f'Document: {doc.name} (#{doc.pk})\n'
                    f'Field detector: #{field_detector.detector.pk}\n'
                    f'{field_detector.detector.include_regexps}\n'
                    f'Text unit: #{text_unit.pk}\n'
                    f'{text_unit.text[:300]}') from e

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)

    if not ants:
        return None

    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values([a.annotation_value for a in ants]),
        annotations=ants)
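# --- Illustrative sketch (assumption): DetectorFieldMatcher and the field detector
# records are defined elsewhere in the project. Conceptually each detector applies its
# include regexps to a text unit and yields the matching fragment plus its end offset
# within the unit, roughly like this simplified, hypothetical stand-in:
import re
from typing import Optional, Tuple


def example_matching_string(include_regexps: str, text: str) -> Optional[Tuple[str, int]]:
    """Return (matching fragment, end offset in the text unit) for the first match, else None."""
    for pattern in include_regexps.splitlines():
        pattern = pattern.strip()
        if not pattern:
            continue
        m = re.search(pattern, text, re.IGNORECASE)
        if m:
            return m.group(0), m.end()
    return None


# example_matching_string(r'shall\s+terminate', 'This Agreement shall terminate on ...')
# -> ('shall terminate', 30)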
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True,
                                               field_codes_to_detect: Optional[List[str]] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects
    as well as Document.field_value. These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param ignore_field_codes:
    :param document_initial_load:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :param skip_modified_values: don't overwrite field values overwritten by user
    :param field_codes_to_detect: optional list of field codes - only these fields are to be detected
    :return:
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding further dependencies,
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type: List[str]
    detection_errors = []  # type: List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(modified_by__isnull=False,
                                                        document_id=document.pk)
                              .values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if field_codes_to_detect:
        all_codes = set([f.code for f in all_fields])
        skip_codes = skip_codes.union(all_codes - set(field_codes_to_detect))

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk, list(skip_codes), updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)
            if not new_field_value_dto:
                detecting_field_status.append(f"No new value detected for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only a few requested fields and trying to comply with
            # the dependency tree makes no big sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e,
                      extra={Document.LOG_FIELD_DOC_ID: str(document.pk)})
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg, extra={Document.LOG_FIELD_DOC_ID: str(document.pk)})

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
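# --- Illustrative usage sketch (assumption, not part of the original code): how a caller
# such as a background task might trigger re-detection for one document after a dependency
# field changed. 'effective_date' is a hypothetical field code; Document, ProcessLogger and
# detect_and_cache_field_values_for_document are assumed to be importable as above.
def example_redetect_document_fields(log: ProcessLogger, document_pk: int) -> None:
    doc = Document.objects.get(pk=document_pk)
    detect_and_cache_field_values_for_document(
        log=log,
        document=doc,
        save=True,
        clear_old_values=True,
        updated_field_codes=['effective_date'],  # hypothetical field code
        skip_modified_values=True)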
def get(self, request, subscription_id, content_format, **_kwargs):
    send_email = as_bool(request.GET, self.PARAM_SEND, False)
    subscription = DocumentNotificationSubscription.objects.get(pk=subscription_id)
    document_type = subscription.document_type

    document_id = as_int(request.GET, self.PARAM_DOCUMENT, None)
    if document_id:
        document = Document.objects.filter(document_type=document_type, pk=document_id).first()
        if not document:
            return HttpResponseBadRequest('Document with id = {0} not found or has wrong type.'.format(document_id))
    else:
        document = Document.objects.filter(document_type=document_type).first()
        if not document:
            return HttpResponseBadRequest('Document id not provided and '
                                          'there are no example documents of type {0}.'.format(document_type.code))
        document_id = document.pk

    field_handlers = build_field_handlers(document_type, include_annotation_fields=False)
    field_values = get_document_field_values(document_type, document_id, handlers=field_handlers)

    example_changes = dict()
    if subscription.event in {DocumentAssignedEvent.code, DocumentChangedEvent.code} and field_values:
        for h in field_handlers:
            if random.random() > 0.3:
                continue
            field = DocumentField.objects.filter(code=h.field_code).first()
            if not field:
                continue
            typed_field = TypedField.by(field)
            example_value = typed_field.example_python_value()
            example_changes[h.field_code] = (example_value, field_values.get(h.field_code))

    try:
        notification = NotificationRenderer.render_notification(
            uuid.uuid4().hex,
            subscription,
            DocumentNotificationSource(document=document,
                                       field_handlers=field_handlers,
                                       field_values=field_values,
                                       changes=example_changes,
                                       changed_by_user=request.user))
    except Exception as e:
        return HttpResponse(render_error('Exception caught while trying to render notification', e),
                            status=500,
                            content_type='text/plain')

    if not notification:
        return HttpResponse('Notification contains no data.', status=200)

    if content_format == self.FORMAT_HTML:
        content = notification.html
        content_type = 'text/html'
    else:
        content = notification.txt
        content_type = 'text/plain'

    if send_email:
        log = ErrorCollectingLogger()
        notification.send(log=log)
        error = log.get_error()
        if error:
            return HttpResponseServerError(content=error, content_type='application/json')

    return HttpResponse(content=content, content_type=content_type, status=200)
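# --- Illustrative sketch (assumption): the view above previews a rendered notification for
# a subscription and, optionally, emails it. The actual URL route and the query parameter
# names behind PARAM_DOCUMENT / PARAM_SEND are defined elsewhere in the project, so the
# route and the 'document'/'send' parameter names below are hypothetical placeholders only.
import requests


def example_preview_notification(base_url: str, session_cookies: dict,
                                 subscription_id: int, document_id: int) -> str:
    # Hypothetical route: <base_url>/notifications/<subscription_id>/preview/<format>/
    resp = requests.get(f'{base_url}/notifications/{subscription_id}/preview/html/',
                        params={'document': document_id, 'send': 'false'},  # hypothetical parameter names
                        cookies=session_cookies)
    resp.raise_for_status()
    return resp.text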