def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value by running the field's trained sklearn classifier over the
    document's text units in document order.

    Cheap stop-word detection on the full document text is tried first; the classifier
    is only consulted when it does not fire. The optional detection limit of the field
    (``detect_limit_count`` / ``detect_limit_unit``) bounds how far into the document
    the scan goes.

    :raises ClassifierModel.DoesNotExist: when no trained model is stored for the field.
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                             depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)
    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()
        typed_field = TypedField.by(field)  # type: TypedField
        ants = list()  # type: List[AnnotationDTO]
        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')
        units_counted = 0
        for text_unit in qs_text_units.iterator():
            if field.detect_limit_count:
                # stop scanning once the configured number of units has been passed
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break
            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text_unit=text_unit)
            if ant is None:
                continue
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                # char-based limit: the annotation must start inside the allowed prefix
                if ant.location_in_doc_start > field.detect_limit_count:
                    break
            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                # single-value field: the first detected annotation wins
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)
        if not ants:
            return None
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value by applying the field's regexp detectors to the *values of
    the fields this field depends on* (not to the document text).

    The dependency values are filtered down to the declared depends-on codes, optionally
    checked against stop words, and then each detector is matched against each stringified
    dependency value. No annotations are produced since there is no text-unit location.
    """
    depends_on_fields = field.get_depends_on_codes()
    # keep only the values of declared dependencies
    field_code_to_value = {c: v for c, v in field_code_to_value.items() if c in depends_on_fields}
    if field.stop_words:
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        detected_with_stop_words, detected_value \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)
    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]
    typed_field = TypedField.by(field)  # type: TypedField
    values = list()  # type: List
    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)
        for detector_field_matcher in detectors:  # type: DetectorFieldMatcher
            matching_piece = detector_field_matcher.matching_string(depends_on_value,
                                                                    text_is_sentence=False)
            if matching_piece is not None:
                matching_string = matching_piece[0]
                value = detector_field_matcher.get_validated_detected_value(field)
                if typed_field.requires_value:
                    hint_name = detector_field_matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = typed_field \
                        .get_or_extract_value(doc, value, hint_name, matching_string)
                    if value is None:
                        continue
                value = typed_field.annotation_value_python_to_json(value)
                if not isinstance(typed_field, MultiValueField):
                    # single-value field: first match wins
                    return FieldValueDTO(field_value=value)
                else:
                    values.append(value)
    if isinstance(typed_field, MultiValueField):
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(values))
    else:
        return None
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value via the text-based ML classifier, scanning the document's
    text units (pre-reduced by the field's detection limit).

    Stop-word detection on the full text short-circuits the classifier when it fires.

    :raises ClassifierModel.DoesNotExist: when no trained model exists for the field.
    """
    log.debug('detect_field_value: regexps_and_text_based_ml_field_value, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')
    ants: List[AnnotationDTO] = []
    text_unit_repo = cls.text_unit_repo
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)
    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)
    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    # apply field.detect_limit_* constraints at the query level
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)
    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()
        for text_unit in qs_text_units.iterator():  # type: TextUnit
            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text=text_unit.text,
                                                location_start=text_unit.location_start,
                                                location_end=text_unit.location_end)
            if ant is None:
                continue
            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                # single-value field: first detected annotation wins
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)
        if not ants:
            return None
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value by applying cached CSV-configured regexp detectors to the
    full document text.

    For a multi-choice field every detector match becomes one annotation and all matched
    values are returned together; otherwise the first match wins.
    """
    log.debug('detect_field_value: csv_regexps_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')
    detectors = cls.detecting_cache.get_detectors(
        field.pk,
        lambda msg, er: log.error(msg, field_code=field.code, exc_info=er))
    if not detectors:
        return None
    is_multichoice = field.type == MultiChoiceField.type_code
    doc_text = cls.get_document_text(doc)
    annotations = []
    for detector in detectors:
        found_item = detector.find_value(doc_text)
        if not found_item:
            continue
        # TODO: implement reading values from full text (TextParts.FULL.value)
        # as it is done now, or from text units - paragraphs or sentences
        # based on field.text_unit_type - for other detector.text_part options
        """
        if detector.text_part == TextParts.BEFORE_REGEXP.value:
            return matching_string[:begin], 0, begin
        elif detector.text_part == TextParts.AFTER_REGEXP.value:
            return matching_string[end:], end, len(text)
        elif detector.text_part == TextParts.INSIDE_REGEXP.value:
            return matching_string[begin:end], begin, end
        else:
            return text, 0, len(text)
        """
        # starting position has to be shifted backward by 1 symbol for FE
        ant = AnnotationDTO(annotation_value=found_item[0],
                            location_in_doc_start=max(found_item[1] - 1, 0),
                            location_in_doc_end=found_item[2],
                            extraction_hint_name='')
        if not is_multichoice:
            return FieldValueDTO(field_value=found_item[0], annotations=[ant])
        else:
            annotations.append(ant)
    if annotations:
        f_val = [a.annotation_value for a in annotations]
        return FieldValueDTO(field_value=f_val, annotations=annotations)
    return None
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value by applying the field's regexp detectors to the document's
    text units (via ``extract_from_textunit``).

    Stop-word detection on the full text runs first; the text-unit queryset is reduced
    by the field's detection limit before scanning.
    """
    try:
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
    except AttributeError:
        # tolerates a logger stub without .debug - presumably used in tests; confirm
        pass
    ants: List[AnnotationDTO] = []
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo: TextUnitRepository = cls.text_unit_repo
    field_detector_repo: FieldDetectorRepository = cls.field_detector_repo
    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)
    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)
    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]
    for text_unit in qs_text_units:
        unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
        if not unit_ants:
            continue
        if not isinstance(typed_field, MultiValueField):
            # single-value field: first matching text unit wins
            return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                 annotations=unit_ants)
        else:
            ants += unit_ants
    if not ants:
        return None
    if isinstance(typed_field, MultiValueField):
        field_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants])
    else:
        field_value = typed_field.annotation_value_python_to_json(ants[0].annotation_value)
    return FieldValueDTO(field_value=field_value, annotations=ants)
def maybe_detect_with_stop_words(cls,
                                 field: DocumentField,
                                 doc: Document,
                                 cached_fields: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Try stop-word detection against the concatenated values of the fields this
    field depends on.

    Returns a FieldValueDTO when a stop word fires, otherwise None (also when the
    field has no stop words or no dependency values are cached).
    """
    if not field.stop_words:
        return None
    if not any(cached_fields):
        # nothing cached for any dependency - nothing to scan
        return None
    dep_codes = list(field.depends_on_fields.all()
                     .values_list('code', flat=True))  # type: List[str]
    text_parts = [str(cached_fields[code]) for code in dep_codes if cached_fields.get(code)]
    detected, detected_field_value = \
        detect_with_stop_words_by_field_and_full_text(field=field,
                                                      doc=doc,
                                                      full_text='\n'.join(text_parts))
    if not detected:
        return None
    json_value = TypedField.by(field).field_value_python_to_json(detected_field_value)
    return FieldValueDTO(field_value=json_value)
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value by feeding the dependency values into the field's trained
    classifier and mapping the predicted class index to a category name.

    Fix: the previous ``except ClassifierModel.DoesNotExist as e: raise e`` handler
    only re-raised the exception it caught, so the try/except was a no-op and has been
    removed; a missing trained model still propagates to the caller unchanged.
    """
    dto = cls.maybe_detect_with_stop_words(field, field_code_to_value)
    if dto is not None:
        return dto
    # ClassifierModel.DoesNotExist intentionally propagates: a missing trained
    # model is a configuration problem, not a "no value detected" case.
    classifier_model = ClassifierModel.objects.get(document_field=field)
    obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
    model = obj['model']
    categories = obj['categories']
    predicted = model.predict([field_code_to_value])
    target_index = predicted[0]
    # guard against the model emitting an out-of-range class index
    target_name = categories[target_index] if 0 <= target_index < len(categories) else None
    json_value = TypedField.by(field).field_value_python_to_json(target_name)
    return FieldValueDTO(field_value=json_value)
def pack_parsed_value(cls, typed_field: TypedField, value: Any, loc_start: int, loc_end: int):
    """Wrap a parsed python value into a FieldValueDTO carrying a single annotation
    that covers [loc_start, loc_end) in the document."""
    json_value = typed_field.field_value_python_to_json(value)
    annotation = AnnotationDTO(annotation_value=json_value,
                               location_in_doc_start=loc_start,
                               location_in_doc_end=loc_end)
    return FieldValueDTO(field_value=json_value, annotations=[annotation])
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
    """Detect a field value by evaluating the field's formula over the values of the
    fields it depends on.

    Stop-word detection on the concatenated dependency values can short-circuit the
    formula. The formula result is type-checked against the field type before being
    converted to its JSON form.

    :raises ValueError: when the field has no formula or the formula returns a value
        unsuitable for the field type.
    """
    formula = field.formula
    if not formula:
        raise ValueError(f'No formula specified for field {field.code} (#{field.uid})')
    depends_on_field_codes = field.get_depends_on_codes() or set()
    field_code_to_value = {c: v for c, v in field_code_to_value.items()
                           if c in depends_on_field_codes}
    if field.stop_words:
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        log.debug('detect_field_value: formula_based_field_detection, checking stop words, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            # NOTE(review): this returns a bare list rather than a FieldValueDTO, which
            # looks inconsistent with the declared return type - confirm against callers.
            return detected_values or list()
    else:
        log.debug('detect_field_value: formula_based_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
    v = cls.calc_formula(field_code=field.code,
                         formula=formula,
                         depends_on_field_to_value=field_code_to_value,
                         convert_decimals_to_floats=field.convert_decimals_to_floats_in_formula_args)
    typed_field = TypedField.by(field)
    # We don't accept formulas returning values of wrong type to avoid further confusion and
    # creating wrong formulas in future.
    # For example for multi-choice fields the formula should return a list and not a string
    # to ensure the admin understands that this value will replace the whole set/list of strings and not
    # just add one more string to the value.
    if typed_field.is_choice_field and typed_field.multi_value:
        if v and isinstance(v, str):
            # "outdated" formula is incorrect and returns string instead of
            # set / list, but we don't warn user: when he updates this formula
            # (or other detection method) he'll be forced to write code, returning
            # list or set.
            v = [v]
    if not typed_field.is_python_field_value_ok(v):
        raise ValueError(
            f'Formula of field {field.code} returned value not suitable for this field:\n{v}')
    v = typed_field.field_value_python_to_json(v)
    return FieldValueDTO(field_value=v)
def get_value(self, log, field: DocumentField, doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Aggregate the document's distinct party names into one comma-separated string.

    Fix: the previous code returned ``FieldValueDTO(...) if v else None`` where ``v``
    is the aggregation *dict* (e.g. ``{'value': None}``), which is always truthy - so
    a DTO wrapping None was returned for documents with no parties. The check now
    tests the aggregated value itself, returning None when no parties exist.
    """
    v = PartyUsage.objects.filter(text_unit__document_id=doc.id) \
        .aggregate(value=StringAgg('party__name', delimiter=', ', distinct=True))
    if not v['value']:
        return None
    value = TypedField.by(field).field_value_python_to_json(v['value'])
    return FieldValueDTO(field_value=value)
def get_value(self, log, field: DocumentField, doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Return the document's largest-amount currency usage as a
    ``{'currency': ..., 'amount': ...}`` value, or None when the document has none."""
    top_row = CurrencyUsage.objects \
        .filter(text_unit__document_id=doc.id) \
        .order_by('-amount') \
        .values('currency', 'amount') \
        .first()
    if top_row is None:
        return None
    json_value = TypedField.by(field).field_value_python_to_json(top_row)
    return FieldValueDTO(field_value=json_value)
def get_value(self, log, field: DocumentField, doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Return the latest date mentioned in the document, or None when no dates
    were extracted."""
    latest_date = DateUsage.objects \
        .filter(text_unit__document_id=doc.id) \
        .order_by('-date') \
        .values_list('date', flat=True) \
        .first()
    if latest_date is None:
        return None
    json_value = TypedField.by(field).field_value_python_to_json(latest_date)
    return FieldValueDTO(field_value=json_value)
def load_field_values(task: ExtendedTask, document: Document, document_fields_alias_to_value: Dict[str, Any]) \
        -> Dict[DocumentField, FieldValueDTO]:
    """Resolve externally supplied alias->text values into typed field values.

    Aliases are matched case-insensitively against the document type's field codes and
    its configured alias table. For list inputs the first item that parses into a value
    wins. Unknown aliases are logged as warnings and skipped.
    """
    document_type = document.document_type
    fields_to_values = dict()  # type: Dict[DocumentField, FieldValueDTO]
    if not document_type:
        return fields_to_values
    field_code_aliases = document_type.field_code_aliases
    # lowercase code -> field; alias entries overlay the plain codes
    field_codes_to_fields = {f.code.lower(): f for f in document_type.fields.all()}
    if field_code_aliases:
        field_codes_to_fields.update({field_alias.lower(): field_codes_to_fields.get(field_code.lower())
                                      for field_alias, field_code in field_code_aliases.items()
                                      if field_alias and field_code})
    for field_alias, field_value_text in document_fields_alias_to_value.items():
        if field_value_text is None:
            continue
        field = field_codes_to_fields.get(field_alias.lower())  # type: DocumentField
        if not field:
            task.log_warn(
                'Field alias "{0}" not found for document type {1}'.format(field_alias, document_type.code))
            continue
        typed_field = TypedField.by(field)  # type: TypedField
        if type(field_value_text) is list:
            # take the first list item that yields a parseable value
            for possible_value_text in list(field_value_text):
                maybe_value = typed_field.extract_from_possible_value_text(possible_value_text)
                if maybe_value:
                    maybe_value = typed_field.field_value_python_to_json(maybe_value)
                    fields_to_values[field] = FieldValueDTO(field_value=maybe_value)
                    break
        else:
            maybe_value = typed_field.extract_from_possible_value_text(field_value_text)
            if maybe_value:
                maybe_value = typed_field.field_value_python_to_json(maybe_value)
                fields_to_values[field] = FieldValueDTO(field_value=maybe_value)
    return fields_to_values
def get_value(self, log, field: DocumentField, doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Return the latest date mentioned in the document, or None when no dates
    were extracted. Text unit locations are fetched but not used for the value."""
    row = DateUsage.objects \
        .filter(text_unit__document_id=doc.pk) \
        .order_by('-date') \
        .values_list('date', 'text_unit__location_start', 'text_unit__location_end') \
        .first()
    if row is None:
        return None
    latest_date, _start, _end = row
    json_value = TypedField.by(field).field_value_python_to_json(latest_date)
    return FieldValueDTO(field_value=json_value)
def is_unit_limit_exceeded(fval_dto: FieldValueDTO, field: DocumentField, _: Document) -> bool:
    """Drop annotations that start past the field's char-based detection limit and
    report whether that left a value requiring text annotations with none remaining.

    Mutates ``fval_dto.annotations`` in place. Only the char limit unit is handled;
    other units always report False.
    """
    if not fval_dto.annotations or not field.detect_limit_count:
        return False
    if field.detect_limit_unit != DocumentField.DETECT_LIMIT_CHAR:
        # TODO: we can't calculate other text measuring units (sentence, paragraph, page)
        # with reasonable effort
        return False
    limit = field.detect_limit_count
    kept = [ant for ant in fval_dto.annotations if ant.location_in_doc_start <= limit]
    fval_dto.annotations = kept
    # exceeded only when the field insists on text annotations and none survived
    return bool(field.requires_text_annotations and not kept)
def get_value(self, log, field: DocumentField, doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Return the document's largest-amount (currency, amount) pair, or None when
    the document contains no currency usages.

    Fix: the query used ``.values(...)``, which yields dicts; unpacking a dict in
    the for-statement binds its *keys* (the literal column-name strings) instead of
    the row values, so the returned value was the tuple
    ``('currency', 'amount')``. ``.values_list(...)`` yields tuples that unpack into
    the actual row values.
    """
    for curx, amt, start, end in \
            CurrencyUsage.objects.filter(text_unit__document_id=doc.pk) \
                    .order_by('-amount') \
                    .values_list('currency', 'amount',
                                 'text_unit__location_start', 'text_unit__location_end'):
        v = TypedField.by(field).field_value_python_to_json((curx, amt,))
        # only the highest-amount row is needed
        return FieldValueDTO(field_value=v)
    return None
def get_value(self, log, field: DocumentField, doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Collect the document's distinct party names into one comma-separated string;
    returns None when the document mentions no parties."""
    rows = PartyUsage.objects \
        .filter(text_unit__document_id=doc.pk) \
        .values_list('party__name', 'text_unit__location_start', 'text_unit__location_end')
    unique_names = {name for name, _start, _end in rows}
    joined = ', '.join(unique_names)
    json_value = TypedField.by(field).field_value_python_to_json(joined)
    return FieldValueDTO(field_value=json_value) if joined else None
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a choice value by running the field's classifier on the dependency values
    and applying per-value "unsure" probability thresholds.

    When the top class probability falls below the configured threshold (or the model
    predicts an out-of-range index) the field's unsure_choice_value is returned instead.

    :raises ClassifierModel.DoesNotExist: when no trained model exists for the field.
    """
    # If changing this code make sure you update similar code in notebooks/demo/Train and Debug Decision Tree...
    detected_value = cls.maybe_detect_with_stop_words(field, field_code_to_value)
    if detected_value is not None:
        return detected_value
    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)  # type: ClassifierModel
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
        model = obj['model']
        categories = obj['categories']
        category_probabilities = model.predict_proba([field_code_to_value])[0]
        # argmax over class probabilities
        target_index = max(range(len(category_probabilities)),
                           key=category_probabilities.__getitem__)
        target_probability = category_probabilities[target_index]
        predicted_value = categories[target_index] if 0 <= target_index < len(categories) else None
        if predicted_value is None:
            target_name = field.unsure_choice_value
        else:
            # per-value threshold falls back to the global default
            threshold = (field.unsure_thresholds_by_value or {}).get(predicted_value) \
                or DocumentField.DEFAULT_UNSURE_THRESHOLD
            target_name = predicted_value if target_probability >= threshold else field.unsure_choice_value
        value = TypedField.by(field).field_value_python_to_json(target_name)
        if classifier_model.store_suggestion:
            store_classification_suggestion(field, doc, value, target_probability)
        return FieldValueDTO(field_value=value)
    except ClassifierModel.DoesNotExist as e:
        # NOTE(review): this handler only re-raises; it could be removed without
        # changing behavior.
        raise e
def _maybe_save_reverse_similarity_value(self,
                                         log: ProcessLogger,
                                         field: DocumentField,
                                         document: Document,
                                         other_doc_id) -> bool:
    """Store the reverse similarity link (other_doc -> document) if not already
    stored, then refresh the other document's cached user fields.

    Fix: the method was annotated ``-> bool`` but fell off the end and returned
    None; it now returns True when a reverse value was written and False when the
    value already existed.
    """
    field_repo = DocumentFieldRepository()
    if field_repo.field_value_exists(other_doc_id, field.pk, [document.pk]):
        return False
    other_document = Document.all_objects.get(pk=other_doc_id)
    field_repo.update_field_value_with_dto(document=other_document,
                                           field=field,
                                           field_value_dto=FieldValueDTO(field_value=[document.pk]),
                                           merge=True)
    # keep the other document's cached field values in sync with the new link
    cache_document_fields(log=log,
                          document=other_document,
                          cache_system_fields=False,
                          cache_generic_fields=False,
                          cache_user_fields=[field.code])
    return True
def get_value(self, log: ProcessLogger, field: DocumentField, doc: Document,
              cur_field_code_to_value: Dict[str, Any]) \
        -> Optional[FieldValueDTO]:
    """Scan the document's text units in order and return the first value that
    find_value_in_text_unit() reports, annotated with that unit's location.

    Fixes: annotations were passed as a set literal (``{ant}``) while every other
    code path builds a list - a set also requires AnnotationDTO to be hashable;
    now a list is passed. An explicit ``return None`` is added for the not-found
    path instead of falling off the end.

    :raises Exception: when the field type is multi-value (unsupported here).
    """
    typed_field = TypedField.by(field)  # type: TypedField
    if typed_field.multi_value:
        raise Exception(f'Python coded field {self.__class__.__name__} supports only single-value field types and '
                        f'{typed_field.type_code} is multi-value')
    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')
    for text_unit in qs_text_units.iterator():  # type: TextUnit
        found, value = self.find_value_in_text_unit(log, field, doc, text_unit)
        if found:
            value = typed_field.field_value_python_to_json(value)
            ant = AnnotationDTO(annotation_value=value,
                                location_in_doc_start=text_unit.location_start,
                                location_in_doc_end=text_unit.location_end)
            return FieldValueDTO(field_value=value, annotations=[ant])
    return None
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value by matching the field's regexp detectors against each text
    unit, honoring the field's unit/char detection limit.

    :raises ValueExtractionFunctionThrownException: when the typed field's value
        extraction raises.
    :raises CaughtErrorWhileApplyingFieldDetector: when applying a detector to a text
        unit raises for any other reason.
    """
    text_unit_repo = cls.text_unit_repo
    field_detector_repo = cls.field_detector_repo
    log.debug('detect_field_value: regexps_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)
    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]
    typed_field = TypedField.by(field)  # type: TypedField
    ants = list()  # type: List[AnnotationDTO]
    units_counted = 0
    for text_unit in qs_text_units:  # type: TextUnit
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(
                field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break
        for field_detector in detectors:
            try:
                matching_piece = field_detector.matching_string(
                    text_unit.textunittext.text,
                    text_is_sentence=text_unit.is_sentence())
                if matching_piece is not None:
                    if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                        # skip matches whose offset falls past the char limit
                        if field.detect_limit_count < units_counted + matching_piece[1]:
                            continue
                    matching_string = matching_piece[0]
                    value = field_detector.get_validated_detected_value(field)
                    hint_name = None
                    if typed_field.requires_value:
                        hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        try:
                            value, hint_name = typed_field \
                                .get_or_extract_value(doc, value, hint_name, matching_string)
                        except Exception as e:
                            raise ValueExtractionFunctionThrownException(
                                f'Value extraction function has thrown an exception.\n'
                                f'Document: {doc.name} (#{doc.pk})\n'
                                f'Value: {value}\n'
                                f'Extraction hint: {hint_name}\n'
                                f'Matching string:\n'
                                f'{matching_string}') from e
                        if value is None:
                            continue
                    annotation_value = typed_field.annotation_value_python_to_json(value)
                    ant = AnnotationDTO(
                        annotation_value=annotation_value,
                        location_in_doc_start=text_unit.location_start,
                        location_in_doc_end=text_unit.location_end,
                        extraction_hint_name=hint_name)
                    if not isinstance(typed_field, MultiValueField):
                        # single-value field: first match wins
                        return FieldValueDTO(
                            field_value=ant.annotation_value,
                            annotations=[ant])
                    else:
                        ants.append(ant)
            except Exception as e:
                raise CaughtErrorWhileApplyingFieldDetector(
                    f'Exception caught while trying to apply field detector.\n'
                    f'Document: {doc.name} (#{doc.pk})\n'
                    f'Field detector: #{field_detector.detector.pk}\n'
                    f'{field_detector.detector.include_regexps}\n'
                    f'Text unit: #{text_unit.pk}\n'
                    f'{text_unit.text[:300]}') from e
        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)
    if not ants:
        return None
    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
        annotations=ants)
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value with an MLflow-served model, either on the whole document
    text or per text unit, depending on ``field.mlflow_detect_on_document_level``.

    Model input is the dependency values plus a 'text' column packed into a one-row
    DataFrame. The per-text-unit queryset is pre-reduced by the detection limit.

    :raises ValueError: when the model returns a value unsuitable for the field type.
    """
    ants: List[AnnotationDTO] = []
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo = cls.text_unit_repo
    if field.mlflow_detect_on_document_level:
        log.debug('detect_field_value: mlflow_field_detection on doc level, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        text = doc.text
        model_input = dict(field_code_to_value)
        model_input['text'] = text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if not value:
            return None
        hint_name = ValueExtractionHint.TAKE_FIRST.name
        value, hint_name = typed_field \
            .get_or_extract_value(doc, value, hint_name, text)
        if not value:
            return None
        return FieldValueDTO(field_value=value)
    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)
    log.debug('detect_field_value: mlflow_field_detection on text unit level, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')
    for text_unit in qs_text_units.iterator():
        model_input = dict(field_code_to_value)
        model_input['text'] = text_unit.text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if value is None:
            continue
        ant = None
        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field.get_or_extract_value(
                doc, value, hint_name, text_unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(
                    f'ML model of field {field.code} ({typed_field.type_code}) returned '
                    f'annotation value not suitable for this field:\n'
                    f'{value}')
            annotation_value = typed_field.annotation_value_python_to_json(value)
            ant = AnnotationDTO(
                annotation_value=annotation_value,
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(
                annotation_value=None,
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=None)
        if ant is None:
            continue
        ants.append(ant)
        if not isinstance(typed_field, MultiValueField):
            # single-value field: first detected annotation wins
            return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)
    if not ants:
        return None
    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
        annotations=ants)
def get_value(self,
              log: ProcessLogger,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any],
              location_text: Optional[str],
              location_start: int = 0,
              location_end: int = 0) -> Optional[FieldValueDTO]:
    """Find documents similar to ``doc`` by cosine similarity of feature vectors built
    from the field's depends-on fields, restricted to a date window around the document.

    Candidate documents are selected either by a configured date-constraint field or,
    absent one, by document creation history dates. For every sufficiently similar
    document the reverse link is stored as well. Returns the smallest similar doc pk
    wrapped in a FieldValueDTO, or None.
    """
    try:
        conf = getattr(field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: Optional[DocumentSimilarityConfig]
    except DocumentSimilarityConfig.DoesNotExist:
        conf = None
    if conf:
        conf.self_validate()
    similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
    feature_vector_fields = field.depends_on_fields.all()
    date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
    date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS
    document_type = doc.document_type
    feature_vector_field_codes = {f.code for f in feature_vector_fields}
    doc_field_values = dict(cur_field_code_to_value)
    doc_field_values[FIELD_CODE_DOC_ID] = doc.pk
    if date_constraint_field_code:
        # window the candidate docs by the configured date field value
        doc_date = doc_field_values.get(date_constraint_field_code)
        date_start = doc_date - timedelta(days=date_constraint_days)
        date_end = doc_date + timedelta(days=date_constraint_days)
        doc_ids_query = FieldValue.objects \
            .filter(field__code=date_constraint_field_code) \
            .filter(value__gte=date_start) \
            .filter(value__lte=date_end) \
            .filter(document__document_type_id=document_type.pk) \
            .exclude(document_id=doc.pk) \
            .values_list('document_id', flat=True)
    else:
        # no date field configured: window by document creation history instead
        doc_date = doc.history.last().history_date
        date_start = doc_date - timedelta(days=date_constraint_days)
        date_end = doc_date + timedelta(days=date_constraint_days)
        doc_ids_query = Document.history \
            .filter(history_type='+',
                    history_date__gte=date_start,
                    history_date__lte=date_end,
                    document_type_id=document_type.pk) \
            .exclude(id=doc.pk) \
            .values_list('pk', flat=True)
    try:
        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
        field_repo = DocumentFieldRepository()
        field_values_list = list()
        for doc_id, field_values in field_repo \
                .get_field_code_to_python_value_multiple_docs(document_type_id=document_type.pk,
                                                              doc_ids=doc_ids_query,
                                                              field_codes_only=feature_vector_field_codes):
            d = dict(field_values)
            d[FIELD_CODE_DOC_ID] = doc_id
            field_values_list.append(d)
        if not field_values_list:
            return None
        # row 0 of the feature matrix is the current document
        field_values_list = [doc_field_values] + field_values_list
        feature_vectors = vectorizer.fit_transform(field_values_list)
        doc_feature_vectors = feature_vectors[0]
    except ValueError as ve:
        if 'empty vocabulary' in str(ve):
            log.info(f'Similarity: {field.code}: Vectorization got "empty vocabulary" probably no one of the docs '
                     f'contains any value in the feature vector fields.')
            return None
        raise ve
    similarities = cosine_similarity(doc_feature_vectors, feature_vectors)
    # TODO: Think about removing usage of other_field_values_list here and switching it to generator
    # to avoid storing the list of all field values. We only need feature vectors but they have no doc id.
    res = set()  # type: Set[int]
    for y, field_values in enumerate(field_values_list):
        other_doc_pk = field_values[FIELD_CODE_DOC_ID]
        if doc.pk == other_doc_pk:
            continue
        similarity = similarities[0, y]
        if similarity < similarity_threshold:
            continue
        res.add(other_doc_pk)
        self._maybe_save_reverse_similarity_value(
            log=log, field=field, document=doc, other_doc_id=other_doc_pk)
    if res:
        field_value = sorted(res)[0]
        return FieldValueDTO(field_value)
    return None
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value with an MLflow-served model, either on the whole document
    text or per text unit, with manual tracking of the field's unit/char detection limit.

    Model input is the dependency values plus a 'text' column packed into a one-row
    DataFrame.

    :raises ValueError: when the model returns a value unsuitable for the field type.
    """
    typed_field = TypedField.by(field)  # type: TypedField
    if field.mlflow_detect_on_document_level:
        text = doc.text
        model_input = dict(field_code_to_value)
        model_input['text'] = text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if not value:
            return None
        hint_name = ValueExtractionHint.TAKE_FIRST.name
        value, hint_name = typed_field \
            .get_or_extract_value(doc, value, hint_name, text)
        if not value:
            return None
        return FieldValueDTO(field_value=value)
    ants = list()  # type: List[AnnotationDTO]
    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .select_related('textunittext') \
        .order_by('location_start', 'pk')
    units_counted = 0
    for text_unit in qs_text_units.iterator():
        if field.detect_limit_count:
            # stop scanning once the configured number of units has been passed
            units_counted = FieldDetectionStrategy.update_units_counted(
                field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break
        model_input = dict(field_code_to_value)
        model_input['text'] = text_unit.text
        model_input_df = pd.DataFrame([model_input])
        model_output = MLFlowModelManager().predict(field.mlflow_model_uri, model_input_df)
        value = model_output[0]
        if value is None:
            continue
        ant = None
        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field \
                .get_or_extract_value(doc, value, hint_name, text_unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(
                    f'ML model of field {field.code} ({typed_field.type_code}) returned '
                    f'annotation value not suitable for this field:\n'
                    f'{value}')
            annotation_value = typed_field.annotation_value_python_to_json(value)
            ant = AnnotationDTO(
                annotation_value=annotation_value,
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(
                annotation_value=None,
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=None)
        if ant is None:
            continue
        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            # char-based limit: the annotation must start inside the allowed prefix
            if ant.location_in_doc_start > field.detect_limit_count:
                break
        ants.append(ant)
        if not isinstance(typed_field, MultiValueField):
            # single-value field: first detected annotation wins
            return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)
        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)
    if not ants:
        return None
    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
        annotations=ants)