def predict_and_extract_value(cls,
                              sklearn_model: SkLearnClassifierModel,
                              typed_field: TypedField,
                              document: Document,
                              field: DocumentField,
                              text_unit: TextUnit) -> Optional[AnnotationDTO]:
    field_code, value, hint_name = cls.predict_value(sklearn_model, text_unit)
    if field_code == field.code:
        if typed_field.requires_value:
            hint_name = hint_name or ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field \
                .get_or_extract_value(document, value, hint_name, text_unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                                 f'annotation value not suitable for this field:\n'
                                 f'{value}')
            annotation_value = typed_field.annotation_value_python_to_json(value)
            return AnnotationDTO(annotation_value=annotation_value,
                                 location_in_doc_start=text_unit.location_start,
                                 location_in_doc_end=text_unit.location_end,
                                 extraction_hint_name=hint_name)
        else:
            return AnnotationDTO(annotation_value=None,
                                 location_in_doc_start=text_unit.location_start,
                                 location_in_doc_end=text_unit.location_end,
                                 extraction_hint_name=None)
    return None
def _validate_choice_values_removed(self, context: dict) -> None:
    saved_field = self._get_saved_field(context)
    if not saved_field or not TypedField.by(saved_field).is_choice_field \
            or not TypedField.by(self.object).is_choice_field:
        return

    err_msg = ''
    invalid_choices = self._get_invalid_choices(saved_field)

    if self._is_allow_values_not_specified_in_choices_was_unset(saved_field):
        err_msg += '"Allow values not specified in choices" flag is unset in the config being imported. '

    if invalid_choices:
        invalid_choices = ['"{0}"'.format(invalid_choice) for invalid_choice in invalid_choices]
        err_msg += 'The following choice values are missing in the config being imported: {0}. ' \
            .format(', '.join(invalid_choices))

    if err_msg:
        invalid_values_count = self.object.get_invalid_choice_annotations().count()
        user_values_count = 0
        detected_values_count = 0
        if invalid_values_count > 0:
            field_repo = DocumentFieldRepository()
            user_values_count = field_repo.get_invalid_choice_vals_count(self.object)
            detected_values_count = self._get_detected_values_count(invalid_values_count, user_values_count)
        err_msg += 'Number of invalid values: user entered values {0}, automatically detected values {1}.' \
                   ' You need to set the force auto-fixes option to continue (this option will remove all' \
                   ' invalid values) or make manual updates.'.format(user_values_count, detected_values_count)
        err_msg = 'Unable to update field #{0} "{1}". {2}'.format(self.pk, self.object.code, err_msg)
        raise ValidationError(err_msg)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    dto = cls.maybe_detect_with_stop_words(field, field_code_to_value)
    if dto is not None:
        return dto

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
        model = obj['model']
        categories = obj['categories']

        doc_field_vals = field_code_to_value
        predicted = model.predict([doc_field_vals])
        target_index = predicted[0]
        target_name = categories[target_index] if 0 <= target_index < len(categories) else None

        dto = FieldValueDTO(field_value=TypedField.by(field).field_value_python_to_json(target_name))
        return dto
    except ClassifierModel.DoesNotExist as e:
        raise e
def maybe_detect_with_stop_words(cls,
                                 field: DocumentField,
                                 doc: Document,
                                 cached_fields: Dict[str, Any]) -> Optional[FieldValueDTO]:
    if field.stop_words:
        depends_on_field_codes = list(field.depends_on_fields.all()
                                      .values_list('code', flat=True))  # type: List[str]
        depends_on_full_text = []
        if not any(cached_fields):
            return None
        for field_code in depends_on_field_codes:
            v = cached_fields.get(field_code)
            if v:
                depends_on_full_text.append(str(v))
        detected_with_stop_words, detected_field_value = \
            detect_with_stop_words_by_field_and_full_text(field=field,
                                                          doc=doc,
                                                          full_text='\n'.join(depends_on_full_text))
        if detected_with_stop_words:
            return FieldValueDTO(
                field_value=TypedField.by(field).field_value_python_to_json(detected_field_value))
    return None
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)  # type: PythonCodedField
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(field.python_coded_field))

    typed_field = TypedField.by(field)  # type: TypedField
    if python_coded_field.type != typed_field.type_code:
        raise RuntimeError(
            f'Python-coded field {python_coded_field.__class__.__name__} is '
            f'for fields of type {python_coded_field.type} and field {field.code} '
            f'is of type {typed_field.type_code}')

    log.debug('detect_field_value: python_coded_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    field_value_dto = python_coded_field.get_value(log=log,
                                                   field=field,
                                                   doc=doc,
                                                   cur_field_code_to_value=field_code_to_value)
    if not typed_field.is_json_field_value_ok(field_value_dto.field_value):
        raise ValueError(
            f'Python coded field class {field.python_coded_field} returned value not suitable for '
            f'field {field.code} ({typed_field.type_code})')

    return field_value_dto
def build_pipeline(cls,
                   field: DocumentField,
                   depends_on_fields: List[Tuple[str, str]]) -> Tuple[Pipeline, List[Callable[[], List[str]]]]:
    transformer_list = []
    feature_names_funcs = []

    fields = list(DocumentField.objects.filter(code__in={code for code, _type in depends_on_fields}))
    fields_by_code = {f.code: f for f in fields}

    for field_code, field_type in sorted(depends_on_fields, key=lambda t: t[1]):  # type: str, str
        field_type = TypedField.by(fields_by_code[field_code])  # type: TypedField
        field_vect_steps = [('sel', FieldValueExtractor(field_code))]

        field_vect_pipeline, field_feature_names_func = field_type.build_vectorization_pipeline()
        field_vect_steps.extend(field_vect_pipeline)

        transformer_list.append((field_code, Pipeline(field_vect_steps)))
        feature_names_funcs.append(wrap_feature_names_with_field_code(field_feature_names_func, field_code))

    classifier = cls.init_classifier(field)

    return Pipeline([('vect', FeatureUnion(transformer_list)),
                     ('clf', classifier)]), feature_names_funcs
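# The following is a minimal, self-contained sketch of the pattern build_pipeline() uses above:
# one vectorization sub-pipeline per dependent field, combined via a FeatureUnion and followed by
# a classifier. It is an illustration only, not the project's API; the field codes, toy data and
# the DictFieldExtractor helper are hypothetical stand-ins for FieldValueExtractor and the
# per-type vectorization pipelines.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.tree import DecisionTreeClassifier


class DictFieldExtractor(BaseEstimator, TransformerMixin):
    """Selects one field's value from a dict of field_code -> value (hypothetical helper)."""

    def __init__(self, field_code: str):
        self.field_code = field_code

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Convert each sample's field value to a string so a text vectorizer can consume it.
        return [str(row.get(self.field_code) or '') for row in X]


if __name__ == '__main__':
    samples = [{'party': 'ACME Corp', 'term': 'two years'},
               {'party': 'Globex LLC', 'term': 'six months'}]
    labels = ['long', 'short']

    # One sub-pipeline per dependent field, exactly as build_pipeline() assembles transformer_list.
    union = FeatureUnion([
        (code, Pipeline([('sel', DictFieldExtractor(code)),
                         ('vect', CountVectorizer())]))
        for code in ('party', 'term')
    ])
    pipeline = Pipeline([('vect', union), ('clf', DecisionTreeClassifier())])
    pipeline.fit(samples, labels)
    print(pipeline.predict([{'party': 'ACME Corp', 'term': 'three years'}]))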
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                             depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()
        typed_field = TypedField.by(field)

        ants = list()  # type: List[AnnotationDTO]

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        units_counted = 0
        for text_unit in qs_text_units.iterator():
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text_unit=text_unit)
            if ant is None:
                continue

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                if ant.location_in_doc_start > field.detect_limit_count:
                    break

            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        return FieldValueDTO(field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def pack_parsed_value(cls,
                      typed_field: TypedField,
                      value: Any,
                      loc_start: int,
                      loc_end: int):
    value = typed_field.field_value_python_to_json(value)
    ant = AnnotationDTO(annotation_value=value,
                        location_in_doc_start=loc_start,
                        location_in_doc_end=loc_end)
    return FieldValueDTO(field_value=value, annotations=[ant])
def join_field_detector_model_tests(task: ExtendedTask,
                                    field_uid,
                                    classifier_model_id):
    results = list(Task.objects
                   .filter(main_task_id=task.request.parent_id,
                           name=TrainAndTest.test_field_detector_model.name)
                   .values_list('result', flat=True))

    test_text_units_number = 0
    match_number = 0
    test_doc_number = 0
    matches_per_value = dict()
    total_per_value = dict()

    for res in results:
        actual_field_value = res.get('actual_field_value')
        if actual_field_value:
            if actual_field_value not in total_per_value:
                total_per_value[actual_field_value] = 0
            total_per_value[actual_field_value] += 1

        test_doc_number += 1
        test_text_units_number += (res.get('text_units_number') or 0)

        if res.get('value_matches_expected'):
            match_number += 1
            if actual_field_value:
                if actual_field_value not in matches_per_value:
                    matches_per_value[actual_field_value] = 0
                matches_per_value[actual_field_value] += 1

    accuracy = match_number / test_doc_number

    if classifier_model_id:
        classifier_model = ClassifierModel.objects.get(pk=classifier_model_id)
        classifier_model.field_detection_accuracy = accuracy
        classifier_model.save()

    field = DocumentField.objects.get(pk=field_uid)

    task.log_info('Testing finished.\n'
                  'Document type: {0}.\n'
                  'Field: {1}.\n'
                  'Text unit type: {2}.\n'
                  'Test documents number: {3}.\n'
                  'Test text units number: {4}.\n'
                  'Accuracy: {5}.\n'
                  .format(field.document_type.code,
                          field.code,
                          field.text_unit_type,
                          test_doc_number,
                          test_text_units_number,
                          accuracy))

    if TypedField.by(field).is_choice_field:
        accuracy_per_value = {actual_field_value: (matches_per_value.get(actual_field_value) or 0) / total
                              for actual_field_value, total in total_per_value.items()}
        task.log_info('Accuracy per value:\n{0}'.format(
            json.dumps(accuracy_per_value, sort_keys=True, indent=2)))
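# A small, self-contained sketch of the aggregation performed above, stripped of the Celery/task
# machinery: given per-document test result dicts (keys mirror the ones used above), compute the
# overall accuracy and per-value accuracy. The sample data is made up for illustration.
import json


def aggregate_test_results(results):
    total, matches = 0, 0
    total_per_value, matches_per_value = {}, {}
    for res in results:
        total += 1
        actual = res.get('actual_field_value')
        if actual:
            total_per_value[actual] = total_per_value.get(actual, 0) + 1
        if res.get('value_matches_expected'):
            matches += 1
            if actual:
                matches_per_value[actual] = matches_per_value.get(actual, 0) + 1
    accuracy = matches / total if total else 0.0
    per_value = {v: matches_per_value.get(v, 0) / n for v, n in total_per_value.items()}
    return accuracy, per_value


if __name__ == '__main__':
    sample = [{'actual_field_value': 'NDA', 'value_matches_expected': True},
              {'actual_field_value': 'NDA', 'value_matches_expected': False},
              {'actual_field_value': 'MSA', 'value_matches_expected': True}]
    accuracy, per_value = aggregate_test_results(sample)
    print(accuracy, json.dumps(per_value, sort_keys=True, indent=2))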
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
    formula = field.formula

    if not formula:
        raise ValueError(f'No formula specified for field {field.code} (#{field.uid})')

    depends_on_field_codes = field.get_depends_on_codes() or set()
    field_code_to_value = {c: v for c, v in field_code_to_value.items() if c in depends_on_field_codes}

    if field.stop_words:
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        log.debug('detect_field_value: formula_based_field_detection, checking stop words, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or list()
    else:
        log.debug('detect_field_value: formula_based_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')

    v = cls.calc_formula(field_code=field.code,
                         formula=formula,
                         depends_on_field_to_value=field_code_to_value,
                         convert_decimals_to_floats=field.convert_decimals_to_floats_in_formula_args)
    typed_field = TypedField.by(field)

    # We don't accept formulas returning values of the wrong type to avoid further confusion and
    # the creation of wrong formulas in the future.
    # For example, for multi-choice fields the formula should return a list and not a string
    # to ensure the admin understands that this value will replace the whole set/list of strings
    # and not just add one more string to the value.
    if typed_field.is_choice_field and typed_field.multi_value:
        if v and isinstance(v, str):
            # An "outdated" formula is incorrect and returns a string instead of a set/list, but we
            # don't warn the user: when they update this formula (or other detection method) they
            # will be forced to write code returning a list or set.
            v = [v]

    if not typed_field.is_python_field_value_ok(v):
        raise ValueError(f'Formula of field {field.code} returned value not suitable for this field:\n{v}')
    v = typed_field.field_value_python_to_json(v)
    return FieldValueDTO(field_value=v)
def get_value(self,
              log,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    v = PartyUsage.objects.filter(text_unit__document_id=doc.id) \
        .aggregate(value=StringAgg('party__name', delimiter=', ', distinct=True))
    value = TypedField.by(field).field_value_python_to_json(v['value'])
    # aggregate() always returns a dict, so check the aggregated value itself, not the dict.
    return FieldValueDTO(field_value=value) if v['value'] else None
def get_value(self,
              log,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    for v in CurrencyUsage.objects.filter(text_unit__document_id=doc.id) \
            .order_by('-amount') \
            .values('currency', 'amount'):
        v = TypedField.by(field).field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
    return None
def get_value(self,
              log,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    for v in DateUsage.objects.filter(text_unit__document_id=doc.id) \
            .order_by('-date') \
            .values_list('date', flat=True):
        v = TypedField.by(field).field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
    return None
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    depends_on_fields = field.get_depends_on_codes()
    field_code_to_value = {c: v for c, v in field_code_to_value.items() if c in depends_on_fields}

    if field.stop_words:
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        detected_with_stop_words, detected_value \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    typed_field = TypedField.by(field)  # type: TypedField

    values = list()  # type: List

    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)
        for detector_field_matcher in detectors:  # type: DetectorFieldMatcher
            matching_piece = detector_field_matcher.matching_string(depends_on_value, text_is_sentence=False)
            if matching_piece is not None:
                matching_string = matching_piece[0]
                value = detector_field_matcher.get_validated_detected_value(field)
                if typed_field.requires_value:
                    hint_name = detector_field_matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = typed_field \
                        .get_or_extract_value(doc, value, hint_name, matching_string)
                    if value is None:
                        continue
                value = typed_field.annotation_value_python_to_json(value)
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=value)
                else:
                    values.append(value)

    if isinstance(typed_field, MultiValueField):
        return FieldValueDTO(field_value=typed_field.build_json_field_value_from_json_ant_values(values))
    else:
        return None
def check_field_values(task: ExtendedTask,
                       field_value_ids: List[int],
                       delete_broken: bool = False):
    import apps.document.repository.document_field_repository as dfr

    field_repo = dfr.DocumentFieldRepository()
    qa_field_values = field_repo.get_field_values_by_ids(field_value_ids)

    for fv in qa_field_values:  # FieldAnnotation
        field = fv.field  # type: DocumentField
        if not TypedField.by(field).is_json_field_value_ok(fv.value):
            FindBrokenDocumentFieldValues.process_broken(task, fv, delete_broken)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    log.debug('detect_field_value: regexps_and_text_based_ml_field_value, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    ants: List[AnnotationDTO] = []
    text_unit_repo = cls.text_unit_repo
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)

    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()

        for text_unit in qs_text_units.iterator():  # type: TextUnit
            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text=text_unit.text,
                                                location_start=text_unit.location_start,
                                                location_end=text_unit.location_end)
            if ant is None:
                continue
            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

        if not ants:
            return None

        return FieldValueDTO(field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def validate(dst_field: DocumentField,
             date_constraint_field: DocumentField,
             date_constraint_days: int) -> Optional[List[Tuple[str, str]]]:
    document_type = dst_field.document_type
    feature_vector_fields = dst_field.depends_on_fields.all()

    res = list()

    if date_constraint_days is not None and date_constraint_days < 1:
        res.append((ATTR_DATE_CONSTRAINT_DAYS,
                    'Date constraint days should be either empty or a positive integer.'))

    if date_constraint_field is not None and date_constraint_field.document_type_id != document_type.pk:
        res.append((ATTR_DATE_CONSTRAINT_FIELD,
                    'Date constraint field should be owned by the same document type '
                    'as the destination field.'))

    if date_constraint_field is not None and date_constraint_days is None:
        res.append((ATTR_DATE_CONSTRAINT_DAYS,
                    'Date constraint days number should not be empty if the date '
                    'constraint field is assigned.'))

    if date_constraint_field is not None:
        example_value = TypedField.by(date_constraint_field).example_python_value()
        if not isinstance(example_value, (date, datetime)):
            res.append((ATTR_DATE_CONSTRAINT_FIELD,
                        'Type of the date constraint field should be date or datetime'))

    if dst_field.type != LinkedDocumentsField.code:
        res.append((ATTR_DST_FIELD,
                    'Destination field should be of type {0}'.format(LinkedDocumentsField.code)))

    if not feature_vector_fields:
        res.append((ATTR_FEATURE_VECTOR_FIELDS,
                    'Feature vector fields list can not be empty.'))

    wrong_doc_type_fields = list()
    for f in feature_vector_fields:
        if f.document_type_id != dst_field.document_type_id:
            wrong_doc_type_fields.append(f.code)
    if wrong_doc_type_fields:
        res.append((ATTR_FEATURE_VECTOR_FIELDS,
                    mark_safe('''All feature vector fields should be owned by the same document type
as the destination field: {dst_field_type}.<br />
The following fields are owned by different document type(s):<br />{bad_fields}'''
                              .format(bad_fields='<br />'.join(wrong_doc_type_fields),
                                      dst_field_type=document_type.code))))

    return res or None
def get_value(self,
              log,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    for date, start, end in DateUsage.objects.filter(text_unit__document_id=doc.pk) \
            .order_by('-date') \
            .values_list('date', 'text_unit__location_start', 'text_unit__location_end'):
        v = TypedField.by(field).field_value_python_to_json(date)
        return FieldValueDTO(field_value=v)
    return None
def get_value(self,
              log,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    # values_list() is required here: the loop unpacks tuples, while values() would yield dicts.
    for curx, amt, start, end in \
            CurrencyUsage.objects.filter(text_unit__document_id=doc.pk) \
                .order_by('-amount') \
                .values_list('currency', 'amount',
                             'text_unit__location_start', 'text_unit__location_end'):
        v = TypedField.by(field).field_value_python_to_json((curx, amt,))
        return FieldValueDTO(field_value=v)
    return None
def detect_with_stop_words_by_field_and_full_text(field: DocumentField,
                                                  full_text: str) -> Tuple[bool, Any]:
    if field.requires_text_annotations:
        return False, None
    stop_words = compile_stop_words(field.stop_words)
    if not stop_words:
        return False, None
    typed_field = TypedField.by(field)  # type: TypedField
    detected, possible_value = detect_value_with_stop_words(stop_words, full_text)
    if not detected:
        return False, None
    if possible_value is None:
        return True, None
    else:
        possible_value = typed_field.extract_from_possible_value_text(possible_value)
        return True, possible_value
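# A generic sketch of stop-word based detection. It assumes, for illustration only, that a field's
# stop words compile to a mapping of regular expression -> value to return on a match; this is a
# hypothetical stand-in for the project's compile_stop_words()/detect_value_with_stop_words(), not
# their actual implementation.
import re
from typing import Any, Dict, Optional, Tuple


def detect_value_with_stop_words_sketch(stop_words: Dict[str, Any],
                                        text: str) -> Tuple[bool, Optional[Any]]:
    for pattern, value in stop_words.items():
        if re.search(pattern, text, flags=re.IGNORECASE):
            # Detected; the associated value may itself be None ("detected, but no value").
            return True, value
    return False, None


if __name__ == '__main__':
    rules = {r'not\s+applicable': None, r'evergreen': 'perpetual'}
    print(detect_value_with_stop_words_sketch(rules, 'The term is evergreen.'))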
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    try:
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
    except AttributeError:
        pass

    ants: List[AnnotationDTO] = []
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo: TextUnitRepository = cls.text_unit_repo
    field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)

    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    for text_unit in qs_text_units:
        unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
        if not unit_ants:
            continue
        if not isinstance(typed_field, MultiValueField):
            return FieldValueDTO(field_value=unit_ants[0].annotation_value, annotations=unit_ants)
        else:
            ants += unit_ants

    if not ants:
        return None

    if isinstance(typed_field, MultiValueField):
        field_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants])
    else:
        field_value = typed_field.annotation_value_python_to_json(ants[0].annotation_value)

    return FieldValueDTO(field_value=field_value, annotations=ants)
def has_problems_with_field(cls, field: DocumentField) -> Optional[str]:
    if not field.mlflow_model_uri:
        return f'MLFlow model uri is not set for field {field.code}'
    try:
        output = cls.test_model(field.mlflow_model_uri)
        output = output[0] if output is not None else None
        tf = TypedField.by(field)
        if not tf.is_python_field_value_ok(output):
            return f'MLFlow model returned value which does not match the field type.\n' \
                   f'Returned value (shortened up to 100 chars): {str(output)[:100]}.\n' \
                   f'Example value: {tf.example_python_value()}.'
        return None
    except Exception as e:
        from apps.common.errors import render_error
        # Return the rendered error so the caller sees the problem description instead of None.
        return render_error('MLFlow model has thrown exception when testing '
                            '(input = 1-row DataFrame with text = "Hello World")', e)
def get_value(self,
              log,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    party_query = PartyUsage.objects.filter(text_unit__document_id=doc.pk)
    party_values = party_query.values_list('party__name',
                                           'text_unit__location_start',
                                           'text_unit__location_end')
    party_names = set()
    for name, start, end in party_values:
        party_names.add(name)
    names = ', '.join(party_names)
    value = TypedField.by(field).field_value_python_to_json(names)
    return FieldValueDTO(field_value=value) if names else None
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    # If changing this code make sure you update similar code in notebooks/demo/Train and Debug Decision Tree...
    detected_value = cls.maybe_detect_with_stop_words(field, field_code_to_value)
    if detected_value is not None:
        return detected_value

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)  # type: ClassifierModel
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
        model = obj['model']
        categories = obj['categories']

        category_probabilities = model.predict_proba([field_code_to_value])[0]
        target_index = max(range(len(category_probabilities)), key=category_probabilities.__getitem__)
        target_probability = category_probabilities[target_index]

        predicted_value = categories[target_index] if 0 <= target_index < len(categories) else None

        if predicted_value is None:
            target_name = field.unsure_choice_value
        else:
            threshold = (field.unsure_thresholds_by_value or {}).get(predicted_value) \
                        or DocumentField.DEFAULT_UNSURE_THRESHOLD
            target_name = predicted_value if target_probability >= threshold else field.unsure_choice_value

        value = TypedField.by(field).field_value_python_to_json(target_name)
        if classifier_model.store_suggestion:
            store_classification_suggestion(field, doc, value, target_probability)

        return FieldValueDTO(field_value=value)
    except ClassifierModel.DoesNotExist as e:
        raise e
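# Self-contained sketch of the "unsure" thresholding used above: pick the most probable category,
# but fall back to an explicit unsure value when the winning probability is below the per-value
# threshold. The classifier, thresholds and toy training data below are hypothetical; the project
# applies the same logic to its own trained model and field settings.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

UNSURE = '__unsure__'
DEFAULT_THRESHOLD = 0.9


def predict_with_unsure(model, categories, sample, thresholds_by_value=None):
    probs = model.predict_proba([sample])[0]
    target_index = max(range(len(probs)), key=probs.__getitem__)
    predicted = categories[target_index]
    threshold = (thresholds_by_value or {}).get(predicted, DEFAULT_THRESHOLD)
    return predicted if probs[target_index] >= threshold else UNSURE


if __name__ == '__main__':
    texts = ['termination for convenience', 'termination for cause',
             'governing law of New York', 'governing law of Delaware']
    labels = ['termination', 'termination', 'law', 'law']
    model = Pipeline([('vect', TfidfVectorizer()), ('clf', LogisticRegression())])
    model.fit(texts, labels)
    categories = list(model.named_steps['clf'].classes_)
    print(predict_with_unsure(model, categories, 'governing law of Texas',
                              thresholds_by_value={'law': 0.4}))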
def load_field_values(task: ExtendedTask,
                      document: Document,
                      document_fields_alias_to_value: Dict[str, Any]) -> Dict[DocumentField, FieldValueDTO]:
    document_type = document.document_type
    fields_to_values = dict()  # type: Dict[DocumentField, FieldValueDTO]

    if not document_type:
        return fields_to_values

    field_code_aliases = document_type.field_code_aliases
    field_codes_to_fields = {f.code.lower(): f for f in document_type.fields.all()}

    if field_code_aliases:
        field_codes_to_fields.update({field_alias.lower(): field_codes_to_fields.get(field_code.lower())
                                      for field_alias, field_code in field_code_aliases.items()
                                      if field_alias and field_code})

    for field_alias, field_value_text in document_fields_alias_to_value.items():
        if field_value_text is None:
            continue

        field = field_codes_to_fields.get(field_alias.lower())  # type: DocumentField
        if not field:
            task.log_warn(
                'Field alias "{0}" not found for document type {1}'.format(field_alias, document_type.code))
            continue

        typed_field = TypedField.by(field)  # type: TypedField

        if type(field_value_text) is list:
            for possible_value_text in list(field_value_text):
                maybe_value = typed_field.extract_from_possible_value_text(possible_value_text)
                if maybe_value:
                    maybe_value = typed_field.field_value_python_to_json(maybe_value)
                    fields_to_values[field] = FieldValueDTO(field_value=maybe_value)
                    break
        else:
            maybe_value = typed_field.extract_from_possible_value_text(field_value_text)
            if maybe_value:
                maybe_value = typed_field.field_value_python_to_json(maybe_value)
                fields_to_values[field] = FieldValueDTO(field_value=maybe_value)

    return fields_to_values
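# Pure-Python sketch of the alias handling above: map case-insensitive aliases to canonical field
# codes, and keep the first value that parses. The parser callback and sample data are hypothetical
# stand-ins for TypedField.extract_from_possible_value_text() and real document metadata.
from typing import Any, Callable, Dict, Iterable, List, Optional, Union


def resolve_aliases(field_codes: Iterable[str], aliases: Dict[str, str]) -> Dict[str, str]:
    mapping = {code.lower(): code for code in field_codes}
    for alias, code in aliases.items():
        if alias and code and code.lower() in mapping:
            mapping[alias.lower()] = mapping[code.lower()]
    return mapping


def first_parseable(values: Union[Any, List[Any]],
                    parse: Callable[[Any], Optional[Any]]) -> Optional[Any]:
    values = values if isinstance(values, list) else [values]
    for text in values:
        parsed = parse(text)
        if parsed is not None:
            return parsed
    return None


if __name__ == '__main__':
    alias_to_code = resolve_aliases(['effective_date', 'party'], {'Start Date': 'effective_date'})
    print(alias_to_code['start date'])
    # Keep the first value that looks like a date (toy parser: starts with 4 digits).
    print(first_parseable(['n/a', '2021-05-01'], lambda t: t if t[:4].isdigit() else None))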
def document_feature_vector_pipeline(feature_vector_fields: List[DocumentField],
                                     use_field_codes: bool = False) -> FeatureUnion:
    transformer_list = []
    for field in sorted(feature_vector_fields, key=lambda f: f.pk):  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_vect_steps = [('sel', FieldValueExtractor(field.code if use_field_codes else field.pk))]

        field_vect_pipeline, _field_feature_names_func = typed_field.build_vectorization_pipeline()
        field_vect_steps.extend(field_vect_pipeline)

        transformer_list.append((field.code, Pipeline(field_vect_steps)))

    return FeatureUnion(transformer_list)
def get_value(self,
              log: ProcessLogger,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    typed_field = TypedField.by(field)  # type: TypedField
    if typed_field.multi_value:
        raise Exception(f'Python coded field {self.__class__.__name__} supports only single-value field types and '
                        f'{typed_field.type_code} is multi-value')

    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    for text_unit in qs_text_units.iterator():  # type: TextUnit
        found, value = self.find_value_in_text_unit(log, field, doc, text_unit)
        if found:
            value = typed_field.field_value_python_to_json(value)
            ant = AnnotationDTO(annotation_value=value,
                                location_in_doc_start=text_unit.location_start,
                                location_in_doc_end=text_unit.location_end)
            # annotations is a list of AnnotationDTO, consistent with pack_parsed_value() above.
            return FieldValueDTO(field_value=value, annotations=[ant])
    return None
def calc_formula(cls,
                 field_code: str,
                 formula: str,
                 depends_on_field_to_value: Dict[str, Any],
                 convert_decimals_to_floats: bool = False) -> Any:
    if not formula or not formula.strip():
        return None

    if convert_decimals_to_floats:
        depends_on_field_to_value = \
            {c: TypedField.replace_decimals_with_floats_in_python_value_of_any_type(v)
             for c, v in depends_on_field_to_value.items()} \
            if depends_on_field_to_value else {}

    try:
        if settings.DEBUG_SLOW_DOWN_FIELD_FORMULAS_SEC:
            import time
            time.sleep(settings.DEBUG_SLOW_DOWN_FIELD_FORMULAS_SEC)
        return eval_script(script_title=f'{field_code} formula',
                           script_code=formula,
                           eval_locals=depends_on_field_to_value)
    except ScriptError as se:
        raise DocumentFieldFormulaError(field_code, formula, depends_on_field_to_value) from se
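# Minimal sketch of formula evaluation over dependent field values. The project routes this through
# eval_script() with sandboxing and error wrapping; this stripped-down, hypothetical version only
# illustrates the core idea of exposing field values as local names to a Python expression.
def calc_formula_sketch(formula: str, field_code_to_value: dict):
    # eval() with empty builtins is enough for a demo; it is not a real sandbox.
    return eval(formula, {'__builtins__': {}}, dict(field_code_to_value))


if __name__ == '__main__':
    values = {'base_rent': 1000.0, 'months': 12}
    print(calc_formula_sketch('base_rent * months if months else None', values))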
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    text_unit_repo = cls.text_unit_repo
    field_detector_repo = cls.field_detector_repo

    log.debug('detect_field_value: regexps_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)

    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    typed_field = TypedField.by(field)  # type: TypedField

    ants = list()  # type: List[AnnotationDTO]
    units_counted = 0

    for text_unit in qs_text_units:  # type: TextUnit
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break

        for field_detector in detectors:
            try:
                matching_piece = field_detector.matching_string(text_unit.textunittext.text,
                                                                text_is_sentence=text_unit.is_sentence())
                if matching_piece is not None:
                    if field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                        if field.detect_limit_count < units_counted + matching_piece[1]:
                            continue
                    matching_string = matching_piece[0]
                    value = field_detector.get_validated_detected_value(field)
                    hint_name = None
                    if typed_field.requires_value:
                        hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        try:
                            value, hint_name = typed_field \
                                .get_or_extract_value(doc, value, hint_name, matching_string)
                        except Exception as e:
                            raise ValueExtractionFunctionThrownException(
                                f'Value extraction function has thrown an exception.\n'
                                f'Document: {doc.name} (#{doc.pk})\n'
                                f'Value: {value}\n'
                                f'Extraction hint: {hint_name}\n'
                                f'Matching string:\n'
                                f'{matching_string}') from e
                        if value is None:
                            continue
                    annotation_value = typed_field.annotation_value_python_to_json(value)
                    ant = AnnotationDTO(annotation_value=annotation_value,
                                        location_in_doc_start=text_unit.location_start,
                                        location_in_doc_end=text_unit.location_end,
                                        extraction_hint_name=hint_name)
                    if not isinstance(typed_field, MultiValueField):
                        return FieldValueDTO(field_value=ant.annotation_value, annotations=[ant])
                    else:
                        ants.append(ant)
            except Exception as e:
                raise CaughtErrorWhileApplyingFieldDetector(
                    f'Exception caught while trying to apply field detector.\n'
                    f'Document: {doc.name} (#{doc.pk})\n'
                    f'Field detector: #{field_detector.detector.pk}\n'
                    f'{field_detector.detector.include_regexps}\n'
                    f'Text unit: #{text_unit.pk}\n'
                    f'{text_unit.text[:300]}') from e

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)

    if not ants:
        return None

    return FieldValueDTO(field_value=typed_field.build_json_field_value_from_json_ant_values(
        [a.annotation_value for a in ants]),
        annotations=ants)
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as
    Document.field_value. These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param ignore_field_codes:
    :param document_initial_load:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :param skip_modified_values: don't overwrite field values overwritten by user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr

    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding further dependencies,
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type: List[str]
    detection_errors = []  # type: List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk, list(skip_codes), updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(
                log=log, doc=document, field=field, field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value detected for '{field.code}'")
                continue

            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)
            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only a few requested fields and trying to comply with the
            # dependency tree makes no big sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)

        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
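# Self-contained sketch of the dependency handling used above. order_field_detection() and
# get_dependent_fields() are the project's own helpers; these hypothetical, illustrative versions
# topologically sort field codes so each field is detected after the fields it depends on, and
# find the transitive dependents of a changed field.
from typing import List, Set, Tuple


def order_fields(fields_and_deps: List[Tuple[str, Set[str]]]) -> List[str]:
    deps = {code: set(d) for code, d in fields_and_deps}
    ordered, visited, visiting = [], set(), set()

    def visit(code: str):
        if code in visited or code not in deps:
            return
        if code in visiting:
            raise ValueError(f'Cyclic field dependency involving {code}')
        visiting.add(code)
        for dep in deps[code]:
            visit(dep)
        visiting.discard(code)
        visited.add(code)
        ordered.append(code)

    for code, _ in fields_and_deps:
        visit(code)
    return ordered


def dependents_of(fields_and_deps: List[Tuple[str, Set[str]]], changed: Set[str]) -> Set[str]:
    result = set(changed)
    previous_size = -1
    # Keep expanding until no new dependents are found (transitive closure).
    while previous_size != len(result):
        previous_size = len(result)
        for code, dep_codes in fields_and_deps:
            if dep_codes & result:
                result.add(code)
    return result


if __name__ == '__main__':
    graph = [('total', {'rate', 'qty'}), ('rate', set()), ('qty', set()), ('tax', {'total'})]
    print(order_fields(graph))
    print(dependents_of(graph, {'rate'}))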