def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values of a python-coded field in a document.

    The implementation registered under ``field.python_coded_field`` is run
    either on every text unit of type ``field.text_unit_type`` (when the
    implementation has ``by_sentence`` set) or once on ``doc.full_text``.
    In the full-text mode each reported location is attached to the
    'sentence' text unit containing its start and converted to offsets
    relative to that unit.

    When the field type is not multi-value and the field is not a choice
    field, the result is returned right after the first value is collected.

    Raises:
        RuntimeError: the python-coded field is not registered, or a value
            found in the full text starts outside every sentence text unit.
    """
    coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(field.python_coded_field)  # type: PythonCodedField
    if not coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(field.python_coded_field))

    type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType
    found = []  # type: List[DetectedFieldValue]

    if coded_field.by_sentence:
        units = TextUnit.objects.filter(document=doc)
        units = units.filter(unit_type=field.text_unit_type)
        units = units.order_by('location_start', 'pk')
        for unit in units.iterator():
            for value, start, end in coded_field.get_values(unit.text) or []:
                found.append(DetectedFieldValue(field, value, unit, None, start, end))
                if not type_adapter.multi_value and not field.is_choice_field():
                    return found
        return found

    for value, abs_start, abs_end in coded_field.get_values(doc.full_text) or []:
        # Find the sentence which contains the start of the detected location.
        unit = TextUnit.objects.filter(
            document=doc,
            unit_type='sentence',
            location_start__lte=abs_start,
            location_end__gte=abs_start).first()  # type: TextUnit
        if not unit:
            raise RuntimeError(
                'Python coded field {0} detected a value in document {1} at '
                'location [{2};{3}] but the start of location does not belong to any '
                'text unit object in DB.\n'
                'It can not be. Something is broken.'.format(
                    field.python_coded_field, doc, abs_start, abs_end))

        # Translate document-level offsets into offsets inside the text unit.
        length = abs_end - abs_start
        rel_start = abs_start - unit.location_start
        rel_end = rel_start + length
        found.append(DetectedFieldValue(field, value, unit, None, rel_start, rel_end))
        if not type_adapter.multi_value and not field.is_choice_field():
            return found

    return found
def detect_field_values(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Calculate the value of a formula-based field from the fields it depends on.

    This method assumes that field detection already goes in the required
    order and dependencies of this field are already calculated / detected.

    The not-removed-by-user values of the fields listed in
    ``field.depends_on_fields`` are loaded from the document and merged into a
    ``{field code: value}`` dict (``None`` for codes without a merged value).
    If the field has stop words, the stringified values joined with newlines
    are first passed to ``detect_with_stop_words_by_field_and_full_text`` and
    its result is returned when it reports a detection.  Otherwise the formula
    is evaluated via ``cls.calc_formula``.

    Raises:
        ValueError: the field has no formula.
    """
    formula = field.formula
    if not formula:
        raise ValueError('No formula specified for field {0} (#{1})'.format(field.code, field.uid))

    dependencies = list(field.depends_on_fields.all())
    stored_values = doc.documentfieldvalue_set.filter(removed_by_user=False)
    stored_values = stored_values.filter(field__in=dependencies)
    merged = merge_document_field_values_to_python_value(list(stored_values))
    values_by_code = {dep.code: merged.get(dep.code) for dep in dependencies}

    if field.stop_words:
        joined_text = '\n'.join(str(item) for item in values_by_code.values())
        stop_word_hit, stop_word_values = \
            detect_with_stop_words_by_field_and_full_text(field, joined_text)
        if stop_word_hit:
            return stop_word_values or list()

    calculated = cls.calc_formula(field.code, field.type, formula, values_by_code)
    return [DetectedFieldValue(field, calculated)]
def predict_and_extract_value(cls, sklearn_model: SkLearnClassifierModel,
                              field_type_adapter: FieldType,
                              document: Document,
                              field: DocumentField,
                              text_unit: TextUnit) -> Optional[DetectedFieldValue]:
    """Run the model on a text unit and build a detected value for ``field``.

    ``cls.predict_value`` returns a field uid, a value and a hint name.
    If the predicted uid is not ``field.uid`` the result is ``None``.
    For value-aware field types the value and hint are resolved through
    ``field_type_adapter.get_or_extract_value`` (the hint defaults to
    ``ValueExtractionHint.TAKE_FIRST``); for other field types a
    ``DetectedFieldValue`` with a ``None`` value is returned.
    """
    predicted_uid, predicted_value, hint = cls.predict_value(sklearn_model, text_unit)

    if predicted_uid != field.uid:
        return None

    if not field_type_adapter.value_aware:
        return DetectedFieldValue(field, None, text_unit)

    hint = hint or ValueExtractionHint.TAKE_FIRST.name
    predicted_value, hint = field_type_adapter.get_or_extract_value(
        document, field, predicted_value, hint, text_unit.text)
    return DetectedFieldValue(field, predicted_value, text_unit, hint)
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Calculate the value of a formula-based field from the fields it depends on.

    This method assumes that field detection already goes in the required
    order and dependencies of this field are already calculated / detected.

    The not-removed-by-user values of the fields listed in
    ``field.depends_on_fields`` are merged into a ``{field code: value}`` dict
    (``None`` for codes without a merged value) and passed to
    ``cls.calc_formula`` together with the formula.

    Raises:
        ValueError: the field has no formula.
    """
    formula = field.formula
    if not formula:
        raise ValueError('No formula specified for field {0} (#{1})'.format(field.code, field.uid))

    dependencies = list(field.depends_on_fields.all())
    stored_values = doc.documentfieldvalue_set.filter(removed_by_user=False)
    stored_values = stored_values.filter(field__in=dependencies)
    merged = merge_document_field_values_to_python_value(list(stored_values))
    values_by_code = {dep.code: merged.get(dep.code) for dep in dependencies}

    calculated = cls.calc_formula(field.code, field.type, formula, values_by_code)
    return [DetectedFieldValue(field, calculated)]
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Predict the field value with the classifier model trained for the field.

    This method assumes that field detection already goes in the required
    order and dependencies of this field are already calculated / detected.

    The ``ClassifierModel`` stored for the document's type and this field is
    loaded, its trained object (a dict with ``'model'`` and ``'categories'``)
    is applied to ``doc.field_values`` and the predicted index is translated
    into a category.  An index outside of ``categories`` yields ``None``.

    Returns:
        A one-element list with the detected value.

    Raises:
        ClassifierModel.DoesNotExist: no classifier model is stored for this
            document type and field.
    """
    document_type = doc.document_type  # type: DocumentType

    # ClassifierModel.DoesNotExist is deliberately left to propagate: the
    # former ``except ... as e: raise e`` wrapper did nothing but re-raise it.
    classifier_model = ClassifierModel.objects.get(document_type=document_type,
                                                   document_field=field)
    obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
    model = obj['model']
    categories = obj['categories']

    predicted = model.predict([doc.field_values])
    target_index = predicted[0]
    # Range check also rejects negative indices, which would otherwise
    # silently address categories from the end.
    target_name = categories[target_index] if 0 <= target_index < len(categories) else None

    return [DetectedFieldValue(field, target_name)]
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Predict the field value with the classifier model trained for the field.

    ``cls.maybe_detect_with_stop_words`` is tried first; any non-``None``
    result it gives is returned as is.  Otherwise the ``ClassifierModel``
    stored for the field is loaded, its trained object (a dict with
    ``'model'`` and ``'categories'``) is applied to ``cached_fields`` and the
    predicted index is translated into a category.  An index outside of
    ``categories`` yields ``None``.

    Returns:
        The stop-words result, or a one-element list with the predicted value.

    Raises:
        ClassifierModel.DoesNotExist: no classifier model is stored for this
            field.
    """
    detected_values = cls.maybe_detect_with_stop_words(field, cached_fields)
    if detected_values is not None:
        return detected_values

    # ClassifierModel.DoesNotExist is deliberately left to propagate: the
    # former ``except ... as e: raise e`` wrapper did nothing but re-raise it.
    classifier_model = ClassifierModel.objects.get(document_field=field)
    obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
    model = obj['model']
    categories = obj['categories']

    predicted = model.predict([cached_fields])
    target_index = predicted[0]
    # Range check also rejects negative indices, which would otherwise
    # silently address categories from the end.
    target_name = categories[target_index] if 0 <= target_index < len(categories) else None

    return [DetectedFieldValue(field, target_name)]
def _maybe_add_val(f, v, owner):
    """Record ``v`` as a detected value of field ``f`` unless it is ``None``.

    The value is wrapped into a ``DetectedFieldValue`` with ``user=owner`` and
    added to the list kept for ``f`` in ``fields_to_values`` (a mapping that is
    not defined here and is taken from the surrounding scope).  A missing or
    empty entry is replaced with a fresh one-element list.
    """
    if v is None:
        return

    detected = DetectedFieldValue(f, v, user=owner)
    existing = fields_to_values.get(f)
    if existing:
        existing.append(detected)
    else:
        fields_to_values[f] = [detected]
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect field values by running the field detectors on dependent field values.

    The not-removed-by-user values of the fields in ``field.depends_on_fields``
    are merged into a ``{field code: value}`` dict.  Every truthy value is
    converted to ``str`` and offered to each ``DocumentFieldDetector`` stored
    for the document's type and this field.  For value-aware field types the
    final value is resolved through ``get_or_extract_value`` and matches that
    resolve to ``None`` are skipped.

    When the field type is not multi-value and the field is not a choice
    field, detection stops after the first collected value.
    """
    dependencies = list(field.depends_on_fields.all())
    stored_values = doc.documentfieldvalue_set.filter(removed_by_user=False)
    stored_values = stored_values.filter(field__in=dependencies)
    merged = merge_document_field_values_to_python_value(list(stored_values))
    values_by_code = {dep.code: merged.get(dep.code) for dep in dependencies}

    document_type = doc.document_type
    detectors = DocumentFieldDetector.objects.filter(document_type=document_type, field=field)
    type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    found = []  # type: List[DetectedFieldValue]

    for raw_value in values_by_code.values():
        if not raw_value:
            continue
        source_text = str(raw_value)

        for detector in detectors:
            if not detector.matches(source_text):
                continue

            value = detector.detected_value
            hint = None
            if type_adapter.value_aware:
                hint = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint = type_adapter.get_or_extract_value(
                    doc, field, value, hint, source_text)
                if value is None:
                    continue

            found.append(DetectedFieldValue(field, value, None, hint))
            if not type_adapter.multi_value and not field.is_choice_field():
                break

        # A single-value field is done as soon as anything has been found.
        if found and not type_adapter.multi_value and not field.is_choice_field():
            break

    return found
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect field values by running the field detectors on the document's text units.

    ``detect_with_stop_words_by_field_and_full_text`` is tried on
    ``doc.full_text`` first and its result is returned when it reports a
    detection.  Otherwise the text units of type ``field.text_unit_type`` are
    walked in ``location_start`` / ``pk`` order and each one is offered to
    every ``DocumentFieldDetector`` stored for the document's type and this
    field.  For value-aware field types the final value is resolved through
    ``get_or_extract_value`` and matches that resolve to ``None`` are skipped.

    When the field type is not multi-value and the field is not a choice
    field, detection stops after the first collected value.
    """
    stop_word_hit, stop_word_values = \
        detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
    if stop_word_hit:
        return stop_word_values or list()

    units = TextUnit.objects.filter(document=doc)
    units = units.filter(unit_type=field.text_unit_type)
    units = units.order_by('location_start', 'pk')

    document_type = doc.document_type
    detectors = DocumentFieldDetector.objects.filter(document_type=document_type, field=field)
    type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    found = []  # type: List[DetectedFieldValue]

    for unit in units.iterator():
        for detector in detectors:
            if not detector.matches(unit.text):
                continue

            value = detector.detected_value
            hint = None
            if type_adapter.value_aware:
                hint = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint = type_adapter.get_or_extract_value(
                    doc, field, value, hint, unit.text)
                if value is None:
                    continue

            found.append(DetectedFieldValue(field, value, unit, hint))
            if not type_adapter.multi_value and not field.is_choice_field():
                break

        # A single-value field is done as soon as anything has been found.
        if found and not type_adapter.multi_value and not field.is_choice_field():
            break

    return found
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect field values by running the field detectors on the document's text units.

    ``detect_with_stop_words_by_field_and_full_text`` is tried on
    ``doc.full_text`` first and its result is returned when it reports a
    detection.  Otherwise text units and field detectors are obtained from the
    ``text_unit_repo`` / ``field_detector_repo`` of
    ``RegexpsOnlyFieldDetectionStrategy``, each detector is wrapped into a
    ``DetectorFieldMatcher`` and every text unit is offered to every matcher.
    For field types which require a value, the final value is resolved from the
    matching string through ``get_or_extract_value`` and matches that resolve
    to ``None`` are skipped.

    When the field type is not multi-value and the field is not a choice
    field, detection stops after the first collected value.
    """
    stop_word_hit, stop_word_values = \
        detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
    if stop_word_hit:
        return stop_word_values or list()

    strategy = RegexpsOnlyFieldDetectionStrategy
    units = strategy.text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    stored_detectors = strategy.field_detector_repo.get_field_detectors(field)
    matchers = [DetectorFieldMatcher(stored) for stored in stored_detectors]

    type_adapter = field.get_field_type()  # type: FieldType
    found = []  # type: List[DetectedFieldValue]

    for unit in units:  # type: TextUnit
        for matcher in matchers:
            matched_text = matcher.matching_string(
                unit.text, text_is_sentence=unit.is_sentence())
            if matched_text is None:
                continue

            value = matcher.get_validated_detected_value(field)
            hint = None
            if type_adapter.requires_value:
                hint = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint = type_adapter.get_or_extract_value(
                    doc, field, value, hint, matched_text)
                if value is None:
                    continue

            found.append(DetectedFieldValue(field, value, unit, hint))
            if not type_adapter.multi_value and not field.is_choice_field():
                break

        # A single-value field is done as soon as anything has been found.
        if found and not type_adapter.multi_value and not field.is_choice_field():
            break

    return found
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect field values by running the field detectors on the document's text units.

    ``detect_with_stop_words_by_field_and_full_text`` is tried on
    ``doc.full_text`` first and its result is returned when it reports a
    detection.  Otherwise the text units of type ``field.text_unit_type`` are
    walked in ``location_start`` / ``pk`` order and each one is offered to
    every ``DocumentFieldDetector`` stored for this field.  For field types
    which require a value, the final value is resolved from the matching
    string through ``get_or_extract_value`` and matches that resolve to
    ``None`` are skipped.

    When the field type is not multi-value and the field is not a choice
    field, detection stops after the first collected value.
    """
    stop_word_hit, stop_word_values = \
        detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
    if stop_word_hit:
        return stop_word_values or list()

    units = TextUnit.objects.filter(document=doc)
    units = units.filter(unit_type=field.text_unit_type)
    units = units.order_by('location_start', 'pk')

    detectors = DocumentFieldDetector.objects.filter(field=field)
    type_adapter = field.get_field_type()  # type: FieldType
    found = []  # type: List[DetectedFieldValue]

    for unit in units.iterator():  # type: TextUnit
        for detector in detectors:
            matched_text = detector.matching_string(
                unit.text, text_is_sentence=unit.is_sentence())
            if matched_text is None:
                continue

            value = detector.get_validated_detected_value(field)
            hint = None
            if type_adapter.requires_value:
                hint = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint = type_adapter.get_or_extract_value(
                    doc, field, value, hint, matched_text)
                if value is None:
                    continue

            found.append(DetectedFieldValue(field, value, unit, hint))
            if not type_adapter.multi_value and not field.is_choice_field():
                break

        # A single-value field is done as soon as anything has been found.
        if found and not type_adapter.multi_value and not field.is_choice_field():
            break

    return found
def detect_with_stop_words_by_field_and_full_text(field: DocumentField,
                                                  full_text: str) -> Tuple[bool, Optional[List]]:
    """Try to detect the field value in ``full_text`` using the field's stop words.

    Returns a ``(detected, values)`` pair:

    * ``(False, None)`` - the field requires text annotations, it has no
      compiled stop words, or ``detect_value_with_stop_words`` detected nothing;
    * ``(True, None)`` - a detection was reported without a possible value;
    * ``(True, [DetectedFieldValue])`` - a detection was reported with a
      possible value, which is converted through the field type's
      ``extract_from_possible_value_text``.
    """
    not_detected = (False, None)

    if field.requires_text_annotations:
        return not_detected

    compiled = compile_stop_words(field.stop_words)
    if not compiled:
        return not_detected

    type_adapter = field.get_field_type()  # type: FieldType
    matched, raw_value = detect_value_with_stop_words(compiled, full_text)
    if not matched:
        return not_detected

    if raw_value is None:
        return True, None

    extracted = type_adapter.extract_from_possible_value_text(field, raw_value)
    return True, [DetectedFieldValue(field, extracted)]
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Predict the field value with the classifier model, falling back to the unsure choice.

    ``cls.maybe_detect_with_stop_words`` is tried first; any non-``None``
    result it gives is returned as is.  Otherwise the ``ClassifierModel``
    stored for the field is loaded and its trained object (a dict with
    ``'model'`` and ``'categories'``) is asked for class probabilities of
    ``cached_fields``.  The most probable category is used when its
    probability reaches the threshold configured for that category in
    ``field.unsure_thresholds_by_value`` (``DocumentField.DEFAULT_UNSURE_THRESHOLD``
    when none is configured); otherwise ``field.unsure_choice_value`` is used.

    Returns:
        The stop-words result, or a one-element list with the predicted value.

    Raises:
        ClassifierModel.DoesNotExist: no classifier model is stored for this
            field.
    """
    # If changing this code make sure you update similar code in
    # notebooks/demo/Train and Debug Decision Tree...
    detected_values = cls.maybe_detect_with_stop_words(field, cached_fields)
    if detected_values is not None:
        return detected_values

    # ClassifierModel.DoesNotExist is deliberately left to propagate: the
    # former ``except ... as e: raise e`` wrapper did nothing but re-raise it.
    classifier_model = ClassifierModel.objects.get(document_field=field)
    obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
    model = obj['model']
    categories = obj['categories']

    category_probabilities = model.predict_proba([cached_fields])[0]

    # Index of the most probable category (argmax).
    target_index = max(range(len(category_probabilities)),
                       key=category_probabilities.__getitem__)
    target_probability = category_probabilities[target_index]

    predicted_value = categories[target_index] if 0 <= target_index < len(categories) else None

    if predicted_value is None:
        target_name = field.unsure_choice_value
    else:
        threshold = (field.unsure_thresholds_by_value or {}).get(predicted_value) \
            or DocumentField.DEFAULT_UNSURE_THRESHOLD
        target_name = predicted_value if target_probability >= threshold \
            else field.unsure_choice_value

    return [DetectedFieldValue(field, target_name)]
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Predict the field value with the classifier model trained for the field.

    If the field has stop words, the truthy python values of the fields in
    ``field.depends_on_fields`` (read from ``doc.field_values`` by field uid)
    are stringified, joined with newlines and passed to
    ``detect_with_stop_words_by_field_and_full_text``; its result is returned
    when it reports a detection.

    Otherwise the ``ClassifierModel`` stored for the document's type and this
    field is loaded, its trained object (a dict with ``'model'`` and
    ``'categories'``) is applied to ``doc.field_values`` and the predicted
    index is translated into a category.  An index outside of ``categories``
    yields ``None``.

    Returns:
        The stop-words result, or a one-element list with the predicted value.

    Raises:
        ClassifierModel.DoesNotExist: no classifier model is stored for this
            document type and field.
    """
    if field.stop_words:
        depends_on_fields = list(field.depends_on_fields.all())
        depends_on_full_text = []

        for df in depends_on_fields:  # type: DocumentField
            field_type_adapter = FIELD_TYPES_REGISTRY[df.type]  # type: FieldType
            v = field_type_adapter.merged_db_value_to_python(doc.field_values.get(df.uid))
            if v:
                depends_on_full_text.append(str(v))

        detected_with_stop_words, detected_values = \
            detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
        if detected_with_stop_words:
            return detected_values or list()

    document_type = doc.document_type  # type: DocumentType

    # ClassifierModel.DoesNotExist is deliberately left to propagate: the
    # former ``except ... as e: raise e`` wrapper did nothing but re-raise it.
    classifier_model = ClassifierModel.objects.get(document_type=document_type,
                                                   document_field=field)
    obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
    model = obj['model']
    categories = obj['categories']

    predicted = model.predict([doc.field_values])
    target_index = predicted[0]
    # Range check also rejects negative indices, which would otherwise
    # silently address categories from the end.
    target_name = categories[target_index] if 0 <= target_index < len(categories) else None

    return [DetectedFieldValue(field, target_name)]
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values of a python-coded field in a document.

    The implementation registered under ``field.python_coded_field`` is run
    either on every text unit of type ``field.text_unit_type`` (when the
    implementation has ``detect_per_text_unit`` set) or once on
    ``doc.full_text``.  In the full-text mode a value reported with both
    location bounds is attached to the text unit of type
    ``field.text_unit_type`` containing the location start, with offsets made
    relative to that unit; a value without a complete location is stored
    without text unit and location.

    When the field type is not multi-value and the field is not a choice
    field, the result is returned right after the first value is collected.

    Raises:
        RuntimeError: the python-coded field is not registered; a value was
            reported without location for a field requiring text annotations;
            or the location start is outside of every matching text unit.
    """
    coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(field.python_coded_field)  # type: PythonCodedField
    if not coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(field.python_coded_field))

    type_adapter = field.get_field_type()  # type: FieldType
    found = []  # type: List[DetectedFieldValue]

    if coded_field.detect_per_text_unit:
        units = TextUnit.objects.filter(document=doc)
        units = units.filter(unit_type=field.text_unit_type)
        units = units.order_by('location_start', 'pk')
        for unit in units.iterator():
            for value, start, end in coded_field.get_values(log, field, doc, unit.text) or []:
                found.append(DetectedFieldValue(field, value, unit, None, start, end))
                if not type_adapter.multi_value and not field.is_choice_field():
                    return found
        return found

    for value, abs_start, abs_end in coded_field.get_values(log, field, doc, doc.full_text) or []:
        has_location = abs_start is not None and abs_end is not None

        if field.requires_text_annotations and not has_location:
            raise RuntimeError(
                'Python coded field {0} detected a value in document {1} at '
                'undefined location but the field requires text annotation (and location).\n'
                'This should not happen. Something is broken.'.format(
                    field.python_coded_field, doc))

        if has_location:
            # Find the text unit which contains the start of the detected location.
            unit = TextUnit.objects.filter(
                document=doc,
                unit_type=field.text_unit_type,
                location_start__lte=abs_start,
                location_end__gte=abs_start).first()  # type: TextUnit
            if not unit:
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'location [{2};{3}] but the start of location does not belong to any '
                    'text unit object in DB.\n'
                    'This should not happen. Something is broken.'.format(
                        field.python_coded_field, doc, abs_start, abs_end))

            # Translate document-level offsets into offsets inside the text unit.
            length = abs_end - abs_start
            rel_start = abs_start - unit.location_start
            rel_end = rel_start + length
        else:
            unit = None
            rel_start = None
            rel_end = None

        found.append(DetectedFieldValue(field, value, unit, None, rel_start, rel_end))
        if not type_adapter.multi_value and not field.is_choice_field():
            return found

    return found
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect field values by running the field detectors on dependent field values.

    The not-removed-by-user values of the fields in ``field.depends_on_fields``
    are merged into a ``{field code: value}`` dict.  If the field has stop
    words, the stringified values joined with newlines are first passed to
    ``detect_with_stop_words_by_field_and_full_text`` and its result is
    returned when it reports a detection.

    Otherwise every truthy value is converted to ``str`` and offered to each
    ``DocumentFieldDetector`` stored for this field.  For field types which
    require a value, the final value is resolved from the matching string
    through ``get_or_extract_value`` and matches that resolve to ``None`` are
    skipped.

    When the field type is not multi-value and the field is not a choice
    field, detection stops after the first collected value.
    """
    dependencies = list(field.depends_on_fields.all())
    stored_values = doc.documentfieldvalue_set.filter(removed_by_user=False)
    stored_values = stored_values.filter(field__in=dependencies)
    merged = merge_document_field_values_to_python_value(list(stored_values))
    values_by_code = {dep.code: merged.get(dep.code) for dep in dependencies}

    if field.stop_words:
        joined_text = '\n'.join(str(item) for item in values_by_code.values())
        stop_word_hit, stop_word_values = \
            detect_with_stop_words_by_field_and_full_text(field, joined_text)
        if stop_word_hit:
            return stop_word_values or list()

    detectors = DocumentFieldDetector.objects.filter(field=field)
    type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType
    found = []  # type: List[DetectedFieldValue]

    for raw_value in values_by_code.values():
        if not raw_value:
            continue
        source_text = str(raw_value)

        for detector in detectors:  # type: DocumentFieldDetector
            matched_text = detector.matching_string(source_text, text_is_sentence=False)
            if matched_text is None:
                continue

            value = detector.get_validated_detected_value(field)
            hint = None
            if type_adapter.requires_value:
                hint = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint = type_adapter.get_or_extract_value(
                    doc, field, value, hint, matched_text)
                if value is None:
                    continue

            found.append(DetectedFieldValue(field, value, None, hint))
            if not type_adapter.multi_value and not field.is_choice_field():
                break

        # A single-value field is done as soon as anything has been found.
        if found and not type_adapter.multi_value and not field.is_choice_field():
            break

    return found