def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Calculate the field value by evaluating the field's formula over the values
    of the fields it depends on.

    Assumes field detection runs in dependency order, so all dependency fields of
    this field have already been detected/calculated.

    :raises ValueError: when the field has no formula configured.
    """
    formula = field.formula
    if not formula:
        raise ValueError('No formula specified for field {0} (#{1})'.format(field.code, field.uid))

    dependencies = list(field.depends_on_fields.all())

    # Load the stored values of the dependency fields, skipping ones the user removed.
    dep_values_qs = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=dependencies)
    merged = merge_document_field_values_to_python_value(list(dep_values_qs))

    # Ensure every dependency code is present as a key (None when nothing was detected).
    code_to_value = {dep.code: merged.get(dep.code) for dep in dependencies}

    if field.stop_words:
        # Stop words are matched against the concatenated dependency values.
        combined_text = '\n'.join([str(v) for v in code_to_value.values()])
        stop_words_hit, stop_word_values \
            = detect_with_stop_words_by_field_and_full_text(field, combined_text)
        if stop_words_hit:
            return stop_word_values or list()

    formula_result = cls.calc_formula(field.code, field.type, formula, code_to_value)
    return [DetectedFieldValue(field, formula_result)]
def detect_field_values(cls, log: ProcessLogger, doc: Document, field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values for ``field`` by running its regexp field detectors over the
    stringified values of its dependency fields (not over the document text).

    Flow: load dependency field values -> optional stop-words short-circuit ->
    match every field detector against each dependency value.
    """
    depends_on_fields = list(field.depends_on_fields.all())
    # Only values belonging to the dependency fields and not removed by the user.
    qs_document_field_values = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=depends_on_fields)
    field_code_to_value = merge_document_field_values_to_python_value(list(qs_document_field_values))
    # Re-key so every dependency field has an entry (None when nothing was detected).
    field_code_to_value = {f.code: field_code_to_value.get(f.code) for f in depends_on_fields}

    if field.stop_words:
        # Stop words are matched against the concatenated dependency values.
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or list()

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]

    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)
        for field_detector in field_detectors:
            matching_string = field_detector.matching_string(depends_on_value)
            if matching_string is not None:
                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    # Value-aware field types extract/refine the value from the match.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name, matching_string)
                    if value is None:
                        continue
                # Text unit is None: the value comes from other fields, not document text.
                detected_values.append(DetectedFieldValue(field, value, None, hint_name))
                # Single-value, non-choice fields stop at the first detected value.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value or field.is_choice_field()):
            break
    return detected_values
def detect_field_values(cls, log: ProcessLogger, doc: Document, field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values for ``field`` by running its regexp field detectors over the
    document's text units of the field's configured unit type.

    Stop-word detection on the full document text takes precedence and
    short-circuits the detector loop when it produces a decision.
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    # Text units in document order (location, then pk as a stable tie-break).
    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    document_type = doc.document_type
    # Detectors are scoped to both the document type and the field here.
    field_detectors = DocumentFieldDetector.objects.filter(
        document_type=document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(
        field.type)  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]

    for text_unit in qs_text_units.iterator():
        for field_detector in field_detectors:
            if field_detector.matches(text_unit.text):
                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    # Value-aware field types extract/refine the value from the matched text.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name, text_unit.text)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))
                # Single-value, non-choice fields stop at the first detected value.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value or field.is_choice_field()):
            break
    return detected_values
def detect_field_values(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values for ``field`` with regexp field detectors over the document's
    text units, using repository objects for data access and
    ``DetectorFieldMatcher`` wrappers for matching.

    Stop-word detection on the full document text takes precedence and
    short-circuits the detector loop when it produces a decision.
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    # Data access goes through strategy-level repositories rather than direct ORM queries.
    qs_text_units = RegexpsOnlyFieldDetectionStrategy.\
        text_unit_repo.get_doc_text_units(doc, field.text_unit_type)

    field_detectors = RegexpsOnlyFieldDetectionStrategy.\
        field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    field_type_adapter = field.get_field_type()  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]

    for text_unit in qs_text_units:  # type: TextUnit
        for field_detector in detectors:
            matching_string = field_detector.matching_string(
                text_unit.text, text_is_sentence=text_unit.is_sentence())
            if matching_string is not None:
                # Detected value is validated against the field's constraints.
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    # Extract/refine the concrete value from the matched text.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name, matching_string)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))
                # Single-value, non-choice fields stop at the first detected value.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value or field.is_choice_field()):
            break
    return detected_values
def detect_field_values(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values for ``field`` with its regexp field detectors over the
    document's text units of the field's configured unit type.

    Stop-word detection on the full document text takes precedence and
    short-circuits the detector loop when it produces a decision.
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    # Text units in document order (location, then pk as a stable tie-break).
    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    field_type_adapter = field.get_field_type()  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]

    for text_unit in qs_text_units.iterator():  # type: TextUnit
        for field_detector in field_detectors:
            matching_string = field_detector.matching_string(
                text_unit.text, text_is_sentence=text_unit.is_sentence())
            if matching_string is not None:
                # Detected value is validated against the field's constraints.
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    # Extract/refine the concrete value from the matched text.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name, matching_string)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))
                # Single-value, non-choice fields stop at the first detected value.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value or field.is_choice_field()):
            break
    return detected_values
def detect_field_values(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values for ``field`` by running a trained sklearn classifier over
    the document's text units of the field's configured unit type.

    Stop-word detection on the full document text takes precedence and
    short-circuits classification when it produces a decision.

    :raises ClassifierModel.DoesNotExist: when no trained model exists for the field
        (logged before propagating).
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    # Narrowed try body: only the .get() can raise DoesNotExist; bare `raise`
    # preserves the original traceback (the previous `raise e` was redundant).
    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
    except ClassifierModel.DoesNotExist:
        log.info('Classifier model does not exist for field: {0}'.format(
            field.code))
        raise

    sklearn_model = classifier_model.get_trained_model_obj()
    field_type_adapter = field.get_field_type()

    detected_values = list()  # type: List[DetectedFieldValue]

    # Text units in document order (location, then pk as a stable tie-break).
    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    for text_unit in qs_text_units.iterator():
        detected_value = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                       field_type_adapter=field_type_adapter,
                                                       document=doc,
                                                       field=field,
                                                       text_unit=text_unit)
        if detected_value is None:
            continue
        detected_values.append(detected_value)
        # Single-value, non-choice fields stop at the first detected value.
        if not (field_type_adapter.multi_value or field.is_choice_field()):
            break
    return detected_values
def maybe_detect_with_stop_words(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField) -> Optional[List[DetectedFieldValue]]:
    """Try stop-word based detection over the stringified values of the field's
    dependency fields.

    :return: the detected values (possibly an empty list) when stop words made a
        decision, or ``None`` when stop-word detection does not apply / matched nothing.
    """
    if not field.stop_words:
        return None

    text_fragments = []
    for dep_field in list(field.depends_on_fields.all()):  # type: DocumentField
        adapter = FIELD_TYPES_REGISTRY[dep_field.type]  # type: FieldType
        python_value = adapter.merged_db_value_to_python(doc.field_values.get(dep_field.uid))
        if python_value:
            text_fragments.append(str(python_value))

    matched, values = \
        detect_with_stop_words_by_field_and_full_text(field, '\n'.join(text_fragments))
    if matched:
        return values or list()
    return None
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect the value of ``field`` by classifying the whole document's field
    values with a trained model for the (document type, field) pair.

    Stop-word detection over the dependency-field values takes precedence and
    short-circuits classification when it produces a decision.

    :raises ClassifierModel.DoesNotExist: when no trained model exists.
        (The previous ``except ... as e: raise e`` handler was removed - it only
        re-raised the same exception and added nothing.)
    """
    if field.stop_words:
        depends_on_fields = list(field.depends_on_fields.all())
        depends_on_full_text = []
        for df in depends_on_fields:  # type: DocumentField
            field_type_adapter = FIELD_TYPES_REGISTRY[
                df.type]  # type: FieldType
            v = field_type_adapter.merged_db_value_to_python(
                doc.field_values.get(df.uid))
            if v:
                depends_on_full_text.append(str(v))
        detected_with_stop_words, detected_values = \
            detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
        if detected_with_stop_words:
            return detected_values or list()

    document_type = doc.document_type  # type: DocumentType
    classifier_model = ClassifierModel.objects \
        .get(document_type=document_type, document_field=field)
    obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
    model = obj['model']
    categories = obj['categories']

    predicted = model.predict([doc.field_values])
    target_index = predicted[0]
    # Guard against an out-of-range class index; None means "no category".
    target_name = categories[target_index] if 0 <= target_index < len(
        categories) else None
    return [DetectedFieldValue(field, target_name)]
def maybe_detect_with_stop_words(
        cls,
        field: DocumentField,
        cached_fields: Dict[str, Any]) -> Optional[List[DetectedFieldValue]]:
    """Try stop-word based detection over the cached values of the field's
    dependency fields.

    :param cached_fields: mapping of field uid -> merged DB value for already
        detected fields.
    :return: the detected values (possibly an empty list) when stop words made a
        decision, or ``None`` when stop-word detection does not apply / matched nothing.
    """
    if not field.stop_words:
        return None
    # Bug fix: the original used "if not any(cached_fields)", which tests the
    # truthiness of the dict KEYS (and would misbehave on a falsy key such as '').
    # The intended check is simply "is the cache empty".
    if not cached_fields:
        return None

    depends_on_fields = list(field.depends_on_fields.all())
    depends_on_full_text = []
    for df in depends_on_fields:  # type: DocumentField
        # NOTE(review): FIELD_TYPE_REGISTRY (singular) differs from the
        # FIELD_TYPES_REGISTRY used by sibling methods - confirm this is the
        # intended registry and not a typo.
        field_type_adapter = FIELD_TYPE_REGISTRY[
            df.type]  # type: FieldType
        v = field_type_adapter.merged_db_value_to_python(
            cached_fields.get(df.uid))
        if v:
            depends_on_full_text.append(str(v))

    detected_with_stop_words, detected_values = \
        detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
    if detected_with_stop_words:
        return detected_values or list()
    return None