def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values of ``field`` in ``doc`` using the field's trained classifier.

    The ClassifierModel stored for the document's type and the given field is
    loaded, then the document's text units of the field's text unit type are
    visited ordered by ('location_start', 'pk').  Every non-None result of
    ``cls.predict_and_extract_value`` is collected.  The scan stops after the
    first collected value unless the field type is multi-value or the field
    is a choice field.

    :param log: logger that receives the "model does not exist" message.
    :param doc: document whose text units are scanned.
    :param field: field to detect values for.
    :return: detected values in text unit order (possibly empty).
    :raises ClassifierModel.DoesNotExist: when no model is stored for the
        field; the fact is logged and the exception re-raised.
    """
    document_type = doc.document_type  # type: DocumentType
    try:
        model_record = ClassifierModel.objects.get(document_type=document_type,
                                                   document_field=field)
        sklearn_model = model_record.get_trained_model_obj()
        field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

        found = []  # type: List[DetectedFieldValue]

        units = (TextUnit.objects
                 .filter(document=doc)
                 .filter(unit_type=field.text_unit_type)
                 .order_by('location_start', 'pk'))

        for unit in units.iterator():
            hit = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                field_type_adapter=field_type_adapter,
                                                document=doc,
                                                field=field,
                                                text_unit=unit)
            if hit is not None:
                found.append(hit)
                # Single-value, non-choice fields need only the first hit.
                if not field_type_adapter.multi_value and not field.is_choice_field():
                    break
        return found
    except ClassifierModel.DoesNotExist as exc:
        log.info('Classifier model does not exist for field: {0}'.format(field.code))
        raise exc
def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool,
                        csv_contains_regexps: bool = False):
    """Create field detectors for ``document_field`` from a simple CSV config.

    The CSV is parsed with ``pandas.read_csv`` (all cells as strings).  In every
    data row the first column is the value to detect and the remaining
    non-empty columns are the search strings for that value.  One
    DocumentFieldDetector of category FD_CATEGORY_IMPORTED_SIMPLE_CONFIG is
    saved per usable row.

    :param log: logger used for messages and progress steps.
    :param document_field: field the detectors are created for.
    :param csv: raw CSV content.
    :param drop_previous_field_detectors: delete the field's previously
        imported simple-config detectors before creating new ones.
    :param update_field_choice_values: overwrite ``document_field.choices``
        with the sorted unique values of the first column and save the field.
    :param csv_contains_regexps: when False each search string is stripped
        and every space is replaced by the pattern ``\\s{1,100}``; when True
        the search strings are stored unchanged.
    :raises ValueError: when the CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        choices = df[df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info('Creating {2} naive field detectors for document field {0} and document type {1}...'
             .format(document_field, document_field.document_type, df.shape[0]))
    log.set_progress_steps_number(row_num // 10 + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    for index, row in df.iterrows():
        # Take the detected value by position: the row is labelled with the
        # CSV header names, and integer keys on such a Series are deprecated.
        detected_value = row.iloc[0]
        if pd.isna(detected_value) or not detected_value.strip():
            # Without this guard the first search string would silently be
            # consumed as the detected value's slot and NaN stored as the value.
            log.info('Row {0} has no detected value in the first column - skipped'
                     .format(index))
            continue

        # Search strings are read from the remaining columns only, so an empty
        # cell can never shift them into the detected value's position.
        search_strings = row.iloc[1:].dropna().tolist()
        if not csv_contains_regexps:
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            search_strings = [s.strip().replace(' ', r'\s{1,100}')
                              for s in search_strings]
        search_strings = [s for s in search_strings if s]

        if not search_strings:
            log.info('There are no search strings specified for detected value {0}'
                     .format(detected_value))
            continue

        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        detector.detected_value = detected_value
        detector.include_regexps = '\n'.join(search_strings)
        detector.save()
        if index % 10 == 0:
            log.step_progress()

    log.info('Done.')
def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        document_type: DocumentType,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create one field detector per CSV row for a field / document type pair.

    The CSV is parsed with ``pandas.read_csv`` (all cells as strings).  For
    every data row a DocumentFieldDetector is saved whose detected value is
    the first cell and whose include regexps are all non-empty cells of the
    row (the first one included), lower-cased and joined with newlines.

    :param log: logger used for messages and progress steps.
    :param document_field: field the detectors are created for.
    :param document_type: document type the detectors are bound to.
    :param csv: raw CSV content.
    :param drop_previous_field_detectors: delete all existing detectors of
        this field and document type first.
    :param update_field_choice_values: overwrite ``document_field.choices``
        with the sorted unique values of the first column and save the field.
    :raises ValueError: when the CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        choices = df[df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info('Creating {2} naive field detectors for document field {0} and document type {1}...'
             .format(document_field, document_type, df.shape[0]))
    log.set_progress_steps_number(row_num // 10 + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(field=document_field,
                                             document_type=document_type).delete()

    for index, row in df.iterrows():
        detector = DocumentFieldDetector()
        detector.document_type = document_type
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # Positional access: the row is labelled with the CSV header names and
        # integer keys on such a Series are a deprecated positional fallback.
        # NOTE(review): an empty first cell yields NaN here - confirm whether
        # such rows should be skipped instead of saved.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if index % 10 == 0:
            log.step_progress()

    log.info('Done.')
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values of ``field`` in ``doc``: stop words first, classifier second.

    The document's full text is first passed to
    ``detect_with_stop_words_by_field_and_full_text``; when that reports a
    detection, its values (or an empty list) are returned right away.
    Otherwise the ClassifierModel stored for the field is loaded and the
    document's text units of the field's text unit type are visited ordered
    by ('location_start', 'pk'), collecting every non-None result of
    ``cls.predict_and_extract_value``.  The scan stops after the first
    collected value unless the field type is multi-value or the field is a
    choice field.

    :param log: logger that receives the "model does not exist" message.
    :param doc: document to scan.
    :param field: field to detect values for.
    :param cached_fields: accepted for interface compatibility; not read here.
    :return: detected values (possibly empty).
    :raises ClassifierModel.DoesNotExist: when no model is stored for the
        field; the fact is logged and the exception re-raised.
    """
    full_text = doc.full_text
    stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
        field, full_text)
    if stop_words_hit:
        return stop_words_values or []

    try:
        model_record = ClassifierModel.objects.get(document_field=field)
        sklearn_model = model_record.get_trained_model_obj()
        field_type_adapter = field.get_field_type()

        collected = []  # type: List[DetectedFieldValue]

        text_units = (TextUnit.objects
                      .filter(document=doc)
                      .filter(unit_type=field.text_unit_type)
                      .order_by('location_start', 'pk'))

        for unit in text_units.iterator():
            value = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                  field_type_adapter=field_type_adapter,
                                                  document=doc,
                                                  field=field,
                                                  text_unit=unit)
            if value is not None:
                collected.append(value)
                # One value is enough for single-value, non-choice fields.
                if not field_type_adapter.multi_value and not field.is_choice_field():
                    break
        return collected
    except ClassifierModel.DoesNotExist as exc:
        log.info('Classifier model does not exist for field: {0}'.format(field.code))
        raise exc
def train_document_field_detector_model(cls,
                                        log: ProcessLogger,
                                        document_type: DocumentType,
                                        field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False
                                        ) -> Optional[ClassifierModel]:
    """Train a classifier model for ``field`` of ``document_type``.

    When project ids are given and ``use_only_confirmed_field_values`` is
    False, the train data is a single set made of all not-removed
    DocumentFieldValue rows of this field in documents of the given projects
    and document type.  Otherwise the sets come from ``get_train_data_sets``.

    :param log: logger used for progress messages.
    :param document_type: document type the model is trained for.
    :param field: field the model is trained for.
    :param train_data_project_ids: projects to take train data from.
    :param use_only_confirmed_field_values: force the ``get_train_data_sets``
        path even when project ids are given.
    :return: the result of ``train_model``, or None when there is no train data.
    """
    log.info('Training model for field #{0} ({1})...'.format(field.pk, field.code))

    if train_data_project_ids and not use_only_confirmed_field_values:
        train_data = list(
            DocumentFieldValue.objects
            .filter(field_id=field.pk,
                    document__project_id__in=train_data_project_ids,
                    document__document_type_id=document_type.pk,
                    removed_by_user=False)
            .values('created_by', 'text_unit__text', 'value', 'extraction_hint'))
        # Do not wrap an empty result: [[]] is truthy and would slip past the
        # "not enough data" guard below, training on an empty data set.
        train_data_sets = [train_data] if train_data else []
    else:
        train_data_sets = get_train_data_sets(document_type, field, train_data_project_ids)

    if not train_data_sets:
        log.info('Not enough data to train model for document_type #{0} and field #{1}.'
                 .format(document_type.pk, field.pk))
        return None

    classifier_model = train_model(document_type, field, train_data_sets)
    log.info('Finished training model for document_type #{0} and field #{1}.'
             .format(document_type.pk, field.pk))
    return classifier_model
def train_document_field_detector_model(
        cls,
        log: ProcessLogger,
        field: DocumentField,
        train_data_project_ids: Optional[List],
        use_only_confirmed_field_values: bool = False
) -> Optional[ClassifierModel]:
    """Train a field-based classifier: values of other fields -> value of ``field``.

    Only choice fields are supported.  Train data is a list of per-document
    field value dicts: either the ``field_values`` of documents of the given
    projects (capped at settings.ML_TRAIN_DATA_SET_GROUP_LEN) or, when no
    projects are given or ``use_only_confirmed_field_values`` is set, the
    result of ``cls.get_user_data``.  The data is shuffled, the first 20% is
    held out as the out-of-sample test set, the model is fitted on the rest
    and accuracy reports for both an out-of-sample and an in-sample slice are
    stored on the returned (unsaved) ClassifierModel.

    :param log: logger used for messages and progress steps.
    :param field: choice field to train the model for.
    :param train_data_project_ids: projects to take train documents from.
    :param use_only_confirmed_field_values: use ``cls.get_user_data`` even
        when project ids are given.
    :return: a ClassifierModel carrying the trained model and reports.
    :raises ValueError: when the field is not a choice field.
    :raises RuntimeError: when no train data is available.
    """
    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

    log.set_progress_steps_number(7)
    log.info('Training model for field #{0} ({1})...'.format(field.pk, field.code))

    # Classifier: values of dependencies -> value of this field
    # Field types supported: only choice fields
    if not isinstance(field_type_adapter, ChoiceField):
        raise ValueError(
            'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
            .format(field.code, field.uid, field_type_adapter.code))

    # Lets find good values of depends-on fields suitable for using as train data.
    if train_data_project_ids and not use_only_confirmed_field_values:
        train_data = list(
            Document.objects
            .filter(project_id__in=train_data_project_ids)
            .values_list('field_values', flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
    else:
        train_data = list(cls.get_user_data(field, train_data_project_ids))

    if not train_data:
        raise RuntimeError(
            'Not enough train data for field {0} (#{1}). '
            'Need at least {2} approved or changed documents of type {3}.'
            .format(field.code, field.uid,
                    settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                    field.document_type.code))

    depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
    depends_on_fields_types = cls.remove_empty_fields(depends_on_fields_types, train_data)

    pipeline, feature_names_funcs = cls.build_pipeline(
        field, depends_on_fields_types)  # type: Pipeline, List[Callable]

    categories = sorted([c.strip() for c in field.choices.split('\n')])
    category_names_to_indexes = {c: i for i, c in enumerate(categories)}
    # Index used as the target for missing values and values outside the choices.
    unknown_category_index = len(categories)
    # Explicit label list for the reports: keeps every report row aligned with
    # its category name even when some category (or the "unknown" index) is
    # absent from / present in the evaluated slice.
    report_labels = list(range(len(categories)))

    log.step_progress()
    log.info('Collecting feature rows from train and test documents in dict form...')

    # When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
    random.shuffle(train_data)

    # TODO: use sklearn methods for splitting train/test data and shuffling
    test_size = 0.2

    train_feature_data = list()
    train_target_data = list()

    for doc_field_values in train_data:
        # pop() both reads and removes the target value; unlike get() + del it
        # does not fail for documents that have no value for this field.
        field_value = doc_field_values.pop(field.uid, None)

        field_value_idx = category_names_to_indexes.get(field_value) if field_value else None
        if field_value_idx is None:
            field_value_idx = unknown_category_index

        train_feature_data.append(doc_field_values)
        train_target_data.append(field_value_idx)

    is_index = math.floor(test_size * len(train_data))

    # Out-of-sample test set: the leading slice, excluded from training.
    test_oos_feature_data = train_feature_data[:is_index]
    test_oos_target_data = train_target_data[:is_index]

    train_feature_data = train_feature_data[is_index:]
    train_target_data = train_target_data[is_index:]

    # In-sample test set: a slice of the data the model is trained on.
    test_is_feature_data = train_feature_data[:is_index]
    test_is_target_data = train_target_data[:is_index]

    log.step_progress()
    log.info('Training the model...')
    model = pipeline.fit(train_feature_data, train_target_data)

    log.step_progress()
    log.info('Testing the model...')

    cm = ClassifierModel()
    cm.document_field = field

    predicted_oos = pipeline.predict(test_oos_feature_data)
    cm.classifier_accuracy_report_out_of_sample = classification_report(
        test_oos_target_data, predicted_oos,
        labels=report_labels, target_names=categories)

    predicted_is = pipeline.predict(test_is_feature_data)
    cm.classifier_accuracy_report_in_sample = classification_report(
        test_is_target_data, predicted_is,
        labels=report_labels, target_names=categories)

    log.step_progress()
    log.info('Saving ClassifierModel instance...')

    feature_names = []
    for f in feature_names_funcs:
        feature_names.extend(f())

    cm.set_trained_model_obj({
        'model': model,
        'categories': categories,
        'feature_names': feature_names
    })

    log.step_progress()
    log.info('Finished.')

    return cm