def detect_and_cache_field_values_for_document(log: ProcessLogger, document: Document, save: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :return:
    """
    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Forbidden storing detected field values for document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    field_values_pre_cached = False

    res = list()
    for field_code in sorted_codes:
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        if not field_values_pre_cached \
                and field_detection_strategy.uses_cached_document_field_values(field):
            # Pre-cache Document.field_values structure for the usage in field detection strategies
            document.field_values = field_value_cache.cache_field_values(document, None, save=False)
            field_values_pre_cached = True

        detected_values = field_detection_strategy.detect_field_values(log,
                                                                       document,
                                                                       field)  # type: List[DetectedFieldValue]
        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document, res, save=True, log=log)

    return res

def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                             depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()
        typed_field = TypedField.by(field)  # type: TypedField

        ants = list()  # type: List[AnnotationDTO]

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        units_counted = 0
        for text_unit in qs_text_units.iterator():
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text_unit=text_unit)
            if ant is None:
                continue
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                if ant.location_in_doc_start > field.detect_limit_count:
                    break

            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e

def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                for entity_class in self.processed_usage_entity_classes:
                    entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if entities:
                    entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                    count += len(entities)

            tag_models = list()
            from apps.document.app_vars import LOCATE_TEXTUNITTAGS
            if LOCATE_TEXTUNITTAGS.val:
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(TextUnitTag(user_id=user_id,
                                                      text_unit_id=text_unit_id,
                                                      tag=tag))
                TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, len(tag_models), len(self.processed_text_unit_ids)))
    except Exception as e:
        entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
        log.error(f'Unable to store location results.\n'
                  f'Text unit ids: {self.processed_text_unit_ids}\n'
                  f'Usage models caused the problem:\n{entities_str}', exc_info=e)
    self.save_summary(log, user_id)

def _recreate_document_fields_table(log: ProcessLogger, table_name: str,
                                    column_defs: Dict[str, str],
                                    index_defs: Dict[str, str]):
    log.info('Recreating raw sql table: {0}'.format(table_name))

    column_def_clauses = [SQLClause('"{column}" {pg_type}'.format(column=column, pg_type=pg_type))
                          for column, pg_type in column_defs.items()]

    create_table = format_clause('CREATE TABLE "{table_name}" (\n'
                                 '{columns}, \n'
                                 'FOREIGN KEY ({field_document_id}) '
                                 'REFERENCES document_document (id) ON DELETE CASCADE)',
                                 table_name=table_name,
                                 columns=join_clauses(', \n', column_def_clauses),
                                 field_document_id=FIELD_CODE_DOC_ID)  # type: SQLClause

    log.info('Create table SQL for table {0}:\n{1}\nParams: {2}'.format(
        table_name, create_table.sql, create_table.params))

    with connection.cursor() as cursor:
        cursor.execute('drop table if exists "{table_name}"'.format(table_name=table_name))
        cursor.execute(create_table.sql, create_table.params)
        for index_name, index_def in index_defs.items():  # type: str, str
            create_index = _build_create_index_statement(table_name, index_name, index_def)
            cursor.execute(create_index, [])

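# Hedged usage sketch for _recreate_document_fields_table. The table name, column names
# and the index definition format below are illustrative assumptions, not values taken
# from the source; log is assumed to be a ProcessLogger available in the calling context.
# FIELD_CODE_DOC_ID must be among the columns so the generated FOREIGN KEY clause is valid.
example_columns = {
    FIELD_CODE_DOC_ID: 'bigint',   # document id column referenced by the FK
    'contract_date': 'date',       # hypothetical field column
    'party_name': 'text',          # hypothetical field column
}
example_indexes = {
    'idx_example_contract_date': '("contract_date")',  # assumed index definition format
}
_recreate_document_fields_table(log, 'doc_fields_example', example_columns, example_indexes)
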
def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                if not self.document_initial_load:
                    TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

            tag_models = list()
            from apps.document.app_vars import LOCATE_TEXTUNITTAGS
            tags_saved = 0
            if LOCATE_TEXTUNITTAGS.val:
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(TextUnitTag(user_id=user_id,
                                                      text_unit_id=text_unit_id,
                                                      tag=tag))
                tags_saved = SafeBulkCreate.bulk_create(TextUnitTag.objects.bulk_create, tag_models)

            # save "_usage" objects
            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if not entities:
                    continue
                count += SafeBulkCreate.bulk_create(entity_class.objects, entities)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, tags_saved, len(self.processed_text_unit_ids)))
    except Exception as e:
        entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
        log.error(f'Unable to store location results.\n'
                  f'Text unit ids: {self.processed_text_unit_ids}\n'
                  f'Usage models caused the problem:\n{entities_str}', exc_info=e)

def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
    """
    Refresh the specified materialized view and delete all refresh requests older than or equal
    to the last request date taken at the start of this method.

    Additionally, this method acquires a PG advisory lock to prevent parallel refreshing of the
    same view. The lock is used by the planning routine which tries to acquire it to avoid
    re-planning the same refresh if it is already running.
    :param log:
    :param view_name:
    :return:
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute(f'update {TABLE_M_VIEW} '
                           'set status=%s where view_name=%s;',
                           [MaterializedView.VIEW_STATUS_UPDATING, view_name])
    except Exception as e:
        log.error(f'Error saving updated status for view "{view_name}": {e}')

    with transaction.atomic():
        with connection.cursor() as cursor:
            if not self.advisory_lock_by_relation_name(cursor, view_name):
                log.info(f'Canceled refreshing materialized view: {view_name}. '
                         f'Unable to acquire the advisory lock.')
                cursor.execute(f'update {TABLE_M_VIEW} '
                               'set status=%s where view_name=%s;',
                               [MaterializedView.VIEW_STATUS_UPDATED, view_name])
                return
            log.info(f'Refreshing materialized view: {view_name}.')
            cursor.execute('select max(request_date) '
                           f'from {TABLE_M_VIEW_REQUEST} '
                           'where view_name = %s;',
                           [view_name])
            row = cursor.fetchone()
            request_date = row[0] if row else None

            concurency_clause = ''
            from apps.materialized_views.app_vars import CONCURRENCY_UPDATE
            if CONCURRENCY_UPDATE.val:
                concurency_clause = ' CONCURRENTLY'
            cursor.execute(f'refresh materialized view{concurency_clause} {view_name};')

            if request_date is not None:
                cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s and request_date <= %s',
                               [view_name, request_date])
            else:
                cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s',
                               [view_name])
            dt_now = timezone.now()
            cursor.execute(f'insert into {TABLE_M_VIEW} '
                           '(view_name, refresh_date, status) '
                           'values (%s, %s, %s) '
                           'on conflict (view_name) do update set refresh_date = %s, '
                           'status = %s;',
                           [view_name, dt_now, MaterializedView.VIEW_STATUS_UPDATED,
                            dt_now, MaterializedView.VIEW_STATUS_UPDATED])

def parse_file_local_xhtml(self,
                           local_path: str,
                           original_file_name: str,
                           timeout: int = 60,
                           encoding_name: str = 'utf-8',
                           logger: ProcessLogger = None,
                           enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
    Tika will return XHTML and TikaXhtmlParser will then parse the XHTML into plain text
    plus extra formatting information plus metadata.
    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param timeout: timeout to interrupt Java process in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

    def err(line):
        logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    for cmd_list in [self.tika_default_command_list, self.tika_lexnlp_default_command_list]:
        cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
        # compare the base command list: cmd itself already contains the extra arguments
        last_try = cmd_list == self.tika_lexnlp_default_command_list
        text = read_output(cmd, stderr_callback=err,
                           encoding=encoding_name, timeout_sec=timeout) or ''
        try:
            output = self.xhtml_parser.parse_text(text)
            output_len = len(output.text) if output and output.text else 0
            logger.info(f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}')
            if not output_len and not last_try:
                continue

            output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = {
                'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
            }
            return output
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text, str) else str(text)
            raise Exception('Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                            text_sample) from ex

def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    log.debug('detect_field_value: regexps_and_text_based_ml_field_value, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    ants: List[AnnotationDTO] = []
    text_unit_repo = cls.text_unit_repo
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)

    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()

        for text_unit in qs_text_units.iterator():  # type: TextUnit
            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text=text_unit.text,
                                                location_start=text_unit.location_start,
                                                location_end=text_unit.location_end)
            if ant is None:
                continue

            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

        if not ants:
            return None

        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e

def there_are_non_indexed_docs_not_planned_to_index(document_type: DocumentType,
                                                    log: ProcessLogger) -> bool:
    for doc_id in non_indexed_doc_ids_not_planned_to_index(document_type, 1):
        if doc_id:
            task_name = _get_reindex_task_name()
            fields_table = doc_fields_table_name(document_type.code)
            log.info(f'there_are_non_indexed_docs_not_planned_to_index: '
                     f'found document id={doc_id} of type {document_type.code}, '
                     f'task {task_name}. Fields table: {fields_table}')
            return True
    return False

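# Hedged usage sketch: how a planning routine might use the check above before scheduling
# a reindex. plan_reindex_task below is a hypothetical placeholder, not a function from
# this listing; document_type and log are assumed to be available in the calling context.
def plan_reindex_if_needed(document_type: DocumentType, log: ProcessLogger):
    if there_are_non_indexed_docs_not_planned_to_index(document_type, log):
        plan_reindex_task(document_type.code)  # hypothetical scheduling call
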
def parse_file_local_plain_text(self,
                                local_path: str,
                                original_file_name: str,
                                task: Any,
                                timeout: int = 60,
                                encoding_name: str = 'utf-8',
                                logger: ProcessLogger = None,
                                enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
    Tika will use the plain text "stripper" and transform the source document into plain text
    inside its (Java) process.
    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param task: task object, passed to read_output
    :param timeout: timeout to interrupt Java process in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
    # don't use TIKA_MODE_PDF_ONLY at all
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
    os.environ[self.TIKA_PARSER_DETAIL] = ''

    tika_default_command_list = self.tika_lexnlp_default_command_list
    if enable_ocr is False and self.tika_noocr_default_command_list is not None:
        tika_default_command_list = self.tika_noocr_default_command_list
    cmd = tika_default_command_list + ['-J', '-t', f'-e{encoding_name}', local_path]

    def err(line):
        logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

    text = read_output(cmd, stderr_callback=err,
                       encoding=encoding_name, timeout_sec=timeout, task=task) or ''

    try:
        ptr_val = _parse((200, text))
        return MarkedUpText(text=ptr_val['content'], meta=ptr_val['metadata'])
    except Exception as ex:
        text_sample = text[:255] if text and isinstance(text, str) else str(text)
        raise Exception('Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                        text_sample) from ex

def plan_refreshes(self, log: ProcessLogger, refresh_task_name: str,
                   plan_task_func: Callable[[str, datetime], None]):
    """
    Checks if there are materialized view refresh requests older than N seconds
    and plans the refreshing.

    The requests are inserted into the corresponding table by the document loading routines
    or any other code which changes the data on which these views are based.
    Maybe they will be replaced by a DB trigger in the future.
    :param log:
    :param refresh_task_name:
    :param plan_task_func:
    :return:
    """
    from apps.materialized_views.app_vars import REFRESH_DELAY
    refresh_delay_sec = REFRESH_DELAY.val
    to_refresh = list()
    with connection.cursor() as cursor:
        cursor.execute(f'''select view_name, max(request_date)
                           from {TABLE_M_VIEW_REQUEST}
                           where to_jsonb(view_name) not in
                                 (select args->0 from task_task where name = %s and own_status = %s)
                           group by view_name''',
                       (refresh_task_name, PENDING))
        for view_name, max_request_date in cursor.fetchall():  # type: str, datetime
            if timezone.now() - max_request_date > timedelta(seconds=refresh_delay_sec):
                to_refresh.append(view_name)

    # Here we use PG advisory locks to prevent planning the materialized view refresh if the refresh
    # is already being executed.
    # The same lock is acquired in refresh_materialized_view() by any Celery worker (maybe on a different machine)
    # which is running the refresh of the same view.
    # And the following code running in Celery-beat on the master machine checks if the refresh is in progress
    # by trying to acquire the lock.
    for view_name in to_refresh:
        with transaction.atomic():
            # We need to execute it in a separate transaction to release the PG advisory lock
            # before executing plan_task_func.
            # Cursor is closed on the transaction end. So we initialize it here and don't re-use.
            with connection.cursor() as cursor:
                locked = self.advisory_lock_by_relation_name(cursor, view_name)
        if locked:
            log.info(f'Planning refresh for materialized view {view_name}.')
            plan_task_func(view_name)

def document_fields_change_listener_impl(_sender,
                                         signal,
                                         log: ProcessLogger,
                                         document_event: str,
                                         document_pk: int,
                                         field_handlers: Dict[str, RawdbFieldHandler],
                                         fields_before: Optional[Dict],
                                         fields_after: Optional[Dict],
                                         changed_by_user: User = None):
    from apps.notifications.tasks import process_notifications_on_document_change

    if not changed_by_user:
        # we ignore changes made by system at the moment
        return

    if not fields_before and not fields_after:
        log.error('Document fields changed event appeared with both "before" and "after" fields empty.')
        return

    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    if APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val:
        return

    process_notifications_on_document_change(lambda m: log.info(m),
                                             document_event,
                                             document_pk,
                                             fields_before,
                                             fields_after,
                                             changed_by_user.pk)

def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                for entity_class in self.processed_usage_entity_classes:
                    entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if entities:
                    entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                    count += len(entities)

            tag_models = list()
            for text_unit_id, tags in self.tags.items():
                for tag in tags:
                    tag_models.append(TextUnitTag(user_id=user_id,
                                                  text_unit_id=text_unit_id,
                                                  tag=tag))
            TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, len(tag_models), len(self.processed_text_unit_ids)))
    except Exception:
        msg = render_error(
            'Unable to store location results.\n'
            'Text unit ids: {text_unit_ids}\n'
            'Usage models caused the problem:\n{entities}'.format(
                text_unit_ids=self.processed_text_unit_ids,
                entities='\n'.join([str(e) for e in self.processed_usage_entity_classes])))
        log.error(msg)

def train_document_field_detector_model(cls,
                                        log: ProcessLogger,
                                        field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False) -> Optional[ClassifierModel]:
    log.info(f'Training model for field {field.code} (#{field.pk})...')

    if train_data_project_ids and not use_only_confirmed_field_values:
        train_data_sets = cls.get_train_datasets_from_projects(field.pk, train_data_project_ids)
    else:
        train_data_sets = cls.get_train_data_sets(field, train_data_project_ids)

    if not train_data_sets:
        log.info('Not enough data to train model for document_type #{0} and field #{1}.'
                 .format(field.document_type.pk, field.pk))
        return None

    classifier_model = cls.train_model(field, train_data_sets)
    log.info('Finished training model for document_type #{0} and field #{1}.'
             .format(field.document_type.pk, field.pk))

    return classifier_model

def train_document_field_detector_model(cls,
                                        log: ProcessLogger,
                                        field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False,
                                        split_and_log_out_of_sample_test_report: bool = False) \
        -> Optional[ClassifierModel]:
    log.info(f'Training model for field {field.code} (#{field.pk})...')

    if train_data_project_ids and not use_only_confirmed_field_values:
        train_data_sets = cls.get_train_datasets_from_projects(field.pk, train_data_project_ids)
    else:
        train_data_sets = cls.get_train_data_sets(field, train_data_project_ids)

    if not train_data_sets:
        log.info(f'Not enough data to train model for document_type {field.document_type.code}, '
                 f'field: {field.code}.')
        return None

    classifier_model = cls.train_model(log, field, train_data_sets,
                                       split_and_log_out_of_sample_test_report)
    log.info(f'Finished training model for document_type {field.document_type.code}, '
             f'field: {field.code}.')

    return classifier_model

def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
    """
    Refresh the specified materialized view and delete all refresh requests older than or equal
    to the last request date taken at the start of this method.

    Additionally, this method acquires a PG advisory lock to prevent parallel refreshing of the
    same view. The lock is used by the planning routine which tries to acquire it to avoid
    re-planning the same refresh if it is already running.
    :param log:
    :param view_name:
    :return:
    """
    with transaction.atomic():
        with connection.cursor() as cursor:
            if not self.advisory_lock_by_relation_name(cursor, view_name):
                log.info(f'Canceled refreshing materialized view: {view_name}. '
                         f'Unable to acquire the advisory lock.')
                return
            log.info(f'Refreshing materialized view: {view_name}.')
            cursor.execute('select max(request_date) '
                           'from materialized_views_materializedviewrefreshrequest '
                           'where view_name = %s',
                           [view_name])
            row = cursor.fetchone()
            request_date = row[0] if row else None
            cursor.execute(f'refresh materialized view {view_name}')
            if request_date is not None:
                cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                               'where view_name = %s and request_date <= %s',
                               [view_name, request_date])
            else:
                cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                               'where view_name = %s',
                               [view_name])
            dt_now = timezone.now()
            cursor.execute('insert into materialized_views_materializedview '
                           '(view_name, refresh_date) values (%s, %s) '
                           'on conflict (view_name) do update set refresh_date = %s',
                           [view_name, dt_now, dt_now])

def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool,
                        csv_contains_regexps: bool = False):
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        choices = df[df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info('Creating {2} naive field detectors for document field {0} and document type {1}...'
             .format(document_field, document_field.document_type, df.shape[0]))
    log.set_progress_steps_number(int(row_num / 10) + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(field=document_field,
                                             category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()
    for index, row in df.iterrows():
        if len(row) == 0:
            continue
        includes = row.dropna()
        if not csv_contains_regexps:
            includes = [i.strip().replace(' ', r'\s{1,100}') for i in includes]
        includes = [i for i in includes if i]

        if len(includes) == 1:
            log.info('There are no search strings specified for detected value {0}'.format(row[0]))
            continue

        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        detector.detected_value = row[0]
        detector.include_regexps = '\n'.join(includes[1:])
        detector.save()
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')

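# Hedged usage sketch for apply_simple_config. The CSV layout is inferred from the code
# above (first column = detected value, remaining columns = search strings, first row is
# treated as a header by pd.read_csv); the concrete values, my_choice_field and log are
# illustrative assumptions available in the calling context, not taken from the source.
example_csv = (
    b'value,search_string_1,search_string_2\n'
    b'New York,state of new york,ny law\n'
    b'Delaware,state of delaware,delaware law\n'
)
apply_simple_config(log,
                    document_field=my_choice_field,   # hypothetical DocumentField instance
                    csv=example_csv,
                    drop_previous_field_detectors=True,
                    update_field_choice_values=True)
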
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param document_initial_load:
    :param ignore_field_codes:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :return:
    """
    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Forbidden storing detected field values for document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    required_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    res = list()
    for field_code in sorted_codes:
        if ignore_field_codes and field_code in ignore_field_codes:
            continue
        if required_fields and field_code not in required_fields:
            continue

        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            field_vals = field_value_cache.cache_field_values(document, None, save=False)
            detected_values = field_detection_strategy.detect_field_values(
                log, document, field, field_vals)  # type: List[DetectedFieldValue]
        except Exception as e:
            msg = '''Unable to detect field value.
Document type: {0}
Document: {1}
Field: {2}'''.format(document_type.code, document.pk, field.code)
            log.error(render_error(msg, e))
            raise e

        if save_detected and clear_old_values:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            DocumentFieldValue.objects \
                .filter(document=document,
                        field=field,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .exclude(field__value_detection_strategy=DocumentField.VD_DISABLED) \
                .delete()

        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document,
                                             suggested_field_values=res,
                                             save=True,
                                             log=log,
                                             changed_by_user=changed_by_user,
                                             system_fields_changed=system_fields_changed,
                                             generic_fields_changed=generic_fields_changed,
                                             document_initial_load=document_initial_load)

    return res

def get_values(self, log: ProcessLogger, field: DocumentField, doc: Document, text: str) \
        -> List[Tuple[Any, Optional[int], Optional[int]]]:
    try:
        conf = getattr(field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig
    except DocumentSimilarityConfig.DoesNotExist:
        conf = None

    if conf:
        conf.self_validate()

    similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
    feature_vector_fields = field.depends_on_fields.all()
    date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
    date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS

    document_type = doc.document_type
    feature_vector_field_codes = {f.code for f in feature_vector_fields}

    # TODO: replace with the corresponding method call when ready
    doc_field_values = dict()
    for fv in doc.documentfieldvalue_set \
            .filter(field__code__in=feature_vector_field_codes.union({date_constraint_field_code})):
        if fv.removed_by_user:
            continue
        field = fv.field
        field_type = fv.field.get_field_type()  # type: FieldType
        doc_field_values[field.code] = field_type \
            .merge_multi_python_values(doc_field_values.get(field.code), fv.python_value)

    doc_field_values[FIELD_CODE_DOC_ID] = doc.pk

    doc_date = doc_field_values.get(date_constraint_field_code) if date_constraint_field_code else None
    if not doc_date:
        doc_date = doc.history.last().history_date
        date_constraint_field_code = FIELD_CODE_CREATE_DATE

    date_start = doc_date - timedelta(days=date_constraint_days)
    date_end = doc_date + timedelta(days=date_constraint_days)

    try:
        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)

        rawdb = RawDbRepository()
        where = SQLClause(f'"{FIELD_CODE_DOC_ID}" != %s '
                          f'and "{date_constraint_field_code}" >= %s '
                          f'and "{date_constraint_field_code}" <= %s',
                          [doc.pk, date_start, date_end])
        field_values_list = list(rawdb.get_field_values(
            document_type=document_type,
            where_sql=where,
            field_codes=feature_vector_field_codes.union({FIELD_CODE_DOC_ID, date_constraint_field_code})))

        if not field_values_list:
            return []

        field_values_list = [doc_field_values] + field_values_list

        feature_vectors = vectorizer.fit_transform(field_values_list)
        doc_feature_vectors = feature_vectors[0]
    except ValueError as ve:
        if 'empty vocabulary' in str(ve):
            log.info(f'Similarity: {field.code}: Vectorization got "empty vocabulary" - probably none of the docs '
                     f'contains any value in the feature vector fields.')
            return []
        raise ve

    similarities = cosine_similarity(doc_feature_vectors, feature_vectors)

    # TODO: Think about removing usage of other_field_values_list here and switching it to generator
    # to avoid storing the list of all field values. We only need feature vectors but they have no doc id.
    res = list()  # type: List[Tuple[Any, Optional[int], Optional[int]]]
    for y, field_values in enumerate(field_values_list):
        other_doc_pk = field_values[FIELD_CODE_DOC_ID]
        if doc.pk == other_doc_pk:
            continue
        similarity = similarities[0, y]
        if similarity < similarity_threshold:
            continue
        res.append((other_doc_pk, None, None))
        self._maybe_save_reverse_similarity_value(log=log, field=field, document=doc, other_doc_id=other_doc_pk)

    return res

def get_value(self,
              log: ProcessLogger,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any],
              location_text: Optional[str],
              location_start: int = 0,
              location_end: int = 0) -> Optional[FieldValueDTO]:
    try:
        conf = getattr(field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: Optional[DocumentSimilarityConfig]
    except DocumentSimilarityConfig.DoesNotExist:
        conf = None

    if conf:
        conf.self_validate()

    similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
    feature_vector_fields = field.depends_on_fields.all()
    date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
    date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS

    document_type = doc.document_type
    feature_vector_field_codes = {f.code for f in feature_vector_fields}

    doc_field_values = dict(cur_field_code_to_value)
    doc_field_values[FIELD_CODE_DOC_ID] = doc.pk

    if date_constraint_field_code:
        doc_date = doc_field_values.get(date_constraint_field_code)
        date_start = doc_date - timedelta(days=date_constraint_days)
        date_end = doc_date + timedelta(days=date_constraint_days)

        doc_ids_query = FieldValue.objects \
            .filter(field__code=date_constraint_field_code) \
            .filter(value__gte=date_start) \
            .filter(value__lte=date_end) \
            .filter(document__document_type_id=document_type.pk) \
            .exclude(document_id=doc.pk) \
            .values_list('document_id', flat=True)
    else:
        doc_date = doc.history.last().history_date
        date_start = doc_date - timedelta(days=date_constraint_days)
        date_end = doc_date + timedelta(days=date_constraint_days)

        doc_ids_query = Document.history \
            .filter(history_type='+',
                    history_date__gte=date_start,
                    history_date__lte=date_end,
                    document_type_id=document_type.pk) \
            .exclude(id=doc.pk) \
            .values_list('pk', flat=True)

    try:
        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)

        field_repo = DocumentFieldRepository()
        field_values_list = list()
        for doc_id, field_values in field_repo \
                .get_field_code_to_python_value_multiple_docs(document_type_id=document_type.pk,
                                                              doc_ids=doc_ids_query,
                                                              field_codes_only=feature_vector_field_codes):
            d = dict(field_values)
            d[FIELD_CODE_DOC_ID] = doc_id
            field_values_list.append(d)

        if not field_values_list:
            return None

        field_values_list = [doc_field_values] + field_values_list

        feature_vectors = vectorizer.fit_transform(field_values_list)
        doc_feature_vectors = feature_vectors[0]
    except ValueError as ve:
        if 'empty vocabulary' in str(ve):
            log.info(f'Similarity: {field.code}: Vectorization got "empty vocabulary" - probably none of the docs '
                     f'contains any value in the feature vector fields.')
            return None
        raise ve

    similarities = cosine_similarity(doc_feature_vectors, feature_vectors)

    # TODO: Think about removing usage of other_field_values_list here and switching it to generator
    # to avoid storing the list of all field values. We only need feature vectors but they have no doc id.
    res = set()  # type: Set[int]
    for y, field_values in enumerate(field_values_list):
        other_doc_pk = field_values[FIELD_CODE_DOC_ID]
        if doc.pk == other_doc_pk:
            continue
        similarity = similarities[0, y]
        if similarity < similarity_threshold:
            continue
        res.add(other_doc_pk)
        self._maybe_save_reverse_similarity_value(log=log, field=field, document=doc, other_doc_id=other_doc_pk)

    if res:
        field_value = sorted(res)[0]
        return FieldValueDTO(field_value)
    return None

def adapt_table_structure(log: ProcessLogger,
                          document_type: DocumentType,
                          force: bool = False,
                          check_only: bool = False) -> bool:
    """
    Create or alter raw db table for it to match the field structure of the specified document type.
    :param log:
    :param document_type:
    :param force: Force re-creating the table.
    :param check_only: Do not really alter the table but only check if the re-indexing will be required.
    :return: True/False - if any column has been added/removed/altered and re-index is required for this doc type.
    """
    table_name = doc_fields_table_name(document_type.code)

    fields = build_field_handlers(document_type, table_name)
    should_be_columns = dict()  # type: Dict[str, str]
    should_be_indexes = dict()  # type: Dict[str, str]
    for field_handler in fields:
        field_columns = field_handler.get_pg_column_definitions()  # type: Dict[str, field_handlers.PgTypes]
        should_be_columns.update({name: pg_type.value for name, pg_type in field_columns.items()})
        index_defs = field_handler.get_pg_index_definitions()
        if index_defs:
            should_be_indexes.update({build_index_name(table_name, index_def): index_def
                                      for index_def in index_defs})

    if not check_only and (force or not table_exists(table_name)):
        _recreate_document_fields_table(log, table_name, should_be_columns, should_be_indexes)
        cleanup_saved_filters(document_type, set(should_be_columns.keys()))
        return True

    reindex_needed = False

    dropped_columns = list()  # type: List[Tuple[str, str]]
    added_columns = list()  # type: List[Tuple[str, str]]

    with connection.cursor() as cursor:
        with transaction.atomic():
            existing_columns = get_table_columns_from_pg(cursor, table_name)  # type: Dict[str, str]

            alter_table_actions = list()  # type: List[str]

            for existing_name, existing_type in existing_columns.items():
                should_be_type = should_be_columns.get(existing_name)
                if not should_be_type or should_be_type != existing_type:
                    # column does not exist in "should_be_columns" or has different type
                    alter_table_actions.append('drop column "{column}"'.format(column=existing_name))
                    dropped_columns.append((existing_name, existing_type))

            for should_be_name, should_be_type in should_be_columns.items():
                existing_type = existing_columns.get(should_be_name)
                if not existing_type or existing_type != should_be_type:
                    # column does not exist in "existing_columns" or has
                    # different type (and has been dropped in prev loop)
                    alter_table_actions.append('add column "{column}" {pg_type}'
                                               .format(column=should_be_name, pg_type=should_be_type))
                    added_columns.append((should_be_name, should_be_type))

            if alter_table_actions:
                if not check_only:
                    alter_table_sql = 'alter table "{table_name}"\n{actions}' \
                        .format(table_name=table_name, actions=',\n'.join(alter_table_actions))
                    cursor.execute(alter_table_sql, [])
                    log.info('Altered table: {0}\nDropped columns:\n{1}\nAdded columns:\n{2}'
                             .format(table_name,
                                     '\n'.join([c + ': ' + t for c, t in dropped_columns]),
                                     '\n'.join([c + ': ' + t for c, t in added_columns])))
                    cleanup_saved_filters(document_type, set(should_be_columns.keys()))
                reindex_needed = True

        if not check_only:
            # Changes in indexes do not require document re-indexing - the values will already be in the columns.
            existing_indexes = get_table_index_names_from_pg(cursor, table_name)  # type: Set[str]
            for existing_index_name in existing_indexes:
                if existing_index_name not in should_be_indexes:
                    cursor.execute('drop index concurrently "{index_name}"'
                                   .format(index_name=existing_index_name), [])

            for should_be_index_name, should_be_index_def in should_be_indexes.items():
                if should_be_index_name not in existing_indexes:
                    create_index_sql = _build_create_index_statement(table_name,
                                                                     should_be_index_name,
                                                                     should_be_index_def)
                    cursor.execute(create_index_sql, [])

    return reindex_needed

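# Hedged usage sketch for adapt_table_structure: first check whether re-indexing would be
# required without touching the table, then apply the column/index changes. document_type
# and log are assumed to be available in the calling context; the re-index mechanism itself
# is not shown in this listing.
reindex_needed = adapt_table_structure(log, document_type, check_only=True)
if reindex_needed:
    adapt_table_structure(log, document_type)  # apply the changes
    # ...then re-index documents of this type so the new/changed columns get filled
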
def check_task_health(log: ProcessLogger, restart_task_func: Callable[[str], None]):
    """
    Find and process unhealthy tasks - the tasks which are hanging in PENDING while there is at least
    one free worker of each kind (default, high, doc_load).
    This is intended to wait silently until all other tasks are processed and only then re-send the
    hanged PENDING tasks.
    Goal state: if there are PENDING tasks which are not known by any worker - there should not be
    free workers of all types.
    """
    start_time = time()
    inspect_start_time = time()
    celery_stats = get_celery_stats()
    inspect_time_spent = time() - inspect_start_time

    if not celery_stats.free_workers_available_of_any_kind:
        log.info(f'Task health check: there are no workers at all or at least some kind of worker is still busy.\n'
                 f'Not checking for the hanged tasks.\n'
                 f'Celery inspect time: {inspect_time_spent:.3f}s\n')
        return

    query_time_start = time()

    # There is at least one free worker of each kind.
    # This means there should be no PENDING tasks not known to workers.
    # Increasing bad health check counter for the PENDING tasks not known to workers.
    Task.objects \
        .filter(own_status='PENDING', bad_health_check_num__lt=TASK_BAD_HEALTH_CHECK_RETRIES) \
        .exclude(queue=settings.CELERY_QUEUE_SERIAL) \
        .exclude(name__in=settings.EXCLUDE_FROM_TRACKING) \
        .exclude(pk__in=celery_stats.tasks_on_workers) \
        .update(bad_health_check_num=F('bad_health_check_num') + 1)

    # Set bad counter to zero for all tasks on workers
    Task.objects \
        .filter(pk__in=celery_stats.tasks_on_workers) \
        .exclude(bad_health_check_num=0) \
        .update(bad_health_check_num=0)

    # Restart those having the counter >= threshold
    to_restart = list(Task.objects
                      .filter(own_status='PENDING', bad_health_check_num=TASK_BAD_HEALTH_CHECK_RETRIES)
                      .values_list('pk', 'name'))

    query_time_spent = time() - query_time_start

    restarted_tasks = list()
    could_not_restart_tasks = list()
    for task_id, task_name in to_restart:
        try:
            restart_task_func(task_id)
            restarted_tasks.append((task_id, task_name))
        except Exception as ex:
            log.error(f'Unable to restart task {task_name} ({task_id})', exc_info=ex)
            could_not_restart_tasks.append((task_id, task_name))

    restarted_msg = '\n'.join(task_id + " " + task_name
                              for task_id, task_name in restarted_tasks) if restarted_tasks else 'no'
    problem_restarting_msg = '\n'.join(task_id + " " + task_name
                                       for task_id, task_name in could_not_restart_tasks) \
        if could_not_restart_tasks else 'no'

    log.info(f'Checked task health. Found {len(to_restart)} unhealthy tasks.\n'
             f'Total time: {time() - start_time:.3f}s\n'
             f'Celery inspect time: {inspect_time_spent:.3f}s\n'
             f'DB query time: {query_time_spent:.3f}s\n'
             f'Restarted tasks:\n{restarted_msg}\n'
             f'Could not restart tasks:\n{problem_restarting_msg}',
             extra={'log_unhealthy_tasks': bool(to_restart)})

def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param ignore_field_codes:
    :param document_initial_load:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :param skip_modified_values: don't overwrite field values overwritten by user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr

    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding further dependencies,
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type: List[str]
    detection_errors = []  # type: List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(modified_by__isnull=False,
                                                        document_id=document.pk)
                              .values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(
                log=log, doc=document, field=field, field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only few requested fields and trying to comply the dependency
            # tree makes no big sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res

def parse_file_local_xhtml(self,
                           local_path: str,
                           original_file_name: str,
                           task: Any,
                           timeout: int = 60,
                           encoding_name: str = 'utf-8',
                           logger: ProcessLogger = None,
                           enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
    Tika will return XHTML and TikaXhtmlParser will then parse the XHTML into plain text
    plus extra formatting information plus metadata.
    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param task: task object, passed to read_output
    :param timeout: timeout to interrupt Java process in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

    def err(line):
        logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    tika_default_command_list = self.tika_lexnlp_default_command_list
    if enable_ocr is False and self.tika_noocr_default_command_list is not None:
        tika_default_command_list = self.tika_noocr_default_command_list
    parse_commands = [tika_default_command_list, self.tika_default_command_list]

    from apps.document.app_vars import TIKA_PROCESS_RAM_MB_LIMIT
    ram_limit = TIKA_PROCESS_RAM_MB_LIMIT.val

    for cmd_index in range(len(parse_commands)):
        cmd_list = parse_commands[cmd_index]
        cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
        if ram_limit:
            java_index = cmd.index('java')
            cmd = cmd[:java_index + 1] + [f'-Xmx{ram_limit}m'] + cmd[java_index + 1:]
        logger.info(f'Tika (XHTML) args: {", ".join(cmd)}')

        last_try = cmd_index == len(parse_commands) - 1
        text = read_output(cmd, stderr_callback=err,
                           encoding=encoding_name, timeout_sec=timeout, task=task) or ''
        try:
            output = self.xhtml_parser.parse_text(text)
            output_len = output.pure_text_length if output else 0
            logger.info(f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}')
            if not output_len and not last_try:
                continue

            output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = {
                'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
            }
            return output
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text, str) else str(text)
            raise Exception('Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                            text_sample) from ex

def train_model(cls, log: ProcessLogger, field: DocumentField, train_data_sets: List[List[dict]],
                split_and_log_out_of_sample_test_report: bool = False) -> ClassifierModel:
    typed_field = TypedField.by(field)
    df = pd.DataFrame.from_records(train_data_sets.pop(0))
    # add transferred external data
    for train_data in train_data_sets:
        df = df.append(pd.DataFrame.from_records(train_data))

    df['target_name'] = df.apply(lambda row: encode_category(
        field.code,
        row.value if typed_field.is_choice_field else None,
        row.extraction_hint), axis=1)

    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    df = df.append([{'text_unit__textunittext__text': i}
                    for i in cls.get_no_field_text_units(field.document_type, field.text_unit_type)])

    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
    df['user_input'] = df['modified_by'].fillna(0).astype('bool')

    res_df = pd.DataFrame()

    for group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        res_df = res_df.append(group_df)

    res_df = shuffle(res_df)

    target_names = sorted(res_df['target_name'].unique())

    if field.classifier_init_script:
        try:
            clf = cls.init_classifier(field)
        except Exception as e:
            log.error(f'Unable to initialize classifier for field {field.code}. '
                      f'Classifier init script: {field.classifier_init_script}', exc_info=e)
    else:
        clf = SGDClassifier(loss='hinge', penalty='l2',
                            alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                            class_weight='balanced')

    log.info(f'Classifier initialized: {clf}')

    text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode',
                                                  analyzer='word',
                                                  stop_words='english',
                                                  tokenizer=word_position_tokenizer)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', clf)])
    x = res_df['text_unit__textunittext__text']
    y = res_df['target_index']

    if split_and_log_out_of_sample_test_report:
        x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
    else:
        x_train, x_test_os, y_train, y_test_os = x, None, y, None

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()
    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field

    classifier_model.classifier_accuracy_report_in_sample = \
        classification_report(y, text_clf.predict(x), target_names=target_names)

    if y_test_os is not None and x_test_os is not None:
        classifier_model.classifier_accuracy_report_out_of_sample = \
            classification_report(y_test_os, text_clf.predict(x_test_os), target_names=target_names)

    return classifier_model

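# Hedged usage sketch for train_model. The strategy class name below is a hypothetical
# placeholder (the enclosing class is not shown in this listing); log, field and
# train_data_sets are assumed to be prepared by the caller. ClassifierModel is used as a
# Django model elsewhere in this listing, so save() is assumed to persist the trained model.
classifier_model = FieldBasedMLOnlyFieldDetectionStrategy.train_model(  # hypothetical class name
    log, field, train_data_sets, split_and_log_out_of_sample_test_report=True)
classifier_model.save()
log.info(classifier_model.classifier_accuracy_report_in_sample)
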