def save_summary(self, log: ProcessLogger, user_id):
    # save DocumentTermUsage
    if self.located_usage_entities and TermUsage in self.located_usage_entities:
        term_usages = self.located_usage_entities[TermUsage]

        # aggregate TermUsage records into DocumentTermUsage counts, keyed by (document_id, term_id)
        doc_term_usgs = {}  # type: Dict[Tuple[int, int], DocumentTermUsage]

        for tu in term_usages:  # type: TermUsage
            key = (tu.text_unit.document_id, tu.term.pk,)
            doc_usg = doc_term_usgs.get(key)
            if doc_usg:
                doc_usg.count += 1
            else:
                doc_usg = DocumentTermUsage()
                doc_usg.document_id = tu.text_unit.document_id
                doc_usg.term_id = tu.term.pk
                doc_usg.count = 1
                doc_term_usgs[key] = doc_usg

        if doc_term_usgs:
            doc_term_usgs_lst = list(doc_term_usgs.values())
            try:
                with transaction.atomic():
                    DocumentTermUsage.objects.bulk_create(doc_term_usgs_lst, ignore_conflicts=True)
            except Exception as e:
                log.error(f'Unable to store {len(doc_term_usgs)} DocumentTermUsage records.\n',
                          exc_info=e)
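# A minimal, self-contained sketch of the aggregation pattern above: counting
# (document_id, term_id) pairs in a dict before a single bulk insert. The plain
# tuples below are hypothetical stand-ins for the TermUsage/DocumentTermUsage models.
from collections import Counter

def count_doc_term_usages(term_usages):
    # each usage is assumed to be a (document_id, term_id) pair
    return Counter(term_usages)

usages = [(1, 10), (1, 10), (2, 10), (1, 11)]
print(count_doc_term_usages(usages))
# Counter({(1, 10): 2, (2, 10): 1, (1, 11): 1}) - one row per pair would then be bulk-created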
def send_email(log: ProcessLogger, dst_user, subject: str, txt: str, html: str, image_dir: str,
               cc: Set[str] = None):
    if not dst_user.email:
        log.error('Destination user {0} has no email assigned'.format(dst_user.get_full_name()))
        return

    try:
        email = EmailMultiAlternatives(subject=subject,
                                       body=txt,
                                       cc=list(cc) if cc else None,
                                       from_email=settings.DEFAULT_FROM_EMAIL,
                                       to=['"{0}" <{1}>'.format(dst_user.get_full_name(), dst_user.email)])
        if html:
            # replace image "src" attributes with cid: references and attach the images inline
            images = [m.group(3) for m in RE_SRC_ATTACHMENT.finditer(html)]
            email_html = RE_SRC_ATTACHMENT.sub(r'\1cid:\3\4', html)
            email.attach_alternative(email_html, 'text/html')
            for image_fn in images:
                data = get_notification_template_resource(os.path.join(image_dir, image_fn))
                mime_type = get_predefined_mime_type(image_fn)
                try:
                    img = MIMEImage(data, _subtype=mime_type) if mime_type else MIMEImage(data)
                except TypeError as e:
                    raise RuntimeError(f"Couldn't guess MIME type for file {image_fn}") from e
                img.add_header('Content-Id', '<' + image_fn + '>')
                img.add_header('Content-Disposition', 'inline', filename=image_fn)
                email.attach(img)
        email.send(fail_silently=False)
    except Exception as caused_by:
        log.error(f'Unable to send email to user "{dst_user.get_full_name()}" (#{dst_user.pk})',
                  exc_info=caused_by)
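# A minimal sketch of the cid: inline-image technique used above, built with only
# the standard library (no Django): the HTML part references the image by Content-ID
# and the image part carries the matching header. Nothing here actually sends mail,
# and the placeholder bytes stand in for a real image file.
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

msg = MIMEMultipart('related')
msg.attach(MIMEText('<html><body><img src="cid:logo.png"></body></html>', 'html'))

img = MIMEImage(b'placeholder image bytes', _subtype='png')  # real PNG bytes in practice
img.add_header('Content-Id', '<logo.png>')  # must match the cid: reference in the HTML
img.add_header('Content-Disposition', 'inline', filename='logo.png')
msg.attach(img)
print(msg.as_string()[:120])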
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects
    as well as Document.field_value. These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :return:
    """
    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Storing detected field values is forbidden for documents with the "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields',
                                   queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    field_values_pre_cached = False
    res = list()
    for field_code in sorted_codes:
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        if not field_values_pre_cached \
                and field_detection_strategy.uses_cached_document_field_values(field):
            # pre-cache the Document.field_values structure for use by the field detection strategies
            document.field_values = field_value_cache.cache_field_values(document, None, save=False)
            field_values_pre_cached = True

        detected_values = field_detection_strategy.detect_field_values(
            log, document, field)  # type: List[DetectedFieldValue]
        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document, res, save=True, log=log)
    return res
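# order_field_detection() evidently sorts field codes so that every field comes after
# the fields it depends on. A minimal sketch of that topological ordering using the
# standard library (graphlib, Python 3.9+); the field codes are invented for illustration.
from graphlib import TopologicalSorter

fields_and_deps = {
    'total_amount': {'amount', 'currency'},
    'amount': set(),
    'currency': set(),
}
print(list(TopologicalSorter(fields_and_deps).static_order()))
# e.g. ['amount', 'currency', 'total_amount'] - dependencies always come first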
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()
        typed_field = TypedField.by(field)  # type: TypedField

        ants = list()  # type: List[AnnotationDTO]
        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        units_counted = 0
        for text_unit in qs_text_units.iterator():
            if field.detect_limit_count:
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break
            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text_unit=text_unit)
            if ant is None:
                continue
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                if ant.location_in_doc_start > field.detect_limit_count:
                    break
            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)
            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
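# The detect_limit_* checks above stop scanning once a unit or character budget is
# exhausted. A minimal sketch of that limiting logic over plain strings; the
# 'UNIT'/'CHAR' modes are simplified stand-ins for the DocumentField constants.
from typing import Iterable, Iterator

def limit_text_units(units: Iterable[str], limit: int, mode: str = 'UNIT') -> Iterator[str]:
    counted = 0
    for unit in units:
        counted += 1 if mode == 'UNIT' else len(unit)
        if counted > limit:
            break
        yield unit

units = ['first sentence.', 'second sentence.', 'third sentence.']
print(list(limit_text_units(units, limit=2, mode='UNIT')))   # first two units only
print(list(limit_text_units(units, limit=40, mode='CHAR')))  # stops once 40 chars are exceeded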
def _recreate_document_fields_table(log: ProcessLogger, table_name: str,
                                    column_defs: Dict[str, str],
                                    index_defs: Dict[str, str]):
    log.info('Recreating raw sql table: {0}'.format(table_name))

    column_def_clauses = [SQLClause('"{column}" {pg_type}'.format(column=column, pg_type=pg_type))
                          for column, pg_type in column_defs.items()]

    create_table = format_clause(
        'CREATE TABLE "{table_name}" (\n'
        '{columns}, \n'
        'FOREIGN KEY ({field_document_id}) '
        'REFERENCES document_document (id) ON DELETE CASCADE)',
        table_name=table_name,
        columns=join_clauses(', \n', column_def_clauses),
        field_document_id=FIELD_CODE_DOC_ID)  # type: SQLClause

    log.info('Create table SQL for table {0}:\n{1}\nParams: {2}'.format(
        table_name, create_table.sql, create_table.params))

    with connection.cursor() as cursor:
        cursor.execute('drop table if exists "{table_name}"'.format(table_name=table_name))
        cursor.execute(create_table.sql, create_table.params)
        for index_name, index_def in index_defs.items():  # type: str, str
            create_index = _build_create_index_statement(table_name, index_name, index_def)
            cursor.execute(create_index, [])
def try_parsing(self, log: ProcessLogger, locate_results: LocationResults, text: str,
                text_unit_id: int, text_unit_lang: str,
                document_id: int, document_project_id: int, **kwargs):
    if not text:
        return
    start = datetime.datetime.now()
    try:
        parse_results = self.parse(log, text, text_unit_id, text_unit_lang,
                                   locate_results.document_initial_load,
                                   **kwargs)  # type: ParseResults
        if parse_results:
            parse_results.update_doc_project_ids(document_id, document_project_id)
            locate_results.collect(self, text_unit_id, parse_results)
        elapsed = (datetime.datetime.now() - start).total_seconds()
        LocatingPerformanceMeter().add_record(str(type(self).__name__), elapsed, text_unit_id, text)
    except Exception as e:
        log.error(f'Exception caught while trying to run locator on a text unit.\n'
                  f'Locator: {self.__class__.__name__}\n'
                  f'Text unit id: {text_unit_id}\n'
                  f'Text: {text[:1024]}\n'
                  f'Text unit language: {text_unit_lang}\n',
                  exc_info=e)
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)  # type: PythonCodedField
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(field.python_coded_field))

    typed_field = TypedField.by(field)  # type: TypedField
    if python_coded_field.type != typed_field.type_code:
        raise RuntimeError(
            f'Python-coded field {python_coded_field.__class__.__name__} is '
            f'for fields of type {python_coded_field.type} and field {field.code} '
            f'is of type {typed_field.type_code}')

    log.debug('detect_field_value: python_coded_field_detection, '
              f'field {field.code}({field.pk}), document #{doc.pk}')
    field_value_dto = python_coded_field.get_value(log=log,
                                                   field=field,
                                                   doc=doc,
                                                   cur_field_code_to_value=field_code_to_value)
    if not typed_field.is_json_field_value_ok(field_value_dto.field_value):
        raise ValueError(
            f'Python coded field class {field.python_coded_field} returned value not suitable for '
            f'field {field.code} ({typed_field.type_code})')
    return field_value_dto
def _build_insert_clause(log: ProcessLogger, table_name: str,
                         handlers: List[field_handlers.FieldHandler],
                         document: Document,
                         fields_to_python_values: Dict[str, Any]) -> SQLClause:
    insert_clauses = list()
    for handler in handlers:  # type: field_handlers.FieldHandler
        python_value = fields_to_python_values.get(handler.field_code)
        try:
            insert_clause = handler.get_pg_sql_insert_clause(
                document.language, python_value)  # type: SQLInsertClause
            insert_clauses.append(insert_clause)
        except Exception as ex:
            msg = render_error('Unable to cache field values.\n'
                               'Document: {0} (#{1}).\n'
                               'Field: {2}'.format(document.name, document.id, handler.field_code),
                               caused_by=ex)
            log.error(msg)

    columns_clause, values_clause = SQLInsertClause.join(insert_clauses)

    insert_clause = format_clause(
        'insert into "{table_name}" ({columns}) '
        'values ({values}) on conflict ({column_document_id}) '
        'do update set ({columns}) = ({values})',
        table_name=table_name,
        columns=columns_clause,
        values=values_clause,
        column_document_id=FIELD_CODE_DOC_ID)
    return insert_clause
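# A minimal sketch of the "insert ... on conflict ... do update" upsert this function
# assembles, using plain strings instead of the SQLClause helpers. The table and
# column names are illustrative, not the real generated schema.
table = 'doc_fields_contract'
columns = ['document_id', 'field_a', 'field_b']
cols = ', '.join(f'"{c}"' for c in columns)
placeholders = ', '.join(['%s'] * len(columns))
sql = (f'insert into "{table}" ({cols}) values ({placeholders}) '
       f'on conflict ("document_id") do update set ({cols}) = ({placeholders})')
params = [42, 'value-a', 'value-b'] * 2  # values are bound twice: once to insert, once to update
print(sql)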
def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                if not self.document_initial_load:
                    TextUnitTag.objects.filter(
                        text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(
                            text_unit_id__in=self.processed_text_unit_ids).delete()

            tag_models = list()
            from apps.document.app_vars import LOCATE_TEXTUNITTAGS
            tags_saved = 0
            if LOCATE_TEXTUNITTAGS.val:
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(TextUnitTag(user_id=user_id,
                                                      text_unit_id=text_unit_id,
                                                      tag=tag))
                tags_saved = SafeBulkCreate.bulk_create(TextUnitTag.objects.bulk_create, tag_models)

            # save "_usage" objects
            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if not entities:
                    continue
                count += SafeBulkCreate.bulk_create(entity_class.objects, entities)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, tags_saved, len(self.processed_text_unit_ids)))
    except Exception as e:
        entities_str = '\n'.join([str(cls_) for cls_ in self.processed_usage_entity_classes])
        log.error(f'Unable to store location results.\n'
                  f'Text unit ids: {self.processed_text_unit_ids}\n'
                  f'Usage models caused the problem:\n{entities_str}',
                  exc_info=e)
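# SafeBulkCreate appears to wrap bulk_create so that one failing batch doesn't lose
# the rest. A minimal sketch of that chunk-and-continue idea over a plain callable;
# the chunk size and save_batch signature are illustrative assumptions.
from typing import Callable, List, Sequence

def safe_bulk_create(save_batch: Callable[[List], None], objects: Sequence, chunk_size: int = 100) -> int:
    saved = 0
    for i in range(0, len(objects), chunk_size):
        chunk = list(objects[i:i + chunk_size])
        try:
            save_batch(chunk)  # e.g. Model.objects.bulk_create(chunk, ignore_conflicts=True)
            saved += len(chunk)
        except Exception:
            continue  # skip the failing chunk, keep saving the rest
    return saved

print(safe_bulk_create(lambda chunk: None, list(range(250))))  # -> 250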
def document_fields_change_listener_impl(_sender,
                                         signal,
                                         log: ProcessLogger,
                                         document_event: str,
                                         document: Document,
                                         field_handlers: Dict[str, FieldHandler],
                                         fields_before: Optional[Dict],
                                         fields_after: Optional[Dict],
                                         changed_by_user: User = None):
    from apps.task.tasks import call_task_func
    from apps.notifications.tasks import process_notifications_on_document_change

    if not changed_by_user:
        # we ignore changes made by system at the moment
        return

    if not fields_before and not fields_after:
        log.error('Document fields changed event appeared with both "before" and "after" '
                  'fields empty.')
        return

    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    if APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val:
        return

    call_task_func(process_notifications_on_document_change,
                   (document_event, document.pk, fields_before, fields_after, changed_by_user.pk),
                   changed_by_user.pk)
def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                TextUnitTag.objects.filter(
                    text_unit_id__in=self.processed_text_unit_ids).delete()
                for entity_class in self.processed_usage_entity_classes:
                    entity_class.objects.filter(
                        text_unit_id__in=self.processed_text_unit_ids).delete()

            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if entities:
                    entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                    count += len(entities)

            tag_models = list()
            from apps.document.app_vars import LOCATE_TEXTUNITTAGS
            if LOCATE_TEXTUNITTAGS.val:
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(TextUnitTag(user_id=user_id,
                                                      text_unit_id=text_unit_id,
                                                      tag=tag))
                TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, len(tag_models), len(self.processed_text_unit_ids)))
    except Exception as e:
        entities_str = '\n'.join([str(cls_) for cls_ in self.processed_usage_entity_classes])
        log.error(f'Unable to store location results.\n'
                  f'Text unit ids: {self.processed_text_unit_ids}\n'
                  f'Usage models caused the problem:\n{entities_str}',
                  exc_info=e)
    self.save_summary(log, user_id)
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
    formula = field.formula
    if not formula:
        raise ValueError(f'No formula specified for field {field.code} (#{field.uid})')

    depends_on_field_codes = field.get_depends_on_codes() or set()
    field_code_to_value = {c: v for c, v in field_code_to_value.items()
                           if c in depends_on_field_codes}

    if field.stop_words:
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        log.debug('detect_field_value: formula_based_field_detection, checking stop words, '
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or list()
    else:
        log.debug('detect_field_value: formula_based_field_detection, '
                  f'field {field.code}({field.pk}), document #{doc.pk}')

    v = cls.calc_formula(field_code=field.code,
                         formula=formula,
                         depends_on_field_to_value=field_code_to_value,
                         convert_decimals_to_floats=field.convert_decimals_to_floats_in_formula_args)
    typed_field = TypedField.by(field)

    # We don't accept formulas returning values of the wrong type, to avoid further confusion
    # and wrong formulas in future. For example, for multi-choice fields the formula should
    # return a list and not a string, to make clear that the value replaces the whole set/list
    # of strings and does not just add one more string to it.
    if typed_field.is_choice_field and typed_field.multi_value:
        if v and isinstance(v, str):
            # An "outdated" formula may incorrectly return a string instead of a set/list;
            # we wrap it without warning the user: whoever updates this formula (or other
            # detection method) will be forced to write code returning a list or set.
            v = [v]

    if not typed_field.is_python_field_value_ok(v):
        raise ValueError(f'Formula of field {field.code} returned value not suitable '
                         f'for this field:\n{v}')
    v = typed_field.field_value_python_to_json(v)
    return FieldValueDTO(field_value=v)
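# calc_formula() evaluates an admin-authored Python expression against the values of
# the fields it depends on. A minimal, self-contained sketch of that idea using eval()
# with an emptied builtins namespace; the real implementation does more sandboxing,
# and the field codes below are invented for illustration.
def calc_formula_sketch(formula: str, depends_on_field_to_value: dict):
    namespace = {'__builtins__': {}}
    namespace.update(depends_on_field_to_value)
    return eval(formula, namespace)  # trusted, admin-authored formula assumed

print(calc_formula_sketch('amount * rate', {'amount': 100, 'rate': 0.2}))  # -> 20.0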
def parse_file_local_xhtml(self,
                           local_path: str,
                           original_file_name: str,
                           timeout: int = 60,
                           encoding_name: str = 'utf-8',
                           logger: ProcessLogger = None,
                           enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses a file (*.pdf, *.doc, *.docx, *.rtf, ...) by calling Tika as a local Java process.
    Tika returns XHTML, and TikaXhtmlParser then parses the XHTML into plain text plus extra
    formatting information plus metadata.
    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param timeout: timeout to interrupt the Java process, in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

    def err(line):
        logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    for cmd_list in [self.tika_default_command_list, self.tika_lexnlp_default_command_list]:
        cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
        last_try = cmd_list == self.tika_lexnlp_default_command_list
        text = read_output(cmd, stderr_callback=err,
                           encoding=encoding_name, timeout_sec=timeout) or ''
        try:
            output = self.xhtml_parser.parse_text(text)
            output_len = len(output.text) if output and output.text else 0
            logger.info(f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}')
            if not output_len and not last_try:
                continue
            output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = {
                'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
            }
            return output
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text, str) else str(text)
            raise Exception('Error in parse_file_local_xhtml -> parse_text(). Text:\n' +
                            text_sample) from ex
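# read_output() evidently runs the Tika command line, streams stderr to a callback and
# enforces a timeout. A minimal sketch of that contract with the standard library; the
# python3 command below stands in for the real Java/Tika invocation.
import subprocess

def read_output_sketch(cmd, stderr_callback, encoding='utf-8', timeout_sec=60):
    proc = subprocess.run(cmd, capture_output=True, timeout=timeout_sec)
    for line in proc.stderr.decode(encoding, errors='replace').splitlines():
        stderr_callback(line)
    return proc.stdout.decode(encoding, errors='replace')

text = read_output_sketch(['python3', '-c', 'print("parsed text")'],
                          stderr_callback=lambda line: print('stderr:', line))
print(text)  # -> parsed text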
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    log.debug('detect_field_value: regexps_and_text_based_ml_field_value, '
              f'field {field.code}({field.pk}), document #{doc.pk}')
    ants: List[AnnotationDTO] = []
    text_unit_repo = cls.text_unit_repo
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)

    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()
        for text_unit in qs_text_units.iterator():  # type: TextUnit
            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text=text_unit.text,
                                                location_start=text_unit.location_start,
                                                location_end=text_unit.location_end)
            if ant is None:
                continue
            ants.append(ant)
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

        if not ants:
            return None
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants]),
            annotations=ants)
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def parse_file_local_plain_text(self,
                                local_path: str,
                                original_file_name: str,
                                task: Any,
                                timeout: int = 60,
                                encoding_name: str = 'utf-8',
                                logger: ProcessLogger = None,
                                enable_ocr: bool = True) -> MarkedUpText:
    """
    Parses a file (*.pdf, *.doc, *.docx, *.rtf, ...) by calling Tika as a local Java process.
    Tika uses its plain text "stripper" and transforms the source document into plain text
    inside its (Java) process.
    :param local_path: local path to the file being parsed
    :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
    :param task: task object, passed through to read_output()
    :param timeout: timeout to interrupt the Java process, in seconds
    :param encoding_name: encoding to use, is passed to Tika
    :param logger: logger object to write errors and warnings
    :param enable_ocr: allow (True) converting images to text
    :return: MarkedUpText: text + metadata
    """
    mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
    # TIKA_MODE_PDF_ONLY is not used here at all
    os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
    os.environ[self.TIKA_PARSER_DETAIL] = ''

    tika_default_command_list = self.tika_lexnlp_default_command_list
    if enable_ocr is False and self.tika_noocr_default_command_list is not None:
        tika_default_command_list = self.tika_noocr_default_command_list
    cmd = tika_default_command_list + ['-J', '-t', f'-e{encoding_name}', local_path]

    def err(line):
        logger.info(f'TIKA parsing {original_file_name}:\n{line}')

    logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

    text = read_output(cmd, stderr_callback=err,
                       encoding=encoding_name,
                       timeout_sec=timeout,
                       task=task) or ''

    try:
        ptr_val = _parse((200, text))
        return MarkedUpText(text=ptr_val['content'], meta=ptr_val['metadata'])
    except Exception as ex:
        text_sample = text[:255] if text and isinstance(text, str) else str(text)
        raise Exception('Error in parse_file_local_plain_text -> _parse(). Text:\n' +
                        text_sample) from ex
def there_are_non_indexed_docs_not_planned_to_index(document_type: DocumentType,
                                                    log: ProcessLogger) -> bool:
    for doc_id in non_indexed_doc_ids_not_planned_to_index(document_type, 1):
        if doc_id:
            task_name = _get_reindex_task_name()
            fields_table = doc_fields_table_name(document_type.code)
            log.info(f'there_are_non_indexed_docs_not_planned_to_index: '
                     f'found document id={doc_id} of type {document_type.code}, '
                     f'task {task_name}. Fields table: {fields_table}')
            return True
    return False
def try_parsing(self, log: ProcessLogger, locate_results: LocationResults, text: str,
                text_unit_id: int, text_unit_lang: str, **kwargs):
    try:
        parse_results = self.parse(log, text, text_unit_id, text_unit_lang,
                                   **kwargs)  # type: ParseResults
        if parse_results:
            locate_results.collect(self, text_unit_id, parse_results)
    except Exception as e:
        log.error(f'Exception caught while trying to run locator on a text unit.\n'
                  f'Locator: {self.__class__.__name__}\n'
                  f'Text unit id: {text_unit_id}\n'
                  f'Text: {text[:1024]}\n'
                  f'Text unit language: {text_unit_lang}\n',
                  exc_info=e)
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    log.debug('detect_field_value: csv_regexps_field_detection, '
              f'field {field.code}({field.pk}), document #{doc.pk}')
    detectors = cls.detecting_cache.get_detectors(
        field.pk,
        lambda msg, er: log.error(msg, field_code=field.code, exc_info=er))
    if not detectors:
        return None

    is_multichoice = field.type == MultiChoiceField.type_code
    doc_text = cls.get_document_text(doc)

    annotations = []
    for detector in detectors:
        found_item = detector.find_value(doc_text)
        if not found_item:
            continue

        # TODO: implement reading values from the full text (TextParts.FULL.value),
        # as is done now, or from text units - paragraphs or sentences -
        # based on field.text_unit_type, for the other detector.text_part options:
        """
        if detector.text_part == TextParts.BEFORE_REGEXP.value:
            return matching_string[:begin], 0, begin
        elif detector.text_part == TextParts.AFTER_REGEXP.value:
            return matching_string[end:], end, len(text)
        elif detector.text_part == TextParts.INSIDE_REGEXP.value:
            return matching_string[begin:end], begin, end
        else:
            return text, 0, len(text)
        """

        # the starting position has to be shifted backward by 1 symbol for the frontend
        ant = AnnotationDTO(annotation_value=found_item[0],
                            location_in_doc_start=max(found_item[1] - 1, 0),
                            location_in_doc_end=found_item[2],
                            extraction_hint_name='')
        if not is_multichoice:
            return FieldValueDTO(field_value=found_item[0], annotations=[ant])
        else:
            annotations.append(ant)

    if annotations:
        f_val = [a.annotation_value for a in annotations]
        return FieldValueDTO(field_value=f_val, annotations=annotations)
    return None
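# detector.find_value() evidently returns the matched value together with its character
# span. A minimal sketch of such a regex-backed detector over plain text; the class
# shape and pattern are illustrative, not the DocumentFieldDetector API.
import re
from typing import Optional, Tuple

class RegexDetectorSketch:
    def __init__(self, pattern: str, detected_value: str):
        self.regex = re.compile(pattern, re.IGNORECASE)
        self.detected_value = detected_value

    def find_value(self, text: str) -> Optional[Tuple[str, int, int]]:
        m = self.regex.search(text)
        return (self.detected_value, m.start(), m.end()) if m else None

det = RegexDetectorSketch(r'governing\s{1,100}law', 'governing_law')
print(det.find_value('...the Governing Law of this Agreement is...'))  # ('governing_law', 7, 20)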
def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool,
                        csv_contains_regexps: bool = False):
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        choices = df[df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info('Creating {2} naive field detectors for document field {0} and document type {1}...'
             .format(document_field, document_field.document_type, df.shape[0]))
    log.set_progress_steps_number(int(row_num / 10) + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(field=document_field,
                                             category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()
    for index, row in df.iterrows():
        if len(row) == 0:
            continue
        includes = row.dropna()
        if not csv_contains_regexps:
            includes = [i.strip().replace(' ', r'\s{1,100}') for i in includes]
        includes = [i for i in includes if i]
        if len(includes) == 1:
            log.info('There are no search strings specified for detected value {0}'.format(row[0]))
            continue
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        detector.detected_value = row[0]
        detector.include_regexps = '\n'.join(includes[1:])
        detector.save()
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
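# A quick illustration of the plain-string-to-regexp conversion above: each space in a
# search string becomes \s{1,100}, so matches tolerate line breaks and irregular
# whitespace in extracted document text.
import re

include = 'governing law'.strip().replace(' ', r'\s{1,100}')
print(include)  # governing\s{1,100}law
print(bool(re.search(include, 'GOVERNING\n   LAW', re.IGNORECASE)))  # True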
def plan_refreshes(self, log: ProcessLogger, refresh_task_name: str,
                   plan_task_func: Callable[[str], None]):
    """
    Checks if there are materialized view refresh requests older than N seconds
    and plans the refreshing.
    The requests are inserted into the corresponding table by the document loading routines
    or any other code which changes the data on which these views are based.
    Maybe they will be replaced by a DB trigger in future.
    :param refresh_task_name: name of the task which performs the refresh
    :param plan_task_func:
    :param log
    :return:
    """
    from apps.materialized_views.app_vars import REFRESH_DELAY
    refresh_delay_sec = REFRESH_DELAY.val

    to_refresh = list()
    with connection.cursor() as cursor:
        cursor.execute(f'''select view_name, max(request_date)
                           from {TABLE_M_VIEW_REQUEST}
                           where to_jsonb(view_name) not in
                               (select args->0 from task_task where name = %s and own_status = %s)
                           group by view_name''',
                       (refresh_task_name, PENDING))
        for view_name, max_request_date in cursor.fetchall():  # type: str, datetime
            if timezone.now() - max_request_date > timedelta(seconds=refresh_delay_sec):
                to_refresh.append(view_name)

    # Here we use PG advisory locks to prevent planning the materialized view refresh if the
    # refresh is already being executed.
    # The same lock is acquired in refresh_materialized_view() by any Celery worker (maybe on
    # a different machine) which is running the refresh of the same view.
    # And the following code running in Celery-beat on the master machine checks if the refresh
    # is in progress by trying to acquire the lock.
    for view_name in to_refresh:
        with transaction.atomic():
            # We need to execute it in a separate transaction to release the PG advisory lock
            # before executing plan_task_func.
            # The cursor is closed at the transaction end, so we initialize it here and don't re-use it.
            with connection.cursor() as cursor:
                locked = self.advisory_lock_by_relation_name(cursor, view_name)
                if locked:
                    log.info(f'Planning refresh for materialized view {view_name}.')
                    plan_task_func(view_name)
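# A minimal sketch of the PG advisory-lock pattern used above: derive a stable signed
# 64-bit key from the view name and try to take a transaction-scoped lock. Only the SQL
# is shown (no live connection), and the hashing scheme is an assumption - not
# necessarily what advisory_lock_by_relation_name() does internally.
import hashlib

def advisory_lock_key(relation_name: str) -> int:
    digest = hashlib.sha1(relation_name.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big', signed=True)  # PG advisory lock keys are signed bigints

sql = 'select pg_try_advisory_xact_lock(%s);'
params = [advisory_lock_key('my_materialized_view')]
print(sql, params)
# pg_try_advisory_xact_lock() returns true/false immediately and the lock is released
# automatically at transaction end - which is why each view gets its own transaction above.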
def detect_field_value(cls, log: ProcessLogger, doc: Document, field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    try:
        log.debug('detect_field_value: regexps_field_detection, '
                  f'field {field.code}({field.pk}), document #{doc.pk}')
    except AttributeError:
        pass

    ants: List[AnnotationDTO] = []
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo: TextUnitRepository = cls.text_unit_repo
    field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(qs_text_units, field)

    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    for text_unit in qs_text_units:
        unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
        if not unit_ants:
            continue
        if not isinstance(typed_field, MultiValueField):
            return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                 annotations=unit_ants)
        else:
            ants += unit_ants

    if not ants:
        return None
    if isinstance(typed_field, MultiValueField):
        field_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants])
    else:
        field_value = typed_field.annotation_value_python_to_json(ants[0].annotation_value)
    return FieldValueDTO(field_value=field_value, annotations=ants)
def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                TextUnitTag.objects.filter(
                    text_unit_id__in=self.processed_text_unit_ids).delete()
                for entity_class in self.processed_usage_entity_classes:
                    entity_class.objects.filter(
                        text_unit_id__in=self.processed_text_unit_ids).delete()

            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if entities:
                    entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                    count += len(entities)

            tag_models = list()
            for text_unit_id, tags in self.tags.items():
                for tag in tags:
                    tag_models.append(TextUnitTag(user_id=user_id,
                                                  text_unit_id=text_unit_id,
                                                  tag=tag))
            TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, len(tag_models), len(self.processed_text_unit_ids)))
    except Exception:
        msg = render_error('Unable to store location results.\n'
                           'Text unit ids: {text_unit_ids}\n'
                           'Usage models caused the problem:\n{entities}'.format(
                               text_unit_ids=self.processed_text_unit_ids,
                               entities='\n'.join([str(e) for e in self.processed_usage_entity_classes])))
        log.error(msg)
def try_parsing(self, log: ProcessLogger, locate_results: LocationResults, text: str,
                text_unit_id: int, text_unit_lang: str, **kwargs):
    try:
        parse_results = self.parse(text, text_unit_id, text_unit_lang,
                                   **kwargs)  # type: ParseResults
        locate_results.collect(self, text_unit_id, parse_results)
    except Exception:
        msg = render_error('Exception caught while trying to run locator on a text unit.\n'
                           'Locator: {locator}\n'
                           'Text unit id: {text_unit_id}\n'
                           'Text: {text}\n'
                           'Text unit language: {text_unit_lang}\n'.format(
                               locator=self.__class__.__name__,
                               text_unit_id=text_unit_id,
                               text=text[:1024],
                               text_unit_lang=text_unit_lang))
        log.error(msg)
def train_document_field_detector_model(cls, log: ProcessLogger, field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False,
                                        split_and_log_out_of_sample_test_report: bool = False) \
        -> Optional[ClassifierModel]:
    log.info(f'Training model for field {field.code} (#{field.pk})...')

    if train_data_project_ids and not use_only_confirmed_field_values:
        train_data_sets = cls.get_train_datasets_from_projects(field.pk, train_data_project_ids)
    else:
        train_data_sets = cls.get_train_data_sets(field, train_data_project_ids)

    if not train_data_sets:
        log.info(f'Not enough data to train model for document_type {field.document_type.code}, '
                 f'field: {field.code}.')
        return None

    classifier_model = cls.train_model(log, field, train_data_sets,
                                       split_and_log_out_of_sample_test_report)
    log.info(f'Finished training model for document_type {field.document_type.code}, '
             f'field: {field.code}.')
    return classifier_model
def train_document_field_detector_model(cls, log: ProcessLogger, field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False) \
        -> Optional[ClassifierModel]:
    log.info(f'Training model for field {field.code} (#{field.pk})...')

    if train_data_project_ids and not use_only_confirmed_field_values:
        train_data_sets = cls.get_train_datasets_from_projects(field.pk, train_data_project_ids)
    else:
        train_data_sets = cls.get_train_data_sets(field, train_data_project_ids)

    if not train_data_sets:
        log.info('Not enough data to train model for document_type #{0} and field #{1}.'.format(
            field.document_type.pk, field.pk))
        return None

    classifier_model = cls.train_model(field, train_data_sets)
    log.info('Finished training model for document_type #{0} and field #{1}.'.format(
        field.document_type.pk, field.pk))
    return classifier_model
def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
    """
    Refreshes the specified materialized view and deletes all refresh requests older than
    or equal to the last request date taken at this method's start.
    Additionally this method acquires a PG advisory lock to prevent parallel refreshing
    of the same view.
    The lock is used by the planning routine which tries to acquire the lock
    to prevent re-planning the same refresh if it is already running.
    :param view_name:
    :param log
    :return:
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute(f'update {TABLE_M_VIEW} '
                           'set status=%s where view_name=%s;',
                           [MaterializedView.VIEW_STATUS_UPDATING, view_name])
    except Exception as e:
        log.error(f'Error saving updated status for view "{view_name}": {e}')

    with transaction.atomic():
        with connection.cursor() as cursor:
            if not self.advisory_lock_by_relation_name(cursor, view_name):
                log.info(f'Canceled refreshing materialized view: {view_name}. '
                         f'Unable to acquire the advisory lock.')
                cursor.execute(f'update {TABLE_M_VIEW} '
                               'set status=%s where view_name=%s;',
                               [MaterializedView.VIEW_STATUS_UPDATED, view_name])
                return
            log.info(f'Refreshing materialized view: {view_name}.')
            cursor.execute('select max(request_date) '
                           f'from {TABLE_M_VIEW_REQUEST} '
                           'where view_name = %s;',
                           [view_name])
            row = cursor.fetchone()
            request_date = row[0] if row else None

            concurrency_clause = ''
            from apps.materialized_views.app_vars import CONCURRENCY_UPDATE
            if CONCURRENCY_UPDATE.val:
                concurrency_clause = ' CONCURRENTLY'
            cursor.execute(f'refresh materialized view{concurrency_clause} {view_name};')

            if request_date is not None:
                cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s and request_date <= %s',
                               [view_name, request_date])
            else:
                cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s',
                               [view_name])
            dt_now = timezone.now()
            cursor.execute(f'insert into {TABLE_M_VIEW} '
                           '(view_name, refresh_date, status) '
                           'values (%s, %s, %s) '
                           'on conflict (view_name) do update set refresh_date = %s, '
                           'status = %s;',
                           [view_name, dt_now, MaterializedView.VIEW_STATUS_UPDATED,
                            dt_now, MaterializedView.VIEW_STATUS_UPDATED])
def document_change_listener(event: events.DocumentChangedEvent):
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    from apps.rawdb.field_value_tables import cache_document_fields
    log = event.log or ProcessLogger()
    cache_document_fields(log=log,
                          document=event.document,
                          cache_generic_fields=event.generic_fields_changed,
                          cache_user_fields=event.user_fields_changed,
                          pre_detected_field_codes_to_suggested_values=event.pre_detected_field_values)
def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
    """
    Refreshes the specified materialized view and deletes all refresh requests older than
    or equal to the last request date taken at this method's start.
    Additionally this method acquires a PG advisory lock to prevent parallel refreshing
    of the same view.
    The lock is used by the planning routine which tries to acquire the lock
    to prevent re-planning the same refresh if it is already running.
    :param view_name:
    :param log
    :return:
    """
    with transaction.atomic():
        with connection.cursor() as cursor:
            if not self.advisory_lock_by_relation_name(cursor, view_name):
                log.info(f'Canceled refreshing materialized view: {view_name}. '
                         f'Unable to acquire the advisory lock.')
                return
            log.info(f'Refreshing materialized view: {view_name}.')
            cursor.execute('select max(request_date) '
                           'from materialized_views_materializedviewrefreshrequest '
                           'where view_name = %s',
                           [view_name])
            row = cursor.fetchone()
            request_date = row[0] if row else None
            cursor.execute(f'refresh materialized view {view_name}')
            if request_date is not None:
                cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                               'where view_name = %s and request_date <= %s',
                               [view_name, request_date])
            else:
                cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                               'where view_name = %s',
                               [view_name])
            dt_now = timezone.now()
            cursor.execute('insert into materialized_views_materializedview '
                           '(view_name, refresh_date) values (%s, %s) '
                           'on conflict (view_name) do update set refresh_date = %s',
                           [view_name, dt_now, dt_now])
def send_email(self, log: ProcessLogger = None, subject: str = None,
               text: str = None, html: str = None):
    from apps.notifications.notifications import send_email
    link = self.get_link(abs_path=True, as_html=False)
    default_subject = 'Document Files Ready to Download'
    default_msg_template = 'You can download your documents {}'
    default_text = default_msg_template.format(link)
    default_html = default_msg_template.format(f'<a href="{link}">here</a>')
    send_email(log=log or ProcessLogger(),
               dst_user=self.user,
               subject=subject or default_subject,
               txt=text or default_text,
               html=html or default_html)
    self.email_sent = True
    self.save()
def document_change_listener_impl(sender,
                                  signal,
                                  log: ProcessLogger,
                                  document: Document,
                                  system_fields_changed: FieldSpec = True,
                                  generic_fields_changed: FieldSpec = True,
                                  user_fields_changed: bool = True,
                                  changed_by_user: User = None,
                                  document_initial_load: bool = False):
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    from apps.rawdb.field_value_tables import cache_document_fields
    log = log or ProcessLogger()
    cache_document_fields(log=log,
                          document=document,
                          cache_system_fields=system_fields_changed,
                          cache_generic_fields=generic_fields_changed,
                          cache_user_fields=user_fields_changed,
                          changed_by_user=changed_by_user,
                          document_initial_load=document_initial_load)