def send_email(log: ProcessLogger, dst_user, subject: str, txt: str, html: str, image_dir: str,
               cc: Set[str] = None):
    if not dst_user.email:
        log.error('Destination user {0} has no email assigned'.format(dst_user.get_full_name()))
        return
    try:
        email = EmailMultiAlternatives(subject=subject,
                                       body=txt,
                                       cc=list(cc) if cc else None,
                                       from_email=settings.DEFAULT_FROM_EMAIL,
                                       to=['"{0}" <{1}>'.format(dst_user.get_full_name(), dst_user.email)])
        if html:
            images = [m.group(3) for m in RE_SRC_ATTACHMENT.finditer(html)]
            email_html = RE_SRC_ATTACHMENT.sub(r'\1cid:\3\4', html)
            email.attach_alternative(email_html, 'text/html')
            for image_fn in images:
                data = get_notification_template_resource(os.path.join(image_dir, image_fn))
                mime_type = get_predefined_mime_type(image_fn)
                try:
                    img = MIMEImage(data, _subtype=mime_type) if mime_type else MIMEImage(data)
                except TypeError as e:
                    raise RuntimeError(f"Couldn't guess MIME type for file {image_fn}") from e
                img.add_header('Content-Id', '<' + image_fn + '>')
                img.add_header("Content-Disposition", "inline", filename=image_fn)
                email.attach(img)
        email.send(fail_silently=False)
    except Exception as caused_by:
        log.error(f'Unable to send email to user "{dst_user.get_full_name()}" (#{dst_user.pk})',
                  exc_info=caused_by)
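# Note: RE_SRC_ATTACHMENT is not defined in this snippet. A minimal, hypothetical pattern that would be
# compatible with the group(3) / r'\1cid:\3\4' usage in send_email() above could look like the sketch
# below (the project's real pattern may differ):
import re

# group 1 = 'src="', group 2 = optional directory prefix, group 3 = bare file name, group 4 = closing quote
RE_SRC_ATTACHMENT_SKETCH = re.compile(r'(src=")([^"]*/)?([^"/]+)(")')

# Example: the image name 'logo.png' is extracted and the src is rewritten to the cid: reference that
# matches the Content-Id header attached to the inline image.
assert [m.group(3) for m in RE_SRC_ATTACHMENT_SKETCH.finditer('<img src="images/logo.png">')] == ['logo.png']
assert RE_SRC_ATTACHMENT_SKETCH.sub(r'\1cid:\3\4', '<img src="images/logo.png">') == '<img src="cid:logo.png">'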
def save_summary(self, log: ProcessLogger, user_id):
    # save DocumentTermUsage
    if self.located_usage_entities and TermUsage in self.located_usage_entities:
        term_usages = self.located_usage_entities[TermUsage]

        # update DocumentTermUsage records
        doc_term_usgs = {}  # type: Dict[Tuple[int, int], DocumentTermUsage]

        for tu in term_usages:  # type: TermUsage
            key = (tu.text_unit.document_id, tu.term.pk,)
            doc_usg = doc_term_usgs.get(key)
            if doc_usg:
                doc_usg.count += 1
            else:
                doc_usg = DocumentTermUsage()
                doc_usg.document_id = tu.text_unit.document_id
                doc_usg.term_id = tu.term.pk
                doc_usg.count = 1
                doc_term_usgs[key] = doc_usg

        if doc_term_usgs:
            doc_term_usgs_lst = [v for _, v in doc_term_usgs.items()]
            try:
                with transaction.atomic():
                    DocumentTermUsage.objects.bulk_create(doc_term_usgs_lst, ignore_conflicts=True)
            except Exception as e:
                log.error(f'Unable to store {len(doc_term_usgs)} DocumentTermUsage records.\n',
                          exc_info=e)
def _build_insert_clause(log: ProcessLogger,
                         table_name: str,
                         handlers: List[field_handlers.FieldHandler],
                         document: Document,
                         fields_to_python_values: Dict[str, Any]) -> SQLClause:
    insert_clauses = list()

    for handler in handlers:  # type: field_handlers.FieldHandler
        python_value = fields_to_python_values.get(handler.field_code)
        try:
            insert_clause = handler.get_pg_sql_insert_clause(document.language,
                                                             python_value)  # type: SQLInsertClause
            insert_clauses.append(insert_clause)
        except Exception as ex:
            msg = render_error('Unable to cache field values.\n'
                               'Document: {0} (#{1}).\n'
                               'Field: {2}'.format(document.name, document.id, handler.field_code),
                               caused_by=ex)
            log.error(msg)

    columns_clause, values_clause = SQLInsertClause.join(insert_clauses)

    insert_clause = format_clause('insert into "{table_name}" ({columns}) '
                                  'values ({values}) on conflict ({column_document_id}) '
                                  'do update set ({columns}) = ({values})',
                                  table_name=table_name,
                                  columns=columns_clause,
                                  values=values_clause,
                                  column_document_id=FIELD_CODE_DOC_ID)

    return insert_clause
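# For reference, the upsert rendered by _build_insert_clause() has roughly the following shape.
# Table and column names below are purely illustrative; the real columns and parameter values come
# from the field handlers' SQLInsertClause objects.
_EXAMPLE_RENDERED_UPSERT_SQL = (
    'insert into "doc_fields_contract" (document_id, party_name, sign_date) '
    'values (%s, %s, %s) on conflict (document_id) '
    'do update set (document_id, party_name, sign_date) = (%s, %s, %s)'
)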
def document_fields_change_listener_impl(_sender,
                                         signal,
                                         log: ProcessLogger,
                                         document_event: str,
                                         document: Document,
                                         field_handlers: Dict[str, FieldHandler],
                                         fields_before: Optional[Dict],
                                         fields_after: Optional[Dict],
                                         changed_by_user: User = None):
    from apps.task.tasks import call_task_func
    from apps.notifications.tasks import process_notifications_on_document_change

    if not changed_by_user:
        # we ignore changes made by system at the moment
        return

    if not fields_before and not fields_after:
        log.error('Document fields changed event appeared with both "before" and "after" fields empty.')
        return

    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    if APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val:
        return

    call_task_func(process_notifications_on_document_change,
                   (document_event, document.pk, fields_before, fields_after, changed_by_user.pk),
                   changed_by_user.pk)
def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                for entity_class in self.processed_usage_entity_classes:
                    entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if entities:
                    entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                    count += len(entities)

            tag_models = list()
            from apps.document.app_vars import LOCATE_TEXTUNITTAGS
            if LOCATE_TEXTUNITTAGS.val:
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(TextUnitTag(user_id=user_id,
                                                      text_unit_id=text_unit_id,
                                                      tag=tag))
                TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, len(tag_models), len(self.processed_text_unit_ids)))
    except Exception as e:
        entities_str = '\n'.join([str(c) for c in self.processed_usage_entity_classes])
        log.error(f'Unable to store location results.\n'
                  f'Text unit ids: {self.processed_text_unit_ids}\n'
                  f'Usage models caused the problem:\n{entities_str}',
                  exc_info=e)
    self.save_summary(log, user_id)
def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                if not self.document_initial_load:
                    TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

            tag_models = list()
            from apps.document.app_vars import LOCATE_TEXTUNITTAGS
            tags_saved = 0
            if LOCATE_TEXTUNITTAGS.val:
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(TextUnitTag(user_id=user_id,
                                                      text_unit_id=text_unit_id,
                                                      tag=tag))
                tags_saved = SafeBulkCreate.bulk_create(TextUnitTag.objects.bulk_create, tag_models)

            # save "_usage" objects
            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if not entities:
                    continue
                count += SafeBulkCreate.bulk_create(entity_class.objects, entities)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, tags_saved, len(self.processed_text_unit_ids)))
    except Exception as e:
        entities_str = '\n'.join([str(c) for c in self.processed_usage_entity_classes])
        log.error(f'Unable to store location results.\n'
                  f'Text unit ids: {self.processed_text_unit_ids}\n'
                  f'Usage models caused the problem:\n{entities_str}',
                  exc_info=e)
def try_parsing(self,
                log: ProcessLogger,
                locate_results: LocationResults,
                text: str,
                text_unit_id: int,
                text_unit_lang: str,
                document_id: int,
                document_project_id: int,
                **kwargs):
    if not text:
        return
    start = datetime.datetime.now()
    try:
        parse_results = self.parse(log, text, text_unit_id, text_unit_lang,
                                   locate_results.document_initial_load,
                                   **kwargs)  # type: ParseResults
        if parse_results:
            parse_results.update_doc_project_ids(document_id, document_project_id)
            locate_results.collect(self, text_unit_id, parse_results)
        elapsed = (datetime.datetime.now() - start).total_seconds()
        LocatingPerformanceMeter().add_record(str(type(self).__name__), elapsed, text_unit_id, text)
    except Exception as e:
        log.error(f'Exception caught while trying to run locator on a text unit.\n'
                  f'Locator: {self.__class__.__name__}\n'
                  f'Text unit id: {text_unit_id}\n'
                  f'Text: {text[:1024]}\n'
                  f'Text unit language: {text_unit_lang}\n',
                  exc_info=e)
def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
    """
    Refresh the specified materialized view and delete all refresh requests older than or equal to
    the last request date taken at the start of this method.

    Additionally, this method acquires a PG advisory lock to prevent parallel refreshing of the same view.
    The lock is also used by the planning routine, which tries to acquire it to avoid re-planning
    the same refresh while it is already running.
    :param log:
    :param view_name:
    :return:
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute(f'update {TABLE_M_VIEW} '
                           'set status=%s where view_name=%s;',
                           [MaterializedView.VIEW_STATUS_UPDATING, view_name])
    except Exception as e:
        log.error(f'Error saving updated status for view "{view_name}": {e}')

    with transaction.atomic():
        with connection.cursor() as cursor:
            if not self.advisory_lock_by_relation_name(cursor, view_name):
                log.info(f'Canceled refreshing materialized view: {view_name}. '
                         f'Unable to acquire the advisory lock.')
                cursor.execute(f'update {TABLE_M_VIEW} '
                               'set status=%s where view_name=%s;',
                               [MaterializedView.VIEW_STATUS_UPDATED, view_name])
                return
            log.info(f'Refreshing materialized view: {view_name}.')
            cursor.execute('select max(request_date) '
                           f'from {TABLE_M_VIEW_REQUEST} '
                           'where view_name = %s;',
                           [view_name])
            row = cursor.fetchone()
            request_date = row[0] if row else None

            concurrency_clause = ''
            from apps.materialized_views.app_vars import CONCURRENCY_UPDATE
            if CONCURRENCY_UPDATE.val:
                concurrency_clause = ' CONCURRENTLY'
            cursor.execute(f'refresh materialized view{concurrency_clause} {view_name};')

            if request_date is not None:
                cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s and request_date <= %s',
                               [view_name, request_date])
            else:
                cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s',
                               [view_name])
            dt_now = timezone.now()
            cursor.execute(f'insert into {TABLE_M_VIEW} '
                           '(view_name, refresh_date, status) '
                           'values (%s, %s, %s) '
                           'on conflict (view_name) do update set refresh_date = %s, '
                           'status = %s;',
                           [view_name, dt_now, MaterializedView.VIEW_STATUS_UPDATED,
                            dt_now, MaterializedView.VIEW_STATUS_UPDATED])
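# advisory_lock_by_relation_name() is not included in this snippet. A minimal sketch of the behavior
# refresh_materialized_view() relies on, assuming the method maps the view name to a stable numeric key
# and takes a transaction-scoped Postgres advisory try-lock (the project's real key derivation may differ):
import zlib


def advisory_lock_by_relation_name_sketch(cursor, relation_name: str) -> bool:
    """Try to acquire a transaction-level PG advisory lock keyed by a hash of the relation name.

    Returns True if the lock was acquired, False if another session already holds it.
    The lock is released automatically when the surrounding transaction ends.
    """
    lock_key = zlib.crc32(relation_name.encode('utf-8'))  # stable 32-bit key for this view name
    cursor.execute('select pg_try_advisory_xact_lock(%s);', [lock_key])
    return cursor.fetchone()[0]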
def try_parsing(self,
                log: ProcessLogger,
                locate_results: LocationResults,
                text: str,
                text_unit_id: int,
                text_unit_lang: str,
                **kwargs):
    try:
        parse_results = self.parse(log, text, text_unit_id, text_unit_lang, **kwargs)  # type: ParseResults
        if parse_results:
            locate_results.collect(self, text_unit_id, parse_results)
    except Exception as e:
        log.error(f'Exception caught while trying to run locator on a text unit.\n'
                  f'Locator: {self.__class__.__name__}\n'
                  f'Text unit id: {text_unit_id}\n'
                  f'Text: {text[:1024]}\n'
                  f'Text unit language: {text_unit_lang}\n',
                  exc_info=e)
def save(self, log: ProcessLogger, user_id):
    try:
        with transaction.atomic():
            if self.processed_text_unit_ids:
                TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                for entity_class in self.processed_usage_entity_classes:
                    entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if entities:
                    entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                    count += len(entities)

            tag_models = list()
            for text_unit_id, tags in self.tags.items():
                for tag in tags:
                    tag_models.append(TextUnitTag(user_id=user_id,
                                                  text_unit_id=text_unit_id,
                                                  tag=tag))
            TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)

            log.info('Stored {0} usage entities and {1} tags for {2} text units'.format(
                count, len(tag_models), len(self.processed_text_unit_ids)))
    except:
        msg = render_error('Unable to store location results.\n'
                           'Text unit ids: {text_unit_ids}\n'
                           'Usage models caused the problem:\n{entities}'.format(
                               text_unit_ids=self.processed_text_unit_ids,
                               entities='\n'.join([str(e) for e in self.processed_usage_entity_classes])))
        log.error(msg)
def try_parsing(self,
                log: ProcessLogger,
                locate_results: LocationResults,
                text: str,
                text_unit_id: int,
                text_unit_lang: str,
                **kwargs):
    try:
        parse_results = self.parse(text, text_unit_id, text_unit_lang, **kwargs)  # type: ParseResults
        locate_results.collect(self, text_unit_id, parse_results)
    except:
        msg = render_error('Exception caught while trying to run locator on a text unit.\n'
                           'Locator: {locator}\n'
                           'Text unit id: {text_unit_id}\n'
                           'Text: {text}\n'
                           'Text unit language: {text_unit_lang}\n'.format(
                               locator=self.__class__.__name__,
                               text_unit_id=text_unit_id,
                               text=text[:1024],
                               text_unit_lang=text_unit_lang))
        log.error(msg)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    log.debug('detect_field_value: csv_regexps_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    detectors = cls.detecting_cache.get_detectors(
        field.pk,
        lambda msg, er: log.error(msg, field_code=field.code, exc_info=er))
    if not detectors:
        return None

    is_multichoice = field.type == MultiChoiceField.type_code
    doc_text = cls.get_document_text(doc)

    annotations = []

    for detector in detectors:
        found_item = detector.find_value(doc_text)
        if not found_item:
            continue

        # TODO: implement reading values from full text (TextParts.FULL.value)
        # as it is done now, or from text units - paragraphs or sentences
        # based on field.text_unit_type - for other detector.text_part options
        """
        if detector.text_part == TextParts.BEFORE_REGEXP.value:
            return matching_string[:begin], 0, begin
        elif detector.text_part == TextParts.AFTER_REGEXP.value:
            return matching_string[end:], end, len(text)
        elif detector.text_part == TextParts.INSIDE_REGEXP.value:
            return matching_string[begin:end], begin, end
        else:
            return text, 0, len(text)
        """

        # starting position has to be shifted backward by 1 symbol for FE
        ant = AnnotationDTO(annotation_value=found_item[0],
                            location_in_doc_start=max(found_item[1] - 1, 0),
                            location_in_doc_end=found_item[2],
                            extraction_hint_name='')
        if not is_multichoice:
            return FieldValueDTO(field_value=found_item[0], annotations=[ant])
        else:
            annotations.append(ant)

    if annotations:
        f_val = [a.annotation_value for a in annotations]
        return FieldValueDTO(field_value=f_val, annotations=annotations)
    return None
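# The loop above only relies on detector.find_value(doc_text) returning None when nothing matches,
# or a (value, start, end) tuple in document-text coordinates otherwise. A hypothetical detector
# honoring that contract (not the project's CSV/regexp detector implementation) could look like:
import re


class SimpleRegexpDetectorSketch:
    def __init__(self, pattern: str, detected_value: str):
        self.pattern = re.compile(pattern, re.IGNORECASE)
        self.detected_value = detected_value

    def find_value(self, text: str):
        m = self.pattern.search(text)
        return (self.detected_value, m.start(), m.end()) if m else None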
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as
    Document.field_value. These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param ignore_field_codes:
    :param document_initial_load:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :param skip_modified_values: don't overwrite field values overwritten by user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr

    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding further dependencies,
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type: List[str]
    detection_errors = []  # type: List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:
            # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(
                log=log,
                doc=document,
                field=field,
                field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value detected for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(
                    document=document,
                    field=field,
                    field_value_dto=new_field_value_dto,
                    user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only a few requested fields and trying to comply
            # with the dependency tree makes no big sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
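# detect_and_cache_field_values_for_document() relies on get_dependent_fields() and
# order_field_detection() to narrow detection down to fields affected by an update and to run
# detection in dependency order. Their implementations are not part of this snippet; a minimal
# sketch of the behavior assumed above (not the project code) is:
from typing import List, Set, Tuple


def order_field_detection_sketch(fields_and_deps: List[Tuple[str, Set[str]]]) -> List[str]:
    """Topologically order field codes so every field is detected after the fields it depends on."""
    remaining = {code: set(deps) for code, deps in fields_and_deps}
    ordered = []
    while remaining:
        ready = sorted(code for code, deps in remaining.items() if not deps & remaining.keys())
        if not ready:
            raise ValueError('Cyclic field dependencies detected')
        for code in ready:
            ordered.append(code)
            del remaining[code]
    return ordered


def get_dependent_fields_sketch(fields_and_deps: List[Tuple[str, Set[str]]], changed: Set[str]) -> Set[str]:
    """Return all field codes that directly or transitively depend on any of the changed fields."""
    dependents = set()
    grew = True
    while grew:
        grew = False
        for code, deps in fields_and_deps:
            if code not in dependents and deps & (changed | dependents):
                dependents.add(code)
                grew = True
    return dependents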
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as
    Document.field_value. These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param document_initial_load:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :return:
    """
    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Forbidden storing detected field values for document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    required_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    res = list()
    for field_code in sorted_codes:
        if ignore_field_codes and field_code in ignore_field_codes:
            continue
        if required_fields and field_code not in required_fields:
            continue

        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            field_vals = field_value_cache.cache_field_values(document, None, save=False)
            detected_values = field_detection_strategy.detect_field_values(
                log, document, field, field_vals)  # type: List[DetectedFieldValue]
        except Exception as e:
            msg = '''Unable to detect field value.
Document type: {0}
Document: {1}
Field: {2}'''.format(document_type.code, document.pk, field.code)
            log.error(render_error(msg, e))
            raise e

        if save_detected and clear_old_values:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            DocumentFieldValue.objects \
                .filter(document=document,
                        field=field,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .exclude(field__value_detection_strategy=DocumentField.VD_DISABLED) \
                .delete()

        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document,
                                             suggested_field_values=res,
                                             save=True,
                                             log=log,
                                             changed_by_user=changed_by_user,
                                             system_fields_changed=system_fields_changed,
                                             generic_fields_changed=generic_fields_changed,
                                             document_initial_load=document_initial_load)

    return res
def cache_document_fields(log: ProcessLogger,
                          document: Document,
                          cache_generic_fields: bool = True,
                          cache_user_fields: bool = True,
                          pre_detected_field_codes_to_suggested_values: Optional[Dict[str, Any]] = None,
                          document_initial_load: bool = False,
                          changed_by_user: User = None):
    document_type = document.document_type
    table_name = doc_fields_table_name(document_type.code)

    cache_suggested_fields = pre_detected_field_codes_to_suggested_values is not None

    handlers = build_field_handlers(document_type,
                                    table_name,
                                    include_generic_fields=True,
                                    include_user_fields=True,
                                    include_suggested_fields=True)

    system_field_handlers = list()  # type: List[field_handlers.FieldHandler]
    generic_field_handlers = list()  # type: List[field_handlers.FieldHandler]
    user_field_handers = list()  # type: List[field_handlers.FieldHandler]
    user_suggested_field_handlers = list()

    for h in handlers:
        if h.field_code in FIELD_CODES_SYSTEM:
            system_field_handlers.append(h)
        elif h.field_code in _FIELD_CODES_GENERIC:
            generic_field_handlers.append(h)
        elif h.is_suggested:
            user_suggested_field_handlers.append(h)
        else:
            user_field_handers.append(h)

    insert_field_handlers = list()  # type: List[field_handlers.FieldHandler]
    field_to_python_values = dict()

    _fill_system_fields_to_python_values(document, field_to_python_values)
    insert_field_handlers += system_field_handlers

    if cache_generic_fields:
        _fill_generic_fields_to_python_values(document, field_to_python_values)
        insert_field_handlers += generic_field_handlers

    if cache_user_fields:
        if user_field_handers:
            insert_field_handlers += user_field_handers
            real_document_field_values = DocumentFieldValue.objects \
                .filter(document=document,
                        field__code__in={h.field_code for h in user_field_handers}) \
                .exclude(removed_by_user=True) \
                .select_related('field')  # type: List[DocumentFieldValue]
            for dfv in real_document_field_values:
                field_type = field_types.FIELD_TYPES_REGISTRY[dfv.field.type]
                field_to_python_values[dfv.field.code] = field_type.merge_multi_python_values(
                    field_to_python_values.get(dfv.field.code), dfv.python_value)

    if cache_suggested_fields and pre_detected_field_codes_to_suggested_values is not None:
        insert_field_handlers += user_suggested_field_handlers
        for field_code, python_value in pre_detected_field_codes_to_suggested_values.items():
            field_to_python_values[field_code + '_suggested'] = python_value

    insert_clause = _build_insert_clause(log, table_name, insert_field_handlers, document,
                                         field_to_python_values)

    with connection.cursor() as cursor:
        document_fields_before = _get_document_fields(cursor=cursor,
                                                      document_id=document.pk,
                                                      table_name=table_name,
                                                      handlers=handlers)
        try:
            cursor.execute(insert_clause.sql, insert_clause.params)
        except:
            import sys
            etype, evalue, _ = sys.exc_info()
            log.error('Error {etype}: {evalue}\n'
                      'in cache_document_fields(doc_id={document_id})\nSQL: {sql}\nParams: {ptrs}.\n\n'
                      .format(etype=etype,
                              evalue=evalue,
                              document_id=document.pk,
                              sql=insert_clause.sql,
                              ptrs=insert_clause.params))
            raise

    inserted_document_fields = {
        h.field_code: h.python_value_to_indexed_field_value(field_to_python_values.get(h.field_code))
        for h in insert_field_handlers}

    document_fields_after = dict(document_fields_before) if document_fields_before else dict()
    document_fields_after.update(inserted_document_fields)

    fire_document_fields_changed(cache_document_fields,
                                 log=log,
                                 document_event=DocumentEvent.CREATED.value
                                 if document_initial_load else DocumentEvent.CHANGED.value,
                                 document=document,
                                 field_handlers={h.field_code: h for h in handlers},
                                 fields_before=document_fields_before,
                                 fields_after=document_fields_after,
                                 changed_by_user=changed_by_user)
def train_model(cls, log: ProcessLogger, field: DocumentField, train_data_sets: List[List[dict]],
                split_and_log_out_of_sample_test_report: bool = False) -> ClassifierModel:
    typed_field = TypedField.by(field)
    df = pd.DataFrame.from_records(train_data_sets.pop(0))
    # add transferred external data
    for train_data in train_data_sets:
        df = df.append(pd.DataFrame.from_records(train_data))

    df['target_name'] = df.apply(lambda row: encode_category(
        field.code,
        row.value if typed_field.is_choice_field else None,
        row.extraction_hint), axis=1)

    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    df = df.append(
        [{'text_unit__textunittext__text': i} for i in
         cls.get_no_field_text_units(field.document_type, field.text_unit_type)])

    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
    df['user_input'] = df['modified_by'].fillna(0).astype('bool')

    res_df = pd.DataFrame()

    for group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        res_df = res_df.append(group_df)

    res_df = shuffle(res_df)

    target_names = sorted(res_df['target_name'].unique())

    if field.classifier_init_script:
        try:
            clf = cls.init_classifier(field)
        except Exception as e:
            log.error(f'Unable to initialize classifier for field {field.code}. '
                      f'Classifier init script: {field.classifier_init_script}', exc_info=e)
    else:
        clf = SGDClassifier(loss='hinge', penalty='l2',
                            alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                            class_weight='balanced')

    log.info(f'Classifier initialized: {clf}')

    text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode',
                                                  analyzer='word',
                                                  stop_words='english',
                                                  tokenizer=word_position_tokenizer)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', clf)])

    x = res_df['text_unit__textunittext__text']
    y = res_df['target_index']

    if split_and_log_out_of_sample_test_report:
        x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
    else:
        x_train, x_test_os, y_train, y_test_os = x, None, y, None

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()
    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field

    classifier_model.classifier_accuracy_report_in_sample = \
        classification_report(y, text_clf.predict(x), target_names=target_names)

    if y_test_os is not None and x_test_os is not None:
        classifier_model.classifier_accuracy_report_out_of_sample = \
            classification_report(y_test_os, text_clf.predict(x_test_os), target_names=target_names)

    return classifier_model
def check_task_health(log: ProcessLogger, restart_task_func: Callable[[str], None]):
    """
    Find and process unhealthy tasks - the tasks which are hanging in PENDING while there is at least
    one free worker of each kind (default, high, doc_load).
    This is intended to wait silently until all other tasks are processed and then re-send the hanged
    PENDING tasks.
    Goal state: if there are PENDING tasks which are not known by any worker - there should not be
    free workers of all types.
    """
    start_time = time()
    inspect_start_time = time()
    celery_stats = get_celery_stats()
    inspect_time_spent = time() - inspect_start_time

    if not celery_stats.free_workers_available_of_any_kind:
        log.info(f'Task health check: there are no workers at all or at least some kind of worker is still busy.\n'
                 f'Not checking for the hanged tasks.\n'
                 f'Celery inspect time: {inspect_time_spent:.3f}s\n')
        return

    query_time_start = time()

    # There is at least one free worker of each kind.
    # This means there should be no PENDING tasks not known to workers.
    # Increasing bad health check counter for the PENDING tasks not known to workers.
    Task.objects \
        .filter(own_status='PENDING', bad_health_check_num__lt=TASK_BAD_HEALTH_CHECK_RETRIES) \
        .exclude(queue=settings.CELERY_QUEUE_SERIAL) \
        .exclude(name__in=settings.EXCLUDE_FROM_TRACKING) \
        .exclude(pk__in=celery_stats.tasks_on_workers) \
        .update(bad_health_check_num=F('bad_health_check_num') + 1)

    # Set bad counter to zero for all tasks on workers
    Task.objects \
        .filter(pk__in=celery_stats.tasks_on_workers) \
        .exclude(bad_health_check_num=0) \
        .update(bad_health_check_num=0)

    # Restart those having the counter >= threshold
    to_restart = list(Task.objects
                      .filter(own_status='PENDING', bad_health_check_num=TASK_BAD_HEALTH_CHECK_RETRIES)
                      .values_list('pk', 'name'))

    query_time_spent = time() - query_time_start

    restarted_tasks = list()
    could_not_restart_tasks = list()
    for task_id, task_name in to_restart:
        try:
            restart_task_func(task_id)
            restarted_tasks.append((task_id, task_name))
        except Exception as ex:
            log.error(f'Unable to restart task {task_name} ({task_id})', exc_info=ex)
            could_not_restart_tasks.append((task_id, task_name))

    restarted_msg = '\n'.join(task_id + " " + task_name
                              for task_id, task_name in restarted_tasks) if restarted_tasks else 'no'
    problem_restarting_msg = '\n'.join(task_id + " " + task_name
                                       for task_id, task_name in could_not_restart_tasks) \
        if could_not_restart_tasks else 'no'

    log.info(f'Checked task health. Found {len(to_restart)} unhealthy tasks.\n'
             f'Total time: {time() - start_time:.3f}s\n'
             f'Celery inspect time: {inspect_time_spent:.3f}s\n'
             f'DB query time: {query_time_spent:.3f}s\n'
             f'Restarted tasks:\n{restarted_msg}\n'
             f'Could not restart tasks:\n{problem_restarting_msg}',
             extra={'log_unhealthy_tasks': bool(to_restart)})