def manual_reindex(task: ExtendedTask, document_type_code: str = None, force: bool = False):
    """
    Manually start re-indexing of the raw DB tables.

    When raw DB caching is disabled via APP_VAR_DISABLE_RAW_DB_CACHING the
    call only logs a notice and returns. Otherwise it logs the call and
    delegates to adapt_tables_and_reindex(), passing True as its fourth
    positional argument.

    :param task: task the call runs in; used for logging and passed through.
    :param document_type_code: document type code, passed through as is.
    :param force: passed through as the third positional argument.
    """
    caching_disabled = APP_VAR_DISABLE_RAW_DB_CACHING.val
    if caching_disabled:
        task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
        return

    message = (f'manual_reindex called for {document_type_code}. '
               f'Task: {task.task_name}, main id: {task.main_task_id}')
    logger = CeleryTaskLogger(task)
    logger.info(message)

    adapt_tables_and_reindex(task, document_type_code, force, True)
def rollback(task, project_clustering_id, project_id, new_project_id, reassign_cluster_ids):
    """
    Revert a reassignment of clustered documents and record the failure
    in ProjectClustering.metadata.

    :param task: task used for logging and as the source of the signal sender.
    :param project_clustering_id: id of the ProjectClustering to annotate.
    :param project_id: project the documents are moved back to.
    :param new_project_id: project id stored in the failure record.
    :param reassign_cluster_ids: ids of the document clusters being rolled back.
    """
    task.log_error('Rollback Reassigning Document Cluster documents.')
    logger = CeleryTaskLogger(task)
    project_clustering = ProjectClustering.objects.get(id=project_clustering_id)

    # all documents belonging to the clusters being rolled back
    reassigned_docs = Document.objects.filter(documentcluster__pk__in=reassign_cluster_ids)

    # notify listeners that the documents are gone from the target doc type
    for doc in reassigned_docs:
        signals.document_deleted.send(task.__name__, user=None, document=doc)

    # move the documents back to the source project and the generic doc type
    reassigned_docs.update(project_id=project_id, document_type=DocumentType.generic())

    # notify listeners about the change for the generic doc type
    for doc in reassigned_docs:
        signals.document_changed.send(task.__name__, log=logger, document=doc)

    # append a failure record to the reassigning history kept in metadata
    history = project_clustering.metadata.get('reassigning', [])
    history.append({
        'date': now().isoformat(),
        'new_project_id': new_project_id,
        'cluster_ids': reassign_cluster_ids,
        'status': FAILURE,
    })
    project_clustering.metadata['reassigning'] = history
    project_clustering.save()
def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: Set):
    """
    Cache generic values and detected field values for the given documents.

    For every document found by primary key this caches the generic values,
    runs field detection (third positional argument False,
    clear_old_values=False) and stores the returned values in the field
    value cache with save=True.

    :param _task: task whose logger is handed to the caching routines.
    :param doc_ids: primary keys of the documents to process.
    """
    # The logger does not depend on the document, so build it once
    # instead of once per document.
    log = CeleryTaskLogger(_task)

    for doc in Document.objects.filter(pk__in=doc_ids):
        field_value_cache.cache_generic_values(doc, log=log)
        suggested_values = field_detection.detect_and_cache_field_values_for_document(
            log, doc, False, clear_old_values=False)
        field_value_cache.cache_field_values(doc, suggested_values, save=True, log=log)
def cache_document_fields_for_doc_ids(task: ExtendedTask, doc_ids: Set):
    """
    Run cache_document_fields() for every document with the given ids.

    :param task: task whose logger is handed to cache_document_fields().
    :param doc_ids: primary keys of the documents to cache.
    """
    # This task is added to exclude-from-tracking list and is not seen in task list at /advanced
    # Also if running it as a sub-task it will not participate in the parent task's progress.
    logger = CeleryTaskLogger(task)
    documents = Document.all_objects \
        .filter(pk__in=doc_ids) \
        .select_related('document_type', 'assignee', 'status')
    for document in documents:  # type: Document
        cache_document_fields(logger, document)
def train_model_for_field(task: ExtendedTask, field_uid: str) -> None:
    """
    Train a detector model for one document field and store it.

    If training yields a model, the ClassifierModel rows of the field are
    deleted and the new model is saved; otherwise nothing is changed.

    :param task: task whose logger is passed to the training routine.
    :param field_uid: primary key of the DocumentField to train.
    """
    field = DocumentField.objects.get(pk=field_uid)
    logger = CeleryTaskLogger(task)
    trained_model = field_detection.train_document_field_detector_model(
        logger, field, None)  # type: ClassifierModel

    if not trained_model:
        return

    ClassifierModel.objects.filter(document_field=field).delete()
    trained_model.save()
def auto_reindex_not_tracked(task: ExtendedTask, document_type_code: str = None, force: bool = False):
    """
    Check raw DB tables of document types and plan manual_reindex when needed.

    Returns immediately when raw DB caching is disabled. For each document
    type (a single one if document_type_code is given, otherwise all) the
    table structure is adapted; manual_reindex is then planned either when
    the adaptation reports that a re-index is needed, or when there are
    non-indexed documents with no re-index planned and no other re-index
    task exists for the document type.

    :param task: running task; used for logging and to read the user id.
    :param document_type_code: document type code or None for all types.
    :param force: passed to adapt_table_structure() as ``force``.
    """
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return

    if document_type_code is None:
        doc_types = DocumentType.objects.all()
    else:
        doc_types = [DocumentType.objects.get(code=document_type_code)]

    logger = CeleryTaskLogger(task)
    task_record = task.task

    for doc_type in doc_types:
        if adapt_table_structure(logger, doc_type, force=force):
            forced_suffix = ', forced' if force else ''
            task.log_info(
                f'Re-index from auto_reindex_not_tracked, {task.name}, '
                f'for {doc_type}{forced_suffix}')
            call_task_func(manual_reindex, (doc_type.code, False), task_record.user_id)
            continue

        # Table untouched: re-index only if something is left non-indexed
        # and nobody else is already going to index it.
        if not there_are_non_indexed_docs_not_planned_to_index(doc_type, logger):
            continue
        if any_other_reindex_task(task.request.id, doc_type.code).exists():
            continue

        task.log_info(
            f'auto_reindex_not_tracked({doc_type.code}): '
            f'there_are_non_indexed_docs_not_planned_to_index')
        call_task_func(manual_reindex, (doc_type.code, False), task_record.user_id)
def rollback(task, project_id, cluster_ids, new_project_id):
    """
    Revert a reassignment of clustered documents and record the failure
    in the metadata of the owning ProjectClustering.

    The ProjectClustering is looked up through the first id in cluster_ids.

    :param task: task used for logging and as the source of the signal sender.
    :param project_id: project the documents are moved back to.
    :param cluster_ids: ids of the document clusters being rolled back.
    :param new_project_id: project id stored in the failure record.
    """
    task.log_error('Rollback Reassigning Document Cluster documents.')
    logger = CeleryTaskLogger(task)

    clustered_docs = Document.objects.filter(documentcluster__pk__in=cluster_ids)

    for doc in clustered_docs:
        signals.document_deleted.send(task.__name__, user=None, document=doc)

    # back to the source project and the generic document type
    clustered_docs.update(project_id=project_id, document_type=DocumentType.generic())

    for doc in clustered_docs:
        signals.document_changed.send(task.__name__, log=logger, document=doc)

    project_clustering = ProjectClustering.objects.get(document_clusters__pk=cluster_ids[0])
    history = project_clustering.metadata.get('reassigning', [])
    history.append({
        'date': now().isoformat(),
        'new_project_id': new_project_id,
        'cluster_ids': cluster_ids,
        'status': FAILURE,
    })
    project_clustering.metadata['reassigning'] = history
    project_clustering.save()
def load_doc(task: ExtendedTask,
             document: Document,
             document_fields: Dict,
             run_detect_field_values: bool,
             filed_owners: dict = None):
    """
    Save a document together with its loaded field values.

    Inside one transaction the document is saved (inserted when it has no
    pk yet), previously stored values matching the filter below are
    removed for an existing document, the loaded values are saved and
    field detection / caching is run.

    :param task: task used for logging.
    :param document: document to save.
    :param document_fields: raw field data passed to load_field_values().
    :param run_detect_field_values: selects the detection call variant.
    :param filed_owners: optional mapping passed to load_field_values();
        an empty dict is used when omitted.
    """
    filed_owners = filed_owners or {}
    fields_to_values = LoadDocumentWithFields.load_field_values(
        task, document, document_fields, filed_owners)
    logger = CeleryTaskLogger(task)

    with transaction.atomic():
        is_new = document.pk is None
        document.save(force_insert=is_new)

        if not is_new:
            stale_values = DocumentFieldValue.objects.filter(
                document=document,
                removed_by_user=False,
                created_by__isnull=True,
                modified_by__isnull=True)
            stale_values.delete()

        for field, values in fields_to_values.items():
            field_detection.save_detected_values(document, field, values)

        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(logger, document, True)
        else:
            detected = field_detection.detect_and_cache_field_values_for_document(
                logger, document, False)
            field_value_cache.cache_field_values(document, detected, save=True)

    task.log_info('Loaded {0} field values for document #{1} ({2})'
                  .format(len(fields_to_values), document.pk, document.name))
def process(self, **kwargs):
    """
    Cluster the documents of a project and refresh their cached data.

    Reads ``n_clusters`` (default 3), ``method`` (default 'kmeans'),
    ``cluster_by`` (default 'term') and ``project_clustering_id`` from
    kwargs. The ProjectClustering is marked PENDING, previous clusters of
    the project are dropped, ClusterDocuments is run, its metadata and
    cluster ids are stored, a document-changed event is fired for every
    document of the project, and the status is set to SUCCESS.

    :return: metadata of the clustering result.
    """
    n_clusters = kwargs.get('n_clusters', 3)
    method = kwargs.get('method', 'kmeans')
    cluster_by = kwargs.get('cluster_by', 'term')
    self.project_clustering_id = kwargs.get('project_clustering_id')

    clustering = ProjectClustering.objects.get(pk=self.project_clustering_id)
    clustering.status = PENDING
    clustering.task = self.task
    clustering.save()

    project = clustering.project

    self.log_info('Start clustering documents for project id={}'.format(project.id))
    self.log_info('Clustering method: "{}", n_clusters={}'.format(method, n_clusters))
    self.log_info('Cluster by: {}'.format(str(cluster_by)))

    self.set_push_steps(4)
    self.push()

    # clear previous clusters, their tasks and cluster sessions
    project.drop_clusters(exclude_task_ids={self.request.id},
                          exclude_project_clustering_id=self.project_clustering_id)
    self.push()

    clusterer = ClusterDocuments(project_id=project.id,
                                 cluster_algorithm=method,
                                 n_clusters=n_clusters,
                                 cluster_by=cluster_by,
                                 use_default_name=True,
                                 log_message=self.log_wo_flooding)
    result = clusterer.run()

    clustering.metadata = result.metadata
    clustering.save()
    clustering.document_clusters.add(*result.metadata['cluster_obj_ids'])
    self.push()

    self.log_info('Clustering completed. Updating document cache.')
    logger = CeleryTaskLogger(self)
    changed_generic_fields = [DocumentGenericField.cluster_id.value]
    for document in Document.objects.filter(project__pk=project.id):
        signals.fire_document_changed(sender=self,
                                      log=logger,
                                      document=document,
                                      changed_by_user=None,
                                      system_fields_changed=False,
                                      user_fields_changed=False,
                                      generic_fields_changed=changed_generic_fields)

    clustering.status = SUCCESS
    clustering.save()
    self.push()

    self.log_info('Finished.')
    return result.metadata
def test_field_detector_model(task: ExtendedTask, field_id, document_id) -> dict:
    """
    Compare freshly detected values of a field with the values stored for
    a document.

    Detection is run with save=False. For value-aware fields the merged
    python values are compared; otherwise the sets of text unit ids are
    compared. A mismatch is logged together with the text of the detected
    text units.

    :param task: task used for logging.
    :param field_id: primary key of the DocumentField.
    :param document_id: primary key of the Document.
    :return: dict with ``text_units_number`` (count of the document's text
        units of the field's text unit type) and ``value_matches_expected``.
    """
    document = Document.objects.get(pk=document_id)  # type: Document
    field = DocumentField.objects.get(pk=field_id)  # type: DocumentField

    expected_dfvs = field_detection.detect_and_cache_field_values(
        CeleryTaskLogger(task), document, field, save=False)  # type: List[DetectedFieldValue]
    actual_dfvs = list(
        DocumentFieldValue.objects.filter(
            document=document, field=field, removed_by_user=False).all())  # type: List[DocumentFieldValue]

    if field.is_value_aware():
        # dates, numbers, e.t.c.
        expected_field_values = field_detection.merge_detected_field_values_to_python_value(
            expected_dfvs)
        expected_field_value = expected_field_values.get(field.code)

        actual_field_values = merge_document_field_values_to_python_value(actual_dfvs)
        actual_field_value = actual_field_values.get(field.code)

        matches = bool(expected_field_value == actual_field_value)
    else:
        # related-info e.t.c.
        expected_set = {
            'text_unit_' + str(dfv.text_unit.id)
            for dfv in expected_dfvs if dfv.text_unit
        }
        expected_field_value = '; '.join(sorted(expected_set))

        actual_set = {
            'text_unit_' + str(dfv.text_unit.id)
            for dfv in actual_dfvs if dfv.text_unit
        }
        actual_field_value = '; '.join(sorted(actual_set))
        matches = bool(expected_set == actual_set)

    if not matches:
        # A detected value may have no text unit (the set building above
        # already allows for that), so skip such values instead of failing
        # with AttributeError while composing the log message.
        if expected_dfvs:
            detected_text = '\n---\n'.join(
                dfv.text_unit.text for dfv in expected_dfvs if dfv.text_unit)
        else:
            detected_text = ''
        project_name = document.project.name if document.project else ''
        # This branch is reached only on a mismatch, so the marker is always '[ ERR ]'.
        task.log_info(
            '{3} Test doc: {0} (Project: {5}). '
            'Detected: {1}. Real: {2}.\nDetected in text:-----\n{4}\n-----'
            .format(document.name,
                    expected_field_value,
                    actual_field_value,
                    '[ ERR ]',
                    detected_text,
                    project_name))

    text_units_number = TextUnit.objects.filter(
        document=document, unit_type=field.text_unit_type).count()

    return {
        'text_units_number': text_units_number,
        'value_matches_expected': matches
    }
def plan_refreshing_materialized_views(_celery_task):
    """
    Ask the MaterializedViews repository to plan view refreshes.

    The repository receives a logger bound to the given task, the name of
    the refresh task and a callback that enqueues refresh_materialized_view
    asynchronously for a view name.

    :param _celery_task: task the logger is bound to.
    """
    def _enqueue_refresh(view_name: str):
        refresh_materialized_view.apply_async((view_name,))

    repository = MaterializedViews()
    logger = CeleryTaskLogger(_celery_task)
    repository.plan_refreshes(logger,
                              task_names.TASK_NAME_REFRESH_MATERIALIZED_VIEW,
                              _enqueue_refresh)
def manual_reindex(task: ExtendedTask,
                   document_type_code: str = None,
                   force: bool = False,
                   project_id: Optional[int] = None):
    """
    Manually start re-indexing of the raw DB tables, optionally for one project.

    When raw DB caching is disabled via APP_VAR_DISABLE_RAW_DB_CACHING the
    call only logs a notice and returns. Otherwise the run parameters are
    logged and the work is delegated to adapt_tables_and_reindex().

    :param task: task the call runs in; used for logging and passed through.
    :param document_type_code: document type code, passed through as is.
    :param force: passed through; listed in the log only when truthy.
    :param project_id: passed through; listed in the log only when truthy.
    """
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
        return

    run_parameters = {'document type': document_type_code}
    if project_id:
        run_parameters['project'] = project_id
    if force:
        run_parameters['force'] = True

    ptrs_str = ', '.join(f'{name}={value}' for name, value in run_parameters.items())
    message = (f'manual_reindex called for {ptrs_str}. '
               f'Task: {task.task_name}, main id: {task.main_task_id}')
    logger = CeleryTaskLogger(task)
    logger.info(message)

    adapt_tables_and_reindex(task, document_type_code, force, True, project_id)
def reassign_document(task, document_id):
    """
    Send the document-deleted signal for a document and re-detect its field values.

    :param task: task used for logging; its class is the signal sender.
    :param document_id: primary key of the document.
    """
    document = Document.objects.get(pk=document_id)
    signals.document_deleted.send(task.__class__, user=None, document=document)

    logger = CeleryTaskLogger(task)
    detected = field_detection.detect_and_cache_field_values_for_document(
        log=logger,
        document=document,
        system_fields_changed=True,
        generic_fields_changed=True)

    task.log_info(f'Detected {len(detected)} field values for document '
                  + f'#{document.id} ({document.name})')
def train_model_for_dirty_field(task: ExtendedTask, dirty_field_id: Any) -> None:
    """
    Retrain the detector model of a field marked as dirty.

    Nothing happens unless the field reports can_retrain(). Otherwise the
    dirty flag is cleared and saved first; training runs only when the
    number of approved documents reaches
    ``trained_after_documents_number``. A trained model replaces the
    ClassifierModel rows of the field.

    :param task: task whose logger is passed to the training routine.
    :param dirty_field_id: primary key of the DocumentField.
    """
    dirty_field = DocumentField.objects.get(pk=dirty_field_id)
    if not dirty_field.can_retrain():
        return

    dirty_field.dirty = False
    dirty_field.save()

    approved_count = field_detection_utils.get_approved_documents_number(dirty_field, None)
    if approved_count < dirty_field.trained_after_documents_number:
        return

    trained_model = field_detection.train_document_field_detector_model(
        CeleryTaskLogger(task), dirty_field, None)
    if trained_model:
        ClassifierModel.objects.filter(document_field=dirty_field).delete()
        trained_model.save()
def process(self,
            document_field: Dict,
            config_csv_file: Dict,
            drop_previous_field_detectors: bool,
            update_field_choice_values: bool,
            **kwargs):
    """
    Apply a simple field detection config taken from the DB cache.

    The cache entry named by ``config_csv_file['cache_key']`` is always
    cleaned afterwards, whether or not applying the config succeeded.

    :param document_field: dict with the ``pk`` of the DocumentField.
    :param config_csv_file: dict with the ``cache_key`` of the cached CSV.
    :param drop_previous_field_detectors: passed to apply_simple_config().
    :param update_field_choice_values: passed to apply_simple_config().
    """
    try:
        self.log_info('Going to configure simple field detection config...')
        field = DocumentField.objects.get(pk=document_field['pk'])
        config_bytes = DbCache.get(config_csv_file['cache_key'])
        logger = CeleryTaskLogger(self)
        apply_simple_config(logger,
                            field,
                            config_bytes,
                            drop_previous_field_detectors,
                            update_field_choice_values)
    finally:
        DbCache.clean_cache(config_csv_file['cache_key'])
def process(self, document_type_config_csv_file: Dict, action: str, update_cache: bool, **kwargs):
    """
    Validate and/or import a document type dump taken from the DB cache.

    ``action`` selects the (save, auto-fix, remove-missing) flags; an
    unknown action raises RuntimeError. The cache entry is always cleaned
    after the import attempt. When the import was saved and
    ``update_cache`` is set, the raw DB table structure is adapted (unless
    raw DB caching is disabled) and field value caching sub-tasks are
    started for the documents of the type in chunks of 50 ids.

    :param document_type_config_csv_file: dict with the ``cache_key`` of the dump.
    :param action: one of the action codes listed in the table below.
    :param update_cache: whether to refresh cached field values after a saved import.
    """
    # action -> (save, auto_fix_validation_errors, remove_missed_objects)
    flags_by_action = {
        'validate': (False, False, False),
        'validate|import': (True, False, False),
        'import|auto_fix|retain_missing_objects': (True, True, False),
        'import|auto_fix|remove_missing_objects': (True, True, True),
    }
    if action not in flags_by_action:
        raise RuntimeError('Unknown action')
    save, auto_fix_validation_errors, remove_missed_objects = flags_by_action[action]

    try:
        json_bytes = DbCache.get(document_type_config_csv_file['cache_key'])
        document_type = import_document_type(
            json_bytes=json_bytes,
            save=save,
            auto_fix_validation_errors=auto_fix_validation_errors,
            remove_missed_in_dump_objects=remove_missed_objects,
            task=self)
    finally:
        DbCache.clean_cache(document_type_config_csv_file['cache_key'])

    if not save or not update_cache:
        return

    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
        self.log_info('Adapting RawDB table structure after import ...')
        adapt_table_structure(CeleryTaskLogger(self), document_type, force=False)

    doc_ids = list(Document.all_objects
                   .filter(document_type=document_type)
                   .values_list('pk', flat=True))

    self.log_info('Caching document field values ...')
    for id_chunk in chunks(doc_ids, 50):
        self.run_sub_tasks('Cache field values for a set of documents',
                           ImportDocumentType.cache_document_fields_for_doc_ids,
                           [(list(id_chunk),)])
def _detect_field_values_for_document(task: ExtendedTask, document_id, do_not_write):
    """
    Detect and cache field values for one document and log how many were found.

    :param task: task used for logging.
    :param document_id: primary key of the document.
    :param do_not_write: when truthy, detection is called with save=False.
    """
    document = Document.objects.get(pk=document_id)
    logger = CeleryTaskLogger(task)

    # If the document is in one of completed statuses then
    # the detected values wont be stored even if do_not_write = False.
    # But caching should go as usual.
    should_save = not do_not_write
    detected = field_detection.detect_and_cache_field_values_for_document(
        logger, document, save=should_save)

    task.log_info('Detected {0} field values for document #{1} ({2})'.format(
        len(detected), document_id, document.name))
def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Adapt the raw table of each document type to its current field
    structure and start re-index sub-tasks for the affected documents.

    This task should be always executed in the "serial" queue (used for Celery Beat)
    to avoid parallel modifications on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE

    :param task: running task; used for logging and for starting sub-tasks.
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return: None
    """
    if document_type_code is None:
        document_types = DocumentType.objects.all()
    else:
        document_types = [DocumentType.objects.get(code=document_type_code)]

    logger = CeleryTaskLogger(task)

    for document_type in document_types:
        reindex_needed = adapt_table_structure(logger, document_type, force=force_recreate_tables)

        if force_recreate_tables:
            # "Force" requested: cancel every other planned re-index task for
            # this document type and plan re-indexing of all its documents.
            for other_task in any_other_reindex_task(task.request.id, document_type.code):
                purge_task(other_task)
            id_packs = get_all_doc_ids(document_type.uid, 20)
        elif reindex_needed or force_reindex:
            task.log_info('Raw DB table for document type {0} has been altered. '
                          'Initiating re-index for all documents of this document type.'.format(document_type.code))
            # The field structure was altered (or re-index is forced), but no
            # table re-creation is forced: plan re-index only for documents
            # that have no re-index planned yet.
            id_packs = get_all_doc_ids_not_planned_to_index_by_doc_type(document_type.uid, 20)
        else:
            # Table was not altered. Only documents which are not indexed and
            # have no re-index planned are picked up; other re-index tasks are
            # left running.
            id_packs = non_indexed_doc_ids_not_planned_to_index(document_type, 20)

        args = [(ids,) for ids in id_packs]
        task.run_sub_tasks('Reindex set of documents',
                           cache_document_fields_for_doc_ids_tracked,
                           args)
def _detect_field_values_for_document(task: ExtendedTask, document_id, do_not_write):
    """
    Detect and cache field values for one document unless its status is not active.

    :param task: task used for logging.
    :param document_id: primary key of the document.
    :param do_not_write: when truthy, detection is called with save=False.
    """
    document = Document.objects.get(pk=document_id)

    status = document.status
    if status and not status.is_active:
        task.log_info('Forbidden detecting field values for document with "completed"'
                      ' status, document #{} ({})'.format(document_id, document.name))
        return

    logger = CeleryTaskLogger(task)
    detected = field_detection.detect_and_cache_field_values_for_document(
        logger, document, save=not do_not_write)

    task.log_info('Detected {0} field values for document #{1} ({2})'.format(
        len(detected), document_id, document.name))
def train_model_for_dirty_field(task: ExtendedTask, dirty_field_id: Any) -> None:
    """
    Retrain the detector model of a document type field marked as dirty.

    Nothing happens unless the field reports can_retrain(). Otherwise the
    dirty flag is cleared and saved first; training runs only when the
    number of approved documents reaches
    settings.ML_TRAIN_DATA_SET_GROUP_LEN. A trained model replaces the
    ClassifierModel rows of the document type / field pair.

    :param task: task whose logger is passed to the training routine.
    :param dirty_field_id: primary key of the DocumentTypeField.
    """
    candidates = DocumentTypeField.objects \
        .filter(pk=dirty_field_id) \
        .prefetch_related('document_type', 'document_field')
    dirty_field = candidates[0]

    if not dirty_field.can_retrain():
        return

    dirty_field.dirty = False
    dirty_field.save()

    document_type = dirty_field.document_type
    field = dirty_field.document_field

    approved_count = field_detection_utils.get_approved_documents_number(document_type, field, None)
    if approved_count < settings.ML_TRAIN_DATA_SET_GROUP_LEN:
        return

    trained_model = field_detection.train_document_field_detector_model(
        CeleryTaskLogger(task), document_type, field, None)
    if trained_model:
        ClassifierModel.objects.filter(document_type=document_type, document_field=field).delete()
        trained_model.save()
def cache_document_fields_for_doc_ids(task: ExtendedTask,
                                      doc_ids: Iterable,
                                      changed_by_user_id: int = None,
                                      cache_system_fields: FieldSpec = True,
                                      cache_generic_fields: FieldSpec = True,
                                      cache_user_fields: bool = True):
    """
    Run cache_document_fields() for every document with the given ids.

    A Document.DoesNotExist raised while caching one document does not
    stop the run: the document is skipped and the skip is logged.

    :param task: task whose logger is handed to cache_document_fields().
    :param doc_ids: primary keys of the documents to cache.
    :param changed_by_user_id: optional pk of the user passed on as ``changed_by_user``.
    :param cache_system_fields: passed through to cache_document_fields().
    :param cache_generic_fields: passed through to cache_document_fields().
    :param cache_user_fields: passed through to cache_document_fields().
    """
    log = CeleryTaskLogger(task)
    changed_by_user = User.objects.get(pk=changed_by_user_id) if changed_by_user_id is not None else None

    documents = Document.all_objects \
        .filter(pk__in=doc_ids) \
        .select_related('document_type', 'assignee', 'status')
    for doc in documents:  # type: Document
        try:
            cache_document_fields(log,
                                  doc,
                                  changed_by_user=changed_by_user,
                                  cache_system_fields=cache_system_fields,
                                  cache_generic_fields=cache_generic_fields,
                                  cache_user_fields=cache_user_fields)
        except Document.DoesNotExist:
            # Best effort: keep processing the remaining documents, but
            # leave a trace instead of swallowing the error silently.
            log.info(f'Document not found while caching fields of document #{doc.pk}; skipped')
def detect_field_values_for_document(task: ExtendedTask,
                                     detect_ptrs: DocDetectFieldValuesParams):
    """
    Detect and cache field values for the document described by ``detect_ptrs``.

    :param task: task used for logging; ``task.task.user`` is passed as ``changed_by_user``.
    :param detect_ptrs: carries document_id, do_not_write, clear_old_values
        and updated_field_codes.
    """
    document = Document.all_objects.get(pk=detect_ptrs.document_id)
    logger = CeleryTaskLogger(task)

    # If the document is in one of completed statuses then
    # the detected values wont be stored even if do_not_write = False.
    # But caching should go as usual.
    detected = field_detection.detect_and_cache_field_values_for_document(
        logger,
        document,
        changed_by_user=task.task.user,
        save=not detect_ptrs.do_not_write,
        clear_old_values=detect_ptrs.clear_old_values,
        updated_field_codes=detect_ptrs.updated_field_codes)

    task.log_info(f'Detected {len(detected)} field values for document '
                  + f'#{detect_ptrs.document_id} ({document.name})')
def load_doc(task: ExtendedTask,
             document: Document,
             field_values_alias_to_value: Dict[str, Any],
             run_detect_field_values: bool,
             field_owners: Dict[str, User] = None):
    """
    Save a document with its loaded field values through the field repository.

    Inside one transaction the document is saved (inserted when it has no
    pk yet), a DocumentMetadata row is created, every loaded value is
    written via DocumentFieldRepository, and then either field detection
    is run or a document-changed event is fired.

    :param task: task used for logging and as the event sender.
    :param document: document to save.
    :param field_values_alias_to_value: raw values passed to load_field_values().
    :param run_detect_field_values: run detection instead of only firing the changed event.
    :param field_owners: optional mapping of field code to the user recorded
        for the value; an empty dict is used when omitted.
    """
    field_owners = field_owners or {}
    fields_to_values = LoadDocumentWithFields.load_field_values(
        task, document, field_values_alias_to_value)
    logger = CeleryTaskLogger(task)

    import apps.document.repository.document_field_repository as dfr
    repository = dfr.DocumentFieldRepository()

    with transaction.atomic():
        is_new = document.pk is None
        document.save(force_insert=is_new)
        DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

        for field, value_dto in fields_to_values.items():
            owner = field_owners.get(field.code)
            repository.update_field_value_with_dto(document=document,
                                                   field=field,
                                                   field_value_dto=value_dto,
                                                   user=owner)

        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(
                log=logger, document=document, save=True, clear_old_values=False)
        else:
            signals.fire_document_changed(sender=task,
                                          log=logger,
                                          document=document,
                                          changed_by_user=None,
                                          document_initial_load=True,
                                          system_fields_changed=True,
                                          generic_fields_changed=True,
                                          user_fields_changed=True)

    loaded_summary = ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())
    task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                  .format(len(fields_to_values), document.pk, document.name, loaded_summary))
def process_document_changed(task: ExtendedTask,
                             doc_id: int,
                             system_fields_changed: FieldSpec = True,
                             generic_fields_changed: FieldSpec = True,
                             user_fields_changed: bool = True,
                             changed_by_user_id: int = None):
    """
    Fire the document-changed event for one document.

    When the status system field is among the changed system fields,
    hidden field values are deleted first if needed.

    :param task: task used as the event sender and for logging.
    :param doc_id: primary key of the document.
    :param system_fields_changed: passed through to fire_document_changed().
    :param generic_fields_changed: passed through to fire_document_changed().
    :param user_fields_changed: passed through to fire_document_changed().
    :param changed_by_user_id: optional pk of the user passed on as ``changed_by_user``.
    """
    from apps.document.repository.document_field_repository import DocumentFieldRepository
    repository = DocumentFieldRepository()

    document = Document.objects.get(pk=doc_id)  # type: Document

    if changed_by_user_id is None:
        changed_by_user = None
    else:
        changed_by_user = User.objects.get(pk=changed_by_user_id)

    status_changed = DocumentSystemField.status.specified_in(system_fields_changed)
    if status_changed:
        repository.delete_hidden_field_values_if_needed(document, event_sender=task)

    fire_document_changed(sender=task,
                          log=CeleryTaskLogger(task),
                          document=document,
                          changed_by_user=changed_by_user,
                          document_initial_load=False,
                          system_fields_changed=system_fields_changed,
                          generic_fields_changed=generic_fields_changed,
                          user_fields_changed=user_fields_changed)
def cache_fields_for_docs_queryset(task: ExtendedTask,
                                   doc_qr,
                                   changed_by_user: User = None,
                                   document_initial_load: bool = False,
                                   generic_fields_changed: bool = True,
                                   user_fields_changed: bool = True,
                                   pre_detected_field_values: Optional[Dict[str, Any]] = None,
                                   old_field_values: Optional[Dict[int, Dict[str, Any]]] = None):
    """
    Run cache_document_fields() for every document of a queryset.

    :param task: task whose logger is handed to cache_document_fields().
    :param doc_qr: document queryset; related document_type, project and
        status are selected together with the documents.
    :param changed_by_user: passed through to cache_document_fields().
    :param document_initial_load: passed through to cache_document_fields().
    :param generic_fields_changed: passed as ``cache_generic_fields``.
    :param user_fields_changed: passed as ``cache_user_fields``.
    :param pre_detected_field_values: passed as
        ``pre_detected_field_codes_to_suggested_values``.
    :param old_field_values: optional mapping of document pk to the old
        values of that document; a missing pk yields None.
    """
    from apps.rawdb.field_value_tables import cache_document_fields

    old_field_values = old_field_values or {}

    # The logger does not depend on the document, so build it once
    # instead of once per document.
    log = CeleryTaskLogger(task)

    for doc in doc_qr.select_related('document_type', 'project', 'status'):  # type: Document
        cache_document_fields(log=log,
                              document=doc,
                              cache_generic_fields=generic_fields_changed,
                              cache_user_fields=user_fields_changed,
                              pre_detected_field_codes_to_suggested_values=pre_detected_field_values,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              old_field_values=old_field_values.get(doc.pk))
def import_document_type(json_bytes: bytes,
                         save: bool,
                         auto_fix_validation_errors: bool,
                         remove_missed_in_dump_objects: bool,
                         task: ExtendedTask) -> DocumentType:
    """
    Deserialize, validate and optionally save a document type dump.

    The dump must contain exactly one DocumentType; it may also contain
    DocumentField, DocumentFieldDetector and DocumentFieldCategory
    objects. Detectors are attached to their fields and fields to the
    document type before validation.

    :param json_bytes: UTF-8 encoded JSON produced by the Django serializer.
    :param save: save the document type (in one transaction) after successful validation.
    :param auto_fix_validation_errors: passed to the deserialized object wrappers.
    :param remove_missed_in_dump_objects: passed to the document type and field wrappers.
    :param task: running task; used for logging and excluded from the
        active user tasks check.
    :return: the deserialized DocumentType model object.
    :raises RuntimeError: if other user tasks are active, the dump holds an
        unknown model, no or more than one document type, a detector or
        field refers to an object missing in the dump, or another document
        type with the same code already exists.
    :raises ValidationError: if validation of the document type reports errors.
    """
    tasks = Task.objects \
        .get_active_user_tasks() \
        .exclude(pk=task.task.pk) \
        .distinct('name') \
        .order_by('name') \
        .values_list('name', flat=True)
    tasks = list(tasks)
    if tasks:
        msg = 'The following user tasks are running: {0}. This import can cause their crashing because of document' \
              ' type / field structure changes.'.format(', '.join(tasks))
        raise RuntimeError(msg)

    objects = serializers.deserialize("json", json_bytes.decode("utf-8"))
    document_type = None
    pk_to_field = {}
    field_detectors = []
    other_objects = []
    logger = CeleryTaskLogger(task)

    # Wrap every deserialized object according to its model class.
    for deserialized_object in objects:
        obj = deserialized_object.object
        if isinstance(obj, DocumentType):
            if document_type is not None:
                raise RuntimeError('More than one document types was detected')
            document_type = DeserializedDocumentType(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
        elif isinstance(obj, DocumentField):
            field = DeserializedDocumentField(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
            pk_to_field[field.pk] = field
        elif isinstance(obj, DocumentFieldDetector):
            field_detector = DeserializedDocumentFieldDetector(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            field_detectors.append(field_detector)
        elif isinstance(obj, DocumentFieldCategory):
            category = DeserializedDocumentFieldCategory(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            other_objects.append(category)
        else:
            raise RuntimeError('Unknown model')

    if document_type is None:
        raise RuntimeError('Unable to find document type')

    conflicting_document_type = DocumentType.objects \
        .filter(code=document_type.object.code) \
        .exclude(pk=document_type.pk) \
        .first()
    if conflicting_document_type is not None:
        err_msg = 'Unable to import document type #{0} "{1}". Database already contains a document type #{2}' \
                  ' with code "{3}"'.format(document_type.pk,
                                            document_type.object.code,
                                            conflicting_document_type.pk,
                                            conflicting_document_type.code)
        raise RuntimeError(err_msg)

    # Attach detectors to their fields.
    for field_detector in field_detectors:
        field = pk_to_field.get(field_detector.field_pk)
        if field is not None:
            field.add_dependent_object(field_detector)
        else:
            raise RuntimeError('Unknown field #{0}'.format(field_detector.field_pk))

    # Attach fields to the document type.
    for field in pk_to_field.values():
        if field.document_type_pk == document_type.pk:
            document_type.add_dependent_object(field)
        else:
            # Report the document type the field refers to - that is the
            # unknown one, not the document type being imported.
            raise RuntimeError('Unknown document type #{0}'.format(field.document_type_pk))

    for obj in other_objects:
        document_type.add_dependent_object(obj)

    logger.info('Validation of {0} ...'.format(document_type.object.code))
    validation_errors = document_type.validate()
    logger.info('Validation of {0} is finished'.format(document_type.object.code))
    if validation_errors:
        task.log_error('{0} VALIDATION ERRORS HAS OCCURRED DURING VALIDATION OF {1}.'.format(
            len(validation_errors), document_type.object.code))
        for index, validation_error in enumerate(validation_errors):
            # for different timestamps
            sleep(0.001)
            task.log_error('VALIDATION ERROR {0}. {1}'.format(index + 1, str(validation_error)))
        raise ValidationError('Validation errors has occurred during import of {0}'.format(
            document_type.object.code))

    if save:
        logger.info('Import of {0} ...'.format(document_type.object.code))
        with transaction.atomic():
            document_type.save()
        logger.info('Import of {0} is finished'.format(document_type.object.code))

    return document_type.object
def process(self, **kwargs):
    """
    Train a field detector model and/or test it document-by-document.

    Reads from kwargs: ``document_type_field_id``, ``skip_training``,
    ``use_only_confirmed_field_values_for_training``,
    ``train_data_project_ids``, ``skip_testing``,
    ``use_only_confirmed_field_values_for_testing`` and
    ``test_data_projects_ids``.

    Unless training is skipped, a model is trained and, when one is
    produced, it replaces the ClassifierModel rows of the document type /
    field pair. Unless testing is skipped or no test projects are given,
    one test sub-task per test document is started, followed by a joining
    task.
    """
    self.log_info('Going to train document field based on the datasets stored in DB...')

    document_type_field_id = kwargs.get('document_type_field_id')

    skip_training = kwargs.get('skip_training')
    train_on_confirmed_only = kwargs.get('use_only_confirmed_field_values_for_training')
    train_data_project_ids = kwargs.get('train_data_project_ids')

    skip_testing = kwargs.get('skip_testing')
    test_on_confirmed_only = kwargs.get('use_only_confirmed_field_values_for_testing')
    test_data_projects_ids = kwargs.get('test_data_projects_ids')

    document_type_field = DocumentTypeField.objects.get(pk=document_type_field_id)
    document_type = document_type_field.document_type
    field = document_type_field.document_field  # type: DocumentField

    # NOTE(review): only a message is logged here; processing continues
    # below even for a non-detectable field - confirm this is intended.
    if not field.is_detectable():
        self.log_info('Field {0} is not detectable. Nothing to train and/or test.'.format(field.code))

    new_model = None

    if not skip_training:
        if train_data_project_ids:
            self.log_info('Training model on the specified projects...')
        else:
            self.log_info('No training projects specified. '
                          'Training model on all user-confirmed field values in the system...')

        new_model = field_detection.train_document_field_detector_model(
            CeleryTaskLogger(self),
            document_type,
            field,
            train_data_project_ids,
            train_on_confirmed_only)

        if not new_model:
            self.log_info('No model trained. '
                          'Probably the detection strategy of field {0} does not allow training'.format(field.code))
        else:
            ClassifierModel.objects.filter(document_type=document_type, document_field=field).delete()
            new_model.save()

            if new_model.classifier_accuracy_report_in_sample:
                self.log_info('Sklearn test report for in-sample docs:\n{0}'
                              .format(new_model.classifier_accuracy_report_in_sample))

            if new_model.classifier_accuracy_report_out_of_sample:
                self.log_info('Sklearn test report for out-of-sample docs:\n{0}'
                              .format(new_model.classifier_accuracy_report_out_of_sample))

    if skip_testing:
        return

    if not test_data_projects_ids:
        self.log_info('No test projects specified. Skiping the testing step.')
        return

    if test_on_confirmed_only:
        # actively modified documents plus finished documents of the test projects
        test_document_ids = set(field_detection_utils.get_qs_active_modified_document_ids(
            document_type, field, test_data_projects_ids))
        test_document_ids.update(set(field_detection_utils.get_qs_finished_document_ids(
            document_type, test_data_projects_ids)))
    else:
        test_document_ids = Document.objects \
            .filter(project_id__in=test_data_projects_ids, document_type_id=document_type.pk) \
            .values_list('pk', flat=True)

    self.log_info('Testing field detection document-by-document...')
    test_tasks_args = [(field.uid, test_document_id) for test_document_id in test_document_ids]

    if test_tasks_args:
        self.run_sub_tasks('Test Field Detector Model',
                           TrainAndTest.test_field_detector_model,
                           test_tasks_args)

        self.run_after_sub_tasks_finished('Join Field Detector Model Tests',
                                          TrainAndTest.join_field_detector_model_tests,
                                          [(field.uid, document_type.uid, new_model.pk if new_model else None)])
def process(self, **kwargs):
    """
    Render a document digest for each target user and send it.

    The config is taken from ``PARAM_CONFIG`` (dict with ``pk``) or
    ``PARAM_CONFIG_ID``. Recipients are taken from ``PARAM_USER`` (dict
    with ``pk``) or ``PARAM_USER_IDS``; when neither is given they are
    derived from the config's role or, failing that, its user. Users the
    config is not applicable to are skipped with an error log entry. A
    failure for one user is logged and does not stop the others.

    Other kwargs: ``PARAM_RUN_DATE`` (datetime or parseable string,
    defaults to now in the local timezone) and
    ``PARAM_RUN_EVEN_IF_NOT_ENABLED``.
    """
    if self.PARAM_CONFIG in kwargs:
        config_id = kwargs[self.PARAM_CONFIG]['pk']
    else:
        config_id = kwargs[self.PARAM_CONFIG_ID]

    if self.PARAM_USER in kwargs:
        user_ids = {kwargs[self.PARAM_USER]['pk']}
    else:
        user_ids = kwargs.get(self.PARAM_USER_IDS)

    run_date = kwargs.get(self.PARAM_RUN_DATE)
    run_date_specified = run_date is not None

    if isinstance(run_date, str):
        run_date = parse(run_date)

    run_date = run_date or datetime.datetime.now(tz=tzlocal.get_localzone())

    run_even_if_not_enabled = bool(kwargs.get(self.PARAM_RUN_EVEN_IF_NOT_ENABLED))

    config = DocumentDigestConfig.objects \
        .filter(pk=config_id).select_related('for_role', 'for_user').first()  # type: DocumentDigestConfig
    if not config:
        self.log_error('{1} not found: #{0}'.format(config_id, DocumentDigestConfig.__name__))
        return

    if not config.enabled and not run_even_if_not_enabled:
        self.log_info('{1} #{0} is disabled.'.format(config_id, DocumentDigestConfig.__name__))
        return

    tz_msg = ' at timezone {0}'.format(run_date.tzname()) if run_date_specified else ''
    # user_ids is None when recipients come from the config, so len() must
    # not be applied to it unconditionally.
    recipients_number = len(user_ids) if user_ids else 'all configured'
    self.log_info('Rendering and sending {what} #{pk} ({doc_filter}) for date "{run_date}" to {n} users{tz_msg}'
                  .format(what=DocumentDigestConfig.__name__,
                          pk=config.pk,
                          doc_filter=config.documents_filter,
                          n=recipients_number,
                          run_date=run_date,
                          tz_msg=tz_msg))

    if user_ids:
        users_qr = User.objects.filter(pk__in=user_ids)
    elif config.for_role_id is not None:
        users_qr = User.objects.filter(role_id=config.for_role_id)
    elif config.for_user_id is not None:
        # filter() rather than get(): the loop below needs an iterable,
        # a single User instance cannot be iterated.
        users_qr = User.objects.filter(pk=config.for_user_id)
    else:
        self.log_error('{what} #{config_id} specifies neither for_user nor for_role.'
                       .format(what=DocumentDigestConfig.__name__, config_id=config.pk))
        return

    log = CeleryTaskLogger(self)
    for user in users_qr:  # type: User
        if config.for_user_id != user.id and (config.for_role_id is None or config.for_role_id != user.role_id):
            self.log_error('{what} #{what_id} is not applicable for user {user_name} (#{user_id})'
                           .format(what=DocumentDigestConfig.__name__,
                                   what_id=config.pk,
                                   user_name=user.get_full_name(),
                                   user_id=user.pk))
            continue

        try:
            digest = render_digest(config=config, dst_user=user, run_date=run_date)
            if digest:
                digest.send(log)
        except Exception as e:
            # Use the class name, in line with the other messages of this
            # method, instead of the class object's repr.
            self.log_error(f'Unable to send {RenderedDigest.__name__}.\n'
                           f'Config: #{config.pk}\n'
                           f'Dst user: {user.get_full_name()} #{user.pk}\n'
                           f'Run date: {run_date}', exc_info=e)
def send_notifications_packet(ntfs: List[DocumentNotification],
                              event: str,
                              task: BaseTask):
    """
    Render pooled document notifications and send the resulting messages.

    Notifications are grouped by (document type, notification event). For
    each group the enabled subscriptions of that document type for ``event``
    are loaded, the group is split into chunks of ``sub.max_stack`` items and
    every chunk is rendered either as a single notification (one item) or as
    a notification pack (several items). Everything rendered is sent at the
    end.

    :param ntfs: notifications to deliver; items whose document cannot be
        loaded are skipped
    :param event: event name used to select subscriptions
    :param task: task whose logger receives progress and error messages

    Rendering errors are logged and do not stop the remaining chunks.
    Errors raised by ``notification.send()`` are not caught here.
    """
    # select_related avoids one extra query per document when its
    # document_type is read below.
    documents_data = list(
        Document.all_objects
        .filter(pk__in={d.document_id for d in ntfs})
        .select_related('document_type'))  # type: List[Document]
    doc_by_id = {doc.pk: doc for doc in documents_data}  # type: Dict[int, Document]

    # distinct document types, de-duplicated by primary key
    doc_type_by_id = {doc.document_type.pk: doc.document_type for doc in documents_data}

    users = User.objects.filter(pk__in={d.changed_by_user_id for d in ntfs})
    user_by_id = {u.pk: u for u in users}

    handlers_by_doctype = {
        doc_type: build_field_handlers(doc_type, include_annotation_fields=False)
        for doc_type in doc_type_by_id.values()}

    log = CeleryTaskLogger(task)

    # { (doc_type, event,) : [notification0, notification1, ...], ... }
    messages_by_subscr_key = {}  # type: Dict[Tuple, List[DocumentNotification]]
    # { (doc_type, event,) : subscriptions queryset, ... }
    subscr_by_key = {}

    for ntf in ntfs:
        if ntf.document_id not in doc_by_id:
            continue
        document = doc_by_id[ntf.document_id]
        key = (document.document_type, ntf.event,)
        if key in messages_by_subscr_key:
            messages_by_subscr_key[key].append(ntf)
            continue
        # first notification for this key: load its subscriptions once
        subscr_by_key[key] = DocumentNotificationSubscription.objects \
            .filter(enabled=True,
                    document_type=document.document_type,
                    event=event,
                    recipients__isnull=False) \
            .select_related('specified_user', 'specified_role') \
            .prefetch_related(Prefetch('user_fields',
                                       queryset=DocumentField.objects.all().order_by('order')))
        messages_by_subscr_key[key] = [ntf]

    def _notification_source(msg, pack_size):
        """Build the render source for one notification of a pack of pack_size."""
        document = doc_by_id.get(msg.document_id)
        if not document:
            raise Exception(f'Error in send_notifications_packet({pack_size}): doc '
                            f'with id={msg.document_id} was not obtained')
        # .get(): a notification without a known user (changed_by_user_id is
        # None or the user no longer exists) must not abort the whole packet.
        # NOTE(review): assumes the renderer tolerates changed_by_user=None;
        # if it does not, the failure is logged by the try/except around
        # rendering - confirm.
        return DocumentNotificationSource(
            document=document,
            field_handlers=handlers_by_doctype[document.document_type],
            field_values=msg.field_values,
            changes=msg.changes,
            changed_by_user=user_by_id.get(msg.changed_by_user_id))

    notifications_to_send = []  # type: List[RenderedNotification]

    for key, messages in messages_by_subscr_key.items():
        for sub in subscr_by_key[key]:
            for msg_pack in chunks(messages, sub.max_stack):
                if len(msg_pack) < 2:
                    # render single notification
                    msg = msg_pack[0]
                    source = _notification_source(msg, 1)
                    try:
                        notification = NotificationRenderer.render_notification(
                            msg.package_id, sub, source)
                        if notification:
                            notifications_to_send.append(notification)
                    except Exception as e:
                        log.error(f'Error in send_notifications_packet(1), '
                                  f'sending render_notification()', exc_info=e)
                    continue

                # render pack of notifications in a single message
                not_sources = [_notification_source(msg, len(msg_pack))
                               for msg in msg_pack]
                try:
                    notifications = NotificationRenderer.render_notification_pack(
                        [m.package_id for m in msg_pack], sub, not_sources)
                    if notifications:
                        notifications_to_send.extend(notifications)
                except Exception as e:
                    log.error(f'Error in send_notifications_packet(), '
                              f'sending render_notification_pack()', exc_info=e)

    log.info(f'notification.send({len(notifications_to_send)})')
    for notification in notifications_to_send:
        notification.send(log=log)
def check_email_pool(_task) -> None:
    """
    Flush pooled document notifications, one event type at a time.

    For every event in EmailNotificationPool.DOC_NOTIFICATION_EVENTS the
    ObjectStorage rows whose pk starts with the event's cache key are counted.
    The pool is sent when it holds at least EmailNotificationPool.batch_size
    messages, or when its oldest message is at least
    EmailNotificationPool.batch_seconds old. After a successful send the rows
    that were read are deleted.

    Errors are logged and processing moves on to the next event; nothing is
    raised to the caller.

    :param _task: task passed to CeleryTaskLogger and to
        EmailNotificationPool.send_notifications_packet
    """
    log = CeleryTaskLogger(_task)
    for event in EmailNotificationPool.DOC_NOTIFICATION_EVENTS:
        cache_key = f'{CACHE_DOC_NOTIFICATION_PREFIX}{event}'
        ntfs = []  # type: List[DocumentNotification]
        pooled_pks = []  # pks of the rows read in this pass
        try:
            cached_msgs_count = ObjectStorage.objects.filter(
                pk__startswith=cache_key).count()
            if not cached_msgs_count:
                continue

            if cached_msgs_count < EmailNotificationPool.batch_size:
                # Pool is not full yet: send only if its oldest message has
                # been waiting long enough.
                oldest_msg_time = ObjectStorage.objects \
                    .filter(pk__startswith=cache_key) \
                    .aggregate(oldest=Min('last_updated'))['oldest']
                delta = now() - oldest_msg_time
                # total_seconds(), not .seconds: .seconds drops whole days, so
                # a pool older than a day could look "too fresh" and stall.
                if delta.total_seconds() < EmailNotificationPool.batch_seconds:
                    continue

            for raw_msg in ObjectStorage.objects.filter(pk__startswith=cache_key):  # type: ObjectStorage
                pooled_pks.append(raw_msg.pk)
                try:
                    # NOTE(security): pickle.loads can execute arbitrary code
                    # from crafted data; this presumes ObjectStorage rows are
                    # written only by this application - confirm.
                    msg = pickle.loads(raw_msg.data)  # type: DocumentNotification
                    ntfs.append(msg)
                except Exception as e:
                    # a broken row is reported and skipped; the rest is sent
                    log.error('send_notifications_packet() - error unpickling raw_msg.data',
                              exc_info=e)
            if not ntfs:
                continue
        except Exception as e:
            log.error(f'Error in check_email_pool(), extracting pool messages: {e}')
            continue

        try:
            log.info(f'send_notifications_packet({len(ntfs)})')
            EmailNotificationPool.send_notifications_packet(ntfs, event, _task)
        except Exception as e:
            log.error(f'Error in check_email_pool(), sending package: {e}')
            continue

        try:
            # Delete exactly the rows read above. Deleting by key prefix would
            # also drop messages pooled while this packet was being sent, and
            # those would never be delivered.
            ObjectStorage.objects.filter(pk__in=pooled_pks).delete()
        except Exception as e:
            log.error(f'Error in check_email_pool(), deleting pool objects: {e}')
            continue