def user_full_name_change_listener(sender, **kwargs):
    """Schedule a system-wide assignee re-index when a user's full name changes.

    Expects ``instance`` (the saved user) and ``old_instance`` (its previous
    state) in ``kwargs``. Does nothing when there is no previous state or the
    full name did not change.
    """
    new_user = kwargs.get('instance')
    previous_user = kwargs.get('old_instance')
    if previous_user is None:
        return
    if previous_user.get_full_name() == new_user.get_full_name():
        return
    # Imports are deferred to function scope to avoid import cycles at load time.
    from apps.task.tasks import call_task_func
    from apps.rawdb.tasks import reindex_assignee_for_all_documents_in_system
    call_task_func(reindex_assignee_for_all_documents_in_system, (new_user.pk,), None)
def project_name_change_listener(sender, **kwargs):
    """Schedule re-indexing of all project documents when the project is renamed.

    Expects ``instance`` (the saved project) and ``old_instance`` (its previous
    state) in ``kwargs``. No-op when there is no previous state or the name is
    unchanged.
    """
    current = kwargs.get('instance')
    previous = kwargs.get('old_instance')
    if previous is None or current.name == previous.name:
        return
    # Deferred imports: avoid circular imports at module load time.
    from apps.task.tasks import call_task_func
    from apps.rawdb.tasks import reindex_all_project_documents
    call_task_func(reindex_all_project_documents, (current.pk,), None)
def run_detect_field_values_for_document(document_id: int):
    """Start a hidden background task detecting field values for one document.

    :param document_id: pk of the document to process
    """
    # Deferred imports: avoid circular imports at module load time.
    from apps.task.tasks import call_task_func
    from apps.document.tasks import DetectFieldValues
    task_args = (document_id, False, False)
    call_task_func(DetectFieldValues.detect_field_values_for_document,
                   task_args,
                   None,
                   visible=False)
def document_fields_change_listener_impl(_sender,
                                         signal,
                                         log: ProcessLogger,
                                         document_event: str,
                                         document: Document,
                                         field_handlers: Dict[str, FieldHandler],
                                         fields_before: Optional[Dict],
                                         fields_after: Optional[Dict],
                                         changed_by_user: User = None):
    """Schedule notification processing for a document-fields-changed event.

    Ignores events with no acting user (system-made changes) and events where
    both the "before" and "after" snapshots are empty; also respects the
    app-level switch disabling event notifications.

    :param log: logger used to report malformed events
    :param document_event: event type code passed through to the task
    :param document: the affected document (only its pk is forwarded)
    :param field_handlers: unused here; kept for listener signature compatibility
    :param fields_before: field values snapshot before the change (may be None)
    :param fields_after: field values snapshot after the change (may be None)
    :param changed_by_user: user who made the change; None means "system"
    """
    if not changed_by_user:
        # we ignore changes made by system at the moment
        return
    if not fields_before and not fields_after:
        log.error('Document fields changed event appeared with both "before" '
                  'and "after" fields empty.')
        return
    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    if APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val:
        return
    # Fix: these imports used to run before the guards above; deferring them
    # past the cheap early-return checks avoids paying the (potentially heavy)
    # import cost for events that are dropped anyway.
    from apps.task.tasks import call_task_func
    from apps.notifications.tasks import process_notifications_on_document_change
    call_task_func(process_notifications_on_document_change,
                   (document_event, document.pk, fields_before, fields_after, changed_by_user.pk),
                   changed_by_user.pk)
def auto_reindex_not_tracked(task: ExtendedTask, document_type_code: str = None, force: bool = False):
    """Re-index document types whose raw-db table structure is out of date.

    For each selected document type: adapt the cache table structure and, when
    a structural change (or ``force``) requires it, schedule a manual re-index.
    Otherwise still schedule a re-index if there are non-indexed documents not
    already planned for indexing and no other re-index task is running for the
    same type. No-op when raw-db caching is disabled.

    :param task: the running celery task wrapper (used for logging and user id)
    :param document_type_code: limit to one document type; None means all types
    :param force: force table re-creation / re-index
    """
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    if document_type_code is not None:
        doc_types = [DocumentType.objects.get(code=document_type_code)]
    else:
        doc_types = DocumentType.objects.all()
    logger = CeleryTaskLogger(task)
    user_id = task.task.user_id
    for doc_type in doc_types:
        if adapt_table_structure(logger, doc_type, force=force):
            # Table structure changed - a full re-index is required.
            force_fmt = ', forced' if force else ''
            task.log_info(f'Re-index from auto_reindex_not_tracked, {task.name}, '
                          f'for {doc_type}{force_fmt}')
            call_task_func(manual_reindex, (doc_type.code, False), user_id)
        elif there_are_non_indexed_docs_not_planned_to_index(doc_type, logger) \
                and not any_other_reindex_task(task.request.id, doc_type.code).exists():
            task.log_info(f'auto_reindex_not_tracked({doc_type.code}): '
                          f'there_are_non_indexed_docs_not_planned_to_index')
            call_task_func(manual_reindex, (doc_type.code, False), user_id)
def review_status_save_listener(sender, **kwargs):
    """Schedule a system-wide status-name re-index when a review status is renamed.

    Expects ``instance`` (the saved status) and ``old_instance`` (its previous
    state) in ``kwargs``. No-op when there is no previous state or the name is
    unchanged.
    """
    status = kwargs.get('instance')
    prior = kwargs.get('old_instance')
    if prior is None or status.name == prior.name:
        return
    # Deferred imports: avoid circular imports at module load time.
    from apps.task.tasks import call_task_func
    from apps.rawdb.tasks import reindex_status_name_for_all_documents_in_system
    call_task_func(reindex_status_name_for_all_documents_in_system, (status.pk,), None)
def reindex_on_doc_type_change(document_type: DocumentType):
    """Queue an auto re-index for a document type after the type was changed.

    No-op when raw-db caching is disabled. The task is put on the serial queue
    so structure-changing re-index jobs do not run concurrently.
    """
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    # Deferred imports: avoid circular imports at module load time.
    from apps.task.tasks import call_task_func
    from apps.rawdb.tasks import auto_reindex_not_tracked
    call_task_func(auto_reindex_not_tracked,
                   (document_type.code,),
                   None,
                   queue=settings.CELERY_QUEUE_SERIAL)
def plan_process_document_changed(doc_id: int,
                                  system_fields_changed: FieldSpec = True,
                                  generic_fields_changed: FieldSpec = True,
                                  user_fields_changed: bool = True,
                                  changed_by_user_id: int = None):
    """Queue the document-changed processing task for a single document.

    :param doc_id: pk of the changed document
    :param system_fields_changed: which system fields changed (or True for all)
    :param generic_fields_changed: which generic fields changed (or True for all)
    :param user_fields_changed: whether user fields changed
    :param changed_by_user_id: pk of the acting user; also used as the task owner
    """
    task_args = (doc_id,
                 system_fields_changed,
                 generic_fields_changed,
                 user_fields_changed,
                 changed_by_user_id)
    call_task_func(process_document_changed, task_args, changed_by_user_id)
def run_detect_field_values_for_document(dcptrs: DocDetectFieldValuesParams, user: User = None):
    """Start a hidden background task detecting field values for one document.

    :param dcptrs: parameter object describing the detection run
    :param user: optional acting user; its pk becomes the task owner
    """
    # Deferred imports: avoid circular imports at module load time.
    from apps.task.tasks import call_task_func
    from apps.document.tasks import DetectFieldValues
    owner_id = user.pk if user else None
    call_task_func(DetectFieldValues.detect_field_values_for_document,
                   (dcptrs,),
                   user_id=owner_id,
                   visible=False)
def start_task(self, data):
    """Kick off a manual re-index from validated form *data*.

    ``data`` may carry an optional document type, a ``recreate_tables`` flag
    and an optional project to limit the re-index scope.
    """
    doc_type = data.get('document_type', {})
    type_code = doc_type.code if doc_type else None
    recreate = data.get('recreate_tables') or False
    project = data.get('project') or None
    project_id = project.pk if project else None  # type: Optional[int]
    call_task_func(manual_reindex, (type_code, recreate, project_id), data['user_id'])
def cache_doc_fields_task_impl(_sender, signal, documents, status_name: str, changed_by_user: User):
    """Write the new status name into the raw-db cache and queue a field re-cache.

    :param documents: queryset of affected documents
    :param status_name: new status name to store in the cache table
    :param changed_by_user: user who made the change; owns the follow-up task
    """
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func
    affected_pks = list(documents.values_list('pk', flat=True))
    RawDbRepository().update_documents_status(affected_pks, status_name)
    call_task_func(cache_fields_for_docs_queryset,
                   (documents, changed_by_user, False, True, True, None),
                   changed_by_user.pk)
def run_detect_field_values_for_document(document_id: int):
    """Start field-value detection for a document in every custom app that supports it.

    Scans each custom app's already-imported ``tasks`` module for a
    ``DetectFieldValues.detect_field_values_for_document`` entry point and
    queues it as a hidden task. Apps whose tasks module is not loaded are
    silently skipped (``sys.modules.get`` returns None for them).
    """
    from urls import custom_apps
    from apps.task.tasks import call_task_func
    for app_name in custom_apps:
        tasks_module = sys.modules.get('apps.%s.tasks' % app_name)
        detector = getattr(tasks_module, 'DetectFieldValues', None)
        if detector and hasattr(detector, 'detect_field_values_for_document'):
            call_task_func(detector.detect_field_values_for_document,
                           (document_id, False),
                           None,
                           visible=False)
def reindex_on_field_change(document_field: DocumentField):
    """Queue an auto re-index of the field's document type after a field change.

    No-op when raw-db caching is disabled. Accessing the related document type
    may raise ``DocumentField.DoesNotExist`` (e.g. when the related row is
    gone); that case is deliberately ignored.
    """
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    # Deferred imports: avoid circular imports at module load time.
    from apps.rawdb.tasks import auto_reindex_not_tracked
    from apps.task.tasks import call_task_func
    from apps.document.models import DocumentField
    try:
        doc_type = document_field.document_type
        if doc_type:
            call_task_func(auto_reindex_not_tracked,
                           (doc_type.code,),
                           None,
                           queue=settings.CELERY_QUEUE_SERIAL)
    except DocumentField.DoesNotExist:
        pass
def plan_reindex_tasks_in_chunks(all_doc_ids: Iterable,
                                 changed_by_user_id: int = None,
                                 cache_system_fields: FieldSpec = True,
                                 cache_generic_fields: FieldSpec = True,
                                 cache_user_fields: bool = True):
    """Plan document re-indexing in chunks.

    Splits *all_doc_ids* into chunks of DOC_NUM_PER_MAIN_TASK ids and starts
    one main (admin-visible) task per chunk, so a single huge argument list
    does not overload rabbitmq. Each main task may parallelize further via
    sub-tasks of its own.
    """
    for ids_chunk in chunks(all_doc_ids, DOC_NUM_PER_MAIN_TASK):
        task_args = (ids_chunk,
                     changed_by_user_id,
                     cache_system_fields,
                     cache_generic_fields,
                     cache_user_fields)
        call_task_func(index_documents, task_args, changed_by_user_id)
def plan_process_documents_status_changed(doc_ids: Iterable, new_status_id: int, changed_by_user_id: int):
    """Plan processing of a documents' status change.

    Splits *doc_ids* into chunks of DOC_NUMBER_PER_MAIN_TASK ids and starts one
    main (admin-visible) task per chunk to avoid overloading rabbitmq with one
    huge argument list; each main task may spawn its own sub-tasks.

    :param doc_ids: ids of the documents whose status changed
    :param new_status_id: pk of the new status
    :param changed_by_user_id: pk of the acting user; also owns the tasks
    :return: None
    """
    for ids_chunk in chunks(doc_ids, DOC_NUMBER_PER_MAIN_TASK):
        call_task_func(process_documents_status_changed,
                       (ids_chunk, new_status_id, changed_by_user_id),
                       changed_by_user_id)
def update_documents_assignees_impl(_sender, signal, documents, assignee_id: int, changed_by_user: User):
    """Write the new assignee into the raw-db cache and queue a field re-cache.

    Snapshots the previous assignee of every document (so the re-cache task can
    compute the diff), updates the cache table, then queues the re-cache task.

    :param documents: queryset of affected documents
    :param assignee_id: pk of the new assignee
    :param changed_by_user: user who made the change; owns the follow-up task
    """
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func
    affected_pks = list(documents.values_list('pk', flat=True))
    # Previous assignee per document, keyed by pk - consumed by the re-cache task.
    previous_values = {}
    for doc in documents:
        previous_values[doc.pk] = {
            'assignee_id': doc.assignee_id,
            'assignee_name': doc.assignee.username if doc.assignee else ''
        }
    RawDbRepository().update_documents_assignees(affected_pks, assignee_id)
    call_task_func(cache_fields_for_docs_queryset,
                   (documents, changed_by_user, False, True, True, None, previous_values),
                   changed_by_user.pk)
def _trigger_retraining_model(document, field, user_id):
    """Queue retraining of the field-detector model for one field, if enabled.

    Does nothing unless ANNOTATOR_RETRAIN_MODEL_ON_ANNOTATIONS_CHANGE is set.
    """
    if not settings.ANNOTATOR_RETRAIN_MODEL_ON_ANNOTATIONS_CHANGE:
        return
    call_task_func(TrainDocumentFieldDetectorModel.train_model_for_field,
                   (document.document_type_id, field.uid, None, True),
                   user_id=user_id)
def process(self, **kwargs):
    """Clean up an upload session: purge its tasks, delete its documents and
    stored files, remove the session itself and re-index the owning project.

    :param kwargs: must contain ``session_id`` - pk of the UploadSession to clean
    :raises Exception: if the session cannot be found (checked late, after
        document/file removal - NOTE(review): the lookup at the top would
        already have raised DoesNotExist; this guard looks unreachable)
    """
    session_id = kwargs['session_id']
    session = UploadSession.objects.get(pk=session_id)
    # 1. Purge Tasks
    self.track_timelog('')
    session_tasks = Task.objects.main_tasks().filter(metadata__session_id=session_id)
    self.log_info(f'Purge {session_tasks.count()} session tasks.')
    for a_task in session_tasks:
        try:
            purge_task(a_task.id)
        except:
            # case when task is already deleted as subtask
            pass
    self.track_timelog('1 - purge tasks')
    # 2. Remove Documents+
    document_ids = \
        list(Document.objects.filter(upload_session_id=session_id).values_list('pk', flat=True))
    self.log_info(f'Remove {len(document_ids)} documents')
    from apps.document.repository.document_bulk_delete import get_document_bulk_delete
    # TODO: WHY it fails with
    # psycopg2.errors.ForeignKeyViolation: update or delete
    # on table "document_textunit" violates foreign key constraint
    # Retry loop: bulk delete occasionally hits the FK violation above, so we
    # attempt up to `attempts` times with a fixed delay between attempts.
    # NOTE(review): if every attempt fails, execution still continues to the
    # following steps - only an error is logged.
    attempts = 3
    delay = 60
    attempts_made = 0
    delete_manager = get_document_bulk_delete()
    error_logged = False
    for attempt in range(1, attempts + 1):
        attempts_made += 1
        try:
            delete_manager.delete_documents(document_ids)
            break
        except Exception as e:
            # Log full details only once; subsequent failures get a short note.
            if not error_logged:
                self.log_error('Error while deleting documents', exc_info=e)
                error_logged = True
            self.log_info(f'Attempt #{attempt} of {attempts} to delete documents failed, retry')
            time.sleep(delay)
    self.track_timelog(f'2 - bulk delete for {len(document_ids)} documents')
    if attempts_made > 1:
        self.log_error(f'{attempts_made} of {attempts} tried to delete documents')
    # 3. Remove files
    file_storage_exists = file_storage.document_exists(session_id)
    self.log_info(f'File Storage exists: {file_storage_exists}')
    files_removed, failed_removing = (0, 0)
    if file_storage_exists:
        files = file_storage.list_documents(session_id)
        self.log_info(f'Remove {len(files)} files from File Storage.')
        for file_path in files:
            file_storage.delete_document(file_path)
        # Finally try to remove the session folder itself.
        # NOTE(review): structure reconstructed from collapsed source - the
        # counters appear to track only this folder deletion, not the per-file
        # deletes above; confirm against the original file.
        try:
            file_storage.delete_document(session_id)
            files_removed += 1
        except:
            # TODO: removing folders through LocalStorage is not implemented
            failed_removing += 1
            pass
    self.track_timelog(f'3 - remove files ({files_removed} removed, {failed_removing} failed)')
    # 4. Remove Upload Session
    if not session:
        raise Exception(f"Couldn't find session by id ({session_id})")
    self.log_info(f'Remove session uid="{session_id}".')
    # Remember the project before deleting the session - needed for step 5.
    project = session.project
    session.delete()
    self.track_timelog('4 - delete session')
    # 5. Reindex Project
    self.log_info(f'Reindex project id="{project.id}" documents.')
    from apps.rawdb.tasks import reindex_all_project_documents
    call_task_func(reindex_all_project_documents, (project.pk,), None)
    self.track_timelog('5 - reindex project')
def process(self,
            document_type: DocumentType = None,
            project_ids=list,
            document_name: str = None,
            do_not_run_for_modified_documents=True,
            do_not_write=False,
            **kwargs):
    """Detect field values for a set of documents and fan the work out to sub-tasks.

    Selection: a single document (``kwargs['document_id']``), by name, by
    project ids, or by document type(s); documents already modified by users
    can be excluded. Also schedules a raw-db cache re-index to run after all
    sub-tasks finish.

    :param document_type: limit detection to one document type (a dict with
        'pk' is also accepted and resolved to the model instance)
    :param project_ids: limit detection to these project pks.
        NOTE(review): the default is the ``list`` type object (truthy!), not an
        empty list - callers apparently always pass a value; confirm before
        relying on the default.
    :param document_name: limit detection to documents with this name
    :param do_not_run_for_modified_documents: skip documents whose field values
        were created/removed by users
    :param do_not_write: pass-through flag - detect but do not persist values
    """
    self.log_info("Going to detect document field values based on "
                  "the pre-coded regexps and field values entered by users...")
    # The task may receive the document type serialized as a dict.
    if isinstance(document_type, dict):
        document_type = DocumentType.objects.get(pk=document_type['pk'])
    # reindex document grid fields cache after detecting fields
    from apps.rawdb.tasks import auto_reindex_not_tracked
    doc_type_code = document_type.code \
        if document_type and hasattr(document_type, 'code') else None
    # run_after_sub_tasks_finished: the re-index must see the detected values.
    call_task_func(auto_reindex_not_tracked,
                   (doc_type_code,),
                   None,
                   queue=settings.CELERY_QUEUE_SERIAL,
                   run_after_sub_tasks_finished=True,
                   main_task_id=self.request.id)
    document_id = kwargs.get('document_id')
    if document_id:
        # Single-document mode: one sub-task, one progress step, done.
        self.set_push_steps(1)
        self.run_sub_tasks('Detect Field Values For Single Document',
                           DetectFieldValues.detect_field_values_for_document,
                           [(document_id, False, True)])
        self.push()
        return
    task_count = 0
    document_types = [document_type] if document_type else DocumentType.objects.all()
    # Keep only document types that actually have fields configured.
    document_type_pks = []
    for document_type in document_types:
        if document_type.pk and document_type.fields.exists():
            document_type_pks.append(document_type.pk)
        else:
            self.log_info('Can not find any fields assigned to document type: {0}'.format(document_type))
    detect_field_values_for_document_args = []
    source_data = []
    # Build the document queryset - the filters below are mutually exclusive,
    # applied in priority order (name > id > projects > types).
    qs = Document.objects.filter(status__is_active=True)
    if document_name:
        qs = qs.filter(name=document_name)
    elif document_id:
        qs = qs.filter(pk=document_id)
    elif project_ids:
        qs = qs.filter(project_id__in=project_ids)
    elif document_type_pks:
        qs = qs.filter(document_type_id__in=document_type_pks)
    # filter out modified documents
    if do_not_run_for_modified_documents:
        modified_document_ids = DocumentFieldValue.objects \
            .filter(Q(created_by__isnull=False) | Q(removed_by_user=True)) \
            .distinct('document_id') \
            .values_list('document_id')
        qs = qs.exclude(pk__in=Subquery(modified_document_ids))
    # One sub-task argument tuple per document, plus a human-readable source
    # label ("source/name" or just "name") for the task list.
    for doc_id, source, name in qs.values_list('id', 'source', 'name'):
        detect_field_values_for_document_args.append((doc_id, do_not_write, True))
        if source:
            source_data.append('{0}/{1}'.format(source, name))
        else:
            source_data.append(name)
        task_count += 1
    self.run_sub_tasks('Detect Field Values For Each Document',
                       DetectFieldValues.detect_field_values_for_document,
                       detect_field_values_for_document_args,
                       source_data)
    if task_count > 0:
        self.log_info('Found {0} documents'.format(task_count))
    else:
        self.log_info('No documents found')
def get_json_data(self, request, *args, **kwargs):
    """Start the asynchronous task-cleanup job on the serial queue.

    :return: a human-readable confirmation message
    """
    requesting_user_pk = request.user.pk
    call_task_func(clean_tasks, (), requesting_user_pk,
                   queue=settings.CELERY_QUEUE_SERIAL)
    return 'Cleaning task started.'
def start_task(self, data):
    """Kick off a manual re-index from validated form *data*.

    ``data`` may carry an optional document type and a ``recreate_tables`` flag.
    """
    doc_type = data.get('document_type', {})
    type_code = doc_type.code if doc_type else None
    recreate = data.get('recreate_tables') or False
    call_task_func(manual_reindex, (type_code, recreate), data['user_id'])