def drop_clusters(self, exclude_task_ids: Set = None, exclude_project_clustering_id: int = None):
    project = self

    # Stop running tasks
    from apps.task.tasks import purge_task
    from apps.project.tasks import ClusterProjectDocuments

    task_qr = project.project_tasks \
        .filter(name=ClusterProjectDocuments.name, status__in=UNREADY_STATES)  # type: QuerySet
    if exclude_task_ids:
        task_qr = task_qr.exclude(pk__in=exclude_task_ids)
    for task in task_qr:
        purge_task(task.pk, wait=True, timeout=1.5)

    # delete DocumentClusters
    for pcl in project.projectclustering_set.all():
        pcl.document_clusters.all().delete()

    # delete ProjectClustering
    project.projectclustering_set.exclude(id=exclude_project_clustering_id).delete()

    # delete ClusterProjectDocuments Tasks
    to_delete_qr = project.project_tasks.filter(name=ClusterProjectDocuments.name)  # type: QuerySet
    if exclude_task_ids:
        to_delete_qr = to_delete_qr.exclude(pk__in=exclude_task_ids)
    to_delete_qr.delete()

def delete_file(self, request, **kwargs):
    """
    Delete a file from session\n
    Params:
        - filename: str
    """
    session_id = self.get_object().pk
    file_name = request.POST.get('filename')
    if not file_name:
        raise APIException('Provide a file name.')
    try:
        storage = FileSystemStorage(
            location=os.path.join(settings.MEDIA_ROOT,
                                  settings.FILEBROWSER_DOCUMENTS_DIRECTORY,
                                  session_id))
        if storage.exists(file_name):
            storage.delete(file_name)

            file_tasks = Task.objects \
                .filter(metadata__session_id=session_id) \
                .filter(metadata__file_name=file_name)
            for file_task in file_tasks:
                if file_task.metadata.get('file_name') == file_name:
                    purge_task(file_task.id)

            Document.objects \
                .filter(upload_session_id=session_id, name=file_name) \
                .delete()
            return Response('Deleted')
        raise APIException("File doesn't exist")
    except Exception as e:
        raise APIException(str(e))

def cleanup_document_relations(document):
    # delete history
    document.history.all().delete()
    DocumentNote.history.filter(document=document).delete()
    TextUnitNote.history.filter(text_unit__document=document).delete()
    DocumentFieldValue.history.filter(document=document).delete()

    # delete empty Parties
    Party.objects.filter(
        # partyusage__text_unit__document=document,
        partyusage__isnull=True).delete()

    # delete empty Clusters
    DocumentCluster.objects.filter(documents__isnull=True).delete()
    TextUnitCluster.objects.filter(text_units__isnull=True).delete()

    # delete Tasks, Task history, TaskResults, child tasks
    task_kwargs = dict(file_name=document.name)
    if document.upload_session_id:
        task_kwargs['session_id'] = str(document.upload_session_id)
    file_tasks = Task.objects.filter_metadata(**task_kwargs)
    for file_task in file_tasks:
        if file_task.metadata.get('file_name') == document.name:
            purge_task(file_task.id)

def purge_tasks(ids: List[int]):
    upload_session_ids = Document.all_objects.filter(pk__in=ids).values_list('upload_session_id', flat=True)
    doc_names = Document.all_objects.filter(pk__in=ids).values_list('name', flat=True)
    doc_names_hash = dict((key, True) for key in doc_names)

    file_tasks = Task.objects.filter(metadata__file_name__in=doc_names,
                                     upload_session_id__in=upload_session_ids)
    for file_task in file_tasks:
        if file_task.metadata.get('file_name') in doc_names_hash:
            purge_task(file_task.id)

def get_source_path(request, **kwargs):
    # helper to just store a file and get final source path
    session_id = kwargs.get('pk')
    project = UploadSession.objects.get(pk=session_id).project
    file_ = request.FILES.dict().get('file')
    folder_name = kwargs.get('folder')

    project_storages = {
        _session_id: FileSystemStorage(
            location=os.path.join(settings.MEDIA_ROOT,
                                  settings.FILEBROWSER_DIRECTORY,
                                  _session_id))
        for _session_id in project.uploadsession_set.values_list('pk', flat=True)
    }

    # check existing documents with the same name
    this_file_documents = project.document_set.filter(name=file_.name)

    # check existing files with the same name in sessions' folders
    # but not stored yet as Document
    this_file_storages = {
        _session_id: _storage
        for _session_id, _storage in project_storages.items()
        if _storage.exists(file_.name) and not Document.objects.filter(
            source_path=os.path.join(_session_id, file_.name)).exists()
    }

    if this_file_documents.exists() or this_file_storages:
        if request.POST.get('force') == 'true':
            for _session_id, _storage in this_file_storages.items():
                _storage.delete(file_.name)

                file_tasks = Task.objects \
                    .filter(metadata__session_id=_session_id) \
                    .filter(metadata__file_name=file_.name)
                for file_task in file_tasks:
                    if file_task.metadata.get('file_name') == file_.name:
                        purge_task(file_task.id)

                # TODO: redundant?
                Document.objects \
                    .filter(upload_session_id=_session_id, name=file_.name) \
                    .delete()
            for doc in this_file_documents:
                doc.delete()
        else:
            raise APIException('Already exists')

    if not folder_name:
        storage = FileSystemStorage(
            location=os.path.join(settings.MEDIA_ROOT,
                                  settings.FILEBROWSER_DIRECTORY,
                                  session_id))
        stored_file_name = storage.save(file_.name, file_.file)
        return os.path.join(session_id, stored_file_name)
    else:
        return os.path.join(folder_name, file_.name)

def drop_clusters(self):
    project = self

    # Stop running tasks
    from apps.task.tasks import purge_task
    from apps.project.tasks import ClusterProjectDocuments
    for task in project.project_tasks.filter(name=ClusterProjectDocuments.name,
                                             status__in=UNREADY_STATES):
        purge_task(task.pk, wait=True, timeout=1.5)

    # delete DocumentClusters
    for pcl in project.projectclustering_set.all():
        pcl.document_clusters.all().delete()

    # delete ProjectClustering
    project.projectclustering_set.all().delete()

    # delete ClusterProjectDocuments Tasks
    project.project_tasks.filter(name=ClusterProjectDocuments.name).delete()

def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Checks if the raw table with field values of a doc type needs to be altered according to
    the changed field structure, and triggers document reindexing if needed.

    This task should always be executed in the "serial" queue (used for Celery Beat)
    to avoid parallel modifications of the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    document_types = ([DocumentType.objects.get(code=document_type_code)]
                      if document_type_code is not None
                      else DocumentType.objects.all())
    log = CeleryTaskLogger(task)

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id, document_type.code):
                purge_task(prev_task)
            args = [(ids,) for ids in get_all_doc_ids(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            task.log_info('Raw DB table for document type {0} has been altered. '
                          'Initiating re-index for all documents of this document type.'
                          .format(document_type.code))
            # If we altered the field structure then we need to re-index all docs of this type.
            # If there is no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet.
            args = [(ids,) for ids in get_all_doc_ids_not_planned_to_index_by_doc_type(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # then we trigger the re-index task making it index non-indexed docs only.
            # In this case we don't stop other re-index tasks, but we can be stopped further
            # on in case of a full reindex.
            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            args = [(ids,) for ids in non_indexed_doc_ids_not_planned_to_index(document_type, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)

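# A minimal sketch, not taken from the source, of how a task such as adapt_tables_and_reindex
# could be wired into Celery Beat so that it always runs on the "serial" queue mentioned in the
# docstring above. The dotted task path and the 10-minute cadence are assumptions for illustration.
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'adapt_tables_and_reindex': {
        'task': 'apps.rawdb.tasks.adapt_tables_and_reindex',  # assumed dotted task path
        'schedule': crontab(minute='*/10'),                   # example cadence: every 10 minutes
        'options': {'queue': 'serial'},                       # serialize raw-table alterations
    },
}
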
def cleanup_document_relations(document):
    # 1. delete history
    document_repo = DocumentRepository()
    field_repo = DocumentFieldRepository()
    document_repo.delete_document_history_by_ids([document.pk])
    field_repo.delete_document_history_values(document.pk)

    # INFO: skip "delete step" (set delete=False) since we clean tasks periodically now
    # 2. delete Tasks, Task history, TaskResults, child tasks
    if document.metadata and document.metadata.get('cascade_delete_tasks', True):
        task_kwargs = dict(file_name=document.name)
        if document.upload_session_id:
            task_kwargs['session_id'] = str(document.upload_session_id)
        file_tasks = Task.objects.main_tasks().filter_metadata(**task_kwargs)
        for file_task in file_tasks:
            purge_task(file_task.id, delete=False)

    # 3. Remove files
    if file_storage.document_exists(document.source_path):
        file_storage.delete_document(document.source_path)

def get_json_data(self, request, *args, **kwargs):
    return purge_task(task_pk=request.POST.get('task_pk'))

def post(self, request, *args, **kwargs):
    res = purge_task(task_pk=request.POST.get('task_pk'))
    return JsonResponse(res)

def process(self, **kwargs):
    session_id = kwargs['session_id']
    session = UploadSession.objects.get(pk=session_id)

    # 1. Purge Tasks
    self.track_timelog('')
    session_tasks = Task.objects.main_tasks().filter(metadata__session_id=session_id)
    self.log_info(f'Purge {session_tasks.count()} session tasks.')
    for a_task in session_tasks:
        try:
            purge_task(a_task.id)
        except:
            # the task may already have been deleted as a subtask
            pass
    self.track_timelog('1 - purge tasks')

    # 2. Remove Documents
    document_ids = \
        list(Document.objects.filter(upload_session_id=session_id).values_list('pk', flat=True))
    self.log_info(f'Remove {len(document_ids)} documents')
    from apps.document.repository.document_bulk_delete import get_document_bulk_delete

    # TODO: WHY it fails with
    # psycopg2.errors.ForeignKeyViolation: update or delete
    # on table "document_textunit" violates foreign key constraint
    attempts = 3
    delay = 60
    attempts_made = 0
    delete_manager = get_document_bulk_delete()
    error_logged = False
    for attempt in range(1, attempts + 1):
        attempts_made += 1
        try:
            delete_manager.delete_documents(document_ids)
            break
        except Exception as e:
            if not error_logged:
                self.log_error('Error while deleting documents', exc_info=e)
                error_logged = True
            self.log_info(f'Attempt #{attempt} of {attempts} to delete documents failed, retry')
            time.sleep(delay)
    self.track_timelog(f'2 - bulk delete for {len(document_ids)} documents')
    if attempts_made > 1:
        self.log_error(f'Deleting documents took {attempts_made} of {attempts} attempts')

    # 3. Remove files
    file_storage_exists = file_storage.document_exists(session_id)
    self.log_info(f'File Storage exists: {file_storage_exists}')

    files_removed, failed_removing = (0, 0)
    if file_storage_exists:
        files = file_storage.list_documents(session_id)
        self.log_info(f'Remove {len(files)} files from File Storage.')
        for file_path in files:
            file_storage.delete_document(file_path)
        try:
            file_storage.delete_document(session_id)
            files_removed += 1
        except:
            # TODO: removing folders through LocalStorage is not implemented
            failed_removing += 1
            pass
    self.track_timelog(f'3 - remove files ({files_removed} removed, {failed_removing} failed)')

    # 4. Remove Upload Session
    if not session:
        raise Exception(f"Couldn't find session by id ({session_id})")
    self.log_info(f'Remove session uid="{session_id}".')
    project = session.project
    session.delete()
    self.track_timelog('4 - delete session')

    # 5. Reindex Project
    self.log_info(f'Reindex project id="{project.id}" documents.')
    from apps.rawdb.tasks import reindex_all_project_documents
    call_task_func(reindex_all_project_documents, (project.pk,), None)
    self.track_timelog('5 - reindex project')

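# A self-contained sketch of the retry pattern used in step 2 of process() above, extracted into a
# reusable helper. The helper name, its on_error callback and the re-raise on final failure are
# choices made here for illustration; the original code simply logs the failures and moves on.
import time
from typing import Callable, Optional


def retry_call(fn: Callable[[], None],
               attempts: int = 3,
               delay: float = 60.0,
               on_error: Optional[Callable[[Exception, int], None]] = None) -> int:
    """Call fn(); on failure wait `delay` seconds and retry, up to `attempts` tries in total.

    Returns the number of attempts made; re-raises the last exception if every attempt fails.
    """
    for attempt in range(1, attempts + 1):
        try:
            fn()
            return attempt
        except Exception as e:
            if on_error:
                on_error(e, attempt)
            if attempt == attempts:
                raise
            time.sleep(delay)


# Example usage with the document bulk-delete call from step 2:
# retry_call(lambda: delete_manager.delete_documents(document_ids),
#            on_error=lambda e, n: print(f'attempt {n} failed: {e}'))
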
def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False,
                             project_id: Optional[int] = None):
    """
    "RawDB: Reindex" task.

    Checks if the raw table with field values of a doc type needs to be altered according to
    the changed field structure, and triggers document reindexing if needed.

    This task should always be executed in the "serial" queue (used for Celery Beat)
    to avoid parallel modifications of the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :param project_id: project's filter
    :return:
    """
    from apps.project.models import Project

    if project_id:
        project = Project.objects.get(pk=project_id)
        document_types = [project.type]
    else:
        document_types = ([DocumentType.objects.get(code=document_type_code)]
                          if document_type_code is not None
                          else DocumentType.objects.all())
    log = CeleryTaskLogger(task)

    from apps.document.repository.document_repository import DocumentRepository
    doc_repo = DocumentRepository()

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id, document_type.code, project_id):
                purge_task(prev_task)
            doc_ids = (doc_repo.get_doc_ids_by_project(project_id, DOC_NUM_PER_SUB_TASK)
                       if project_id
                       else doc_repo.get_doc_ids_by_type(document_type.uid, DOC_NUM_PER_SUB_TASK))
            args = [(ids,) for ids in doc_ids]
            task.log_info(f'Initiating re-index for all documents of {document_type.code} '
                          f'- forced table recreation.')
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            comment = 'forced' if force_reindex else 'reindex needed'
            task.log_info(
                f'Raw DB table for document type {document_type.code} '
                f'has been altered ({comment}), task "{task.task_name}".\n'
                f'Initiating re-index for all documents of this document type.')
            # If we altered the field structure then we need to re-index all docs of this type.
            # If there is no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet.
            doc_ids = (get_all_doc_ids_not_planned_to_index_by_project_pk(project_id, DOC_NUM_PER_SUB_TASK)
                       if project_id
                       else get_all_doc_ids_not_planned_to_index_by_doc_type(document_type.uid,
                                                                             DOC_NUM_PER_SUB_TASK))
            args = [(ids,) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # then we trigger the re-index task making it index non-indexed docs only.
            # In this case we don't stop other re-index tasks, but we can be stopped further
            # on in case of a full reindex.
            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            task.log_info(f'Initiating re-index for non-indexed documents of {document_type.code} '
                          f'- index not planned.')
            doc_ids = (non_indexed_doc_ids_not_planned_to_index_by_project(document_type, project_id,
                                                                           DOC_NUM_PER_SUB_TASK)
                       if project_id
                       else non_indexed_doc_ids_not_planned_to_index_by_doc_type(document_type,
                                                                                 DOC_NUM_PER_SUB_TASK))
            args = [(ids,) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)

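# A hedged usage sketch: triggering a project-scoped reindex through the call_task_func helper
# seen in the session-cleanup task above, assuming its (celery_task, args_tuple, user_id)
# signature and that the bound ExtendedTask argument is injected by the task decorator rather
# than passed in args. The args tuple mirrors the remaining parameters of adapt_tables_and_reindex:
# (document_type_code, force_recreate_tables, force_reindex, project_id).
def reindex_project_documents(project_pk: int):
    # force_reindex=True so all of the project's documents are re-indexed even if the table is unchanged
    call_task_func(adapt_tables_and_reindex, (None, False, True, project_pk), None)
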
def upload(self, request, **kwargs):
    """
    Upload a File\n
    Params:
        - file: file object
        - force: bool (optional) - whether to rewrite an existing file and Document
    """
    session_id = kwargs.get('pk')
    project = self.get_object().project
    file_ = request.FILES.dict().get('file')

    if session_id and file_:
        try:
            project_storages = {
                str(_session_id): FileSystemStorage(
                    location=os.path.join(settings.MEDIA_ROOT,
                                          settings.FILEBROWSER_DIRECTORY,
                                          str(_session_id)))
                for _session_id in project.uploadsession_set.values_list('pk', flat=True)}

            # check existing documents with the same name
            this_file_documents = project.document_set.filter(name=file_.name)

            # check existing files with the same name but not stored yet as Document
            this_file_storages = {
                _session_id: _storage
                for _session_id, _storage in project_storages.items()
                if _storage.exists(file_.name) and not Document.objects.filter(
                    source_path=os.path.join(_session_id, file_.name)).exists()}

            if this_file_documents.exists() or this_file_storages:
                if request.POST.get('force') == 'true':
                    for _session_id, _storage in this_file_storages.items():
                        _storage.delete(file_.name)

                        file_tasks = Task.objects \
                            .filter(metadata__session_id=_session_id) \
                            .filter(metadata__file_name=file_.name)
                        for file_task in file_tasks:
                            if file_task.metadata.get('file_name') == file_.name:
                                purge_task(file_task.id)

                        # redundant?
                        Document.objects \
                            .filter(upload_session_id=_session_id, name=file_.name) \
                            .delete()
                    for doc in this_file_documents:
                        doc.delete()
                else:
                    raise APIException('Already exists')

            storage = FileSystemStorage(
                location=os.path.join(settings.MEDIA_ROOT,
                                      settings.FILEBROWSER_DIRECTORY,
                                      session_id))
            stored_file_name = storage.save(file_.name, file_.file)

            required_locators = ['date', 'party', 'term', 'geoentity', 'currency',
                                 'citation', 'definition', 'duration']
            linked_tasks = [
                {'task_name': 'Locate',
                 'locate': required_locators,
                 'parse': 'sentences',
                 'do_delete': False,
                 'metadata': {'session_id': session_id, 'file_name': file_.name},
                 'user_id': request.user.id}
            ]

            document_type = UploadSession.objects.get(pk=session_id).project.type

            # if a Document type is specified
            if document_type:
                for app_name in custom_apps:
                    module_str = 'apps.%s.tasks' % app_name
                    module = sys.modules.get(module_str)
                    if hasattr(module, 'DetectFieldValues'):
                        linked_tasks.append(
                            {'task_name': 'DetectFieldValues',
                             'module_name': module_str,
                             'do_not_write': False,
                             'metadata': {'session_id': session_id, 'file_name': file_.name},
                             'user_id': request.user.id})

            call_task(
                task_name='LoadDocuments',
                source_path=os.path.join(session_id, stored_file_name),
                user_id=request.user.id,
                metadata={'session_id': session_id, 'file_name': file_.name},
                linked_tasks=linked_tasks)
        except Exception as e:
            raise APIException(str(e))
    else:
        raise ValidationError('Provide session_id and file in request data.')
    return Response('Loaded')