def drop_clusters(self,
                      exclude_task_ids: Set = None,
                      exclude_project_clustering_id: int = None):
        project = self
        # Stop running tasks
        from apps.task.tasks import purge_task
        from apps.project.tasks import ClusterProjectDocuments
        task_qr = project.project_tasks \
            .filter(name=ClusterProjectDocuments.name, status__in=UNREADY_STATES)  # type: QuerySet
        if exclude_task_ids:
            task_qr = task_qr.exclude(pk__in=exclude_task_ids)

        for task in task_qr:
            purge_task(task.pk, wait=True, timeout=1.5)
        # delete DocumentClusters
        for pcl in project.projectclustering_set.all():
            pcl.document_clusters.all().delete()
        # delete ProjectClustering
        project.projectclustering_set.exclude(
            id=exclude_project_clustering_id).delete()
        # delete ClusterProjectDocuments Tasks
        to_delete_qr = project.project_tasks.filter(
            name=ClusterProjectDocuments.name)  # type: QuerySet
        if exclude_task_ids:
            to_delete_qr = to_delete_qr.exclude(pk__in=exclude_task_ids)
        to_delete_qr.delete()
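A minimal usage sketch for the method above; the project lookup and the IDs are placeholders, only the keyword arguments come from the signature shown.

# Hypothetical call site: drop all clusterings of a project except one,
# while keeping the task that produced it alive (IDs are placeholders).
from apps.project.models import Project

project = Project.objects.get(pk=1)
project.drop_clusters(
    exclude_task_ids={'task-uuid-to-keep'},   # task PKs to leave running (placeholder)
    exclude_project_clustering_id=7,          # ProjectClustering row to keep (placeholder)
)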
Example #2
    def delete_file(self, request, **kwargs):
        """
        Delete a file from session\n
            Params:
                - filename: str
        """
        session_id = self.get_object().pk
        file_name = request.POST.get('filename')

        if not file_name:
            raise APIException('Provide a file name.')

        try:
            storage = FileSystemStorage(location=os.path.join(
                settings.MEDIA_ROOT, settings.FILEBROWSER_DOCUMENTS_DIRECTORY,
                session_id))

            if storage.exists(file_name):
                storage.delete(file_name)
                file_tasks = Task.objects \
                    .filter(metadata__session_id=session_id) \
                    .filter(metadata__file_name=file_name)
                for file_task in file_tasks:
                    if file_task.metadata.get('file_name') == file_name:
                        purge_task(file_task.id)
                Document.objects \
                    .filter(upload_session_id=session_id, name=file_name) \
                    .delete()
                return Response('Deleted')
            raise APIException("File doesn't exist")

        except Exception as e:
            raise APIException(str(e))
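For illustration, a client call to the action above might look like the sketch below; the URL pattern and token are assumptions, only the `filename` POST field is taken from the handler.

import requests

# Hypothetical endpoint URL; the handler above only requires a 'filename' POST field.
resp = requests.post(
    'https://example.com/api/v1/project/upload-session/<session_pk>/delete-file/',
    data={'filename': 'contract.pdf'},
    headers={'Authorization': 'Token <api-token>'},
)
print(resp.status_code, resp.json())  # 200, 'Deleted' on success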
Example #3
def cleanup_document_relations(document):

    # delete history
    document.history.all().delete()
    DocumentNote.history.filter(document=document).delete()
    TextUnitNote.history.filter(text_unit__document=document).delete()
    DocumentFieldValue.history.filter(document=document).delete()

    # delete empty Parties
    Party.objects.filter(
        # partyusage__text_unit__document=document,
        partyusage__isnull=True).delete()

    # delete empty Clusters
    DocumentCluster.objects.filter(documents__isnull=True).delete()
    TextUnitCluster.objects.filter(text_units__isnull=True).delete()

    # delete Tasks, Task history, TaskResults, child tasks
    task_kwargs = dict(file_name=document.name)
    if document.upload_session_id:
        task_kwargs['session_id'] = str(document.upload_session_id)
    file_tasks = Task.objects.filter_metadata(**task_kwargs)
    for file_task in file_tasks:
        if file_task.metadata.get('file_name') == document.name:
            purge_task(file_task.id)
Example #4
    def purge_tasks(ids: List[int]):
        upload_session_ids = Document.all_objects.filter(pk__in=ids).values_list('upload_session_id', flat=True)
        doc_names = Document.all_objects.filter(pk__in=ids).values_list('name', flat=True)
        doc_names_hash = dict((key, True) for key in doc_names)

        file_tasks = Task.objects.filter(metadata__file_name__in=doc_names, upload_session_id__in=upload_session_ids)
        for file_task in file_tasks:
            if file_task.metadata.get('file_name') in doc_names_hash:
                purge_task(file_task.id)
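A usage sketch for the helper above, assuming it is a static method on a bulk-delete class; the class name below is hypothetical.

# Hypothetical: purge upload tasks for documents that are about to be bulk-deleted.
doc_ids = list(Document.all_objects.filter(project_id=1).values_list('pk', flat=True))
DocumentBulkDelete.purge_tasks(doc_ids)  # class name is an assumption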
Example #5
    def get_source_path(request, **kwargs):
        # helper to just store a file and get final source path
        session_id = kwargs.get('pk')
        project = UploadSession.objects.get(pk=session_id).project
        file_ = request.FILES.dict().get('file')
        folder_name = kwargs.get('folder')

        project_storages = {
            _session_id: FileSystemStorage(
                location=os.path.join(settings.MEDIA_ROOT,
                                      settings.FILEBROWSER_DIRECTORY,
                                      _session_id))
            for _session_id in project.uploadsession_set.values_list('pk', flat=True)
        }

        # check existing documents with the same name
        this_file_documents = project.document_set.filter(name=file_.name)

        # check existing files with the same name in sessions' folders
        # but not stored yet as Document
        this_file_storages = {
            _session_id: _storage
            for _session_id, _storage in project_storages.items()
            if _storage.exists(file_.name) and not Document.objects.filter(
                source_path=os.path.join(_session_id, file_.name)).exists()
        }

        if this_file_documents.exists() or this_file_storages:
            if request.POST.get('force') == 'true':
                for _session_id, _storage in this_file_storages.items():
                    _storage.delete(file_.name)
                    file_tasks = Task.objects \
                        .filter(metadata__session_id=_session_id) \
                        .filter(metadata__file_name=file_.name)
                    for file_task in file_tasks:
                        if file_task.metadata.get('file_name') == file_.name:
                            purge_task(file_task.id)
                    # TODO: redundant?
                    Document.objects \
                        .filter(upload_session_id=_session_id, name=file_.name) \
                        .delete()
                for doc in this_file_documents:
                    doc.delete()
            else:
                raise APIException('Already exists')

        if not folder_name:
            storage = FileSystemStorage(
                location=os.path.join(settings.MEDIA_ROOT,
                                      settings.FILEBROWSER_DIRECTORY,
                                      session_id))

            stored_file_name = storage.save(file_.name, file_.file)
            return os.path.join(session_id, stored_file_name)
        else:
            return os.path.join(folder_name, file_.name)
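A sketch of feeding the returned value to the document loader, mirroring the upload example at the end of this page; the enclosing viewset class name is an assumption.

# Hypothetical: store the uploaded file and hand the relative path to LoadDocuments.
source_path = UploadSessionViewSet.get_source_path(request, pk=session_id)  # class name assumed
call_task(
    task_name='LoadDocuments',
    source_path=source_path,
    user_id=request.user.id,
    metadata={'session_id': session_id, 'file_name': request.FILES['file'].name},
)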
Example #6
 def drop_clusters(self):
     project = self
     # Stop running tasks
     from apps.task.tasks import purge_task
     from apps.project.tasks import ClusterProjectDocuments
     for task in project.project_tasks.filter(name=ClusterProjectDocuments.name, status__in=UNREADY_STATES):
         purge_task(task.pk, wait=True, timeout=1.5)
     # delete DocumentClusters
     for pcl in project.projectclustering_set.all():
         pcl.document_clusters.all().delete()
     # delete ProjectClustering
     project.projectclustering_set.all().delete()
     # delete ClusterProjectDocuments Tasks
     project.project_tasks.filter(name=ClusterProjectDocuments.name).delete()
Example #7
def adapt_tables_and_reindex(task: ExtendedTask, document_type_code: str = None, force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Checks if raw table with field values of doc type needs to be altered according to the changed
    field structure and triggers document reindexing if needed.

    This task should be always executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id, document_type.code):
                purge_task(prev_task)
            args = [(ids,) for ids in get_all_doc_ids(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            task.log_info('Raw DB table for document type {0} has been altered. '
                          'Initiating re-index for all documents of this document type.'.format(document_type.code))
            # If we altered the field structure then we need to re-index all docs of this type.
            # If no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet
            args = [(ids,) for ids in get_all_doc_ids_not_planned_to_index_by_doc_type(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # then we trigger the re-index task making it index non-indexed docs only.
            # In this case we don't stop other re-index tasks, but we may be stopped later
            # if a full reindex is triggered.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            args = [(ids,) for ids in non_indexed_doc_ids_not_planned_to_index(document_type, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
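The docstring refers to the Celery Beat schedule in settings.py; a schedule entry that routes this task to a serial queue could look roughly like the sketch below. The dotted task path, interval, and queue name are assumptions, not taken from the project's settings.

# settings.py (sketch) -- run the reindex check periodically on a single-worker queue
from datetime import timedelta

CELERY_BEAT_SCHEDULE = {
    'adapt_tables_and_reindex': {
        'task': 'apps.rawdb.tasks.adapt_tables_and_reindex',  # assumed dotted path
        'schedule': timedelta(minutes=10),                    # assumed interval
        'options': {'queue': 'serial'},                       # serial queue per the docstring
    },
}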
Example #8
def cleanup_document_relations(document):

    # 1. delete history
    document_repo = DocumentRepository()
    field_repo = DocumentFieldRepository()
    document_repo.delete_document_history_by_ids([document.pk])
    field_repo.delete_document_history_values(document.pk)

    # INFO: skip "delete step" (set delete=False) since we clean tasks periodically now
    # 2. delete Tasks, Task history, TaskResults, child tasks
    if document.metadata and document.metadata.get('cascade_delete_tasks',
                                                   True):
        task_kwargs = dict(file_name=document.name)
        if document.upload_session_id:
            task_kwargs['session_id'] = str(document.upload_session_id)
        file_tasks = Task.objects.main_tasks().filter_metadata(**task_kwargs)
        for file_task in file_tasks:
            purge_task(file_task.id, delete=False)

    # 3. Remove files
    if file_storage.document_exists(document.source_path):
        file_storage.delete_document(document.source_path)
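Step 2 can be opted out per document via the metadata flag checked above; a minimal sketch follows, assuming `metadata` is a JSON field on Document and that this cleanup runs when the document is deleted (e.g. from a signal).

# Sketch: keep the upload tasks of this document when it is removed.
document.metadata = document.metadata or {}
document.metadata['cascade_delete_tasks'] = False
document.save(update_fields=['metadata'])
document.delete()  # cleanup_document_relations would then skip the task purge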
Example #9
 def get_json_data(self, request, *args, **kwargs):
     return purge_task(task_pk=request.POST.get('task_pk'))
Example #10
 def post(self, request, *args, **kwargs):
     res = purge_task(task_pk=request.POST.get('task_pk'))
     return JsonResponse(res)
Example #11
    def process(self, **kwargs):
        session_id = kwargs['session_id']
        session = UploadSession.objects.get(pk=session_id)

        # 1. Purge Tasks
        self.track_timelog('')
        session_tasks = Task.objects.main_tasks().filter(metadata__session_id=session_id)
        self.log_info(f'Purge {session_tasks.count()} session tasks.')
        for a_task in session_tasks:
            try:
                purge_task(a_task.id)
            except:
                # case when task is already deleted as subtask
                pass
        self.track_timelog('1 - purge tasks')

        # 2. Remove Documents+
        document_ids = \
            list(Document.objects.filter(upload_session_id=session_id).values_list('pk', flat=True))
        self.log_info(f'Remove {len(document_ids)} documents')
        from apps.document.repository.document_bulk_delete import get_document_bulk_delete

        # TODO: WHY it fails with
        # psycopg2.errors.ForeignKeyViolation: update or delete
        # on table "document_textunit" violates foreign key constraint
        attempts = 3
        delay = 60
        attempts_made = 0
        delete_manager = get_document_bulk_delete()
        error_logged = False

        for attempt in range(1, attempts + 1):
            attempts_made += 1
            try:
                delete_manager.delete_documents(document_ids)
                break
            except Exception as e:
                if not error_logged:
                    self.log_error('Error while deleting documents', exc_info=e)
                    error_logged = True
                self.log_info(f'Attempt #{attempt} of {attempts} to delete documents failed, retry')

                time.sleep(delay)
        self.track_timelog(f'2 - bulk delete for {len(document_ids)} documents')
        if attempts_made > 1:
            self.log_error(f'It took {attempts_made} of {attempts} attempts to delete the documents')

        # 3. Remove files
        file_storage_exists = file_storage.document_exists(session_id)
        self.log_info(f'File Storage exists: {file_storage_exists}')

        files_removed, failed_removing = (0, 0)
        if file_storage_exists:
            files = file_storage.list_documents(session_id)
            self.log_info(f'Remove {len(files)} files from File Storage.')
            for file_path in files:
                file_storage.delete_document(file_path)
            try:
                file_storage.delete_document(session_id)
                files_removed += 1
            except:
                # TODO: removing folders through LocalStorage is not implemented
                failed_removing += 1
                pass
        self.track_timelog(f'3 - remove files ({files_removed} removed, {failed_removing} failed)')

        # 4. Remove Upload Session
        if not session:
            raise Exception(f"Couldn't find session by id ({session_id})")

        self.log_info(f'Remove session uid="{session_id}".')
        project = session.project
        session.delete()
        self.track_timelog('4 - delete session')

        # 5. Reindex Project
        self.log_info(f'Reindex project id="{project.id}" documents.')
        from apps.rawdb.tasks import reindex_all_project_documents
        call_task_func(reindex_all_project_documents, (project.pk,), None)
        self.track_timelog('5 - reindex project')
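How this cleanup is launched is not shown here; assuming the enclosing class is registered as a task and dispatched through the call_task helper used in the upload example below, the invocation might look roughly like this (the task name is hypothetical).

# Hypothetical dispatch of the session cleanup; 'CleanUploadSession' is an assumed
# task name, and the session_id kwarg matches what process(**kwargs) reads above.
call_task(
    task_name='CleanUploadSession',
    session_id=str(session.pk),
    user_id=request.user.id,
)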
Example #12
def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False,
                             project_id: Optional[int] = None):
    """
    "RawDB: Reindex" task
    Checks if raw table with field values of doc type needs to be altered according to the changed
    field structure and triggers document reindexing if needed.

    This task should be always executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :param project_id: project's filter
    :return:
    """
    from apps.project.models import Project
    if project_id:
        project = Project.objects.get(pk=project_id)
        document_types = [project.type]
    else:
        document_types = [DocumentType.objects.get(code=document_type_code)] \
            if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)
    from apps.document.repository.document_repository import DocumentRepository
    doc_repo = DocumentRepository()

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log,
                                               document_type,
                                               force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id,
                                                    document_type.code,
                                                    project_id):
                purge_task(prev_task)
            doc_ids = doc_repo.get_doc_ids_by_project(project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else doc_repo.get_doc_ids_by_type(document_type.uid, DOC_NUM_PER_SUB_TASK)

            args = [(ids, ) for ids in doc_ids]
            task.log_info(
                f'Initiating re-index for all documents of {document_type.code} '
                f'- forced table recreation.')
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            comment = 'forced' if force_reindex else 'reindex needed'
            task.log_info(
                f'Raw DB table for document type {document_type.code} '
                f'has been altered ({comment}), task "{task.task_name}".\n'
                f'Initiating re-index for all documents of this document type.'
            )
            # If we altered the field structure then we need to re-index all docs of this type.
            # If no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet
            doc_ids = get_all_doc_ids_not_planned_to_index_by_project_pk(
                project_id, DOC_NUM_PER_SUB_TASK) if project_id else \
                get_all_doc_ids_not_planned_to_index_by_doc_type(
                    document_type.uid, DOC_NUM_PER_SUB_TASK)
            args = [(ids, ) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # then we trigger the re-index task making it index non-indexed docs only.
            # In this case we don't stop other re-index tasks, but we may be stopped later
            # if a full reindex is triggered.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            task.log_info(
                f'Initiating re-index for documents of {document_type.code} '
                f'that are not indexed and have no re-index planned yet.')
            doc_ids = non_indexed_doc_ids_not_planned_to_index_by_project(
                document_type, project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else non_indexed_doc_ids_not_planned_to_index_by_doc_type(
                    document_type, DOC_NUM_PER_SUB_TASK)
            args = [(ids, ) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
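A hedged sketch of forcing a per-project reindex with the call_task_func helper seen in the session-cleanup example; the argument order follows the signature above, and it is an assumption that the helper injects the bound `task` argument rather than expecting it here.

# Sketch: force table recreation and a full reindex for a single project.
call_task_func(
    adapt_tables_and_reindex,
    (None,   # document_type_code: unused when project_id is given
     True,   # force_recreate_tables
     True,   # force_reindex
     123),   # project_id (placeholder)
    request.user.id,
)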
Example #13
    def upload(self, request, **kwargs):
        """
        Upload a File\n
            Params:
                - file: file object
                - force: bool (optional) - whether rewrite existing file and Document
        """
        session_id = kwargs.get('pk')
        project = self.get_object().project
        file_ = request.FILES.dict().get('file')

        if session_id and file_:
            try:
                project_storages = {
                    str(_session_id): FileSystemStorage(
                        location=os.path.join(
                            settings.MEDIA_ROOT,
                            settings.FILEBROWSER_DIRECTORY,
                            str(_session_id)))
                    for _session_id in project.uploadsession_set.values_list('pk', flat=True)}

                # check existing documents with the same name
                this_file_documents = project.document_set.filter(name=file_.name)

                # check existing files with the same name but not stored yet as Document
                this_file_storages = {
                    _session_id: _storage
                    for _session_id, _storage in project_storages.items()
                    if _storage.exists(file_.name) and not Document.objects.filter(
                        source_path=os.path.join(
                            _session_id, file_.name)).exists()}

                if this_file_documents.exists() or this_file_storages:
                    if request.POST.get('force') == 'true':
                        for _session_id, _storage in this_file_storages.items():
                            _storage.delete(file_.name)
                            file_tasks = Task.objects\
                                .filter(metadata__session_id=_session_id)\
                                .filter(metadata__file_name=file_.name)
                            for file_task in file_tasks:
                                if file_task.metadata.get('file_name') == file_.name:
                                    purge_task(file_task.id)
                            # redundant?
                            Document.objects\
                                .filter(upload_session_id=_session_id, name=file_.name)\
                                .delete()
                        for doc in this_file_documents:
                            doc.delete()
                    else:
                        raise APIException('Already exists')

                storage = FileSystemStorage(
                    location=os.path.join(
                        settings.MEDIA_ROOT,
                        settings.FILEBROWSER_DIRECTORY,
                        session_id))

                stored_file_name = storage.save(file_.name, file_.file)

                required_locators = ['date',
                                     'party',
                                     'term',
                                     'geoentity',
                                     'currency',
                                     'citation',
                                     'definition',
                                     'duration']

                linked_tasks = [
                    {'task_name': 'Locate',
                     'locate': required_locators,
                     'parse': 'sentences',
                     'do_delete': False,
                     'metadata': {'session_id': session_id, 'file_name': file_.name},
                     'user_id': request.user.id}
                ]

                document_type = UploadSession.objects.get(pk=session_id).project.type

                # if Document type specified
                if document_type:

                    for app_name in custom_apps:
                        module_str = 'apps.%s.tasks' % app_name
                        module = sys.modules.get(module_str)
                        if hasattr(module, 'DetectFieldValues'):
                            linked_tasks.append(
                                {'task_name': 'DetectFieldValues',
                                 'module_name': module_str,
                                 'do_not_write': False,
                                 'metadata': {'session_id': session_id, 'file_name': file_.name},
                                 'user_id': request.user.id})

                call_task(
                    task_name='LoadDocuments',
                    source_path=os.path.join(session_id, stored_file_name),
                    user_id=request.user.id,
                    metadata={'session_id': session_id, 'file_name': file_.name},
                    linked_tasks=linked_tasks)
            except Exception as e:
                raise APIException(str(e))
        else:
            raise ValidationError('Provide session_id and file in request data.')
        return Response('Loaded')
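For completeness, a client-side sketch of uploading through this action; the URL and token are hypothetical, while the `file` and `force` fields come from the handler above.

import requests

# Hypothetical endpoint; 'force=true' overwrites an already uploaded file of the same name.
with open('contract.pdf', 'rb') as f:
    resp = requests.post(
        'https://example.com/api/v1/project/upload-session/<session_pk>/upload/',
        files={'file': f},
        data={'force': 'true'},
        headers={'Authorization': 'Token <api-token>'},
    )
print(resp.status_code, resp.json())  # 200, 'Loaded' on success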