Example #1
def do_save_document_field_value(
        field_val: FieldValueDTO,
        document_id: int,
        field_id: str,
        user) -> Tuple[Document, DocumentField, Dict]:
    field_repo = DocumentFieldRepository()

    document = DocumentRepository().get_document_by_id(document_id)
    field = field_repo.get_document_field_by_id(field_id)
    field_val, field_ants = field_repo.update_field_value_with_dto(
        document=document, field=field, field_value_dto=field_val, user=user)

    # the repository returns the stored value plus its annotations; use the first annotation
    annotation = field_ants[0]

    field_value = {
        'document': document.pk,
        'document_name': document.name,
        'field': field.uid,
        'field_name': field.code,
        'value': field_val.value,
        'pk': annotation.pk,
        'project': document.project.name,
        'location_start': annotation.location_start,
        'location_end': annotation.location_end,
        'location_text': annotation.location_text,
        'modified_by': annotation.modified_by.pk,
        'modified_date': annotation.modified_date
    }

    return document, field, field_value
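A minimal usage sketch for the function above; the FieldValueDTO constructor signature, the identifiers, and request.user are illustrative assumptions, not taken from the source:

# Hypothetical call site: persist a plain value for one field of document 123.
dto = FieldValueDTO(field_value='Acme Corp.')  # assumed constructor signature
document, field, payload = do_save_document_field_value(
    dto, document_id=123, field_id='a1b2c3d4', user=request.user)
print(payload['value'], payload['location_text'])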
Example #2
def cleanup_document_relations(document):
    """Delete all data related to a document: history, tasks, and stored files."""
    # 1. delete history
    document_repo = DocumentRepository()
    field_repo = DocumentFieldRepository()
    document_repo.delete_document_history_by_ids([document.pk])
    field_repo.delete_document_history_values(document.pk)

    # INFO: skip the actual "delete" step (delete=False) since tasks are cleaned up periodically now
    # 2. delete Tasks, Task history, TaskResults, child tasks
    if document.metadata and document.metadata.get('cascade_delete_tasks', True):
        task_kwargs = dict(file_name=document.name)
        if document.upload_session_id:
            task_kwargs['session_id'] = str(document.upload_session_id)
        file_tasks = Task.objects.main_tasks().filter_metadata(**task_kwargs)
        for file_task in file_tasks:
            purge_task(file_task.id, delete=False)

    # 3. Remove files
    if file_storage.document_exists(document.source_path):
        file_storage.delete_document(document.source_path)
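Usage sketch; the Document import path below mirrors the repository imports used elsewhere on this page but is an assumption:

from apps.document.models import Document  # assumed import path

doc = Document.objects.get(pk=123)  # hypothetical document id
cleanup_document_relations(doc)     # purge history, related tasks and stored files
doc.delete()                        # afterwards the document row itself can be removed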
Example #3
def get_document_bulk_delete():
    return DocumentBulkDelete(DocumentRepository())
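A hedged usage sketch; the DocumentBulkDelete method shown is an assumption for illustration and may differ from the real API:

bulk_delete = get_document_bulk_delete()
bulk_delete.delete_documents([101, 102, 103])  # hypothetical method name and document ids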
Example #4
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # use the injected repository if one was passed, otherwise fall back to the default
    self.document_repository = kwargs['repository'] \
        if 'repository' in kwargs \
        else DocumentRepository()  # type: BaseDocumentRepository
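The constructor above supports injecting a custom repository, e.g. for testing. A minimal sketch, assuming BaseDocumentRepository can be stubbed this way and using a hypothetical task subclass name:

class FakeDocumentRepository(BaseDocumentRepository):  # hypothetical test stub
    def get_document_by_id(self, document_id):
        return None  # return canned objects instead of querying the database

task = SomeDocumentTask(repository=FakeDocumentRepository())  # SomeDocumentTask is illustrative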
Example #5
def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False,
                             project_id: Optional[int] = None):
    """
    "RawDB: Reindex" task
    Checks if raw table with field values of doc type needs to be altered according to the changed
    field structure and triggers document reindexing if needed.

    This task should be always executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :param project_id: project's filter
    :return:
    """
    from apps.project.models import Project
    if project_id:
        project = Project.objects.get(pk=project_id)
        document_types = [project.type]
    else:
        document_types = [DocumentType.objects.get(code=document_type_code)] \
            if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)
    from apps.document.repository.document_repository import DocumentRepository
    doc_repo = DocumentRepository()

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log,
                                               document_type,
                                               force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this task.
            for prev_task in any_other_reindex_task(task.request.id,
                                                    document_type.code,
                                                    project_id):
                purge_task(prev_task)
            doc_ids = doc_repo.get_doc_ids_by_project(project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else doc_repo.get_doc_ids_by_type(document_type.uid, DOC_NUM_PER_SUB_TASK)

            args = [(ids, ) for ids in doc_ids]
            task.log_info(
                f'Initiating re-index for all documents of {document_type.code}: '
                f'forced table recreation.')
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            comment = 'forced' if force_reindex else 'reindex needed'
            task.log_info(
                f'Raw DB table for document type {document_type.code} '
                f'has been altered ({comment}), task "{task.task_name}".\n'
                f'Initiating re-index for all documents of this document type.'
            )
            # If we altered the field structure then we need to re-index all docs of this type.
            # If no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet
            doc_ids = get_all_doc_ids_not_planned_to_index_by_project_pk(
                project_id, DOC_NUM_PER_SUB_TASK) if project_id else \
                get_all_doc_ids_not_planned_to_index_by_doc_type(
                    document_type.uid, DOC_NUM_PER_SUB_TASK)
            args = [(ids, ) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # we trigger the re-index task so that it indexes the non-indexed docs only.
            # In this case we don't stop other re-index tasks, though this task itself
            # may be stopped later if a full reindex is requested.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            task.log_info(
                f'Initiating re-index for documents of {document_type.code} '
                f'that are not indexed and have no reindex planned yet.')
            doc_ids = non_indexed_doc_ids_not_planned_to_index_by_project(
                document_type, project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else non_indexed_doc_ids_not_planned_to_index_by_doc_type(
                    document_type, DOC_NUM_PER_SUB_TASK)
            args = [(ids, ) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
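Since the docstring ties this task to the "serial" queue and CELERY_BEAT_SCHEDULE, here is a minimal sketch of what such a schedule entry could look like; the entry name, dotted task path, and interval are assumptions, not the project's actual settings:

# settings.py (illustrative entry only; names and paths are assumed)
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'rawdb_reindex': {
        'task': 'apps.rawdb.tasks.adapt_tables_and_reindex',  # assumed dotted path
        'schedule': crontab(minute='*/10'),                   # assumed interval
        'options': {'queue': 'serial'},                       # serial queue avoids parallel ALTERs
    },
}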
Example #6
def get_document_bulk_delete(safe_mode: bool = True):
    return DocumentBulkDelete(DocumentRepository(), safe_mode)
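Compared with Example #3, this variant adds a safe_mode flag. A hedged usage sketch; the flag's exact semantics are an assumption (presumably it relaxes safety checks when disabled):

bulk_delete = get_document_bulk_delete(safe_mode=False)  # assumed: skip safety checks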