Example #1
    def process(self,
                document_type_config_csv_file: Dict,
                action: str,
                update_cache: bool,
                **kwargs):

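        # Decode the requested action string into the three flags driving the import.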
        if action == 'validate':
            save = False
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'validate|import':
            save = True
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'import|auto_fix|retain_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = False
        elif action == 'import|auto_fix|remove_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = True
        else:
            raise RuntimeError(f'Unknown action: {action}')

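        # Import from the JSON dump cached under the given key; the cache entry is
        # removed in any case, even if the import fails.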
        try:
            json_bytes = DbCache.get(document_type_config_csv_file['cache_key'])
            document_type = import_document_type(json_bytes=json_bytes,
                                                 save=save,
                                                 auto_fix_validation_errors=auto_fix_validation_errors,
                                                 remove_missed_in_dump_objects=remove_missed_objects,
                                                 task=self)
        finally:
            DbCache.clean_cache(document_type_config_csv_file['cache_key'])

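        # A validation-only run, or a caller that opted out of cache updates,
        # skips the RawDB refresh below.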
        if not (save and update_cache):
            return

        from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
        if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
            self.log_info('Adapting RawDB table structure after import ...')
            adapt_table_structure(CeleryTaskLogger(self), document_type, force=False)
        ids = Document.all_objects.filter(document_type=document_type).values_list('pk', flat=True)
        ids = list(ids)
        self.log_info('Caching document field values ...')

        for chunk in chunks(ids, 50):
            self.run_sub_tasks('Cache field values for a set of documents',
                               ImportDocumentType.cache_document_fields_for_doc_ids,
                               [(list(chunk),)])
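
The `chunks` helper consumed by the loop above is not part of this excerpt. A minimal sketch of what it presumably does, yielding the id list in fixed-size batches; the real implementation may differ:

from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')

def chunks(items: Iterable[T], size: int) -> Iterator[List[T]]:
    # Yield successive lists of at most `size` items each.
    buf: List[T] = []
    for item in items:
        buf.append(item)
        if len(buf) == size:
            yield buf
            buf = []
    if buf:
        yield buf
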
Example #2
def auto_reindex_not_tracked(task: ExtendedTask,
                             document_type_code: str = None,
                             force: bool = False):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    task_model = task.task

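    # Adapt each doc type's raw table; schedule a reindex when the structure
    # changed, or when non-indexed documents exist with no reindex planned.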
    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force)
        if reindex_needed:
            force_fmt = ', forced' if force else ''
            task.log_info(
                f'Re-index from auto_reindex_not_tracked, {task.name}, '
                f'for {document_type}{force_fmt}')
            call_task_func(manual_reindex, (document_type.code, False),
                           task_model.user_id)
        else:
            if there_are_non_indexed_docs_not_planned_to_index(document_type, log) \
                    and not any_other_reindex_task(task.request.id, document_type.code).exists():
                task.log_info(
                    f'auto_reindex_not_tracked({document_type.code}): '
                    f'there_are_non_indexed_docs_not_planned_to_index')
                call_task_func(manual_reindex, (document_type.code, False),
                               task_model.user_id)
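
A hedged usage sketch, mirroring how manual_reindex is dispatched above via call_task_func; the document type code and user id here are hypothetical:

call_task_func(auto_reindex_not_tracked,
               ('lease_agreement', True),  # document_type_code, force
               admin_user_id)              # hypothetical requesting user id
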
Example #3
def adapt_tables_and_reindex(task: ExtendedTask, document_type_code: str = None, force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Checks whether the raw table holding the field values of a document type needs to be altered
    to match the changed field structure, and triggers document reindexing if needed.

    This task should always be executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested, we cancel all currently planned re-index tasks
            # and (re-)plan reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id, document_type.code):
                purge_task(prev_task)
            args = [(ids,) for ids in get_all_doc_ids(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            task.log_info('Raw DB table for document type {0} needs re-indexing. '
                          'Initiating re-index for all documents of this document type.'.format(document_type.code))
            # If we altered the field structure, we need to re-index all docs of this type.
            # If force is not requested, we only plan re-index tasks
            # for documents that do not have one planned yet.
            args = [(ids,) for ids in get_all_doc_ids_not_planned_to_index_by_doc_type(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # we trigger the re-index task so it indexes the non-indexed docs only.
            # In this case we don't stop other re-index tasks, but this task itself may be
            # purged later if a full reindex is forced.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            args = [(ids,) for ids in non_indexed_doc_ids_not_planned_to_index(document_type, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
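
get_all_doc_ids(document_type.uid, 20) is consumed above as an iterable of id batches, one batch per sub-task. A plausible sketch, patterned on the Document.all_objects query from Example #1; the names and the exact query are assumptions:

def get_all_doc_ids(document_type_uid, pack_size: int):
    # Page the pk list of every document of the given type into batches of
    # at most pack_size ids, one batch per re-index sub-task.
    ids = list(Document.all_objects
               .filter(document_type_id=document_type_uid)
               .values_list('pk', flat=True))
    for start in range(0, len(ids), pack_size):
        yield ids[start:start + pack_size]
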
Example #4
def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False,
                             project_id: Optional[int] = None):
    """
    "RawDB: Reindex" task
    Checks whether the raw table holding the field values of a document type needs to be altered
    to match the changed field structure, and triggers document reindexing if needed.

    This task should always be executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :param project_id: Optional project id to restrict re-indexing to a single project.
    :return:
    """
    from apps.project.models import Project
    if project_id:
        project = Project.objects.get(pk=project_id)
        document_types = [project.type]
    else:
        document_types = [DocumentType.objects.get(code=document_type_code)] \
            if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)
    from apps.document.repository.document_repository import DocumentRepository
    doc_repo = DocumentRepository()

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log,
                                               document_type,
                                               force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested, we cancel all currently planned re-index tasks
            # and (re-)plan reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id,
                                                    document_type.code,
                                                    project_id):
                purge_task(prev_task)
            doc_ids = doc_repo.get_doc_ids_by_project(project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else doc_repo.get_doc_ids_by_type(document_type.uid, DOC_NUM_PER_SUB_TASK)

            args = [(ids,) for ids in doc_ids]
            task.log_info(
                f'Initiating re-index for all documents of {document_type.code}: '
                f'forced table recreation.')
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            comment = 'forced' if force_reindex else 'reindex needed'
            task.log_info(
                f'Raw DB table for document type {document_type.code} '
                f'needs re-indexing ({comment}), task "{task.task_name}".\n'
                f'Initiating re-index for all documents of this document type.'
            )
            # If we altered the field structure, we need to re-index all docs of this type.
            # If force is not requested, we only plan re-index tasks
            # for documents that do not have one planned yet.
            doc_ids = get_all_doc_ids_not_planned_to_index_by_project_pk(
                project_id, DOC_NUM_PER_SUB_TASK) if project_id else \
                get_all_doc_ids_not_planned_to_index_by_doc_type(
                    document_type.uid, DOC_NUM_PER_SUB_TASK)
            args = [(ids,) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # we trigger the re-index task so it indexes the non-indexed docs only.
            # In this case we don't stop other re-index tasks, but this task itself may be
            # purged later if a full reindex is forced.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            task.log_info(
                f'Initiating re-index for non-indexed documents of {document_type.code} '
                f'that have no re-index planned yet.')
            doc_ids = non_indexed_doc_ids_not_planned_to_index_by_project(
                document_type, project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else non_indexed_doc_ids_not_planned_to_index_by_doc_type(
                    document_type, DOC_NUM_PER_SUB_TASK)
            args = [(ids,) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
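
A hedged usage sketch of the project-scoped variant above, following the same call_task_func convention as earlier in this excerpt; the concrete ids are hypothetical:

call_task_func(adapt_tables_and_reindex,
               (None,    # document_type_code: unused when project_id is given
                False,   # force_recreate_tables
                True,    # force_reindex
                123),    # project_id (hypothetical)
               admin_user_id)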