Example #1
0
    def run_train_model_for_field(task: ExtendedTask, document_type_uid,
                                  field_uid, no_field_sentences,
                                  trigger_re_detecting_field_values):
        """
        Train a field-detector model for one (document type, field) pair.

        :param task: parent task, used for logging and for spawning sub-tasks.
        :param document_type_uid: pk of the DocumentType to train on.
        :param field_uid: pk of the DocumentField to train on.
        :param no_field_sentences: negative samples forwarded to the trainer.
        :param trigger_re_detecting_field_values: when True, schedule
            field-value re-detection sub-tasks for every active document of
            this document type after training.
        :return: None
        """
        document_type = DocumentType.objects.get(pk=document_type_uid)
        field = DocumentField.objects.get(pk=field_uid)

        task.log_info('Training model for field #{0} ({1})...'.format(
            field_uid, field.code))

        document_type_field = DocumentTypeField.objects.get(
            document_type=document_type, document_field=field)

        # Fields configured for regexp-only detection never need a model.
        if document_type_field.use_regexp_always:
            task.log_info(
                'Regexp will be used for document_type #{0} and field #{1}.'.
                format(document_type_uid, field_uid))
            return None

        train_data = TrainDocumentFieldDetectorModel.get_train_data(
            document_type, field,
            document_type_field.trained_after_documents_number)
        if train_data:
            # Wrap the raw train data into a generator and run the trainer.
            train_data = TrainDirtyDocumentFieldDetectorModel.get_train_data_generator(
                train_data)
            total_samples = TrainDirtyDocumentFieldDetectorModel.train_model(
                train_data, document_type, field, no_field_sentences)
            task.log_info(
                'Finished training model for document_type #{0} and field #{1}. '
                'Total number of samples: {2}'.format(document_type_uid,
                                                      field_uid,
                                                      total_samples))
        else:
            task.log_info(
                'Not enough data to train model for document_type #{0} and field #{1}.'
                .format(document_type_uid, field_uid))
            return None

        if trigger_re_detecting_field_values:
            detect_field_values_for_document_args = []

            documents = Document.objects.active()\
                .filter(document_type=document_type)\
                .values_list('pk', 'name', 'source')
            source_data = []

            # Build one human-readable "source/name" label per document for
            # progress reporting, in lockstep with the sub-task arguments.
            for document_id, name, source in documents:
                if source:
                    source_data.append('{0}/{1}'.format(source, name))
                else:
                    source_data.append(name)

                detect_field_values_for_document_args.append(
                    (document_id, False, field_uid))
            task.run_sub_tasks(
                'Detect Values of Field {0} for Each Document'.format(
                    field.code),
                DetectFieldValues.detect_field_values_for_document,
                detect_field_values_for_document_args, source_data)
Example #2
0
def _reindex_document_ids_packets(task: ExtendedTask, ids_packets: Generator[List[int], None, None]) -> None:
    """
    Schedule 'Reindex set of documents' sub-tasks, one per packet of document ids.

    Sub-task argument tuples are accumulated and flushed in batches of 100;
    the remainder (possibly empty) is flushed in a final call at the end.
    """
    sub_task_name = 'Reindex set of documents'
    batch = []
    for id_packet in ids_packets:
        batch.append((id_packet,))
        if len(batch) >= 100:
            task.run_sub_tasks(sub_task_name, cache_document_fields_for_doc_ids_tracked, batch)
            batch = []
    task.run_sub_tasks(sub_task_name, cache_document_fields_for_doc_ids_tracked, batch)
Example #3
0
 def start_cache_document_fields_for_doc_ids(cls, task: ExtendedTask, ids: Iterable[Any]) -> None:
     """Split *ids* into packs of 10 and launch one caching sub-task per pack."""
     pack = set()
     for pk in ids:
         pack.add(pk)
         if len(pack) < 10:
             continue
         task.run_sub_tasks('Cache field values for a set of documents',
                            cls.cache_document_fields_for_doc_ids,
                            [(list(pack),)])
         pack = set()
     # Flush the final, partially-filled pack (if any).
     if pack:
         task.run_sub_tasks('Cache field values for a set of documents',
                            cls.cache_document_fields_for_doc_ids,
                            [(list(pack),)])
Example #4
0
def index_documents(task: ExtendedTask,
                    doc_ids: List,
                    changed_by_user_id: int = None,
                    cache_system_fields: FieldSpec = True,
                    cache_generic_fields: FieldSpec = True,
                    cache_user_fields: bool = True):
    """
    Index documents (cache document fields) in parallel: the document id list
    is split into chunks and one sub-task is started per chunk, each carrying
    the same user/field-spec options.
    """
    sub_task_args = []
    for id_chunk in chunks(doc_ids, DOC_NUM_PER_SUB_TASK):
        sub_task_args.append((id_chunk, changed_by_user_id, cache_system_fields,
                              cache_generic_fields, cache_user_fields))
    task.run_sub_tasks('Reindex documents', cache_document_fields_for_doc_ids_tracked, sub_task_args)
Example #5
0
def adapt_tables_and_reindex(task: ExtendedTask, document_type_code: str = None, force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Make sure the raw DB table of each selected document type matches its
    current field structure, and trigger document re-indexing when needed.

    Must always run in the "serial" (Celery Beat) queue so that two tasks
    never modify the same table in parallel.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    if document_type_code is not None:
        doc_types = [DocumentType.objects.get(code=document_type_code)]
    else:
        doc_types = DocumentType.objects.all()
    logger = CeleryTaskLogger(task)

    for doc_type in doc_types:
        table_altered = adapt_table_structure(logger, doc_type, force=force_recreate_tables)

        if force_recreate_tables:
            # "Force" requested: cancel every other re-index task currently
            # planned for this doc type and (re-)plan indexing of all its docs.
            for other_task in any_other_reindex_task(task.request.id, doc_type.code):
                purge_task(other_task)
            id_packets = get_all_doc_ids(doc_type.uid, 20)
        elif table_altered or force_reindex:
            task.log_info('Raw DB table for document type {0} has been altered. '
                          'Initiating re-index for all documents of this document type.'.format(doc_type.code))
            # The field structure changed, so all docs of this type need
            # re-indexing; without "force" we only plan re-index tasks for
            # documents that have none planned yet.
            id_packets = get_all_doc_ids_not_planned_to_index_by_doc_type(doc_type.uid, 20)
        else:
            # Table unchanged: only pick up non-indexed docs of this type
            # without stopping other re-index tasks (a later full reindex may
            # still stop us). Plan re-indexing only for docs which are both
            # not indexed and not already planned for indexing.
            id_packets = non_indexed_doc_ids_not_planned_to_index(doc_type, 20)

        task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked,
                           [(ids,) for ids in id_packets])
Example #6
0
def process_documents_assignee_changed(task: ExtendedTask, doc_ids: List, new_assignee_id: int,
                                       changed_by_user_id: int):
    """Fan out assignee-change processing over fixed-size chunks of document ids."""
    sub_task_args = [(id_chunk, new_assignee_id, changed_by_user_id)
                     for id_chunk in chunks(doc_ids, DOC_NUMBER_PER_SUB_TASK)]
    task.run_sub_tasks('Process doc assignee change',
                       _process_documents_assignee_changed,
                       sub_task_args)
def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False,
                             project_id: Optional[int] = None):
    """
    "RawDB: Reindex" task
    Checks if raw table with field values of doc type needs to be altered according to the changed
    field structure and triggers document reindexing if needed.

    This task should be always executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :param project_id: project's filter
    :return:
    """
    # Local imports keep app loading order intact (avoid circular imports
    # between task and model modules).
    from apps.project.models import Project
    # A project filter takes precedence over document_type_code: when given,
    # only the project's own document type is processed.
    if project_id:
        project = Project.objects.get(pk=project_id)
        document_types = [project.type]
    else:
        document_types = [DocumentType.objects.get(code=document_type_code)] \
            if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)
    from apps.document.repository.document_repository import DocumentRepository
    doc_repo = DocumentRepository()

    for document_type in document_types:
        # Sync the raw table with the current field structure; the returned
        # flag drives whether documents must be re-indexed (see branch below).
        reindex_needed = adapt_table_structure(log,
                                               document_type,
                                               force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this task.
            for prev_task in any_other_reindex_task(task.request.id,
                                                    document_type.code,
                                                    project_id):
                purge_task(prev_task)
            # doc_ids yields id packets (presumably DOC_NUM_PER_SUB_TASK ids
            # each - confirm against the repo helpers); one sub-task per packet.
            doc_ids = doc_repo.get_doc_ids_by_project(project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else doc_repo.get_doc_ids_by_type(document_type.uid, DOC_NUM_PER_SUB_TASK)

            args = [(ids, ) for ids in doc_ids]
            task.log_info(
                f'Initiating re-index for all documents of {document_type.code} '
                f' - forced tables recreating.')
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            comment = 'forced' if force_reindex else 'reindex needed'
            task.log_info(
                f'Raw DB table for document type {document_type.code} '
                f'has been altered ({comment}), task "{task.task_name}".\n'
                f'Initiating re-index for all documents of this document type.'
            )
            # If we altered the field structure then we need to re-index all docs of this type.
            # If no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet
            doc_ids = get_all_doc_ids_not_planned_to_index_by_project_pk(
                project_id, DOC_NUM_PER_SUB_TASK) if project_id else \
                get_all_doc_ids_not_planned_to_index_by_doc_type(
                    document_type.uid, DOC_NUM_PER_SUB_TASK)
            args = [(ids, ) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs fo this type
            # then we trigger the re-index task making it index non-indexed docs only.
            # In this case we don't stop other re-index tasks. But we can be stopped further in case of
            # full reindex.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            task.log_info(
                f'Initiating re-index for all documents of {document_type.code} '
                f' - index not planned.')
            doc_ids = non_indexed_doc_ids_not_planned_to_index_by_project(
                document_type, project_id, DOC_NUM_PER_SUB_TASK) if project_id \
                else non_indexed_doc_ids_not_planned_to_index_by_doc_type(
                    document_type, DOC_NUM_PER_SUB_TASK)
            args = [(ids, ) for ids in doc_ids]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)