Code example #1
def manual_reindex(task: ExtendedTask,
                   document_type_code: str = None,
                   force: bool = False):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    msg = f'manual_reindex called for {document_type_code}. ' \
          f'Task: {task.task_name}, main id: {task.main_task_id}'
    log = CeleryTaskLogger(task)
    log.info(msg)
    adapt_tables_and_reindex(task, document_type_code, force, True)
Code example #2
    def rollback(task, project_clustering_id, project_id, new_project_id, reassign_cluster_ids):
        """
        Undo reassigning, update ProjectClustering.metadata
        """
        task.log_error('Rollback Reassigning Document Cluster documents.')
        log = CeleryTaskLogger(task)

        p_cl = ProjectClustering.objects.get(id=project_clustering_id)

        # get reassigned documents queryset
        documents = Document.objects.filter(documentcluster__pk__in=reassign_cluster_ids)

        # update rawdb cache for target doc type
        for document in documents:
            signals.document_deleted.send(task.__name__, user=None, document=document)

        # change the documents back to the Generic Doc Type
        documents.update(project_id=project_id, document_type=DocumentType.generic())

        # update rawdb cache for Generic Doc Type
        for document in documents:
            signals.document_changed.send(task.__name__, log=log, document=document)

        # update info about reassignings in metadata
        reassignings = p_cl.metadata.get('reassigning', [])
        reassigning = {
            'date': now().isoformat(),
            'new_project_id': new_project_id,
            'cluster_ids': reassign_cluster_ids,
            'status': FAILURE,
        }
        reassignings.append(reassigning)
        p_cl.metadata['reassigning'] = reassignings
        p_cl.save()
Code example #3
def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: Set):
    for doc in Document.objects.filter(pk__in=doc_ids):
        log = CeleryTaskLogger(_task)
        field_value_cache.cache_generic_values(doc, log=log)
        suggested_values = field_detection.detect_and_cache_field_values_for_document(
            log, doc, False, clear_old_values=False)
        field_value_cache.cache_field_values(doc, suggested_values, save=True, log=log)
Code example #4
def cache_document_fields_for_doc_ids(task: ExtendedTask, doc_ids: Set):
    # This task is added to the exclude-from-tracking list and is not shown in the task list at /advanced.
    # Also, if it is run as a sub-task, it will not participate in the parent task's progress.
    log = CeleryTaskLogger(task)
    for doc in Document.all_objects.filter(pk__in=doc_ids) \
            .select_related('document_type', 'assignee', 'status'):  # type: Document
        cache_document_fields(log, doc)
Code example #5
def train_model_for_field(task: ExtendedTask, field_uid: str) -> None:
    field = DocumentField.objects.get(pk=field_uid)
    new_model = field_detection \
        .train_document_field_detector_model(CeleryTaskLogger(task), field, None)  # type: ClassifierModel
    if new_model:
        ClassifierModel.objects.filter(document_field=field).delete()
        new_model.save()
Code example #6
def auto_reindex_not_tracked(task: ExtendedTask,
                             document_type_code: str = None,
                             force: bool = False):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    task_model = task.task

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force)
        if reindex_needed:
            force_fmt = ', forced' if force else ''
            task.log_info(
                f'Re-index from auto_reindex_not_tracked, {task.name}, '
                f'for {document_type}{force_fmt}')
            call_task_func(manual_reindex, (document_type.code, False),
                           task_model.user_id)
        else:
            if there_are_non_indexed_docs_not_planned_to_index(document_type, log) \
                    and not any_other_reindex_task(task.request.id, document_type.code).exists():
                task.log_info(
                    f'auto_reindex_not_tracked({document_type.code}): '
                    f'there_are_non_indexed_docs_not_planned_to_index')
                call_task_func(manual_reindex, (document_type.code, False),
                               task_model.user_id)
Code example #7
    def rollback(task, project_id, cluster_ids, new_project_id):

        task.log_error('Rollback Reassigning Document Cluster documents.')
        log = CeleryTaskLogger(task)

        documents = Document.objects.filter(documentcluster__pk__in=cluster_ids)

        for document in documents:
            signals.document_deleted.send(task.__name__, user=None, document=document)

        documents.update(project_id=project_id, document_type=DocumentType.generic())

        for document in documents:
            signals.document_changed.send(task.__name__, log=log, document=document)

        p_cl = ProjectClustering.objects.get(document_clusters__pk=cluster_ids[0])
        reassignings = p_cl.metadata.get('reassigning', [])
        reassigning = {
            'date': now().isoformat(),
            'new_project_id': new_project_id,
            'cluster_ids': cluster_ids,
            'status': FAILURE,
        }
        reassignings.append(reassigning)
        p_cl.metadata['reassigning'] = reassignings
        p_cl.save()
Code example #8
    def load_doc(task: ExtendedTask, document: Document, document_fields: Dict, run_detect_field_values: bool,
                 filed_owners: dict = None):
        filed_owners = filed_owners if filed_owners else {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, document_fields, filed_owners)
        log = CeleryTaskLogger(task)

        with transaction.atomic():
            new_document = document.pk is None
            document.save(force_insert=new_document)
            if not new_document:
                DocumentFieldValue.objects \
                    .filter(document=document,
                            removed_by_user=False,
                            created_by__isnull=True,
                            modified_by__isnull=True) \
                    .delete()

            for field, values in fields_to_values.items():
                field_detection.save_detected_values(document, field, values)

            if run_detect_field_values:
                field_detection.detect_and_cache_field_values_for_document(log, document, True)
            else:
                dfvs = field_detection.detect_and_cache_field_values_for_document(log, document, False)
                field_value_cache.cache_field_values(document, dfvs, save=True)

        task.log_info('Loaded {0} field values for document #{1} ({2})'
                      .format(len(fields_to_values), document.pk, document.name))
Code example #9
    def process(self, **kwargs):

        n_clusters = kwargs.get('n_clusters', 3)
        method = kwargs.get('method', 'kmeans')
        cluster_by = kwargs.get('cluster_by', 'term')

        self.project_clustering_id = kwargs.get('project_clustering_id')
        project_clustering = ProjectClustering.objects.get(pk=self.project_clustering_id)
        project_clustering.status = PENDING
        project_clustering.task = self.task
        project_clustering.save()

        project = project_clustering.project

        self.log_info('Start clustering documents for project id={}'.format(project.id))
        self.log_info('Clustering method: "{}", n_clusters={}'.format(method, n_clusters))
        self.log_info('Cluster by: {}'.format(str(cluster_by)))

        self.set_push_steps(4)

        self.push()

        # clear previous clusters, their tasks and cluster sessions
        project.drop_clusters(exclude_task_ids={self.request.id},
                              exclude_project_clustering_id=self.project_clustering_id)
        self.push()

        cluster_model = ClusterDocuments(project_id=project.id,
                                         cluster_algorithm=method,
                                         n_clusters=n_clusters,
                                         cluster_by=cluster_by,
                                         use_default_name=True,
                                         log_message=self.log_wo_flooding)
        result = cluster_model.run()

        project_clustering.metadata = result.metadata
        project_clustering.save()
        project_clustering.document_clusters.add(*result.metadata['cluster_obj_ids'])

        self.push()
        self.log_info('Clustering completed. Updating document cache.')

        log = CeleryTaskLogger(self)
        for doc in Document.objects.filter(project__pk=project.id):
            signals.fire_document_changed(sender=self,
                                          log=log,
                                          document=doc,
                                          changed_by_user=None,
                                          system_fields_changed=False,
                                          user_fields_changed=False,
                                          generic_fields_changed=[DocumentGenericField.cluster_id.value])

        project_clustering.status = SUCCESS
        project_clustering.save()

        self.push()
        self.log_info('Finished.')
        return result.metadata
Code example #10
    def test_field_detector_model(task: ExtendedTask, field_id,
                                  document_id) -> dict:
        document = Document.objects.get(pk=document_id)  # type: Document
        field = DocumentField.objects.get(pk=field_id)  # type: DocumentField

        expected_dfvs = field_detection.detect_and_cache_field_values(
            CeleryTaskLogger(task), document, field,
            save=False)  # type: List[DetectedFieldValue]
        actual_dfvs = list(
            DocumentFieldValue.objects.filter(
                document=document, field=field,
                removed_by_user=False).all())  # type: List[DocumentFieldValue]

        if field.is_value_aware():
            # dates, numbers, etc.
            expected_field_values = field_detection.merge_detected_field_values_to_python_value(
                expected_dfvs)
            expected_field_value = expected_field_values.get(field.code)

            actual_field_values = merge_document_field_values_to_python_value(
                actual_dfvs)
            actual_field_value = actual_field_values.get(field.code)

            matches = bool(expected_field_value == actual_field_value)
        else:
            # related-info, etc.
            expected_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in expected_dfvs if dfv.text_unit
            }
            expected_field_value = '; '.join(sorted(expected_set))

            actual_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in actual_dfvs if dfv.text_unit
            }
            actual_field_value = '; '.join(sorted(actual_set))
            matches = bool(expected_set == actual_set)

        if not matches:
            task.log_info(
                '{3} Test doc: {0} (Project: {5}). '
                'Detected: {1}. Real: {2}.\nDetected in text:-----\n{4}\n-----'
                .format(
                    document.name, expected_field_value, actual_field_value,
                    '[  OK  ]' if matches else '[ ERR  ]', '\n---\n'.join(
                        [dfv.text_unit.text
                         for dfv in expected_dfvs]) if expected_dfvs else '',
                    document.project.name if document.project else ''))

        text_units_number = TextUnit.objects.filter(
            document=document, unit_type=field.text_unit_type).count()

        return {
            'text_units_number': text_units_number,
            'value_matches_expected': matches
        }
Code example #11
def plan_refreshing_materialized_views(_celery_task):
    mat_views_repo = MaterializedViews()

    def refresh_view(view_name: str):
        refresh_materialized_view.apply_async((view_name,))

    mat_views_repo.plan_refreshes(CeleryTaskLogger(_celery_task),
                                  task_names.TASK_NAME_REFRESH_MATERIALIZED_VIEW,
                                  refresh_view)
Code example #12
def manual_reindex(task: ExtendedTask,
                   document_type_code: str = None,
                   force: bool = False,
                   project_id: Optional[int] = None):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    run_parameters = {'document type': document_type_code}
    if project_id:
        run_parameters['project'] = project_id
    if force:
        run_parameters['force'] = True
    ptrs_str = ', '.join([f'{p}={run_parameters[p]}' for p in run_parameters])

    msg = f'manual_reindex called for {ptrs_str}. ' \
          f'Task: {task.task_name}, main id: {task.main_task_id}'
    log = CeleryTaskLogger(task)
    log.info(msg)
    adapt_tables_and_reindex(task, document_type_code, force, True, project_id)
Code example #13
    def reassign_document(task, document_id):
        document = Document.objects.get(pk=document_id)
        signals.document_deleted.send(task.__class__, user=None, document=document)
        log = CeleryTaskLogger(task)
        dfvs = field_detection.detect_and_cache_field_values_for_document(
            log=log,
            document=document,
            system_fields_changed=True,
            generic_fields_changed=True)

        task.log_info(
            f'Detected {len(dfvs)} field values for document ' + f'#{document.id} ({document.name})')
Code example #14
def train_model_for_dirty_field(task: ExtendedTask, dirty_field_id: Any) -> None:
    dirty_field = DocumentField.objects.get(pk=dirty_field_id)
    if dirty_field.can_retrain():
        dirty_field.dirty = False
        dirty_field.save()
        train_docs_count = field_detection_utils.get_approved_documents_number(dirty_field, None)
        if train_docs_count >= dirty_field.trained_after_documents_number:
            new_model = field_detection.train_document_field_detector_model(
                CeleryTaskLogger(task), dirty_field, None)
            if new_model:
                ClassifierModel.objects.filter(document_field=dirty_field).delete()
                new_model.save()
Code example #15
def process(self, document_field: Dict, config_csv_file: Dict,
            drop_previous_field_detectors: bool,
            update_field_choice_values: bool, **kwargs):
    try:
        self.log_info(
            'Going to configure simple field detection config...')
        document_field = DocumentField.objects.get(pk=document_field['pk'])
        csv_bytes = DbCache.get(config_csv_file['cache_key'])
        apply_simple_config(CeleryTaskLogger(self), document_field,
                            csv_bytes, drop_previous_field_detectors,
                            update_field_choice_values)
    finally:
        DbCache.clean_cache(config_csv_file['cache_key'])
Code example #16
    def process(self,
                document_type_config_csv_file: Dict,
                action: str,
                update_cache: bool,
                **kwargs):

        if action == 'validate':
            save = False
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'validate|import':
            save = True
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'import|auto_fix|retain_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = False
        elif action == 'import|auto_fix|remove_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = True
        else:
            raise RuntimeError('Unknown action')

        try:
            json_bytes = DbCache.get(document_type_config_csv_file['cache_key'])
            document_type = import_document_type(json_bytes=json_bytes,
                                                 save=save,
                                                 auto_fix_validation_errors=auto_fix_validation_errors,
                                                 remove_missed_in_dump_objects=remove_missed_objects,
                                                 task=self)
        finally:
            DbCache.clean_cache(document_type_config_csv_file['cache_key'])

        if not (save and update_cache):
            return

        from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
        if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
            self.log_info('Adapting RawDB table structure after import ...')
            adapt_table_structure(CeleryTaskLogger(self), document_type, force=False)
        ids = Document.all_objects.filter(document_type=document_type).values_list('pk', flat=True)
        ids = list(ids)
        self.log_info('Caching document field values ...')

        for chunk in chunks(ids, 50):
            self.run_sub_tasks('Cache field values for a set of documents',
                               ImportDocumentType.cache_document_fields_for_doc_ids,
                               [(list(chunk),)])
Code example #17
    def _detect_field_values_for_document(task: ExtendedTask, document_id,
                                          do_not_write):
        doc = Document.objects.get(pk=document_id)

        log = CeleryTaskLogger(task)

        # If the document is in one of the completed statuses then
        # the detected values won't be stored even if do_not_write = False.
        # But caching proceeds as usual.
        dfvs = field_detection \
            .detect_and_cache_field_values_for_document(log, doc, save=not do_not_write)

        task.log_info(
            'Detected {0} field values for document #{1} ({2})'.format(
                len(dfvs), document_id, doc.name))
Code example #18
def adapt_tables_and_reindex(task: ExtendedTask, document_type_code: str = None, force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Checks if the raw table with field values of a doc type needs to be altered according to the changed
    field structure, and triggers document reindexing if needed.

    This task should always be executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this task.
            for prev_task in any_other_reindex_task(task.request.id, document_type.code):
                purge_task(prev_task)
            args = [(ids,) for ids in get_all_doc_ids(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            task.log_info('Raw DB table for document type {0} has been altered. '
                          'Initiating re-index for all documents of this document type.'.format(document_type.code))
            # If we altered the field structure then we need to re-index all docs of this type.
            # If force is not requested, we plan re-index tasks only
            # for those documents for which they are not planned yet.
            args = [(ids,) for ids in get_all_doc_ids_not_planned_to_index_by_doc_type(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # then we trigger the re-index task making it index the non-indexed docs only.
            # In this case we don't stop other re-index tasks, but this task can be stopped later
            # if a full reindex is requested.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            args = [(ids,) for ids in non_indexed_doc_ids_not_planned_to_index(document_type, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
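
The docstring above notes that adapt_tables_and_reindex should always run on the "serial" queue used by Celery Beat (see settings.py/CELERY_BEAT_SCHEDULE) so that table alterations for the same doc type never run in parallel. Below is a minimal sketch of what such a beat entry could look like; the dotted task path and the hourly cadence are assumptions for illustration, only the serial-queue routing comes from the docstring.

# settings.py (sketch): the task path and schedule below are assumed, not taken from the project
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'adapt-tables-and-reindex': {
        'task': 'apps.rawdb.tasks.adapt_tables_and_reindex',  # assumed dotted path to the task
        'schedule': crontab(minute=0),   # assumed cadence: once per hour
        'options': {'queue': 'serial'},  # route to the serial queue, as the docstring requires
    },
}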
Code example #19
    def _detect_field_values_for_document(task: ExtendedTask,
                                          document_id,
                                          do_not_write):
        doc = Document.objects.get(pk=document_id)

        if doc.status and not doc.status.is_active:
            task.log_info('Detecting field values is forbidden for a document with "completed"'
                          ' status, document #{} ({})'.format(document_id, doc.name))
            return

        log = CeleryTaskLogger(task)

        dfvs = field_detection \
            .detect_and_cache_field_values_for_document(log, doc, save=not do_not_write)

        task.log_info('Detected {0} field values for document #{1} ({2})'.format(
            len(dfvs), document_id, doc.name))
Code example #20
    def train_model_for_dirty_field(task: ExtendedTask, dirty_field_id: Any) -> None:
        dirty_field = DocumentTypeField.objects \
            .filter(pk=dirty_field_id) \
            .prefetch_related('document_type', 'document_field')[0]

        if dirty_field.can_retrain():
            dirty_field.dirty = False
            dirty_field.save()
            document_type = dirty_field.document_type
            field = dirty_field.document_field
            train_docs_count = field_detection_utils.get_approved_documents_number(document_type, field, None)
            if train_docs_count >= settings.ML_TRAIN_DATA_SET_GROUP_LEN:
                new_model = field_detection.train_document_field_detector_model(CeleryTaskLogger(task),
                                                                                document_type, field, None)
                if new_model:
                    ClassifierModel.objects.filter(document_type=document_type, document_field=field).delete()
                    new_model.save()
Code example #21
def cache_document_fields_for_doc_ids(task: ExtendedTask,
                                      doc_ids: Iterable,
                                      changed_by_user_id: int = None,
                                      cache_system_fields: FieldSpec = True,
                                      cache_generic_fields: FieldSpec = True,
                                      cache_user_fields: bool = True):
    log = CeleryTaskLogger(task)
    changed_by_user = User.objects.get(pk=changed_by_user_id) if changed_by_user_id is not None else None
    for doc in Document.all_objects.filter(pk__in=doc_ids) \
            .select_related('document_type', 'assignee', 'status'):  # type: Document
        try:
            cache_document_fields(log, doc, changed_by_user=changed_by_user,
                                  cache_system_fields=cache_system_fields,
                                  cache_generic_fields=cache_generic_fields,
                                  cache_user_fields=cache_user_fields)
        except Document.DoesNotExist:
            pass
Code example #22
    def detect_field_values_for_document(
            task: ExtendedTask, detect_ptrs: DocDetectFieldValuesParams):
        doc = Document.all_objects.get(pk=detect_ptrs.document_id)
        log = CeleryTaskLogger(task)

        # If the document is in one of the completed statuses then
        # the detected values won't be stored even if do_not_write = False.
        # But caching proceeds as usual.
        dfvs = field_detection \
            .detect_and_cache_field_values_for_document(log,
                                                        doc,
                                                        changed_by_user=task.task.user,
                                                        save=not detect_ptrs.do_not_write,
                                                        clear_old_values=detect_ptrs.clear_old_values,
                                                        updated_field_codes=detect_ptrs.updated_field_codes)

        task.log_info(f'Detected {len(dfvs)} field values for document ' +
                      f'#{detect_ptrs.document_id} ({doc.name})')
Code example #23
    def load_doc(task: ExtendedTask,
                 document: Document,
                 field_values_alias_to_value: Dict[str, Any],
                 run_detect_field_values: bool,
                 field_owners: Dict[str, User] = None):
        field_owners = field_owners if field_owners else {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
        log = CeleryTaskLogger(task)
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        with transaction.atomic():
            new_document = document.pk is None
            document.save(force_insert=new_document)
            DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

            for field, value_dto in fields_to_values.items():
                field_repo.update_field_value_with_dto(document=document,
                                                       field=field,
                                                       field_value_dto=value_dto,
                                                       user=field_owners.get(field.code))

            if run_detect_field_values:
                field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                           document=document,
                                                                           save=True,
                                                                           clear_old_values=False)
            else:
                signals.fire_document_changed(sender=task,
                                              log=log,
                                              document=document,
                                              changed_by_user=None,
                                              document_initial_load=True,
                                              system_fields_changed=True,
                                              generic_fields_changed=True,
                                              user_fields_changed=True)

        task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                      .format(len(fields_to_values),
                              document.pk,
                              document.name,
                              ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())))
Code example #24
def process_document_changed(task: ExtendedTask,
                             doc_id: int,
                             system_fields_changed: FieldSpec = True,
                             generic_fields_changed: FieldSpec = True,
                             user_fields_changed: bool = True,
                             changed_by_user_id: int = None):
    from apps.document.repository.document_field_repository import DocumentFieldRepository

    dfr = DocumentFieldRepository()

    doc = Document.objects.get(pk=doc_id)  # type: Document
    changed_by_user = User.objects.get(pk=changed_by_user_id) if changed_by_user_id is not None else None
    if DocumentSystemField.status.specified_in(system_fields_changed):
        dfr.delete_hidden_field_values_if_needed(doc, event_sender=task)
    fire_document_changed(sender=task,
                          log=CeleryTaskLogger(task),
                          document=doc,
                          changed_by_user=changed_by_user,
                          document_initial_load=False,
                          system_fields_changed=system_fields_changed,
                          generic_fields_changed=generic_fields_changed,
                          user_fields_changed=user_fields_changed)
Code example #25
def cache_fields_for_docs_queryset(
        task: ExtendedTask,
        doc_qr,
        changed_by_user: User = None,
        document_initial_load: bool = False,
        generic_fields_changed: bool = True,
        user_fields_changed: bool = True,
        pre_detected_field_values: Optional[Dict[str, Any]] = None,
        old_field_values: Dict[int, Dict[str, Any]] = None):
    from apps.rawdb.field_value_tables import cache_document_fields
    old_field_values = old_field_values or {}
    for doc in doc_qr.select_related('document_type', 'project',
                                     'status'):  # type: Document
        log = CeleryTaskLogger(task)
        cache_document_fields(log=log,
                              document=doc,
                              cache_generic_fields=generic_fields_changed,
                              cache_user_fields=user_fields_changed,
                              pre_detected_field_codes_to_suggested_values=
                              pre_detected_field_values,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              old_field_values=old_field_values.get(doc.pk))
Code example #26
def import_document_type(json_bytes: bytes, save: bool,
                         auto_fix_validation_errors: bool,
                         remove_missed_in_dump_objects: bool,
                         task: ExtendedTask) -> DocumentType:
    tasks = Task.objects \
        .get_active_user_tasks() \
        .exclude(pk=task.task.pk) \
        .distinct('name') \
        .order_by('name') \
        .values_list('name', flat=True)
    tasks = list(tasks)
    if tasks:
        msg = 'The following user tasks are running: {0}. This import can cause them to crash because of document' \
              ' type / field structure changes.'.format(', '.join(tasks))
        raise RuntimeError(msg)

    objects = serializers.deserialize("json", json_bytes.decode("utf-8"))
    document_type = None
    pk_to_field = {}
    field_detectors = []
    other_objects = []
    logger = CeleryTaskLogger(task)
    for deserialized_object in objects:
        obj = deserialized_object.object
        if isinstance(obj, DocumentType):
            if document_type is not None:
                raise RuntimeError('More than one document type was detected')
            document_type = DeserializedDocumentType(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
        elif isinstance(obj, DocumentField):
            field = DeserializedDocumentField(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
            pk_to_field[field.pk] = field
        elif isinstance(obj, DocumentFieldDetector):
            field_detector = DeserializedDocumentFieldDetector(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            field_detectors.append(field_detector)
        elif isinstance(obj, DocumentFieldCategory):
            category = DeserializedDocumentFieldCategory(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            other_objects.append(category)
        else:
            raise RuntimeError('Unknown model')

    if document_type is None:
        raise RuntimeError('Unable to find document type')

    conflicting_document_type = DocumentType.objects \
        .filter(code=document_type.object.code) \
        .exclude(pk=document_type.pk) \
        .first()
    if conflicting_document_type is not None:
        err_msg = 'Unable to import document type #{0} "{1}". Database already contains a document type #{2}' \
                  ' with code "{3}"'.format(document_type.pk,
                                            document_type.object.code,
                                            conflicting_document_type.pk,
                                            conflicting_document_type.code)
        raise RuntimeError(err_msg)

    for field_detector in field_detectors:
        field = pk_to_field.get(field_detector.field_pk)
        if field is not None:
            field.add_dependent_object(field_detector)
        else:
            raise RuntimeError('Unknown field #{0}'.format(
                field_detector.field_pk))

    for field in pk_to_field.values():
        if field.document_type_pk == document_type.pk:
            document_type.add_dependent_object(field)
        else:
            raise RuntimeError('Unknown document type #{0}'.format(
                field.document_type_pk))

    for obj in other_objects:
        document_type.add_dependent_object(obj)

    logger.info('Validation of {0} ...'.format(document_type.object.code))
    validation_errors = document_type.validate()
    logger.info('Validation of {0} is finished'.format(
        document_type.object.code))
    if validation_errors:
        task.log_error(
            '{0} VALIDATION ERRORS HAVE OCCURRED DURING VALIDATION OF {1}.'.
            format(len(validation_errors), document_type.object.code))
        for index, validation_error in enumerate(validation_errors):
            # for different timestamps
            sleep(0.001)
            task.log_error('VALIDATION ERROR {0}. {1}'.format(
                index + 1, str(validation_error)))
        raise ValidationError(
            'Validation errors have occurred during import of {0}'.format(
                document_type.object.code))

    if save:
        logger.info('Import of {0} ...'.format(document_type.object.code))
        with transaction.atomic():
            document_type.save()
        logger.info('Import of {0} is finished'.format(
            document_type.object.code))

    return document_type.object
Code example #27
    def process(self, **kwargs):
        self.log_info(
            'Going to train document field based on the datasets stored in DB...')

        document_type_field_id = kwargs.get('document_type_field_id')
        skip_training = kwargs.get('skip_training')
        use_only_confirmed_field_values_for_training = kwargs.get('use_only_confirmed_field_values_for_training')
        train_data_project_ids = kwargs.get('train_data_project_ids')

        skip_testing = kwargs.get('skip_testing')
        use_only_confirmed_field_values_for_testing = kwargs.get('use_only_confirmed_field_values_for_testing')
        test_data_projects_ids = kwargs.get('test_data_projects_ids')

        document_type_field = DocumentTypeField.objects.get(pk=document_type_field_id)
        document_type = document_type_field.document_type
        field = document_type_field.document_field  # type: DocumentField

        if not field.is_detectable():
            self.log_info('Field {0} is not detectable. Nothing to train and/or test.'.format(field.code))
            return

        new_model = None

        if not skip_training:
            if train_data_project_ids:
                self.log_info('Training model on the specified projects...')
            else:
                self.log_info('No training projects specified. '
                              'Training model on all user-confirmed field values in the system...')

            new_model = field_detection \
                .train_document_field_detector_model(CeleryTaskLogger(self),
                                                     document_type,
                                                     field,
                                                     train_data_project_ids,
                                                     use_only_confirmed_field_values_for_training)
            if new_model:
                ClassifierModel.objects.filter(document_type=document_type, document_field=field).delete()
                new_model.save()

                if new_model.classifier_accuracy_report_in_sample:
                    self.log_info('Sklearn test report for in-sample docs:\n{0}'
                                  .format(new_model.classifier_accuracy_report_in_sample))

                if new_model.classifier_accuracy_report_out_of_sample:
                    self.log_info('Sklearn test report for out-of-sample docs:\n{0}'
                                  .format(new_model.classifier_accuracy_report_out_of_sample))
            else:
                self.log_info('No model trained. '
                              'Probably the detection strategy of field {0} does not allow training'.format(field.code))

        if skip_testing:
            return

        if not test_data_projects_ids:
            self.log_info('No test projects specified. Skipping the testing step.')
            return
        else:
            if not use_only_confirmed_field_values_for_testing:
                test_document_ids = Document.objects \
                    .filter(project_id__in=test_data_projects_ids, document_type_id=document_type.pk) \
                    .values_list('pk', flat=True)
            else:
                test_document_ids = set(field_detection_utils
                                        .get_qs_active_modified_document_ids(document_type,
                                                                             field,
                                                                             test_data_projects_ids))
                test_document_ids.update(set(field_detection_utils
                                             .get_qs_finished_document_ids(document_type,
                                                                           test_data_projects_ids)))

            self.log_info('Testing field detection document-by-document...')
            test_tasks_args = []
            for test_document_id in test_document_ids:
                test_tasks_args.append((field.uid, test_document_id))

            if test_tasks_args:
                self.run_sub_tasks('Test Field Detector Model', TrainAndTest.test_field_detector_model,
                                   test_tasks_args)

                self.run_after_sub_tasks_finished('Join Field Detector Model Tests',
                                                  TrainAndTest.join_field_detector_model_tests,
                                                  [(field.uid, document_type.uid, new_model.pk if new_model else None)])
Code example #28
    def process(self, **kwargs):
        if self.PARAM_CONFIG in kwargs:
            config_id = kwargs[self.PARAM_CONFIG]['pk']
        else:
            config_id = kwargs[self.PARAM_CONFIG_ID]

        if self.PARAM_USER in kwargs:
            user_ids = {kwargs[self.PARAM_USER]['pk']}
        else:
            user_ids = kwargs.get(self.PARAM_USER_IDS)

        run_date = kwargs.get(self.PARAM_RUN_DATE)
        run_date_specified = run_date is not None

        if isinstance(run_date, str):
            run_date = parse(run_date)

        run_date = run_date or datetime.datetime.now(tz=tzlocal.get_localzone())

        run_even_if_not_enabled = bool(kwargs.get(self.PARAM_RUN_EVEN_IF_NOT_ENABLED))

        config = DocumentDigestConfig.objects \
            .filter(pk=config_id).select_related('for_role', 'for_user').first()  # type: DocumentDigestConfig
        if not config:
            self.log_error('{1} not found: #{0}'.format(config_id, DocumentDigestConfig.__name__))
            return

        if not config.enabled and not run_even_if_not_enabled:
            self.log_info('{1} #{0} is disabled.'.format(config_id, DocumentDigestConfig.__name__))
            return

        tz_msg = ' at timezone {0}'.format(run_date.tzname()) if run_date_specified else ''
        self.log_info('Rendering and sending {what} #{pk} ({doc_filter}) for date "{run_date}" to {n} users{tz_msg}'
                      .format(what=DocumentDigestConfig.__name__,
                              pk=config.pk, doc_filter=config.documents_filter, n=len(user_ids), run_date=run_date,
                              tz_msg=tz_msg))

        if user_ids:
            users_qr = User.objects.filter(pk__in=user_ids)
        elif config.for_role_id is not None:
            users_qr = User.objects.filter(role_id=config.for_role_id)
        elif config.for_user_id is not None:
            users_qr = User.objects.filter(pk=config.for_user_id)
        else:
            self.log_error('{what} #{config_id} specifies neither for_user nor for_role.'
                           .format(what=DocumentDigestConfig.__name__, config_id=config.pk))
            return

        log = CeleryTaskLogger(self)
        for user in users_qr:  # type: User
            if config.for_user_id != user.id and (config.for_role_id is None or config.for_role_id != user.role_id):
                self.log_error('{what} #{what_id} is not applicable for user {user_name} (#{user_id})'
                               .format(what=DocumentDigestConfig.__name__,
                                       what_id=config.pk,
                                       user_name=user.get_full_name(),
                                       user_id=user.pk))
                continue

            try:
                digest = render_digest(config=config, dst_user=user, run_date=run_date)
                if digest:
                    digest.send(log)
            except Exception as e:
                self.log_error(f'Unable to send {RenderedDigest}.\n'
                               f'Config: #{config.pk}\n'
                               f'Dst user: {user.get_full_name()} #{user.pk}\n'
                               f'Run date: {run_date}', exc_info=e)
Code example #29
    def send_notifications_packet(ntfs: List[DocumentNotification],
                                  event: str,
                                  task: BaseTask):
        documents_data = list(Document.all_objects.filter(
            pk__in={d.document_id for d in ntfs}))  # type: List[Document]
        doc_type_by_id = {dt.document_type.pk: dt.document_type for dt in documents_data}
        doc_types = [doc_type_by_id[pk] for pk in doc_type_by_id]

        doc_by_id = {}  # type: Dict[int, Document]
        for doc in documents_data:
            doc_by_id[doc.pk] = doc

        users = User.objects.filter(pk__in={d.changed_by_user_id for d in ntfs})
        user_by_id = {u.pk: u for u in users}

        handlers_by_doctype = {d: build_field_handlers(d, include_annotation_fields=False)
                               for d in doc_types}  # type:Dict[str, RawdbFieldHandler]

        log = CeleryTaskLogger(task)

        # { (doc_type, event,) : [notification0, notification1, ...], ... }
        messages_by_subscr_key = {}  # type: Dict[Tuple[str, str], List[DocumentNotification]]
        # { (doc_type, event,) : [DocumentNotificationSubscription0, ... ], ... }
        subscr_by_key = {}  # type: Dict[Tuple[str, str], List[DocumentNotificationSubscription]]

        for ntf in ntfs:
            if ntf.document_id not in doc_by_id:
                continue
            document = doc_by_id[ntf.document_id]
            key = (document.document_type, ntf.event,)
            if key in messages_by_subscr_key:
                messages_by_subscr_key[key].append(ntf)
            else:
                subscriptions = DocumentNotificationSubscription.objects \
                    .filter(enabled=True,
                            document_type=document.document_type,
                            event=event,
                            recipients__isnull=False) \
                    .select_related('specified_user', 'specified_role') \
                    .prefetch_related(Prefetch('user_fields',
                                               queryset=DocumentField.objects.all().order_by('order')))
                subscr_by_key[key] = subscriptions
                messages_by_subscr_key[key] = [ntf]

        notifications_to_send = []  # type: List[RenderedNotification]

        for key in messages_by_subscr_key:
            messages = messages_by_subscr_key[key]
            subscriptions = subscr_by_key[key]
            for sub in subscriptions:
                for msg_pack in chunks(messages, sub.max_stack):
                    # render pack of notifications or just one notification
                    if len(msg_pack) < 2:
                        # render single notification
                        if msg_pack[0].document_id not in doc_by_id or \
                                not doc_by_id[msg_pack[0].document_id]:
                            raise Exception(f'Error in send_notifications_packet(1): doc '
                                            f'with id={msg_pack[0].document_id} was not obtained')
                        document = doc_by_id[msg_pack[0].document_id]
                        handlers = handlers_by_doctype[document.document_type]
                        user = user_by_id[msg_pack[0].changed_by_user_id]

                        try:
                            notification = NotificationRenderer.render_notification(
                                msg_pack[0].package_id,
                                sub,
                                DocumentNotificationSource(
                                    document=document,
                                    field_handlers=handlers,
                                    field_values=msg_pack[0].field_values,
                                    changes=msg_pack[0].changes,
                                    changed_by_user=user))
                            if notification:
                                notifications_to_send.append(notification)
                        except Exception as e:
                            log.error(f'Error in send_notifications_packet(1), '
                                      f'sending render_notification()', exc_info=e)
                    else:
                        not_sources = []  # type: List[DocumentNotificationSource]
                        # render pack of notifications in a single message
                        for msg in msg_pack:
                            if msg.document_id not in doc_by_id or \
                                    not doc_by_id[msg.document_id]:
                                raise Exception(f'Error in send_notifications_packet({len(msg_pack)}): doc '
                                                f'with id={msg.document_id} was not obtained')

                            document = doc_by_id[msg.document_id]
                            handlers = handlers_by_doctype[document.document_type]
                            user = user_by_id[msg.changed_by_user_id]
                            not_src = DocumentNotificationSource(
                                document=document,
                                field_handlers=handlers,
                                field_values=msg.field_values,
                                changes=msg.changes,
                                changed_by_user=user)
                            not_sources.append(not_src)
                        try:
                            notifications = NotificationRenderer.render_notification_pack(
                                [m.package_id for m in msg_pack],
                                sub, not_sources)
                            notifications_to_send += notifications
                        except Exception as e:
                            log.error(f'Error in send_notifications_packet(), '
                                      f'sending render_notification_pack()', exc_info=e)

        log.info(f'notification.send({len(notifications_to_send)})')
        for notification in notifications_to_send:
            notification.send(log=log)
Code example #30
    def check_email_pool(_task) -> None:
        log = CeleryTaskLogger(_task)
        for event in EmailNotificationPool.DOC_NOTIFICATION_EVENTS:
            cache_key = f'{CACHE_DOC_NOTIFICATION_PREFIX}{event}'
            try:
                cached_msgs_count = ObjectStorage.objects.filter(pk__startswith=cache_key).count()
                if not cached_msgs_count:
                    continue
                if cached_msgs_count < EmailNotificationPool.batch_size:
                    latest_msg_time = ObjectStorage.objects.filter(
                        pk__startswith=cache_key).aggregate(Min('last_updated'))
                    latest_msg_time = [latest_msg_time[k] for k in latest_msg_time][0]
                    delta = now() - latest_msg_time
                    if delta.seconds < EmailNotificationPool.batch_seconds:
                        continue

                ntfs = []  # type:List[DocumentNotification]
                for raw_msg in ObjectStorage.objects.filter(pk__startswith=cache_key):  # type: ObjectStorage
                    try:
                        msg = pickle.loads(raw_msg.data)  # type: DocumentNotification
                        ntfs.append(msg)
                    except Exception:
                        log.error('check_email_pool() - error unpickling raw_msg.data')

                if not ntfs:
                    continue
            except Exception as e:
                log.error(f'Error in check_email_pool(), extracting pool messages: {e}')
                continue
            try:
                log.info(f'send_notifications_packet({len(ntfs)})')
                EmailNotificationPool.send_notifications_packet(ntfs, event, _task)
            except Exception as e:
                log.error(f'Error in check_email_pool(), sending package: {e}')
                continue

            try:
                ObjectStorage.objects.filter(pk__startswith=cache_key).delete()
            except Exception as e:
                log.error(f'Error in check_email_pool(), deleting pool objects: {e}')
                continue