Example #1
    def process(self, **kwargs):
        document_field_arg = kwargs.get('document_field')
        document_field_id = document_field_arg['pk'] if document_field_arg else None

        delete_broken = kwargs.get('delete_broken')

        # check FieldValue-s
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        qs_field_values = field_repo.get_field_value_ids_by_doc_field(document_field_id)
        total_num = qs_field_values.count()
        for ids_chunk in chunks(qs_field_values.values_list('pk', flat=True), 100):
            id_list = list(ids_chunk)
            self.run_sub_tasks('Check FieldValues',
                               self.check_field_values,
                               [(id_list, delete_broken)])
            self.log_info(f'Sub-tasks started for {len(id_list)} FieldValues of total {total_num}')

        # check FieldAnnotation-s
        qs_ants = field_repo.get_fieldant_ids_by_doc_field(document_field_id)
        total_num = qs_ants.count()
        for ids_chunk in chunks(qs_ants.values_list('pk', flat=True), 100):
            id_list = list(ids_chunk)
            self.run_sub_tasks('Check FieldAnnotations',
                               self.check_annotations,
                               [(id_list, delete_broken)])
            self.log_info(f'Sub-tasks started for {len(id_list)} FieldAnnotations of total {total_num}')
    def test_slicing_used(self):
        col = MySequence([1, 2, 3, 4, 5, 6, 7, 8, 9])
        actual = list(chunks(col, 4))
        expected = [[1, 2, 3, 4], [5, 6, 7, 8], [9]]
        self.assertEqual(expected, actual)
        self.assertTrue(col.slicing_used)
        self.assertFalse(col.iterator_used)
    def sync_imanage_config(self, imanage_config: IManageConfig):
        auth_token = imanage_config.login()

        # Step 1: Find documents about which we don't know and store their ids in IManageDocument table
        imanage_docs = imanage_config.search_documents(auth_token)
        self.log_info('Found {0} documents at imanage server'.format(len(imanage_docs) if imanage_docs else 0))

        with connection.cursor() as cursor:
            for docs_chunk in chunks(imanage_docs, 50):  # type: List[Dict]
                insert_clause = 'insert into "{table_name}" ' \
                                '(imanage_config_id, imanage_doc_id, imanage_doc_number, imanage_doc_data, ' \
                                ' import_problem) ' \
                                'values {values_place_holders} on conflict do nothing'.format(
                    table_name=IManageDocument._meta.db_table,
                    values_place_holders=', '.join(['(%s, %s, %s, %s, %s)'] * len(docs_chunk)))
                params = list(chain(*[(imanage_config.pk,
                                       str(doc['id']),
                                       str(doc.get('document_number')) if 'document_number' in doc else None,
                                       json.dumps(doc),
                                       False)
                                      for doc in docs_chunk]))
                cursor.execute(insert_clause, params)

        # Step 2. Get iManage doc ids for which we don't have Contraxsuite Documents created
        # Further we can add re-reading them from iManage by some logic
        args = [(imanage_config.id, imanage_doc_id) for imanage_doc_id in
                IManageDocument.objects.filter(imanage_config=imanage_config,
                                               import_problem=False,
                                               document__isnull=True).values_list('imanage_doc_id', flat=True)]
        imanage_config.last_sync_start = timezone.now()
        imanage_config.save(update_fields=['last_sync_start'])
        self.log_info('Found {0} new imanage documents for which we do not have Contraxsuite documents'
                      .format(len(args) if args else 0))
        self.run_sub_tasks('Sync iManage documents for config: {0}'.format(imanage_config.code),
                           IManageSynchronization.sync_imanage_document, args)
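
For orientation, this is how the placeholder string from Step 1 comes together for a small chunk; the document dicts below are made up purely for illustration:

# Illustration only: a hypothetical chunk of three iManage documents.
docs_chunk = [{'id': 1, 'document_number': 'A-1'},
              {'id': 2},
              {'id': 3, 'document_number': 'A-3'}]

values_place_holders = ', '.join(['(%s, %s, %s, %s, %s)'] * len(docs_chunk))
# -> '(%s, %s, %s, %s, %s), (%s, %s, %s, %s, %s), (%s, %s, %s, %s, %s)'
# The params list is flattened the same way: five values per document, in placeholder order,
# so a single cursor.execute() inserts the whole chunk in one round trip.
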
    def test_generators(self):
        def gen() -> Generator[int, None, None]:
            for i in range(1, 10, 1):
                yield i

        actual = list(chunks(gen(), 4))
        expected = [[1, 2, 3, 4], [5, 6, 7, 8], [9]]
        self.assertEqual(expected, actual)
Example #5
def index_documents(task: ExtendedTask,
                    doc_ids: List,
                    changed_by_user_id: int = None,
                    cache_system_fields: FieldSpec = True,
                    cache_generic_fields: FieldSpec = True,
                    cache_user_fields: bool = True):
    """
    Index documents (cache document fields) in parallel. The set of document ids is split into chunks,
    and a sub-task is started for each sub-list.
    """
    args = [(sub_list, changed_by_user_id, cache_system_fields, cache_generic_fields, cache_user_fields)
            for sub_list in chunks(doc_ids, DOC_NUM_PER_SUB_TASK)]
    task.run_sub_tasks('Reindex documents', cache_document_fields_for_doc_ids_tracked, args)
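
To make the splitting concrete, here is a small illustration of how the args list comes out for a handful of doc ids; the ids and chunk size are invented, while DOC_NUM_PER_SUB_TASK is the project's own constant:

# Hypothetical values for illustration only.
doc_ids = [101, 102, 103, 104, 105]
chunk_size = 2  # stands in for DOC_NUM_PER_SUB_TASK

args = [(sub_list, None, True, True, True) for sub_list in chunks(doc_ids, chunk_size)]
# args == [([101, 102], None, True, True, True),
#          ([103, 104], None, True, True, True),
#          ([105], None, True, True, True)]
# run_sub_tasks() then starts one sub-task per tuple.
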
Example #6
    def process(self,
                document_type_config_csv_file: Dict,
                action: str,
                update_cache: bool,
                **kwargs):

        if action == 'validate':
            save = False
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'validate|import':
            save = True
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'import|auto_fix|retain_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = False
        elif action == 'import|auto_fix|remove_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = True
        else:
            raise RuntimeError('Unknown action')

        try:
            json_bytes = DbCache.get(document_type_config_csv_file['cache_key'])
            document_type = import_document_type(json_bytes=json_bytes,
                                                 save=save,
                                                 auto_fix_validation_errors=auto_fix_validation_errors,
                                                 remove_missed_in_dump_objects=remove_missed_objects,
                                                 task=self)
        finally:
            DbCache.clean_cache(document_type_config_csv_file['cache_key'])

        if not (save and update_cache):
            return

        from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
        if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
            self.log_info('Adapting RawDB table structure after import ...')
            adapt_table_structure(CeleryTaskLogger(self), document_type, force=False)
        ids = Document.all_objects.filter(document_type=document_type).values_list('pk', flat=True)
        ids = list(ids)
        self.log_info('Caching document field values ...')

        for chunk in chunks(ids, 50):
            self.run_sub_tasks('Cache field values for a set of documents',
                               ImportDocumentType.cache_document_fields_for_doc_ids,
                               [(list(chunk),)])
Example #7
def plan_process_documents_status_changed(doc_ids: Iterable, new_status_id: int, changed_by_user_id: int):
    """
    Plans processing of the document status change. Starts multiple tasks, N doc ids per task, to avoid
    overloading RabbitMQ if a very large set of doc ids is provided.
    Each started task will be shown in the admin task list and may start any number of sub-tasks to parallelize
    the processing.
    :param doc_ids:
    :param new_status_id:
    :param changed_by_user_id:
    :return:
    """
    for doc_ids_chunk in chunks(doc_ids, DOC_NUMBER_PER_MAIN_TASK):
        call_task_func(process_documents_status_changed,
                       (doc_ids_chunk, new_status_id, changed_by_user_id),
                       changed_by_user_id)
def plan_reindex_tasks_in_chunks(all_doc_ids: Iterable,
                                 changed_by_user_id: int = None,
                                 cache_system_fields: FieldSpec = True,
                                 cache_generic_fields: FieldSpec = True,
                                 cache_user_fields: bool = True):
    """
    Plans document reindexing. Splits the provided set of doc ids into chunks and runs N main tasks, which are
    displayed in the admin task list. Splitting is done to avoid overloading RabbitMQ with a possibly too large
    argument list. Started tasks may split their processing into any number of sub-tasks to parallelize the work.
    """
    for doc_ids_chunk in chunks(all_doc_ids, DOC_NUM_PER_MAIN_TASK):
        call_task_func(index_documents,
                       (doc_ids_chunk, changed_by_user_id, cache_system_fields,
                        cache_generic_fields, cache_user_fields),
                       changed_by_user_id)
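
Both plan_* functions above use the same two-level fan-out: the caller splits the full id set into main tasks, and each main task splits its slice again into sub-tasks. A rough sketch of the resulting batches, with illustrative sizes rather than the project's real constants:

# Illustrative sizes; the project defines its own DOC_NUM_PER_MAIN_TASK / DOC_NUM_PER_SUB_TASK.
MAIN_TASK_SIZE = 4
SUB_TASK_SIZE = 2

all_doc_ids = list(range(1, 10))  # 9 documents

main_batches = list(chunks(all_doc_ids, MAIN_TASK_SIZE))
# -> [[1, 2, 3, 4], [5, 6, 7, 8], [9]]           one main task (e.g. index_documents) per batch

sub_batches = [list(chunks(batch, SUB_TASK_SIZE)) for batch in main_batches]
# -> [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9]]]  each main task fans out into its own sub-tasks
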
    def test_iterator_used(self):
        col = MyIterable([1, 2, 3, 4, 5, 6, 7, 8, 9])
        actual = list(chunks(col, 4))
        expected = [[1, 2, 3, 4], [5, 6, 7, 8], [9]]
        self.assertEqual(expected, actual)
        self.assertTrue(col.iterator_used)

    def test_sets(self):
        col = {1, 2, 3, 4, 5, 6, 7, 8, 9}
        actual = list(chunks(col, 4))
        expected = [[1, 2, 3, 4], [5, 6, 7, 8], [9]]
        self.assertEqual(expected, actual)
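
The tests above pin down the expected behaviour of the chunks helper: sequences are split by slicing, while generators, sets and other iterables are consumed item by item. A minimal sketch of a generator that satisfies them follows; the actual implementation in the codebase may differ:

from collections.abc import Sequence
from typing import Any, Generator, Iterable, List

def chunks(data: Iterable[Any], n: int) -> Generator[List[Any], None, None]:
    if isinstance(data, Sequence):
        # Sequences (lists, tuples, custom Sequence types) are split with slicing.
        for i in range(0, len(data), n):
            yield list(data[i:i + n])
    else:
        # Generators, sets and other iterables are consumed item by item.
        chunk = []
        for item in data:
            chunk.append(item)
            if len(chunk) == n:
                yield chunk
                chunk = []
        if chunk:
            yield chunk
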
    def send_notifications_packet(ntfs: List[DocumentNotification],
                                  event: str,
                                  task: BaseTask):
        documents_data = list(Document.all_objects.filter(
            pk__in={d.document_id for d in ntfs}))  # type: List[Document]
        doc_type_by_id = {dt.document_type.pk: dt.document_type for dt in documents_data}
        doc_types = [doc_type_by_id[pk] for pk in doc_type_by_id]

        doc_by_id = {}  # type: Dict[int, Document]
        for doc in documents_data:
            doc_by_id[doc.pk] = doc

        users = User.objects.filter(pk__in={d.changed_by_user_id for d in ntfs})
        user_by_id = {u.pk: u for u in users}

        handlers_by_doctype = {d: build_field_handlers(d, include_annotation_fields=False)
                               for d in doc_types}  # type:Dict[str, RawdbFieldHandler]

        log = CeleryTaskLogger(task)

        # { (doc_type, event,) : [notification0, notification1, ...], ... }
        messages_by_subscr_key = {}  # type: Dict[Tuple[str, str], List[DocumentNotification]]
        # { (doc_type, event,) : [DocumentNotificationSubscription0, ... ], ... }
        subscr_by_key = {}  # type: Dict[Tuple[str, str], List[DocumentNotificationSubscription]]

        for ntf in ntfs:
            if ntf.document_id not in doc_by_id:
                continue
            document = doc_by_id[ntf.document_id]
            key = (document.document_type, ntf.event,)
            if key in messages_by_subscr_key:
                messages_by_subscr_key[key].append(ntf)
            else:
                subscriptions = DocumentNotificationSubscription.objects \
                    .filter(enabled=True,
                            document_type=document.document_type,
                            event=event,
                            recipients__isnull=False) \
                    .select_related('specified_user', 'specified_role') \
                    .prefetch_related(Prefetch('user_fields',
                                               queryset=DocumentField.objects.all().order_by('order')))
                subscr_by_key[key] = subscriptions
                messages_by_subscr_key[key] = [ntf]

        notifications_to_send = []  # type: List[RenderedNotification]

        for key in messages_by_subscr_key:
            messages = messages_by_subscr_key[key]
            subscriptions = subscr_by_key[key]
            for sub in subscriptions:
                for msg_pack in chunks(messages, sub.max_stack):
                    # render pack of notifications or just one notification
                    if len(msg_pack) < 2:
                        # render single notification
                        if msg_pack[0].document_id not in doc_by_id or \
                                not doc_by_id[msg_pack[0].document_id]:
                            raise Exception(f'Error in send_notifications_packet(1): doc '
                                            f'with id={msg_pack[0].document_id} was not obtained')
                        document = doc_by_id[msg_pack[0].document_id]
                        handlers = handlers_by_doctype[document.document_type]
                        user = user_by_id[msg_pack[0].changed_by_user_id]

                        try:
                            notification = NotificationRenderer.render_notification(
                                msg_pack[0].package_id,
                                sub,
                                DocumentNotificationSource(
                                    document=document,
                                    field_handlers=handlers,
                                    field_values=msg_pack[0].field_values,
                                    changes=msg_pack[0].changes,
                                    changed_by_user=user))
                            if notification:
                                notifications_to_send.append(notification)
                        except Exception as e:
                            log.error(f'Error in send_notifications_packet(1), '
                                      f'sending render_notification()', exc_info=e)
                    else:
                        not_sources = []  # type: List[DocumentNotificationSource]
                        # render pack of notifications in a single message
                        for msg in msg_pack:
                            if msg.document_id not in doc_by_id or \
                                    not doc_by_id[msg.document_id]:
                                raise Exception(f'Error in send_notifications_packet({len(msg_pack)}): doc '
                                                f'with id={msg.document_id} was not obtained')

                            document = doc_by_id[msg.document_id]
                            handlers = handlers_by_doctype[document.document_type]
                            user = user_by_id[msg.changed_by_user_id]
                            not_src = DocumentNotificationSource(
                                document=document,
                                field_handlers=handlers,
                                field_values=msg.field_values,
                                changes=msg.changes,
                                changed_by_user=user)
                            not_sources.append(not_src)
                        try:
                            notifications = NotificationRenderer.render_notification_pack(
                                [m.package_id for m in msg_pack],
                                sub, not_sources)
                            notifications_to_send += notifications
                        except Exception as e:
                            log.error(f'Error in send_notifications_packet(), '
                                      f'sending render_notification_pack()', exc_info=e)

        log.info(f'notification.send({len(notifications_to_send)})')
        for notification in notifications_to_send:
            notification.send(log=log)
Example #12
def process_documents_assignee_changed(task: ExtendedTask, doc_ids: List, new_assignee_id: int,
                                       changed_by_user_id: int):
    task.run_sub_tasks('Process doc assignee change',
                       _process_documents_assignee_changed,
                       [(ids_chunk, new_assignee_id, changed_by_user_id)
                        for ids_chunk in chunks(doc_ids, DOC_NUMBER_PER_SUB_TASK)])