Example #1
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log: process logger used to report progress and errors
    :param document: document to detect field values for
    :param save: if True, store the detected values and the cached field values
    :return: list of detected field values (DetectedFieldValue)
    """

    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info(
            'Forbidden storing detected field values for document with "completed"'
            ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set())
                       for f in all_fields]
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f
                                for f in all_fields
                                }  # type: Dict[str, DocumentField]

    field_values_pre_cached = False

    res = list()
    for field_code in sorted_codes:
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy
        if not field_values_pre_cached \
                and field_detection_strategy.uses_cached_document_field_values(field):
            # Pre-cache Document.field_values structure for the usage in field detection strategies
            document.field_values = field_value_cache.cache_field_values(
                document, None, save=False)
            field_values_pre_cached = True

        detected_values = field_detection_strategy.detect_field_values(
            log, document, field)  # type: List[DetectedFieldValue]
        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document, res, save=True, log=log)

    return res
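Every example in this listing receives a ProcessLogger. A minimal console-backed stand-in is sketched below; it is an illustrative stub based only on the methods these examples actually call (info, debug, error with an optional exc_info, set_progress_steps_number, step_progress) and is not the project's real ProcessLogger implementation.

import logging
import traceback


class ConsoleProcessLogger:
    """Illustrative ProcessLogger-compatible stub for trying out the examples in this listing."""

    def __init__(self):
        self._log = logging.getLogger('process')
        self._steps = 0
        self._step = 0

    def info(self, message: str, **kwargs):
        self._log.info(message)

    def debug(self, message: str, **kwargs):
        self._log.debug(message)

    def error(self, message: str, exc_info=None, **kwargs):
        if exc_info is not None:
            trace = ''.join(traceback.format_exception(type(exc_info), exc_info, exc_info.__traceback__))
            message = f'{message}\n{trace}'
        self._log.error(message)

    def set_progress_steps_number(self, steps: int):
        # used by apply_simple_config() further down in this listing
        self._steps = steps
        self._step = 0

    def step_progress(self):
        self._step += 1
        self._log.info(f'Progress: {self._step}/{self._steps}')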
Example #2
    def detect_field_value(cls,
                           log: ProcessLogger,
                           doc: Document,
                           field: DocumentField,
                           field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                                 depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        try:
            classifier_model = ClassifierModel.objects.get(document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()
            typed_field = TypedField.by(field)  # type: TypedField

            ants = list()  # type: List[AnnotationDTO]

            qs_text_units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            units_counted = 0
            for text_unit in qs_text_units.iterator():
                if field.detect_limit_count:
                    units_counted = FieldDetectionStrategy.update_units_counted(
                        field, units_counted, text_unit)
                    if units_counted > field.detect_limit_count:
                        break

                ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                    typed_field=typed_field,
                                                    document=doc,
                                                    field=field,
                                                    text_unit=text_unit)
                if ant is None:
                    continue
                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    if ant.location_in_doc_start > field.detect_limit_count:
                        break

                ants.append(ant)
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    units_counted += len(text_unit.text)

            if not ants:
                return None

            return FieldValueDTO(field_value=typed_field.build_json_field_value_from_json_ant_values([a.annotation_value
                                                                                                      for a in ants]),
                                 annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(f'Classifier model does not exist for field: {field.code}')
            raise e
Example #3
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

                count = 0
                for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                    if entities:
                        entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                        count += len(entities)

                tag_models = list()
                from apps.document.app_vars import LOCATE_TEXTUNITTAGS
                if LOCATE_TEXTUNITTAGS.val:
                    for text_unit_id, tags in self.tags.items():
                        for tag in tags:
                            tag_models.append(TextUnitTag(user_id=user_id,
                                                          text_unit_id=text_unit_id,
                                                          tag=tag))
                    TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)
                log.info(
                    'Stored {0} usage entities and {1} tags for {2} text units'.format(
                        count, len(tag_models), len(self.processed_text_unit_ids)))
        except Exception as e:
            entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
            log.error(f'Unable to store location results.\n'
                      f'Text unit ids: {self.processed_text_unit_ids}\n'
                      f'Usage models caused the problem:\n{entities_str}', exc_info=e)
        self.save_summary(log, user_id)
Example #4
def _recreate_document_fields_table(log: ProcessLogger, table_name: str,
                                    column_defs: Dict[str, str],
                                    index_defs: Dict[str, str]):
    log.info('Recreating raw sql table: {0}'.format(table_name))

    column_def_clauses = [
        SQLClause('"{column}" {pg_type}'.format(column=column,
                                                pg_type=pg_type))
        for column, pg_type in column_defs.items()
    ]

    create_table = format_clause(
        'CREATE TABLE "{table_name}" (\n'
        '{columns}, \n'
        'FOREIGN KEY ({field_document_id}) '
        'REFERENCES document_document (id) ON DELETE CASCADE)',
        table_name=table_name,
        columns=join_clauses(', \n', column_def_clauses),
        field_document_id=FIELD_CODE_DOC_ID)  # type: SQLClause

    log.info('Create table SQL for table {0}:\n{1}\nParams: {2}'.format(
        table_name, create_table.sql, create_table.params))

    with connection.cursor() as cursor:
        cursor.execute('drop table if exists "{table_name}"'.format(
            table_name=table_name))
        cursor.execute(create_table.sql, create_table.params)
        for index_name, index_def in index_defs.items():  # type: str, str
            create_index = _build_create_index_statement(
                table_name, index_name, index_def)
            cursor.execute(create_index, [])
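For illustration only, with made-up table and column names (the real document-id column name comes from FIELD_CODE_DOC_ID), the format_clause template above produces a statement of roughly this shape:

CREATE TABLE "doc_fields_contract" (
"document_id" bigint,
"field_party_name" text,
FOREIGN KEY (document_id) REFERENCES document_document (id) ON DELETE CASCADE)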
Example #5
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    if not self.document_initial_load:
                        TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                        for entity_class in self.processed_usage_entity_classes:
                            entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

                tag_models = list()
                from apps.document.app_vars import LOCATE_TEXTUNITTAGS
                tags_saved = 0
                if LOCATE_TEXTUNITTAGS.val:
                    for text_unit_id, tags in self.tags.items():
                        for tag in tags:
                            tag_models.append(TextUnitTag(user_id=user_id,
                                                          text_unit_id=text_unit_id,
                                                          tag=tag))
                    tags_saved = SafeBulkCreate.bulk_create(TextUnitTag.objects.bulk_create, tag_models)

            # save "_usage" objects
            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if not entities:
                    continue
                count += SafeBulkCreate.bulk_create(entity_class.objects, entities)

            log.info(
                'Stored {0} usage entities and {1} tags for {2} text units'.format(
                    count, tags_saved, len(self.processed_text_unit_ids)))
        except Exception as e:
            entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
            log.error(f'Unable to store location results.\n'
                      f'Text unit ids: {self.processed_text_unit_ids}\n'
                      f'Usage models caused the problem:\n{entities_str}', exc_info=e)
Example #6
    def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
        """
        Refresh the specified materialized view and delete all refresh requests older or equal to the last request date
        taken at this method start.

        Additionally this method acquires a PG advisory lock to prevent
        parallel refreshing of the same view.
        The lock is used by the planning routine which tries to acquire the lock
        to prevent re-planning the same refresh if it is already running.
        :param view_name:
        :param log:
        :return:
        """
        try:
            with connection.cursor() as cursor:
                cursor.execute(f'update {TABLE_M_VIEW} '
                               'set status=%s where view_name=%s;',
                               [MaterializedView.VIEW_STATUS_UPDATING, view_name])
        except Exception as e:
            log.error(f'Error saving updated status for view "{view_name}": {e}')

        with transaction.atomic():
            with connection.cursor() as cursor:
                if not self.advisory_lock_by_relation_name(cursor, view_name):
                    log.info(f'Canceled refreshing materialized view: {view_name}. '
                             f'Unable to acquire the advisory lock.')
                    cursor.execute(f'update {TABLE_M_VIEW} '
                                   'set status=%s where view_name=%s;',
                                   [MaterializedView.VIEW_STATUS_UPDATED, view_name])
                    return
                log.info(f'Refreshing materialized view: {view_name}.')
                cursor.execute('select max(request_date) '
                               f'from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s;', [view_name])
                row = cursor.fetchone()
                request_date = row[0] if row else None

                concurrency_clause = ''
                from apps.materialized_views.app_vars import CONCURRENCY_UPDATE
                if CONCURRENCY_UPDATE.val:
                    concurrency_clause = ' CONCURRENTLY'
                cursor.execute(f'refresh materialized view{concurrency_clause} {view_name};')

                if request_date is not None:
                    cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                                   'where view_name = %s and request_date <= %s',
                                   [view_name, request_date])
                else:
                    cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                                   'where view_name = %s',
                                   [view_name])
                dt_now = timezone.now()
                cursor.execute(f'insert into {TABLE_M_VIEW} '
                               '(view_name, refresh_date, status) '
                               'values (%s, %s, %s) '
                               'on conflict (view_name) do update set refresh_date = %s, '
                               'status = %s;',
                               [view_name, dt_now, MaterializedView.VIEW_STATUS_UPDATED,
                                dt_now, MaterializedView.VIEW_STATUS_UPDATED])
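The refresh and planning routines in this listing call self.advisory_lock_by_relation_name, whose body is not shown. A hedged sketch of how such a helper could be built on PostgreSQL transaction-scoped advisory locks (the project's real helper may differ):

def advisory_lock_by_relation_name(cursor, relation_name: str) -> bool:
    # pg_try_advisory_xact_lock() returns immediately with true/false and, when acquired,
    # keeps the lock until the enclosing transaction ends - matching how the lock is used
    # inside transaction.atomic() in the methods above and below.
    cursor.execute('select pg_try_advisory_xact_lock(hashtext(%s));', [relation_name])
    return cursor.fetchone()[0]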
Example #7
    def parse_file_local_xhtml(self,
                               local_path: str,
                               original_file_name: str,
                               timeout: int = 60,
                               encoding_name: str = 'utf-8',
                               logger: ProcessLogger = None,
                               enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will return XHTML and TikaXhtmlParser then will parse XHTML into plain text
        plus extra formatting information plus metadata.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        def err(line):
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        for cmd_list in [
                self.tika_default_command_list,
                self.tika_lexnlp_default_command_list
        ]:
            cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]

            last_try = cmd == self.tika_lexnlp_default_command_list
            text = read_output(cmd,
                               stderr_callback=err,
                               encoding=encoding_name,
                               timeout_sec=timeout) or ''
            try:
                output = self.xhtml_parser.parse_text(text)
                output_len = len(output.text) if output and output.text else 0
                logger.info(
                    f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}'
                )
                if not output_len and not last_try:
                    continue

                output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = \
                    {
                        'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                        'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
                    }
                return output
            except Exception as ex:
                text_sample = text[:255] if text and isinstance(
                    text, str) else str(text)
                raise Exception(
                    'Error in parse_file_local_xhtml -> xhtml_parser.parse_text(). Text:\n' +
                    text_sample) from ex
Example #8
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        log.debug(
            'detect_field_value: regexps_and_text_based_ml_field_value, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')

        ants: List[AnnotationDTO] = []
        text_unit_repo = cls.text_unit_repo
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()

            for text_unit in qs_text_units.iterator():  # type: TextUnit
                ant = cls.predict_and_extract_value(
                    sklearn_model=sklearn_model,
                    typed_field=typed_field,
                    document=doc,
                    field=field,
                    text=text_unit.text,
                    location_start=text_unit.location_start,
                    location_end=text_unit.location_end)
                if ant is None:
                    continue
                ants.append(ant)
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=ants)
            if not ants:
                return None

            return FieldValueDTO(field_value=typed_field.
                                 build_json_field_value_from_json_ant_values(
                                     [a.annotation_value for a in ants]),
                                 annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(
                f'Classifier model does not exist for field: {field.code}')
            raise e
Example #9
def there_are_non_indexed_docs_not_planned_to_index(
        document_type: DocumentType, log: ProcessLogger) -> bool:
    for doc_id in non_indexed_doc_ids_not_planned_to_index(document_type, 1):
        if doc_id:
            task_name = _get_reindex_task_name()
            fields_table = doc_fields_table_name(document_type.code)
            log.info(
                f'there_are_non_indexed_docs_not_planned_to_index: '
                f'found document id={doc_id} of type {document_type.code}, '
                f'task {task_name}. Fields table: {fields_table}')
            return True
    return False
Example #10
    def parse_file_local_plain_text(self,
                                    local_path: str,
                                    original_file_name: str,
                                    task: Any,
                                    timeout: int = 60,
                                    encoding_name: str = 'utf-8',
                                    logger: ProcessLogger = None,
                                    enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will use plain text "stripper" and transform the source document into plain text
        inside its (Java) process.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param task: task object passed through to read_output
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
        # don't use TIKA_MODE_PDF_ONLY here at all
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
        os.environ[self.TIKA_PARSER_DETAIL] = ''

        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        cmd = tika_default_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def err(line):
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''

        try:
            ptr_val = _parse((200, text))
            return MarkedUpText(text=ptr_val['content'],
                                meta=ptr_val['metadata'])
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text,
                                                            str) else str(text)
            raise Exception(
                'Error in parse_file_local_plain_text -> _parse(). Text:\n' +
                text_sample) from ex
Example #11
    def plan_refreshes(self, log: ProcessLogger, refresh_task_name: str,
                       plan_task_func: Callable[[str, datetime], None]):
        """
        Checks if there are materialized view refresh requests older than N seconds and plans the refreshing.
        The requests are inserted into the corresponding table by the document loading routines or any other
        code which changes the data on which these views are based.
        Maybe they will be replaced by a DB trigger in future.
        :param plan_task_func:
        :param log:
        :return:
        """

        from apps.materialized_views.app_vars import REFRESH_DELAY
        refresh_delay_sec = REFRESH_DELAY.val
        to_refresh = list()
        with connection.cursor() as cursor:
            cursor.execute(
                f'''select view_name, max(request_date) 
                               from {TABLE_M_VIEW_REQUEST}
                               where to_jsonb(view_name) not in 
                                     (select args->0 from task_task where name = %s and own_status = %s) 
                               group by view_name''',
                (refresh_task_name, PENDING))
            for view_name, max_request_date in cursor.fetchall(
            ):  # type: str, datetime
                if timezone.now() - max_request_date > timedelta(
                        seconds=refresh_delay_sec):
                    to_refresh.append(view_name)

        # Here we use PG advisory locks to prevent planning the materialized view refresh if the refresh
        # is already being executed.
        # The same lock is acquired in refresh_materialized_view() by any Celery worker (maybe on a different machine)
        # which is running the refresh of the same view.

        # And the following code running in Celery-beat on the master machine checks if the "refresh" is in progress
        # by trying to acquire the lock.

        for view_name in to_refresh:
            with transaction.atomic():
                # We need to execute it in a separate transaction to release the PG advisory lock
                # before executing plan_task_func.
                # Cursor is closed on the transaction end. So we initialize it here and don't re-use.
                with connection.cursor() as cursor:
                    locked = self.advisory_lock_by_relation_name(
                        cursor, view_name)

            if locked:
                log.info(
                    f'Planning refresh for materialized view {view_name}.')
                plan_task_func(view_name)
Example #12
def document_fields_change_listener_impl(
        _sender,
        signal,
        log: ProcessLogger,
        document_event: str,
        document_pk: int,
        field_handlers: Dict[str, RawdbFieldHandler],
        fields_before: Optional[Dict],
        fields_after: Optional[Dict],
        changed_by_user: User = None):
    from apps.notifications.tasks import process_notifications_on_document_change
    if not changed_by_user:
        # we ignore changes made by the system at the moment
        return

    if not fields_before and not fields_after:
        log.error(
            'Document fields changed event appeared with both "before" and "after" fields empty.'
        )
        return

    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    if APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val:
        return

    process_notifications_on_document_change(lambda m: log.info(m),
                                             document_event, document_pk,
                                             fields_before, fields_after,
                                             changed_by_user.pk)
Example #13
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    TextUnitTag.objects.filter(
                        text_unit_id__in=self.processed_text_unit_ids).delete(
                        )
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(
                            text_unit_id__in=self.processed_text_unit_ids
                        ).delete()

                count = 0
                for entity_class, entities in self.located_usage_entities.items(
                ):  # type: Type[Usage], List[Usage]
                    if entities:
                        entity_class.objects.bulk_create(entities,
                                                         ignore_conflicts=True)
                        count += len(entities)

                tag_models = list()
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(
                            TextUnitTag(user_id=user_id,
                                        text_unit_id=text_unit_id,
                                        tag=tag))
                TextUnitTag.objects.bulk_create(tag_models,
                                                ignore_conflicts=True)
                log.info(
                    'Stored {0} usage entities and {1} tags for {2} text units'
                    .format(count, len(tag_models),
                            len(self.processed_text_unit_ids)))
        except Exception:
            msg = render_error(
                'Unable to store location results.\n'
                'Text unit ids: {text_unit_ids}\n'
                'Usage models caused the problem:\n{entities}'.format(
                    text_unit_ids=self.processed_text_unit_ids,
                    entities='\n'.join([
                        str(e) for e in self.processed_usage_entity_classes
                    ])))
            log.error(msg)
Example #14
    def train_document_field_detector_model(
        cls,
        log: ProcessLogger,
        field: DocumentField,
        train_data_project_ids: Optional[List],
        use_only_confirmed_field_values: bool = False
    ) -> Optional[ClassifierModel]:
        log.info(f'Training model for field {field.code} (#{field.pk})...')

        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data_sets = cls.get_train_datasets_from_projects(
                field.pk, train_data_project_ids)
        else:
            train_data_sets = cls.get_train_data_sets(field,
                                                      train_data_project_ids)

        if not train_data_sets:
            log.info(
                'Not enough data to train model for document_type #{0} and field #{1}.'
                .format(field.document_type.pk, field.pk))
            return None

        classifier_model = cls.train_model(field, train_data_sets)
        log.info(
            'Finished training model for document_type #{0} and field #{1}.'.
            format(field.document_type.pk, field.pk))

        return classifier_model
Example #15
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False,
                                            split_and_log_out_of_sample_test_report: bool = False) \
            -> Optional[ClassifierModel]:
        log.info(f'Training model for field {field.code} (#{field.pk})...')

        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data_sets = cls.get_train_datasets_from_projects(
                field.pk, train_data_project_ids)
        else:
            train_data_sets = cls.get_train_data_sets(field,
                                                      train_data_project_ids)

        if not train_data_sets:
            log.info(
                f'Not enough data to train model for document_type {field.document_type.code}, field: {field.code}.'
            )
            return None

        classifier_model = cls.train_model(
            log, field, train_data_sets,
            split_and_log_out_of_sample_test_report)
        log.info(
            f'Finished training model for document_type {field.document_type.code}, field: {field.code}.'
        )

        return classifier_model
Example #16
    def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
        """
        Refresh the specified materialized view and delete all refresh requests older or equal to the last request date
        taken at this method start.

        Additionally this method acquires a PG advisory lock to prevent
        parallel refreshing of the same view.
        The lock is used by the planning routine which tries to acquire the lock
        to prevent re-planning the same refresh if it is already running.
        :param view_name:
        :param log:
        :return:
        """
        with transaction.atomic():
            with connection.cursor() as cursor:
                if not self.advisory_lock_by_relation_name(cursor, view_name):
                    log.info(f'Canceled refreshing materialized view: {view_name}. '
                             f'Unable to acquire the advisory lock.')
                    return
                log.info(f'Refreshing materialized view: {view_name}.')
                cursor.execute('select max(request_date) '
                               'from materialized_views_materializedviewrefreshrequest '
                               'where view_name = %s', [view_name])
                row = cursor.fetchone()
                request_date = row[0] if row else None
                cursor.execute(f'refresh materialized view {view_name}')
                if request_date is not None:
                    cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                                   'where view_name = %s and request_date <= %s',
                                   [view_name, request_date])
                else:
                    cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                                   'where view_name = %s',
                                   [view_name])
                dt_now = timezone.now()
                cursor.execute('insert into materialized_views_materializedview '
                               '(view_name, refresh_date) values (%s, %s) '
                               'on conflict (view_name) do update set refresh_date = %s', [view_name, dt_now, dt_now])
Example #17
def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool,
                        csv_contains_regexps: bool = False):
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        choices = df[
            df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, df.shape[0]))
    log.set_progress_steps_number(int(row_num / 10) + 1)
    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()
    for index, row in df.iterrows():
        if len(row) == 0:
            continue

        includes = row.dropna()

        if not csv_contains_regexps:
            includes = [i.strip().replace(' ', r'\s{1,100}') for i in includes]
        includes = [i for i in includes if i]

        if len(includes) == 1:
            log.info(
                'There are no search strings specified for detected value {0}'.
                format(row[0]))
            continue

        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        detector.detected_value = row[0]
        detector.include_regexps = '\n'.join(includes[1:])
        detector.save()
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
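apply_simple_config expects a CSV in which, for every data row, the first column holds the value to detect and the remaining columns hold search strings (converted into whitespace-tolerant regexps unless csv_contains_regexps is set). A tiny made-up input for illustration; pandas treats the first row as the header:

detected_value,search_string_1,search_string_2
Delaware,state of delaware,incorporated in delaware
New York,state of new york,laws of new york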
Example #18
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param document_initial_load:
    :param ignore_field_codes: field codes to skip during detection
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :return:
    """

    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Forbidden storing detected field values for document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    required_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    res = list()
    for field_code in sorted_codes:
        if ignore_field_codes and field_code in ignore_field_codes:
            continue
        if required_fields and field_code not in required_fields:
            continue

        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            field_vals = field_value_cache.cache_field_values(document, None, save=False)
            detected_values = field_detection_strategy.detect_field_values(log,
                                                                           document,
                                                                           field,
                                                                           field_vals)  # type: List[DetectedFieldValue]
        except Exception as e:
            msg = '''Unable to detect field value. 
            Document type: {0} 
            Document: {1} 
            Field: {2}'''.format(document_type.code, document.pk, field.code)
            log.error(render_error(msg, e))
            raise e

        if save_detected and clear_old_values:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            DocumentFieldValue.objects \
                .filter(document=document,
                        field=field,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .exclude(field__value_detection_strategy=DocumentField.VD_DISABLED) \
                .delete()

        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document, suggested_field_values=res,
                                             save=True, log=log,
                                             changed_by_user=changed_by_user,
                                             system_fields_changed=system_fields_changed,
                                             generic_fields_changed=generic_fields_changed,
                                             document_initial_load=document_initial_load)

    return res
Example #19
    def get_values(self, log: ProcessLogger, field: DocumentField, doc: Document, text: str) \
            -> List[Tuple[Any, Optional[int], Optional[int]]]:

        try:
            conf = getattr(field, DST_FIELD_SIMILARITY_CONFIG_ATTR
                           )  # type: DocumentSimilarityConfig
        except DocumentSimilarityConfig.DoesNotExist:
            conf = None

        if conf:
            conf.self_validate()

        similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
        feature_vector_fields = field.depends_on_fields.all()
        date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
        date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS
        document_type = doc.document_type

        feature_vector_field_codes = {f.code for f in feature_vector_fields}

        # TODO: replace with the corresponding method call when ready
        doc_field_values = dict()
        for fv in doc.documentfieldvalue_set \
                .filter(field__code__in=feature_vector_field_codes.union({date_constraint_field_code})):
            if fv.removed_by_user:
                continue

            # keep a local name here to avoid shadowing the "field" argument used below
            fv_field = fv.field
            field_type = fv_field.get_field_type()  # type: FieldType
            doc_field_values[fv_field.code] = field_type \
                .merge_multi_python_values(doc_field_values.get(fv_field.code), fv.python_value)
        doc_field_values[FIELD_CODE_DOC_ID] = doc.pk

        doc_date = doc_field_values.get(
            date_constraint_field_code) if date_constraint_field_code else None
        if not doc_date:
            doc_date = doc.history.last().history_date
            date_constraint_field_code = FIELD_CODE_CREATE_DATE

        date_start = doc_date - timedelta(days=date_constraint_days)
        date_end = doc_date + timedelta(days=date_constraint_days)

        try:
            vectorizer = document_feature_vector_pipeline(
                feature_vector_fields, use_field_codes=True)

            rawdb = RawDbRepository()
            where = SQLClause(
                f'"{FIELD_CODE_DOC_ID}" != %s '
                f'and "{date_constraint_field_code}" >= %s '
                f'and "{date_constraint_field_code}" <= %s',
                [doc.pk, date_start, date_end])

            field_values_list = list(
                rawdb.get_field_values(
                    document_type=document_type,
                    where_sql=where,
                    field_codes=feature_vector_field_codes.union(
                        {FIELD_CODE_DOC_ID, date_constraint_field_code})))

            if not field_values_list:
                return []

            field_values_list = [doc_field_values] + field_values_list
            feature_vectors = vectorizer.fit_transform(field_values_list)
            doc_feature_vectors = feature_vectors[0]
        except ValueError as ve:
            if 'empty vocabulary' in str(ve):
                log.info(
                    f'Similarity: {field.code}: Vectorization got "empty vocabulary", probably none of the docs '
                    f'contains any value in the feature vector fields.')
                return []
            raise ve

        similarities = cosine_similarity(doc_feature_vectors, feature_vectors)

        # TODO: Think about removing usage of other_field_values_list here and switching it to generator
        # to avoid storing the list of all field values. We only need feature vectors but they have no doc id.
        res = list()  # type: List[Tuple[Any, Optional[int], Optional[int]]]
        for y, field_values in enumerate(field_values_list):
            other_doc_pk = field_values[FIELD_CODE_DOC_ID]
            if doc.pk == other_doc_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            res.append((other_doc_pk, None, None))
            self._maybe_save_reverse_similarity_value(
                log=log, field=field, document=doc, other_doc_id=other_doc_pk)

        return res
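The similarity strategy above (and the get_value variant that follows) uses the same core pattern: put the current document's feature vector first, vectorize all documents together, then compare row 0 against every row with cosine_similarity. A self-contained toy illustration of that pattern; the data and threshold are made up and unrelated to the project models:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# row 0 plays the role of the current document's feature vector fields
docs = ['alpha corp lease agreement', 'beta llc services agreement', 'lease agreement alpha corp']
feature_vectors = CountVectorizer().fit_transform(docs)
similarities = cosine_similarity(feature_vectors[0], feature_vectors)  # shape (1, len(docs))
similar_doc_indexes = [i for i in range(1, len(docs)) if similarities[0, i] >= 0.75]
print(similar_doc_indexes)  # [2]: only the document built from the same words passes the threshold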
Example #20
    def get_value(self,
                  log: ProcessLogger,
                  field: DocumentField,
                  doc: Document,
                  cur_field_code_to_value: Dict[str, Any],
                  location_text: Optional[str],
                  location_start: int = 0,
                  location_end: int = 0) -> Optional[FieldValueDTO]:

        try:
            conf = getattr(field, DST_FIELD_SIMILARITY_CONFIG_ATTR
                           )  # type: Optional[DocumentSimilarityConfig]
        except DocumentSimilarityConfig.DoesNotExist:
            conf = None

        if conf:
            conf.self_validate()

        similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
        feature_vector_fields = field.depends_on_fields.all()
        date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
        date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS
        document_type = doc.document_type

        feature_vector_field_codes = {f.code for f in feature_vector_fields}

        doc_field_values = dict(cur_field_code_to_value)
        doc_field_values[FIELD_CODE_DOC_ID] = doc.pk

        if date_constraint_field_code:
            doc_date = doc_field_values.get(date_constraint_field_code)
            date_start = doc_date - timedelta(days=date_constraint_days)
            date_end = doc_date + timedelta(days=date_constraint_days)

            doc_ids_query = FieldValue.objects \
                .filter(field__code=date_constraint_field_code) \
                .filter(value__gte=date_start) \
                .filter(value__lte=date_end) \
                .filter(document__document_type_id=document_type.pk) \
                .exclude(document_id=doc.pk) \
                .values_list('document_id', flat=True)
        else:
            doc_date = doc.history.last().history_date
            date_start = doc_date - timedelta(days=date_constraint_days)
            date_end = doc_date + timedelta(days=date_constraint_days)

            doc_ids_query = Document.history \
                .filter(history_type='+',
                        history_date__gte=date_start,
                        history_date__lte=date_end,
                        document_type_id=document_type.pk) \
                .exclude(id=doc.pk) \
                .values_list('pk', flat=True)

        try:
            vectorizer = document_feature_vector_pipeline(
                feature_vector_fields, use_field_codes=True)

            field_repo = DocumentFieldRepository()

            field_values_list = list()

            for doc_id, field_values in field_repo \
                    .get_field_code_to_python_value_multiple_docs(document_type_id=document_type.pk,
                                                                  doc_ids=doc_ids_query,
                                                                  field_codes_only=feature_vector_field_codes):
                d = dict(field_values)
                d[FIELD_CODE_DOC_ID] = doc_id
                field_values_list.append(d)

            if not field_values_list:
                return None

            field_values_list = [doc_field_values] + field_values_list
            feature_vectors = vectorizer.fit_transform(field_values_list)
            doc_feature_vectors = feature_vectors[0]
        except ValueError as ve:
            if 'empty vocabulary' in str(ve):
                log.info(
                    f'Similarity: {field.code}: Vectorization got "empty vocabulary", probably none of the docs '
                    f'contains any value in the feature vector fields.')
                return None
            raise ve

        similarities = cosine_similarity(doc_feature_vectors, feature_vectors)

        # TODO: Think about removing usage of other_field_values_list here and switching it to generator
        # to avoid storing the list of all field values. We only need feature vectors but they have no doc id.
        res = set()  # type: Set[int]
        for y, field_values in enumerate(field_values_list):
            other_doc_pk = field_values[FIELD_CODE_DOC_ID]
            if doc.pk == other_doc_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            res.add(other_doc_pk)
            self._maybe_save_reverse_similarity_value(
                log=log, field=field, document=doc, other_doc_id=other_doc_pk)

        if res:
            field_value = sorted(res)[0]
            return FieldValueDTO(field_value)
        return None
Example #21
def adapt_table_structure(log: ProcessLogger,
                          document_type: DocumentType,
                          force: bool = False,
                          check_only: bool = False) -> bool:
    """
    Create or alter raw db table for it to match the field structure of the specified document type.
    :param log:
    :param document_type:
    :param force: Force re-creating the table.
    :param check_only: Do not really alter the table but only check if the re-indexing will be required.
    :return: True/False - if any column has been added/removed/altered and re-index is required for this doc type.
    """
    table_name = doc_fields_table_name(document_type.code)

    fields = build_field_handlers(document_type, table_name)
    should_be_columns = dict()  # type: Dict[str, str]
    should_be_indexes = dict()  # type: Dict[str, str]
    for field_handler in fields:
        field_columns = field_handler.get_pg_column_definitions(
        )  # type: Dict[str, field_handlers.PgTypes]
        should_be_columns.update(
            {name: pg_type.value
             for name, pg_type in field_columns.items()})
        index_defs = field_handler.get_pg_index_definitions()
        if index_defs:
            should_be_indexes.update({
                build_index_name(table_name, index_def): index_def
                for index_def in index_defs
            })

    if not check_only and (force or not table_exists(table_name)):
        _recreate_document_fields_table(log, table_name, should_be_columns,
                                        should_be_indexes)
        cleanup_saved_filters(document_type, set(should_be_columns.keys()))
        return True

    reindex_needed = False

    dropped_columns = list()  # type: List[Tuple[str, str]]
    added_columns = list()  # type: List[Tuple[str, str]]

    with connection.cursor() as cursor:
        with transaction.atomic():
            existing_columns = get_table_columns_from_pg(
                cursor, table_name)  # type: Dict[str, str]

            alter_table_actions = list()  # type: List[str]

            for existing_name, existing_type in existing_columns.items():
                should_be_type = should_be_columns.get(existing_name)
                if not should_be_type or should_be_type != existing_type:
                    # column does not exist in "should_be_columns" or has different type
                    alter_table_actions.append(
                        'drop column "{column}"'.format(column=existing_name))
                    dropped_columns.append((existing_name, existing_type))

            for should_be_name, should_be_type in should_be_columns.items():
                existing_type = existing_columns.get(should_be_name)
                if not existing_type or existing_type != should_be_type:
                    # column does not exist in "existing_columns" or has
                    # different type (and has been dropped in prev loop)
                    alter_table_actions.append(
                        'add column "{column}" {pg_type}'.format(
                            column=should_be_name, pg_type=should_be_type))
                    added_columns.append((should_be_name, should_be_type))

            if alter_table_actions:
                if not check_only:
                    alter_table_sql = 'alter table "{table_name}"\n{actions}' \
                        .format(table_name=table_name, actions=',\n'.join(alter_table_actions))
                    cursor.execute(alter_table_sql, [])
                    log.info(
                        'Altered table: {0}\nDropped columns:\n{1}\nAdded columns:\n{2}'
                        .format(
                            table_name, '\n'.join(
                                [c + ': ' + t for c, t in dropped_columns]),
                            '\n'.join([c + ': ' + t
                                       for c, t in added_columns])))
                    cleanup_saved_filters(document_type,
                                          set(should_be_columns.keys()))
                reindex_needed = True

        if not check_only:
            # Changes in indexes do not require document re-indexing - the values will already be in the columns.
            existing_indexes = get_table_index_names_from_pg(
                cursor, table_name)  # type: Set[str]
            for existing_index_name in existing_indexes:
                if existing_index_name not in should_be_indexes:
                    cursor.execute(
                        'drop index concurrently "{index_name}"'.format(
                            index_name=existing_index_name), [])

            for should_be_index_name, should_be_index_def in should_be_indexes.items(
            ):
                if should_be_index_name not in existing_indexes:
                    create_index_sql = _build_create_index_statement(
                        table_name, should_be_index_name, should_be_index_def)
                    cursor.execute(create_index_sql, [])

    return reindex_needed
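For illustration only, with a made-up table and column name: if an existing column's type no longer matches its handler, the code above drops it and re-adds it, so alter_table_sql comes out roughly as:

alter table "doc_fields_contract"
drop column "field_total_amount",
add column "field_total_amount" double precision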
Example #22
def check_task_health(log: ProcessLogger, restart_task_func: Callable[[str],
                                                                      None]):
    """
    Find and process unhealthy tasks - the tasks which are hanging in PENDING while there is at least one
    free worker of each kind (default, high, doc_load).
    This is intended to wait silently until all other tasks are processed and then re-send the hung PENDING tasks.

    Goal state: if there are PENDING tasks which are not known by any worker,
                there should not be free workers of all kinds.
    """
    start_time = time()

    inspect_start_time = time()
    celery_stats = get_celery_stats()
    inspect_time_spent = time() - inspect_start_time

    if not celery_stats.free_workers_available_of_any_kind:
        log.info(
            f'Task health check: there are no workers at all or at least some kind of worker is still busy.\n'
            f'Not checking for the hung tasks.\n'
            f'Celery inspect time: {inspect_time_spent:.3f}s\n')
        return

    query_time_start = time()

    # There is at least one free worker of each kind.
    # This means there should be no PENDING tasks not known to workers.
    # Increasing bad health check counter for the PENDING tasks not known to workers.
    Task.objects \
        .filter(own_status='PENDING', bad_health_check_num__lt=TASK_BAD_HEALTH_CHECK_RETRIES) \
        .exclude(queue=settings.CELERY_QUEUE_SERIAL) \
        .exclude(name__in=settings.EXCLUDE_FROM_TRACKING) \
        .exclude(pk__in=celery_stats.tasks_on_workers) \
        .update(bad_health_check_num=F('bad_health_check_num') + 1)

    # Set bad counter to zero for all tasks on workers
    Task.objects \
        .filter(pk__in=celery_stats.tasks_on_workers) \
        .exclude(bad_health_check_num=0) \
        .update(bad_health_check_num=0)

    # Restart the tasks whose bad health check counter reached the threshold

    to_restart = list(
        Task.objects.filter(
            own_status='PENDING',
            bad_health_check_num=TASK_BAD_HEALTH_CHECK_RETRIES).values_list(
                'pk', 'name'))

    query_time_spent = time() - query_time_start

    restarted_tasks = list()
    could_not_restart_tasks = list()
    for task_id, task_name in to_restart:
        try:
            restart_task_func(task_id)
            restarted_tasks.append((task_id, task_name))
        except Exception as ex:
            log.error(f'Unable to restart task {task_name} ({task_id})',
                      exc_info=ex)
            could_not_restart_tasks.append((task_id, task_name))

    restarted_msg = '\n'.join(
        task_id + " " + task_name
        for task_id, task_name in restarted_tasks) if restarted_tasks else 'no'
    problem_restarting_msg = '\n'.join(
        task_id + " " + task_name for task_id, task_name in
        could_not_restart_tasks) if could_not_restart_tasks else 'no'
    log.info(
        f'Checked task health. Found {len(to_restart)} unhealthy tasks.\n'
        f'Total time: {time() - start_time:.3f}s\n'
        f'Celery inspect time: {inspect_time_spent:.3f}s\n'
        f'DB query time: {query_time_spent:.3f}s\n'
        f'Restarted tasks:\n{restarted_msg}\n'
        f'Could not restart tasks:\n{problem_restarting_msg}',
        extra={'log_unhealthy_tasks': bool(to_restart)})
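
The health check above is essentially a counter-based debounce: while free workers of every kind exist, a PENDING task unknown to any worker gets its bad health counter incremented on each round, the counter is reset as soon as some worker reports the task, and the task is restarted once the counter reaches the retry threshold. A minimal in-memory sketch of that state machine, with no Django or Celery involved (the threshold value and the function name are assumptions of this sketch):

from typing import Dict, List, Set

TASK_BAD_HEALTH_CHECK_RETRIES = 3  # assumed threshold; the real value comes from settings


def health_check_round(pending_tasks: Set[str],
                       tasks_on_workers: Set[str],
                       free_workers_of_all_kinds: bool,
                       bad_health_counters: Dict[str, int]) -> List[str]:
    """Run one health check round and return the ids of tasks that should be restarted."""
    if not free_workers_of_all_kinds:
        # Some worker kind is still busy - a PENDING task may simply be waiting for it
        return []
    # Tasks pending but unknown to any worker look unhealthy - bump their counters
    for task_id in pending_tasks - tasks_on_workers:
        bad_health_counters[task_id] = bad_health_counters.get(task_id, 0) + 1
    # Tasks reported by workers are fine - reset their counters
    for task_id in tasks_on_workers:
        bad_health_counters[task_id] = 0
    return [task_id for task_id in pending_tasks - tasks_on_workers
            if bad_health_counters[task_id] >= TASK_BAD_HEALTH_CHECK_RETRIES]


counters: Dict[str, int] = {}
for _ in range(TASK_BAD_HEALTH_CHECK_RETRIES):
    to_restart = health_check_round({'task-1'}, set(), True, counters)
print(to_restart)  # ['task-1'] after three unhealthy rounds
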
Example #23
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param ignore_field_codes:
    :param document_initial_load:
    :param updated_field_codes: if set, only the changed fields and the fields depending on them are detected
    :param skip_modified_values: do not overwrite field values already modified by a user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes() or [])) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # We could fetch only the values needed by sorted_codes (taking further dependencies
    # into account), but here we simply load all fields' values (field_codes_only=None).
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type:List[str]
    detection_errors = []  # type:List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value detected for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then current_field_values is not updated.
            # In that case most likely only a few requested fields are being detected,
            # and strictly following the dependency tree brings little benefit.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
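
The function above leans on two pieces of dependency bookkeeping: order_field_detection() must yield a field only after the fields it depends on, and get_dependent_fields() must find every field transitively affected by an updated field. A minimal sketch of both ideas, assuming the dependencies form a DAG of field codes (the function names and sample codes below are illustrative, not the project's real implementations):

from graphlib import TopologicalSorter  # Python 3.9+
from typing import Dict, List, Set, Tuple


def order_codes(fields_and_deps: List[Tuple[str, Set[str]]]) -> List[str]:
    """Order field codes so that every field comes after the fields it depends on."""
    graph: Dict[str, Set[str]] = {code: deps for code, deps in fields_and_deps}
    return list(TopologicalSorter(graph).static_order())


def dependent_codes(fields_and_deps: List[Tuple[str, Set[str]]],
                    updated: Set[str]) -> Set[str]:
    """Return the updated codes plus every code that transitively depends on them."""
    affected = set(updated)
    changed = True
    while changed:
        changed = False
        for code, deps in fields_and_deps:
            if code not in affected and deps & affected:
                affected.add(code)
                changed = True
    return affected


deps = [('party', set()), ('term', set()),
        ('penalty', {'term'}), ('summary', {'party', 'penalty'})]
print(order_codes(deps))                # e.g. ['party', 'term', 'penalty', 'summary']
print(dependent_codes(deps, {'term'}))  # {'term', 'penalty', 'summary'}
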
Example #24
    def parse_file_local_xhtml(self,
                               local_path: str,
                               original_file_name: str,
                               task: Any,
                               timeout: int = 60,
                               encoding_name: str = 'utf-8',
                               logger: ProcessLogger = None,
                               enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses a file (*.pdf, *.doc, *.docx, *.rtf, ...) by calling Tika as a local Java process.
        Tika returns XHTML which TikaXhtmlParser then parses into plain text
        plus extra formatting information and metadata.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name; may differ from local_path (which can be a temporary file path)
        :param task: calling task object, passed through to read_output
        :param timeout: timeout for interrupting the Java process, in seconds
        :param encoding_name: encoding to use, passed to Tika
        :param logger: logger object for errors and warnings
        :param enable_ocr: when True, allow converting images to text (OCR)
        :return: MarkedUpText: text + metadata
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        def err(line):
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list

        parse_commands = [
            tika_default_command_list, self.tika_default_command_list
        ]
        from apps.document.app_vars import TIKA_PROCESS_RAM_MB_LIMIT
        ram_limit = TIKA_PROCESS_RAM_MB_LIMIT.val

        for cmd_index, cmd_list in enumerate(parse_commands):
            cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]
            if ram_limit:
                # Insert the JVM heap limit right after the "java" executable
                java_index = cmd.index('java')
                cmd = cmd[:java_index + 1] + [f'-Xmx{ram_limit}m'] + cmd[java_index + 1:]
            logger.info(f'Tika (XHTML) args: {", ".join(cmd)}')

            last_try = cmd_index == len(parse_commands) - 1
            text = read_output(cmd,
                               stderr_callback=err,
                               encoding=encoding_name,
                               timeout_sec=timeout,
                               task=task) or ''
            try:
                output = self.xhtml_parser.parse_text(text)
                output_len = output.pure_text_length if output else 0
                logger.info(
                    f'parse_file_local_xhtml: {len(text)} characters of XHTML boiled down '
                    f'to {output_len} characters of plain text')
                if not output_len and not last_try:
                    continue

                output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = \
                    {
                        'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                        'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
                    }
                return output
            except Exception as ex:
                text_sample = text[:255] if text and isinstance(text, str) else str(text)
                raise Exception(
                    'Error in parse_file_local_xhtml -> xhtml_parser.parse_text(). Text:\n' +
                    text_sample) from ex
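
The command construction in the parser above is worth isolating: take a base Tika command list, append the XHTML flag, the encoding flag and the file path, and, if a RAM limit is configured, inject a JVM -Xmx option right after the java executable. A standalone sketch of that step (the base command and RAM limit below are illustrative; the real values come from tika_lexnlp_default_command_list and TIKA_PROCESS_RAM_MB_LIMIT):

from typing import List, Optional


def build_tika_cmd(base_cmd: List[str],
                   local_path: str,
                   encoding_name: str = 'utf-8',
                   ram_limit_mb: Optional[int] = None) -> List[str]:
    """Build a Tika command line requesting XHTML output, optionally capping the JVM heap."""
    cmd = base_cmd + ['-x', f'-e{encoding_name}', local_path]
    if ram_limit_mb:
        # Place -Xmx right after the "java" executable so the JVM picks it up
        java_index = cmd.index('java')
        cmd = cmd[:java_index + 1] + [f'-Xmx{ram_limit_mb}m'] + cmd[java_index + 1:]
    return cmd


base = ['java', '-cp', 'tika-app.jar', 'org.apache.tika.cli.TikaCLI']
print(build_tika_cmd(base, '/tmp/contract.pdf', ram_limit_mb=2048))
# ['java', '-Xmx2048m', '-cp', 'tika-app.jar', 'org.apache.tika.cli.TikaCLI', '-x', '-eutf-8', '/tmp/contract.pdf']
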
Example #25
    def train_model(cls, log: ProcessLogger, field: DocumentField, train_data_sets: List[List[dict]],
                    split_and_log_out_of_sample_test_report: bool = False) -> ClassifierModel:
        typed_field = TypedField.by(field)
        df = pd.DataFrame.from_records(train_data_sets.pop(0))
        # add transferred external data
        for train_data in train_data_sets:
            df = df.append(pd.DataFrame.from_records(train_data))

        df['target_name'] = df.apply(lambda row: encode_category(
            field.code,
            row.value if typed_field.is_choice_field else None,
            row.extraction_hint), axis=1)

        df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

        df = df.append(
            [{'text_unit__textunittext__text': i} for i in
             cls.get_no_field_text_units(field.document_type, field.text_unit_type)])

        df['target_index'] = df['target_index'].fillna(0).astype('int')
        df['target_name'] = df['target_name'].fillna(SkLearnClassifierModel.EMPTY_CAT_NAME).astype(
            'str')
        df['user_input'] = df['modified_by'].fillna(0).astype('bool')

        res_df = pd.DataFrame()

        for group_index, group_df in df.groupby('target_index'):
            if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
                group_df = shuffle(
                    group_df.sort_values('user_input', ascending=False)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
            res_df = res_df.append(group_df)
        res_df = shuffle(res_df)

        target_names = sorted(res_df['target_name'].unique())

        clf = None
        if field.classifier_init_script:
            try:
                clf = cls.init_classifier(field)
            except Exception as e:
                log.error(f'Unable to initialize classifier for field {field.code}. '
                          f'Classifier init script: {field.classifier_init_script}', exc_info=e)
        # Fall back to the default classifier when no init script is set or initialization failed
        if clf is None:
            clf = SGDClassifier(loss='hinge', penalty='l2',
                                alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                                class_weight='balanced')

        log.info(f'Classifier initialized: {clf}')

        text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', analyzer='word',
                                                      stop_words='english',
                                                      tokenizer=word_position_tokenizer)),
                             ('tfidf', TfidfTransformer()),
                             ('clf', clf),
                             ])
        x = res_df['text_unit__textunittext__text']
        y = res_df['target_index']

        if split_and_log_out_of_sample_test_report:
            x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
        else:
            x_train, x_test_os, y_train, y_test_os = x, None, y, None

        sklearn_model = text_clf.fit(x_train, y_train)

        model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

        classifier_model = ClassifierModel()
        classifier_model.set_trained_model_obj(model)
        classifier_model.document_field = field

        classifier_model.classifier_accuracy_report_in_sample = \
            classification_report(y,
                                  text_clf.predict(x),
                                  target_names=target_names)

        if y_test_os is not None and x_test_os is not None:
            classifier_model.classifier_accuracy_report_out_of_sample = \
                classification_report(y_test_os,
                                      text_clf.predict(x_test_os),
                                      target_names=target_names)

        return classifier_model
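
For readers less familiar with the scikit-learn side of the training code above, here is a minimal self-contained version of the same CountVectorizer -> TF-IDF -> SGDClassifier pipeline on toy data (the texts and labels are made up; the real code feeds text unit texts and factorized category indices, and uses a custom tokenizer):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

texts = ['payment due within 30 days', 'late payment penalty applies',
         'governing law of the state of New York', 'exclusive jurisdiction of New York courts'] * 5
labels = ['payment', 'payment', 'law', 'law'] * 5

text_clf = Pipeline([
    ('vect', CountVectorizer(strip_accents='unicode', analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, class_weight='balanced')),
])

text_clf.fit(texts, labels)
print(classification_report(labels, text_clf.predict(texts), target_names=sorted(set(labels))))
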