Example no. 1
 def save_summary(self, log: ProcessLogger, user_id):
     # save DocumentTermUsage
     if self.located_usage_entities and TermUsage in self.located_usage_entities:
         term_usages = self.located_usage_entities[TermUsage]
         # update DocumentTermUsage records
         doc_term_usgs = {}  # type: Dict[Tuple[int, int], DocumentTermUsage]
         for tu in term_usages:  # type: TermUsage
             key = (tu.text_unit.document_id, tu.term.pk,)
             doc_usg = doc_term_usgs.get(key)
             if doc_usg:
                 doc_usg.count += 1
             else:
                 doc_usg = DocumentTermUsage()
                 doc_usg.document_id = tu.text_unit.document_id
                 doc_usg.term_id = tu.term.pk
                 doc_usg.count = 1
                 doc_term_usgs[key] = doc_usg
         if doc_term_usgs:
             doc_term_usgs_lst = [v for _, v in doc_term_usgs.items()]
             try:
                 with transaction.atomic():
                     DocumentTermUsage.objects.bulk_create(doc_term_usgs_lst, ignore_conflicts=True)
             except Exception as e:
                 log.error(f'Unable to store {len(doc_term_usgs)} DocumentTermUsage records.\n',
                           exc_info=e)
Example no. 2
def send_email(log: ProcessLogger, dst_user, subject: str, txt: str, html: str, image_dir: str, cc: Set[str] = None):
    if not dst_user.email:
        log.error('Destination user {0} has no email assigned'.format(dst_user.get_full_name()))
        return

    try:
        email = EmailMultiAlternatives(subject=subject,
                                       body=txt,
                                       cc=list(cc) if cc else None,
                                       from_email=settings.DEFAULT_FROM_EMAIL,
                                       to=['"{0}" <{1}>'.format(dst_user.get_full_name(), dst_user.email)])
        if html:
            images = [m.group(3) for m in RE_SRC_ATTACHMENT.finditer(html)]
            email_html = RE_SRC_ATTACHMENT.sub(r'\1cid:\3\4', html)
            email.attach_alternative(email_html, 'text/html')

            for image_fn in images:
                data = get_notification_template_resource(os.path.join(image_dir, image_fn))
                mime_type = get_predefined_mime_type(image_fn)
                try:
                    img = MIMEImage(data, _subtype=mime_type) if mime_type else MIMEImage(data)
                except TypeError as e:
                    raise RuntimeError(f"Couldn't guess MIME type for tile {image_fn}") from e
                img.add_header('Content-Id', '<' + image_fn + '>')
                img.add_header("Content-Disposition", "inline", filename=image_fn)
                email.attach(img)

        email.send(fail_silently=False)
    except Exception as caused_by:
        log.error(f'Unable to send email to user "{dst_user.get_full_name()}" (#{dst_user.pk})',
                  exc_info=caused_by)
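
The substitution r'\1cid:\3\4' above only makes sense for a specific group layout in RE_SRC_ATTACHMENT, which is not shown in this snippet. Below is a minimal sketch of what such a pattern could look like (the pattern itself is an assumption), turning src="images/logo.png" into src="cid:logo.png" while group 3 captures the bare file name used as the Content-Id:

import re

# Hypothetical pattern: group 1 = 'src="', group 2 = optional directory prefix,
# group 3 = bare file name, group 4 = closing quote. The real RE_SRC_ATTACHMENT may differ.
RE_SRC_ATTACHMENT = re.compile(r'(src=")([^"]*/)?([^"/]+)(")')

html = '<img src="images/logo.png" alt="logo">'
image_names = [m.group(3) for m in RE_SRC_ATTACHMENT.finditer(html)]
email_html = RE_SRC_ATTACHMENT.sub(r'\1cid:\3\4', html)

print(image_names)  # ['logo.png']
print(email_html)   # <img src="cid:logo.png" alt="logo">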
Example no. 3
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True):
    """
    Detects field values for a document and stores them both as DocumentFieldValue objects and in
    Document.field_values. These two representations should always be consistent.
    :param log:
    :param document:
    :param save:
    :return:
    """

    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info(
            'Not storing detected field values for document #{} ({}): '
            'the document has an inactive ("completed") status'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set())
                       for f in all_fields]
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f
                                for f in all_fields
                                }  # type: Dict[str, DocumentField]

    field_values_pre_cached = False

    res = list()
    for field_code in sorted_codes:
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy
        if not field_values_pre_cached \
                and field_detection_strategy.uses_cached_document_field_values(field):
            # Pre-cache Document.field_values structure for the usage in field detection strategies
            document.field_values = field_value_cache.cache_field_values(
                document, None, save=False)
            field_values_pre_cached = True

        detected_values = field_detection_strategy.detect_field_values(
            log, document, field)  # type: List[DetectedFieldValue]
        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document, res, save=True, log=log)

    return res
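
order_field_detection() above is expected to return the field codes ordered so that every field comes after the fields it depends on; its implementation is not part of this snippet. A minimal sketch of such an ordering (a plain Kahn-style topological sort over the (code, dependencies) pairs built above), shown only as an illustration:

from typing import Dict, List, Set, Tuple

def order_field_detection_sketch(fields_and_deps: List[Tuple[str, Set[str]]]) -> List[str]:
    """Return field codes ordered so that every field comes after the fields it depends on."""
    deps: Dict[str, Set[str]] = {code: set(d) for code, d in fields_and_deps}
    ordered: List[str] = []
    while deps:
        # fields whose remaining dependencies are already ordered (or unknown) are ready
        ready = [code for code, d in deps.items() if not (d & deps.keys())]
        if not ready:
            raise ValueError(f'Cyclic field dependencies: {sorted(deps)}')
        for code in sorted(ready):
            ordered.append(code)
            del deps[code]
    return ordered

# 'total' depends on 'price' and 'qty', so it comes last:
print(order_field_detection_sketch([('total', {'price', 'qty'}), ('price', set()), ('qty', set())]))
# -> ['price', 'qty', 'total']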
Example no. 4
    def detect_field_value(cls,
                           log: ProcessLogger,
                           doc: Document,
                           field: DocumentField,
                           field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(field,
                                                                                                 depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        try:
            classifier_model = ClassifierModel.objects.get(document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()
            typed_field = TypedField.by(field)  # type: TypedField

            ants = list()  # type: List[AnnotationDTO]

            qs_text_units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            units_counted = 0
            for text_unit in qs_text_units.iterator():
                if field.detect_limit_count:
                    units_counted = FieldDetectionStrategy.update_units_counted(
                        field, units_counted, text_unit)
                    if units_counted > field.detect_limit_count:
                        break

                ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                    typed_field=typed_field,
                                                    document=doc,
                                                    field=field,
                                                    text_unit=text_unit)
                if ant is None:
                    continue
                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    if ant.location_in_doc_start > field.detect_limit_count:
                        break

                ants.append(ant)
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    units_counted += len(text_unit.text)

            if not ants:
                return None

            return FieldValueDTO(field_value=typed_field.build_json_field_value_from_json_ant_values([a.annotation_value
                                                                                                      for a in ants]),
                                 annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(f'Classifier model does not exist for field: {field.code}')
            raise e
def _recreate_document_fields_table(log: ProcessLogger, table_name: str,
                                    column_defs: Dict[str, str],
                                    index_defs: Dict[str, str]):
    log.info('Recreating raw sql table: {0}'.format(table_name))

    column_def_clauses = [
        SQLClause('"{column}" {pg_type}'.format(column=column,
                                                pg_type=pg_type))
        for column, pg_type in column_defs.items()
    ]

    create_table = format_clause(
        'CREATE TABLE "{table_name}" (\n'
        '{columns}, \n'
        'FOREIGN KEY ({field_document_id}) '
        'REFERENCES document_document (id) ON DELETE CASCADE)',
        table_name=table_name,
        columns=join_clauses(', \n', column_def_clauses),
        field_document_id=FIELD_CODE_DOC_ID)  # type: SQLClause

    log.info('Create table SQL for table {0}:\n{1}\nParams: {2}'.format(
        table_name, create_table.sql, create_table.params))

    with connection.cursor() as cursor:
        cursor.execute('drop table if exists "{table_name}"'.format(
            table_name=table_name))
        cursor.execute(create_table.sql, create_table.params)
        for index_name, index_def in index_defs.items():  # type: str, str
            create_index = _build_create_index_statement(
                table_name, index_name, index_def)
            cursor.execute(create_index, [])
Example no. 6
 def try_parsing(self, log: ProcessLogger, locate_results: LocationResults,
                 text: str, text_unit_id: int, text_unit_lang: str,
                 document_id: int, document_project_id: int, **kwargs):
     if not text:
         return
     start = datetime.datetime.now()
     try:
         parse_results = self.parse(log, text, text_unit_id, text_unit_lang,
                                    locate_results.document_initial_load,
                                    **kwargs)  # type: ParseResults
         if parse_results:
             parse_results.update_doc_project_ids(document_id,
                                                  document_project_id)
             locate_results.collect(self, text_unit_id, parse_results)
         elapsed = (datetime.datetime.now() - start).total_seconds()
         LocatingPerformanceMeter().add_record(str(type(self).__name__),
                                               elapsed, text_unit_id, text)
     except Exception as e:
         log.error(
             f'Exception caught while trying to run locator on a text unit.\n'
             f'Locator: {self.__class__.__name__}\n'
             f'Text unit id: {text_unit_id}\n'
             f'Text: {text[:1024]}\n'
             f'Text unit language: {text_unit_lang}\n',
             exc_info=e)
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not python_coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        typed_field = TypedField.by(field)  # type: TypedField

        if python_coded_field.type != typed_field.type_code:
            raise RuntimeError(
                f'Python-coded field {python_coded_field.__class__.__name__} is '
                f'for fields of type {python_coded_field.type} and field {field.code} '
                f'is of type {typed_field.type_code}')

        log.debug('detect_field_value: python_coded_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        field_value_dto = python_coded_field.get_value(
            log=log,
            field=field,
            doc=doc,
            cur_field_code_to_value=field_code_to_value)
        if not typed_field.is_json_field_value_ok(field_value_dto.field_value):
            raise ValueError(
                f'Python coded field class {field.python_coded_field} returned value not suitable for '
                f'field {field.code} ({typed_field.type_code})')
        return field_value_dto
def _build_insert_clause(log: ProcessLogger, table_name: str,
                         handlers: List[field_handlers.FieldHandler],
                         document: Document,
                         fields_to_python_values: Dict[str, Any]) -> SQLClause:
    insert_clauses = list()

    for handler in handlers:  # type: field_handlers.FieldHandler
        python_value = fields_to_python_values.get(handler.field_code)
        try:
            insert_clause = handler.get_pg_sql_insert_clause(
                document.language, python_value)  # type: SQLInsertClause
            insert_clauses.append(insert_clause)
        except Exception as ex:
            msg = render_error('Unable to cache field values.\n'
                               'Document: {0} (#{1}).\n'
                               'Field: {2}'.format(document.name, document.id,
                                                   handler.field_code),
                               caused_by=ex)
            log.error(msg)

    columns_clause, values_clause = SQLInsertClause.join(insert_clauses)

    insert_clause = format_clause(
        'insert into "{table_name}" ({columns}) '
        'values ({values}) on conflict ({column_document_id}) '
        'do update set ({columns}) = ({values})',
        table_name=table_name,
        columns=columns_clause,
        values=values_clause,
        column_document_id=FIELD_CODE_DOC_ID)

    return insert_clause
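
For a sense of what the formatted clause looks like, below is a rough sketch of the statement shape for a hypothetical fields table; the identifiers are illustrative, and quoting and parameter binding are handled by format_clause and SQLInsertClause in the real code:

# Rough shape of the resulting upsert for a hypothetical fields table "doc_fields_contract"
# with a document-id column and one field column:
example_upsert = (
    'insert into "doc_fields_contract" (document_id, start_date) '
    'values (%s, %s) on conflict (document_id) '
    'do update set (document_id, start_date) = (%s, %s)'
)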
Example no. 9
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    if not self.document_initial_load:
                        TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                        for entity_class in self.processed_usage_entity_classes:
                            entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

                tag_models = list()
                from apps.document.app_vars import LOCATE_TEXTUNITTAGS
                tags_saved = 0
                if LOCATE_TEXTUNITTAGS.val:
                    for text_unit_id, tags in self.tags.items():
                        for tag in tags:
                            tag_models.append(TextUnitTag(user_id=user_id,
                                                          text_unit_id=text_unit_id,
                                                          tag=tag))
                    tags_saved = SafeBulkCreate.bulk_create(TextUnitTag.objects.bulk_create, tag_models)

            # save "_usage" objects
            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if not entities:
                    continue
                count += SafeBulkCreate.bulk_create(entity_class.objects, entities)

            log.info(
                'Stored {0} usage entities and {1} tags for {2} text units'.format(
                    count, tags_saved, len(self.processed_text_unit_ids)))
        except Exception as e:
            entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
            log.error(f'Unable to store location results.\n'
                      f'Text unit ids: {self.processed_text_unit_ids}\n'
                      f'Usage models caused the problem:\n{entities_str}', exc_info=e)
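
SafeBulkCreate is not shown in these snippets; presumably it performs a chunked, conflict-ignoring bulk insert and returns the number of records handed to the database. A minimal sketch of such a helper, under that assumption:

from typing import Iterable

def bulk_create_in_chunks(manager, objects: Iterable, chunk_size: int = 1000) -> int:
    """Minimal stand-in for SafeBulkCreate.bulk_create: insert objects in chunks,
    ignoring conflicts, and return the number of objects handed to the database."""
    objects = list(objects)
    created = 0
    for start in range(0, len(objects), chunk_size):
        chunk = objects[start:start + chunk_size]
        manager.bulk_create(chunk, ignore_conflicts=True)
        created += len(chunk)
    return created

# e.g. inside save():  tags_saved = bulk_create_in_chunks(TextUnitTag.objects, tag_models)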
def document_fields_change_listener_impl(_sender,
                                         signal,
                                         log: ProcessLogger,
                                         document_event: str,
                                         document: Document,
                                         field_handlers: Dict[str, FieldHandler],
                                         fields_before: Optional[Dict],
                                         fields_after: Optional[Dict],
                                         changed_by_user: User = None):
    from apps.task.tasks import call_task_func
    from apps.notifications.tasks import process_notifications_on_document_change
    if not changed_by_user:
        # we ignore changes made by system at the moment
        return

    if not fields_before and not fields_after:
        log.error(
            'Document fields changed event appeared with both "before" and "after" fields empty.'
        )
        return

    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    if APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val:
        return
    call_task_func(process_notifications_on_document_change,
                   (document_event, document.pk, fields_before, fields_after,
                    changed_by_user.pk), changed_by_user.pk)
Example no. 11
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

                count = 0
                for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                    if entities:
                        entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                        count += len(entities)

                tag_models = list()
                from apps.document.app_vars import LOCATE_TEXTUNITTAGS
                if LOCATE_TEXTUNITTAGS.val:
                    for text_unit_id, tags in self.tags.items():
                        for tag in tags:
                            tag_models.append(TextUnitTag(user_id=user_id,
                                                          text_unit_id=text_unit_id,
                                                          tag=tag))
                    TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)
                log.info(
                    'Stored {0} usage entities and {1} tags for {2} text units'.format(
                        count, len(tag_models), len(self.processed_text_unit_ids)))
        except Exception as e:
            entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
            log.error(f'Unable to store location results.\n'
                      f'Text unit ids: {self.processed_text_unit_ids}\n'
                      f'Usage models caused the problem:\n{entities_str}', exc_info=e)
        self.save_summary(log, user_id)
Example no. 12
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
        formula = field.formula

        if not formula:
            raise ValueError(
                f'No formula specified for field {field.code} (#{field.uid})')

        depends_on_field_codes = field.get_depends_on_codes() or set()

        field_code_to_value = {
            c: v
            for c, v in field_code_to_value.items()
            if c in depends_on_field_codes
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            log.debug(
                'detect_field_value: formula_based_field_detection, checking stop words, '
                + f'field {field.code}({field.pk}), document #{doc.pk}')
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field,
                                                                doc,
                                                                depends_on_full_text)
            if detected_with_stop_words:
                return detected_values or list()
        else:
            log.debug('detect_field_value: formula_based_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')

        v = cls.calc_formula(field_code=field.code,
                             formula=formula,
                             depends_on_field_to_value=field_code_to_value,
                             convert_decimals_to_floats=field.convert_decimals_to_floats_in_formula_args)
        typed_field = TypedField.by(field)

        # We don't accept formulas returning values of the wrong type, to avoid confusion and to
        # prevent writing wrong formulas in the future.
        # For example, for multi-choice fields the formula should return a list and not a string,
        # so that the admin understands that this value replaces the whole set/list of strings
        # rather than adding one more string to the value.
        if typed_field.is_choice_field and typed_field.multi_value:
            if v and isinstance(v, str):
                # "outdated" formula is incorrect and returns string instead of
                # set / list, but we don't warn user: when he updates this formula
                # (or other detection method) he'll be forced to write code, returning
                # list or set.
                v = [v]

        if not typed_field.is_python_field_value_ok(v):
            raise ValueError(
                f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
            )
        v = typed_field.field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
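
The wrapping above only applies to multi-value choice fields. A tiny self-contained illustration of that normalization rule (TypedField and the real type checks are not shown here, so this helper is only a sketch of the behaviour described in the comments):

def normalize_multi_choice_value(v):
    """Sketch of the rule above for multi-value choice fields: a bare string returned by an
    outdated formula is wrapped into a one-element list; lists and sets pass through as-is."""
    if v and isinstance(v, str):
        return [v]   # 'green' -> ['green'], replacing the whole selection
    return v

print(normalize_multi_choice_value('green'))            # ['green']
print(normalize_multi_choice_value(['green', 'blue']))  # ['green', 'blue']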
Example no. 13
    def parse_file_local_xhtml(self,
                               local_path: str,
                               original_file_name: str,
                               timeout: int = 60,
                               encoding_name: str = 'utf-8',
                               logger: ProcessLogger = None,
                               enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will return XHTML and TikaXhtmlParser then will parse XHTML into plain text
        plus extra formatting information plus metadata.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        def err(line):
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        for cmd_list in [
                self.tika_default_command_list,
                self.tika_lexnlp_default_command_list
        ]:
            cmd = cmd_list + ['-x', f'-e{encoding_name}', local_path]

            last_try = cmd_list is self.tika_lexnlp_default_command_list
            text = read_output(cmd,
                               stderr_callback=err,
                               encoding=encoding_name,
                               timeout_sec=timeout) or ''
            try:
                output = self.xhtml_parser.parse_text(text)
                output_len = len(output.text) if output and output.text else 0
                logger.info(
                    f'parse_file_local_xhtml: {len(text)} source boiled down to {output_len}'
                )
                if not output_len and not last_try:
                    continue

                output.meta[Document.DocumentMetadataKey.KEY_PARSING_STATISTICS] = \
                    {
                        'extracted_text_length': self.xhtml_parser.parse_stat.parsed_text_len,
                        'images_text_length': self.xhtml_parser.parse_stat.parsed_ocr_text_len,
                    }
                return output
            except Exception as ex:
                text_sample = text[:255] if text and isinstance(text, str) else str(text)
                raise Exception(
                    'Error in parse_file_local_xhtml -> xhtml_parser.parse_text(). Text:\n' +
                    text_sample) from ex
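
A stripped-down illustration of the pattern above: the OCR on/off decision travels to the Java Tika process through an environment variable rather than a command-line flag. The variable name and values below are assumptions standing in for the class constants:

import os
import subprocess

def run_tika_sketch(jar_args, local_path, enable_ocr=True, encoding_name='utf-8', timeout=60):
    # The flag name and its values are assumptions, not the real TIKA_* constants.
    env = dict(os.environ)
    env['LEXNLP_TIKA_PARSER_MODE'] = 'pdf_ocr' if enable_ocr else 'pdf_only'
    cmd = list(jar_args) + ['-x', f'-e{encoding_name}', local_path]
    completed = subprocess.run(cmd, capture_output=True, timeout=timeout, env=env)
    return completed.stdout.decode(encoding_name, errors='replace')

Passing a copied env dict to the child process, instead of mutating os.environ as the method above does, keeps the flag from leaking to other tasks running in the same worker.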
Example no. 14
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:

        log.debug(
            'detect_field_value: regexps_and_text_based_ml_field_value, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')

        ants: List[AnnotationDTO] = []
        text_unit_repo = cls.text_unit_repo
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
            sklearn_model = classifier_model.get_trained_model_obj()

            for text_unit in qs_text_units.iterator():  # type: TextUnit
                ant = cls.predict_and_extract_value(
                    sklearn_model=sklearn_model,
                    typed_field=typed_field,
                    document=doc,
                    field=field,
                    text=text_unit.text,
                    location_start=text_unit.location_start,
                    location_end=text_unit.location_end)
                if ant is None:
                    continue
                ants.append(ant)
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=ants)
            if not ants:
                return None

            return FieldValueDTO(
                field_value=typed_field.build_json_field_value_from_json_ant_values(
                    [a.annotation_value for a in ants]),
                annotations=ants)

        except ClassifierModel.DoesNotExist as e:
            log.info(
                f'Classifier model does not exist for field: {field.code}')
            raise e
    def parse_file_local_plain_text(self,
                                    local_path: str,
                                    original_file_name: str,
                                    task: Any,
                                    timeout: int = 60,
                                    encoding_name: str = 'utf-8',
                                    logger: ProcessLogger = None,
                                    enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will use plain text "stripper" and transform the source document into plain text
        inside its (Java) process.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: logger object to write errors and warnings
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
        # never use TIKA_MODE_PDF_ONLY here
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
        os.environ[self.TIKA_PARSER_DETAIL] = ''

        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        cmd = tika_default_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def err(line):
            logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''

        try:
            ptr_val = _parse((200, text))
            return MarkedUpText(text=ptr_val['content'],
                                meta=ptr_val['metadata'])
        except Exception as ex:
            text_sample = text[:255] if text and isinstance(text, str) else str(text)
            raise Exception(
                'Error in parse_file_local_plain_text -> _parse(). Text:\n' +
                text_sample) from ex
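
The -J -t output handed to _parse((200, text)) mirrors what the tika Python package produces internally. When running the jar as a local process is not a requirement, roughly the same text-plus-metadata result can be obtained through the package's public API (the file path below is illustrative):

# Roughly the same text + metadata through the tika package's public API
# (talks to a Tika server instead of running the jar directly):
from tika import parser as tika_parser

parsed = tika_parser.from_file('/tmp/upload_1234.pdf')
plain_text = parsed.get('content') or ''
metadata = parsed.get('metadata') or {}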
def there_are_non_indexed_docs_not_planned_to_index(
        document_type: DocumentType, log: ProcessLogger) -> bool:
    for doc_id in non_indexed_doc_ids_not_planned_to_index(document_type, 1):
        if doc_id:
            task_name = _get_reindex_task_name()
            fields_table = doc_fields_table_name(document_type.code)
            log.info(
                f'there_are_non_indexed_docs_not_planned_to_index: '
                f'found document id={doc_id} of type {document_type.code}, '
                f'task {task_name}. Fields table: {fields_table}')
            return True
    return False
Example no. 17
 def try_parsing(self, log: ProcessLogger, locate_results: LocationResults, text: str,
                 text_unit_id: int, text_unit_lang: str, **kwargs):
     try:
         parse_results = self.parse(log, text, text_unit_id, text_unit_lang, **kwargs)  # type: ParseResults
         if parse_results:
             locate_results.collect(self, text_unit_id, parse_results)
     except Exception as e:
         log.error(f'Exception caught while trying to run locator on a text unit.\n'
                   f'Locator: {self.__class__.__name__}\n'
                   f'Text unit id: {text_unit_id}\n'
                   f'Text: {text[:1024]}\n'
                   f'Text unit language: {text_unit_lang}\n', exc_info=e)
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        log.debug('detect_field_value: csv_regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        detectors = cls.detecting_cache.get_detectors(
            field.pk,
            lambda msg, er: log.error(msg, field_code=field.code, exc_info=er))
        if not detectors:
            return None

        is_multichoice = field.type == MultiChoiceField.type_code
        doc_text = cls.get_document_text(doc)

        annotations = []

        for detector in detectors:
            found_item = detector.find_value(doc_text)
            if not found_item:
                continue

            # TODO: implement reading values from full text (TextParts.FULL.value)
            # as it is done now, or from text units - paragraphs or sentences
            # based on field.text_unit_type - for other detector.text_part options
            """            
            if detector.text_part == TextParts.BEFORE_REGEXP.value:
                return matching_string[:begin], 0, begin
            elif detector.text_part == TextParts.AFTER_REGEXP.value:
                return matching_string[end:], end, len(text)
            elif detector.text_part == TextParts.INSIDE_REGEXP.value:
                return matching_string[begin:end], begin, end
            else:
                return text, 0, len(text)
            """

            # the starting position has to be shifted back by 1 character for the frontend (FE)
            ant = AnnotationDTO(annotation_value=found_item[0],
                                location_in_doc_start=max(
                                    found_item[1] - 1, 0),
                                location_in_doc_end=found_item[2],
                                extraction_hint_name='')
            if not is_multichoice:
                return FieldValueDTO(field_value=found_item[0],
                                     annotations=[ant])
            else:
                annotations.append(ant)

        if annotations:
            f_val = [a.annotation_value for a in annotations]
            return FieldValueDTO(field_value=f_val, annotations=annotations)
        return None
Example no. 19
def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool,
                        csv_contains_regexps: bool = False):
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        choices = df[df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, df.shape[0]))
    log.set_progress_steps_number(int(row_num / 10) + 1)
    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()
    for index, row in df.iterrows():
        if len(row) == 0:
            continue

        includes = row.dropna()

        if not csv_contains_regexps:
            includes = [i.strip().replace(' ', r'\s{1,100}') for i in includes]
        includes = [i for i in includes if i]

        if len(includes) == 1:
            log.info(
                'There are no search strings specified for detected value {0}'.
                format(row[0]))
            continue

        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        detector.detected_value = row[0]
        detector.include_regexps = '\n'.join(includes[1:])
        detector.save()
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
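
apply_simple_config() expects the first CSV column to hold the detected value and the remaining columns to hold search strings, which are turned into whitespace-tolerant regexps unless csv_contains_regexps=True. A tiny illustration of the expected input (field and value names are made up):

# First column: the value to detect; remaining columns: search strings for that value.
csv_bytes = (
    b'value,search_1,search_2\n'
    b'New York,new york,ny\n'
    b'California,california,state of california\n'
)

# apply_simple_config(log=ProcessLogger(),
#                     document_field=my_field,   # a DocumentField instance
#                     csv=csv_bytes,
#                     drop_previous_field_detectors=True,
#                     update_field_choice_values=True)
# -> one DocumentFieldDetector per data row, e.g. detected_value='New York'
#    and include_regexps='new\s{1,100}york\nny'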
Example no. 20
    def plan_refreshes(self, log: ProcessLogger, refresh_task_name: str,
                       plan_task_func: Callable[[str, datetime], None]):
        """
        Checks if there are materialized view refresh requests older than N seconds and plans the refreshing.
        The requests are inserted into the corresponding table by the document loading routines or any other
        code which changes the data on which these views are based.
        Maybe they will be replaced by a DB trigger in future.
        :param plan_task_func:
        :param log
        :return:
        """

        from apps.materialized_views.app_vars import REFRESH_DELAY
        refresh_delay_sec = REFRESH_DELAY.val
        to_refresh = list()
        with connection.cursor() as cursor:
            cursor.execute(
                f'''select view_name, max(request_date) 
                               from {TABLE_M_VIEW_REQUEST}
                               where to_jsonb(view_name) not in 
                                     (select args->0 from task_task where name = %s and own_status = %s) 
                               group by view_name''',
                (refresh_task_name, PENDING))
            for view_name, max_request_date in cursor.fetchall():  # type: str, datetime
                if timezone.now() - max_request_date > timedelta(seconds=refresh_delay_sec):
                    to_refresh.append(view_name)

        # Here we use PG advisory locks to prevent planning a materialized view refresh if the refresh
        # is already being executed.
        # The same lock is acquired in refresh_materialized_view() by any Celery worker (maybe on a different machine)
        # which is running the refresh of the same view.

        # The following code, running in Celery-beat on the master machine, checks whether the refresh is in
        # progress by trying to acquire the lock.

        for view_name in to_refresh:
            with transaction.atomic():
                # We need to execute it in a separate transaction to release the PG advisory lock
                # before executing plan_task_func.
                # Cursor is closed on the transaction end. So we initialize it here and don't re-use.
                with connection.cursor() as cursor:
                    locked = self.advisory_lock_by_relation_name(
                        cursor, view_name)

            if locked:
                log.info(
                    f'Planning refresh for materialized view {view_name}.')
                plan_task_func(view_name)
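
advisory_lock_by_relation_name() is referenced but not shown; the comments above rely on it returning immediately instead of blocking, and on the lock being released at transaction end. A minimal sketch of such a helper built on pg_try_advisory_xact_lock, hashing the relation name into a lock key (the real implementation may derive the key differently or use explicit unlock calls):

def advisory_lock_by_relation_name_sketch(cursor, relation_name: str) -> bool:
    """Try to take a transaction-scoped PG advisory lock derived from the relation name.
    Returns True if the lock was acquired, False if another session already holds it."""
    # hashtext() folds the name into a 32-bit key accepted by the advisory lock functions
    cursor.execute('select pg_try_advisory_xact_lock(hashtext(%s))', [relation_name])
    return cursor.fetchone()[0]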
Example no. 21
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        try:
            log.debug('detect_field_value: regexps_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
        except AttributeError:
            pass

        ants: List[AnnotationDTO] = []
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)
        text_unit_repo: TextUnitRepository = cls.text_unit_repo
        field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        for text_unit in qs_text_units:
            unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
            if not unit_ants:
                continue
            if not isinstance(typed_field, MultiValueField):
                return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                     annotations=unit_ants)
            else:
                ants += unit_ants

        if not ants:
            return None

        if isinstance(typed_field, MultiValueField):
            field_value = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in ants])
        else:
            field_value = typed_field.annotation_value_python_to_json(
                ants[0].annotation_value)
        return FieldValueDTO(field_value=field_value, annotations=ants)
Example no. 22
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    TextUnitTag.objects.filter(
                        text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(
                            text_unit_id__in=self.processed_text_unit_ids).delete()

                count = 0
                for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                    if entities:
                        entity_class.objects.bulk_create(entities,
                                                         ignore_conflicts=True)
                        count += len(entities)

                tag_models = list()
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(
                            TextUnitTag(user_id=user_id,
                                        text_unit_id=text_unit_id,
                                        tag=tag))
                TextUnitTag.objects.bulk_create(tag_models,
                                                ignore_conflicts=True)
                log.info(
                    'Stored {0} usage entities and {1} tags for {2} text units'
                    .format(count, len(tag_models),
                            len(self.processed_text_unit_ids)))
        except Exception:
            msg = render_error(
                'Unable to store location results.\n'
                'Text unit ids: {text_unit_ids}\n'
                'Usage models caused the problem:\n{entities}'.format(
                    text_unit_ids=self.processed_text_unit_ids,
                    entities='\n'.join([
                        str(e) for e in self.processed_usage_entity_classes
                    ])))
            log.error(msg)
Example no. 23
 def try_parsing(self, log: ProcessLogger, locate_results: LocationResults,
                 text: str, text_unit_id: int, text_unit_lang: str,
                 **kwargs):
     try:
         parse_results = self.parse(text, text_unit_id, text_unit_lang,
                                    **kwargs)  # type: ParseResults
         locate_results.collect(self, text_unit_id, parse_results)
     except Exception:
         msg = render_error(
             'Exception caught while trying to run locator on a text unit.\n'
             'Locator: {locator}\n'
             'Text unit id: {text_unit_id}\n'
             'Text: {text}\n'
             'Text unit language: {text_unit_lang}\n'.format(
                 locator=self.__class__.__name__,
                 text_unit_id=text_unit_id,
                 text=text[:1024],
                 text_unit_lang=text_unit_lang))
         log.error(msg)
Example no. 24
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False,
                                            split_and_log_out_of_sample_test_report: bool = False) \
            -> Optional[ClassifierModel]:
        log.info(f'Training model for field {field.code} (#{field.pk})...')

        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data_sets = cls.get_train_datasets_from_projects(
                field.pk, train_data_project_ids)
        else:
            train_data_sets = cls.get_train_data_sets(field,
                                                      train_data_project_ids)

        if not train_data_sets:
            log.info(
                f'Not enough data to train model for document_type {field.document_type.code}, field: {field.code}.'
            )
            return None

        classifier_model = cls.train_model(
            log, field, train_data_sets,
            split_and_log_out_of_sample_test_report)
        log.info(
            f'Finished training model for document_type {field.document_type.code}, field: {field.code}.'
        )

        return classifier_model
Example no. 25
    def train_document_field_detector_model(
        cls,
        log: ProcessLogger,
        field: DocumentField,
        train_data_project_ids: Optional[List],
        use_only_confirmed_field_values: bool = False
    ) -> Optional[ClassifierModel]:
        log.info(f'Training model for field {field.code} (#{field.pk})...')

        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data_sets = cls.get_train_datasets_from_projects(
                field.pk, train_data_project_ids)
        else:
            train_data_sets = cls.get_train_data_sets(field,
                                                      train_data_project_ids)

        if not train_data_sets:
            log.info(
                'Not enough data to train model for document_type #{0} and field #{1}.'
                .format(field.document_type.pk, field.pk))
            return None

        classifier_model = cls.train_model(field, train_data_sets)
        log.info(
            'Finished training model for document_type #{0} and field #{1}.'.
            format(field.document_type.pk, field.pk))

        return classifier_model
    def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
        """
        Refresh the specified materialized view and delete all refresh requests older or equal to the last request date
        taken at this method start.

        Additionally this method acquires a PG advisory lock to prevent
        parallel refreshing of the same view.
        The lock is used by the planning routine which tries to acquire the lock
        to prevent re-planning the same refresh if it is already running.
        :param view_name:
        :param log
        :return:
        """
        try:
            with connection.cursor() as cursor:
                cursor.execute(f'update {TABLE_M_VIEW} '
                               'set status=%s where view_name=%s;',
                               [MaterializedView.VIEW_STATUS_UPDATING, view_name])
        except Exception as e:
            log.error(f'Error saving updated status for view "{view_name}": {e}')

        with transaction.atomic():
            with connection.cursor() as cursor:
                if not self.advisory_lock_by_relation_name(cursor, view_name):
                    log.info(f'Canceled refreshing materialized view: {view_name}. '
                             f'Unable to acquire the advisory lock.')
                    cursor.execute(f'update {TABLE_M_VIEW} '
                                   'set status=%s where view_name=%s;',
                                   [MaterializedView.VIEW_STATUS_UPDATED, view_name])
                    return
                log.info(f'Refreshing materialized view: {view_name}.')
                cursor.execute('select max(request_date) '
                               f'from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s;', [view_name])
                row = cursor.fetchone()
                request_date = row[0] if row else None

                concurrency_clause = ''
                from apps.materialized_views.app_vars import CONCURRENCY_UPDATE
                if CONCURRENCY_UPDATE.val:
                    concurrency_clause = ' CONCURRENTLY'
                cursor.execute(f'refresh materialized view{concurrency_clause} {view_name};')

                if request_date is not None:
                    cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                                   'where view_name = %s and request_date <= %s',
                                   [view_name, request_date])
                else:
                    cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                                   'where view_name = %s',
                                   [view_name])
                dt_now = timezone.now()
                cursor.execute(f'insert into {TABLE_M_VIEW} '
                               '(view_name, refresh_date, status) '
                               'values (%s, %s, %s) '
                               'on conflict (view_name) do update set refresh_date = %s, '
                               'status = %s;',
                               [view_name, dt_now, MaterializedView.VIEW_STATUS_UPDATED,
                                dt_now, MaterializedView.VIEW_STATUS_UPDATED])
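
One operational note on the CONCURRENTLY branch above: PostgreSQL only allows 'refresh materialized view concurrently' when the view is already populated and has at least one unique index covering all rows. A one-time DDL prerequisite would therefore look roughly like this (view and column names are illustrative):

from django.db import connection

# One-time DDL prerequisite for the CONCURRENTLY branch (illustrative names;
# the real view is whatever {view_name} refers to and must expose a unique column set):
with connection.cursor() as cursor:
    cursor.execute('create unique index if not exists my_materialized_view_uniq_idx '
                   'on my_materialized_view (document_id);')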
Example no. 27
 def document_change_listener(event: events.DocumentChangedEvent):
     from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
     if APP_VAR_DISABLE_RAW_DB_CACHING.val:
         return
     from apps.rawdb.field_value_tables import cache_document_fields
     log = event.log or ProcessLogger()
     cache_document_fields(
         log=log,
         document=event.document,
         cache_generic_fields=event.generic_fields_changed,
         cache_user_fields=event.user_fields_changed,
          pre_detected_field_codes_to_suggested_values=event.pre_detected_field_values)
    def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
        """
        Refresh the specified materialized view and delete all refresh requests older or equal to the last request date
        taken at this method start.

        Additionally this method acquires a PG advisory lock to prevent
        parallel refreshing of the same view.
        The lock is used by the planning routine which tries to acquire the lock
        to prevent re-planning the same refresh if it is already running.
        :param view_name:
        :param log
        :return:
        """
        with transaction.atomic():
            with connection.cursor() as cursor:
                if not self.advisory_lock_by_relation_name(cursor, view_name):
                    log.info(f'Canceled refreshing materialized view: {view_name}. '
                             f'Unable to acquire the advisory lock.')
                    return
                log.info(f'Refreshing materialized view: {view_name}.')
                cursor.execute('select max(request_date) '
                               'from materialized_views_materializedviewrefreshrequest '
                               'where view_name = %s', [view_name])
                row = cursor.fetchone()
                request_date = row[0] if row else None
                cursor.execute(f'refresh materialized view {view_name}')
                if request_date is not None:
                    cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                                   'where view_name = %s and request_date <= %s',
                                   [view_name, request_date])
                else:
                    cursor.execute('delete from materialized_views_materializedviewrefreshrequest '
                                   'where view_name = %s',
                                   [view_name])
                dt_now = timezone.now()
                cursor.execute('insert into materialized_views_materializedview '
                               '(view_name, refresh_date) values (%s, %s) '
                               'on conflict (view_name) do update set refresh_date = %s', [view_name, dt_now, dt_now])
Example no. 29
 def send_email(self, log: ProcessLogger = None,
                subject: str = None, text: str = None, html: str = None):
     from apps.notifications.notifications import send_email
     link = self.get_link(abs_path=True, as_html=False)
     default_subject = 'Document Files Ready to Download'
     default_msg_template = 'You can download your documents {}'
     default_text = default_msg_template.format(link)
     default_html = default_msg_template.format(f'<a href="{link}">here</a>')
     send_email(
         log=log or ProcessLogger(),
         dst_user=self.user,
         subject=subject or default_subject,
         txt=text or default_text,
         html=html or default_html)
     self.email_sent = True
     self.save()
Example no. 30
def document_change_listener_impl(sender,
                                  signal,
                                  log: ProcessLogger,
                                  document: Document,
                                  system_fields_changed: FieldSpec = True,
                                  generic_fields_changed: FieldSpec = True,
                                  user_fields_changed: bool = True,
                                  changed_by_user: User = None,
                                  document_initial_load: bool = False):
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    from apps.rawdb.field_value_tables import cache_document_fields
    log = log or ProcessLogger()
    cache_document_fields(log=log,
                          document=document,
                          cache_system_fields=system_fields_changed,
                          cache_generic_fields=generic_fields_changed,
                          cache_user_fields=user_fields_changed,
                          changed_by_user=changed_by_user,
                          document_initial_load=document_initial_load)
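
document_change_listener_impl has the (sender, signal, **kwargs) shape of a Django signal receiver. Wiring it up would look roughly like the sketch below; the signal object and its name are assumptions, the real project presumably defines its own in an events or signals module:

import django.dispatch

# Hypothetical signal definition and hookup; the keyword arguments passed on send()
# mirror the listener's parameters above.
document_changed = django.dispatch.Signal()

document_changed.connect(document_change_listener_impl,
                         dispatch_uid='rawdb_document_change_listener')

# A sender would then fire it like:
# document_changed.send(sender=Document, log=ProcessLogger(), document=doc,
#                       system_fields_changed=True, generic_fields_changed=True,
#                       user_fields_changed=True, changed_by_user=request.user,
#                       document_initial_load=False)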