Exemple #1
0
class Page(models.Model, index.Indexed):
    """A single page belonging to a ``Document``.

    Besides its own ``text``, the model carries denormalized ``norm_*``
    fields (copied from the parent document / folder by the ``normalize_*``
    methods) which are listed in ``search_fields``.
    """

    document = models.ForeignKey(Document, on_delete=models.CASCADE)
    user = models.ForeignKey('User', models.CASCADE)

    # Position of this page; ``is_first`` treats 1 as the first page.
    number = models.IntegerField(default=1)
    # ``is_last`` compares ``number`` against this value
    # (presumably the total page count of the document -- confirm).
    page_count = models.IntegerField(default=1)

    # Page text, filled in by update_text_field() from the .txt file.
    text = models.TextField(default='')

    # inherited/normalized title from parent document
    norm_doc_title = models.CharField(max_length=200, default='')

    # inherited/normalized title of immediate parent folder
    norm_folder_title = models.CharField(max_length=200, default='')

    # normalized space delimited path (by folder title) of parent folder
    norm_breadcrump = models.CharField(max_length=1024, default='')

    # text from all pages of the document
    norm_text = models.TextField(default='')

    # hm, this one should be norm_lang as well
    lang = models.CharField(max_length=8,
                            blank=False,
                            null=False,
                            default='deu')

    search_fields = [
        index.SearchField('norm_doc_title', partial_match=True, boost=3),
        index.SearchField('norm_folder_title', partial_match=True),
        index.SearchField('norm_breadcrump', partial_match=True),
        index.SearchField('norm_text', partial_match=True, boost=1),
        index.SearchField('text', partial_match=True, boost=2),
        index.FilterField('lang')
    ]

    objects = PageQuerySet.as_manager()

    @property
    def is_last(self):
        """True when ``number`` equals ``page_count``."""
        return self.number == self.page_count

    @property
    def is_first(self):
        """True when ``number`` equals 1."""
        return self.number == 1

    @property
    def path(self):
        """Return a ``PagePath`` built from the document's ``doc_ep``."""
        # NOTE(review): this passes ``document_ep`` while txt_url/txt_exists
        # pass ``document_path`` -- confirm both keywords are still valid.
        return PagePath(document_ep=self.document.doc_ep,
                        page_num=self.number,
                        page_count=self.page_count)

    def update_text_field(self):
        """Update ``text`` from the page's associated .txt file.

        When the file exists its content is stored in ``self.text`` and the
        instance is saved.

        Returns the text read from the file, or an empty string if the file
        was not found (in that case nothing is saved).
        """
        url = default_storage.abspath(self.txt_url)

        if not os.path.exists(url):
            logger.debug(f"Missing page txt {url}.")
            # Return '' rather than None so callers always get a string,
            # as the documented contract states.
            return ''

        with open(url) as file_handle:
            self.text = file_handle.read()

        # The file handle is already closed before hitting the database.
        self.save()
        logger.debug(f"text saved. len(page.text)=={len(self.text)}")

        return self.text

    # Kept at this position so the relative order of model fields
    # (``image`` last) is unchanged.
    image = models.CharField(max_length=1024, default='')

    @property
    def txt_url(self):
        """Return ``txt_url()`` of the ``PagePath`` for this page."""
        result = PagePath(document_path=self.document.path,
                          page_num=self.number,
                          page_count=self.page_count)

        return result.txt_url()

    @property
    def txt_exists(self):
        """Return ``txt_exists()`` of the ``PagePath`` for this page."""
        result = PagePath(document_path=self.document.path,
                          page_num=self.number,
                          page_count=self.page_count)

        return result.txt_exists()

    def norm(self):
        """Shortcut: run every ``normalize_*`` method in turn."""
        self.normalize_doc_title()
        self.normalize_folder_title()
        self.normalize_breadcrump()
        self.normalize_text()
        self.normalize_lang()

    def normalize_doc_title(self):
        """
        Save containing document's title
        """
        self.norm_doc_title = self.document.title
        self.save()

    def normalize_folder_title(self):
        """
        Save direct parent folder (containing folder) title.

        Does nothing when the document has no parent.
        """
        if self.document.parent:
            self.norm_folder_title = self.document.parent.title
            self.save()

    def normalize_breadcrump(self):
        """Placeholder: not implemented yet."""
        pass

    def normalize_text(self):
        """Placeholder: not implemented yet."""
        pass

    def normalize_lang(self):
        """Placeholder: not implemented yet."""
        pass
Exemple #2
0
class Page(models.Model, index.Indexed):
    """A single page belonging to a ``Document``.

    Pages are reachable from their document through the ``pages`` related
    name. Besides its own ``text``, the model carries denormalized
    ``norm_*`` fields listed in ``search_fields``, and exposes key/value
    metadata helpers (``kv``, ``kvcomp``) plus diff propagation methods.
    """

    document = models.ForeignKey(
        Document,
        on_delete=models.CASCADE,
        related_name='pages'
    )
    user = models.ForeignKey('User', models.CASCADE)

    # Position of this page; ``is_first`` treats 1 as the first page.
    number = models.IntegerField(default=1)
    # ``is_last`` compares ``number`` against this value
    # (presumably the total page count of the document -- confirm).
    page_count = models.IntegerField(default=1)

    # Page text, filled in by update_text_field() from the .txt file.
    text = models.TextField(default='')

    # inherited/normalized title from parent document
    norm_doc_title = models.CharField(
        max_length=200,
        default=''
    )

    # inherited/normalized title of immediate parent folder
    norm_folder_title = models.CharField(
        max_length=200,
        default=''
    )

    # normalized space delimited path (by folder title) of parent folder
    norm_breadcrump = models.CharField(
        max_length=1024,
        default=''
    )

    # text from all pages of the document
    norm_text = models.TextField(default='')

    # hm, this one should be norm_lang as well
    lang = models.CharField(
        max_length=8,
        blank=False,
        null=False,
        default='deu'
    )

    search_fields = [
        index.SearchField('norm_doc_title', partial_match=True, boost=3),
        index.SearchField('norm_folder_title', partial_match=True),
        index.SearchField('norm_breadcrump', partial_match=True),
        index.SearchField('norm_text', partial_match=True, boost=1),
        index.SearchField('text', partial_match=True, boost=2),
        index.FilterField('lang')
    ]

    objects = PageQuerySet.as_manager()

    @property
    def kv(self):
        """Return a ``KVPage`` built around this instance."""
        return KVPage(instance=self)

    @property
    def kvcomp(self):
        """Return a ``KVCompPage`` built around this instance."""
        return KVCompPage(instance=self)

    def _apply_diff_add(self, diff):
        """Pass every item of ``diff`` to ``self.kv.apply_additions``.

        Each item is forwarded as a dict marked ``kv_inherited`` with its
        ``key``, ``kv_format`` and ``kv_type``; the item's ``value`` is not
        forwarded.
        """
        additions = [
            {
                'kv_inherited': True,
                'key': _model.key,
                'kv_format': _model.kv_format,
                'kv_type': _model.kv_type
            }
            for _model in diff
        ]
        self.kv.apply_additions(additions)

    def _apply_diff_update(self, diff, attr_updates):
        """Placeholder: update diffs are currently ignored."""
        pass

    def _apply_diff_delete(self, diff):
        """Placeholder: delete diffs are currently ignored."""
        pass

    def apply_diff(self, diffs_list, attr_updates):
        """Dispatch every diff in ``diffs_list`` to its handler.

        ``attr_updates`` is forwarded to the update handler only.

        Raises:
            ValueError: if a diff is neither add, update nor delete.
        """
        for diff in diffs_list:
            if diff.is_add():
                self._apply_diff_add(diff)
            elif diff.is_update():
                self._apply_diff_update(diff, attr_updates)
            elif diff.is_delete():
                self._apply_diff_delete(diff)
            else:
                # f-string so the offending diff actually shows up
                # in the error message.
                raise ValueError(f"Unexpected diff {diff} type")

    def inherit_kv_from(self, document):
        """Copy the key/value entries of ``document`` onto this page.

        One ``KVStorePage`` (flagged ``kv_inherited``) is built per entry of
        ``document.kv.all()``; they are wrapped into a single ADD ``Diff``
        and handed to ``propagate_changes``.
        """
        instances_set = [
            KVStorePage(
                key=kvstore.key,
                kv_format=kvstore.kv_format,
                kv_type=kvstore.kv_type,
                value=kvstore.value,
                kv_inherited=True,
                page=self
            )
            for kvstore in document.kv.all()
        ]

        diff = Diff(
            operation=Diff.ADD,
            instances_set=instances_set
        )

        self.propagate_changes(
            diffs_set=[diff],
        )

    def propagate_changes(
        self,
        diffs_set,
        apply_to_self=None,
        attr_updates=None
    ):
        """
        Apply ``diffs_set`` to this page via ``apply_diff``.

        apply_to_self argument does not make sense here.
        apply_to_self argument is here to make this function
        similar to node.propagate_changes.

        ``attr_updates`` defaults to an empty list; ``None`` is used as the
        default to avoid sharing one mutable list between calls.
        """
        if attr_updates is None:
            attr_updates = []

        self.apply_diff(
            diffs_list=diffs_set,
            attr_updates=attr_updates
        )

    @property
    def is_last(self):
        """True when ``number`` equals ``page_count``."""
        return self.number == self.page_count

    @property
    def is_first(self):
        """True when ``number`` equals 1."""
        return self.number == 1

    @property
    def path(self):
        """Return the ``PagePath`` of this page within its document."""
        return PagePath(
            document_path=self.document.path,
            page_num=self.number,
            page_count=self.page_count
        )

    def update_text_field(self):
        """Update ``text`` from the page's associated .txt file.

        When the file exists its content is stored in ``self.text`` and the
        instance is saved.

        Returns the text read from the file, or an empty string if the file
        was not found (in that case nothing is saved).
        """
        url = default_storage.abspath(self.txt_url)

        if not os.path.exists(url):
            logger.debug(
                f"Missing page txt {url}."
            )
            # Return '' rather than None so callers always get a string,
            # as the documented contract states.
            return ''

        with open(url) as file_handle:
            self.text = file_handle.read()

        # The file handle is already closed before hitting the database.
        self.save()
        logger.debug(
            f"text saved. len(page.text)=={len(self.text)}"
        )

        return self.text

    # Kept at this position so the relative order of model fields
    # (``image`` last) is unchanged.
    image = models.CharField(
        max_length=1024,
        default=''
    )

    @property
    def txt_url(self):
        """Return ``txt_url()`` of this page's ``PagePath``."""
        # ``self.path`` builds exactly the PagePath that was duplicated here.
        return self.path.txt_url()

    @property
    def txt_exists(self):
        """Return ``txt_exists()`` of this page's ``PagePath``."""
        return self.path.txt_exists()

    def norm(self):
        """Shortcut: run every ``normalize_*`` method in turn."""
        self.normalize_doc_title()
        self.normalize_folder_title()
        self.normalize_breadcrump()
        self.normalize_text()
        self.normalize_lang()

    def normalize_doc_title(self):
        """
        Save containing document's title
        """
        self.norm_doc_title = self.document.title
        self.save()

    def normalize_folder_title(self):
        """
        Save direct parent folder (containing folder) title.

        Does nothing when the document has no parent.
        """
        if self.document.parent:
            self.norm_folder_title = self.document.parent.title
            self.save()

    def normalize_breadcrump(self):
        """Placeholder: not implemented yet."""
        pass

    def normalize_text(self):
        """Placeholder: not implemented yet."""
        pass

    def normalize_lang(self):
        """Placeholder: not implemented yet."""
        pass
Exemple #3
0
class Page(models.Model, index.Indexed):
    """A single page belonging to a ``Document``.

    Pages are reachable from their document through the ``pages`` related
    name and are ordered by ``number`` (see ``Meta``). Besides its own
    ``text``, the model carries denormalized ``norm_*`` fields listed in
    ``search_fields``, per-step hOCR text, and key/value metadata helpers
    (``kv``, ``kvcomp``) plus diff propagation methods.
    """

    document = models.ForeignKey(Document,
                                 on_delete=models.CASCADE,
                                 related_name='pages')
    user = models.ForeignKey('User', models.CASCADE)

    # Position of this page; ``is_first`` treats 1 as the first page.
    number = models.IntegerField(default=1)
    # ``is_last`` compares ``number`` against this value
    # (presumably the total page count of the document -- confirm).
    page_count = models.IntegerField(default=1)

    # Page text, filled in by update_text_field() from the .txt file.
    text = models.TextField(default='')

    # inherited/normalized title from parent document
    norm_doc_title = models.CharField(max_length=200, default='')

    # inherited/normalized title of immediate parent folder
    norm_folder_title = models.CharField(max_length=200, default='')

    # normalized space delimited path (by folder title) of parent folder
    norm_breadcrump = models.CharField(max_length=1024, default='')

    # text from all pages of the document
    norm_text = models.TextField(default='')

    # hm, this one should be norm_lang as well
    lang = models.CharField(max_length=8,
                            blank=False,
                            null=False,
                            default='deu')

    search_fields = [
        index.SearchField('norm_doc_title', partial_match=True, boost=3),
        index.SearchField('norm_folder_title', partial_match=True),
        index.SearchField('norm_breadcrump', partial_match=True),
        index.SearchField('norm_text', partial_match=True, boost=1),
        index.SearchField('text', partial_match=True, boost=2),
        index.FilterField('lang')
    ]

    image = models.CharField(max_length=1024, default='')
    # The hocr (text) fields corresponding
    # to step=0, step=1, ..., step=3 will be saved in hocr_step_0, ...,
    # hocr_step_3 fields.
    #
    # Read in header comment of python module provided by URL below about
    # the concept of steps and why it was bad design
    # decision to introduce them:
    #
    #   https://github.com/papermerge/mglib/blob/master/mglib/step.py
    #
    # In future releases 4 fields: hocr_step_0,..., hocr_step_3 will be
    # replaced with just one: hocr = models.TextField(default='')
    hocr_step_0 = models.TextField(default='')
    hocr_step_1 = models.TextField(default='')
    hocr_step_2 = models.TextField(default='')
    hocr_step_3 = models.TextField(default='')

    objects = PageQuerySet.as_manager()

    def to_dict(self):
        """Return a dict with the page ``id``, ``number`` and ``kvstore``.

        ``kvstore`` is the list of ``to_dict()`` results of every entry
        returned by ``self.kv.all()``.
        """
        # The comprehension variable used to be called ``item`` too, which
        # read as if it shadowed the result dict.
        return {
            'id': self.id,
            'number': self.number,
            'kvstore': [kvstore.to_dict() for kvstore in self.kv.all()],
        }

    @property
    def kv(self):
        """Return a ``KVPage`` built around this instance."""
        return KVPage(instance=self)

    @property
    def kvcomp(self):
        """Return a ``KVCompPage`` built around this instance."""
        return KVCompPage(instance=self)

    def _apply_diff_add(self, diff):
        """Pass every item of ``diff`` to ``self.kv.apply_additions``.

        Each item is forwarded as a dict marked ``kv_inherited`` with its
        ``key``, ``kv_format`` and ``kv_type``; the item's ``value`` is not
        forwarded.
        """
        additions = [{
            'kv_inherited': True,
            'key': _model.key,
            'kv_format': _model.kv_format,
            'kv_type': _model.kv_type
        } for _model in diff]

        self.kv.apply_additions(additions)

    def _apply_diff_update(self, diff, attr_updates):
        """Pass every item of ``diff`` to ``self.kv.apply_updates``.

        Each item is forwarded as a dict marked ``kv_inherited`` with its
        ``key``, ``kv_format``, ``kv_type`` and ``id``; the entries of
        ``attr_updates`` (if any) are appended after them.
        """
        updates = [{
            'kv_inherited': True,
            'key': _model.key,
            'kv_format': _model.kv_format,
            'kv_type': _model.kv_type,
            'id': _model.id
        } for _model in diff]

        # Tolerate callers passing None / nothing to append.
        if attr_updates:
            updates.extend(attr_updates)

        self.kv.apply_updates(updates)

    def _apply_diff_delete(self, diff):
        """Placeholder: delete diffs are currently ignored."""
        pass

    def apply_diff(self, diffs_list, attr_updates):
        """Dispatch every diff in ``diffs_list`` to its handler.

        ``attr_updates`` is forwarded to the update handler only.

        Raises:
            ValueError: if a diff is none of add, update, delete, replace.
        """
        for diff in diffs_list:
            if diff.is_add():
                self._apply_diff_add(diff)
            elif diff.is_update():
                self._apply_diff_update(diff, attr_updates)
            elif diff.is_delete():
                self._apply_diff_delete(diff)
            elif diff.is_replace():
                # not applicable to page model
                # replace is used in access permissions
                # propagation
                pass
            else:
                raise ValueError(f"Unexpected diff {diff} type")

    def inherit_kv_from(self, document):
        """Copy the key/value entries of ``document`` onto this page.

        One ``KVStorePage`` (flagged ``kv_inherited``) is built per entry of
        ``document.kv.all()``; they are wrapped into a single ADD ``Diff``
        and handed to ``propagate_changes``.
        """
        instances_set = [
            KVStorePage(key=kvstore.key,
                        kv_format=kvstore.kv_format,
                        kv_type=kvstore.kv_type,
                        value=kvstore.value,
                        kv_inherited=True,
                        page=self)
            for kvstore in document.kv.all()
        ]

        diff = Diff(operation=Diff.ADD, instances_set=instances_set)

        self.propagate_changes(diffs_set=[diff])

    def propagate_changes(self,
                          diffs_set,
                          apply_to_self=None,
                          attr_updates=None):
        """
        Apply ``diffs_set`` to this page via ``apply_diff``.

        apply_to_self argument does not make sense here.
        apply_to_self argument is here to make this function
        similar to node.propagate_changes.

        ``attr_updates`` defaults to an empty list; ``None`` is used as the
        default to avoid sharing one mutable list between calls.
        """
        if attr_updates is None:
            attr_updates = []

        self.apply_diff(diffs_list=diffs_set, attr_updates=attr_updates)

    @property
    def is_last(self):
        """True when ``number`` equals ``page_count``."""
        return self.number == self.page_count

    @property
    def is_first(self):
        """True when ``number`` equals 1."""
        return self.number == 1

    def path(self, version=None):
        """Return the ``PagePath`` of this page.

        ``version`` is forwarded unchanged to ``self.document.path``.
        """
        return PagePath(document_path=self.document.path(version=version),
                        page_num=self.number,
                        page_count=self.page_count)

    def update_text_field(self):
        """Update ``text`` from the page's associated .txt file.

        When the file exists its content is stored in ``self.text`` and the
        instance is saved.

        Returns the text read from the file, or an empty string if the file
        was not found (in that case nothing is saved).
        """
        url = default_storage.abspath(self.txt_url)

        if not os.path.exists(url):
            logger.debug(f"Missing page txt {url}.")
            # Return '' rather than None so callers always get a string,
            # as the documented contract states.
            return ''

        with open(url) as file_handle:
            self.text = file_handle.read()

        # The file handle is already closed before hitting the database.
        self.save()
        logger.debug(f"text saved. len(page.text)=={len(self.text)}")

        return self.text

    @property
    def txt_url(self):
        """Return ``txt_url()`` of the ``PagePath`` for this page.

        The document path is obtained with ``self.document.path()`` (no
        explicit version).
        """
        result = PagePath(document_path=self.document.path(),
                          page_num=self.number,
                          page_count=self.page_count)

        return result.txt_url()

    @property
    def txt_exists(self):
        """Return ``txt_exists()`` of the ``PagePath`` for this page.

        The document path is obtained with ``self.document.path()`` (no
        explicit version).
        """
        result = PagePath(document_path=self.document.path(),
                          page_num=self.number,
                          page_count=self.page_count)

        return result.txt_exists()

    def norm(self):
        """Shortcut: run every ``normalize_*`` method in turn."""
        self.normalize_doc_title()
        self.normalize_folder_title()
        self.normalize_breadcrump()
        self.normalize_text()
        self.normalize_lang()

    def normalize_doc_title(self):
        """
        Save containing document's title
        """
        self.norm_doc_title = self.document.title
        self.save()

    def normalize_folder_title(self):
        """
        Save direct parent folder (containing folder) title.

        Does nothing when the document has no parent.
        """
        if self.document.parent:
            self.norm_folder_title = self.document.parent.title
            self.save()

    def normalize_breadcrump(self):
        """Placeholder: not implemented yet."""
        pass

    def normalize_text(self):
        """Placeholder: not implemented yet."""
        pass

    def normalize_lang(self):
        """Placeholder: not implemented yet."""
        pass

    class Meta:
        # Guarantees that
        # doc.pages.all() will return pages ordered by number.
        # test by
        # test_page.TestPage.test_pages_all_returns_pages_ordered
        ordering = ['number']