コード例 #1
0
ファイル: docximporter.py プロジェクト: 80000v/Booktype
    def __init__(self):
        self.notifier = Notifier()
        self.delegate = Delegate()

        # Attachment objects indexed by image file name
        self._attachments = {}
        # Chapter objects indexed by document file name
        self._chapters = {}

        self.endnotes = {}
        self.footnotes = {}
コード例 #2
0
    def __init__(self):
        self.notifier = Notifier()
        self.delegate = Delegate()

        # Attachment objects indexed by image file name
        self._attachments = {}
        # Chapter objects indexed by document file name
        self._chapters = {}

        self.endnotes = {}
        self.footnotes = {}
コード例 #3
0
class WordImporter(object):

    def __init__(self):
        self.notifier = Notifier()
        self.delegate = Delegate()

        # Attachment objects indexed by image file name
        self._attachments = {}
        # Chapter objects indexed by document file name
        self._chapters = {}

        self.endnotes = {}
        self.footnotes = {}

    def _check_for_elements(self):
        from ooxml import doc

        found_math = False

        for elem in self.dfile.document.elements:
            if isinstance(elem, doc.Paragraph):
                for el in elem.elements:
                    if isinstance(el, doc.Math):
                        found_math = True

        if found_math:
            warn_msg = _("Please note: Mathematical formulae have been found, and highlighted in the text. These formulae are not supported by many e-readers, or the Booktype editor at present.")  # noqa
            self.notifier.warning(warn_msg)

    def import_file(self, file_path, book, options=None):
        self.delegate.notifier = self.notifier
        self.broken_images = []
        self.converted_images = []

        def serialize_empty(ctx, document, elem, root):
            return root

        def serialize_endnote(ctx, document, el, root):
            # <sup class="endnote" data-id="1454855960556">1</sup>

            if el.rid not in self.endnotes:
                data_id = str(uuid.uuid1()).replace('-', '')
                self.endnotes[el.rid] = data_id
            else:
                data_id = self.endnotes[el.rid]

            note = lxml.etree.SubElement(
                root, 'sup', {'class': 'endnote', 'data-id': data_id})
            note.text = '1'

            return root

        def serialize_footnote(ctx, document, el, root):
            # <sup class="endnote" data-id="1454855960556">1</sup>

            if el.rid not in self.footnotes:
                data_id = str(uuid.uuid1()).replace('-', '')
                self.footnotes[el.rid] = data_id
            else:
                data_id = self.footnotes[el.rid]

            note = lxml.etree.SubElement(
                root, 'sup', {'class': 'endnote', 'data-id': data_id})
            note.text = '1'

            return root

        if not options:
            options = {'scale_font_size': True}

        try:
            self.dfile = ooxml.read_from_file(file_path)

            serialize_options = {
                'embed_styles': True,
                'embed_fontsize': True,
                # 'empty_paragraph_as_nbsp': True,
                'serializers': {
                    doc.Math: serialize_empty,
                    doc.Footnote: serialize_footnote,
                    doc.Endnote: serialize_endnote
                }
            }

            chapters = importer.get_chapters(
                self.dfile.document, options=options,
                serialize_options=serialize_options)

            self._import_attachments(book, self.dfile.document)
            self._import_chapters(book, chapters)

            # get the styles
            self._import_styles(book)
            self.dfile.close()

            self._check_for_elements()
        except zipfile.BadZipfile:
            notif_msg = _("The file could not be imported because it was not saved in the .docx format. Try to open the file in Word and save it as a .docx.")  # noqa
            self.notifier.error(notif_msg)
        except Exception as err:
            err_msg = _("The docx file you uploaded contains errors and cannot be converted. Please contact customer support.")  # noqa
            self.notifier.error(err_msg)
            logger.exception("Error trying to import docx file. Msg: %s" % err)

    def _import_styles(self, book):
        from django.conf import settings

        options = {}

        if hasattr(self.dfile.document, 'base_font_size') and \
           self.dfile.document.base_font_size != -1:
            options['scale_to_size'] = self.dfile.document.base_font_size
        elif len(self.dfile.document.possible_text) > 0:
            options['scale_to_size'] = self.dfile.document.possible_text[-1]

        editor_style = serialize.serialize_styles(
            self.dfile.document, prefix='#contenteditor', options=options)

        epub_style = serialize.serialize_styles(
            self.dfile.document, prefix='', options=options)

        dir_name = '{}/styles/{}/'.format(settings.DATA_ROOT, book.url_title)

        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        f = open('{}/editor_style.css'.format(dir_name), 'wt')
        f.write(STYLE_EDITOR)
        f.write(editor_style)
        f.close()

        f = open('{}/epub_style.css'.format(dir_name), 'wt')
        f.write(STYLE_EPUB)
        f.write(epub_style)
        f.close()

    def _import_attachments(self, book, doc):
        stat = models.BookStatus.objects.filter(book=book, name='new')[0]

        unimportable_image = False
        not_supported = False

        for rel_id, rel_value in doc.relationships['document'].iteritems():
            if rel_value.get('type', '') == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image':
                att = models.Attachment(book=book, version=book.version, status=stat)

                valid_extensions = ['.jpg', '.jpeg', '.png', '.gif']
                import_msg = _("The file format you uploaded is not supported. Please save the image as jpg file and upload it again.")  # noqa

                try:
                    with ContentFile(self.dfile.read_file(rel_value['target'])) as content_file:
                        att_name, att_ext = os.path.splitext(
                            os.path.basename(rel_value['target']))
                        original_ext = att_ext[:]

                        if att_ext.lower() in ['.tif', '.tiff']:
                            try:
                                content_file = convert_image('tiff', content_file)
                                self.converted_images.append(
                                    'static/{}{}'.format(rel_id, original_ext))

                                att_ext = '.png'
                            except:
                                # broken image
                                if not unimportable_image:
                                    self.notifier.warning(import_msg)
                                    unimportable_image = True

                                content_file = None

                        elif att_ext.lower() not in valid_extensions:
                            if not unimportable_image:
                                self.notifier.warning(import_msg)
                                unimportable_image = True

                            content_file = None

                        if content_file:
                            att.attachment.save('{}{}'.format(rel_id, att_ext), content_file, save=False)
                            att.save()
                        else:
                            if not not_supported:
                                self.notifier.warning(_("An error occurred while importing images. Some images couldn't be imported. Missing images are marked within the text. Please upload missing images manually."))  # noqa
                                not_supported = True

                            self.broken_images.append('static/{}{}'.format(rel_id, original_ext))

                            assets_dir = os.path.join(os.path.dirname(__file__), "assets/")
                            pholder_path = '{}placeholder_broken_img.jpg'.format(assets_dir)
                            data = open(pholder_path, 'rb').read()
                            content_file = ContentFile(data)

                            att.attachment.save('{}.jpg'.format(rel_id), content_file, save=False)
                            att.save()
                except Exception as err:
                    logger.exception("Exception while importing attachments. Msg: %s" % err)

    def _import_chapters(self, book, chapters):
        now = datetime.datetime.now()
        stat = models.BookStatus.objects.filter(book=book, name="new")[0]
        n = 100

        for chapter_title, chapter_content in chapters:
            if len(chapter_title) > 100:
                chapter_title = u'{}...'.format(chapter_title[:100])

            if chapter_title == '':
                if n == 100:
                    chapter_title = _('Title Page')
                else:
                    chapter_title = _('Title')

            chapter_n = 0
            possible_title = chapter_title

            while True:
                does_exists = models.Chapter.objects.filter(
                    book=book, version=book.version,
                    url_title=booktype_slugify(possible_title)
                ).exists()

                if does_exists:
                    chapter_n += 1
                    possible_title = u'{} - {}'.format(
                        chapter_title, chapter_n)
                else:
                    break

            if chapter_content[6:-8].strip() == '':
                continue

            chapter_content = unidecode(self._parse_chapter(chapter_content))

            chapter = models.Chapter(
                book=book,
                version=book.version,
                url_title=booktype_slugify(possible_title),
                title=possible_title,
                status=stat,
                content=chapter_content[6:-8],
                created=now,
                modified=now
            )
            chapter.save()

            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=chapter.title,
                chapter=chapter,
                weight=n,
                typeof=1
            )
            toc_item.save()
            n -= 1

            # time to save revisions correctly
            history = logChapterHistory(
                chapter=chapter,
                content=chapter.content,
                user=book.owner,
                comment='',
                revision=chapter.revision
            )

            if history:
                logBookHistory(
                    book=book,
                    version=book.version,
                    chapter=chapter,
                    chapter_history=history,
                    user=book.owner,
                    kind='chapter_create'
                )

    def _parse_chapter(self, content):
        def _find(tag):
            return tree.xpath('//' + tag)

        from lxml import html, etree

        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(content, parser=utf8_parser)

        headers = []

        h1_headers = tree.xpath('.//h1')

        if h1_headers:
            for h1 in h1_headers:
                if h1.text == 'Unknown':
                    # Translators: Default chapter title when importing DOCX
                    # files. In case title does not exists.
                    h1.text = _('Title')

        for n in range(5):
            headers.append(_find('h{}'.format(n + 1)))

        level = 2

        if len(headers[0]) > 1:
            for header in headers[0][1:]:
                header.tag = 'h{}'.format(level)
            level += 1

        for levels in headers[1:]:
            has_changed = False

            for header in levels:
                header.tag = 'h{}'.format(level)

            if has_changed:
                if level < 6:
                    level += 1

        imgs = tree.xpath('.//img')

        for _img in imgs:
            image_name = _img.get('src')
            att_name, att_ext = os.path.splitext(os.path.basename(image_name))

            if image_name in self.broken_images:
                _img.set('src', 'static/{}.jpg'.format(att_name))

            if image_name in self.converted_images:
                _img.set('src', 'static/{}.png'.format(att_name))

        has_endnotes = False
        endnotes = None
        idx_endnote = 1

        for endnote in tree.xpath('.//sup[@class="endnote"]'):

            key = endnote.get('data-id', '')
            if key == '':
                continue

            endnote.text = '{}'.format(idx_endnote)
            idx_endnote += 1

            endnote_key = None
            footnote_key = None

            for k, v in self.endnotes.iteritems():
                if v == key:
                    endnote_key = k

            for k, v in self.footnotes.iteritems():
                if v == key:
                    footnote_key = k

            note_content = None

            if endnote_key:
                endnote = self.dfile.document.endnotes[endnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, endnote, {
                        'embed_styles': False, 'pretty_print': False,
                        'relationship': 'endnotes'
                    })

            if footnote_key:
                endnote = self.dfile.document.footnotes[footnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, endnote, {
                        'embed_styles': False, 'pretty_print': False,
                        'relationship': 'footnotes'
                    })

            if note_content is not None:
                if not has_endnotes:
                    endnotes = etree.SubElement(tree.find('body'), 'ol', {'class': 'endnotes'})
                    has_endnotes = True

                note_tree = lxml.html.fragment_fromstring(
                    note_content, create_parent=True,
                    parser=lxml.html.HTMLParser(
                        encoding='utf-8', remove_blank_text=True, remove_comments=True)
                )
                li = etree.SubElement(endnotes, 'li', {'id': 'endnote-{}'.format(key)})
                for child in note_tree.find('div').getchildren():
                    li.append(child)

        return etree.tostring(
            tree, pretty_print=True, encoding='utf-8', xml_declaration=False)
コード例 #4
0
ファイル: docximporter.py プロジェクト: 80000v/Booktype
class WordImporter(object):
    def __init__(self):
        self.notifier = Notifier()
        self.delegate = Delegate()

        # Attachment objects indexed by image file name
        self._attachments = {}
        # Chapter objects indexed by document file name
        self._chapters = {}

        self.endnotes = {}
        self.footnotes = {}

    def _check_for_elements(self):
        from ooxml import doc

        found_math = False

        for elem in self.dfile.document.elements:
            if isinstance(elem, doc.Paragraph):
                for el in elem.elements:
                    if isinstance(el, doc.Math):
                        found_math = True

        if found_math:
            warn_msg = _(
                "Please note: Mathematical formulae have been found, and highlighted in the text. These formulae are not supported by many e-readers, or the Booktype editor at present."
            )  # noqa
            self.notifier.warning(warn_msg)

    def import_file(self, file_path, book, options=None):
        self.delegate.notifier = self.notifier
        self.broken_images = []
        self.converted_images = []

        def serialize_empty(ctx, document, elem, root):
            return root

        def serialize_endnote(ctx, document, el, root):
            # <sup class="endnote" data-id="1454855960556">1</sup>

            if el.rid not in self.endnotes:
                data_id = str(uuid.uuid1()).replace('-', '')
                self.endnotes[el.rid] = data_id
            else:
                data_id = self.endnotes[el.rid]

            note = lxml.etree.SubElement(root, 'sup', {
                'class': 'endnote',
                'data-id': data_id
            })
            note.text = '1'

            return root

        def serialize_footnote(ctx, document, el, root):
            # <sup class="endnote" data-id="1454855960556">1</sup>

            if el.rid not in self.footnotes:
                data_id = str(uuid.uuid1()).replace('-', '')
                self.footnotes[el.rid] = data_id
            else:
                data_id = self.footnotes[el.rid]

            note = lxml.etree.SubElement(root, 'sup', {
                'class': 'endnote',
                'data-id': data_id
            })
            note.text = '1'

            return root

        if not options:
            options = {'scale_font_size': True}

        try:
            self.dfile = ooxml.read_from_file(file_path)

            # TODO: move this into a more customisable place.
            serialize_options = {
                'header': docutils.DocHeaderContext,
                'embed_styles': True,
                'embed_fontsize': True,
                # 'empty_paragraph_as_nbsp': True,
                'serializers': {
                    doc.Math: serialize_empty,
                    doc.Footnote: serialize_footnote,
                    doc.Endnote: serialize_endnote
                },
                'hooks': {
                    'p': [docutils.hook_p],
                    'h': [docutils.check_h_tags_hook],
                    'table': [docutils.hook_infobox_table]
                }
            }

            chapters = importer.get_chapters(
                self.dfile.document,
                options=options,
                serialize_options=serialize_options)

            self._import_attachments(book, self.dfile.document)
            self._import_chapters(book, chapters)

            # get the styles
            self._import_styles(book)
            self.dfile.close()

            self._check_for_elements()
        except zipfile.BadZipfile:
            notif_msg = _(
                "The file could not be imported because it was not saved in the .docx format. Try to open the file in Word and save it as a .docx."
            )  # noqa
            self.notifier.error(notif_msg)
        except Exception as err:
            err_msg = _(
                "The docx file you uploaded contains errors and cannot be converted. Please contact customer support."
            )  # noqa
            self.notifier.error(err_msg)
            logger.exception("Error trying to import docx file. Msg: %s" % err)

    def _import_styles(self, book):
        from django.conf import settings

        options = {}

        if hasattr(self.dfile.document, 'base_font_size') and \
           self.dfile.document.base_font_size != -1:
            options['scale_to_size'] = self.dfile.document.base_font_size
        elif len(self.dfile.document.possible_text) > 0:
            options['scale_to_size'] = self.dfile.document.possible_text[-1]

        editor_style = serialize.serialize_styles(self.dfile.document,
                                                  prefix='#contenteditor',
                                                  options=options)

        epub_style = serialize.serialize_styles(self.dfile.document,
                                                prefix='',
                                                options=options)

        dir_name = '{}/styles/{}/'.format(settings.DATA_ROOT, book.url_title)

        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        f = open('{}/editor_style.css'.format(dir_name), 'wt')
        f.write(STYLE_EDITOR)
        f.write(editor_style)
        f.close()

        f = open('{}/epub_style.css'.format(dir_name), 'wt')
        f.write(STYLE_EPUB)
        f.write(epub_style)
        f.close()

    def _import_attachments(self, book, doc):
        default_status = get_default_book_status()
        stat = models.BookStatus.objects.filter(book=book,
                                                name=default_status)[0]

        unimportable_image = False
        not_supported = False

        for rel_id, rel_value in doc.relationships['document'].iteritems():
            if rel_value.get(
                    'type', ''
            ) == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image':
                att = models.Attachment(book=book,
                                        version=book.version,
                                        status=stat)

                valid_extensions = ['.jpg', '.jpeg', '.png', '.gif']
                import_msg = _(
                    "The file format you uploaded is not supported. Please save the image as jpg file and upload it again."
                )  # noqa

                try:
                    with ContentFile(self.dfile.read_file(
                            rel_value['target'])) as content_file:
                        att_name, att_ext = os.path.splitext(
                            os.path.basename(rel_value['target']))
                        original_ext = att_ext[:]

                        if att_ext.lower() in ['.tif', '.tiff']:
                            try:
                                content_file = docutils.convert_image(
                                    'tiff', content_file)
                                self.converted_images.append(
                                    'static/{}{}'.format(rel_id, original_ext))

                                att_ext = '.png'
                            except:
                                # broken image
                                if not unimportable_image:
                                    self.notifier.warning(import_msg)
                                    unimportable_image = True

                                content_file = None

                        elif att_ext.lower() not in valid_extensions:
                            if not unimportable_image:
                                self.notifier.warning(import_msg)
                                unimportable_image = True

                            content_file = None

                        if content_file:
                            att.attachment.save('{}{}'.format(rel_id, att_ext),
                                                content_file,
                                                save=False)
                            att.save()
                        else:
                            if not not_supported:
                                self.notifier.warning(
                                    _("An error occurred while importing images. Some images couldn't be imported. Missing images are marked within the text. Please upload missing images manually."
                                      ))  # noqa
                                not_supported = True

                            self.broken_images.append('static/{}{}'.format(
                                rel_id, original_ext))

                            assets_dir = os.path.join(
                                os.path.dirname(__file__), "assets/")
                            pholder_path = '{}placeholder_broken_img.jpg'.format(
                                assets_dir)
                            data = open(pholder_path, 'rb').read()
                            content_file = ContentFile(data)

                            att.attachment.save('{}.jpg'.format(rel_id),
                                                content_file,
                                                save=False)
                            att.save()
                except Exception as err:
                    logger.exception(
                        "Exception while importing attachments. Msg: %s" % err)

    def _import_chapters(self, book, chapters):
        now = datetime.datetime.now()
        default_status = get_default_book_status()
        stat = models.BookStatus.objects.filter(book=book,
                                                name=default_status)[0]
        n = 100

        for chapter_title, chapter_content in chapters:
            if len(chapter_title) > 100:
                chapter_title = u'{}...'.format(chapter_title[:100])

            if chapter_title == '':
                if n == 100:
                    chapter_title = _('Title Page')
                else:
                    chapter_title = _('Title')

            chapter_n = 0
            possible_title = chapter_title

            while True:
                does_exists = models.Chapter.objects.filter(
                    book=book,
                    version=book.version,
                    url_title=booktype_slugify(possible_title)).exists()

                if does_exists:
                    chapter_n += 1
                    possible_title = u'{} - {}'.format(chapter_title,
                                                       chapter_n)
                else:
                    break

            if chapter_content[6:-8].strip() == '':
                continue

            _content = self._parse_chapter(chapter_content)
            try:
                chapter_content = unidecode(_content)[6:-8]
            except UnicodeDecodeError:
                chapter_content = _content.decode('utf-8',
                                                  errors='ignore')[6:-8]
            except Exception as err:
                chapter_content = 'Error parsing chapter content'
                logger.exception(
                    "Error while decoding chapter content {0}".format(err))

            chapter = models.Chapter(
                book=book,
                version=book.version,
                url_title=booktype_slugify(possible_title),
                title=possible_title,
                status=stat,
                content=chapter_content,
                created=now,
                modified=now)
            chapter.save()

            toc_item = models.BookToc(book=book,
                                      version=book.version,
                                      name=chapter.title,
                                      chapter=chapter,
                                      weight=n,
                                      typeof=1)
            toc_item.save()
            n -= 1

            # time to save revisions correctly
            history = logChapterHistory(chapter=chapter,
                                        content=chapter.content,
                                        user=book.owner,
                                        comment='',
                                        revision=chapter.revision)

            if history:
                logBookHistory(book=book,
                               version=book.version,
                               chapter=chapter,
                               chapter_history=history,
                               user=book.owner,
                               kind='chapter_create')

    def _parse_chapter(self, content):
        def _find(tag):
            return tree.xpath('//' + tag)

        from lxml import html, etree

        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(content, parser=utf8_parser)

        headers = []

        h1_headers = tree.xpath('.//h1')

        if h1_headers:
            for h1 in h1_headers:
                if h1.text == 'Unknown':
                    # Translators: Default chapter title when importing DOCX
                    # files. In case title does not exists.
                    h1.text = _('Title')

        for n in range(5):
            headers.append(_find('h{}'.format(n + 1)))

        level = 2

        if len(headers[0]) > 1:
            for header in headers[0][1:]:
                header.tag = 'h{}'.format(level)
            level += 1

        for levels in headers[1:]:
            has_changed = False

            for header in levels:
                header.tag = 'h{}'.format(level)

            if has_changed:
                if level < 6:
                    level += 1

        imgs = tree.xpath('.//img')

        for _img in imgs:
            image_name = _img.get('src')
            att_name, att_ext = os.path.splitext(os.path.basename(image_name))

            if image_name in self.broken_images:
                _img.set('src', 'static/{}.jpg'.format(att_name))

            if image_name in self.converted_images:
                _img.set('src', 'static/{}.png'.format(att_name))

        has_endnotes = False
        endnotes = None
        idx_endnote = 1

        for endnote in tree.xpath('.//sup[@class="endnote"]'):
            key = endnote.get('data-id', '')
            if key == '':
                continue

            endnote.text = '{}'.format(idx_endnote)
            idx_endnote += 1

            endnote_key = None
            footnote_key = None

            for k, v in self.endnotes.iteritems():
                if v == key:
                    endnote_key = k

            for k, v in self.footnotes.iteritems():
                if v == key:
                    footnote_key = k

            note_content = None

            if endnote_key:
                endnote = self.dfile.document.endnotes[endnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, endnote, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': 'endnotes'
                    })

            if footnote_key:
                endnote = self.dfile.document.footnotes[footnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, endnote, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': 'footnotes'
                    })

            if note_content is not None:
                if not has_endnotes:
                    endnotes = etree.SubElement(tree.find('body'), 'ol',
                                                {'class': 'endnotes'})
                    has_endnotes = True

                note_tree = lxml.html.fragment_fromstring(
                    note_content,
                    create_parent=True,
                    parser=lxml.html.HTMLParser(encoding='utf-8',
                                                remove_blank_text=True,
                                                remove_comments=True))
                li = etree.SubElement(endnotes, 'li',
                                      {'id': 'endnote-{}'.format(key)})
                for child in note_tree.find('div').getchildren():
                    li.append(child)

                # children are normally just one element which inside has more children
                # so in this case, we just drop_tag and keep content
                for x in li.getchildren():
                    x.drop_tag()

        # let's do some clean out on the not necessary tags,
        # like span tags with no reason to be
        for tag in tree.xpath('.//span'):
            class_name = tag.get('class', None)
            parent_class = tag.getparent().get('class', '')

            if not class_name or class_name in parent_class:
                tag.drop_tag()

        # let's cleanout infoboxes a bit
        # TODO: implement of plugins or something else more organized that separate functions
        docutils.clean_infobox_content(tree)
        docutils.fix_citations(tree)

        return etree.tostring(tree,
                              pretty_print=True,
                              encoding='utf-8',
                              xml_declaration=False)