Ejemplos de EpubImage en Python, ejemplos de ebooklib.epub.EpubImage en Python

Ejemplo n.º 1

1

Mostrar archivo

def to_epub(parser):
    """Build an EPUB from *parser* and write it to the working directory.

    The parser supplies the book's ``author`` and ``title``, the chapters
    through ``get_chapters_content()`` (dicts with ``order``, ``name`` and
    ``soup`` keys) and the images through ``processed_images`` (values with
    ``name`` and ``path`` attributes).  The output file is named after the
    title, with spaces replaced by underscores and an ``.epub`` suffix.
    """
    author = parser.author
    title = parser.title

    book = epub.EpubBook()
    book.set_title(title)
    book.set_language('en')
    book.add_author(author)

    chapters = []
    for chapter in parser.get_chapters_content():
        file_name = 'chapter_%d.xhtml' % chapter['order']
        item = epub.EpubHtml(title=chapter['name'], file_name=file_name)
        item.content = chapter['soup'].prettify()
        book.add_item(item)
        chapters.append(item)

    for image in parser.processed_images.values():
        img = epub.EpubImage()
        img.file_name = 'images/%s' % image.name
        # Close the handle deterministically instead of leaving it to the GC.
        with open(image.path, 'rb') as image_file:
            img.content = image_file.read()
        book.add_item(img)

    book.toc = chapters
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # The navigation document comes first, followed by the chapters in order.
    book.spine = ['nav'] + chapters
    epub.write_epub(title.replace(' ', '_') + '.epub', book, {})

Ejemplo n.º 2

0

Mostrar archivo

Archivo: ebook_spider.py Proyecto: moodykeke/ebook_spider

 def add_chapter(self, title, file_name, content, ptype='chapter'):
     """
     Build the EPUB item for one chapter or image.

     params:
         title: section title
         file_name: file name the item is stored under inside the book
         content: item content; a list or tuple of strings is turned into
             one ``<p>`` paragraph per element
         ptype: item type, ``'chapter'`` (default) or ``'image'``
     return:
         the new item, or None when title, file_name or content is None
     raises:
         ValueError: when ptype is neither ``'chapter'`` nor ``'image'``
     """
     if title is None or file_name is None or content is None:
         return None
     if ptype == 'chapter':
         item = epub.EpubHtml(title=title, file_name=file_name, lang=self.lang)
     elif ptype == 'image':
         # EpubImage accepts no title/lang arguments, so only the file name
         # is set, and it is set by attribute to work across versions.
         item = epub.EpubImage()
         item.file_name = file_name
     else:
         # Fail with a clear message instead of an AttributeError on None.
         raise ValueError("unsupported ptype: %r" % (ptype,))
     if isinstance(content, (list, tuple)):
         content = "".join("<p>" + line + "</p>\n" for line in content)
     item.set_content(content)
     return item

Ejemplo n.º 3

0

Mostrar archivo

Archivo: EpubManager.py Proyecto: wudong/safari-epub-downloader

    def add_image(self, image_filename: str):
        """Register ``image_filename`` from the source book as an EPUB image.

        Files named exactly ``cover.jpeg`` or ``cover.jpg`` are skipped.  The
        media type is derived from the file suffix, ignoring case; an
        unknown suffix raises ``RuntimeError``.
        """
        if image_filename in ("cover.jpeg", "cover.jpg"):
            return

        media_types = {
            ".jpeg": "image/jpeg",
            ".jpg": "image/jpeg",
            ".png": "image/png",
            ".svg": "image/svg+xml",
            ".gif": "image/gif",
        }

        def get_image_type(_image_filename):
            media_type = media_types.get(Path(_image_filename).suffix.lower())
            if media_type is None:
                raise RuntimeError(
                    f"Unsupported image file format: {_image_filename}")
            return media_type

        item = epub.EpubImage()
        item.id = "image_" + image_filename.replace("/", "_")
        item.file_name = image_filename
        item.media_type = get_image_type(image_filename)
        item.content = self.book.get_binary_content(image_filename)

        self.epub_book.add_item(item)

Ejemplo n.º 4

0

Mostrar archivo

    def add_colophon(self, document, documents=None):
        """Append the colophon page, plus the static images it uses, to the book.

        The colophon is looked up for ``document``, or for the first entry of
        ``documents`` when ``document`` is falsy.  Nothing happens when no
        colophon is found.
        """
        colophon = self.find_colophon(document or documents[0])
        if not colophon:
            return

        rendered = self.render_colophon(colophon, document, documents)
        tree = ET.HTML(self.clean_html(rendered))

        # pull in any static images used in the colophon
        static_images = []
        for node in tree.xpath('//img[@src]'):
            if node.get('src').startswith('/static/'):
                static_images.append(node)

        # rewrite paths to be relative by dropping the leading slash
        for node in static_images:
            node.set('src', node.get('src')[1:])

        page = epub.EpubHtml(uid='colophon', file_name='colophon.xhtml')
        page.content = ET.tostring(tree)
        self.book.add_item(page)
        self.book.spine.append(page)

        for src in {node.get('src') for node in static_images}:
            # strip the leading 'static/' before resolving the file on disk
            local_path = find_static(src[7:])
            if not local_path:
                continue
            image = epub.EpubImage()
            image.file_name = src
            with open(local_path, 'rb') as f:
                image.content = f.read()
            self.book.add_item(image)

Ejemplo n.º 5

0

Mostrar archivo

 def find_uncommon_words_in_one_text(self, text):
     """Return the indexes of the characters in *text* outside the Kindle charset.

     Whitespace is ignored.  For every such character that has no entry in
     ``self.char_image_map`` yet, the first font in ``self.fonts_render``
     whose charset contains it is used to render a PNG; the PNG is added to
     the book and its file name is recorded in ``self.char_image_map``.
     Characters that no font covers are only reported on stdout.
     """
     pos_list = []
     for idx, char in enumerate(text):
         if re.match(r'\s', char):
             continue
         if char in self.kindle_charset:
             continue
         pos_list.append(idx)
         # Only render an image when none has been generated for this char.
         if char in self.char_image_map:
             continue
         is_in_big_font = False
         # Find a font that contains the char and render it
         # (original author's note: the big font is still not big enough).
         for font, render in self.fonts_render.items():
             if char not in self.font_charset_map[font]:
                 continue
             surface, _ = render.render(char, (0, 0, 0))
             # Name the file after the escaped code point without its
             # two-character prefix, e.g. '\u4e2d' -> '4e2d.png'.
             name = char.encode("unicode_escape")[2:].decode("ascii") + ".png"
             image_path = os.path.join(self.temp_dirctory, name)
             pygame.image.save(surface, image_path)
             # Read the rendered file back and close the handle right away.
             with open(image_path, 'rb') as image_file:
                 data = image_file.read()
             self.book.add_item(epub.EpubImage(
                 file_name=os.path.join(self.font_image_dir, name),
                 media_type='image/png', content=data))
             self.char_image_map[char] = name
             is_in_big_font = True
             break
         if not is_in_big_font:
             print("Very very uncommon: ", char)
     return pos_list

Ejemplo n.º 6

0

Mostrar archivo

    def _create_epub_images(self):
        """
        Create epub image objects

        :Args:
          - self (:class:`ExportBook`): current class instance
        """

        for i, attachment in enumerate(self.attachments):
            if ('static/' + os.path.basename(
                    attachment.attachment.name)) not in self.embeded_images:
                continue

            try:
                f = open(attachment.attachment.name, "rb")
                blob = f.read()
                f.close()
            except (IOError, OSError):
                continue
            else:
                filename = os.path.basename(
                    attachment.attachment.name.encode("utf-8"))
                itm = epub.EpubImage()
                itm.file_name = 'static/%s' % filename
                itm.content = blob
                self.epub_book.add_item(itm)

Ejemplo n.º 7

0

Mostrar archivo

Archivo: ebook_spider.py Proyecto: moodykeke/ebook_spider

 def fetch_image(self, doc, book, lable_xpath=r"//img", attr='src'):
     '''Download the images linked from *doc* and embed them in *book*.

     Every element matched by ``lable_xpath`` whose ``attr`` value starts
     with ``http`` is downloaded once, stored in the book under
     ``<index>_<basename>`` (query string removed) and the element is
     rewritten to point at that name.  When the same URL shows up again the
     element is pointed at the name the image was stored under.  URLs that
     are in ``self.url_doer`` without a stored name (for example after a
     failed download) are left unchanged.

     Returns *doc*.
     '''
     # Remember the name each URL was stored under.  Recomputing it from the
     # current img_idx would yield a name no stored file has.
     saved_names = getattr(self, '_img_file_names', None)
     if saved_names is None:
         saved_names = self._img_file_names = {}
     for _link in doc.xpath(lable_xpath):
         img_url = _link.get(attr)
         # a missing attribute yields None, which has no startswith()
         if not img_url or not img_url.startswith('http'):
             print(f"img_url:{img_url} invalid! not startswith http")
             continue
         print(f"xpath:{lable_xpath},attr:{attr}, img_url={img_url}")
         if img_url in self.url_doer:
             print("already downloaded url:", img_url)
             file_name = saved_names.get(img_url)
             if file_name is not None:
                 _link.set(attr, file_name)
             continue
         self.url_doer.add(img_url)
         resp = req_get_info(img_url, proxies=self.proxy)
         if resp is None:
             continue
         # drop the query string ('?' and everything after it)
         clean_url = re.sub(r'\?.*', '', img_url)
         file_name = '{:03d}_{}'.format(
             self.img_idx, clean_url.rsplit('/', maxsplit=1)[1]
         )
         self.img_idx += 1
         img_item = epub.EpubImage()
         img_item.file_name = file_name
         img_item.set_content(resp.content)
         book.add_item(img_item)
         saved_names[img_url] = file_name
         _link.set(attr, file_name)
     return doc

Ejemplo n.º 8

0

Mostrar archivo

Archivo: convertation.py Proyecto: AntonZimahorau/FinalTaskRssParser

def create_image_objects(news, image_number):
    """Wrap the images downloaded for *news* in EpubImage items.

    Items are named ``<number>.jpg`` with consecutive numbers starting at
    *image_number*.  Returns the list of items together with the next
    unused number.
    """
    items = []
    next_number = image_number
    for payload in download_images(news):
        item = epub.EpubImage()
        item.file_name = f"{next_number}.jpg"
        item.media_type = "image/jpeg"
        item.set_content(payload)
        items.append(item)
        next_number += 1
    return items, next_number

Ejemplo n.º 9

0

Mostrar archivo

    def add_attachments(self, document, file_dir):
        """Add the document's attachments that its images reference to the book.

        Image sources starting with ``media/`` are collected with that
        prefix removed; every attachment whose filename is among them is
        stored under ``<file_dir>/media/<filename>``.
        """
        image_nodes = document.doc.root.xpath(
            '//a:img[@src]', namespaces={'a': document.doc.namespace})
        referenced = set()
        for node in image_nodes:
            if node.get('src', '').startswith('media/'):
                # drop the 6-character 'media/' prefix
                referenced.add(node.get('src')[6:])

        for attachment in document.attachments.all():
            if attachment.filename not in referenced:
                continue
            item = epub.EpubImage()
            item.file_name = f'{file_dir}/media/{attachment.filename}'
            item.content = attachment.file.read()
            self.book.add_item(item)

Ejemplo n.º 10

0

Mostrar archivo

 def addJpegImage(self, imageData):
     """Store JPEG data in the book and return the name to reference it by.

     The image is named ``grf/image_<n>.jpg`` where ``n`` is the current
     ``self.imgCount``, which is then incremented.
     @param imageData Image data in format jpeg
     @return The name of the image to be used in html
     """
     image_name = "grf/image_%i.jpg" % self.imgCount
     self.imgCount += 1

     item = epub.EpubImage()
     item.file_name = image_name
     item.media_type = "image/jpeg"
     item.set_content(imageData)
     self.ebook.add_item(item)
     return image_name

Ejemplo n.º 11

0

Mostrar archivo

Archivo: epub.py Proyecto: dipu-bd/lightnovel-crawler

def make_cover_image(app):
    """Return an EpubImage built from ``app.book_cover``.

    Returns None when no cover is set or the path is not an existing file.
    The item is always stored as ``cover.jpg`` with the JPEG media type.
    """
    cover_path = app.book_cover
    if not cover_path or not os.path.isfile(cover_path):
        return None

    logger.info('Creating cover: %s', cover_path)
    with open(cover_path, 'rb') as image_file:
        cover_bytes = image_file.read()

    cover_image = epub.EpubImage()
    cover_image.file_name = 'cover.jpg'
    cover_image.media_type = 'image/jpeg'
    cover_image.content = cover_bytes
    return cover_image

Ejemplo n.º 12

0

Mostrar archivo

Archivo: get_ebooks_from_wenku8.py Proyecto: BlueRainLi/Get_ebooks

def get_picture(index: int, src: str, timeout, img_list: list):
    """Download the picture at *src* and store it in ``img_list[index]``.

    The file name is the ``<digits>.jpg`` or ``<digits>.png`` part of the
    URL.  When the download fails, or the URL contains no such name, the
    problem is printed and ``img_list`` is left untouched.
    """
    try:
        req = requests.get(src, timeout=timeout)
    except Exception as e:
        # best effort: report the failure and leave the slot as it was
        print(src, e)
        return

    # The dot is escaped so it only matches a literal '.' before the suffix.
    match = re.search(r'[0-9]+\.(jpg|png)', src)
    if match is None:
        print(src, 'no numbered jpg/png file name in URL')
        return

    extension = match.group(1)
    img = epub.EpubImage()
    img.file_name = match.group()
    # "image/jpg" is not a registered media type; JPEG uses "image/jpeg".
    img.media_type = "image/jpeg" if extension == "jpg" else "image/" + extension
    img.content = req.content
    img_list[index] = img

Ejemplo n.º 13

0

Mostrar archivo

    def _inline_remote_image(self, src):
        """Cache the remote image *src* under ``self.tmp_path`` and return an item for it.

        The local file is named after the SHA-256 digest of the URL plus the
        text following the URL's last dot.  It is downloaded only when that
        file does not exist yet.  The returned ``EpubImage`` has only its
        ``file_name`` set, to the local path.
        """
        suffix = src.rsplit(".")[-1]
        url_hash = hashlib.sha256(src.encode("utf-8")).hexdigest()
        local_path = os.path.join(
            self.tmp_path, "{}.{}".format(url_hash, suffix))

        if not os.path.exists(local_path):
            logger.info("Downloading remote image %s", src)
            resp = requests.get(src)
            with open(local_path, "wb") as f:
                f.write(resp.content)

        epub_img = epub.EpubImage()
        epub_img.file_name = local_path
        logger.info("Remote image %s added as %s", src, epub_img.file_name)
        return epub_img

Ejemplo n.º 14

0

Mostrar archivo

Archivo: epub.py Proyecto: dipu-bd/lightnovel-crawler

def make_chapter_images(book, image_output_path):
    """Add every ``.jpg`` file found in *image_output_path* to *book*.

    Each file is stored under ``images/<filename>`` with the JPEG media
    type.  Nothing happens when the directory does not exist.
    """
    if os.path.isdir(image_output_path):
        jpg_names = [
            name for name in os.listdir(image_output_path)
            if name.endswith('.jpg')
        ]
        for name in jpg_names:
            with open(os.path.join(image_output_path, name), 'rb') as fp:
                payload = fp.read()

            image_item = epub.EpubImage()
            image_item.media_type = 'image/jpeg'
            image_item.file_name = 'images/' + name
            image_item.content = payload
            book.add_item(image_item)

Ejemplo n.º 15

0

Mostrar archivo

    def _inline_local_image(self, img, src):
        """Load the image at *src*, relative to ``self.html_root``, into an EPUB item.

        Any ``style`` entry is removed from ``img.params``.  Raises
        ``IOError`` when the file does not exist.
        """
        full_path = os.path.join(self.html_root, src)

        epub_img = epub.EpubImage()
        epub_img.file_name = src

        if os.path.exists(full_path):
            with open(full_path, "rb") as f:
                epub_img.content = f.read()
        else:
            logger.error("File %s doesn't exists, skipping!", full_path)
            raise IOError("Can't open %s" % full_path, full_path)

        if "style" in img.params:
            del img.params["style"]

        logger.info("Local image %s added", epub_img.file_name)
        return epub_img

Ejemplo n.º 16

0

Mostrar archivo

 def assemble(self):
     """Collect the cached posts as chapters and embed their images.

     Posts are processed in reverse order of ``self._get_urls()``; URLs
     without a cache entry are skipped, as are images whose cache lookup
     raises ``PageNotFoundError``.

     NOTE(review): the visible code builds ``spine`` and ``toc`` but never
     attaches them to the book or returns anything - confirm against the
     complete source.
     """
     urls = self._get_urls()
     book = epub.EpubBook()
     spine = [epub.EpubNcx(), epub.EpubNav(), self._get_cover()]
     toc = []
     for url in reversed(urls):
         cache_entry = self._cache.get(url)
         if not cache_entry:
             continue
         post = Entry(cache_entry, filter_index.ENTRY_FILTERS)
         chapter = post.get_epub_chapter()
         spine.append(chapter)
         toc.append(epub.Link(chapter.file_name, chapter.title, chapter.id))
         # a separate name keeps the outer loop's ``url`` intact
         for image_url, filename in post.get_image_urls():
             try:
                 content = self._cache.get(image_url, binary=True)
             except PageNotFoundError:
                 # Ignored, just skip the image.
                 continue
             img = epub.EpubImage()
             img.file_name = filename
             img.content = content
             img.media_type = 'image/jpeg'
             book.add_item(img)

Ejemplo n.º 17

0

Mostrar archivo

 def add_chapter(self, title, content):
     r"""
     Add a chapter to the book.

     Every ``<img src="...">`` URL found in the content is downloaded,
     stored in the book under a fresh ``images/<uuid>.jpg`` name, and the
     URL is replaced by that name in the chapter text.

     :param title:       chapter title
     :param content:     chapter HTML content
     :return:
     """
     # process the images
     img_urls = re.findall(r'<img\ssrc="(\S+)"', content)
     seen = set()
     for img_url in img_urls:
         # str.replace below rewrites every occurrence at once, so a URL
         # that repeats is downloaded and stored only once.
         if img_url in seen:
             continue
         seen.add(img_url)
         pic_path = "images/%s.jpg" % str(uuid.uuid1())
         content = content.replace(img_url, pic_path)
         # download the image
         img_data = urllib.urlopen(img_url).read()
         image_item = epub.EpubImage()
         image_item.set_content(img_data)
         image_item.file_name = pic_path
         self.book.add_item(image_item)
     chapter = epub.EpubHtml(title=title,
                             file_name='%s.xhtml' % str(uuid.uuid1()),
                             lang='hr')
     chapter.content = content
     self.chapters.append(chapter)
     self.book.add_item(chapter)

Ejemplo n.º 18

0

Mostrar archivo

Archivo: digest.py Proyecto: punchagan/r2k

def _add_images(book, html, base_url):
    """Embed the images referenced by *html* in *book* and return the new markup.

    Only ``img`` and ``video`` elements with a ``src`` attribute are
    considered.  Videos, sources rejected by ``_not_image_file``, images
    rejected by ``_image_too_small`` and images whose download returns None
    are removed from the tree.  Every other image is downloaded, its ``src``
    is rewritten to the downloaded file name, and the file is added to the
    book.
    """
    tree = fromstring(html)
    for node in tree.xpath('//*[@src]'):
        if node.tag not in ('img', 'video'):
            continue

        url = node.get('src')
        file_name = None
        rejected = (node.tag == 'video' or _not_image_file(url)
                    or _image_too_small(node))
        if not rejected:
            file_name = _download_image(urljoin(base_url, url))

        if file_name is None:
            node.getparent().remove(node)
            continue

        node.set('src', file_name)
        # read through a context manager so the handle is always closed
        with open(join(OUTBOX, file_name), 'rb') as image_file:
            content = image_file.read()
        book.add_item(epub.EpubImage(file_name=file_name, content=content))

    return tostring(tree)

Ejemplo n.º 19

0

Mostrar archivo

Archivo: create_epub.py Proyecto: harmtemolder/teachastronomy-to-epub

                if img:
                    # Emit a <figure> for the image and make sure the image
                    # file itself is part of the EPUB.
                    with tag('figure'):
                        # Build an https URL from the protocol-less src,
                        # with the '/hrthumbs' path segment removed.
                        img_src = 'https:{}'.format(img['src'].replace(
                            '/hrthumbs', ''))
                        # NOTE(review): handle_img presumably downloads the
                        # image and returns its local path - confirm.
                        img_path = handle_img(img_src)
                        # Split the local file's base name into name and extension.
                        img_name, img_ext = os.path.splitext(
                            img_path.split('/')[-1])
                        epub_img_path = 'images/{}{}'.format(img_name, img_ext)

                        # Add the image to the EPUB, if it isn't already
                        if book.get_item_with_href(epub_img_path):
                            warnings.warn(
                                '{} has already been added'.format(img_path))
                        else:
                            epub_img = epub.EpubImage()
                            epub_img.uid = img_name
                            epub_img.file_name = epub_img_path

                            # Only '.jpg' files are accepted; any other
                            # extension aborts with ValueError.
                            if img_ext == '.jpg':
                                epub_img.media_type = 'image/jpeg'
                            else:
                                raise ValueError(
                                    'You\'re adding something that isn\'t a JPEG'
                                )

                            with open(img_path, 'rb') as img_bin:
                                epub_img.content = img_bin.read()

                            book.add_item(epub_img)

Ejemplo n.º 20

0

Mostrar archivo

    def make_image(self, block):
        """
        Given a dict object containing the block info for an image, generate
        the image HTML

        The block's region is cropped out of the page scan, saved as a PNG,
        added to the book, and wrapped in a ``<div>`` whose width is the
        image's share of the page width.

        Returns the HTML snippet, or None when the block is on page 1, lies
        entirely inside another picture block, or the scan cannot be opened
        or cropped.
        """
        page_no = block['page_no']
        if page_no == 1:
            # The first page's image is made into the cover automatically
            return

        # pad out the filename to four digits
        origfile = '{dir}/{base}_jp2/{base}_{page:0>4}.jp2'.format(
            dir=self.tmpdir.name, base=self.base, page=page_no)
        basefile = 'img_{:0>4}.png'.format(self.picnum)
        pngfile = '{}/{}'.format(self.tmpdir.name, basefile)
        in_epub_imagefile = 'images/{}'.format(basefile)

        # get image dimensions from ABBYY block attributes
        # (left, top, right, bottom)
        box = self.image_dim(block)
        width = box[2] - box[0]

        # ignore if this image is entirely encapsulated in another image
        for each_pic in self.metadata['pics_by_page']:
            # Ignore if this is just the block itself
            if each_pic == block:
                continue
            outer = self.image_dim(each_pic)
            # contained means: left/top not before the other picture's,
            # right/bottom not beyond the other picture's
            if (box[0] >= outer[0] and box[1] >= outer[1]
                    and box[2] <= outer[2] and box[3] <= outer[3]):
                return

        # make the image; without a readable, croppable scan there is
        # nothing to embed, so stop after logging
        try:
            i = Image.open(origfile)
        except IOError as e:
            self.logger.warning("Can't open image {}: {}".format(origfile, e))
            return
        try:
            i.crop(box).save(pngfile)
        except IOError as e:
            self.logger.warning(
                "Can't crop image {} and save to {}: {}".format(
                    origfile, pngfile, e))
            return
        epubimage = epub.EpubImage()
        epubimage.file_name = in_epub_imagefile
        with open(pngfile, 'rb') as f:
            epubimage.content = f.read()
        self.book.add_item(epubimage)

        # float() keeps the ratio from truncating to 0 under Python 2
        container_w = width / float(block['style']['pagewidth']) * 100
        content = u'''
        <div style="width: {c_w}%;">
        <img src="{src}" alt="Picture #{picnum}">
        </div>
        '''.format(
            c_w=container_w,
            src=in_epub_imagefile,
            picnum=self.picnum,
        )

        # increment the image number
        self.picnum += 1

        return content

Ejemplo n.º 21

0

Mostrar archivo

Archivo: create_epub.py Proyecto: internetarchive/epub3

    def make_image(self, block):
        """
        Given a dict object containing the block info for an image, generate
        the image HTML

        The block's region is cropped out of the page scan by the configured
        image processor, added to the book, and wrapped in a ``<div>`` whose
        width is the image's share of the page width.

        Returns the HTML snippet; None when the block is on page 0, the page
        scan is missing, or the block lies entirely inside another picture
        block; and ``''`` when cropping fails.
        """
        page_no = block['page_no']
        if page_no == 0:
            # The first page's image is made into the cover automatically
            return

        # pad out the filename to four digits
        origfile = '{dir}/{item_bookpath}_jp2/{item_bookpath}_{page:0>4}.jp2'.format(
            dir=self.tmpdir, item_bookpath=self.item_bookpath, page=page_no)
        if not os.path.isfile(origfile):
            return
        basefile = 'img_{:0>4}.png'.format(self.picnum)
        outfile = '{}/{}'.format(self.tmpdir, basefile)
        in_epub_imagefile = 'images/{}'.format(basefile)

        # get image dimensions from ABBYY block attributes
        # (left, top, right, bottom)
        box = self.image_dim(block)
        width = box[2] - box[0]

        # some image processors also need the original page dimensions
        pagewidth = float(block['style']['pagewidth'])
        pageheight = float(block['style']['pageheight'])
        pagedim = (pagewidth, pageheight)

        # ignore if this image is entirely encapsulated in another image
        for each_pic in self.metadata['pics_by_page']:
            # Ignore if this is just the block itself
            if each_pic == block:
                continue
            outer = self.image_dim(each_pic)
            # contained means: left/top not before the other picture's,
            # right/bottom not beyond the other picture's
            if (box[0] >= outer[0] and box[1] >= outer[1]
                    and box[2] <= outer[2] and box[3] <= outer[3]):
                return

        # make the image:
        imageobj = ImageFactory(self.image_processor)
        try:
            imageobj.crop_image(origfile, outfile, dim=box, pagedim=pagedim)
        except RuntimeError as e:
            # for failed image creation, keep processing the epub
            self.logger.error(e)
            return ''
        epubimage = epub.EpubImage()
        epubimage.file_name = in_epub_imagefile
        with open(outfile, 'rb') as f:
            epubimage.content = f.read()
        self.book.add_item(epubimage)

        # to approximate original layout, set the image container width to
        # percentage of the page width
        container_w = (width / pagewidth) * 100
        content = u'''
        <div style="width: {c_w}%;">
        <img src="{src}" alt="Picture #{picnum}">
        </div>
        '''.format(
            c_w=container_w,
            src=in_epub_imagefile,
            picnum=self.picnum,
        )

        # increment the image number
        self.picnum += 1

        return content

Ejemplo n.º 22

0

Mostrar archivo

Archivo: views.py Proyecto: zeuser/Booktype

def _flush_section(toc, section_name, chapters):
    """Append *chapters* to *toc*, under a Section heading when one is named."""
    if not chapters:
        return
    if section_name is None:
        toc.append(chapters)
    else:
        toc.append((epub.Section(section_name), chapters[:]))


def export_book(input_file, filename):
    """Reads content of book in Booki.zip format and converts it to EPUB format.

    This function reads content of the book in Booki.zip file, creates new
    book in EPUB format and converts entire content into it. There are some
    things which are different in new EPUB format. One of them is how links
    and interlinks are handled.

    Links without a scheme that point at another chapter are rewritten to
    the chapter's ``.xhtml`` file; other scheme-less links have their tag
    dropped.  Chapters that cannot be parsed are left out of both the book
    and the table of contents.  The result is written to *filename*.
    """

    epub_book = ExportEpubBook()

    # Creating new EPUB file
    epub_book.add_prefix('bkterms', 'http://booktype.org/')

    # Read old Booki.zip format
    bookizip = BookiZip(input_file)

    _toc, _section, _section_name = [], [], None
    spine = ['nav']

    # Names of all the chapters/sections, used to recognise internal links
    file_names = set(
        file_name[6:-5] for _, file_name, _ in bookizip.get_toc())

    for typ, file_name, title in bookizip.get_toc():
        # A section heading closes the section collected so far
        if typ == 1:
            _flush_section(_toc, _section_name, _section)
            _section_name = title
            _section = []
            continue

        # Create new chapter with new filename
        c1 = epub.EpubHtml(title=title,
                           file_name='{}.xhtml'.format(file_name[6:-5]))
        cont = unicode(bookizip.read(file_name), 'utf-8')

        try:
            tree = parse_html_string(cont.encode('utf-8'))
        except Exception:
            # Just ignore everything if we can not parse the chapter
            continue

        # Only list the chapter in the TOC once it is known to be usable,
        # so the TOC never points at an item missing from the book.
        _section.append(c1)

        # Change all the links in the document
        for elem in tree.iter():
            if elem.tag != 'a':
                continue
            href = elem.get('href')
            if not href:
                continue

            urlp = urlparse.urlparse(href)
            url_title = urlp.path

            if urlp.scheme == '':
                if url_title and url_title in file_names:
                    fixed_href = url_title + '.xhtml'
                    if urlp.fragment:
                        fixed_href = "{}#{}".format(
                            fixed_href, urlp.fragment)

                    elem.set('href', fixed_href)
                else:
                    # Original author's note: this deletes everything that
                    # is external; that should not be happening.
                    elem.drop_tag()

        # Serialise once, after all links have been rewritten.
        c1.content = etree.tostring(tree,
                                    pretty_print=True,
                                    encoding='utf-8',
                                    xml_declaration=True)

        epub_book.add_item(c1)
        spine.append(c1)

    _flush_section(_toc, _section_name, _section)

    # Add all of the attachments
    for att_name in bookizip.get_attachments():
        try:
            blob = bookizip.read(att_name)
        except (IOError, OSError):
            continue

        itm = epub.EpubImage()
        itm.file_name = att_name
        itm.content = blob
        epub_book.add_item(itm)

    epub_book.set_title('Title', 'main')
    epub_book.set_language('en')
    epub_book.add_author('Author', role='aut', uid='author')

    epub_book.toc = _toc
    epub_book.spine = spine

    epub_book.add_item(epub.EpubNcx())
    epub_book.add_item(epub.EpubNav())

    opts = {'plugins': [TidyPlugin(), standard.SyntaxPlugin()]}
    epub.write_epub(filename, epub_book, opts)

Ejemplo n.º 23

0

Mostrar archivo

    def _ebookize_all_news(self, parsed_articles):
        """
        Adds the previously processed news data to the ebook.
        :param parsed_articles: The previously processed news data.
        """
        print("* Ebook-izing downloaded headlines. *")
        # some initialization
        template = self.env.get_template('tmpl/article_template.html')
        self.article_toc_list = []

        # put each into ebook
        for a in parsed_articles:
            print("Loading #{} into ebook: {}".format(a["count"], a["title"]))

            if a["top_image"] is not None:
                img_file_name = "art_img/image_{:03d}".format(a["count"])
                epimg = epub.EpubImage()
                epimg.file_name = img_file_name
                epimg.media_type = "image/jpeg"
                img_resp = requests.get(a["top_image"])
                img = img_resp.content
                epimg.set_content(img)
                self.book.add_item(epimg)

                a["top_image"] = img_file_name

            c = epub.EpubHtml(title=a["title"], file_name="article_{}.xhtml".format(a["count"]), lang='en')
            tree = publish_doctree(a["article_text"])
            html = publish_from_doctree(tree, writer_name='html').decode()
            soup = BeautifulSoup(html, 'lxml')
            body_only = soup.find('body').find('div', {"class": "document"})

            # skip articles that have barred keywords
            if any(kw in a["title"].lower() for kw in settings.TITLE_EXCLUSIONS):
                print("\tArticle title contains a barred keyword. Skipping.")
                continue

            if len(body_only.findAll('p')) < settings.MIN_PARAGRAPHS_FOR_AN_ARTICLE:
                print(
                    "\tArticle from {} too short. It may be paywalled or a video. It may also have been parsed incorrectly."
                    "\n\tURL: {}".format(a["source"], a["url"]))
                # fall back to justext to synthesize article
                a["article_text"] = ""
                count = 0
                paragraphs = justext.justext(requests.get(a["url"]).content, justext.get_stoplist("English"))
                for paragraph in paragraphs:
                    if not paragraph.is_boilerplate:
                        count += 1
                        a["article_text"] += "<p>{}</p>".format(paragraph.text)
                if count < settings.MIN_PARAGRAPHS_FOR_AN_ARTICLE:
                    print("\t\tArticle parsed correctly but actually short. Skipping.")
                    continue  # if it's still short, then it's actually short and not parsed incorrectly...continue
                else:
                    print("\t\tArticle was indeed parsed incorrectly. Fallback has parsed it correctly.")
            else:
                a["article_text"] = body_only

            c.set_content(template.render(article=a))
            self.chaps.append(c)
            self.book.add_item(c)
            self.article_toc_list.append(
                epub.Link("article_{}.xhtml".format(a["count"]), "{} - {}".format(a["title"], a["source"]),
                          "art%d" % a["count"]))

Ejemplo n.º 24

0

Mostrar archivo

def export_booktype(bookid):
    """Build an EPUB book object from the Booktype book with URL title *bookid*.

    Prints a message and exits the process with status -1 when no such book
    exists.  The book gets its identifier, title, description and license
    from the Booktype record, a cover read from ``cover.jpg`` in the working
    directory, one chapter per TOC entry, and every attachment of the
    current version that can be read from disk as an image under
    ``static/``.

    NOTE(review): the visible code never assigns ``toc``/``spine`` to the
    book, writes it, or returns it - confirm against the complete source.
    """
    # Get Booktype Book
    try:
        booktype_book = models.Book.objects.get(url_title__iexact=bookid)
    except models.Book.DoesNotExist:
        print('NO SUCH BOOK')
        sys.exit(-1)

    book_version = booktype_book.getVersion(None)

    # START CREATING THE BOOK
    book = epub.EpubBook()

    # set basic info
    book.set_identifier('booktype:%s' % booktype_book.url_title)
    book.set_title(booktype_book.title)
    book.set_language('en')

    # set description
    if booktype_book.description != '':
        book.add_metadata('DC', 'description', booktype_book.description)

    # set license
    lic = booktype_book.license
    if lic:
        book.add_metadata('DC', 'rights', lic.name)

    # The Contributors for Booktype book
    book.add_author('Aleksandar Erkalovic', role='aut', uid='author')

    book.add_author('Aleksandar Erkalovic',
                    file_as='Aleksandar Erkalovic',
                    role='ill',
                    uid='illustrator')

    # set cover image; binary mode keeps the JPEG bytes intact and the
    # context manager closes the file
    with open('cover.jpg', 'rb') as cover_file:
        img = cover_file.read()
    book.set_cover("image.jpg", img)

    toc = []
    section = []
    spine = ['cover', 'nav']

    for chapter in book_version.getTOC():
        if chapter.chapter:
            c1 = epub.EpubHtml(title=chapter.chapter.title,
                               file_name='%s.xhtml' %
                               (chapter.chapter.url_title, ))
            c1.add_link(href="style/default.css",
                        rel="stylesheet",
                        type="text/css")

            if chapter.chapter.title == 'Arabic':
                c1.set_language('ar')
            if chapter.chapter.title == 'Japanase':
                # 'ja' is the language code for Japanese ('jp' is a country code)
                c1.set_language('ja')

            c1.content = chapter.chapter.content

            book.add_item(c1)
            spine.append(c1)

            # chapters belong to the section that is currently open, if any
            if len(section) > 1:
                section[1].append(c1)
        else:
            # this is section: close the previous one and open a new one
            if len(section) > 0:
                toc.append(section[:])

            section = [epub.Section(chapter.name), []]

    if len(section) > 0:
        toc.append(section[:])

    for attachment in models.Attachment.objects.filter(version=book_version):
        try:
            with open(attachment.attachment.name, "rb") as f:
                blob = f.read()
        except (IOError, OSError):
            continue

        fn = os.path.basename(attachment.attachment.name.encode("utf-8"))
        itm = epub.EpubImage()
        itm.file_name = 'static/%s' % fn
        itm.content = blob
        book.add_item(itm)

Ejemplo n.º 25

0

Mostrar archivo

def epub_write_coolshell(dt_last):
    """Build an EPUB from the coolshell.cn front-page articles newer than *dt_last*.

    The front page is fetched (a copy is saved to ``coolshell.html``), every
    link matched by ``h2.entry-title a`` is downloaded, and each article whose
    publication time is later than ``dt_last`` becomes one chapter.  Images
    inside the article body are passed to ``get_image_from_url`` and then
    read back from the local file name and added to the book.  The result is
    written to ``coolshell-<year><month><day>.epub`` in the current directory.

    Args:
        dt_last: ``datetime`` cut-off; articles published at or before it
            are skipped.
    """
    book = epub.EpubBook()
    today = date.today()

    article_title = 'coolshell-%d%d%d' % (today.year, today.month, today.day)
    # set metadata
    book.set_identifier('id123456')
    book.set_title(article_title)
    book.set_language('en')

    book.add_author('Chen hao')

    # read html; fetch the title; fetch the text content
    response = urlopen('https://coolshell.cn/')
    try:
        content = response.read().decode('utf-8', 'ignore')
    finally:
        response.close()
    # The page is Chinese text: write it as UTF-8 explicitly rather than
    # relying on the platform default encoding.
    with open('coolshell.html', 'w', encoding='utf-8') as f:
        f.write(content)
    tree = lxml.html.fromstring(content)

    chapter_tocs = []
    book.spine = ['nav']
    chapter_no = 1

    pubtime_xpath = "//h5/a/time/@datetime"
    pubtime_format = '%Y-%m-%dT%H:%M:%S'
    title_xpath = "//h1[@class='entry-title']"
    content_xpath = "//article/div[@class='entry-content']"
    end_xpath = "//p[re:match(., '全文完')]"
    # EXSLT regular-expression namespace required by re:match() in end_xpath.
    xpath_namespaces = {"re": "http://exslt.org/regular-expressions"}
    match = CSSSelector('h2.entry-title a')
    for chapter in match(tree):
        href = chapter.get('href')
        print(href)
        response = urlopen(href)
        try:
            content = response.read().decode('utf-8', 'ignore')
        finally:
            response.close()
        chapter_tree = lxml.html.fromstring(content)
        # Only the first 19 characters (YYYY-MM-DDTHH:MM:SS) are parsed.
        str_pubtime = chapter_tree.xpath(pubtime_xpath)[0][0:19]
        dt_pubtime = datetime.strptime(str_pubtime, pubtime_format)
        if dt_pubtime <= dt_last:
            continue
        title = chapter_tree.xpath(title_xpath)[0].text
        content_tree = chapter_tree.xpath(content_xpath)[0]

        # Drop every direct child that follows the end-of-article marker.
        # When the marker is absent the body is kept whole instead of
        # failing with an IndexError.
        end_items = content_tree.xpath(end_xpath, namespaces=xpath_namespaces)
        last_item = end_items[0] if end_items else None
        b_del = False
        for item in list(content_tree):
            if b_del:
                content_tree.remove(item)
            if item == last_item:
                b_del = True

        for img_item in content_tree.xpath("//img"):
            if not is_ancestor(content_tree, img_item):
                continue
            img_url = img_item.get('src')
            if not img_url:
                # An <img> without src has nothing to download.
                continue
            jpg_name = re.split('/+', img_url)[-1]
            img_local = '%02d%s' % (chapter_no, jpg_name)
            print('img ' + img_url + ' local ' + img_local)
            get_image_from_url(img_url, img_local)
            img_item.set('src', img_local)
            # add the image to book
            epub_image = epub.EpubImage()
            epub_image.file_name = img_local
            try:
                with open(img_local, 'rb') as img_file:
                    epub_image.content = img_file.read()
            except (IOError, OSError):
                print('Error open %s' % img_local)
            # The item is added even when reading failed, as before, so the
            # rewritten src still has a matching entry in the book.
            book.add_item(epub_image)
        chapter_content = tostring(content_tree, encoding='unicode')
        chapter_file = 'chap_%02d.xhtml' % chapter_no

        # create chapter
        c1 = epub.EpubHtml(title=title, file_name=chapter_file, lang='hr')
        c1.content = '<html><body><h1>'+title+'</h1>'+chapter_content+'</body></html>'
        book.add_item(c1)
        chapter_tocs.append(epub.Link(chapter_file, title, title))
        book.spine.append(c1)
        chapter_no += 1

    # define Table Of Contents
    book.toc = tuple(chapter_tocs)

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

    # add CSS file
    book.add_item(nav_css)

    # write to the file
    epub.write_epub(article_title + '.epub', book, {})

Ejemplo n.º 26

0

Mostrar archivo

Archivo: transformers.py Proyecto: AYCHKnow/PortableWisdom

def embed_images(book, soup):
    """Embeds remote images in EPUB HTML chapters.

    Every ``<img>`` in *soup* is either removed (no ``src``, or a ``denied:``
    / ``data:`` source) or downloaded, shrunk to ``IMAGE_MAX_SIZE``,
    optionally converted to greyscale, added to *book* and re-pointed at the
    embedded file.  Thumbnails are stored in ``cache`` under the key
    ``src + str(IMAGE_MAX_SIZE)``; names already present in ``image_names``
    are not added to the book again.

    An image that cannot be downloaded or decoded is logged and left
    untouched in the document (its ``src`` still points at the remote URL).

    Args:
        book: object whose ``add_item`` receives each new ``EpubImage``.
        soup: parsed chapter; must provide ``find_all('img')``.
    """
    for img in soup.find_all('img'):
        src = img.get('src')

        # Remove junk images: no source, blocked sources, inline data URIs.
        if not src or src.startswith(('denied:', 'data:')):
            img.decompose()
            continue

        src_parts = urlparse(src)
        ext = os.path.splitext(src_parts.path)[1]
        # NOTE(review): hash() of a str differs between interpreter runs and
        # may be negative, so these names/ids are only stable within one
        # process -- confirm that is acceptable.
        name = str(hash(src)) + ext

        if name not in image_names:
            # Create `EpubImage` wrapper object
            image = epub.EpubImage()
            image.id = str(hash(src))
            image.file_name = name

            thumbnail_hash = src + str(IMAGE_MAX_SIZE)
            thumbnail_bytes = cache.get(thumbnail_hash)

            if thumbnail_bytes:
                thumbnail = io.BytesIO(thumbnail_bytes)
            else:
                thumbnail = io.BytesIO()

                # Download the image
                try:
                    logging.info('Downloading image %s', src)
                    content = requests.get(src, timeout=3.05).content
                except (requests.exceptions.ContentDecodingError,
                        requests.exceptions.ConnectionError,
                        requests.exceptions.ReadTimeout) as e:
                    logging.error('Skipping image %s (%s)', src, e)
                    continue

                try:
                    # Create smaller, greyscale image from source image
                    # convert to `RGBA` before `L` or Pillow will complain
                    im = Image.open(io.BytesIO(content)).convert('RGBA')
                    im.thumbnail(IMAGE_MAX_SIZE)
                    if IMAGE_GREYSCALE:
                        im = im.convert('L')
                    fmt = 'png' if ext == '.png' else 'jpeg'
                    if fmt == 'jpeg' and im.mode == 'RGBA':
                        # JPEG has no alpha channel; Pillow refuses to write
                        # RGBA as JPEG, which used to drop every colour image.
                        im = im.convert('RGB')
                    im.save(thumbnail, fmt)
                except OSError as e:
                    logging.error('Skipping image %s (%s)', src, e)
                    continue

                cache.set(thumbnail_hash, thumbnail.getvalue())

            thumbnail.seek(0)

            image.content = thumbnail.read()
            book.add_item(image)
            image_names.add(name)

        img['style'] = 'max-width: 100%'
        img['src'] = name

Ejemplo n.º 27

0

Mostrar archivo

Archivo: epub.py Proyecto: Snehlata0305/Software-lab

def generate_epub(url, path):
    '''
    Generate an epub document at the given path from the extracted content.

    Every name in the global listOfFiles is read as "<name>.html" from
    os.path.join(path, pathTemp) and added as a new chapter; every file in the
    global listofImg (SVG files excepted) is re-encoded and embedded as an
    image. Both global lists are emptied before returning, whether or not the
    generation succeeded. Progress is reported through the global popup,
    pb_label and progress_var objects.

    Parameters
    -----------
        url : Not used by this function.
        path : Path to which epub is saved, as given by the user in the GUI.

    Returns
    ---------
        Status of execution: "Okay" if successful ; "Exception" otherwise
        (the exception itself is printed, not raised).

    '''
    global pb_label
    global progress
    global progress_var
    global popup
    global listOfFiles
    global listofImg
    # setting progress in progress bar
    popup.title("Generating Epub...")
    pb_label.set("Now generating epub from extracted contents....")
    popup.update()
    sleep(5 / 1000)  # launch task
    progress = 85
    progress_var.set(progress)

    try:
        book = epub.EpubBook()
        # add metadata
        book.set_identifier('sample12345678')

        book.set_title(book_title)
        book.set_language('en')
        object_list = []
        spine_list = ['cover', 'nav']

        book.add_author('We_did_our_best')

        # Draw the book title onto the cover template and use the result
        # as the cover image.
        cover = Image.open("cover.jpg")
        draw = ImageDraw.Draw(cover)
        font = ImageFont.truetype("arial.ttf", size=20)
        MAX_W, MAX_H = cover.size
        w, h = draw.textsize(book_title, font=font)
        draw.text(((MAX_W - w) / 2, (MAX_H - h) / 2),
                  book_title,
                  fill="white",
                  font=font,
                  anchor="ms",
                  align="center")

        cover.save('cover-out.jpg')
        with open('cover-out.jpg', 'rb') as cover_file:
            book.set_cover("cover-out.jpg", cover_file.read())

        # Adding chapters
        pathForTemp = os.path.join(path, pathTemp)
        for file_stem in listOfFiles:
            htmlfilepath = os.path.join(pathForTemp, file_stem + ".html")
            with codecs.open(htmlfilepath, "r", "utf-8") as html_file:
                content = html_file.read()

            soup1 = BeautifulSoup(content, "html.parser")
            # The chapter title (also used as its file name) is the
            # document's <title> text.
            chaptertitle = soup1.find('title').string.strip()

            c1 = epub.EpubHtml(title=chaptertitle,
                               file_name=chaptertitle + '.xhtml',
                               lang='en')
            c1.content = content
            object_list.append(c1)
            spine_list.append(c1)
            book.add_item(c1)

        # adding images
        for img_path in listofImg:
            ext = img_path.split(".")[-1]
            if ext == "svg":
                continue
            if ext == "jpg":
                # Pillow's format name and the registered media type are
                # both "jpeg", not "jpg".
                save_format = "JPEG"
                media_subtype = "jpeg"
            else:
                save_format = ext
                media_subtype = ext

            picture = Image.open(img_path)
            buffer = io.BytesIO()
            picture.save(buffer, save_format)
            ei = epub.EpubImage()
            ei.file_name = img_path
            ei.media_type = 'image/' + media_subtype
            ei.content = buffer.getvalue()
            book.add_item(ei)

        # add table of contents
        book.toc = object_list

        # add navigation files
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # define css style
        style = '''
    @namespace epub "http://www.idpf.org/2007/ops";
    body {
        font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
    }
    h2 {
         text-align: left;
         text-transform: uppercase;
         font-weight: 200;     
    }
    ol {
            list-style-type: none;
    }
    ol > li:first-child {
            margin-top: 0.3em;
    }
    nav[epub|type~='toc'] > ol > li > ol  {
        list-style-type:square;
    }
    nav[epub|type~='toc'] > ol > li > ol > li {
            margin-top: 0.3em;
    }
    '''

        # add css file
        nav_css = epub.EpubItem(uid="style_nav",
                                file_name="style/nav.css",
                                media_type="text/css",
                                content=style)
        book.add_item(nav_css)

        # spine
        book.spine = spine_list

        # create epub file
        epubname = os.path.join(path, book_title + '.epub')

        epub.write_epub(epubname, book, {})

    except Exception as e:
        # Best-effort: report the failure to the caller as a status string.
        listOfFiles = []
        listofImg = []
        print(e)
        return "Exception"

    listOfFiles = []
    listofImg = []

    # setting progress in progress bar
    popup.update()
    sleep(5 / 1000)  # launch task
    progress = 100
    pb_label.set("saved epub doc at: \n{} ....".format(path))
    progress_var.set(progress)
    popup.update()
    sleep(3)
    popup.withdraw()

    print("saved epub doc at {}".format(path))
    return "Okay"

Ejemplo n.º 28

0

Mostrar archivo

def epub_write_rss_csdn(username, dt_last):
    """Build an EPUB from a CSDN blog's RSS feed, keeping posts newer than *dt_last*.

    The feed ``https://blog.csdn.net/<username>/rss/list`` is downloaded,
    saved to ``<username>csdn.feed`` and parsed.  Each ``<item>`` whose
    ``pubDate`` is later than ``dt_last`` is fetched and becomes one chapter.
    Images inside the article body are passed to ``get_image_from_url`` and
    then read back from the local file name and added to the book.  The
    result is written to ``<username>-csdn-<year><month><day>.epub`` in the
    current directory.

    Args:
        username: CSDN blog user name; also used as the book author.
        dt_last: ``datetime`` cut-off; posts published at or before it are
            skipped.
    """
    book = epub.EpubBook()
    today = date.today()

    filename_feed = username + 'csdn.feed'
    article_title = '%s-csdn-%d%d%d' % (username, today.year, today.month, today.day)
    # set metadata
    book.set_identifier('id123456')
    book.set_title(article_title)
    book.set_language('en')
    book.add_author(username)

    # read html; fetch the title; fetch the text content
    response = urlopen('https://blog.csdn.net/%s/rss/list' % username)
    try:
        content = response.read().decode('utf-8', 'ignore')
    finally:
        response.close()
    # The feed was decoded as UTF-8; write it back as UTF-8 explicitly so
    # the file parsed below does not depend on the platform default encoding.
    with open(filename_feed, 'w', encoding='utf-8') as f:
        f.write(content)
    tree = etree.parse(filename_feed)

    chapter_tocs = []
    book.spine = ['nav']
    chapter_no = 1

    # Article body: <div id="content_views" class="markdown_views prism-github-gist">
    # End of body:  <div class="postTime">
    pubtime_xpath = "pubDate"
    pubtime_format = '%Y/%m/%d %H:%M:%S'
    title_xpath = "title"
    link_xpath = "link"
    # The original expression had an unbalanced "[[" and was not valid XPath.
    content_xpath = "//article/div[@id='article_content']/div[@id='content_views']"
    end_xpath = "//div[@class='postTime']"
    # Harmless for this end_xpath; needed only if it is changed to use re:match().
    xpath_namespaces = {"re": "http://exslt.org/regular-expressions"}
    chapters = tree.xpath("//item")
    for chapter in chapters:
        str_pubtime = chapter.xpath(pubtime_xpath)[0].text
        dt_pubtime = datetime.strptime(str_pubtime, pubtime_format)
        if dt_pubtime <= dt_last:
            continue
        title = chapter.xpath(title_xpath)[0].text
        href = chapter.xpath(link_xpath)[0].text
        print(href)
        response = urlopen(href)
        try:
            content = response.read().decode('utf-8', 'ignore')
        finally:
            response.close()
        chapter_tree = lxml.html.fromstring(content)
        content_tree = chapter_tree.xpath(content_xpath)[0]

        # Drop every direct child that follows the end marker.  When the
        # marker is absent the body is kept whole instead of failing with
        # an IndexError.
        end_items = content_tree.xpath(end_xpath, namespaces=xpath_namespaces)
        last_item = end_items[0] if end_items else None
        b_del = False
        for item in list(content_tree):
            if b_del:
                content_tree.remove(item)
            if item == last_item:
                b_del = True

        for img_item in content_tree.xpath("//img"):
            if not is_ancestor(content_tree, img_item):
                continue
            img_url = img_item.get('src')
            if not img_url:
                # An <img> without src has nothing to download.
                continue
            jpg_name = re.split('/+', img_url)[-1]
            img_local = '%02d%s' % (chapter_no, jpg_name)
            print('img ' + img_url + ' local ' + img_local)
            get_image_from_url(img_url, img_local)
            img_item.set('src', img_local)
            # add the image to book
            epub_image = epub.EpubImage()
            epub_image.file_name = img_local
            try:
                with open(img_local, 'rb') as img_file:
                    epub_image.content = img_file.read()
            except (IOError, OSError):
                print('Error open %s' % img_local)
            # The item is added even when reading failed, as before, so the
            # rewritten src still has a matching entry in the book.
            book.add_item(epub_image)
        chapter_content = tostring(content_tree, encoding='unicode')
        chapter_file = 'chap_%02d.xhtml' % chapter_no

        # create chapter
        c1 = epub.EpubHtml(title=title, file_name=chapter_file, lang='hr')
        c1.content = '<html><body><h1>'+title+'</h1>'+chapter_content+'</body></html>'
        book.add_item(c1)
        chapter_tocs.append(epub.Link(chapter_file, title, title))
        book.spine.append(c1)
        chapter_no += 1

    # define Table Of Contents
    book.toc = tuple(chapter_tocs)

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

    # add CSS file
    book.add_item(nav_css)

    # write to the file
    epub.write_epub(article_title + '.epub', book, {})

Ejemplo n.º 29

0

Mostrar archivo

    def html_before_write(self, book, chapter):
        """Strip deprecated tags and non-whitelisted attributes from a chapter.

        The chapter content is parsed, every tag in ``DEPRECATED_TAGS`` is
        unwrapped, and each element in ``<head>`` and ``<body>`` keeps only
        the attributes in ``ATTRIBUTES_GLOBAL`` plus the per-tag extras
        listed below.  A few tags get extra treatment:

        * ``meta`` elements are removed from the head, as are ``title``
          elements whose text is the empty string;
        * a remote ``img`` marks the chapter with ``remote-resources`` and
          registers an ``EpubImage`` for that URL in *book*;
        * ``table``: ``border="0"`` becomes ``border=""`` and ``summary`` is
          turned into a ``<caption>``;
        * ``svg`` marks the chapter with the ``svg`` property and loses its
          ``viewbox`` / ``preserveaspectratio`` attributes;
        * ``dl`` elements keep all their attributes.

        Args:
            book: book receiving the ``EpubImage`` items for remote images.
            chapter: item whose ``content`` is rewritten in place and whose
                ``properties`` list may be extended.

        Returns:
            The serialised content (UTF-8 bytes with an XML declaration), or
            ``None`` when the content could not be parsed, in which case the
            chapter is left untouched.
        """
        from lxml import etree

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        root = tree.getroottree()

        # delete deprecated tags
        # i should really have a list of allowed tags
        for tag in DEPRECATED_TAGS:
            etree.strip_tags(root, tag)

        # Extra attributes allowed per tag, on top of ATTRIBUTES_GLOBAL.
        head_attrs = {
            'base': ['href', 'target'],
            'link': ['href', 'crossorigin', 'rel', 'media', 'hreflang',
                     'type', 'sizes'],
            'meta': ['name', 'http-equiv', 'content', 'charset'],
            'script': ['src', 'type', 'charset', 'async', 'defer',
                       'crossorigin'],
            'source': ['src', 'type', 'media'],
            'style': ['media', 'type', 'scoped'],
        }
        body_attrs = {
            'a': ['href', 'target', 'download', 'rel', 'hreflang', 'type'],
            'area': ['alt', 'coords', 'shape', 'href', 'target', 'download',
                     'rel', 'hreflang', 'type'],
            'audio': ['src', 'crossorigin', 'preload', 'autoplay',
                      'mediagroup', 'loop', 'muted', 'controls'],
            'blockquote': ['cite'],
            'button': ['autofocus', 'disabled', 'form', 'formaction',
                       'formenctype', 'formmethod', 'formnovalidate',
                       'formtarget', 'name', 'type', 'value', 'menu'],
            'canvas': ['width', 'height'],
            'col': ['span'],
            'colgroup': ['span'],
            'del': ['cite', 'datetime'],
            'details': ['open'],
            'embed': ['src', 'type', 'width', 'height'],
            # was 'disable', which is not an HTML attribute
            'fieldset': ['disabled', 'form', 'name'],
            # this list used to sit behind a second, unreachable 'details' test
            'form': ['accept-charset', 'action', 'autocomplete', 'enctype',
                     'method', 'name', 'novalidate', 'target'],
            'iframe': ['src', 'srcdoc', 'name', 'sandbox', 'seamless',
                       'allowfullscreen', 'width', 'height'],
            'img': ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width',
                    'height'],
            # 'step' and 'type' used to be fused into 'steptype' by a
            # missing comma, so both attributes were stripped
            'input': ['accept', 'alt', 'autocomplete', 'autofocus',
                      'checked', 'dirname', 'disabled', 'form', 'formaction',
                      'formenctype', 'formmethod', 'formnovalidate',
                      'formtarget', 'height', 'inputmode', 'list', 'max',
                      'maxlength', 'min', 'multiple', 'name', 'pattern',
                      'placeholder', 'readonly', 'required', 'size', 'src',
                      'step', 'type', 'value', 'width'],
            'ins': ['cite', 'datetime'],
            'keygen': ['autofocus', 'challenge', 'disabled', 'form',
                       'keytype', 'name'],
            'label': ['form', 'for'],
            'map': ['name'],
            'menu': ['type', 'label'],
            'object': ['data', 'type', 'typemustmatch', 'name', 'usemap',
                       'form', 'width', 'height'],
            'ol': ['reversed', 'start', 'type'],
            'optgroup': ['disabled', 'label'],
            'option': ['disabled', 'label', 'selected', 'value'],
            'output': ['for', 'form', 'name'],
            'param': ['name', 'value'],
            'progress': ['value', 'max'],
            'q': ['cite'],
            'select': ['autofocus', 'disabled', 'form', 'multiple', 'name',
                       'required', 'size'],
            'table': ['border', 'sortable'],
            'td': ['colspan', 'rowspan', 'headers'],
            'textarea': ['autocomplete', 'autofocus', 'cols', 'dirname',
                         'disabled', 'form', 'inputmode', 'maxlength', 'name',
                         'placeholder', 'readonly', 'required', 'rows',
                         'wrap'],
            'th': ['colspan', 'rowspan', 'headers', 'scope', 'abbr',
                   'sorted'],
            'time': ['datetime'],
            'track': ['kind', 'src', 'srclang', 'label', 'default'],
            'video': ['src', 'crossorigin', 'poster', 'preload', 'autoplay',
                      'mediagroup', 'loop', 'muted', 'controls', 'width',
                      'height'],
        }

        head = tree.find('head')

        if head is not None and len(head) != 0:
            # Walk a snapshot: elements are removed from head inside the loop.
            for _item in list(head):
                if _item.tag == 'title':
                    if _item.text == '':
                        head.remove(_item)
                    continue

                leave_only(_item,
                           ATTRIBUTES_GLOBAL + head_attrs.get(_item.tag, []))

                if _item.tag == 'meta':
                    # just remove for now, but really should not be like this
                    head.remove(_item)

        body = tree.find('body')

        # A document without <body> used to raise TypeError on len(None).
        if body is not None and len(body) != 0:
            for _item in body.iter():
                _tag = _item.tag

                if _tag == 'img':
                    _src = _item.get('src', '').lower()
                    if _src.startswith('http://') or _src.startswith(
                            'https://'):
                        if 'remote-resources' not in chapter.properties:
                            chapter.properties.append('remote-resources')
                            # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
                            # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
                            from ebooklib import epub
                            _img = epub.EpubImage(file_name=_item.get('src'))
                            book.add_item(_img)
                    leave_only(_item, ATTRIBUTES_GLOBAL + body_attrs['img'])
                elif _tag == 'table':
                    if _item.get('border') == '0':
                        _item.set('border', '')

                    _summary = _item.get('summary')
                    if _summary:
                        # add it as caption
                        _caption = etree.Element('caption', {})
                        _caption.text = _summary
                        _item.insert(0, _caption)

                        del _item.attrib['summary']

                    leave_only(_item, ATTRIBUTES_GLOBAL + body_attrs['table'])
                elif _tag == 'dl':
                    # No attribute filtering is applied to <dl>.
                    # http://html5doctor.com/the-dl-element/
                    # should be like this really
                    # some of the elements can be missing
                    # dl
                    #   dt
                    #   dd
                    #   dt
                    #   dd
                    pass
                elif _tag == 'svg':
                    # We need to add property "svg" in case we have embeded svg file
                    if 'svg' not in chapter.properties:
                        chapter.properties.append('svg')

                    if _item.get('viewbox', None):
                        del _item.attrib['viewbox']

                    if _item.get('preserveaspectratio', None):
                        del _item.attrib['preserveaspectratio']
                elif _tag in body_attrs:
                    leave_only(_item, ATTRIBUTES_GLOBAL + body_attrs[_tag])
                else:
                    # Snapshot the keys: attributes are deleted in the loop.
                    for _attr in list(six.iterkeys(_item.attrib)):
                        if _attr not in ATTRIBUTES_GLOBAL:
                            del _item.attrib[_attr]

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8',
                                         xml_declaration=True)

        return chapter.content

Ejemplo n.º 30

0

Mostrar archivo

Archivo: economist.py Proyecto: ichinaski/economist-ebook

    def build(self):
        '''Build issue, downloading articles if needed, and write ebook.

        Fetches the issue, builds every section against ``self.db`` and
        writes ``<self.id>.epub`` to the current directory.  Each section
        becomes a table-of-contents entry linking to its first article;
        articles without content are logged and skipped, and a section left
        with no articles at all is omitted from the book.  Images listed in
        ``article.images`` are read from disk and embedded.
        '''

        self.fetch_issue()
        self.info()

        for s in self.sections:
            s.build(self.db)

        book = epub.EpubBook()

        # add metadata
        book.set_title(self.title)
        book.set_identifier(self.id)
        book.set_language(self.language)
        book.add_author(self.author)

        toc = []
        spine = []

        if self.cover_img:
            cover = fetch(self.cover_img).content
            book.set_cover("image.jpg", cover)
            spine.append('cover')

        spine.append('nav')

        # Sections
        for section in self.sections:
            items = []

            for article in section.articles:
                if not article.content:
                    logging.error('%s could not be downloaded. Skipping.',
                                  article.url)
                    continue
                item = epub.EpubHtml(title=article.title,
                                     file_name='{}.xhtml'.format(
                                         article.title),
                                     lang=self.language)
                item.content = article.content

                # images were downloaded by the article, and placed
                # on disk for reference. We now add them to the book.
                for filename in article.images:
                    img = epub.EpubImage()
                    img.file_name = filename
                    with open(filename, 'rb') as f:
                        img.content = f.read()
                    book.add_item(img)
                items.append(item)

            if not items:
                # The section link below points at items[0]; with nothing
                # downloaded there is no target, so leave the section out
                # instead of failing with an IndexError.
                logging.error('Section %s has no articles. Skipping.',
                              section.title)
                continue

            for item in items:
                book.add_item(item)
            toc.append((epub.Section(section.title,
                                     href=items[0].file_name), items))
            spine.extend(items)

        book.toc = toc
        book.spine = spine

        # add navigation files
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # create epub file
        epub.write_epub('{}.epub'.format(self.id), book, {})