Exemple #1
0
def set_metadata(stream, mi):
    """Write the metadata in *mi* into the DOCX file contained in *stream*,
    updating the core (and, when present, app) properties parts in place."""
    from calibre.utils.zipfile import safe_replace
    docx = DOCX(stream, extract=False)
    core_name, app_name = docx.get_document_properties_names()
    core_raw = docx.read(core_name)
    try:
        app_raw = docx.read(app_name)
    except Exception:
        app_raw = None  # the extended (app) properties part is optional
    core_props = safe_xml_fromstring(core_raw)
    update_doc_props(core_props, mi, docx.namespace)
    extra = {}
    if app_raw is not None:
        app_props = safe_xml_fromstring(app_raw)
        company = app_props.makeelement('{%s}Company' % docx.namespace.namespaces['ep'])
        # Remove any pre-existing Company elements before appending ours
        for existing in [child for child in app_props if child.tag == company.tag]:
            app_props.remove(existing)
        company.text = mi.publisher
        app_props.append(company)
        extra[app_name] = BytesIO(xml2str(app_props))
    stream.seek(0)
    safe_replace(stream,
                 core_name,
                 BytesIO(xml2str(core_props)),
                 extra_replacements=extra)
Exemple #2
0
    def _read_opf(self):
        """Read, decode and parse the OPF package document, applying
        progressively more aggressive cleanup when strict parsing fails.

        Returns the cleaned lxml root element of the OPF document.
        Raises OEBError if the root element's namespace is unexpected.
        """
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        # Strip the XML declaration and normalize the legacy OEB 1.0 namespace
        data = XMLDECL_RE.sub('', data)
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                      OPF1_NS, data)
        try:
            opf = safe_xml_fromstring(data)
        except etree.XMLSyntaxError:
            # First fallback: replace invalid HTML named entities
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = safe_xml_fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                # Second fallback: drop the (unused) <tours> section and
                # declare the dc: prefix on <dc-metadata>, both common
                # sources of malformed OPF 1.0 files
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace(
                    '<dc-metadata>',
                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">'
                )
                opf = safe_xml_fromstring(data)
                self.logger.warn('OPF contains invalid tours section')

        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf
Exemple #3
0
def html_to_lxml(raw):
    """Parse an HTML fragment into a single XHTML <div> element.

    Tries a strict XML round-trip first; on failure strips namespaced
    attributes (the usual culprit) and retries, finally falling back to the
    lenient HTML4 parser.
    """
    raw = '<div>%s</div>' % raw
    root = parse(raw,
                 keep_doctype=False,
                 namespace_elements=False,
                 maybe_xhtml=False,
                 sanitize_names=True)
    root = next(root.iterdescendants('div'))
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding='unicode')
    try:
        return safe_xml_fromstring(raw, recover=False)
    except Exception:
        # NOTE: narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        # Namespaced attributes are the most common cause of strict-parse
        # failure; drop them and retry.
        for x in root.iterdescendants():
            for attr in [a for a in x.attrib if ':' in a]:
                del x.attrib[attr]
        raw = etree.tostring(root, encoding='unicode')
        try:
            return safe_xml_fromstring(raw, recover=False)
        except Exception:
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
Exemple #4
0
def parse_outline(raw, output_dir):
    """Parse the XML outline emitted by pdftohtml and, when it yields more
    than two TOC entries, write a toc.ncx file into *output_dir*.
    """
    from lxml import etree
    from calibre.utils.xml_parse import safe_xml_fromstring
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = safe_xml_fromstring(raw).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]  # mutable counter shared with the nested function

        def process_node(node, toc):
            # Recursively mirror <outline> nesting as TOC nesting; a nested
            # outline becomes a child of the most recently added entry.
            for child in node.iterchildren('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    if child.text:
                        page = child.get('page', '1')
                        toc.add(child.text, 'index.html', 'p' + page)
                        count[0] += 1
        process_node(outline, toc)
        # A one- or two-entry TOC is not worth generating
        if count[0] > 2:
            root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True))
    def __init__(self, xml, opts, log):
        """Parse the pdftohtml XML document in *xml* and run the full
        layout-analysis pipeline over its pages.
        """
        self.opts, self.log = opts, log
        self.root = safe_xml_fromstring(xml)
        idc = count()  # shared id generator used by all pages

        self.fonts = []     # Font objects, in document order
        self.font_map = {}  # font id -> Font

        for spec in self.root.xpath('//font'):
            self.fonts.append(Font(spec))
            self.font_map[self.fonts[-1].id] = self.fonts[-1]

        self.pages = []     # Page objects, in document order
        self.page_map = {}  # page id -> Page

        for page in self.root.xpath('//page'):
            page = Page(page, self.font_map, opts, log, idc)
            self.page_map[page.id] = page
            self.pages.append(page)

        # Font statistics must be gathered before the per-page passes,
        # which consult the document-wide size distribution
        self.collect_font_statistics()

        for page in self.pages:
            page.document_font_stats = self.font_size_stats
            page.first_pass()
            page.second_pass()

        self.linearize()
        self.render()
Exemple #6
0
def beautify_text(raw, syntax):
    """Pretty-print *raw* source according to *syntax* ('xml', 'css', or
    anything else is treated as HTML) and return a unicode string.
    """
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'xml':
        root = safe_xml_fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    elif syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_css_parser_serialization
        from css_parser import CSSParser, log
        setup_css_parser_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser = CSSParser(
            loglevel=logging.WARNING,
            # We don't care about @import rules
            fetcher=lambda x: (None, None),
            log=_css_logger)
        data = parser.parseString(raw, href='<string>', validate=False)
        # CSS is serialized directly; it never reaches the tostring() below
        return serialize(data, 'text/css')
    else:
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    return etree.tostring(root, encoding='unicode')
Exemple #7
0
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    """Parse (X)HTML source into an lxml tree.

    A strict XML parse is attempted first; any failure (or a non-XHTML root)
    falls back to the HTML5 tag-soup parser.

    :param raw: str or bytes source
    :param decoder: optional callable used to decode bytes; defaults to
        xml_to_unicode
    :param linenumber_attribute: when set, every parsed element gets this
        attribute containing its source line number
    :param force_html5_parse: skip the strict XML attempt entirely
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
Exemple #8
0
def parse_xcu(raw, origin='%origin%'):
    """Get the dictionary and affix file names as well as the supported
    locales for each spelling dictionary described in a LibreOffice .xcu file.

    Returns a dict mapping (dic_path, aff_path) -> list of locale codes.
    Occurrences of the literal token %origin% in paths are replaced with
    *origin*.
    """
    ans = {}
    root = safe_xml_fromstring(raw)

    for node in XPath(
            '//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(
                root):
        value = XPath('descendant::prop[@oor:name="Locations"]/value')(node)
        if len(value[0]) == 0:
            # The value node has no children, use its text
            paths = ''.join(
                XPath('descendant::prop[@oor:name="Locations"]/value/text()')(
                    node)).replace('%origin%', origin).split()
        else:
            # Use the text of the value nodes children
            paths = [
                c.text.replace('%origin%', origin) for v in value
                for c in v.iterchildren('*') if c.text
            ]
        # Normalize order so the .aff path is first, the .dic path second
        aff, dic = paths if paths[0].endswith('.aff') else reversed(paths)
        locales = ''.join(
            XPath('descendant::prop[@oor:name="Locales"]/value/text()')(
                node)).split()
        ans[(dic, aff)] = locales
    return ans
Exemple #9
0
    def mlize_spine(self):
        """Convert every spine item (plus any out-of-spine title page) to RTF
        markup and return the assembled document text.
        """
        from calibre.ebooks.oeb.base import XHTML
        from calibre.ebooks.oeb.stylizer import Stylizer
        from calibre.utils.xml_parse import safe_xml_fromstring
        output = self.header()
        if 'titlepage' in self.oeb_book.guide:
            href = self.oeb_book.guide['titlepage'].href
            item = self.oeb_book.manifest.hrefs[href]
            # Only dump the title page here if it is not already in the spine
            if item.spine_position is None:
                stylizer = Stylizer(item.data, item.href, self.oeb_book,
                        self.opts, self.opts.output_profile)
                self.currently_dumping_item = item
                output += self.dump_text(item.data.find(XHTML('body')), stylizer)
                output += r'{\page }'
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to RTF markup...' % item.href)
            # Removing comments is needed as comments with -- inside them can
            # cause fromstring() to fail
            content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
            content = self.remove_newlines(content)
            content = self.remove_tabs(content)
            content = safe_xml_fromstring(content)
            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            self.currently_dumping_item = item
            output += self.dump_text(content.find(XHTML('body')), stylizer)
            output += r'{\page }'
        output += self.footer()
        output = self.insert_images(output)
        output = self.clean_text(output)

        return output
Exemple #10
0
def parse_xmp_packet(raw_bytes):
    """Parse a raw XMP packet (bytes) and return the XML root element.

    The packet's encoding is detected from the byte-order mark declared in
    the <?xpacket begin=...?> processing instruction; UTF-8 is assumed when
    no header is found.
    """
    raw_bytes = raw_bytes.strip()
    enc = None
    # The '?' must be escaped: the original pattern r'<?xpacket...' made the
    # '?' quantify the preceding '<', so the '<' was matched only by accident
    # of re.search backtracking.
    pat = r'''<\?xpacket\s+[^>]*?begin\s*=\s*['"]([^'"]*)['"]'''
    encodings = ('8', '16-le', '16-be', '32-le', '32-be')
    header = raw_bytes[:1024]
    # Map each encoding's BOM byte sequence back to the encoding name
    emap = {'\ufeff'.encode('utf-'+x):'utf-'+x for x in encodings}
    emap[b''] = 'utf-8'
    for q in encodings:
        m = re.search(pat.encode('utf-'+q), header)
        if m is not None:
            enc = emap.get(m.group(1), enc)
            break
    if enc is None:
        return safe_xml_fromstring(raw_bytes)
    raw = _xml_declaration.sub('', raw_bytes.decode(enc))  # lxml barfs if encoding declaration present in unicode string
    return safe_xml_fromstring(raw)
Exemple #11
0
 def fix_markup(self, html, log):
     """Clean up converted HTML (CSS filtering/extraction, EPUB fixes, list
     start values) and return it serialized as UTF-8 bytes with declaration."""
     tree = safe_xml_fromstring(html)
     for fixup in (self.filter_css, self.extract_css,
                   self.epubify_markup, self.apply_list_starts):
         fixup(tree, log)
     return etree.tostring(tree, encoding='utf-8', xml_declaration=True)
Exemple #12
0
    def mlize(self):
        """Convert the current spine item into SNB chapter trees.

        Builds one <snbc> skeleton per (subitem, subtitle) pair, renders the
        item's body into intermediate tagged text, then distributes the text
        lines into the matching chapter bodies. Returns the dict of trees.
        """
        from calibre.ebooks.oeb.base import XHTML
        from calibre.ebooks.oeb.stylizer import Stylizer
        from calibre.utils.xml_parse import safe_xml_fromstring
        output = [u'']
        stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book,
                            self.opts, self.opts.output_profile)
        content = etree.tostring(self.item.data.find(XHTML('body')),
                                 encoding='unicode')
        #        content = self.remove_newlines(content)
        trees = {}
        # One skeleton <snbc> (head with title + empty body) per sub-item
        for subitem, subtitle in self.subitems:
            snbcTree = etree.Element("snbc")
            snbcHead = etree.SubElement(snbcTree, "head")
            etree.SubElement(snbcHead, "title").text = subtitle
            if self.opts and self.opts.snb_hide_chapter_name:
                etree.SubElement(snbcHead, "hidetitle").text = "true"
            etree.SubElement(snbcTree, "body")
            trees[subitem] = snbcTree
        # Leading bookmark tag marks the start of the first (unnamed) sub-item
        output.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
        output += self.dump_text(self.subitems, safe_xml_fromstring(content),
                                 stylizer)[0]
        output = self.cleanup_text(''.join(output))

        subitem = ''
        bodyTree = trees[subitem].find(".//body")
        for line in output.splitlines():
            pos = line.find(CALIBRE_SNB_PRE_TAG)
            if pos == -1:
                # Plain line: strip whitespace (incl. ideographic space U+3000)
                line = line.strip(' \t\n\r\u3000')
            else:
                # Preformatted line: emit verbatim as CDATA and move on
                etree.SubElement(bodyTree, "text").text = \
                    etree.CDATA(line[pos+len(CALIBRE_SNB_PRE_TAG):])
                continue
            if len(line) != 0:
                if line.find(CALIBRE_SNB_IMG_TAG) == 0:
                    # Image reference: prefix with the item's directory name
                    prefix = ProcessFileName(os.path.dirname(self.item.href))
                    if prefix != '':
                        etree.SubElement(bodyTree, "img").text = \
                            prefix + '_' + line[len(CALIBRE_SNB_IMG_TAG):]
                    else:
                        etree.SubElement(bodyTree, "img").text = \
                            line[len(CALIBRE_SNB_IMG_TAG):]
                elif line.find(CALIBRE_SNB_BM_TAG) == 0:
                    # Bookmark tag: switch output to the named sub-item's body
                    subitem = line[len(CALIBRE_SNB_BM_TAG):]
                    bodyTree = trees[subitem].find(".//body")
                else:
                    # Ordinary text line, optionally indented with two
                    # ideographic spaces (CJK paragraph convention)
                    if self.opts and not self.opts.snb_dont_indent_first_line:
                        prefix = '\u3000\u3000'
                    else:
                        prefix = ''
                    etree.SubElement(bodyTree, "text").text = \
                        etree.CDATA(unicode_type(prefix + line))
                if self.opts and self.opts.snb_insert_empty_line:
                    etree.SubElement(bodyTree, "text").text = \
                        etree.CDATA('')

        return trees
Exemple #13
0
    def read_ncx_toc(self, toc, root=None):
        """Populate this TOC from an NCX file.

        :param toc: path to the NCX file (its directory is used to resolve
            relative hrefs)
        :param root: optional pre-parsed root element; when None the file is
            read and parsed here
        """
        self.base_path = os.path.dirname(toc)
        if root is None:
            with open(toc, 'rb') as f:
                raw = xml_to_unicode(f.read(),
                                     assume_utf8=True,
                                     strip_encoding_pats=True)[0]
            root = safe_xml_fromstring(raw)
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            # Attribute lookup tolerant of namespace prefixes and case
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        # Case-insensitive, namespace-agnostic matchers for NCX elements
        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            # Add one navPoint (and, recursively, its children) under dest
            try:
                play_order = int(get_attr(np, 1))
            except:
                play_order = 1
            href = fragment = text = None
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = ''
                for txt in txt_path(nl):
                    text += etree.tostring(txt,
                                           method='text',
                                           encoding='unicode',
                                           with_tail=False)
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # if get_attr(content, attr='src'):
                    purl = urlparse(content.get('src'))
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]

        for child in np_path(nm):
            process_navpoint(child, self)
Exemple #14
0
def pretty_all_xml_in_dir(path):
    """Pretty-print every .xml and .rels file under *path*, rewriting each
    file in place. Empty files are left untouched."""
    for fname in walk(path):
        if not (fname.endswith('.xml') or fname.endswith('.rels')):
            continue
        with open(fname, 'r+b') as stream:
            data = stream.read()
            if not data:
                continue
            tree = safe_xml_fromstring(data)
            stream.seek(0)
            stream.truncate()
            stream.write(etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True))
Exemple #15
0
def _html4_parse(data):
    """Parse tag soup with lxml.html and round-trip the result through the
    strict XML parser, returning the XML root element."""
    root = html.fromstring(data)
    root.attrib.pop('xmlns', None)
    # '--' inside a comment is not well-formed XML, so strip stray dashes
    for comment in root.iter(tag=etree.Comment):
        if comment.text:
            comment.text = comment.text.strip('-')
    serialized = etree.tostring(root, encoding='unicode')

    return safe_xml_fromstring(serialized)
Exemple #16
0
 def __init__(self, raw):
     """Build a map of encrypted-resource URI -> encryption algorithm from
     the raw encryption.xml contents (may be empty/None)."""
     self.entries = {}
     self.root = safe_xml_fromstring(raw) if raw else None
     if self.root is None:
         return
     for method in self.root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
         algorithm = method.get('Algorithm', '')
         refs = method.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')
         if not refs:
             continue
         uri = refs[0].get('URI', '')
         if uri and algorithm:
             self.entries[uri] = algorithm
Exemple #17
0
def html_to_lxml(raw):
    """Parse an HTML fragment into a single XHTML <div> element.

    Attempts a strict XML round-trip; on failure strips namespaced
    attributes (the usual culprit) and retries, finally falling back to the
    lenient HTML4 parser.
    """
    raw = '<div>%s</div>' % raw
    root = html.fragment_fromstring(raw)
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding=None)
    try:
        return safe_xml_fromstring(raw, recover=False)
    except Exception:
        # NOTE: narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        # Namespaced attributes are the most common cause of strict-parse
        # failure; drop them and retry.
        for x in root.iterdescendants():
            for attr in [a for a in x.attrib if ':' in a]:
                del x.attrib[attr]
        raw = etree.tostring(root, encoding=None)
        try:
            return safe_xml_fromstring(raw, recover=False)
        except Exception:
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
Exemple #18
0
def extract_cover_from_embedded_svg(html, base, log):
    """If the document consists of exactly one <svg> wrapping exactly one
    <image>, return that image as a raster cover; otherwise return None."""
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring
    root = safe_xml_fromstring(html)

    svgs = XPath('//svg:svg')(root)
    if len(svgs) != 1:
        return
    svg = svgs[0]
    if len(svg) != 1 or svg[0].tag != SVG('image'):
        return
    href = svg[0].get(XLINK('href'), None)
    if href:
        path = os.path.join(base, *href.split('/'))
        return return_raster_image(path)
Exemple #19
0
    def fb2mlize_spine(self):
        """Assemble the complete FB2 document (header, text, images, footer)
        and return it with an XML declaration prepended."""
        parts = [
            self.fb2_header(),
            self.get_text(),
            self.fb2mlize_images(),
            self.fb2_footer(),
        ]
        text = self.clean_text('\n'.join(parts))

        if self.opts.pretty_print:
            text = etree.tostring(safe_xml_fromstring(text), encoding='unicode', pretty_print=True)

        return '<?xml version="1.0" encoding="UTF-8"?>\n' + text
Exemple #20
0
def parse_opf(stream_or_path):
    """Parse an OPF document and return its root element.

    :param stream_or_path: an open binary stream or a filesystem path.
        When a path is given, the file is opened and closed here (the
        original implementation leaked the file handle).
    :raises ValueError: for an empty file or a document with no root.
    """
    stream = stream_or_path
    opened_here = not hasattr(stream, 'read')
    if opened_here:
        stream = open(stream, 'rb')
    try:
        raw = stream.read()
    finally:
        if opened_here:
            stream.close()
    if not raw:
        raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    # Drop any garbage before the first tag
    raw = raw[raw.find('<'):]
    root = safe_xml_fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root
def open_search(url, query, max_results=10, timeout=60):
    """Search the Standard Ebooks OPDS catalog, yielding up to *max_results*
    SearchResult objects.
    """
    url_template = 'https://standardebooks.org/opds/all?query={searchTerms}'
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = safe_xml_fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if not (rel and href and type):
                    continue
                if ('http://opds-spec.org/thumbnail' in rel or
                        'http://opds-spec.org/image/thumbnail' in rel):
                    s.cover_url = 'https://standardebooks.org' + href
                elif 'http://opds-spec.org/acquisition' in rel:
                    # Use the LAST extension: the original split('.')[1]
                    # picked the first one (wrong for e.g. name.kepub.epub)
                    # and raised IndexError for hrefs without a dot.
                    ext = href.rpartition('.')[2]
                    if ext and ext != href:
                        s.downloads[
                            ext.upper().strip()] = 'https://standardebooks.org' + href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()

            yield s
Exemple #22
0
    def get_text(self):
        """Convert every spine item to PML markup and return the concatenated
        result as a single string."""
        from calibre.ebooks.oeb.stylizer import Stylizer
        from calibre.ebooks.oeb.base import XHTML

        pieces = ['']
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to PML markup...' % item.href)
            markup = self.prepare_text(etree.tostring(item.data, encoding='unicode'))
            tree = safe_xml_fromstring(markup)
            stylizer = Stylizer(tree, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            pieces.append(self.add_page_anchor(item))
            pieces += self.dump_text(tree.find(XHTML('body')), stylizer, item)
        return ''.join(pieces)
Exemple #23
0
 def __init__(self, stream=None):
     """Parse the OCF container.xml in *stream*, mapping each rootfile's
     media-type to its full-path. A falsy stream leaves the mapping empty."""
     if not stream:
         return
     container = safe_xml_fromstring(stream.read())
     if container.get('version', None) != '1.0':
         raise EPubException("unsupported version of OCF")
     rootfiles = container.xpath('./*[local-name()="rootfiles"]')
     if not rootfiles:
         raise EPubException("<rootfiles/> element missing")
     for rootfile in rootfiles[0].xpath('./*[local-name()="rootfile"]'):
         media_type = rootfile.get('media-type')
         full_path = rootfile.get('full-path')
         if not (media_type and full_path):
             raise EPubException("<rootfile/> element malformed")
         self[media_type] = full_path
Exemple #24
0
def search(browser, url, timeout=60):
    """Recursively walk an OPDS catalog starting at *url*, yielding parsed
    books; non-book entries are followed via their links."""
    with closing(browser.open(url, timeout=timeout)) as response:
        feed = safe_xml_fromstring(response.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if is_book(entry):
                yield parse_book(entry, url)
                continue
            for link in entry.xpath('./*[local-name() = "link"]'):
                href = link.get('href')
                mimetype = link.get('type')
                if href and mimetype:
                    for book in search(browser, urljoin(url, href), timeout):
                        yield book
Exemple #25
0
def get_metadata(f):
    """Read a MetaInformation object from an LRX file opened in binary mode.

    :raises ValueError: when the stream is not a supported LRX file or its
        metadata section is malformed.
    """
    def read(at, amount):
        # Named helper instead of the original `read = lambda ...` (PEP 8 E731)
        return _read(f, at, amount)
    f.seek(0)
    buf = f.read(12)
    if buf[4:] == b'ftypLRX2':
        # Walk the atom chain until the 'bbeb' (LRF payload) atom is found
        offset = 0
        while True:
            offset += word_be(buf[:4])
            try:
                buf = read(offset, 8)
            except Exception:
                raise ValueError('Not a valid LRX file')
            if buf[4:] == b'bbeb':
                break
        offset += 8
        buf = read(offset, 16)
        if buf[:8].decode('utf-16-le') != 'LRF\x00':
            raise ValueError('Not a valid LRX file')
        lrf_version = word_le(buf[8:12])
        offset += 0x4c
        compressed_size = short_le(read(offset, 2))
        offset += 2
        if lrf_version >= 800:
            offset += 6
        compressed_size -= 4  # skip the 4-byte uncompressed-size prefix
        uncompressed_size = word_le(read(offset, 4))
        info = decompress(f.read(compressed_size))
        if len(info) != uncompressed_size:
            raise ValueError('LRX file has malformed metadata section')
        root = safe_xml_fromstring(info)
        bi = root.find('BookInfo')
        title = bi.find('Title')
        title_sort = title.get('reading', None)
        title = title.text
        author = bi.find('Author')
        author_sort = author.get('reading', None)
        mi = MetaInformation(title, string_to_authors(author.text))
        mi.title_sort, mi.author_sort = title_sort, author_sort
        author = author.text
        publisher = bi.find('Publisher')
        mi.publisher = getattr(publisher, 'text', None)
        mi.tags = [x.text for x in bi.findall('Category')]
        mi.language = root.find('DocInfo').find('Language').text
        return mi

    elif buf[4:8] == b'LRX':
        raise ValueError('Librie LRX format not supported')
    else:
        raise ValueError('Not a LRX file')
Exemple #26
0
def ensure_namespace(doc):
    """Workaround for broken FB2 files produced by convertonlinefree.com. See
    https://bugs.launchpad.net/bugs/1404701

    Such files declare an empty xmlns on <description>/<body>, leaving those
    tags outside the FB2 namespace. When detected, the empty declarations
    are stripped and the document is re-parsed.
    """
    # The original version shadowed its loop variable (`for x in ...:
    # for x in doc.findall(x)`) and its inner `break` did not stop the
    # outer scan; any() expresses the same check without either problem.
    bare_tags = any(
        '{' not in el.tag
        for tag in ('description', 'body')
        for el in doc.findall(tag)
    )
    if bare_tags:
        import re
        raw = etree.tostring(doc, encoding='unicode')
        raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
        doc = safe_xml_fromstring(raw)
    return doc
Exemple #27
0
    def insert_cover(self):
        """Generate a titlepage XHTML item wrapping the cover image (unless
        the book already provides one) and insert it at the start of the
        spine, updating guide and TOC references to point at it.
        """
        from calibre.ebooks.oeb.base import urldefrag
        g, m = self.oeb.guide, self.oeb.manifest
        item = None
        if 'titlepage' not in g:
            if 'cover' in g:
                href = g['cover'].href
            else:
                href = self.default_cover()
            if href is None:
                return
            width, height = self.inspect_cover(href)
            if width == -1 or height == -1:
                self.log.warning('Failed to read cover dimensions')
                width, height = 600, 800
            # if self.preserve_aspect_ratio:
            #    width, height = 600, 800
            # Fill in the placeholders in the SVG wrapper template
            self.svg_template = self.svg_template.replace(
                '__viewbox__', '0 0 %d %d' % (width, height))
            self.svg_template = self.svg_template.replace(
                '__width__', unicode_type(width))
            self.svg_template = self.svg_template.replace(
                '__height__', unicode_type(height))

            if href is not None:
                # Plain <img> template when SVG covers are disabled
                templ = self.non_svg_template if self.no_svg_cover \
                        else self.svg_template
                tp = templ % unquote(href)
                id, href = m.generate('titlepage', 'titlepage.xhtml')
                item = m.add(id,
                             href,
                             guess_type('t.xhtml')[0],
                             data=safe_xml_fromstring(tp))
        else:
            # A titlepage already exists; reuse its manifest item
            item = self.oeb.manifest.hrefs[urldefrag(
                self.oeb.guide['titlepage'].href)[0]]
        if item is not None:
            self.oeb.spine.insert(0, item, True)
            if 'cover' not in self.oeb.guide.refs:
                self.oeb.guide.add('cover', 'Title page', 'a')
            self.oeb.guide.refs['cover'].href = item.href
            if 'titlepage' in self.oeb.guide.refs:
                self.oeb.guide.refs['titlepage'].href = item.href
            # Keep any TOC entry that pointed at the old cover in sync
            titem = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
            if titem is not None:
                titem.href = item.href
Exemple #28
0
 def __init__(self):
     """Load the recipe scheduler configuration from scheduler.xml, falling
     back to an empty collection on any parse failure, or migrating the old
     pickle-based config when only that exists."""
     from calibre.utils.config import config_dir
     from calibre.utils.lock import ExclusiveFile
     self.conf_path = os.path.join(config_dir, 'scheduler.xml')
     old_conf_path = os.path.join(config_dir, 'scheduler.pickle')
     self.root = E.recipe_collection()
     self.lock = RLock()
     if os.access(self.conf_path, os.R_OK):
         with ExclusiveFile(self.conf_path) as f:
             try:
                 self.root = safe_xml_fromstring(f.read())
             except Exception:
                 # Narrowed from a bare except; corrupt config falls back
                 # to the empty collection, with a traceback for debugging
                 print('Failed to read recipe scheduler config')
                 import traceback
                 traceback.print_exc()
     elif os.path.exists(old_conf_path):
         self.migrate_old_conf(old_conf_path)
Exemple #29
0
def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
    """Import all spelling dictionaries from a LibreOffice .oxt extension.

    :param source_path: path to the .oxt file (a zip archive)
    :param name: human-readable name stored in each dictionary's metadata
    :param dest_dir: target directory (defaults to <config_dir>/dictionaries)
    :param prefix: prefix for the per-dictionary temp directories
    :return: the number of dictionaries imported
    """
    from calibre.spell.dictionary import parse_lang_code
    dest_dir = dest_dir or os.path.join(config_dir, 'dictionaries')
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    num = 0
    with ZipFile(source_path) as zf:

        def read_file(key):
            try:
                return zf.open(key).read()
            except KeyError:
                # Some dictionaries apparently put the xcu in a sub-directory
                # and incorrectly make paths relative to that directory instead
                # of the root, for example:
                # http://extensions.libreoffice.org/extension-center/italian-dictionary-thesaurus-hyphenation-patterns/releases/4.1/dict-it.oxt
                while key.startswith('../'):
                    key = key[3:]
                return zf.open(key.lstrip('/')).read()

        # The manifest points at the .xcu file describing the dictionaries
        root = safe_xml_fromstring(zf.open('META-INF/manifest.xml').read())
        xcu = XPath(
            '//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]'
        )(root)[0].get('{%s}full-path' % NS_MAP['manifest'])
        for (dic, aff), locales in iteritems(
                parse_xcu(zf.open(xcu).read(), origin='')):
            dic, aff = dic.lstrip('/'), aff.lstrip('/')
            d = tempfile.mkdtemp(prefix=prefix, dir=dest_dir)
            # Keep only locales with a valid country code, de-duplicated
            locales = uniq([
                x for x in map(fill_country_code, locales)
                if parse_lang_code(x).countrycode
            ])
            if not locales:
                continue
            metadata = [name] + list(locales)
            with open(os.path.join(d, 'locales'), 'wb') as f:
                f.write(('\n'.join(metadata)).encode('utf-8'))
            dd, ad = convert_to_utf8(read_file(dic), read_file(aff))
            with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
                f.write(dd)
            with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
                f.write(ad)
            num += 1
    return num
def import_opml(raw, preserve_groups=True):
    """Parse an OPML document and yield Group(title, feeds) tuples, one per
    group, sorted by group title. Feeds are (title_or_None, url) pairs."""
    root = safe_xml_fromstring(raw)
    groups = defaultdict(list)
    named_ancestors = etree.XPath('ancestor::outline[@title or @text]')
    for outline in root.xpath('//outline[@type="rss" and @xmlUrl]'):
        url = outline.get('xmlUrl')
        parent = outline.get('title', '') or url
        title = parent if ('title' in outline.attrib and parent) else None
        if preserve_groups:
            # Nearest non-rss ancestor with a usable label becomes the group
            parent = next(
                (a.get('title') or a.get('text')
                 for a in named_ancestors(outline)
                 if a.get('type', None) != 'rss' and (a.get('title') or a.get('text'))),
                parent)
        groups[parent].append((title, url))

    for group_title in sorted(groups, key=sort_key):
        yield Group(group_title, uniq(groups[group_title], kmap=itemgetter(1)))