def set_metadata(stream, mi):
    """Update the core and extended document properties of a DOCX ``stream``
    in place, using the metadata object ``mi``."""
    from calibre.utils.zipfile import safe_replace
    container = DOCX(stream, extract=False)
    dp_name, ap_name = container.get_document_properties_names()
    dp_raw = container.read(dp_name)
    try:
        ap_raw = container.read(ap_name)
    except Exception:
        ap_raw = None  # the extended properties part is optional
    core_props = safe_xml_fromstring(dp_raw)
    update_doc_props(core_props, mi, container.namespace)
    replacements = {}
    if ap_raw is not None:
        app_props = safe_xml_fromstring(ap_raw)
        company = app_props.makeelement('{%s}Company' % container.namespace.namespaces['ep'])
        # Remove any pre-existing Company elements before appending ours
        for child in tuple(app_props):
            if child.tag == company.tag:
                app_props.remove(child)
        company.text = mi.publisher
        app_props.append(company)
        replacements[ap_name] = BytesIO(xml2str(app_props))
    stream.seek(0)
    safe_replace(stream, dp_name, BytesIO(xml2str(core_props)), extra_replacements=replacements)
def _read_opf(self):
    """Read, decode and parse this book's OPF document, recovering from
    common forms of damage, and return the cleaned root element.

    Raises OEBError if the document namespace is not a recognized OPF
    namespace.
    """
    data = self.oeb.container.read(None)
    data = self.oeb.decode(data)
    # Strip the XML declaration: lxml rejects declarations in unicode input
    data = XMLDECL_RE.sub('', data)
    # Normalize the legacy OEB 1.0 namespace URI to the canonical OPF1 form
    data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)', OPF1_NS, data)
    try:
        opf = safe_xml_fromstring(data)
    except etree.XMLSyntaxError:
        # First recovery attempt: HTML named entities are invalid in XML,
        # replace them (and any invalid chars) and re-parse
        data = xml_replace_entities(clean_xml_chars(data), encoding=None)
        try:
            opf = safe_xml_fromstring(data)
            self.logger.warn('OPF contains invalid HTML named entities')
        except etree.XMLSyntaxError:
            # Second recovery attempt: drop the (deprecated) <tours> section
            # and declare the dc prefix that broken OPF 1.0 files rely on
            data = re.sub(r'(?is)<tours>.+</tours>', '', data)
            data = data.replace('<dc-metadata>', '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
            opf = safe_xml_fromstring(data)
            self.logger.warn('OPF contains invalid tours section')
    ns = namespace(opf.tag)
    if ns not in ('', OPF1_NS, OPF2_NS):
        raise OEBError('Invalid namespace %r for OPF document' % ns)
    opf = self._clean_opf(opf)
    return opf
def html_to_lxml(raw):
    """Parse an HTML fragment into an XHTML ``<div>`` element.

    Tries strict XML parsing first; on failure strips namespaced attributes
    (a common cause of parse errors) and retries; finally falls back to the
    lenient HTML4 parser.

    :param raw: markup fragment (text)
    :return: lxml element for the wrapping div
    """
    raw = '<div>%s</div>' % raw
    root = parse(raw, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True)
    root = next(root.iterdescendants('div'))
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding='unicode')
    try:
        return safe_xml_fromstring(raw, recover=False)
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt. Only parse failures should trigger the fallback.
        for x in root.iterdescendants():
            # Remove prefixed attributes, which strict XML parsing rejects
            for attr in tuple(x.attrib):
                if ':' in attr:
                    del x.attrib[attr]
        raw = etree.tostring(root, encoding='unicode')
        try:
            return safe_xml_fromstring(raw, recover=False)
        except Exception:
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
def parse_outline(raw, output_dir):
    """Build a toc.ncx file in ``output_dir`` from the first <outline>
    element of pdftohtml's XML output, but only when the outline yields
    more than two entries."""
    from lxml import etree
    from calibre.utils.xml_parse import safe_xml_fromstring
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    matches = safe_xml_fromstring(raw).xpath('(//outline)[1]')
    if not matches:
        return
    from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
    toc = TOC()
    entries = [0]

    def walk(node, parent):
        # Nested <outline> elements attach beneath the most recent entry
        for child in node.iterchildren('*'):
            if child.tag == 'outline':
                walk(child, parent.children[-1] if parent.children else parent)
            elif child.text:
                page = child.get('page', '1')
                parent.add(child.text, 'index.html', 'p' + page)
                entries[0] += 1

    walk(matches[0], toc)
    if entries[0] > 2:
        ncx = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en', 'pdftohtml')
        with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
            f.write(etree.tostring(ncx, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True))
def __init__(self, xml, opts, log):
    """Parse pdftohtml XML output and run the full layout pipeline:
    collect fonts and pages, gather font statistics, run both per-page
    passes, then linearize and render."""
    self.opts, self.log = opts, log
    self.root = safe_xml_fromstring(xml)
    idc = count()
    self.fonts = []
    self.font_map = {}
    for spec in self.root.xpath('//font'):
        font = Font(spec)
        self.fonts.append(font)
        self.font_map[font.id] = font
    self.pages = []
    self.page_map = {}
    for elem in self.root.xpath('//page'):
        page = Page(elem, self.font_map, opts, log, idc)
        self.page_map[page.id] = page
        self.pages.append(page)
    self.collect_font_statistics()
    for page in self.pages:
        # Pages need document-wide font stats before their analysis passes
        page.document_font_stats = self.font_size_stats
        page.first_pass()
        page.second_pass()
    self.linearize()
    self.render()
def beautify_text(raw, syntax):
    """Pretty-print source text and return it as unicode.

    ``syntax`` selects the handling: 'xml' and HTML are re-serialized from
    a parsed tree; 'css' goes through css_parser with editor settings."""
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_css_parser_serialization
        from css_parser import CSSParser, log
        setup_css_parser_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        # We dont care about @import rules, so the fetcher returns nothing
        parser = CSSParser(loglevel=logging.WARNING, fetcher=lambda x: (None, None), log=_css_logger)
        sheet = parser.parseString(raw, href='<string>', validate=False)
        return serialize(sheet, 'text/css')
    if syntax == 'xml':
        tree = safe_xml_fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(tree)
    else:
        tree = parse(raw, line_numbers=False)
        pretty_html_tree(None, tree)
    return etree.tostring(tree, encoding='unicode')
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    """Parse HTML/XHTML markup into an lxml tree.

    Tries a strict XML parse first (validating the XHTML root); on any
    failure falls back to the HTML5 tag-soup parser.

    :param raw: markup as bytes or text
    :param decoder: optional callable to decode bytes (default: xml_to_unicode)
    :param linenumber_attribute: if set, each element gets this attribute
        with its source line number
    :param force_html5_parse: skip the XML attempt entirely
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # drop NUL chars left by &#0; references
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break
    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
def parse_xcu(raw, origin='%origin%'):
    """Get the dictionary and affix file names as well as supported locales
    for each spelling dictionary declared in a LibreOffice .xcu file.

    :return: dict mapping (dic_path, aff_path) -> list of locale strings
    """
    result = {}
    root = safe_xml_fromstring(raw)
    dict_nodes = XPath('//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(root)
    for node in dict_nodes:
        value = XPath('descendant::prop[@oor:name="Locations"]/value')(node)
        if len(value[0]) == 0:
            # The value node has no children; its text holds the paths
            raw_paths = ''.join(XPath('descendant::prop[@oor:name="Locations"]/value/text()')(node))
            paths = raw_paths.replace('%origin%', origin).split()
        else:
            # Otherwise each child element carries one path in its text
            paths = [c.text.replace('%origin%', origin) for v in value for c in v.iterchildren('*') if c.text]
        if paths[0].endswith('.aff'):
            aff, dic = paths
        else:
            dic, aff = paths
        locales = ''.join(XPath('descendant::prop[@oor:name="Locales"]/value/text()')(node)).split()
        result[(dic, aff)] = locales
    return result
def mlize_spine(self):
    """Convert every spine item (plus an out-of-spine titlepage, if one
    exists) to RTF markup and return the assembled document text."""
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.utils.xml_parse import safe_xml_fromstring
    output = self.header()
    if 'titlepage' in self.oeb_book.guide:
        href = self.oeb_book.guide['titlepage'].href
        item = self.oeb_book.manifest.hrefs[href]
        if item.spine_position is None:
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            self.currently_dumping_item = item
            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
            output += r'{\page }'
    for item in self.oeb_book.spine:
        self.log.debug('Converting %s to RTF markup...' % item.href)
        # Removing comments is needed as comments with -- inside them can
        # cause fromstring() to fail
        markup = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
        markup = self.remove_newlines(markup)
        markup = self.remove_tabs(markup)
        tree = safe_xml_fromstring(markup)
        stylizer = Stylizer(tree, item.href, self.oeb_book, self.opts, self.opts.output_profile)
        self.currently_dumping_item = item
        output += self.dump_text(tree.find(XHTML('body')), stylizer)
        output += r'{\page }'
    output += self.footer()
    output = self.insert_images(output)
    return self.clean_text(output)
def parse_xmp_packet(raw_bytes):
    """Parse a raw XMP packet into an lxml tree, honoring the encoding
    implied by the BOM in the xpacket ``begin`` attribute (an absent or
    undetectable BOM means UTF-8)."""
    raw_bytes = raw_bytes.strip()
    header = raw_bytes[:1024]
    pat = r'''<?xpacket\s+[^>]*?begin\s*=\s*['"]([^'"]*)['"]'''
    encodings = ('8', '16-le', '16-be', '32-le', '32-be')
    bom_map = {'\ufeff'.encode('utf-'+x): 'utf-'+x for x in encodings}
    bom_map[b''] = 'utf-8'  # an empty begin attribute also means UTF-8
    enc = None
    for suffix in encodings:
        m = re.search(pat.encode('utf-'+suffix), header)
        if m is not None:
            enc = bom_map.get(m.group(1), enc)
            break
    if enc is None:
        return safe_xml_fromstring(raw_bytes)
    # lxml barfs if encoding declaration present in unicode string
    text = _xml_declaration.sub('', raw_bytes.decode(enc))
    return safe_xml_fromstring(text)
def fix_markup(self, html, log):
    """Run the full markup cleanup pipeline and return the document
    re-serialized as UTF-8 bytes with an XML declaration."""
    root = safe_xml_fromstring(html)
    # The pipeline order matters: CSS is filtered and extracted before the
    # structural rewrites run
    for step in (self.filter_css, self.extract_css, self.epubify_markup, self.apply_list_starts):
        step(root, log)
    return etree.tostring(root, encoding='utf-8', xml_declaration=True)
def mlize(self):
    """Convert this spine item to SNB chapter trees.

    The item's body is first dumped to an intermediate text form that uses
    CALIBRE_SNB_* marker tags, then that text is re-read line by line to
    build one <snbc> element tree per (subitem, subtitle) chapter.

    :return: dict mapping subitem -> snbc element tree
    """
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.utils.xml_parse import safe_xml_fromstring
    output = [u'']
    stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
    content = etree.tostring(self.item.data.find(XHTML('body')), encoding='unicode')
    # content = self.remove_newlines(content)
    # Pre-create one empty chapter tree per subitem; they are filled below
    trees = {}
    for subitem, subtitle in self.subitems:
        snbcTree = etree.Element("snbc")
        snbcHead = etree.SubElement(snbcTree, "head")
        etree.SubElement(snbcHead, "title").text = subtitle
        if self.opts and self.opts.snb_hide_chapter_name:
            etree.SubElement(snbcHead, "hidetitle").text = "true"
        etree.SubElement(snbcTree, "body")
        trees[subitem] = snbcTree
    # Seed the text with a bookmark marker for the initial (empty) subitem
    output.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
    output += self.dump_text(self.subitems, safe_xml_fromstring(content), stylizer)[0]
    output = self.cleanup_text(''.join(output))
    subitem = ''
    bodyTree = trees[subitem].find(".//body")
    for line in output.splitlines():
        pos = line.find(CALIBRE_SNB_PRE_TAG)
        if pos == -1:
            # Normal line: trim ASCII whitespace and ideographic spaces
            line = line.strip(' \t\n\r\u3000')
        else:
            # Preformatted line: emit verbatim as CDATA and move on
            etree.SubElement(bodyTree, "text").text = \
                etree.CDATA(line[pos+len(CALIBRE_SNB_PRE_TAG):])
            continue
        if len(line) != 0:
            if line.find(CALIBRE_SNB_IMG_TAG) == 0:
                # Image marker: qualify the file name with the item's dir
                prefix = ProcessFileName(os.path.dirname(self.item.href))
                if prefix != '':
                    etree.SubElement(bodyTree, "img").text = \
                        prefix + '_' + line[len(CALIBRE_SNB_IMG_TAG):]
                else:
                    etree.SubElement(bodyTree, "img").text = \
                        line[len(CALIBRE_SNB_IMG_TAG):]
            elif line.find(CALIBRE_SNB_BM_TAG) == 0:
                # Bookmark marker: switch output to the named chapter tree
                subitem = line[len(CALIBRE_SNB_BM_TAG):]
                bodyTree = trees[subitem].find(".//body")
            else:
                # Plain paragraph, optionally indented with two ideographic
                # spaces per SNB convention
                if self.opts and not self.opts.snb_dont_indent_first_line:
                    prefix = '\u3000\u3000'
                else:
                    prefix = ''
                etree.SubElement(bodyTree, "text").text = \
                    etree.CDATA(unicode_type(prefix + line))
            if self.opts and self.opts.snb_insert_empty_line:
                etree.SubElement(bodyTree, "text").text = \
                    etree.CDATA('')
    return trees
def read_ncx_toc(self, toc, root=None):
    """Populate this TOC from an NCX file.

    :param toc: path to the NCX file; its directory becomes self.base_path
    :param root: optional pre-parsed root element; when None the file is
        read and decoded first
    :raises ValueError: if the NCX has no navMap element
    """
    self.base_path = os.path.dirname(toc)
    if root is None:
        with open(toc, 'rb') as f:
            raw = xml_to_unicode(f.read(), assume_utf8=True, strip_encoding_pats=True)[0]
        root = safe_xml_fromstring(raw)
    xpn = {'re': 'http://exslt.org/regular-expressions'}
    XPath = functools.partial(etree.XPath, namespaces=xpn)

    def get_attr(node, default=None, attr='playorder'):
        # Attributes are matched case-insensitively by name suffix since
        # NCX files in the wild use inconsistent casing/namespace prefixes
        for name, val in node.attrib.items():
            if name and val and name.lower().endswith(attr):
                return val
        return default

    # Element names are matched by regex so namespaced and oddly-cased
    # variants (navLabel/NAVLABEL/...) are all accepted
    nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
    txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
    content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
    np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

    def process_navpoint(np, dest):
        # Recursively add this navPoint (and its children) under dest
        try:
            play_order = int(get_attr(np, 1))
        except:
            play_order = 1
        href = fragment = text = None
        nd = dest
        nl = nl_path(np)
        if nl:
            nl = nl[0]
            text = ''
            for txt in txt_path(nl):
                text += etree.tostring(txt, method='text', encoding='unicode', with_tail=False)
            content = content_path(np)
            if content and text:
                content = content[0]
                # if get_attr(content, attr='src'):
                purl = urlparse(content.get('src'))
                href, fragment = unquote(purl[2]), unquote(purl[5])
                nd = dest.add_item(href, fragment, text)
                nd.play_order = play_order
        for c in np_path(np):
            process_navpoint(c, nd)

    nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
    if not nm:
        raise ValueError('NCX files must have a <navmap> element.')
    nm = nm[0]
    for child in np_path(nm):
        process_navpoint(child, self)
def pretty_all_xml_in_dir(path):
    """Re-serialize every non-empty .xml/.rels file under ``path`` with
    pretty-printing, rewriting each file in place."""
    for fname in walk(path):
        if not fname.endswith(('.xml', '.rels')):
            continue
        with open(fname, 'r+b') as stream:
            raw = stream.read()
            if not raw:
                continue
            root = safe_xml_fromstring(raw)
            stream.seek(0)
            stream.truncate()
            stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))
def _html4_parse(data):
    """Parse tag soup with lxml.html, then round-trip the result through
    the XML parser so callers always get a proper XML tree."""
    tree = html.fromstring(data)
    tree.attrib.pop('xmlns', None)
    # Comments containing '--' break XML serialization, so trim the dashes
    for comment in tree.iter(tag=etree.Comment):
        if comment.text:
            comment.text = comment.text.strip('-')
    serialized = etree.tostring(tree, encoding='unicode')
    return safe_xml_fromstring(serialized)
def __init__(self, raw):
    """Build a map of encrypted-resource URI -> encryption algorithm from
    an EPUB encryption.xml document (``raw`` may be empty/None)."""
    self.root = safe_xml_fromstring(raw) if raw else None
    self.entries = {}
    if self.root is None:
        return
    for em in self.root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
        algorithm = em.get('Algorithm', '')
        cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')
        if not cr:
            continue
        uri = cr[0].get('URI', '')
        if uri and algorithm:
            self.entries[uri] = algorithm
def html_to_lxml(raw):
    """Parse an HTML fragment into an XHTML ``<div>`` element.

    Tries strict XML parsing first; on failure strips namespaced attributes
    (a common cause of parse errors) and retries; finally falls back to the
    lenient HTML4 parser.

    :param raw: markup fragment (text)
    :return: lxml element for the wrapping div
    """
    raw = '<div>%s</div>' % raw
    root = html.fragment_fromstring(raw)
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding=None)
    try:
        return safe_xml_fromstring(raw, recover=False)
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt. Only parse failures should trigger the fallback.
        for x in root.iterdescendants():
            # Remove prefixed attributes, which strict XML parsing rejects
            for attr in tuple(x.attrib):
                if ':' in attr:
                    del x.attrib[attr]
        raw = etree.tostring(root, encoding=None)
        try:
            return safe_xml_fromstring(raw, recover=False)
        except Exception:
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
def extract_cover_from_embedded_svg(html, base, log):
    """If the document consists of a single SVG wrapping exactly one raster
    image, return that image's data via return_raster_image (else None)."""
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring
    root = safe_xml_fromstring(html)
    svgs = XPath('//svg:svg')(root)
    if len(svgs) != 1 or len(svgs[0]) != 1:
        return
    image = svgs[0][0]
    if image.tag != SVG('image'):
        return
    href = image.get(XLINK('href'), None)
    if href:
        path = os.path.join(base, *href.split('/'))
        return return_raster_image(path)
def fb2mlize_spine(self):
    """Assemble the complete FB2 document (header, text, images, footer)
    and return it as unicode, optionally pretty-printed."""
    parts = (self.fb2_header(), self.get_text(), self.fb2mlize_images(), self.fb2_footer())
    output = self.clean_text('\n'.join(parts))
    if self.opts.pretty_print:
        # Round-trip through the parser to get consistent indentation
        output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True)
    return '<?xml version="1.0" encoding="UTF-8"?>\n' + output
def parse_opf(stream_or_path):
    """Parse an OPF file and return its lxml root element.

    :param stream_or_path: either an open binary stream or a filesystem path
    :raises ValueError: for empty or unparseable input
    """
    stream = stream_or_path
    opened_here = False
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
        opened_here = True
    try:
        raw = stream.read()
    finally:
        # BUG FIX: a stream opened from a path was previously never closed
        if opened_here:
            stream.close()
    if not raw:
        raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    # Discard anything before the first tag (BOM remnants, stray text)
    raw = raw[raw.find('<'):]
    root = safe_xml_fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root
def open_search(url, query, max_results=10, timeout=60):
    """Search the Standard Ebooks OPDS feed, yielding up to ``max_results``
    SearchResult objects.

    :param query: search terms
    :param timeout: network timeout in seconds
    """
    url_template = 'https://standardebooks.org/opds/all?query={searchTerms}'
    oquery = Query(url_template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = safe_xml_fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if not (rel and href and type):
                    continue
                # BUG FIX: the two thumbnail relations did the same thing in
                # separate branches; the acquisition branch had a redundant
                # `if type:` (guaranteed truthy here), a no-op `ext[:]` copy,
                # and `href.split('.')[1]` raised IndexError on hrefs with no
                # extension.
                if ('http://opds-spec.org/thumbnail' in rel or
                        'http://opds-spec.org/image/thumbnail' in rel):
                    s.cover_url = 'https://standardebooks.org' + href
                elif 'http://opds-spec.org/acquisition' in rel:
                    parts = href.split('.')
                    if len(parts) > 1 and parts[1]:
                        ext = parts[1].upper().strip()
                        s.downloads[ext] = 'https://standardebooks.org' + href
            s.formats = ', '.join(s.downloads.keys()).strip()
            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            yield s
def get_text(self):
    """Convert every spine item to PML markup and return the joined text."""
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.ebooks.oeb.base import XHTML
    pieces = ['']
    for item in self.oeb_book.spine:
        self.log.debug('Converting %s to PML markup...' % item.href)
        markup = self.prepare_text(etree.tostring(item.data, encoding='unicode'))
        tree = safe_xml_fromstring(markup)
        stylizer = Stylizer(tree, item.href, self.oeb_book, self.opts, self.opts.output_profile)
        pieces.append(self.add_page_anchor(item))
        pieces += self.dump_text(tree.find(XHTML('body')), stylizer, item)
    return ''.join(pieces)
def __init__(self, stream=None):
    """Parse an OCF container.xml from ``stream``, mapping each rootfile's
    media-type to its full path. A falsy stream leaves the map empty.

    :raises EPubException: on unsupported version or malformed elements
    """
    if not stream:
        return
    container = safe_xml_fromstring(stream.read())
    if container.get('version', None) != '1.0':
        raise EPubException("unsupported version of OCF")
    rootfiles = container.xpath('./*[local-name()="rootfiles"]')
    if not rootfiles:
        raise EPubException("<rootfiles/> element missing")
    for rootfile in rootfiles[0].xpath('./*[local-name()="rootfile"]'):
        media_type = rootfile.get('media-type')
        full_path = rootfile.get('full-path')
        if not media_type or not full_path:
            raise EPubException("<rootfile/> element malformed")
        self[media_type] = full_path
def search(browser, url, timeout=60):
    """Recursively walk an OPDS feed starting at ``url``, yielding parsed
    book entries; navigation entries are followed depth-first."""
    with closing(browser.open(url, timeout=timeout)) as f:
        feed = safe_xml_fromstring(f.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if is_book(entry):
                yield parse_book(entry, url)
                continue
            # Not a book: follow every typed link this entry offers
            for link in entry.xpath('./*[local-name() = "link"]'):
                href = link.get('href')
                type = link.get('type')
                if href and type:
                    for book in search(browser, urljoin(url, href), timeout):
                        yield book
def get_metadata(f):
    """Extract metadata from an open LRX file object.

    Walks the LRX container to the compressed 'bbeb' block, decompresses the
    embedded XML and reads the BookInfo/DocInfo records into a
    MetaInformation object.

    :raises ValueError: for non-LRX input or malformed files
    """
    read = lambda at, amount: _read(f, at, amount)
    f.seek(0)
    buf = f.read(12)
    if buf[4:] == b'ftypLRX2':
        # Walk the MP4-style box structure until the 'bbeb' box is found
        offset = 0
        while True:
            offset += word_be(buf[:4])
            try:
                buf = read(offset, 8)
            except:
                raise ValueError('Not a valid LRX file')
            if buf[4:] == b'bbeb':
                break
        offset += 8
        buf = read(offset, 16)
        # The bbeb payload must begin with 'LRF\x00' encoded as UTF-16-LE
        if buf[:8].decode('utf-16-le') != 'LRF\x00':
            raise ValueError('Not a valid LRX file')
        lrf_version = word_le(buf[8:12])
        # 0x4c: size of the fixed LRF header preceding the metadata section
        offset += 0x4c
        compressed_size = short_le(read(offset, 2))
        offset += 2
        if lrf_version >= 800:
            # Newer LRF versions insert 6 extra bytes here, 4 of which are
            # included in compressed_size
            offset += 6
            compressed_size -= 4
        uncompressed_size = word_le(read(offset, 4))
        info = decompress(f.read(compressed_size))
        if len(info) != uncompressed_size:
            raise ValueError('LRX file has malformed metadata section')
        root = safe_xml_fromstring(info)
        bi = root.find('BookInfo')
        title = bi.find('Title')
        title_sort = title.get('reading', None)
        title = title.text
        author = bi.find('Author')
        author_sort = author.get('reading', None)
        mi = MetaInformation(title, string_to_authors(author.text))
        mi.title_sort, mi.author_sort = title_sort, author_sort
        author = author.text
        publisher = bi.find('Publisher')
        mi.publisher = getattr(publisher, 'text', None)
        mi.tags = [x.text for x in bi.findall('Category')]
        mi.language = root.find('DocInfo').find('Language').text
        return mi
    elif buf[4:8] == b'LRX':
        raise ValueError('Librie LRX format not supported')
    else:
        raise ValueError('Not a LRX file')
def ensure_namespace(doc):
    """Re-parse ``doc`` after injecting the default namespace if it contains
    namespace-less <description>/<body> tags; otherwise return it unchanged.

    Workaround for broken FB2 files produced by convertonlinefree.com. See
    https://bugs.launchpad.net/bugs/1404701
    """
    # BUG FIX: the original inner loop variable shadowed the outer one
    # (`for x in (...): for x in doc.findall(x)`), and scanning continued
    # even after a bare tag had been found.
    bare_tags = False
    for tag in ('description', 'body'):
        if any('{' not in el.tag for el in doc.findall(tag)):
            bare_tags = True
            break
    if bare_tags:
        import re
        raw = etree.tostring(doc, encoding='unicode')
        # Drop empty xmlns="" declarations so the default namespace applies
        raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
        doc = safe_xml_fromstring(raw)
    return doc
def insert_cover(self):
    """Create a titlepage (cover) spine item if the book lacks one, and
    wire it into the guide, spine and TOC."""
    from calibre.ebooks.oeb.base import urldefrag
    g, m = self.oeb.guide, self.oeb.manifest
    item = None
    if 'titlepage' not in g:
        # No titlepage: build one from the guide cover or a generated default
        if 'cover' in g:
            href = g['cover'].href
        else:
            href = self.default_cover()
        if href is None:
            return
        width, height = self.inspect_cover(href)
        if width == -1 or height == -1:
            self.log.warning('Failed to read cover dimensions')
            width, height = 600, 800
        # if self.preserve_aspect_ratio:
        #     width, height = 600, 800
        self.svg_template = self.svg_template.replace('__viewbox__', '0 0 %d %d' % (width, height))
        self.svg_template = self.svg_template.replace('__width__', unicode_type(width))
        self.svg_template = self.svg_template.replace('__height__', unicode_type(height))
        # NOTE(review): href cannot be None at this point (checked above);
        # the guard is kept for safety
        if href is not None:
            templ = self.non_svg_template if self.no_svg_cover \
                else self.svg_template
            tp = templ % unquote(href)
            id, href = m.generate('titlepage', 'titlepage.xhtml')
            item = m.add(id, href, guess_type('t.xhtml')[0], data=safe_xml_fromstring(tp))
    else:
        # A titlepage already exists: reuse its manifest item
        item = self.oeb.manifest.hrefs[urldefrag(self.oeb.guide['titlepage'].href)[0]]
    if item is not None:
        # Put the titlepage first in the spine and point all cover
        # references (guide and TOC) at it
        self.oeb.spine.insert(0, item, True)
        if 'cover' not in self.oeb.guide.refs:
            self.oeb.guide.add('cover', 'Title page', 'a')
        self.oeb.guide.refs['cover'].href = item.href
        if 'titlepage' in self.oeb.guide.refs:
            self.oeb.guide.refs['titlepage'].href = item.href
        titem = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
        if titem is not None:
            titem.href = item.href
def __init__(self):
    """Load the recipe scheduler configuration.

    Reads scheduler.xml when readable, falling back to migrating the legacy
    pickle config; an unreadable/corrupt XML file leaves an empty
    recipe_collection in place.
    """
    from calibre.utils.config import config_dir
    from calibre.utils.lock import ExclusiveFile
    self.conf_path = os.path.join(config_dir, 'scheduler.xml')
    old_conf_path = os.path.join(config_dir, 'scheduler.pickle')
    self.root = E.recipe_collection()
    self.lock = RLock()
    if os.access(self.conf_path, os.R_OK):
        with ExclusiveFile(self.conf_path) as f:
            try:
                self.root = safe_xml_fromstring(f.read())
            except Exception:
                # BUG FIX: was a bare `except:` which also trapped
                # SystemExit/KeyboardInterrupt
                print('Failed to read recipe scheduler config')
                import traceback
                traceback.print_exc()
    elif os.path.exists(old_conf_path):
        self.migrate_old_conf(old_conf_path)
def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
    """Import all spelling dictionaries from a LibreOffice .oxt extension.

    :param source_path: path to the .oxt (a zip archive)
    :param name: human-readable name recorded in each dictionary's metadata
    :param dest_dir: destination directory (defaults to the calibre config
        dictionaries folder); one sub-directory is created per dictionary
    :param prefix: prefix for the created sub-directory names
    :return: number of dictionaries imported
    """
    from calibre.spell.dictionary import parse_lang_code
    dest_dir = dest_dir or os.path.join(config_dir, 'dictionaries')
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    num = 0
    with ZipFile(source_path) as zf:

        def read_file(key):
            try:
                return zf.open(key).read()
            except KeyError:
                # Some dictionaries apparently put the xcu in a sub-directory
                # and incorrectly make paths relative to that directory instead
                # of the root, for example:
                # http://extensions.libreoffice.org/extension-center/italian-dictionary-thesaurus-hyphenation-patterns/releases/4.1/dict-it.oxt
                while key.startswith('../'):
                    key = key[3:]
                return zf.open(key.lstrip('/')).read()

        # Locate the .xcu configuration file via the OXT manifest
        root = safe_xml_fromstring(zf.open('META-INF/manifest.xml').read())
        xcu = XPath('//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]')(root)[0].get('{%s}full-path' % NS_MAP['manifest'])
        for (dic, aff), locales in iteritems(parse_xcu(zf.open(xcu).read(), origin='')):
            dic, aff = dic.lstrip('/'), aff.lstrip('/')
            d = tempfile.mkdtemp(prefix=prefix, dir=dest_dir)
            # Keep only de-duplicated locales that carry a valid country code
            locales = uniq([x for x in map(fill_country_code, locales) if parse_lang_code(x).countrycode])
            if not locales:
                continue
            metadata = [name] + list(locales)
            with open(os.path.join(d, 'locales'), 'wb') as f:
                f.write(('\n'.join(metadata)).encode('utf-8'))
            dd, ad = convert_to_utf8(read_file(dic), read_file(aff))
            with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
                f.write(dd)
            with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
                f.write(ad)
            num += 1
    return num
def import_opml(raw, preserve_groups=True):
    """Yield Group(title, feeds) tuples parsed from OPML data.

    Each feed is a (title, url) pair. When ``preserve_groups`` is True a
    feed is grouped under the title/text of its nearest non-rss ancestor
    outline; otherwise each feed forms its own group."""
    root = safe_xml_fromstring(raw)
    groups = defaultdict(list)
    labelled_ancestors = etree.XPath('ancestor::outline[@title or @text]')
    for outline in root.xpath('//outline[@type="rss" and @xmlUrl]'):
        url = outline.get('xmlUrl')
        parent = outline.get('title', '') or url
        title = parent if ('title' in outline.attrib and parent) else None
        if preserve_groups:
            # Use the closest labelled non-rss ancestor as the group name
            for ancestor in labelled_ancestors(outline):
                if ancestor.get('type', None) != 'rss':
                    label = ancestor.get('title') or ancestor.get('text')
                    if label:
                        parent = label
                        break
        groups[parent].append((title, url))
    for group_title in sorted(groups, key=sort_key):
        yield Group(group_title, uniq(groups[group_title], kmap=itemgetter(1)))