def __init__(self,
             file=None,         # Document file name
             mimetype='',       # Mimetype string
             content='',        # Content data (the text)
             manifest='',       # Lists the contents of the ODF file
             meta='',           # Metadata
             styles='',         # Formatting data
             settings='',       # Application-specific data
             additional=None,   # Additional bundled files (e.g. images)
             file_dates=None    # File dates for all files and directories
             ):
    """Build the in-memory document from its individual components.

    The five XML components (content, manifest, meta, settings, styles) are
    handed to their corresponding wrapper constructors; the remaining
    arguments are stored on the instance unchanged.

    ``additional`` and ``file_dates`` default to ``None`` and are replaced
    by a fresh, empty dictionary per instance.  Using ``{}`` directly as
    the default would make every document created without these arguments
    share (and mutate) the very same dictionary object.
    """
    # Pass XML components to corresponding constructors
    self.content = Content(content)
    self.manifest = Manifest(manifest)
    self.meta = Meta(meta)
    self.settings = Settings(settings)
    self.styles = Styles(styles)

    # Remaining components don't need any conversion
    self.file = file
    self.mimetype = mimetype
    self.additional = {} if additional is None else additional
    self.file_dates = {} if file_dates is None else file_dates
class Document(object):
    """The ODF document class -- object model and associated methods.

    Contains the document in memory and is used as the intermediate step for
    conversions and transformations.

    This implementation uses the ElementTree module to create and navigate the
    object. This is built into Python 2.5 and available separately as a
    standalone module.
    """

    def __init__(self,
                 file=None,         # Document file name
                 mimetype='',       # Mimetype string
                 content='',        # Content data (the text)
                 manifest='',       # Lists the contents of the ODF file
                 meta='',           # Metadata
                 styles='',         # Formatting data
                 settings='',       # Application-specific data
                 additional=None,   # Additional bundled files (e.g. images)
                 file_dates=None    # File dates for all files and directories
                 ):
        """Build the in-memory document from its individual components.

        The five XML components (content, manifest, meta, settings, styles)
        are handed to their corresponding wrapper constructors; the remaining
        arguments are stored on the instance unchanged.

        ``additional`` and ``file_dates`` default to ``None`` and are replaced
        by a fresh, empty dictionary per instance.  Using ``{}`` directly as
        the default would make every document created without these arguments
        share (and mutate) the very same dictionary object.
        """
        # Pass XML components to corresponding constructors
        self.content = Content(content)
        self.manifest = Manifest(manifest)
        self.meta = Meta(meta)
        self.settings = Settings(settings)
        self.styles = Styles(styles)

        # Remaining components don't need any conversion
        self.file = file
        self.mimetype = mimetype
        self.additional = {} if additional is None else additional
        self.file_dates = {} if file_dates is None else file_dates

    # Get non-XML components from the document

    def get_embedded(self, filter=None, ignore_case=False):
        r"""Return a dictionary of the objects embedded in the document.

        By default, this should return all embedded objects; the
        list/dictionary can also be filtered for a certain type, e.g. image
        files.

        The filter currently supports UNIX glob patterns like "*a[bc]?.png"
        and/or correct regular expressions like ".*a[bc].\.png$".

        Only entries of ``self.additional`` whose name starts with
        ``Pictures/`` are considered.  The keys of the returned dictionary
        are those names with the ``Pictures/`` prefix removed; the values are
        the corresponding entries of ``self.additional``.
        """
        # TODO: support other embedded objects
        prefix = 'Pictures/'
        search = get_search_for_filter(filter, ignore_case)
        return dict((filename[len(prefix):], data)
                    for filename, data in self.additional.items()
                    if filename.startswith(prefix)
                    and search(filename[len(prefix):]))

    def get_extension(self):
        """Return ODF extension for given mimetype."""
        # Delegates to the module-level function of the same name.
        return get_extension(self.mimetype)

    # Convert the document to other formats

    def tostring(self, key="content", encoding="utf-8"):
        """Get the XML representation of the given component.

        ``key`` is the name of the attribute to serialize.  A plain string
        attribute is encoded with ``encoding``; any other object is asked to
        serialize itself through its own ``tostring(encoding=...)`` method.
        """
        comp = getattr(self, key)
        if isinstance(comp, str):
            return comp.encode(encoding)
        return comp.tostring(encoding=encoding)

    def totext(self, skip_blank_lines=True):
        """Return the content of the document as a plain-text Unicode string.

        Included here as well as in self.content to resemble to_html's usage.
        """
        # NOTE(review): skip_blank_lines is accepted but not forwarded to
        # self.content.totext(); confirm whether that method supports it
        # before passing it through.
        return self.content.totext()

    def tohtml(self, title="", encoding="utf-8"):
        """Return an UTF-8 encoded HTML representation of the document.

        ``title`` becomes the text of the <title> element and ``encoding`` is
        passed to ``ET.tostring`` when serializing the HTML tree.  The result
        is the HTML 4.01 strict doctype declaration followed by the
        serialized tree.
        """
        # TODO:
        # First, convert to ET operations
        # Then,
        # - Scrape up meta tags and add to headnode
        #     '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        #     '<meta type="Generator" content="python-odftools" />'
        # - Title for the page, if applicable
        # - Convert self.styles to CSS and add to headnode as a
        #   <style type="text/css"> element
        #     - see cssutils at the Python cheeseshop
        # - Fix the unit test
        #
        # ENH:
        # - Support encodings other than UTF-8, and maybe Unicode
        # - Allow named elements
        # - A more natural way of doing the doctype declaration, if possible

        attrs_odf2html = {"style-name": "class"}
        tags_odf2html = {
            "a": "a",
            "body": "body",
            "p": "p",
            "span": "span",
            "table": "table",
            "h": "h1",
            "table-row": "tr",
            "table-cell": "td",
            "image": "img",
            "list": "ol",
            "list-item": "li",
        }

        htmldoc = ET.Element("html")
        headnode = ET.SubElement(htmldoc, "head")
        titlenode = ET.SubElement(headnode, "title")
        titlenode.text = title
        # ENH: add meta etc. nodes to the head as needed

        docbody = self.content.root.find("office:body")
        # Compare against None explicitly: the truth value of an ElementTree
        # element reflects whether it has children, not whether find()
        # located it.
        if docbody is not None:
            # translate_nodes is defined elsewhere; it is assumed to return
            # the translated ElementTree element.  It has to be attached to
            # the HTML tree, otherwise the body never reaches the output.
            bodynode = translate_nodes(docbody, tags_odf2html, attrs_odf2html)
            htmldoc.append(bodynode)
        else:
            ET.SubElement(htmldoc, "body")

        doctypestr = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\n'
        htmlstr = ET.tostring(htmldoc, encoding=encoding)
        # ET.tostring may return an encoded byte string while the doctype is
        # a text literal; bring the doctype to the same type so that the two
        # can be joined.
        if not isinstance(htmlstr, type(doctypestr)):
            doctypestr = doctypestr.encode(encoding)
            return doctypestr + "\n".encode(encoding) + htmlstr
        return "\n".join((doctypestr, htmlstr))

    # Operations

    def replace(self, search, replace, key="content"):
        """Call ``replace(search, replace)`` on the component named ``key``.

        Returns whatever the component's own ``replace`` method returns.
        """
        return getattr(self, key).replace(search, replace)