def parse(self, encoding=None, errors='strict', **kwargs):
    """
    Build a nested element tree from the fields of a toolbox settings file.

    A marker starting with ``+`` opens an element (and carries its text),
    a marker starting with ``-`` closes the element of the same name, and
    any other marker becomes a complete leaf element.

    :param encoding: encoding used by settings file
    :type encoding: str
    :param errors: Error handling scheme for codec. Same as ``decode()``
        builtin method.
    :type errors: str
    :param kwargs: Keyword arguments passed to ``StandardFormat.fields()``
    :type kwargs: dict
    :rtype: ElementTree._ElementInterface
    """
    tree = TreeBuilder()
    for marker, text in self.fields(encoding=encoding, errors=errors, **kwargs):
        # The first character of the marker tells us whether this field
        # opens a block, closes one, or is an ordinary leaf.
        prefix = marker[0]
        if prefix == "-":
            tree.end(marker[1:])
            continue
        if prefix == "+":
            tree.start(marker[1:], {})
            tree.data(text)
            continue
        tree.start(marker, {})
        tree.data(text)
        tree.end(marker)
    return tree.close()
def test_xsd_sequence_callback(self):
    """Check the XML that ``_xsd_sequence_callback`` writes for ``self.cls``.

    The callback is run inside an ``xsd:sequence`` element and the
    serialized result is compared against the expected markup, which
    contains two nillable elements (``child1`` and ``child2``), each
    restricted to the values ``foo`` and ``bar``.
    """
    from xml.etree.ElementTree import TreeBuilder, tostring
    from c2cgeoportal.lib.dbreflection import _xsd_sequence_callback
    from papyrus.xsd import tag

    expected = (
        '<xsd:sequence>'
        '<xsd:element minOccurs="0" name="child1" nillable="true">'
        '<xsd:simpleType>'
        '<xsd:restriction base="xsd:string">'
        '<xsd:enumeration value="foo" />'
        '<xsd:enumeration value="bar" />'
        '</xsd:restriction>'
        '</xsd:simpleType>'
        '</xsd:element>'
        '<xsd:element minOccurs="0" name="child2" nillable="true">'
        '<xsd:simpleType>'
        '<xsd:restriction base="xsd:string">'
        '<xsd:enumeration value="foo" />'
        '<xsd:enumeration value="bar" />'
        '</xsd:restriction>'
        '</xsd:simpleType>'
        '</xsd:element>'
        '</xsd:sequence>'
    )

    builder = TreeBuilder()
    # ``tag`` hands back the builder to use inside the element it opened.
    with tag(builder, "xsd:sequence") as builder:
        _xsd_sequence_callback(builder, self.cls)
    root = builder.close()

    self.assertEqual(tostring(root), expected)
def dumpGEXF(self, targetfp, projectName):
    """Write the collected graph to ``targetfp`` as a GEXF 1.2 document.

    The document carries a ``meta`` section (creator and ``projectName`` as
    description) followed by a dynamic ``graph`` whose nodes come from
    ``self.nodesIterator()`` and whose edges come from
    ``self.edgesIterator()``. Output is written UTF-8 encoded.
    """
    self.ensureDumpBase()
    gexf = TreeBuilder()

    def text_element(tag, text):
        # Emit <tag>text</tag> with no attributes.
        gexf.start(tag, {})
        gexf.data(text)
        gexf.end(tag)

    gexf.start("gexf", {"xmlns": "http://www.gexf.net/1.2draft", "version": "1.2"})

    gexf.start("meta", {"lastmodifieddate": str(datetime.datetime.now())})
    text_element("creator", "LAVI Collector")
    text_element("description", projectName)
    gexf.end("meta")

    gexf.start("graph", {
        "mode": "dynamic",
        "start": formatTstamp(self.mint),
        "end": formatTstamp(self.maxt),
    })

    gexf.start("nodes", {})
    for node_id, email, first_seen, last_seen in self.nodesIterator():
        gexf.start("node", {
            "id": node_id,
            "label": email,
            "start": formatTstamp(first_seen),
            "end": formatTstamp(last_seen),
        })
        gexf.end("node")
    gexf.end("nodes")

    gexf.start("edges", {})
    for edge_id, sender, recipient, sent_at in self.edgesIterator():
        # An edge is instantaneous: it starts and ends at the same stamp.
        gexf.start("edge", {
            "id": edge_id,
            "source": sender,
            "target": recipient,
            "start": formatTstamp(sent_at),
            "end": formatTstamp(sent_at),
        })
        gexf.end("edge")
    gexf.end("edges")

    gexf.end("graph")
    gexf.end("gexf")

    document = ElementTree(gexf.close())
    document.write(targetfp, encoding="UTF-8")
def emit(output, title_string, structure):
    """Write an SVG file to output representing structure.

    The module-level ``builder`` is (re)bound here so that the drawing
    helpers called below (``title`` and the methods of ``structure``)
    presumably write into the same tree -- confirm against those helpers.
    """
    global builder
    builder = TreeBuilder()
    svg_attrs = dict(xmlns="http://www.w3.org/2000/svg",
                     width=str(width),
                     height=str(height))
    builder.start("svg", svg_attrs)

    title(title_string)
    structure.depth(3)

    # (key, depth) pairs, drawn in this order.
    placements = (
        (3, 3), (1, 5), (2, 6), (1, 10), (1, 11),
        (2, 15), (3, 16), (1, 22), (2, 25), (1, 25),
    )
    for key, depth in placements:
        structure.key(key, depth)

    builder.end("svg")
    svg_tree = ElementTree(builder.close())
    svg_tree.write(output, encoding='utf-8', xml_declaration=True)
    output.write("\n")
def build(self, root=None):
    """Emit this element and its children into a tree builder.

    When ``root`` is omitted a new ``TreeBuilder`` is created, the finished
    tree is serialized with the ``html`` method and the markup is returned
    as a string. When ``root`` is supplied the element is appended to that
    builder and nothing is returned.

    Children that are ``HTMLBuilder`` instances build themselves
    recursively. Other children are added as text, except those whose
    index is in ``self._formatted``: these are fed through ``XMLParser``
    as markup, falling back to plain text if that fails.
    """
    is_top_level = root is None
    if is_top_level:
        root = TreeBuilder()

    root.start(self.tagname(), self.attrs())
    for index, child in enumerate(self.children):
        if isinstance(child, HTMLBuilder):
            child.build(root=root)
            continue
        if index not in self._formatted:
            root.data(str(child))
            continue
        try:
            proxy = TreeProxy(root)
            parser = XMLParser(html=True, target=proxy)
            parser.feed(child)
            proxy.cleanup()
        except Exception as e:
            # Best effort: markup that cannot be parsed is kept as text.
            print("Bad formatting", e)
            root.data(str(child))
    root.end(self.tagname())

    if is_top_level:
        root = root.close()
        return str(tostring(root, method="html").decode('utf-8'))
def message(self):
    """Return this resource serialized as XML.

    The root element is named after ``self.resource_type``; its content is
    whatever ``self.serialize`` writes into the builder.
    """
    xml = TreeBuilder()
    xml.start(self.resource_type, {})
    self.serialize(xml)
    xml.end(self.resource_type)
    root = xml.close()
    return tostring(root)
def save(self):
    '''Save this action as XML to the target given by ``self.get_file``.

    The document is an ``action`` element carrying the id and type as
    attributes, a ``name`` child, and whatever ``self._write_type`` adds.
    '''
    xml = TreeBuilder()
    action_attrs = {"id": str(self.id), "type": str(self.type)}
    xml.start("action", action_attrs)

    xml.start("name", {})
    xml.data(self.name)
    xml.end("name")

    # Type-specific content is written by the hook.
    self._write_type(xml)
    xml.end("action")

    target = self.get_file(self.id)
    ElementTree(xml.close()).write(target)
def _get_new_album_body(self, title):
    '''
    Build the XML body describing a new album.

    The body is an ``entry`` element in the ``ATOM_NS`` namespace with a
    single ``title`` child.

    @param title: string, decoded from UTF-8 before being written
    @return the serialized ``entry`` element
    '''
    entry = TreeBuilder(Element)
    entry.start('entry', {'xmlns': ATOM_NS})

    entry.start('title', {})
    entry.data(title.decode('utf8'))
    entry.end('title')

    entry.end('entry')
    return tostring(entry.close())
def generate_nic(network):
    """Return an ``interface`` element describing a network card.

    ``network`` must provide ``type`` and ``mac`` strings and a ``source``
    dict, which is used verbatim as the attributes of the ``source``
    element. The model is always ``virtio``.
    """
    xml = TreeBuilder()

    def empty_element(tag, attrs):
        # Emit an element that has attributes but no content.
        xml.start(tag, attrs)
        xml.end(tag)

    xml.start('interface', {'type': network['type']})
    empty_element('mac', {'address': network['mac']})
    empty_element('source', network['source'])
    empty_element('model', {'type': 'virtio'})
    xml.end('interface')
    return xml.close()

# vim:set sw=4 ts=4 et:
# -*- coding: utf-8 -*-
def write_xml(f, taxes, _solr=None, rows=0):
    """
    Export an XML file for the given taxonomies to the (open) file handle
    specified.

    Each term visited by ``term.walk`` becomes a ``category`` element with
    its name and clues; negative clues carry ``negative="true"``. If a SOLR
    connection is supplied, a ``count`` element and one ``doc`` element per
    matching document are added to each category.
    """
    builder = TreeBuilder()

    def text_element(tag, attrs, text):
        builder.start(tag, attrs)
        builder.data(text)
        builder.end(tag)

    def open_category(term):
        # Called on the way down: the category stays open so that child
        # terms nest inside it.
        builder.start("category", {"id": str(term.uid)})
        text_element("name", {}, term.name)
        for clue, positive in term.iter_clues():
            clue_attrs = {}
            if not positive:
                clue_attrs["negative"] = "true"
            text_element("clue", clue_attrs, clue)
        if _solr is None:
            return
        count, docs = get_docs_for_category(_solr, term, rows=rows)
        text_element("count", {}, str(count))
        for doc_id, doc_title, score in docs:
            builder.start("doc", {"id": doc_id, "score": str(score)})
            text_element("name", {}, doc_title)
            builder.end("doc")

    def close_category(term):
        # Called on the way back up.
        builder.end("category")

    builder.start("taxonomy", {})
    for term in taxes:
        term.walk(open_category, close_category)
    builder.end("taxonomy")

    document = ElementTree(builder.close())
    document.write(f, xml_declaration=True, encoding="utf-8")
def generate_disk(disk, devicename):
    """
    Creates XML representation of disk for libvirt based on disk_definition

    disk_definition is something like

        {
            'type': 'network',
            'device': 'disk',
            'format': 'raw',
            'source': {
                'protocol': 'sheepdog',
                'name': 'Alice',
                'hosts': [('127.0.0.1', '7000'),],
            },
        }

    or

        {
            'type': 'file',
            'device': 'disk',
            'format': 'qcow2',
            'source': {
                'file': '/var/lib/libvirt/images/Alice.img',
            },
        },

    devicename is string representing devicename (eg. vda, vdb, ...)
    """
    xml = TreeBuilder()

    def empty_element(tag, attrs):
        # Emit an element that has attributes but no content.
        xml.start(tag, attrs)
        xml.end(tag)

    xml.start('disk', {'type': disk['type'], 'device': disk['device']})

    # 'hosts' is expanded into child elements below, so it is left out of
    # the attributes of <source>.
    source = disk['source']
    xml.start('source', sans(source, 'hosts'))
    for host in source.get('hosts', []):
        empty_element('host', {'name': host[0], 'port': host[1]})
    xml.end('source')

    empty_element('target', {'dev': devicename, 'bus': 'virtio'})
    empty_element('driver', {'name': 'qemu', 'cache': 'none', 'type': disk['format']})

    xml.end('disk')
    return xml.close()
def build(self, root=None):
    """Return the whole document as an HTML string with a doctype.

    A ``head`` is generated from ``self.title``, ``self._metas``,
    ``self._stylesheets`` and ``self._scripts``; the rest of the document
    is produced by the parent class' ``build``, which appends to the same
    builder.
    """
    if root is None:
        root = TreeBuilder()

    def empty_element(tag, attrs):
        # Emit an element that has attributes but no content.
        root.start(tag, attrs)
        root.end(tag)

    root.start("html", {})
    root.start("head", {})

    if self.title:
        root.start("title", {})
        root.data(self.title)
        root.end("title")

    for meta in self._metas:
        empty_element("meta", meta)

    for style in self._stylesheets:
        empty_element("link", {"rel": "stylesheet", "href": style, "type": "text/css"})

    for script in self._scripts:
        root.start("script", {"type": "text/javascript", "src": script})
        # NOTE(review): the single space presumably keeps the script tag
        # from being emitted without content -- confirm.
        root.data(" ")
        root.end('script')

    root.end("head")
    super(Document, self).build(root=root)
    root.end("html")

    document = root.close()
    markup = str(tostring(document, method="html").decode('utf-8'))
    return "<!DOCTYPE html>\n%s" % markup
class PDFMinerParser(object):
    """Record the objects of a PDF document as an XML tree.

    ``parse`` walks every cross-reference table of the document, dumps each
    object through ``dump`` and stores the finished tree on the ``pdf``
    argument. All methods write into the single ``TreeBuilder`` created in
    ``__init__``.
    """

    def __init__(self):
        self.treebuild = TreeBuilder()

    @staticmethod
    def esc(s):
        """Replace every ``ESC_PAT`` match in ``s`` by a ``&#N;`` reference."""
        return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)

    def add_xml_node(self, tag, attrs, data):
        """Emit ``<tag attrs>data</tag>``.

        A falsy ``attrs`` is treated as "no attributes"; ``data`` of
        ``None`` is written as the literal text ``MISSING``.
        """
        if not attrs:
            attrs = {}
        if data is None:
            data = "MISSING"
        self.treebuild.start(tag, attrs)
        self.treebuild.data(data)
        self.treebuild.end(tag)

    def _dump_guarded(self, obj):
        """Dump ``obj``, reporting any failure on stderr instead of raising."""
        try:
            self.dump(obj)
        except Exception as e:
            sys.stderr.write("DUMP excpetion: %s\n" % e)

    def dump(self, obj):
        """Write ``obj`` (and, recursively, its contents) into the tree.

        :raises TypeError: if ``obj`` is of a type that is not handled.
        """
        # Sized objects get a "size" attribute; everything else gets none.
        try:
            obj_attrs = {"size": str(len(obj))}
        except TypeError:
            obj_attrs = {}

        if obj is None:
            self.add_xml_node("null", {}, '')

        elif isinstance(obj, dict):
            self.treebuild.start("dict", obj_attrs)
            for key, val in obj.iteritems():
                # Replace non word characters in key
                key = re.sub(r'\W+', '', key)
                if key.isdigit() or not key:
                    key = 'KEYERROR'
                self.treebuild.start(key, {})
                self._dump_guarded(val)
                self.treebuild.end(key)
            self.treebuild.end("dict")

        elif isinstance(obj, list):
            self.treebuild.start("list", obj_attrs)
            for listobj in obj:
                self._dump_guarded(listobj)
            self.treebuild.end("list")

        elif isinstance(obj, str):
            # dict.update() returns None, so passing its result as the
            # attributes silently dropped both "size" and "enc". Mutate
            # first, then pass the dict itself.
            obj_attrs["enc"] = ENC
            self.add_xml_node("string", obj_attrs, self.esc(obj).encode(ENC))

        elif isinstance(obj, pdftypes.PDFStream):
            self.treebuild.start("stream", obj_attrs)
            self.treebuild.start("props", {})
            self._dump_guarded(obj.attrs)
            self.treebuild.end("props")
            try:
                data = obj.get_data()
            except pdftypes.PDFNotImplementedError as e:
                self.add_xml_node("error", {"type": "PDFNotImplementedError"}, e.message)
            except pdftypes.PDFException as e:
                self.add_xml_node("error", {"type": "PDFException"}, e.message)
            except Exception as e:
                self.add_xml_node("error", {"type": "Uncaught"}, str(e))
            else:
                js = getJavascript(str(data))
                if js:
                    self.add_xml_node("js", {"enc": ENC, "size": str(len(js))}, js)
                else:
                    self.add_xml_node("data", {"enc": ENC, "size": str(len(data))},
                                      self.esc(data).encode(ENC))
            self.treebuild.end("stream")

        elif isinstance(obj, pdftypes.PDFObjRef):
            self.add_xml_node("ref", {"id": str(obj.objid)}, '')
        elif isinstance(obj, PSKeyword):
            self.add_xml_node("keyword", {}, obj.name)
        elif isinstance(obj, PSLiteral):
            self.add_xml_node("literal", {}, obj.name)
        elif isinstance(obj, (int, long, float)):
            self.add_xml_node("number", {}, str(obj))
        else:
            raise TypeError(obj)

    def get_obj_loc(self, xref, objid):
        """Return the position of ``objid`` according to ``xref``.

        Returns ``"FREE"`` when the lookup raises ``KeyError`` and
        ``"UNKNOWN"`` on any other error. (The previous ``finally: return``
        also swallowed ``KeyboardInterrupt``/``SystemExit``; those now
        propagate.)
        """
        try:
            return xref.get_pos(objid)[1]
        except KeyError:
            return "FREE"
        except Exception:
            return "UNKNOWN"

    def read_pdf_block(self, parser, pos, length=512):
        """Return ``length`` bytes read at ``pos``, or a placeholder string.

        A ``TypeError`` (e.g. ``pos`` is not a number) yields an ``ERROR:``
        message; any other error yields ``"UNKNOWN"``.
        """
        try:
            return parser.read_n_from(pos, length)
        except TypeError:
            return "ERROR: Could not read PDF data from pos: %s for %s bytes" % (
                pos, length)
        except Exception:
            return "UNKNOWN"

    def end_xml_node(self, tag):
        """Close ``tag``, first closing any elements left open inside it.

        On a mismatch the builder's assertion message names the element it
        expected; that element is closed recursively.
        """
        try:
            self.treebuild.end(tag)
        except AssertionError as e:
            if 'mismatch' in e.message:
                expected_tag = e.message.partition("(expected ")[2]
                expected_tag = expected_tag.partition(",")[0]
                if expected_tag:
                    self.end_xml_node(expected_tag)

    def parse(self, pdf):
        """Parse the file at ``pdf.path`` and store the results on ``pdf``.

        On success sets ``pdf.xml``, ``pdf.errors``, ``pdf.bytes_read`` and
        ``pdf.parsed`` (and ``pdf.blob`` when data follows the EOF marker).
        If the file cannot be opened a message is written to stderr and
        nothing is set.
        """
        try:
            fp = open(pdf.path, 'rb')
        except IOError as e:
            sys.stderr.write("PDFMinerParser.parse unable to open PDF: %s\n" % e)
            return
        # The handle used to leak on every early return and on any
        # exception; ``with`` releases it on all exit paths.
        with fp:
            self._parse_file(pdf, fp)

    def _parse_file(self, pdf, fp):
        """Do the work of ``parse`` on the already opened file ``fp``."""
        visited = set()
        self.treebuild.start("pdf", {"path": pdf.path})
        try:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
        except PSEOF:
            self.add_xml_node("PSException", {}, "Unexpected end of PDF")
            self.treebuild.end("pdf")
            pdf.parsed = True
            return

        if doc.found_eof and doc.eof_distance > 3:
            pdf.blob = parser.read_from_end(doc.eof_distance).encode("base64")

        for xref in doc.xrefs:
            for objid in xref.get_objids():
                # An object id may be listed by several xref tables.
                if objid in visited:
                    continue
                visited.add(objid)

                obj_attrs = {"id": str(objid), "type": "normal"}
                obj_data = ''
                obj_xml = self.treebuild.start("object", obj_attrs)
                obj_loc = self.get_obj_loc(xref, objid)
                obj_xml.set("location", str(obj_loc))

                # If the object cannot be dumped, fall back to a raw
                # excerpt of the file at the object's location.
                try:
                    self.dump(doc.getobj(objid))
                except pdftypes.PDFObjectNotFound:
                    obj_xml.set("type", "malformed")
                    obj_data = self.read_pdf_block(parser, obj_loc, 4096).replace("<", "0x3C")
                except TypeError:
                    obj_xml.set("type", "unknown")
                    obj_data = self.read_pdf_block(parser, obj_loc).replace("<", "0x3C")
                except Exception as e:
                    obj_xml.set("type", "exception")
                    obj_data = self.read_pdf_block(parser, obj_loc).replace("<", "0x3C")
                    self.add_xml_node("exception", {}, str(e))

                # Non-ASCII excerpts are stored base64 encoded.
                try:
                    obj_data.decode("ascii")
                except UnicodeDecodeError:
                    obj_data = obj_data.encode("base64")
                self.treebuild.data(obj_data)

                try:
                    self.treebuild.end("object")
                except (AssertionError, TypeError):
                    # The tree is inconsistent; give up on this document.
                    return

            self.treebuild.start("trailer", {})
            self.dump(xref.trailer)
            self.treebuild.end("trailer")

        self.treebuild.end("pdf")
        pdf.xml = self.treebuild.close()
        pdf.errors = doc.errors
        pdf.bytes_read = parser.BYTES
        pdf.parsed = True
def scheme_to_etree(scheme, data_format="literal", pickle_fallback=False):
    """
    Return an `xml.etree.ElementTree` representation of the `scheme`.

    The tree has a ``scheme`` root (version ``2.0``) with ``nodes``,
    ``links``, ``annotations``, an empty ``thumbnail`` and
    ``node_properties`` sections. ``data_format`` and ``pickle_fallback``
    are forwarded to ``dumps`` when node properties are serialized; a node
    whose properties fail to serialize is logged and left out of
    ``node_properties``.
    """
    xml = TreeBuilder(element_factory=Element)
    xml.start("scheme", {
        "version": "2.0",
        "title": scheme.title or "",
        "description": scheme.description or "",
    })

    # -- Nodes. An id is assigned the first time a node is looked up
    # (presumably counting upwards -- see inf_range).
    node_ids = defaultdict(inf_range().__next__)
    xml.start("nodes", {})
    for node in scheme.nodes:
        desc = node.description
        node_attrs = {
            "id": str(node_ids[node]),
            "name": desc.name,
            "qualified_name": desc.qualified_name,
            "project_name": desc.project_name or "",
            "version": desc.version or "",
            "title": node.title,
        }
        if node.position is not None:
            node_attrs["position"] = str(node.position)
        if type(node) is not SchemeNode:
            node_cls = type(node)
            node_attrs["scheme_node_type"] = "%s.%s" % (node_cls.__name__,
                                                        node_cls.__module__)
        xml.start("node", node_attrs)
        xml.end("node")
    xml.end("nodes")

    # -- Links
    link_ids = defaultdict(inf_range().__next__)
    xml.start("links", {})
    for link in scheme.links:
        source_id = node_ids[link.source_node]
        sink_id = node_ids[link.sink_node]
        link_attrs = {
            "id": str(link_ids[link]),
            "source_node_id": str(source_id),
            "sink_node_id": str(sink_id),
            "source_channel": link.source_channel.name,
            "sink_channel": link.sink_channel.name,
            "enabled": "true" if link.enabled else "false",
        }
        xml.start("link", link_attrs)
        xml.end("link")
    xml.end("links")

    # -- Annotations
    annotation_ids = defaultdict(inf_range().__next__)
    xml.start("annotations", {})
    for annotation in scheme.annotations:
        annot_attrs = {"id": str(annotation_ids[annotation])}
        text = None
        if isinstance(annotation, SchemeTextAnnotation):
            tag = "text"
            annot_attrs["rect"] = repr(annotation.rect)
            # Save the font attributes; unset ones are dropped and the
            # rest are stringified.
            font = annotation.font
            annot_attrs["font-family"] = font.get("family", None)
            annot_attrs["font-size"] = font.get("size", None)
            annot_attrs = {name: str(value)
                           for name, value in annot_attrs.items()
                           if value is not None}
            text = annotation.text
        elif isinstance(annotation, SchemeArrowAnnotation):
            tag = "arrow"
            annot_attrs["start"] = repr(annotation.start_pos)
            annot_attrs["end"] = repr(annotation.end_pos)
            # Save the arrow color, if the annotation has one.
            try:
                annot_attrs["fill"] = annotation.color
            except AttributeError:
                pass
        else:
            log.warning("Can't save %r", annotation)
            continue

        xml.start(tag, annot_attrs)
        if text is not None:
            xml.data(text)
        xml.end(tag)
    xml.end("annotations")

    xml.start("thumbnail", {})
    xml.end("thumbnail")

    # -- Node properties/settings
    xml.start("node_properties", {})
    for node in scheme.nodes:
        if not node.properties:
            continue
        try:
            payload, payload_format = dumps(node.properties,
                                            format=data_format,
                                            pickle_fallback=pickle_fallback)
        except Exception:
            log.error("Error serializing properties for node %r",
                      node.title, exc_info=True)
            continue
        if payload is None:
            continue
        xml.start("properties", {"node_id": str(node_ids[node]),
                                 "format": payload_format})
        xml.data(payload)
        xml.end("properties")
    xml.end("node_properties")

    xml.end("scheme")
    return ElementTree(xml.close())
def serialize(self):
    """Return this resource serialized as XML.

    The root element is named after ``self.resource_type``; its content is
    whatever ``self.encode`` writes into the builder.
    """
    xml = TreeBuilder()
    xml.start(self.resource_type, {})
    self.encode(xml)
    xml.end(self.resource_type)
    root = xml.close()
    return tostring(root)
# One track segment holding every waypoint of the route.
builder.start("trkseg", {})
for waypoint in route["waypointList"]:
    coords = {"lat": str(waypoint["lat"]), "lon": str(waypoint["lon"])}
    builder.start("trkpt", coords)
    builder.start("ele", {})
    builder.data(str(waypoint["alt"]))
    builder.end("ele")
    builder.start("time", {})
    # The division by 1000 implies the stored stamp is in milliseconds.
    time = datetime.utcfromtimestamp(waypoint["time"]/1000)
    builder.data(time.strftime("%Y-%m-%dT%H:%M:%SZ"))
    builder.end("time")
    builder.end("trkpt")
builder.end("trkseg")
builder.end("trk")
builder.end("gpx")
root = builder.close()
tree = ElementTree(root)
# The output name is the first argument with its last five characters
# replaced by ".gpx". The file is opened in a ``with`` block so that it is
# flushed and closed deterministically instead of being left to the
# garbage collector.
with open(sys.argv[1][:-5]+".gpx", "wb") as gpx_file:
    tree.write(gpx_file, "utf-8")
class EtreeModuleSerializer(ModuleSerializer):
    """Module serializer that produces an ``ElementTree``.

    While the base class walks the module, lightweight ``ElementBuilder``
    records are collected on ``self.stack`` (the last entry is the element
    currently being filled). The real tree is only built, once, when
    ``get_root`` is first called.
    """

    def __init__(self, module, flags=types.DEFAULT):
        super(EtreeModuleSerializer, self).__init__(module, flags)
        self.stack = [ElementBuilder("module", {}, [])]
        self.builder = TreeBuilder()
        self.etree = None

    def run_ast(self):
        """Run the base class' AST pass inside an ``ast`` element."""
        section = ElementBuilder("ast", {}, [])
        self.add_child(section)
        self.stack.append(section)
        super(EtreeModuleSerializer, self).run_ast()
        self.stack.pop()

    def run_tokens(self):
        """Run the base class' token pass inside a ``tokens`` element."""
        section = ElementBuilder("tokens", {}, [])
        self.add_child(section)
        self.stack.append(section)
        super(EtreeModuleSerializer, self).run_tokens()
        self.stack.pop()

    def get_root(self):
        """Return the finished ``ElementTree``, building it on first use."""
        if self.etree is not None:
            return self.etree
        # Every element that was opened must have been closed again.
        assert len(self.stack) == 1
        self._build(self.stack[0])
        self.etree = ElementTree(self.builder.close())
        return self.etree

    def process_token(self, token, offset):
        """Add one token as an element named after its token type."""
        ttype, tval, tstart, tend, tline = token
        attrs = {
            "type": repr(ttype),
            # Strip the quotes (and a possible u prefix) from the repr.
            "value": repr(tval).lstrip("u")[1:-1],
            "offset": repr(offset),
        }
        if tstart is not None:
            attrs["start_line"] = repr(tstart[0])
            attrs["start_col"] = repr(tstart[1])
        if tend is not None:
            attrs["end_line"] = repr(tend[0])
            attrs["end_col"] = repr(tend[1])
        if tline is not None:
            attrs["line"] = repr(tline).lstrip("u")[1:-1]
        self.add_child(ElementBuilder(tokenize.tok_name[ttype], attrs, []))

    def add_attr(self, attr, value):
        """Set an attribute on the element currently being filled."""
        current = self.stack[-1]
        current.attrs[attr] = value

    def add_child(self, child):
        """Append ``child`` to the element currently being filled."""
        current = self.stack[-1]
        current.children.append(child)

    def start_ast(self, node):
        """Open an element named after the class of ``node``."""
        el = ElementBuilder(type(node).__name__, {}, [])

        line = getattr(node, "lineno", None)
        if line is not None:
            el.attrs["line"] = repr(line)
        col = getattr(node, "col_offset", None)
        if col is not None:
            el.attrs["col"] = repr(col)

        if self.flags & types.TOKENS:
            mapped = self.module.ast_map.get(node)
            # Only non-empty token ranges are recorded.
            if mapped is not None and mapped.token_start < mapped.token_end:
                el.attrs["token_start"] = repr(mapped.token_start)
                el.attrs["token_end"] = repr(mapped.token_end)

        self.add_child(el)
        self.stack.append(el)

    def end_ast(self, node):
        """Close the element opened by ``start_ast``."""
        self.stack.pop()

    def start_list_field(self, field, value):
        """Open an element named ``field``."""
        el = ElementBuilder(field, {}, [])
        self.add_child(el)
        self.stack.append(el)

    def end_list_field(self, field, value):
        """Close the element opened by ``start_list_field``."""
        self.stack.pop()

    def process_ast_field(self, field, value):
        """Let the base class process ``value`` inside a ``field`` element."""
        el = ElementBuilder(field, {}, [])
        self.add_child(el)
        self.stack.append(el)
        super(EtreeModuleSerializer, self).process_ast_field(field, value)
        self.stack.pop()

    def process_string_field(self, field, value):
        """Store a string field as an attribute of the current element."""
        self.add_attr(field, value)

    def process_number_field(self, field, value):
        """Store the repr of a number field as an attribute."""
        self.add_attr(field, repr(value))

    def process_null_field(self, field):
        """Record a null field as an empty child element."""
        self.add_child(ElementBuilder(field, {}, []))

    def _build(self, el):
        """Replay ``el`` and its descendants into the ``TreeBuilder``."""
        builder = self.builder
        builder.start(el.tag, el.attrs)
        for child in el.children:
            self._build(child)
        builder.end(el.tag)

    def write_xml(self, path):
        """Write the tree to ``path``, a file name or a writable object."""
        tree = self.get_root()
        if not hasattr(path, "write"):
            with open(path, "w") as f:
                tree.write(f, "utf-8")
            return
        tree.write(path, "utf-8")

    def xml_string(self):
        """Return the serialized tree as a string."""
        buf = StringIO()
        self.write_xml(buf)
        buf.seek(0)
        return buf.read()

    def generic_visit(self, node):
        """Delegate to the base class unchanged."""
        super(EtreeModuleSerializer, self).generic_visit(node)
def _record_parse(self, key=None, **kwargs): """ Returns an element tree structure corresponding to a toolbox data file with all markers at the same level. Thus the following Toolbox database:: \_sh v3.0 400 Rotokas Dictionary \_DateStampHasFourDigitYear \lx kaa \ps V.A \ge gag \gp nek i pas \lx kaa \ps V.B \ge strangle \gp pasim nek after parsing will end up with the same structure (ignoring the extra whitespace) as the following XML fragment after being parsed by ElementTree:: <toolbox_data> <header> <_sh>v3.0 400 Rotokas Dictionary</_sh> <_DateStampHasFourDigitYear/> </header> <record> <lx>kaa</lx> <ps>V.A</ps> <ge>gag</ge> <gp>nek i pas</gp> </record> <record> <lx>kaa</lx> <ps>V.B</ps> <ge>strangle</ge> <gp>pasim nek</gp> </record> </toolbox_data> :param key: Name of key marker at the start of each record. If set to None (the default value) the first marker that doesn't begin with an underscore is assumed to be the key. :type key: str :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` :type kwargs: dict :rtype: ElementTree._ElementInterface :return: contents of toolbox data divided into header and records """ builder = TreeBuilder() builder.start('toolbox_data', {}) builder.start('header', {}) in_records = False for mkr, value in self.fields(**kwargs): if key is None and not in_records and mkr[0] != '_': key = mkr if mkr == key: if in_records: builder.end('record') else: builder.end('header') in_records = True builder.start('record', {}) builder.start(mkr, {}) builder.data(value) builder.end(mkr) if in_records: builder.end('record') else: builder.end('header') builder.end('toolbox_data') return builder.close()
def getGPX(self, fileName):
    """Write this run to ``fileName`` as a GPX document (UTF-8).

    The document contains a ``metadata`` block (name and bounding box) and
    a single track with one segment holding every point of
    ``self._trackPointList``. Each point contributes its latitude,
    longitude, ``altitudeMeters`` and ``time``.

    The ``bounds`` element is omitted when the run has no track points,
    since no bounding box can be computed for an empty track.
    """
    gpx = TreeBuilder()

    def text_element(tag, text):
        # Emit <tag>text</tag> with no attributes.
        gpx.start(tag, {})
        gpx.data(text)
        gpx.end(tag)

    # GPX tag
    gpx.start(
        "gpx",
        {
            "version": "1.2",
            "creator": "NikePlus",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xmlns": "http://www.topografix.com/GPX/1/2/",
            "xsi:schemaLocation": "http://www.topografix.com/GPX/1/2/ http://www.topografix.com/gpx/1/2/gpx.xsd",
        },
    )

    # Metadata
    gpx.start("metadata", {})
    text_element("name", "Run " + self._runId)

    points = self._trackPointList
    if points:
        # Bounds
        lats = [point.lat for point in points]
        lons = [point.lon for point in points]
        extents = {
            "minLat": str(min(lats)),
            "maxLat": str(max(lats)),
            "minLon": str(min(lons)),
            "maxLon": str(max(lons)),
        }
        gpx.start("bounds", extents)
        gpx.end("bounds")
    gpx.end("metadata")

    # Track
    gpx.start("trk", {})
    text_element("name", str(self._runId))
    text_element("type", "Run")

    gpx.start("trkseg", {})
    for point in points:
        gpx.start("trkpt", {"lat": str(point.lat), "lon": str(point.lon)})
        text_element("ele", str(point.altitudeMeters))
        text_element("time", point.time.strftime("%Y-%m-%dT%H:%M:%SZ"))
        gpx.end("trkpt")
    gpx.end("trkseg")

    gpx.end("trk")
    gpx.end("gpx")

    document = ElementTree(gpx.close())
    # Open the output in a ``with`` block so the handle is flushed and
    # closed here rather than whenever it happens to be garbage collected.
    with open(fileName, "wb") as gpxFile:
        document.write(gpxFile, "utf-8")
def _record_parse(self, key=None, **kwargs): """ Returns an element tree structure corresponding to a toolbox data file with all markers at the same level. Thus the following Toolbox database:: \_sh v3.0 400 Rotokas Dictionary \_DateStampHasFourDigitYear \lx kaa \ps V.A \ge gag \gp nek i pas \lx kaa \ps V.B \ge strangle \gp pasim nek after parsing will end up with the same structure (ignoring the extra whitespace) as the following XML fragment after being parsed by ElementTree:: <toolbox_data> <header> <_sh>v3.0 400 Rotokas Dictionary</_sh> <_DateStampHasFourDigitYear/> </header> <record> <lx>kaa</lx> <ps>V.A</ps> <ge>gag</ge> <gp>nek i pas</gp> </record> <record> <lx>kaa</lx> <ps>V.B</ps> <ge>strangle</ge> <gp>pasim nek</gp> </record> </toolbox_data> :param key: Name of key marker at the start of each record. If set to None (the default value) the first marker that doesn't begin with an underscore is assumed to be the key. :type key: str :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` :type kwargs: dict :rtype: ElementTree._ElementInterface :return: contents of toolbox data divided into header and records """ builder = TreeBuilder() builder.start("toolbox_data", {}) builder.start("header", {}) in_records = False for mkr, value in self.fields(**kwargs): if key is None and not in_records and mkr[0] != "_": key = mkr if mkr == key: if in_records: builder.end("record") else: builder.end("header") in_records = True builder.start("record", {}) builder.start(mkr, {}) builder.data(value) builder.end(mkr) if in_records: builder.end("record") else: builder.end("header") builder.end("toolbox_data") return builder.close()
class PDFMinerParser(object):
    """Record the objects of a PDF document as an XML tree.

    ``parse`` walks every cross-reference table of the document, dumps each
    object through ``dump`` and stores the finished tree on the ``pdf``
    argument. All methods write into the single ``TreeBuilder`` created in
    ``__init__``.
    """

    def __init__(self):
        self.treebuild = TreeBuilder()

    @staticmethod
    def esc(s):
        """Replace every ``ESC_PAT`` match in ``s`` by a ``&#N;`` reference."""
        return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)

    def add_xml_node(self, tag, attrs=None, data=''):
        """Emit ``<tag attrs>data</tag>``; falsy ``attrs`` means none."""
        if not attrs:
            attrs = {}
        self.treebuild.start(tag, attrs)
        self.treebuild.data(data)
        self.treebuild.end(tag)

    def dump(self, obj):
        """Write ``obj`` (and, recursively, its contents) into the tree.

        :raises TypeError: if ``obj`` is of a type that is not handled.
        """
        # Sized objects get a "size" attribute; everything else gets none.
        try:
            obj_attrs = {"size": str(len(obj))}
        except TypeError:
            obj_attrs = {}

        if obj is None:
            self.add_xml_node("null")

        elif isinstance(obj, dict):
            self.treebuild.start("dict", obj_attrs)
            for key, val in obj.iteritems():
                # Replace non word characters in key
                key = re.sub(r'\W+', '', key)
                if key.isdigit() or not key:
                    key = 'KEYERROR'
                self.treebuild.start(key, {})
                self.dump(val)
                self.treebuild.end(key)
            self.treebuild.end("dict")

        elif isinstance(obj, list):
            self.treebuild.start("list", obj_attrs)
            for listobj in obj:
                self.dump(listobj)
            self.treebuild.end("list")

        elif isinstance(obj, str):
            # dict.update() returns None, so passing its result as the
            # attributes silently dropped both "size" and "enc". Mutate
            # first, then pass the dict itself.
            obj_attrs["enc"] = ENC
            self.add_xml_node("string", obj_attrs, self.esc(obj).encode(ENC))

        elif isinstance(obj, pdftypes.PDFStream):
            self.treebuild.start("stream", obj_attrs)
            self.treebuild.start("props", {})
            self.dump(obj.attrs)
            self.treebuild.end("props")
            try:
                data = obj.get_data()
            except pdftypes.PDFNotImplementedError as e:
                self.add_xml_node("error", {"type": "PDFNotImplementedError"}, e.message)
            except pdftypes.PDFException as e:
                self.add_xml_node("error", {"type": "PDFException"}, e.message)
            except Exception as e:
                self.add_xml_node("error", {"type": "Uncaught"}, e.message)
            else:
                self.add_xml_node("data",
                                  attrs={"enc": ENC, "size": str(len(data))},
                                  data=self.esc(data).encode(ENC))
            # TODO: Check js? swf?
            self.treebuild.end("stream")

        elif isinstance(obj, pdftypes.PDFObjRef):
            self.add_xml_node("ref", {"id": str(obj.objid)})
        elif isinstance(obj, PSKeyword):
            self.add_xml_node("keyword", data=obj.name)
        elif isinstance(obj, PSLiteral):
            self.add_xml_node("literal", data=obj.name)
        elif isinstance(obj, (int, long, float)):
            self.add_xml_node("number", data=str(obj))
        else:
            raise TypeError(obj)

    def parse(self, pdf):
        """Parse the file at ``pdf.path`` and store the results on ``pdf``.

        On success sets ``pdf.xml``, ``pdf.errors``, ``pdf.bytes_read`` and
        ``pdf.parsed`` (and ``pdf.blob`` when data follows the EOF marker).
        If the file cannot be opened the error is logged and nothing is
        set.
        """
        try:
            fp = open(pdf.path, 'rb')
        except IOError as e:
            logging.error("PDFMinerParser.parse unable to open PDF: %s" % e)
            return
        # The handle used to leak whenever parsing raised; ``with``
        # releases it on every exit path.
        with fp:
            self._parse_file(pdf, fp)

    def _parse_file(self, pdf, fp):
        """Do the work of ``parse`` on the already opened file ``fp``."""
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        if doc.found_eof and doc.eof_distance > 3:
            pdf.blob = parser.read_from_end(doc.eof_distance)

        visited = set()
        self.treebuild.start("pdf", {"path": pdf.path})
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                # An object id may be listed by several xref tables.
                if objid in visited:
                    continue
                visited.add(objid)

                obj_attrs = {"id": str(objid), "type": "normal"}
                obj_data = ''
                obj_xml = self.treebuild.start("object", obj_attrs)

                # If the object cannot be dumped, fall back to a raw
                # excerpt of the file at the object's position.
                try:
                    self.dump(doc.getobj(objid))
                except pdftypes.PDFObjectNotFound:
                    obj_xml.set("type", "malformed")
                    obj_data = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                    obj_data = obj_data.replace('<', '0x3C')
                except TypeError:
                    obj_xml.set("type", "unknown")
                    obj_data = parser.read_n_from(xref.get_pos(objid)[1], 512)
                except Exception as e:
                    obj_xml.set("type", "exception")
                    obj_data = parser.read_n_from(xref.get_pos(objid)[1], 512)
                    self.add_xml_node("exception", {}, e.message)
                self.treebuild.data(obj_data)

                try:
                    self.treebuild.end("object")
                except AssertionError as e:
                    logging.error("Parse end object error: %s" % e)
                    sys.stderr.write("%s\n" % tostring(obj_xml))

            self.treebuild.start("trailer", {})
            self.dump(xref.trailer)
            self.treebuild.end("trailer")

        self.treebuild.end("pdf")
        pdf.xml = self.treebuild.close()
        pdf.errors = doc.errors
        pdf.bytes_read = parser.BYTES
        pdf.parsed = True
class WordMLTranslator(nodes.NodeVisitor):
    """Docutils visitor that renders a document tree as WordML.

    Output is accumulated twice: as elements in ``self.doc_tree`` (a
    ``TreeBuilder``) and as string fragments in ``self.body``.  Only
    ``doc_tree`` is serialized into the final document (see
    ``template_start_element``); ``body`` is kept as the older string form.

    The final text is produced by ``astext``, which streams the template file
    named by ``document.settings.template`` and substitutes the generated
    content for the ``w:rest``, ``w:rstlists`` and ``w:rstlistdefs``
    placeholder elements.
    """

    title_styles = ["Heading1", "Heading2", "Heading3"]
    # NOTE(review): indentation is produced by slicing this string, so it can
    # never exceed len(xml_spaces) -- confirm the intended width.
    xml_spaces = " "

    # WordML number-format codes keyed by docutils ``enumtype``.
    _ENUM_NFC = {
        "arabic": 0,
        "upperroman": 1,
        "lowerroman": 2,
        "upperalpha": 3,
        "loweralpha": 4,
    }

    # these are meant to be default lists, one for bullet and one for enumerated.
    # other list types should be added here later.

    def __init__(self, document):
        nodes.NodeVisitor.__init__(self, document)
        self.document = document
        self.doc = ""
        self.body = []  # the body of the document
        self.in_docinfo = None
        self.in_sidebar = None
        self.settings = document.settings
        self.section_level = 0
        self.title_level = 0
        self.text_properties = {}  # the properties for a run of text
        self.text_prefix = []  # text or commands inserted before next text run
        self.paragraph_properties = {}
        self.id = 1000
        self.in_footnote = 0
        self.no_text = 0  # flag to suppress any text output
        self.literal_text = 0  # flag to output newlines in text
        self.skip_encode = 0  # flag to skip the encoding of text to xml friendly output
        self.has_text_output = 0  # flag if any text output since last reset
        self.figure_count = 0
        self.in_figure = 0
        self.indentation = 0
        # list related variables
        self.lists = []
        self.list_level = -1
        self.list_count = 0
        self.list_properties = []
        self.list_defs = []
        self.template_list_defs = []
        self.current_listdef = []
        self.in_listdef = 0
        self.show_list_properties = 0
        self.extract_listdefs()
        self.substitutions = {}
        # table related variables
        self.in_table = 0
        self.colspecs = []
        self.row = 0
        self.col = 0
        self.spans = {}  # for multispan columns and rows
        self.total_col_width = 0
        # xml output variables
        self.doc_tree = TreeBuilder()
        self.doc_tree.start("w:document", {})
        self.xml_spacing = 0

    def gen_id(self):
        """Return the next unique integer id."""
        self.id += 1
        return self.id

    # ------------------------------------------------------------------
    # shared emit helpers
    # ------------------------------------------------------------------

    def _emit_property(self, prop):
        """Emit one run/paragraph property.

        A ``(tag, attrs)`` tuple becomes an empty element in ``doc_tree`` and
        its string form in ``body``; any other value is treated as a
        pre-rendered string and only appended to ``body``.
        """
        if isinstance(prop, tuple):
            tag, attrs = prop[0], prop[1]
            element = "<" + tag + " "
            for key, value in attrs.items():
                element += key + '="' + value + '" '
            element += "/>"
            self.doc_tree.start(tag, attrs)
            self.doc_tree.end(tag)
            self.body.append(element)
        else:
            self.body.append(prop)

    def _emit_bookmark(self, name, body_suffix):
        """Emit a start/end bookmark annotation pair named ``name``.

        ``body_suffix`` is appended to each string fragment added to ``body``.
        """
        refid = str(self.gen_id())
        kinds = ("Word.Bookmark.Start", "Word.Bookmark.End")
        for kind in kinds:
            self.doc_tree.start("aml:annotation", {"aml:id": refid, "w:type": kind, "w:name": name})
            self.doc_tree.end("aml:annotation")
        for kind in kinds:
            self.body.append(
                '<aml:annotation aml:id="' + refid + '" w:type="' + kind + '" w:name="' + name + '" />' + body_suffix
            )

    # ------------------------------------------------------------------
    # template expansion
    # ------------------------------------------------------------------

    def template_start_element(self, name, attr):
        """Expat start handler used while streaming the template in ``astext``.

        Placeholder elements are replaced by generated content; every other
        element is copied to ``self.doc``.
        """
        if name == "w:rest":
            # Serialize the element tree, then re-parse it so the body_*
            # handlers can write it into self.doc with indentation.
            self.doc_tree.end("w:document")
            tree = self.doc_tree.close()
            doc_string = tostring(tree)
            p = xml.parsers.expat.ParserCreate()
            p.StartElementHandler = self.body_start_element
            p.EndElementHandler = self.body_end_element
            p.CharacterDataHandler = self.body_data
            p.Parse(doc_string)
        elif name == "w:rstlists":
            self.doc += "".join(self.lists)
        elif name == "w:rstlistdefs":
            self.doc += "".join(self.list_defs)
        else:
            self.doc += "<" + name + " "
            for k, v in attr.items():
                self.doc += k + '="' + v + '" '
            self.doc += ">"

    def template_end_element(self, name):
        """Expat end handler for the template; placeholders emit nothing."""
        if name == "w:rest" or name == "w:rstlists" or name == "w:rstlistdefs":
            pass
        else:
            self.doc += "</" + name + ">"

    def template_char_data(self, data):
        """Copy template character data straight to the output."""
        self.doc += data

    # routines for extracting the listdef elements from the template

    def start_listdef_extract(self, name, attr):
        """Record the start tags found inside a ``w:listDef`` (except ``w:lsid``)."""
        if name == "w:listDef":
            self.in_listdef = 1
            self.current_listdef = []
        if name == "w:lsid":
            return
        if self.in_listdef == 1:
            self.current_listdef.append([name, attr])

    def end_listdef_extract(self, name):
        """Record the end tags found inside a ``w:listDef`` (except ``w:lsid``).

        Mirrors ``start_listdef_extract``.  The end tag of ``w:lsid`` is
        skipped because its start tag is never recorded.  Nothing is recorded
        outside a ``w:listDef``; otherwise end tags that follow a finished
        list definition would be appended to the list object already stored
        in ``template_list_defs``.
        """
        if name == "w:lsid" or self.in_listdef != 1:
            return
        self.current_listdef.append(["/" + name])
        if name == "w:listDef":
            self.template_list_defs.append(self.current_listdef)
            self.in_listdef = 0

    def list_def_data(self, data):
        """Character data inside list definitions is ignored."""
        pass

    def extract_listdefs(self):
        """Parse the template file and collect its ``w:listDef`` elements."""
        p = xml.parsers.expat.ParserCreate()
        p.StartElementHandler = self.start_listdef_extract
        p.EndElementHandler = self.end_listdef_extract
        p.CharacterDataHandler = self.list_def_data
        # The handle used to be leaked; close it as soon as parsing is done.
        with open(self.document.settings.template, "rb") as template:
            p.ParseFile(template)

    def listdef_to_xml(self, list_number, level, start, nfc):
        """Modify a listdef to include an alternate numbering scheme and new starting number.
        Then convert it to XML and return the XML string.

        :param list_number: index into ``self.template_list_defs``
        :param level: list level whose start value / number format is replaced
        :param start: new starting number for ``level``
        :param nfc: number-format code for ``level``
        """
        out = ""
        lvl = -1000
        for x in self.template_list_defs[list_number]:
            tag = x[0]
            # change the listDefId to match the new list
            if tag == "w:listDef":
                x[1]["w:listDefId"] = str(self.list_count + 1)
            # get the level if it has changed
            if tag == "w:lvl":
                lvl = int(x[1]["w:ilvl"])
            # skip an existing nfc node
            if (tag == "w:nfc" or tag == "/w:nfc") and level == lvl:
                continue
            out += "<" + tag + " "
            if len(x) == 2:
                for k, v in x[1].items():
                    if tag == "w:start" and k == "w:val" and lvl == level:
                        out += k + '="' + str(start) + '" '
                    else:
                        out += k + '="' + v + '" '
            out += ">\n"
            # add in our nfc node right after the start node
            if tag == "/w:start" and level == lvl:
                out += '<w:nfc w:val="' + str(nfc) + '" />\n'
        return out

    # ------------------------------------------------------------------
    # re-serialization of the element tree into self.doc
    # ------------------------------------------------------------------

    def body_start_element(self, name, attr):
        """Write a start tag to ``self.doc``; the ``w:document`` wrapper is dropped."""
        if name == "w:document":
            return
        element = self.xml_spaces[0 : self.xml_spacing] + "<" + name + " "
        for k, v in attr.items():
            element += k + '="' + v + '" '
        element = element[:-1]
        element += ">"
        # Text runs stay on one line so no whitespace leaks into the text.
        if name != "w:t":
            element += "\n"
        self.xml_spacing += 2
        self.doc += element

    def body_end_element(self, name):
        """Write an end tag to ``self.doc``; the ``w:document`` wrapper is dropped."""
        if name == "w:document":
            return
        self.xml_spacing -= 2
        element = ""
        if name != "w:t":
            element += self.xml_spaces[0 : self.xml_spacing]
        element += "</" + name + ">\n"
        self.doc += element

    def body_data(self, data):
        """Append character data to ``self.doc`` unchanged."""
        self.doc += data

    def check_for_span(self, col, row):
        """Emit an empty merged cell if ``(col, row)`` is covered by a span.

        :returns: True if a cell was emitted, False if the position is free
        """
        check_span = "{" + str(col) + "," + str(row) + "}"
        if check_span not in self.spans:
            return False
        span_tag, span_attrs = self.spans[check_span]
        # Size the cell from the column being filled (``col``), which is the
        # value the string form always used; the tree used ``self.col``.
        width = str(self.calc_col_pct(col))
        self.doc_tree.start("w:tc", {})
        self.doc_tree.start("w:tcPr", {})
        self.doc_tree.start("w:tcW", {"w": width, "w:type": "pct"})
        self.doc_tree.end("w:tcW")
        # The merge marker is a sibling of w:tcW (as in visit_entry and in
        # the string form), not a child of it.
        self.doc_tree.start(span_tag, span_attrs)
        self.doc_tree.end(span_tag)
        self.doc_tree.end("w:tcPr")
        self.doc_tree.start("w:p", {})
        self.doc_tree.end("w:p")
        self.doc_tree.end("w:tc")
        self.body.append(
            '<w:tc>\n <w:tcPr>\n <w:tcW w="'
            + width
            + '" w:type="pct" />\n <'
            + span_tag
            + " />\n </w:tcPr>\n <w:p />\n</w:tc>\n"
        )
        return True

    def astext(self):
        """Expand the template and return the complete WordML document."""
        p = xml.parsers.expat.ParserCreate()
        p.StartElementHandler = self.template_start_element
        p.EndElementHandler = self.template_end_element
        p.CharacterDataHandler = self.template_char_data
        with open(self.document.settings.template, "rb") as template:
            p.ParseFile(template)
        return self.doc

    def encode(self, text):
        """Encode special characters in `text` & return.

        ``&`` is replaced first so the entities produced by the later
        replacements are not escaped a second time.  In literal mode newlines
        become WordML line breaks; otherwise they collapse to a space.
        """
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace('"', "&quot;")
        text = text.replace(">", "&gt;")
        if self.literal_text == 1:
            text = text.replace("\n", "</w:t><w:br /><w:t>")
        else:
            text = text.replace("\n", " ")
        return text

    # ------------------------------------------------------------------
    # node handlers
    # ------------------------------------------------------------------

    def visit_Text(self, node):
        """Open a ``w:r``/``w:t`` run carrying the active text properties."""
        # if we have turned off text input, then just return
        if self.no_text:
            return
        self.doc_tree.start("w:r", {})
        self.body.append("<w:r>")
        if len(self.text_properties) > 0:
            self.doc_tree.start("w:rPr", {})
            self.body.append("<w:rPr>\n")
            for prop in self.text_properties.values():
                self._emit_property(prop)
            self.doc_tree.end("w:rPr")
            self.body.append("</w:rPr>\n")
        self.doc_tree.start("w:t", {})
        self.body.append("<w:t>")
        encoded = self.encode(node.astext())
        self.doc_tree.data(encoded)
        self.body.append(encoded)
        self.has_text_output = 1

    def depart_Text(self, node):
        # if we have turned off text input, then just return
        if self.no_text:
            return
        self.doc_tree.end("w:t")
        self.doc_tree.end("w:r")
        self.body.append("</w:t></w:r>\n")

    def visit_TextNoEncoding(self, node):
        if self.no_text:
            return
        self.doc_tree.data(node.astext())
        self.body.append(node.astext())

    def depart_TextNoEncoding(self, node):
        pass

    def visit_TextWithStyle(self, node):
        # node.style is a (key, property) pair applied for this run only.
        self.text_properties[node.style[0]] = node.style[1]
        self.visit_Text(node)

    def depart_TextWithStyle(self, node):
        del self.text_properties[node.style[0]]
        self.depart_Text(node)

    def visit_abbreviation(self, node):
        pass

    def depart_abbreviation(self, node):
        pass

    def visit_acronym(self, node):
        pass

    def depart_acronym(self, node):
        pass

    def visit_address(self, node):
        pass

    def depart_address(self, node):
        pass

    def visit_admonition(self, node, name=""):
        pass

    def depart_admonition(self, node=None):
        pass

    def visit_attention(self, node):
        pass

    def depart_attention(self, node):
        pass

    def visit_attribution(self, node):
        pass

    def depart_attribution(self, node):
        pass

    def visit_author(self, node):
        self.paragraph_properties["author"] = ("w:pStyle", {"w:val": "AuthorName"})
        self.visit_paragraph(node)

    def depart_author(self, node):
        del self.paragraph_properties["author"]
        self.depart_paragraph(node)

    def visit_authors(self, node):
        pass

    def depart_authors(self, node):
        pass

    def visit_block_quote(self, node):
        self.indentation += 720

    def depart_block_quote(self, node):
        self.indentation -= 720

    def visit_bullet_list(self, node):
        self.list_level += 1
        self.list_count += 1
        self.lists.append(
            '<w:list w:ilfo="' + str(self.list_count) + '">\n <w:ilst w:val="1">\n </w:ilst>\n</w:list>\n'
        )
        # NOTE(review): list properties are plain strings, so they reach only
        # self.body and never the element tree -- confirm whether intended.
        self.list_properties.append(
            '<w:listPr>\n<w:ilvl w:val="'
            + str(self.list_level)
            + '" />\n<w:ilfo w:val="'
            + str(self.list_count)
            + '" />\n</w:listPr>\n'
        )

    def depart_bullet_list(self, node):
        self.list_properties.pop()
        self.list_level -= 1

    def visit_caption(self, node):
        self.figure_count += 1
        self.text_properties["caption"] = ("w:rStyle", {"w:val": "Caption"})
        node.children.insert(0, nodes.Text("Figure " + str(self.figure_count) + " "))

    def depart_caption(self, node):
        del self.text_properties["caption"]

    def visit_caution(self, node):
        pass

    def depart_caution(self, node):
        pass

    def visit_citation(self, node):
        # Citations are rendered where they are referenced; suppress the
        # in-place copy unless we are inside such a reference.
        if not self.in_footnote:
            self.no_text += 1

    def depart_citation(self, node):
        if not self.in_footnote:
            self.no_text -= 1

    def visit_citation_reference(self, node):
        """Render the referenced citation inline as a ``w:endnote``."""
        citation = self.document.ids[node["refid"]]
        citation_reference_text = ""
        if not isinstance(citation, nodes.citation):
            raise TypeError("not a citation node mapped to id")
        self.doc_tree.start("w:r", {})
        self.doc_tree.start("w:rPr", {})
        self.doc_tree.start("w:rStyle", {"w:val": "CitationReference"})
        self.doc_tree.end("w:rStyle")
        self.doc_tree.end("w:rPr")
        self.doc_tree.start("w:endnote", {"w:suppressRef": "on"})
        self.body.append(
            '<w:r>\n<w:rPr>\n<w:rStyle w:val="CitationReference"/>\n</w:rPr>\n<w:endnote w:suppressRef="on">\n'
        )
        self.in_footnote = 1
        former_paragraph_properties = self.paragraph_properties.copy()
        self.paragraph_properties = {}
        self.paragraph_properties["citation"] = ("w:pStyle", {"w:val": "EndnoteText"})
        labels = citation.traverse(condition=nodes.label)
        for n in labels:
            citation_reference_text += n.astext()
            citation.children.remove(n)
        p = citation.traverse(condition=nodes.paragraph)
        # Temporarily prefix the first paragraph with the "[label] " text
        # while the citation body is walked.
        t = TextWithStyle(
            "[" + citation_reference_text + "] ", ("citation", ("w:rStyle", {"w:val": "CitationReference"}))
        )
        p[0].children.insert(0, t)
        citation.walkabout(self)
        p[0].children.remove(t)
        self.doc_tree.end("w:endnote")
        self.doc_tree.start("w:t", {})
        self.doc_tree.data("[" + citation_reference_text + "]")
        self.doc_tree.end("w:t")
        self.doc_tree.end("w:r")
        self.body.append("</w:endnote>\n")
        self.body.append("<w:t>")
        self.body.append("[" + citation_reference_text + "]")
        self.body.append("</w:t>\n</w:r>\n")
        del self.paragraph_properties["citation"]
        self.in_footnote = 0
        # Suppress the reference's own child text; undone in depart.
        self.no_text += 1
        self.paragraph_properties = former_paragraph_properties

    def depart_citation_reference(self, node):
        self.no_text -= 1

    def visit_classifier(self, node):
        pass

    def depart_classifier(self, node):
        pass

    def visit_colspec(self, node):
        self.colspecs.append(node)
        self.total_col_width += node["colwidth"]

    def depart_colspec(self, node):
        pass

    def visit_comment(self, node):
        self.no_text += 1

    def depart_comment(self, node):
        self.no_text -= 1

    def visit_compound(self, node):
        pass

    def depart_compound(self, node):
        pass

    def visit_contact(self, node):
        self.paragraph_properties["contact"] = ("w:pStyle", {"w:val": "AuthorContact"})
        self.visit_paragraph(node)

    def depart_contact(self, node):
        del self.paragraph_properties["contact"]
        self.depart_paragraph(node)

    def visit_container(self, node):
        pass

    def depart_container(self, node):
        pass

    def visit_copyright(self, node):
        self.paragraph_properties["copyright"] = ("w:pStyle", {"w:val": "BibliographMatter"})
        self.visit_paragraph(node)

    def depart_copyright(self, node):
        del self.paragraph_properties["copyright"]
        self.depart_paragraph(node)

    def visit_danger(self, node):
        pass

    def depart_danger(self, node):
        pass

    def visit_date(self, node):
        self.paragraph_properties["date"] = ("w:pStyle", {"w:val": "BibliographMatter"})
        self.visit_paragraph(node)

    def depart_date(self, node):
        del self.paragraph_properties["date"]
        self.depart_paragraph(node)

    def visit_decoration(self, node):
        pass

    def depart_decoration(self, node):
        pass

    def visit_definition(self, node):
        self.indentation += 720
        self.paragraph_properties["definition"] = ("w:pStyle", {"w:val": "Definition"})

    def depart_definition(self, node):
        self.indentation -= 720
        del self.paragraph_properties["definition"]

    def visit_definition_list(self, node):
        pass

    def depart_definition_list(self, node):
        pass

    def visit_definition_list_item(self, node):
        pass

    def depart_definition_list_item(self, node):
        pass

    def visit_description(self, node):
        pass

    def depart_description(self, node):
        pass

    def visit_docinfo(self, node):
        pass

    def depart_docinfo(self, node):
        pass

    def visit_docinfo_item(self, node, name, meta=1):
        pass

    def depart_docinfo_item(self):
        pass

    def visit_doctest_block(self, node):
        pass

    def depart_doctest_block(self, node):
        pass

    def visit_document(self, node):
        pass

    def depart_document(self, node):
        pass

    def visit_emphasis(self, node):
        self.text_properties["*"] = ("w:i", {})

    def depart_emphasis(self, node):
        del self.text_properties["*"]

    def calc_col_pct(self, col):
        """Return column ``col``'s share of the total table width, rounded to a whole percent."""
        width = int(self.colspecs[col]["colwidth"] * 100.0 / self.total_col_width + 0.5)
        return width

    def visit_entry(self, node):
        """Open a table cell and register any row/column span it starts."""
        width = self.calc_col_pct(self.col)
        self.doc_tree.start("w:tc", {})
        self.doc_tree.start("w:tcPr", {})
        self.doc_tree.start("w:tcW", {"w": str(width), "w:type": "pct"})
        self.doc_tree.end("w:tcW")
        self.body.append('<w:tc>\n <w:tcPr>\n <w:tcW w="' + str(width) + '" w:type="pct" />\n')
        self.has_text_output = 0
        if node.has_key("morecols") and node.has_key("morerows"):
            raise NotImplementedError(
                "Table cell " + str(self.col) + "," + str(self.row) + " can't have both merged rows and columns."
            )
        if node.has_key("morecols"):
            self.doc_tree.start("w:hmerge", {"w:val": "restart"})
            self.doc_tree.end("w:hmerge")
            self.body.append('<w:hmerge w:val="restart" />')
            # Remember the covered cells so check_for_span can fill them in.
            for i in range(node["morecols"]):
                span = "{" + str(self.col + i + 1) + "," + str(self.row) + "}"
                self.spans[span] = ("w:hmerge", {})
        if node.has_key("morerows"):
            self.doc_tree.start("w:vmerge", {"w:val": "restart"})
            self.doc_tree.end("w:vmerge")
            self.body.append('<w:vmerge w:val="restart" />')
            for i in range(node["morerows"]):
                span = "{" + str(self.col) + "," + str(self.row + i + 1) + "}"
                self.spans[span] = ("w:vmerge", {})
        self.doc_tree.end("w:tcPr")
        self.body.append("</w:tcPr>\n")

    def depart_entry(self, node):
        # A cell that produced no text still gets an empty paragraph.
        if self.has_text_output == 0:
            self.doc_tree.start("w:p", {})
            self.doc_tree.end("w:p")
            self.body.append(" <w:p />\n")
        self.doc_tree.end("w:tc")
        self.body.append("</w:tc>\n")
        # if there are any cells that are part of a span, then include them here as empty cells.
        col = self.col + 1
        row = self.row
        while self.check_for_span(col, row):
            col = col + 1
        self.col = col
        self.row = row

    def visit_enumerated_list(self, node):
        """Register a new list plus a customized copy of template listDef 0."""
        # to put any sort of customization in, it's necessary to add an entirely new listDef element.
        # so, I need to load the listDefs at the beginning of the run, and then customize them as needed.
        self.list_level += 1
        self.list_count += 1
        list_props = (
            '<w:listPr>\n<w:ilvl w:val="'
            + str(self.list_level)
            + '" />\n<w:ilfo w:val="'
            + str(self.list_count)
            + '" />\n'
        )
        # if the list has an explicit start, then set the text for it
        start = 1
        if node.has_key("start"):
            start = int(node["start"])
        # Unknown enumeration types fall back to arabic (0).
        nfc = self._ENUM_NFC.get(node["enumtype"], 0)
        self.list_defs.append(self.listdef_to_xml(0, self.list_level, start, nfc))
        self.lists.append(
            '<w:list w:ilfo="'
            + str(self.list_count)
            + '">\n <w:ilst w:val="'
            + str(self.list_count + 1)
            + '">\n </w:ilst>\n</w:list>\n'
        )
        list_props += "</w:listPr>\n"
        self.list_properties.append(list_props)

    def depart_enumerated_list(self, node):
        # NOTE(review): nothing in this class increments or reads
        # show_list_properties; kept unchanged in case other code relies on it.
        self.show_list_properties -= 1
        self.list_properties.pop()
        self.list_level -= 1

    def visit_error(self, node):
        pass

    def depart_error(self, node):
        pass

    def visit_field(self, node):
        self.paragraph_properties["field"] = ("w:pStyle", {"w:val": "BibliographMatter"})
        self.visit_paragraph(node)

    def depart_field(self, node):
        del self.paragraph_properties["field"]
        self.depart_paragraph(node)

    def visit_field_body(self, node):
        pass

    def depart_field_body(self, node):
        pass

    def visit_field_list(self, node):
        pass

    def depart_field_list(self, node):
        pass

    def visit_field_name(self, node):
        pass

    def depart_field_name(self, node):
        pass

    def visit_figure(self, node):
        self.in_figure = 1

    def depart_figure(self, node):
        # The image inside a figure leaves its paragraph open (see
        # depart_image); it is closed here.
        if self.in_figure:
            self.doc_tree.end("w:p")
            self.body.append("</w:p>\n")
        self.in_figure = 0

    def visit_footer(self, node):
        pass

    def depart_footer(self, node):
        pass

    def visit_footnote(self, node):
        # Footnotes are rendered where they are referenced.
        if not self.in_footnote:
            self.no_text += 1

    def depart_footnote(self, node):
        if not self.in_footnote:
            self.no_text -= 1

    def visit_footnote_reference(self, node):
        """Render the referenced auto-numbered footnote inline as a ``w:endnote``."""
        if not node.has_key("auto"):
            raise TypeError("footnotes required to be auto numbered")
        footnote = self.document.ids[node["refid"]]
        if not isinstance(footnote, nodes.footnote):
            raise TypeError("not a footnote node mapped to id")
        self.doc_tree.start("w:r", {})
        self.doc_tree.start("w:rPr", {})
        self.doc_tree.start("w:rStyle", {"w:val": "EndnoteReference"})
        self.doc_tree.end("w:rStyle")
        self.doc_tree.end("w:rPr")
        self.doc_tree.start("w:endnote", {})
        self.body.append('<w:r>\n<w:rPr>\n<w:rStyle w:val="EndnoteReference"/>\n</w:rPr>\n<w:endnote>\n')
        self.in_footnote = 1
        former_paragraph_properties = self.paragraph_properties.copy()
        self.paragraph_properties = {}
        self.paragraph_properties["footnote"] = ("w:pStyle", {"w:val": "EndnoteText"})
        # Drop the label nodes from the target footnote; a w:endnoteRef
        # region is inserted in their place below.
        labels = footnote.traverse(condition=nodes.label)
        for n in labels:
            footnote.children.remove(n)
        p = footnote.traverse(condition=nodes.paragraph)
        t = XMLRegion(xml=[("w:endnoteRef", {})], styles=[("w:rStyle", {"w:val": "EndnoteReference"})])
        p[0].children.insert(0, t)
        footnote.walkabout(self)
        p[0].children.remove(t)
        self.doc_tree.end("w:endnote")
        self.doc_tree.end("w:r")
        self.body.append("</w:endnote>\n</w:r>\n")
        del self.paragraph_properties["footnote"]
        self.in_footnote = 0
        # Suppress the reference's own child text; undone in depart.
        self.no_text += 1
        self.paragraph_properties = former_paragraph_properties

    def depart_footnote_reference(self, node):
        self.no_text -= 1

    def visit_generated(self, node):
        pass

    def depart_generated(self, node):
        pass

    def visit_header(self, node):
        pass

    def depart_header(self, node):
        pass

    def visit_hint(self, node):
        pass

    def depart_hint(self, node):
        pass

    def visit_image(self, node):
        """Emit a ``w:pict`` paragraph for the image; the paragraph is closed on depart."""
        width = 100
        height = 100
        align = "center"
        use_width = 0
        use_height = 0
        if Image:
            # Take the natural size from the file when it can be read.
            try:
                im = Image.open(node["uri"])
                width = im.size[0]
                height = im.size[1]
                use_width = 1
                use_height = 1
            except (IOError, UnicodeError):
                pass
        # Explicit attributes override the detected size.
        if node.has_key("width"):
            width = node["width"]
            use_width = 1
        if node.has_key("height"):
            height = node["height"]
            use_height = 1
        if node.has_key("align"):
            align = node["align"]
        self.doc_tree.start("w:p", {})
        self.doc_tree.start("w:pPr", {})
        self.doc_tree.start("w:jc", {"w:val": str(align)})
        self.doc_tree.end("w:jc")
        self.doc_tree.end("w:pPr")
        self.doc_tree.start("w:pict", {})
        style = "position:absolute;left:0;text-align:left;margin-left:0;margin-top:0;width:"
        if use_width:
            style += str(width) + "px"
        else:
            style += "auto"
        style += ";height:"
        if use_height:
            style += str(height) + "px"
        else:
            style += "auto"
        style += ";z-index:1;mso-position-horizontal:center"
        self.doc_tree.start(
            "v:shape",
            {
                "id": str(self.gen_id()),
                "style": style,
                "coordsize": "",
                "o:spt": "100",
                "adj": "0,,0",
                "path": "",
                "stroked": "f",
            },
        )
        self.doc_tree.start("v:imagedata", {"src": node["uri"]})
        self.doc_tree.end("v:imagedata")
        self.doc_tree.start("w10:wrap", {"type": "square"})
        self.doc_tree.end("w10:wrap")
        self.doc_tree.end("v:shape")
        self.doc_tree.end("w:pict")
        # NOTE(review): gen_id() is called a second time here, so the string
        # form carries a different shape id than the tree form.
        self.body.append(
            '<w:p>\n<w:pPr>\n<w:jc w:val="'
            + str(align)
            + '" />\n</w:pPr>\n<w:pict>\n<v:shape id="'
            + str(self.gen_id())
            + '" '
        )
        self.body.append('style="position:absolute;left:0;text-align:left;margin-left:0;margin-top:0;width:')
        if use_width:
            self.body.append(str(width) + "px")
        else:
            self.body.append("auto")
        self.body.append(";height:")
        if use_height:
            self.body.append(str(height) + "px")
        else:
            self.body.append("auto")
        self.body.append(
            ';z-index:1;mso-position-horizontal:center" coordsize="" o:spt="100" adj="0,,0" path="" stroked="f" >\n'
        )
        self.body.append('<v:imagedata src="' + node["uri"] + '"/>\n<w10:wrap type="square"/>\n</v:shape>\n</w:pict>\n')

    def depart_image(self, node):
        # Inside a figure the paragraph stays open until depart_figure.
        if not self.in_figure:
            self.doc_tree.end("w:p")
            self.body.append("</w:p>\n")

    def visit_important(self, node):
        pass

    def depart_important(self, node):
        pass

    def visit_inline(self, node):
        pass

    def depart_inline(self, node):
        pass

    def visit_label(self, node):
        # The value must not contain literal quote characters: they would end
        # up inside the attribute value in both output forms.
        self.text_properties["label"] = ("w:rStyle", {"w:val": "EndnoteReference"})

    def depart_label(self, node):
        del self.text_properties["label"]

    def visit_legend(self, node):
        pass

    def depart_legend(self, node):
        pass

    def visit_line(self, node):
        pass

    def depart_line(self, node):
        pass

    def visit_line_block(self, node):
        pass

    def depart_line_block(self, node):
        pass

    def visit_list_item(self, node):
        pass

    def depart_list_item(self, node):
        pass

    def visit_literal(self, node):
        self.text_properties["literal"] = ("w:rStyle", {"w:val": "Literal"})

    def depart_literal(self, node):
        del self.text_properties["literal"]

    def visit_literal_block(self, node):
        self.paragraph_properties["literal"] = ("w:pStyle", {"w:val": "LiteralBlock"})
        self.visit_paragraph(node)
        # Keep newlines as line breaks while inside the block (see encode).
        self.literal_text = 1

    def depart_literal_block(self, node):
        del self.paragraph_properties["literal"]
        self.depart_paragraph(node)
        self.literal_text = 0

    def visit_meta(self, node):
        pass

    def depart_meta(self, node):
        pass

    def visit_note(self, node):
        pass

    def depart_note(self, node):
        pass

    def visit_option(self, node):
        pass

    def depart_option(self, node):
        pass

    def visit_option_argument(self, node):
        pass

    def depart_option_argument(self, node):
        pass

    def visit_option_group(self, node):
        pass

    def depart_option_group(self, node):
        pass

    def visit_option_list(self, node):
        pass

    def depart_option_list(self, node):
        pass

    def visit_option_list_item(self, node):
        pass

    def depart_option_list_item(self, node):
        pass

    def visit_option_string(self, node):
        pass

    def depart_option_string(self, node):
        pass

    def visit_organization(self, node):
        self.paragraph_properties["organization"] = ("w:pStyle", {"w:val": "BibliographMatter"})
        self.visit_paragraph(node)

    def depart_organization(self, node):
        del self.paragraph_properties["organization"]
        self.depart_paragraph(node)

    def visit_paragraph(self, node):
        """Open a ``w:p`` and emit its ``w:pPr`` block when any property applies."""
        self.doc_tree.start("w:p", {})
        self.body.append("<w:p>\n")
        indented = self.indentation > 0 and not self.in_footnote
        if len(self.paragraph_properties) > 0 or len(self.list_properties) > 0 or indented:
            self.doc_tree.start("w:pPr", {})
            self.body.append("<w:pPr>\n")
            if indented:
                self.doc_tree.start("w:ind", {"w:left": str(self.indentation), "w:right": str(self.indentation)})
                self.doc_tree.end("w:ind")
                self.body.append(
                    '<w:ind w:left="' + str(self.indentation) + '" w:right="' + str(self.indentation) + '" />\n'
                )
            for prop in self.paragraph_properties.values():
                self._emit_property(prop)
            # Only paragraphs that sit directly in a list item carry the
            # innermost list's properties.
            if len(self.list_properties) > 0 and isinstance(node.parent, nodes.list_item):
                self._emit_property(self.list_properties[-1])
            self.doc_tree.end("w:pPr")
            self.body.append("\n</w:pPr>\n")

    def depart_paragraph(self, node):
        self.doc_tree.end("w:p")
        self.body.append("</w:p>\n")

    def visit_problematic(self, node):
        pass

    def depart_problematic(self, node):
        pass

    def visit_raw(self, node):
        pass

    def visit_reference(self, node):
        """Open a ``w:hlink`` for an internal (``refid``) or external (``refuri``) reference."""
        if node.has_key("refid"):
            self.doc_tree.start("w:hlink", {"w:bookmark": node["refid"]})
            self.body.append('<w:hlink w:bookmark="' + node["refid"] + '" >\n')
        if node.has_key("refuri"):
            self.doc_tree.start("w:hlink", {"w:dest": node["refuri"]})
            self.body.append('<w:hlink w:dest="' + node["refuri"] + '" >\n')
        if not node.has_key("refuri") and not node.has_key("refid"):
            raise NotImplementedError("Unknown reference type")
        self.text_properties["ref"] = ("w:rStyle", {"w:val": "Hyperlink"})

    def depart_reference(self, node):
        del self.text_properties["ref"]
        self.doc_tree.end("w:hlink")
        self.body.append("</w:hlink>\n")

    def visit_revision(self, node):
        pass

    def depart_revision(self, node):
        pass

    def visit_row(self, node):
        self.doc_tree.start("w:tr", {})
        self.body.append("<w:tr>\n")
        # Leading cells may be covered by a row span from an earlier row.
        while self.check_for_span(self.col, self.row):
            self.col += 1

    def depart_row(self, node):
        self.row += 1
        self.col = 0
        self.doc_tree.end("w:tr")
        self.body.append("</w:tr>\n")

    def visit_rubric(self, node):
        pass

    def depart_rubric(self, node):
        pass

    def visit_section(self, node):
        """Enter a section and emit a bookmark for each of its ids.

        :raises NotImplementedError: if the section is nested deeper than
            ``title_styles`` can style
        """
        self.section_level += 1
        # visit_title indexes title_styles with section_level, so the deepest
        # usable level is len(title_styles) - 1.  The previous bound of 3 let
        # a level through that then failed with IndexError on its title.
        if self.section_level >= len(self.title_styles):
            raise NotImplementedError("Only 3 levels of headings supported.")
        if node.has_key("ids"):
            for section_id in node["ids"]:
                self._emit_bookmark(section_id, "")

    def depart_section(self, node):
        self.section_level -= 1

    def visit_sidebar(self, node):
        pass

    def depart_sidebar(self, node):
        pass

    def visit_status(self, node):
        self.paragraph_properties["status"] = ("w:pStyle", {"w:val": "BibliographMatter"})
        self.visit_paragraph(node)

    def depart_status(self, node):
        del self.paragraph_properties["status"]
        self.depart_paragraph(node)

    def visit_strong(self, node):
        self.text_properties["**"] = ("w:b", {})

    def depart_strong(self, node):
        del self.text_properties["**"]

    def visit_subscript(self, node):
        self.text_properties["subscript"] = ("w:vertAlign", {"w:val": "subscript"})

    def depart_subscript(self, node):
        del self.text_properties["subscript"]

    def visit_substitution_definition(self, node):
        raise nodes.SkipNode

    def visit_substitution_reference(self, node):
        raise NotImplementedError("substitution references not implemented")

    def visit_subtitle(self, node):
        # A subtitle uses the style one level below the current title.
        self.paragraph_properties["subtitle"] = ("w:pStyle", {"w:val": self.title_styles[self.section_level + 1]})
        self.visit_paragraph(node)

    def depart_subtitle(self, node):
        del self.paragraph_properties["subtitle"]
        self.depart_paragraph(node)

    def visit_superscript(self, node):
        self.text_properties["superscript"] = ("w:vertAlign", {"w:val": "superscript"})

    def depart_superscript(self, node):
        del self.text_properties["superscript"]

    def visit_system_message(self, node):
        pass

    def depart_system_message(self, node):
        pass

    def visit_table(self, node):
        """Reset the table state and emit the table header with default borders."""
        # include for now the default border around the table with a top vertical alignment
        self.in_table = 1
        self.colspecs = []
        self.spans = {}
        self.total_col_width = 0
        self.row = 0
        self.col = 0
        self.doc_tree.start("w:tbl", {})
        self.doc_tree.start("w:tblPr", {})
        self.doc_tree.start("w:tblStyle", {"w:val": "Normal"})
        self.doc_tree.end("w:tblStyle")
        self.doc_tree.start("w:tblW", {"w:w": "5000", "w:type": "pct"})
        self.doc_tree.end("w:tblW")
        self.doc_tree.start("w:tblBorders", {})
        # Outer borders are thinner (sz 4) than the inner grid lines (sz 6).
        borders = (
            ("w:top", "4", "10"),
            ("w:left", "4", "10"),
            ("w:bottom", "4", "10"),
            ("w:right", "4", "10"),
            ("w:insideH", "6", "15"),
            ("w:insideV", "6", "15"),
        )
        for border_tag, size, border_width in borders:
            self.doc_tree.start(
                border_tag,
                {"w:val": "single", "w:sz": size, "wx:bdrwidth": border_width, "w:space": "0", "w:color": "auto"},
            )
            self.doc_tree.end(border_tag)
        self.doc_tree.end("w:tblBorders")
        self.doc_tree.start("w:tblLook", {"w:val": "000001e0"})
        self.doc_tree.end("w:tblLook")
        self.doc_tree.end("w:tblPr")
        self.body.append(
            "<w:tbl>\n"
            " <w:tblPr>\n"
            ' <w:tblStyle w:val="Normal" />\n'
            ' <w:tblW w:w="5000" w:type="pct" />\n'
            " <w:tblBorders>\n"
            ' <w:top w:val="single" w:sz="4" wx:bdrwidth="10" w:space="0" w:color="auto"/>\n'
            ' <w:left w:val="single" w:sz="4" wx:bdrwidth="10" w:space="0" w:color="auto"/>\n'
            ' <w:bottom w:val="single" w:sz="4" wx:bdrwidth="10" w:space="0" w:color="auto"/>\n'
            ' <w:right w:val="single" w:sz="4" wx:bdrwidth="10" w:space="0" w:color="auto"/>\n'
            ' <w:insideH w:val="single" w:sz="6" wx:bdrwidth="15" w:space="0" w:color="auto"/>\n'
            ' <w:insideV w:val="single" w:sz="6" wx:bdrwidth="15" w:space="0" w:color="auto"/>\n'
            " </w:tblBorders>\n"
            ' <w:tblLook w:val="000001e0" />\n'
            " </w:tblPr>\n"
        )

    def depart_table(self, node):
        self.doc_tree.end("w:tbl")
        self.doc_tree.start("w:p", {})
        self.doc_tree.end("w:p")
        self.body.append("</w:tbl>\n")
        self.body.append("<w:p />")  # add a blank line after the table
        self.in_table = 0

    def visit_target(self, node):
        if node.has_key("refid"):
            self._emit_bookmark(node["refid"], "\n")

    def depart_target(self, node):
        pass

    def visit_tbody(self, node):
        pass

    def depart_tbody(self, node):
        pass

    def visit_term(self, node):
        self.paragraph_properties["term"] = ("w:pStyle", {"w:val": "DefinitionTerm"})
        self.visit_paragraph(node)

    def depart_term(self, node):
        del self.paragraph_properties["term"]
        self.depart_paragraph(node)

    def visit_tgroup(self, node):
        pass

    def depart_tgroup(self, node):
        pass

    def visit_thead(self, node):
        pass

    def depart_thead(self, node):
        pass

    def visit_tip(self, node):
        pass

    def depart_tip(self, node):
        pass

    def visit_title(self, node, move_ids=1):
        self.paragraph_properties["title"] = ("w:pStyle", {"w:val": self.title_styles[self.section_level]})
        self.visit_paragraph(node)

    def depart_title(self, node):
        del self.paragraph_properties["title"]
        self.depart_paragraph(node)

    def visit_title_reference(self, node):
        pass

    def depart_title_reference(self, node):
        pass

    def visit_topic(self, node):
        self.paragraph_properties["topic"] = ("w:pStyle", {"w:val": "Topic"})
        self.visit_paragraph(node)

    def depart_topic(self, node):
        del self.paragraph_properties["topic"]
        self.depart_paragraph(node)

    def visit_transition(self, node):
        pass

    def depart_transition(self, node):
        pass

    def visit_version(self, node):
        self.paragraph_properties["version"] = ("w:pStyle", {"w:val": "BibliographMatter"})
        self.visit_paragraph(node)

    def depart_version(self, node):
        del self.paragraph_properties["version"]
        self.depart_paragraph(node)

    def visit_warning(self, node):
        pass

    def depart_warning(self, node):
        pass

    def visit_XMLRegion(self, node):
        """Emit a run whose properties and content are given as ``(tag, attrs)`` pairs."""
        self.doc_tree.start("w:r", {})
        self.doc_tree.start("w:rPr", {})
        for style_tag, style_attrs in node["styles"]:
            self.doc_tree.start(style_tag, style_attrs)
            self.doc_tree.end(style_tag)
        self.doc_tree.end("w:rPr")
        for xml_tag, xml_attrs in node["xml"]:
            self.doc_tree.start(xml_tag, xml_attrs)
            self.doc_tree.end(xml_tag)

    def depart_XMLRegion(self, node):
        self.doc_tree.end("w:r")

    def unimplemented_visit(self, node):
        pass
def scheme_to_etree(scheme, data_format="literal", pickle_fallback=False):
    """
    Serialize `scheme` into an `xml.etree.ElementTree.ElementTree`.

    The resulting tree has a ``scheme`` root element (``version`` "2.0")
    containing, in this order: ``nodes``, ``links``, ``annotations``, an
    empty ``thumbnail`` element and ``node_properties``.

    :param scheme: object exposing ``title``, ``description``, ``nodes``,
        ``links`` and ``annotations``.
    :param data_format: forwarded to ``dumps`` as its ``format`` argument
        when serializing node properties.
    :param pickle_fallback: forwarded unchanged to ``dumps``.
    :return: an ``ElementTree`` wrapping the built root element.
    """
    builder = TreeBuilder(element_factory=Element)

    def emit(tag, attributes, text=None):
        # Write one complete element, optionally with character data.
        builder.start(tag, attributes)
        if text is not None:
            builder.data(text)
        builder.end(tag)

    root_attrs = {
        "version": "2.0",
        "title": scheme.title or "",
        "description": scheme.description or "",
    }
    builder.start("scheme", root_attrs)

    # --- Nodes: ids are handed out lazily, in first-seen order.
    node_ids = defaultdict(inf_range().__next__)
    builder.start("nodes", {})
    for node in scheme.nodes:
        description = node.description
        node_attrs = {
            "id": str(node_ids[node]),
            "name": description.name,
            "qualified_name": description.qualified_name,
            "project_name": description.project_name or "",
            "version": description.version or "",
            "title": node.title,
        }
        if node.position is not None:
            node_attrs["position"] = str(node.position)

        node_type = type(node)
        if node_type is not SchemeNode:
            # Written as "<class name>.<module>", in that order.
            node_attrs["scheme_node_type"] = "%s.%s" % (
                node_type.__name__,
                node_type.__module__,
            )
        emit("node", node_attrs)
    builder.end("nodes")

    # --- Links: refer to their end points through the node ids above.
    link_ids = defaultdict(inf_range().__next__)
    builder.start("links", {})
    for link in scheme.links:
        source_node = link.source_node
        sink_node = link.sink_node
        source_id = node_ids[source_node]
        sink_id = node_ids[sink_node]
        link_attrs = {
            "id": str(link_ids[link]),
            "source_node_id": str(source_id),
            "sink_node_id": str(sink_id),
            "source_channel": link.source_channel.name,
            "sink_channel": link.sink_channel.name,
            "enabled": "true" if link.enabled else "false",
        }
        emit("link", link_attrs)
    builder.end("links")

    # --- Annotations
    annotation_ids = defaultdict(inf_range().__next__)
    builder.start("annotations", {})
    for annotation in scheme.annotations:
        # The id is drawn before the type check, so an annotation that
        # cannot be saved still consumes one.
        annot_attrs = {"id": str(annotation_ids[annotation])}
        text = None

        if isinstance(annotation, SchemeTextAnnotation):
            tag = "text"
            annot_attrs["type"] = annotation.content_type
            annot_attrs["rect"] = repr(annotation.rect)

            # Save the font attributes; missing ones are left out below.
            font = annotation.font
            annot_attrs["font-family"] = font.get("family", None)
            annot_attrs["font-size"] = font.get("size", None)

            annot_attrs = {
                key: str(value)
                for key, value in annot_attrs.items()
                if value is not None
            }
            text = annotation.content
        elif isinstance(annotation, SchemeArrowAnnotation):
            tag = "arrow"
            annot_attrs["start"] = repr(annotation.start_pos)
            annot_attrs["end"] = repr(annotation.end_pos)
            annot_attrs["fill"] = annotation.color
        else:
            log.warning("Can't save %r", annotation)
            continue

        emit(tag, annot_attrs, text)
    builder.end("annotations")

    emit("thumbnail", {})

    # --- Node properties/settings
    builder.start("node_properties", {})
    for node in scheme.nodes:
        if not node.properties:
            continue

        payload = None
        try:
            payload, payload_format = dumps(
                node.properties, format=data_format, pickle_fallback=pickle_fallback
            )
        except Exception:
            # Best effort: a node whose properties fail to serialize is
            # logged and left out; the rest of the scheme is still saved.
            log.error(
                "Error serializing properties for node %r",
                node.title,
                exc_info=True,
            )

        if payload is not None:
            emit(
                "properties",
                {"node_id": str(node_ids[node]), "format": payload_format},
                payload,
            )
    builder.end("node_properties")

    builder.end("scheme")
    return ElementTree(builder.close())
class XMLDumper(object):
    """XML-dumper for raw (unresolved) COD parse trees.

    Elements are accumulated in an ``xml.etree.ElementTree.TreeBuilder``
    and written to ``output_file`` by :meth:`dump_cod`.
    """

    def __init__(self, output_file, log_file=sys.stderr):
        """Store the output target and log stream and create a builder.

        :param output_file: passed to ``ElementTree.write`` by
            :meth:`dump_cod`.
        :param log_file: object with a ``write`` method used by :meth:`log`.
        """
        from xml.etree.ElementTree import TreeBuilder

        self._out = output_file
        self._log = log_file
        self._tb = TreeBuilder()

    def log(self, msg):
        """Write ``msg`` followed by a newline to the log stream."""
        # ``file.write`` instead of the Python 2 only ``print >>file``
        # statement, so this also works on Python 3.
        self._log.write("%s\n" % (msg,))

    def start(self, tag, **attrs):
        """Open element ``tag``; attribute values are converted with ``str``.

        :return: whatever ``TreeBuilder.start`` returns.
        """
        # ``items()`` exists on both Python 2 and 3 (``iteritems()`` does not).
        return self._tb.start(tag, {k: str(v) for k, v in attrs.items()})

    def data(self, data):
        """Add character data to the builder and return ``self``."""
        self._tb.data(data)
        return self

    def end(self, tag):
        """Close element ``tag`` and feed a newline after it.

        :return: whatever ``TreeBuilder.end`` returns.
        """
        closed = self._tb.end(tag)
        self.data("\n")
        return closed

    def close(self):
        """Return the built root element and reset to a fresh builder."""
        from xml.etree.ElementTree import TreeBuilder

        root = self._tb.close()
        self._tb = TreeBuilder()
        return root

    def dump_cod(self, cod_file):
        """Dump ``cod_file`` as XML to the output file.

        The root element is named after the type of ``cod_file``.

        :return: the ``ElementTree`` that was written.
        """
        from xml.etree.ElementTree import ElementTree

        root_tag = type(cod_file).__name__
        self.start(root_tag)
        self.dump_struct(cod_file)
        self.end(root_tag)
        etree = ElementTree(self.close())
        etree.write(self._out)
        return etree

    def dump_struct(self, struct):
        """Dump every field of ``struct``.

        Iterating ``struct`` must yield field names; each value is fetched
        with ``getattr``.
        """
        for name in struct:
            self.dump_field(name, getattr(struct, name))

    def dump_field(self, name, value):
        """Emit one element for the field ``name`` holding ``value``.

        The element is named after the type of ``value`` and carries a
        ``name`` attribute; ``Struct`` values additionally get ``start``,
        ``end`` and ``length`` attributes.
        """
        from bytecleaver import Struct

        attrs = {"name": name}
        if isinstance(value, Struct):
            attrs["start"] = str(value._start)
            attrs["end"] = str(value._end)
            attrs["length"] = str(len(value))
        tag = type(value).__name__
        self.start(tag, **attrs)
        self.dump_value(value)
        self.end(tag)

    def dump_value(self, value):
        """Emit the content of ``value``.

        ``Struct`` values are dumped field by field, plain lists as one
        indexed child element per item, and anything else as its ``repr``.
        """
        from bytecleaver import Struct

        if isinstance(value, Struct):
            self.dump_struct(value)
        elif type(value) is list:
            # Exact type check kept on purpose: list subclasses are dumped
            # via repr(), as before.
            for i, item in enumerate(value):
                item_tag = type(item).__name__
                self.start(item_tag, index=str(i))
                self.dump_value(item)
                self.end(item_tag)
        else:
            self.data(repr(value))
print theSub print "Found jobid:", jobid print "nJobs: ", nJobs nomvar = "var" print nomvar jobid_pre = jobid.split("@")[0] print jobid.split("@") builder.start( "Sub", { "Name": theSub, "Sample": sample, "Type": "var", "Inel": var[0], "Elast": var[1] }) for i in range(0, int(nJobs)): jobid_end_num = int(jobid_pre[-1]) + i new_jobid = jobid_pre[0:-1] + str(jobid_end_num) builder.start("Job", {"ID": new_jobid, "N": str(i)}) builder.end("Job") builder.end("Sub") else: print "Error\n", stderr builder.end("Jobs") root = builder.close() rough = tostring(root, 'utf-8') reparsed = xdm.parseString(rough) tree = ElementTree(fromstring(reparsed.toprettyxml(indent=" "))) tree.write(args.o)