def source_fragment(self):
    value = self.context.body
    xpath = self.request.get('xpath', None)
    namespaces = {}
    if xpath is None or xpath == '/':
        return value.raw_encoded
    elif xpath.startswith('/cnx:'):
        namespaces = {'cnx': 'http://cnx.rice.edu/cnxml'}
    result = value.raw_encoded
    tree = etree.fromstring(value.raw_encoded)
    result = ''
    for node in tree.xpath(xpath, namespaces=namespaces):
        result += etree.tostring(node)
    # if we don't strip namespaces from the fragment, eip loses the
    # plot completely
    result = etree.fromstring(result)
    for prefix, ns in result.nsmap.items():
        if prefix is None:
            prefix = 'nil'
        etree.strip_attributes(result, '{%s}%s' % (ns, prefix))
    etree.cleanup_namespaces(result)
    result = etree.tostring(result)
    result = result.replace('xmlns="http://cnx.rice.edu/cnxml" ', '')
    return result
def insert_frequency(in_dir, out_dir, freq_dir):
    """
    Find frequency values in the frequency_build data, and insert them
    into the GEL data.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    frequency_finder = FrequencyMemo(freq_dir)
    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for wordclass_set in entry.wordclass_sets():
                etree.strip_attributes(wordclass_set.node, 'size')
                tables = {}
                for type in wordclass_set.types():
                    frequencies = frequency_finder.find_frequencies(type.id)
                    if frequencies:
                        tables[type.id] = FrequencyTable(data=frequencies)
                    else:
                        tables[type.id] = None
                for type in wordclass_set.types():
                    if tables[type.id]:
                        type.node.append(tables[type.id].to_xml())
                non_null_tables = [table for table in tables.values() if table]
                if non_null_tables:
                    wcs_table = sum_frequency_tables(non_null_tables)
                    wordclass_set.node.append(wcs_table.to_xml())
def strip_attributes(self, attributes):
    """
    Remove a list of attributes from the node.
    (Wrapper for etree.strip_attributes().)
    """
    etree.strip_attributes(self.node, attributes)
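# Note on the wrapper above: lxml's etree.strip_attributes() is variadic --
# it takes one or more attribute names, each either a plain name or a
# Clark-notation '{namespace}name'. A minimal sketch of its behaviour
# (element and attribute names below are made up for illustration):
from lxml import etree

root = etree.fromstring(
    '<root xmlns:x="http://example.com/ns">'
    '<item id="1" class="big" x:secret="yes">text</item>'
    '</root>')
# Plain and namespaced names can be mixed in a single call; the names are
# stripped from every element in the subtree.
etree.strip_attributes(root, 'class', '{http://example.com/ns}secret')
# The now-unused xmlns:x declaration can then be dropped, as several
# examples in this section do:
etree.cleanup_namespaces(root)
print(etree.tostring(root).decode())  # <root><item id="1">text</item></root>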
def xml(self):
    """Filtered and stripped serialized document."""
    if not hasattr(self, "_xml"):
        try:
            xml = etree.tostring(self.doc.resolved, encoding="utf-8")
            parser = etree.XMLParser(remove_blank_text=True)
            root = etree.fromstring(xml, parser)
            first = True
            for node in root.findall("SummaryMetaData/MainTopics"):
                if first:
                    first = False
                else:
                    parent = node.getparent()
                    parent.remove(node)
            for node in root.xpath(self.CHANGES):
                parent = node.getparent()
                parent.remove(node)
            etree.strip_elements(root, with_tail=False, *self.STRIP)
            etree.strip_attributes(root, "PdqKey")
            opts = dict(pretty_print=True, encoding="unicode")
            self._xml = etree.tostring(root, **opts)
        except:
            logger.exception("failure processing XML")
            bail("failure processing XML")
    return self._xml
def clean_html(self, html):
    if not isinstance(html, unicode):
        raise ValueError('We only support cleaning unicode HTML fragments')
    # We wrap the content up in an extra div tag (otherwise lxml does weird
    # things to it - like adding in <p> tags and stuff)
    divnode = fromstring(u'<div>' + html + u'</div>')
    self(divnode)
    # Strip all class attributes
    etree.strip_attributes(divnode, 'class')
    for style in divnode.xpath("//@style"):
        parent = style.getparent()
        try:
            cssStyle = cssutils.parseStyle(style)
        except Exception, e:
            logging.info("Style %s failed to parse with error %s." % (style, e))
            parent.attrib.pop('style', None)
            continue
        # Set the line separator so that the style gets serialized
        cssutils.ser.prefs.lineSeparator = ''
        # Only allow valid style properties
        cssutils.ser.prefs.validOnly = True
        new_style = cssStyle.cssText
        if not new_style.strip():
            parent.attrib.pop('style', None)
        else:
            parent.attrib['style'] = new_style
def handle_unit_node(self, node):
    original_id = node.get("id")
    filename = node.get("id").split("__%04__")[1]
    node.set("id", original_id.split("__%04__").pop())
    for k in [i for i in node.keys() if i != "id"]:
        etree.strip_attributes(node, k)
    self.create_file_element(node, filename)
def rel_to_abs(self, base_url):
    """
    Converts relative links from html contents to absolute links
    """
    # Delete target attributes
    strip_attributes(self.tree, "target")
    # Absolute links
    self.tree.rewrite_links(
        lambda link: urljoin(base_url, link)
        if not link.startswith(self.rel_to_abs_excluded_prefixes)
        else link
    )
    # Extra attributes
    onclick_elements = self.tree.xpath("//*[@onclick]")
    for element in onclick_elements:
        # Replace attribute with absolute URL
        element.set(
            "onclick",
            self.javascript_open_re.sub(
                lambda match: "%s%s%s" % (match.group("opening"),
                                          urljoin(base_url, match.group("url")),
                                          match.group("ending")),
                element.get("onclick"),
            ),
        )
def fit_svg_circles(self):
    elem_circles = self.root.findall('circle')
    for e in elem_circles:
        e.attrib['r'] = "{:0.2f}".format(
            max(float(e.attrib['height']), float(e.attrib['width'])) / 2)
        e.attrib['cx'] = "{:0.2f}".format(float(e.attrib['x']) + float(e.attrib['r']))
        e.attrib['cy'] = "{:0.2f}".format(float(e.attrib['y']) + float(e.attrib['r']))
        etree.strip_attributes(e, 'x', 'y', 'width', 'height', 'transform')
def reduce_block(node):
    logger.debug("reduce_block() with node: %s %s", node.tag, node.attrib)
    count_all_children = len(node.findall('./', namespaces))
    count_fob_children = len(node.findall('./fo:block', namespaces))
    count_fom_children = len(node.findall('./fo:marker', namespaces))
    logger.debug("amount of children: %s", count_all_children)
    logger.debug("amount of fob children: %s", count_fob_children)
    logger.debug("node contains any text: %s", "None" if node.text is None else "yes")
    if (count_all_children == 1 and count_fob_children == 1 and node.text is None) or \
       (count_all_children == 2 and count_fob_children == 1 and
            count_fom_children == 1 and node.text is None):
        logger.debug("merge possible: yes")
        child = node.find('fo:block', namespaces)
        marker = node.find('fo:marker', namespaces)
        parent = node.getparent()
        # contains the position of the current node in the parent tree
        pos = parent.index(node)
        # merging attributes:
        # child attributes overwrite parent attributes
        merged_attrib = merge_two_dicts(dict(node.attrib), dict(child.attrib))
        logger.debug("child attribs: %s %s", len(child.attrib), child.attrib)
        logger.debug("node attribs: %s %s", len(node.attrib), node.attrib)
        logger.debug("merged attribs %s %s", len(merged_attrib), merged_attrib)
        # merging:
        # 1) move child to the position of node in parent tree
        # 2) replace child's attributes with the merged attribs
        # 3) remove node from parent tree
        parent.insert(pos, child)
        etree.strip_attributes(child, '*')
        for k, v in merged_attrib.iteritems():
            child.set(k, v)
        parent.remove(node)
        # if there is a fo:marker block, it also has to be moved (in front of fo:block)
        if marker is not None:
            parent.insert(pos, marker)
        # because the node order has changed (node was replaced by child),
        # the child must be analyzed as well, so we call reduce_block with
        # child from the new position
        reduce_block(parent[pos])
    else:
        logger.debug("merge possible: no")
        logger.debug("children: ")
        for child in node.findall('./fo:block', namespaces):
            logger.debug("%s %s", child.tag, child.attrib)
        for child in node.findall('./fo:block', namespaces):
            reduce_block(child)
    return
def Write(self, fxml):
    etree.strip_attributes(
        self.root, '{http://codespeak.net/lxml/objectify/pytype}pytype')
    objectify.deannotate(self.root, xsi_nil=True)
    etree.cleanup_namespaces(self.root)
    print >> fxml, (etree.tostring(self.root, pretty_print=True,
                                   xml_declaration=True, encoding='utf-8'))
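# The Write() method above strips objectify's py:pytype annotation by its
# fully qualified attribute name. objectify.deannotate() can remove the same
# annotations (plus the xsi:type / xsi:nil ones) without spelling out the
# namespace; a minimal sketch under that assumption (element names invented):
from lxml import etree, objectify

route = objectify.Element('route')
route.name = 'Morning ride'   # objectify annotates this with py:pytype="str"
route.distance = 42.5         # ... and this with py:pytype="float"
objectify.deannotate(route, pytype=True, xsi=True, xsi_nil=True,
                     cleanup_namespaces=True)
print(etree.tostring(route, pretty_print=True).decode())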
def fit_svg_polygons(self):
    elem_polygons = self.root.findall('polygon')
    for e in elem_polygons:
        e.attrib['width'] = e.attrib['height']
        t = create_triangle(float(e.attrib['x']), float(e.attrib['y']),
                            max(float(e.attrib['height']), float(e.attrib['width'])))
        e.attrib['points'] = "{:0.2f},{:0.2f} {:0.2f},{:0.2f} {:0.2f},{:0.2f}".\
            format(*t[:-2])
        etree.strip_attributes(e, 'x', 'y', 'width', 'height')
def remove_namespaces(self, tree):
    for elem in tree.getiterator():
        if not hasattr(elem.tag, 'find'):
            continue  # (1)
        i = elem.tag.find('}')
        if i >= 0:
            elem.tag = elem.tag[i+1:]
    objectify.deannotate(tree, cleanup_namespaces=True)
    etree.strip_attributes(tree, '{}id'.format('{' + self.xml + '}'))
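# The loop above (repeated in create_xml() further down) is the usual recipe
# for moving every element into the null namespace: an element's .tag is its
# Clark name ('{ns}local'), so slicing after '}' leaves the local name. A
# minimal self-contained sketch of the same idiom:
from lxml import etree

t = etree.fromstring('<a xmlns="urn:x"><b/></a>')
for elem in t.iter():
    if isinstance(elem.tag, str) and '}' in elem.tag:
        elem.tag = elem.tag.split('}', 1)[1]
etree.cleanup_namespaces(t)  # drop the now-unused xmlns declaration
print(etree.tostring(t).decode())  # <a><b/></a>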
def addpoint(Loc_data, Route_data):
    new_element = objectify.Element('CoursePoint')
    new_element.Name = Route_data[0][0:10]
    new_element.Time = Loc_data[0]
    new_element.Position = ''
    new_element.Position.LatitudeDegrees = Loc_data[1]
    new_element.Position.LongitudeDegrees = Loc_data[2]
    new_element.PointType = Route_data[1]
    new_element.Notes = Route_data[3]
    etree.strip_attributes(new_element,
                           '{http://codespeak.net/lxml/objectify/pytype}pytype')
    etree.strip_attributes(new_element,
                           '{http://www.w3.org/2001/XMLSchema-instance}nil')
    etree.cleanup_namespaces(new_element)
    return new_element
def html_before_write(self, book, chapter):
    from lxml import etree
    try:
        from urlparse import urlparse, urljoin
    except ImportError:
        from urllib.parse import urlparse, urljoin

    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()
    if len(root.find('body')) != 0:
        body = tree.find('body')
        # should also be aware to handle
        # ../chapter/
        # ../chapter/#reference
        # ../chapter#reference
        for _link in body.xpath('//a'):
            # This is just temporary for the footnotes
            if _link.get('href', '').find('InsertNoteID') != -1:
                _ln = _link.get('href', '')
                i = _ln.find('#')
                _link.set('href', _ln[i:])
                continue
            _u = urlparse(_link.get('href', ''))
            # Let us care only for internal links at the moment
            if _u.scheme == '':
                if _u.path != '':
                    _link.set('href', '%s.xhtml' % _u.path)
                if _u.fragment != '':
                    _link.set('href',
                              urljoin(_link.get('href'), '#%s' % _u.fragment))
                if _link.get('name') != None:
                    _link.set('id', _link.get('name'))
                    etree.strip_attributes(_link, 'name')
    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
def from_xml(self, content):
    tree = etree.fromstring(content)
    NSMAP = ['codeList', 'codeListValue', 'schemaLocation']
    etree.strip_attributes(tree, *NSMAP)
    root = tree
    etree.cleanup_namespaces(root)
    namespaces = {
        'gvq': '',
        'gco': '',
        'gmd19157': '',
        'updated19115': '',
        'gmd': '',
        'gml': '',
        'xsi': '',
        'xmlns': '',
    }
    # Some fields are xml text fields. The fields must be unique inside the
    # model. Store those elements temporarily.
    listOfTextFields = getXMLTextFields()
    fieldDictionary = dict()
    namespaceDict, originalNamespace = getNamespaceInfo()
    for field in listOfTextFields:
        foundElements = root.findall(".//" + namespaceDict[field] + field,
                                     namespaces=originalNamespace)
        valueList = list()
        for element in foundElements:
            valueList.append(''.join(
                [etree.tostring(child) for child in element]))
        fieldDictionary[field] = valueList
    xmlcontent = etree.tostring(root)
    jsondata = xmltodict.parse(xmlcontent, namespaces=namespaces)
    # Replace all textFields with the xml text.
    for field in listOfTextFields:
        i = 0
        replaceDictValueWithStringElement(jsondata, field, i, fieldDictionary)
    if 'GVQ_FeedbackCollection' in jsondata:
        item = jsondata['GVQ_FeedbackCollection']['item']
    elif 'item' in jsondata:
        item = jsondata['item']
    else:
        item = jsondata
    self.from_json(json.dumps(item))
    return item
def to_node(self, serialized=False):
    node = etree.Element('morphSet', pos=self.baseclass)
    for unit in self.morphunits:
        child = unit.to_node()
        node.append(child)
    etree.strip_attributes(node, 'variantType')
    node.set('variantType', self.variant_type)
    node.set('score', str(self.score))
    node.set('sort', self.lexical_sort())
    if serialized:
        return etree.tostring(node)
    else:
        return node
def clean_html(self, html):
    if not isinstance(html, str):
        raise ValueError('We only support cleaning unicode HTML fragments')
    # We wrap the content up in an extra div tag (otherwise lxml does weird
    # things to it - like adding in <p> tags and stuff)
    divnode = fromstring('<div>' + str(html) + '</div>')
    self(divnode)
    # Strip all class attributes
    etree.strip_attributes(divnode, 'class')
    # Drop all xml:lang and lang attributes, and handle the
    # stripping of any bad css styles
    # Also drop id and class attributes - these are not useful in RichTextEditor
    for node in divnode.xpath("//*"):
        for key, value in node.attrib.items():
            if key.lower() in ('xml:lang', 'lang', 'id', 'class'):
                node.attrib.pop(key, None)
            elif 'style' == key.lower():
                try:
                    cssStyle = cssutils.parseStyle(value)
                except Exception as e:
                    logging.info(
                        "Style %s failed to parse with error %s." % (value, e))
                    node.attrib.pop(key, None)
                    continue
                # Set the line separator so that the style gets serialized
                cssutils.ser.prefs.lineSeparator = ''
                # Only allow valid style properties
                cssutils.ser.prefs.validOnly = True
                new_style = cssStyle.cssText
                if not new_style.strip():
                    node.attrib.pop(key, None)
                else:
                    node.attrib[key] = new_style
        # Drop all empty span tags
        if node.tag == 'span' and not list(node.keys()):
            node.drop_tag()
    # Now unwrap the divnode (i.e. just serialize the children of our extra
    # div node)
    cleaned = saxutils.escape(divnode.text) if divnode.text else ''
    for n in divnode:
        cleaned += tostring(n, encoding='unicode', method='xml')
    return cleaned
def create_xml(self, tei, revision_id):
    """Create a simplified XML file only containing the text to be
    processed with WebLicht."""
    for elem in tei.getiterator():
        if not hasattr(elem.tag, 'find'):
            continue  # (1)
        i = elem.tag.find('}')
        if i >= 0:
            elem.tag = elem.tag[i+1:]
    objectify.deannotate(tei, cleanup_namespaces=True)
    etree.strip_attributes(tei, '{}id'.format('{' + self.xml + '}'))
    content = tei.xpath('./text/body/div')[0]
    text = etree.Element('text', id='wiki_' + revision_id)
    text.append(content)
    outpath = os.path.join(self.xmldir, 'wiki_' + revision_id + '.xml')
    tree = etree.ElementTree(text)
    tree.write(outpath, encoding='utf-8', pretty_print=True,
               xml_declaration=True)
def query_xpath_x(cnt_str, cc_xpath, removeTags=None, removeAtts=None,
                  fixNode=' ', rstmode='html'):
    rs, msg = query_xpath(cnt_str, cc_xpath, fixNode)
    if msg or len(rs) == 0:
        return rs, msg
    for i in range(len(rs)):
        if isinstance(rs[i], etree._Element):
            if removeTags:
                etree.strip_tags(rs[i], removeTags)
            if removeAtts:
                etree.strip_attributes(rs[i], removeAtts)
            rs[i] = etree.tostring(rs[i], encoding='unicode', method=rstmode)
    return rs, msg
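# query_xpath_x() combines two of lxml's three stripping helpers, which are
# easy to confuse: strip_attributes() removes attributes only, strip_tags()
# removes the tags themselves but splices their text and children into the
# parent, and strip_elements() deletes an element with its entire subtree.
# A minimal sketch contrasting all three:
from lxml import etree

p = etree.fromstring('<p id="a">Hello <b>bold</b> <script>x()</script>world</p>')
etree.strip_attributes(p, 'id')                     # drops id="a"
etree.strip_tags(p, 'b')                            # keeps the text "bold"
etree.strip_elements(p, 'script', with_tail=False)  # drops subtree, keeps tail
print(etree.tostring(p).decode())  # <p>Hello bold world</p>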
def protect(self):
    """
    Find all elements with a 'Protected=False' attribute and replace the
    text with a protected value in the XML element tree.

    If there was a 'ProtectedValue' attribute, it is deleted and the
    'Protected' attribute is set to 'True'. The 'ProtectPassword' element
    in the 'Meta' section is also set to 'True'.

    This does not just restore the previous protected value, but
    reencrypts all text values of elements with 'Protected=False'. So you
    could use this after modifying a password, adding a completely new
    entry or deleting entry history items.
    """
    self._reset_salsa()
    self.obj_root.Meta.MemoryProtection.ProtectPassword._setText('True')
    for elem in self.obj_root.iterfind('.//Value[@Protected="False"]'):
        etree.strip_attributes(elem, 'ProtectedValue')
        elem.set('Protected', 'True')
        elem._setText(self._protect(elem.text))
def processReuters(html):
    # initialize return
    article_info = dict()
    article_info['title'] = html.find('.//meta[@property=\'og:title\']').get('content')
    text_elem = html.find('.//span[@id=\'articleText\']')
    try:
        article_info['author'] = etree.tostring(
            text_elem.find('.//p[@class=\'byline\']')).replace('\n', '')
    except:
        article_info['author'] = "By Unknown"
    etree.strip_elements(text_elem, 'div')
    etree.strip_tags(text_elem, 'span')
    for elem in text_elem.findall('.//p'):
        etree.strip_attributes(elem)
    article_info['story'] = ''.join(
        [etree.tostring(child) for child in text_elem.iterchildren()]).replace('\n', '')
    return article_info
def htm2x(f, type, lang):
    # delete cell spanning because pandoc does not support it
    tree = html.parse(f, parser=html.HTMLParser(encoding='utf-8', compact=True))
    for table in tree.xpath(".//table"):
        firstrow = table.xpath(".//tr")[0]
        for cell in firstrow.getchildren():
            try:
                for i in range(1, int(cell.attrib["colspan"])):
                    firstrow.append(etree.XML("<td></td>"))
            except KeyError:
                pass
    etree.strip_attributes(tree, "class", "style", "colspan", "rowspan")
    with open(f, 'wb') as file:
        file.write(html.tostring(tree, pretty_print=True, encoding='utf-8'))
    subprocess.run([
        cfg['pandoc']['location'],
        '-o', 'tmp/print.' + type,
        f,
        '--reference-doc=./static/mat/templates/' + lang + '_msword.' + type
    ])
def clean_html(self, html):
    if not isinstance(html, unicode):
        raise ValueError('We only support cleaning unicode HTML fragments')
    # We wrap the content up in an extra div tag (otherwise lxml does weird
    # things to it - like adding in <p> tags and stuff)
    divnode = fromstring(u'<div>' + html + u'</div>')
    self(divnode)
    # Strip all class attributes
    etree.strip_attributes(divnode, 'class')
    # Drop all xml:lang and lang attributes, and handle the
    # stripping of any bad css styles
    # Also drop id and class attributes - these are not useful in RichTextEditor
    for node in divnode.xpath("//*"):
        for key, value in node.attrib.iteritems():
            if key.lower() in ('xml:lang', 'lang', 'id', 'class'):
                node.attrib.pop(key, None)
            elif 'style' == key.lower():
                try:
                    cssStyle = cssutils.parseStyle(value)
                except Exception, e:
                    logging.info("Style %s failed to parse with error %s." % (value, e))
                    node.attrib.pop(key, None)
                    continue
                # Set the line separator so that the style gets serialized
                cssutils.ser.prefs.lineSeparator = ''
                # Only allow valid style properties
                cssutils.ser.prefs.validOnly = True
                new_style = cssStyle.cssText
                if not new_style.strip():
                    node.attrib.pop(key, None)
                else:
                    node.attrib[key] = new_style
        # Drop all empty span tags
        if node.tag == 'span' and not node.keys():
            node.drop_tag()
def xml_parser(xml_file):
    """Parse the XML file."""
    with open(xml_file, 'r') as file_object:
        xml_content = file_object.read().encode('utf-8')
    doc = etree.XML(xml_content)
    # The whole call (mixed channel)
    global total_duration
    total_duration = doc.xpath(r'//instance/@duration')[0]
    total_call = doc.xpath(r'//channel[@no="mix"]//item')
    input_dict, output_dict = {}, {}
    for tc in total_call:
        if tc.xpath(r'@duration'):
            # items with a duration attribute are silent segments
            etree.strip_attributes(tc, 'duration')
            tc.set('energy', '')
            tc.set('speed', '')
        get_attr = tc.xpath('attribute::*')  # all attribute values of the tag
        template = ('start', 'end', 'energy', 'speed')
        tmp_dict = dict(zip(template, get_attr))
        input_dict[int(tmp_dict['start'])] = tmp_dict  # key by start time
    # sort segments by start time
    sorted_list = [tl[1] for tl in
                   sorted(input_dict.items(), key=lambda item: item[0])]
    for call_num in range(len(sorted_list)):
        output_dict[call_num + 1] = sorted_list[call_num]
    # agent channel
    agent_time = doc.xpath(r'//channel[@no="n0"]//time/text()')[0].split(' ') \
        if doc.xpath(r'//channel[@no="n0"]//time/text()') else []
    agent_text = doc.xpath(r'//channel[@no="n0"]//text/text()')[0].split(' ') \
        if doc.xpath(r'//channel[@no="n0"]//text/text()') else []
    agent_call = call_generator(agent_time, agent_text, '0')
    # customer channel
    cust_time = doc.xpath(r'//channel[@no="n1"]//time/text()')[0].split(' ') \
        if doc.xpath(r'//channel[@no="n1"]//time/text()') else []
    cust_text = doc.xpath(r'//channel[@no="n1"]//text/text()')[0].split(' ') \
        if doc.xpath(r'//channel[@no="n1"]//text/text()') else []
    cust_call = call_generator(cust_time, cust_text, '1')
    # merge agent and customer utterances
    agent_call.update(cust_call)
    call_content = {key: value for key, value in
                    sorted(agent_call.items(), key=lambda item: item[0])}
    result = call_match(output_dict, call_content)
    return result
def parse_content(href):
    """
    open file and replace ../img with img and src to data_src for iframes
    """
    myparser = etree.HTMLParser(encoding="utf-8")
    with open(href, 'r') as file:
        htmltext = file.read()
    tree = etree.HTML(htmltext, parser=myparser)  # = html.fromstring(filein)
    # removing "Retour au cours" links
    try:
        links = tree.xpath('//a[contains(@href, "COURSEVIEWBYID")]')
        print(" ----- found links %s" % str(links))
        for l in links:
            l.getparent().remove(l)
    except:
        pass
    # Adapt img links to direct path to img instead of ../img
    try:
        imgs = tree.xpath('//img')  # we get a list of elements
        for img in imgs:
            new_src = img.get('src').replace('../img', 'img')
            img.set('src', new_src)
    except Exception as e:
        pass
    # For all iframes, rename 'src' attribute to 'data-src'
    try:
        iframes = tree.xpath('//iframe')
        for iframe in iframes:
            iframe.attrib['data-src'] = iframe.attrib['src']
            etree.strip_attributes(iframe, 'src')
    except Exception as e:
        pass
    return html.tostring(tree, encoding='utf-8').decode('utf-8')
def htm2x(f, type, lang, location):
    # delete cell spanning because pandoc does not support it
    tree = html.parse(f, parser=html.HTMLParser(encoding='utf-8', compact=True))
    for table in tree.xpath(".//table"):
        if table.xpath(".//tr"):
            firstrow = table.xpath(".//tr")[0]
            for cell in firstrow.getchildren():
                try:
                    for i in range(1, int(cell.attrib["colspan"])):
                        firstrow.append(etree.XML("<td></td>"))
                except KeyError:
                    pass
    etree.strip_attributes(tree, "class", "style", "colspan", "rowspan")
    with open(f, 'wb') as file:
        file.write(html.tostring(tree, pretty_print=True, encoding='utf-8'))
    pypandoc.convert_file(
        source_file="tmp/print.html",
        to='docx',
        outputfile='tmp/print.docx',
        extra_args=["--reference-doc",
                    "static/mat/templates/" + lang + "_msword.docx"])
def bloombergStoryParser(url):
    from lxml import etree
    parsed_data = dict()
    # webpage = open('bloomberg/sample_article1.html').read()
    webpage = requestHTML(url)
    html = etree.HTML(webpage)
    # header - title/author/date
    header = html.find('.//div[@id=\'story_head\']')
    header_title = header.find('.//div[@id=\'disqus_title\']/h1').text
    header_byline = header.find('.//cite[@class=\'byline\']').text
    header_date = header.find('.//span[@class=\'datestamp\']/noscript').text
    parsed_data['header_title'] = header_title
    parsed_data['header_author'] = header_byline
    parsed_data['header_date'] = header_date
    # story - content
    content = html.find('.//div[@id=\'story_content\']')
    etree.strip_elements(content, 'div', 'script', 'ul')
    etree.strip_attributes(content)
    for h2_tag in content.findall('.//h2'):
        h2_tag.tag = 'h3'
    content_story = etree.tostring(content)
    parsed_data['story_content'] = content_story
    # parsed_data['story_content'] = '~~~Story Content~~~'
    return parsed_data
def fix_scripts(self):
    for script in self.element.xpath('.//Script'):
        script.text = etree.CDATA(script.text)
    for text in self.element.xpath('.//Text'):
        content = None
        if text.get('Content') not in ['', None]:
            content = text.get('Content')
        else:
            if str(text.text).isspace():
                content = ''
            else:
                content = text.text
        text.text = etree.CDATA(content)
        etree.strip_attributes(text, 'Content')
    for text_box in self.element.xpath('.//TextBox'):
        if text_box.get('Content') not in ['', None]:
            content = text_box.get('Content')
        else:
            if str(text_box.text).isspace():
                content = ''
            else:
                content = text_box.text
        text_box.text = etree.CDATA(content)
        etree.strip_attributes(text_box, 'Content')
def strip_attributes(self, *attribute_names):
    etree.strip_attributes(self.root, *attribute_names)
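# Because the wrapper above forwards *attribute_names untouched, it keeps
# the variadic calling convention of etree.strip_attributes(). A
# hypothetical usage sketch (the Doc class here is invented to mirror the
# wrapper, not taken from the original project):
from lxml import etree

class Doc:
    def __init__(self, xml):
        self.root = etree.fromstring(xml)

    def strip_attributes(self, *attribute_names):
        etree.strip_attributes(self.root, *attribute_names)

doc = Doc('<a x="1"><b y="2" z="3"/></a>')
doc.strip_attributes('y', 'z')  # applied to the whole subtree under root
print(etree.tostring(doc.root).decode())  # <a x="1"><b/></a>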
def convert_to_xhtml(filepath, dirpath):
    #
    # By default the etree parser will open the file
    # in binary mode, leaving the windows-style \r\n linefeeds
    # intact. The end result is &#13; turds all over the XHTML.
    #
    if filepath.startswith("http://"):
        f = urllib.urlopen(filepath)
    else:
        f = open(filepath, "rb")
    try:
        text = f.read()
    finally:
        f.close()
    text = text.replace("\r\n", "\n")
    html = etree.parse(StringIO.StringIO(text), etree.HTMLParser())

    #
    # Strip tags which won't be needed
    #
    for br in html.iter("br"):
        previous = br.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or "") + " "
        else:
            br.tail = " " + (br.tail or "")

    #
    # A little bit dangerous but... the header block on papalencyclicals.net
    # uses divs whose generated ids all start with "DMSMenu"
    #
    for div in list(html.iter("div")):
        id = div.get("id")
        if id and id.startswith("DMSMenu"):
            div.getparent().remove(div)

    etree.strip_tags(html, "span", "script", "center", "style", "br", "a",
                     "table", "img", "tr", "td", "font", "div")

    #
    # Remove tag-level styles and alignments
    #
    etree.strip_attributes(html, "class", "style", "align")

    #
    # Remove paragraphs which are completely empty
    #
    for p in html.iter("p"):
        if not list(p) and not (p.text or "").strip() and not (p.tail or "").strip():
            p.getparent().remove(p)

    html_body = html.find("body")
    html_body.attrib.clear()
    xml = etree.XML(XHTML_DOCUMENT)
    xml_body = xml.find("{%s}body" % xml.nsmap[None])
    xml_body.getparent().replace(xml_body, html_body)

    base, ext = os.path.splitext(os.path.basename(filepath))
    output_filepath = os.path.join(dirpath, "%s.xhtml" % base)
    with open(output_filepath, "wb") as f:
        f.write(etree.tostring(xml, pretty_print=True, xml_declaration=True,
                               encoding="utf-8"))
def parse_old_conversations(mbox_file, xml_dir, addresses):
    # Until about 2013-05-01 Google used XMPP-like XML for chat
    print('Parsing old-style XML chats... ', file=sys.stdout)
    sys.stdout.flush()
    mbox = mailbox.mbox(mbox_file)
    num_messages = 0
    num_malformed = 0  # no XML
    num_groupchats = 0
    for message in mbox:
        num_messages += 1
        thread_id = message['X-GM-THRID']
        payload = message.get_payload(i=0)
        transfer_encoding = payload['Content-Transfer-Encoding']
        content_type = payload.get_content_type()
        if content_type != 'text/xml':
            # These are either empty messages or email messages that Google
            # confused while attempting to blur the distinction between chat
            # and email. In the latter case, we only care about chat messages
            # here, but the message should be correctly parsed as email
            num_malformed += 1
        else:
            # Message contains good XML, clean it up
            payload_cleaned = clean_xml_payload(payload)
            tree = etree.fromstring(payload_cleaned)
            if tree.xpath('//cli:message[@type="groupchat"]',
                          namespaces={'cli': 'jabber:client'}):
                # Skip group chats
                num_groupchats += 1
                continue
            # Python's ElementTree fails miserably here, use lxml
            xpaths = []
            # Google elements
            signature_xpath = tree.xpath('//met:google-mail-signature',
                                         namespaces={'met': 'google:metadata'})
            xpaths.append(signature_xpath)
            delay_xpath = tree.xpath('//jxd:x',
                                     namespaces={'jxd': 'jabber:x:delay'})
            xpaths.append(delay_xpath)
            nosave_xpath = tree.xpath('//nos:x',
                                      namespaces={'nos': 'google:nosave'})
            xpaths.append(nosave_xpath)
            record_xpath = tree.xpath(
                '//arc:record',
                namespaces={'arc': 'http://jabber.org/protocol/archive'})
            xpaths.append(record_xpath)
            xhtml_xpath = tree.xpath(
                '//xht:html',
                namespaces={'xht': 'http://www.w3.org/1999/xhtml'})
            xpaths.append(xhtml_xpath)
            xhtmlim_xpath = tree.xpath(
                '//xim:html',
                namespaces={'xim': 'http://jabber.org/protocol/xhtml-im'})
            xpaths.append(xhtmlim_xpath)
            gap_xpath = tree.xpath(
                '//con:gap',
                namespaces={'con': 'google:archive:conversation'})
            xpaths.append(gap_xpath)
            # Jabber "composing" element (someone was typing)
            composing_xpath = tree.xpath('//eve:x',
                                         namespaces={'eve': 'jabber:x:event'})
            xpaths.append(composing_xpath)
            # Remove junk elements (there are a lot of them)
            for x in xpaths:
                for dud in x:
                    dud.getparent().remove(dud)
            # Strip Google and experimental(?) AIM attributes
            etree.strip_attributes(
                tree,
                'iconset',
                '{google:internal}cid',
                '{google:internal}sequence-no',
                '{google:internal}time-stamp',
                '{google:internal}interop-stanza',
                '{google:internal}dual-delivery',
                '{google:internal}interop-disable-legacy-archiver',
                '{google:aim}new-session',
            )
            # Clean up namespaces
            remove_namespace(tree, '{jabber:client}')
            remove_namespace(tree, '{google:archive:conversation}')
            remove_namespace(tree, '{google:timestamp}')
            etree.cleanup_namespaces(tree)
            # Remove /resource from message 'from' and 'to' attributes
            for m in tree.xpath('//message'):
                from_field = m.attrib['from'].split('/')[0]
                to_field = m.attrib['to'].split('/')[0]
                m.attrib['from'] = from_field
                m.attrib['to'] = to_field
                # Record addresses for name map
                addresses[from_field] += 1
                addresses[to_field] += 1
            # Write out data
            f = open('{0}/{1}.conv'.format(xml_dir, thread_id), 'a')  # append
            # Format manually b/c lxml's pretty print makes a TON of mistakes
            prev_m_as_string = ''
            for m in tree.xpath('//message'):
                m_as_string = etree.tostring(m)
                # Gotcha: When xpath() is used on an Element, if the XPath
                # expression is relative it's evaluated against the element.
                # If the expression is absolute it's evaluated against the tree
                to_field = m.attrib['to']
                from_field = m.attrib['from']
                body = m.xpath('./body')
                # In the case of sequential messages with identical timestamps,
                # we have to rely on line order in the mbox to order messages
                time_ms = m.xpath('./time')[0].attrib['ms']
                if m_as_string != prev_m_as_string and len(body) != 0:
                    # Don't print duplicate messages (sometimes the entire
                    # message including timestamp is repeated), don't print
                    # empty messages
                    print(' <message to="{0}" from="{1}">'.format(to_field, from_field), file=f)
                    print(' <body>{0}</body>'.format(html.escape(body[0].text)), file=f)
                    print(' <time ms="{0}"/>'.format(time_ms), file=f)
                    print(' </message>', file=f)
                prev_m_as_string = m_as_string
            f.close()
    num_parsed = num_messages - num_malformed - num_groupchats
    if num_malformed:
        print(' Malformed: {0}'.format(num_malformed), file=sys.stdout)
    if num_groupchats:
        print(' Group chats: {0} (unsupported)'.format(num_groupchats), file=sys.stdout)
    print(' Messages parsed: {0}'.format(num_parsed), file=sys.stdout)
    print('DONE', file=sys.stdout)
    return num_parsed
def strip_attributes(self, xml, path, attrs):
    for node in xml.xpath(path):
        for k in [i for i in node.keys() if i in attrs]:
            etree.strip_attributes(node, k)
    return xml
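# Caveat for the helper above: etree.strip_attributes() operates on the
# element *and its whole subtree*, so stripping per matched node also
# affects that node's descendants. For a strictly node-local removal you
# would pop from .attrib instead; a small sketch of the difference:
from lxml import etree

t = etree.fromstring('<a k="1"><b k="2"/></a>')
etree.strip_attributes(t, 'k')      # removes k from <a> and <b>

t2 = etree.fromstring('<a k="1"><b k="2"/></a>')
t2.attrib.pop('k', None)            # removes k from <a> only
print(etree.tostring(t2).decode())  # <a><b k="2"/></a>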
def sch2svg(sch_fp, svg_fp):
    """
    converts Kicad .sch (eeschema) to svg leveraging binaries from
    plotkicadsch; requires the .lib to be in the same folder

    Parameters:
    ----------
    sch_fp: str
        full path to the Kicad schematics file
    svg_fp: str
        full path to the svg output

    Return:
    -------
    svg_fp: str
        full path to the svg file created

    Usage:
    ------
    N/A
    """
    lib_fp = sch_fp.replace(".sch", "-cache.lib")
    # now test that svg is older than schematics:
    sch_is_younger = True
    if exists(svg_fp):
        if getmtime(sch_fp) < getmtime(svg_fp):
            sch_is_younger = False
    if sch_is_younger:
        logger.info(f"calling plotkicadsch on {sch_fp}")
        res = call([plotkicadsch_fp, "-f", sch_fp, "-l", lib_fp])
        try:
            assert res == 0
        except:
            logger.error(
                f"failed to convert sch {sch_fp} lib_fp: {lib_fp} to svg")
            raise
    if sch_is_younger:
        with open(sch_fp.replace(".sch", ".svg"), 'r') as fp:
            fxml = fp.read()
        mxml = etree.fromstring(fxml)
        # REMOVE the canvas lines
        for element in mxml.xpath(".//*[not(node())]"):
            if (element.tag.find("rect")) > 0:
                element.getparent().remove(element)
        # REMOVE the text from the canvas
        for element in mxml.xpath('//*[contains(text(),"Page:")]'):
            element.getparent().remove(element)
        # text in RED and GREEN changed back to black
        for el in mxml.xpath("//svg:text[@fill='#FF0000']",
                             namespaces={'svg': 'http://www.w3.org/2000/svg'}):
            el.attrib["fill"] = "#000000"
        for element in mxml.xpath('//svg:text[@fill="#00FF00"]',
                                  namespaces={'svg': 'http://www.w3.org/2000/svg'}):
            element.attrib["fill"] = "#000000"
        # polyline in dark red to black
        for element in mxml.xpath('//svg:polyline[@stroke="#800000"]',
                                  namespaces={'svg': 'http://www.w3.org/2000/svg'}):
            element.attrib["stroke"] = "#000000"
        x_min, x_max, y_min, y_max = get_svg_viewbox(mxml)
        for el in mxml.xpath("//*[@viewBox]"):
            width = (x_max - x_min) * 1.2
            height = (y_max - y_min) * 1.2
            el.attrib['viewBox'] = "%s %s %s %s" % (
                x_min - width * 0.1, y_min - height * 0.1, width, height)
            etree.strip_attributes(el, 'width')
            etree.strip_attributes(el, 'height')
        cleaned = etree.tostring(mxml, pretty_print=True, encoding="unicode")
        with open(svg_fp, 'w') as fp:
            fp.write(cleaned)
    return svg_fp
def remove_attr(self, attribute: str):
    """
    Remove an attribute from this node.
    """
    etree.strip_attributes(self.lxml_element, attribute)
def __call__(self, doc):
    """
    Cleans the document.
    """
    if hasattr(doc, "getroot"):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter("image"):
        el.tag = "img"
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add("script")
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith("on"):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link, resolve_base_href=False)
        # If we're deleting style then we don't have to remove JS links
        # from styles, otherwise...
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get("style")
                new = _css_javascript_re.sub("", old)
                new = _css_import_re.sub("", new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib["style"]
                elif new != old:
                    el.set("style", new)
        if not self.style:
            for el in list(doc.iter("style")):
                if el.get("type", "").lower().strip() == "text/javascript":
                    el.drop_tree()
                    continue
                old = el.text or ""
                new = _css_javascript_re.sub("", old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub("", old)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = "/* deleted */"
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either? I feel like there's some obscure reason
        # because you can put PIs in comments...? But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add("style")
    if self.inline_style:
        etree.strip_attributes(doc, "style")
    if self.links:
        kill_tags.add("link")
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter("link")):
            if "stylesheet" in el.get("rel", "").lower():
                # Note this kills alternate stylesheets as well
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add("meta")
    if self.page_structure:
        remove_tags.update(("head", "html", "title"))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter("param")):
            found_parent = False
            parent = el.getparent()
            while parent is not None and parent.tag not in ("applet", "object"):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(("applet",))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(("iframe", "embed", "layer", "object", "param"))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add("form")
        kill_tags.update(("button", "input", "select", "textarea"))
    if self.annoying_tags:
        remove_tags.update(("blink", "marquee"))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do. Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = "div"
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do. Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != "html":
            el.tag = "div"
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags"
            )
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                el = bad.pop(0)
                el.tag = "div"
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get("rel")
                if rel:
                    if "nofollow" in rel and " nofollow " in (" %s " % rel):
                        continue
                    rel = "%s nofollow" % rel
                else:
                    rel = "nofollow"
                el.set("rel", rel)
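# Both versions of Cleaner.__call__ in this section (this one and the older
# variant further down) mutate the parsed document in place. A hedged usage
# sketch with lxml.html.clean (in recent lxml releases this module lives in
# the separate lxml_html_clean package; the options shown are standard
# Cleaner arguments):
from lxml.html import fromstring, tostring
from lxml.html.clean import Cleaner

cleaner = Cleaner(scripts=True, javascript=True, style=True,
                  inline_style=True, add_nofollow=True)
doc = fromstring('<div onclick="evil()" style="color:red">hi '
                 '<a href="http://example.com/">out</a></div>')
cleaner(doc)  # invokes __call__ above; strips onclick/style, adds rel="nofollow"
print(tostring(doc).decode())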
def build(do_id, c_id):
    """Create a DLXS-appropriate xml file for the item by using data from
    the associated mods, mets, and ocr files. After a collection-generic
    XML is created, it's passed to the handle_collection_extras method for
    any additional collection-specific transformation. Finally, the XML is
    written to a directory for that collection."""
    item = drlutils.django.utils.get_item(do_id)
    mods_file = drlutils.django.utils.get_mods_path(item)
    if not mods_file:
        return do_id + ' - no mods file found'
    mets_file = drlutils.django.utils.get_mets_path(item)
    if not mets_file:
        return do_id + ' - no mets file found'
    try:
        # mods is transformed into the header
        mods = etree.parse(open(mods_file, 'r'))
        mods_stylesheet = etree.parse(open(drlutils.config.MODS2DLXS_XSL, 'r'))
        transform_mods = etree.XSLT(mods_stylesheet)
        dlxs = transform_mods(mods)
        # mets is transformed into the body
        mets = etree.parse(open(mets_file, 'r'))
        mets_stylesheet = etree.parse(open(drlutils.config.METS2DLXS_XSL, 'r'))
        transform_mets = etree.XSLT(mets_stylesheet)
        body = transform_mets(mets)
        # the body is appended to the header
        dlxs.getroot().append(body.getroot())
        if has_ocr(do_id):
            try:
                ocr_file = drlutils.django.utils.get_ocr_zip_path(item)
            except:
                return do_id + ' - no ocr file found'
            # the ocr is unzipped and added to the pages
            ocr_zip = zipfile.ZipFile(ocr_file, 'r')
            for ocr in ocr_zip.namelist():
                oimage = re.sub('.txt', '.tif', os.path.basename(ocr))
                xpath_for_p_element = '//P[@REF=\'' + oimage + '\']'
                page_content = " ".join(ocr_zip.open(ocr).readlines())
                filtered_text = ' '
                if page_content:
                    filtered_text = filter_ocr(page_content) or ' '
                try:
                    p = dlxs.xpath(xpath_for_p_element)[0]
                    p.text = filtered_text
                except Exception as e:
                    print 'OCR exception: %s - %s - %s' % (str(e), do_id, xpath_for_p_element)
            # remove the REF attribute of the P elements, which was our hook for the OCR
            for p_tag in dlxs.xpath('//P'):
                etree.strip_attributes(p_tag, 'REF')
        # add digital object identifier
        #idno = dlxs.xpath('//IDNO[@TYPE=\'uls-drl\']')[0]
        idno = dlxs.xpath('//IDNO')[0]
        idno.text = item.do_id
        # add digital publication date
        date = dlxs.xpath('/DLPSTEXTCLASS/HEADER/FILEDESC/PUBLICATIONSTMT/DATE')[0]
        try:
            pub = item.online_pub_date.year
            date.text = str(pub)
        except:
            new_pub_date = str(datetime.date.today().year)
            date.text = new_pub_date
        # add availability statement
        availability = dlxs.xpath('/DLPSTEXTCLASS/HEADER/FILEDESC/PUBLICATIONSTMT/AVAILABILITY/P')[0]
        availability.text = get_availability_statement(item)
        # add extent (number) of digital files
        pages = dlxs.xpath('//PB')
        page_count = str(len(pages))
        extent = dlxs.xpath('/DLPSTEXTCLASS/HEADER/FILEDESC/EXTENT')[0]
        extent.text = page_count + ' digitized page images'
        title_main = dlxs.xpath('//TITLE[@TYPE=\'245\']')[0]
        div1_head = dlxs.xpath('//DIV1/HEAD')[0]
        div1_head.text = title_main.text
        # add tei encoding level
        divs = dlxs.xpath('//DIV2')
        if len(divs) > 1:
            level = 2
        else:
            level = 1
        encodingdecl = dlxs.xpath('//EDITORIALDECL')[0]
        encodingdecl.set("N", str(level))
        # handle collection-specific extra processing
        dlxs = handle_collection_extras(item.do_id, c_id, dlxs)
        outdir = os.path.join(drlutils.config.DLXS_XML_PATH, c_id)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        outfile = item.do_id + '.xml'
        outpath = os.path.join(outdir, outfile)
        etree.ElementTree(dlxs.getroot()).write(outpath, pretty_print=True)
        return None
    except Exception as e:
        return 'problem with %s: %s, %s' % (do_id, str(e),
                                            traceback.print_tb(sys.exc_info()[2]))
def make_gpx_private(self, gpx_file=None):
    '''
    wipes out private data from gpx files
    converts laps to waypoints
    '''
    logging.debug(">>")
    if gpx_file is None:
        return None
    filen = os.path.basename(gpx_file)
    tmpdir = self.pytrainer_main.profile.tmpdir
    anon_gpx_file = "%s/%s" % (tmpdir, filen)
    # get saved private area polygon
    pP = loads(self.options['privPolygon'])
    pP = pP['geometry']['coordinates'][0]
    # converts polygon's 2D matrix into a vector of just the lats or lons
    vector = lambda lonLat: [pP[i][lonLat] for i in range(len(pP))]  # 0:lon, 1:lat
    # try reading private area's bounds, stored as [lon,lat]
    NE_LAT = max([pP[i][1] for i in range(len(pP))])
    NE_LON = max([pP[i][0] for i in range(len(pP))])
    SW_LAT = min([pP[i][1] for i in range(len(pP))])
    SW_LON = min([pP[i][0] for i in range(len(pP))])
    logging.info("Anonymizing Area: NE:%f,%f -> SW: %f,%f" %
                 (NE_LON, NE_LAT, SW_LON, SW_LAT))
    # Config parameters, not used yet
    FILTER_BOX = True
    ERASE_TIME = True
    LAP_TO_WAYPOINT = True
    tree = etree.parse(gpx_file)
    _xmlns = tree.getroot().nsmap[None]
    _trkpt_path = '{%s}trk/{%s}trkseg/{%s}trkpt' % (_xmlns, _xmlns, _xmlns)
    # namespace of gpx files
    NS = dict(ns='http://www.topografix.com/GPX/1/1')
    myroot = tree.getroot()
    gpxdataNS = string.Template(
        ".//{http://www.cluetrust.com/XML/GPXDATA/1/0}$tag")
    lapTag = gpxdataNS.substitute(tag="lap")
    endPointTag = gpxdataNS.substitute(tag="endPoint")
    triggerTag = gpxdataNS.substitute(tag="trigger")
    laps = tree.findall(lapTag)
    mygpx = tree.find('gpx')
    for lap in laps:
        trigger = lap.find(triggerTag)
        # Watch out for manually triggered laps
        if trigger.text == 'manual':
            endPoint = lap.find(endPointTag)
            lat = endPoint.get("lat")
            lon = endPoint.get("lon")
            # Create waypt if not in home box
            try:
                if not ((SW_LAT < float(lat) < NE_LAT) and
                        (SW_LON < float(lon) < NE_LON)):
                    etree.SubElement(myroot, 'wpt', attrib={'lat': lat, 'lon': lon})
            except:
                pass
    etree.strip_attributes(myroot, 'creator')
    # Wipe out home box
    for trkpt in tree.findall(_trkpt_path):
        lat = float(trkpt.attrib['lat'])
        lon = float(trkpt.attrib['lon'])
        if (lat < NE_LAT) & (lon < NE_LON) & (lat > SW_LAT) & (lon > SW_LON):
            par = trkpt.getparent()
            par.remove(trkpt)
    time = tree.xpath('//ns:trkpt/ns:time', namespaces=NS)
    for i in time:
        i.text = '1970-01-01T00:00:00+00:00'  # osm regards <time> as mandatory. gnaa.
    ext = tree.xpath('//ns:gpx/ns:extensions', namespaces=NS)
    for i in ext:
        par = i.getparent()
        par.remove(i)
    meta = tree.xpath('//ns:gpx/ns:metadata', namespaces=NS)
    for i in meta:
        par = i.getparent()
        par.remove(i)
    ele = tree.xpath('//ns:trkpt/ns:ele', namespaces=NS)
    for i in ele:
        par = i.getparent()
        par.remove(i)
    # test schema on cleaned xml-tree
    # gpx.xsd from http://www.topografix.com/gpx.asp
    #xmlschema = etree.XMLSchema(etree.parse('gpx.xsd'))
    #xmlschema.validate(tree)
    # write new gpx file
    tree.write(anon_gpx_file, pretty_print=False, xml_declaration=True,
               encoding='UTF-8')
    logging.debug("<<")
    return anon_gpx_file
def main():
    input_dir = Path(os.getenv("INPUT_RAWPATH", "content/GENERATED/"))
    output_dir = Path(os.getenv("INPUT_OUTPUTPATH", "content/api/"))
    if not input_dir.exists():
        print("Exiting because there are no files to process...")
        print(f"{input_dir.resolve()} does not exist!")
        sys.exit(0)
    print(f"Processing content of {input_dir.resolve()}...")
    print(f"Outputting results to {output_dir.resolve()}...")
    for f in input_dir.glob("*_api.xml"):
        print(f"Processing {str(f)}...")
        tree = etree.parse(str(f),
                           parser=etree.XMLParser(recover=True, remove_comments=True))
        root = tree.getroot()
        remove_attrs = [
            "noemph",
            "{*}space",
            "add_permalink",
            "is_multiline",
            "noindex",
        ]
        etree.strip_attributes(root, *remove_attrs)
        body = root.find("section")
        contents = CodeFile(body)
        contents.parse()
        # frontmatter, parsed = parse_file(f)
        output_xml = input_dir / f"{f.stem}-processed.xml"
        with output_xml.open("w") as fp:
            doc = E.document()
            if contents.domain == CodeFile.DOMAIN_PY:
                doc.set("api-lang", "python")
                doc.set("title", "Python API Documentation")
            elif contents.domain == CodeFile.DOMAIN_CPP:
                doc.set("api-lang", "cpp")
                doc.set("title", "C++ API Documentation")
            elif contents.domain == CodeFile.DOMAIN_C:
                doc.set("api-lang", "c")
                doc.set("title", "C API Documentation")
            title_element = E.document_title(doc.get("title"))
            doc.append(title_element)
            children = root.find("./section").getchildren()
            doc.extend(children)
            etree.indent(doc, space=" ", level=1)
            doc_text = etree.tostring(doc, encoding="unicode")
            fp.write(doc_text)
            fp.write("\n")
    print()
    htmlify(input_dir, output_dir)
    sys.exit(0)
ID = origUrl.text.replace(baseUrl, "")
slug.text = ID
items.append(slug)

# --- Do the processing on original xml ---
for items in root.findall(nodePath, di.nsDict):
    # --- Convert all the fields
    for keys in di.qdcFieldsDict:
        nestedFormat(items, keys)

# --- Strip out the nodes leftover from shuffling things around in the hierarchy ---
for leftovers in root.findall('item/metadata'):
    strip = leftovers.getparent()
    strip.remove(leftovers)

# --- Clean up extraneous attributes on nodes ---
etree.strip_attributes(root, "set", "batchsize", "id", "timestamp", "total")
etree.cleanup_namespaces(root)

# --- Create new xml file ---
filename = xmlFile
(prefix, sep, suffix) = filename.rpartition('.')
new_filename = prefix + '_batch.xml'
tree.write(new_filename, pretty_print=True, encoding="UTF-8", xml_declaration=True)
print('\n', 'Your new file,', new_filename, ', has been created.')
for h in hs:
    rawtext = lxml.html.tostring(h, method="text", encoding="utf8")
    if rawtext != None:
        raw_h = ET.Element("head")
        raw_h.text = h.text
        h.addnext(raw_h)
        h.getparent().remove(h)

for c in cs:
    if c.getparent() is not None:
        c.getparent().remove(c)

for note in notes:
    if note.getparent() is not None:
        note.getparent().remove(note)

for div in divs:
    ET.strip_attributes(div, 'rend')
    if 'type' in div.attrib:
        if div.attrib['type'] == 'section':
            div.attrib['type'] = 'chapter'
    if "n" in div.attrib:
        div.attrib['title'] = div.attrib['n']
        del div.attrib['n']

# Rebuild the header
fileDesc = ET.Element('fileDesc')
titleStmt = ET.Element('titleStmt')
title = ET.Element('title')
if len(tree.xpath('.//title/text()')) > 0:
    title.text = tree.xpath('.//title/text()')[0]
# drop titlemap
for t in root.findall('.//titlemap'):
    t.getparent().remove(t)

# drop audio
for a in root.findall('.//audio'):
    a.getparent().remove(a)

# drop subpicture attributes
for s in root.findall('.//subpicture'):
    present = s.get('present')
    et.strip_attributes(s, 'present')
    et.strip_attributes(s, 'id')
    if present != None and present == 'no':
        s.getparent().remove(s)

for s in root.findall('.//pgc'):
    et.strip_attributes(s, 'next')
    et.strip_attributes(s, 'prev')

# fix buttons
for b in root.findall('.//buttons'):
    vod = b.getparent()
    for button in reversed(b.getchildren()):
        vod.addnext(button)
    vod.remove(b)
def __call__(self, doc): """ Cleans the document. """ try: getroot = doc.getroot except AttributeError: pass # Element instance else: doc = getroot() # ElementTree instance, instead of an element # convert XHTML to HTML xhtml_to_html(doc) # Normalize a case that IE treats <image> like <img>, and that # can confuse either this step or later steps. for el in doc.iter('image'): el.tag = 'img' if not self.comments: # Of course, if we were going to kill comments anyway, we don't # need to worry about this self.kill_conditional_comments(doc) kill_tags = set(self.kill_tags or ()) remove_tags = set(self.remove_tags or ()) allow_tags = set(self.allow_tags or ()) if self.scripts: kill_tags.add('script') if self.safe_attrs_only: safe_attrs = set(self.safe_attrs) for el in doc.iter(etree.Element): attrib = el.attrib for aname in attrib.keys(): if aname not in safe_attrs: del attrib[aname] if self.javascript: if not (self.safe_attrs_only and self.safe_attrs == defs.safe_attrs): # safe_attrs handles events attributes itself for el in doc.iter(etree.Element): attrib = el.attrib for aname in attrib.keys(): if aname.startswith('on'): del attrib[aname] doc.rewrite_links(self._remove_javascript_link, resolve_base_href=False) # If we're deleting style then we don't have to remove JS links # from styles, otherwise... if not self.inline_style: for el in _find_styled_elements(doc): old = el.get('style') new = _css_javascript_re.sub('', old) new = _css_import_re.sub('', new) if self._has_sneaky_javascript(new): # Something tricky is going on... del el.attrib['style'] elif new != old: el.set('style', new) if not self.style: for el in list(doc.iter('style')): if el.get('type', '').lower().strip() == 'text/javascript': el.drop_tree() continue old = el.text or '' new = _css_javascript_re.sub('', old) # The imported CSS can do anything; we just can't allow: new = _css_import_re.sub('', old) if self._has_sneaky_javascript(new): # Something tricky is going on... el.text = '/* deleted */' elif new != old: el.text = new if self.comments: kill_tags.add(etree.Comment) if self.processing_instructions: kill_tags.add(etree.ProcessingInstruction) if self.style: kill_tags.add('style') if self.inline_style: etree.strip_attributes(doc, 'style') if self.links: kill_tags.add('link') elif self.style or self.javascript: # We must get rid of included stylesheets if Javascript is not # allowed, as you can put Javascript in them for el in list(doc.iter('link')): if 'stylesheet' in el.get('rel', '').lower(): # Note this kills alternate stylesheets as well if not self.allow_element(el): el.drop_tree() if self.meta: kill_tags.add('meta') if self.page_structure: remove_tags.update(('head', 'html', 'title')) if self.embedded: # FIXME: is <layer> really embedded? # We should get rid of any <param> tags not inside <applet>; # These are not really valid anyway. for el in list(doc.iter('param')): found_parent = False parent = el.getparent() while parent is not None and parent.tag not in ('applet', 'object'): parent = parent.getparent() if parent is None: el.drop_tree() kill_tags.update(('applet', )) # The alternate contents that are in an iframe are a good fallback: remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) if self.frames: # FIXME: ideally we should look at the frame links, but # generally frames don't mix properly with an HTML # fragment anyway. 
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        # make sure we do not remove comments/PIs if users want them (which is rare enough)
        if not self.comments:
            allow_tags.add(etree.Comment)
        if not self.processing_instructions:
            allow_tags.add(etree.ProcessingInstruction)

        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)
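This is the __call__ method of an lxml.html.clean.Cleaner-style class; a short usage sketch against the real lxml API (the sample HTML is invented, and note that in recent lxml releases the cleaner lives in the separate lxml_html_clean package):

from lxml.html.clean import Cleaner

# hedged example: the keyword options below are real Cleaner flags
cleaner = Cleaner(scripts=True, javascript=True, style=True,
                  add_nofollow=True, page_structure=False)
dirty = ('<html><head><style>@import "evil.css";</style></head>'
         '<body onload="boom()"><a href="http://example.com/">x</a></body></html>')
print(cleaner.clean_html(dirty))
# the <style> block and the onload handler are removed, and the
# external link gains rel="nofollow"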
def html(self, catid, url, htfile, md=None, sub=False, ins=False):
    htfile = lybtools.htfile_tounicode(htfile)
    body = etree.ElementTree(etree.HTML(htfile).xpath('//body')[0])
    sessdata = cherrypy.session
    strip_tags = ["script", "link"]
    etree.strip_elements(body, *strip_tags, with_tail=False)
    chg_tags = ["body", "a", "form", "input", "noscript"]
    etree.strip_tags(body, *chg_tags)
    etree.strip_tags(body, etree.Comment)
    #safe_tags = ['img']
    for elem in body.xpath('//*'):
        if elem.tag == 'body':
            elem.tag = 'old-body'
        attr = elem.attrib
        if elem.tag in chg_tags:
            etree.strip_attributes(elem, *attr)
        if "class" in attr:
            etree.strip_attributes(elem, "class")
        if "id" in attr:
            etree.strip_attributes(elem, "id")
        if "onclick" in attr:
            etree.strip_attributes(elem, "onclick")
        if "style" in attr:
            attr['style'] = re.sub(r'url\(.+\)', 'url()', attr['style'])
        #if elem.tag not in safe_tags and (elem.text is None or elem.text.strip() == '') and elem.getchildren() == []:
        #    elem.getparent().remove(elem)
        #    continue
        if "src" in attr:
            m = re.search(r'data:(\S+);base64,(.+)', attr['src'])
            if not m:
                srcurl = urljoin(url, attr['src'])
                srcobjquery = urlsplit(srcurl)[3]
                srcqdict = parse_qs(srcobjquery)
                if 'lybsrcobj' in list(srcqdict.keys()):
                    ohash = srcqdict['lybsrcobj'][0]
                    srcquerydata = {'lybsrcobj': ohash}
                    srcquery = urlencode(srcquerydata)
                    # page must be defined before use below
                    if ins:
                        page = '/getobj?'
                    else:
                        page = '/edit/tmpstore?'
                    # if ohash not in sessdata:
                    #     if md:
                    #         cat = Cat(int(catid))
                    #         doco = cat[int(md)]
                    #         sessdata[ohash] = doco[ohash]
                    elem.set('src', page + srcquery)
                    continue
                try:
                    srcu = URLOpener().open(srcurl)
                except:
                    continue
                if srcu.code >= 400:
                    continue
                srcdata = srcu.read()
                cont_type = srcu.headers['Content-Type']
                srcftype = cont_type and lybtools.ctype(srcu.headers['Content-Type']) or 'none'
            else:
                srcdata = decodestring(m.group(2).encode('utf-8'))
                srcftype = m.group(1)
            srchashname = hashlib.sha1(srcdata).hexdigest()
            if srcftype == 'text/html':
                if elem.tag == 'img':
                    continue
                srcdata = self.html(catid, srcu.url, srcdata, sub=True)
            if srchashname not in sessdata:
                sessdata[srchashname] = {'body': srcdata, 'type': srcftype}
            srcquerydata = {'lybsrcobj': srchashname}
            srcquery = urlencode(srcquerydata)
            if ins:
                page = '/getobj?'
            else:
                page = '/edit/tmpstore?'
            elem.set('src', page + srcquery)
    etree.strip_tags(body, 'old-body')
    ht_ml = etree.tounicode(body, method='html', pretty_print=True)
    if not sub and not ins:
        return self.ne(catid, url=url, html=ht_ml, md=md)
    else:
        return ht_ml
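The method above leans on the distinction between etree.strip_elements (drops an element and everything inside it) and etree.strip_tags (drops only the tag itself, splicing its text and children into the parent). A minimal sketch of the difference:

from lxml import etree

root = etree.fromstring('<div><script>boom()</script><a href="#">keep <b>me</b></a></div>')
etree.strip_elements(root, 'script', with_tail=False)  # removes <script> and its content
etree.strip_tags(root, 'a')  # unwraps <a>: its text and children stay behind
print(etree.tostring(root))  # b'<div>keep <b>me</b></div>'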
def normalizer(e):
    etree.strip_attributes(e, 'id')
    return e
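A normalizer like this is typically applied before comparing trees, so that volatile id attributes don't cause spurious mismatches; a hedged sketch (the trees_equal helper is made up for illustration):

from lxml import etree

def trees_equal(a, b):
    # illustrative helper: compare serialized forms after normalization
    return etree.tostring(normalizer(a)) == etree.tostring(normalizer(b))

x = etree.fromstring('<p id="gen-1">hi</p>')
y = etree.fromstring('<p id="gen-2">hi</p>')
print(trees_equal(x, y))  # True: differing ids are stripped before comparison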
def make_gpx_private(self, gpx_file=None):
    '''
    wipes out private data from gpx files
    converts laps to waypoints
    '''
    logging.debug(">>")
    if gpx_file is None:
        return None
    filen = os.path.basename(gpx_file)
    tmpdir = self.pytrainer_main.profile.tmpdir
    anon_gpx_file = "%s/%s" % (tmpdir, filen)
    # get saved private area polygon
    pP = loads(self.options['privPolygon'])
    pP = pP['geometry']['coordinates'][0]
    # converts polygon's 2D matrix into a vector of just the lats or lons
    vector = lambda lonLat: [pP[i][lonLat] for i in range(len(pP))]  # 0:lon, 1:lat
    # try reading private area's bounds, stored as [lon,lat]
    NE_LAT = max([pP[i][1] for i in range(len(pP))])
    NE_LON = max([pP[i][0] for i in range(len(pP))])
    SW_LAT = min([pP[i][1] for i in range(len(pP))])
    SW_LON = min([pP[i][0] for i in range(len(pP))])
    logging.info("Anonymizing Area: NE:%f,%f -> SW: %f,%f" % (NE_LON, NE_LAT, SW_LON, SW_LAT))
    # Config parameters, not used yet
    FILTER_BOX = True
    ERASE_TIME = True
    LAP_TO_WAYPOINT = True
    tree = etree.parse(gpx_file)
    _xmlns = tree.getroot().nsmap[None]
    _trkpt_path = '{%s}trk/{%s}trkseg/{%s}trkpt' % (_xmlns, _xmlns, _xmlns)
    # namespace of gpx files
    NS = dict(ns='http://www.topografix.com/GPX/1/1')
    myroot = tree.getroot()
    gpxdataNS = string.Template(".//{http://www.cluetrust.com/XML/GPXDATA/1/0}$tag")
    lapTag = gpxdataNS.substitute(tag="lap")
    endPointTag = gpxdataNS.substitute(tag="endPoint")
    triggerTag = gpxdataNS.substitute(tag="trigger")
    laps = tree.findall(lapTag)
    mygpx = tree.find('gpx')
    for lap in laps:
        trigger = lap.find(triggerTag)
        # Watch out for manually triggered laps
        if trigger.text == 'manual':
            endPoint = lap.find(endPointTag)
            lat = endPoint.get("lat")
            lon = endPoint.get("lon")
            # Create waypt if not in home box
            try:
                if not ((SW_LAT < float(lat) < NE_LAT) and (SW_LON < float(lon) < NE_LON)):
                    etree.SubElement(myroot, 'wpt', attrib={'lat': lat, 'lon': lon})
            except:
                pass
    etree.strip_attributes(myroot, 'creator')
    # Wipe out home box
    for trkpt in tree.findall(_trkpt_path):
        lat = float(trkpt.attrib['lat'])
        lon = float(trkpt.attrib['lon'])
        if (SW_LAT < lat < NE_LAT) and (SW_LON < lon < NE_LON):
            par = trkpt.getparent()
            par.remove(trkpt)
    time = tree.xpath('//ns:trkpt/ns:time', namespaces=NS)
    for i in time:
        i.text = '1970-01-01T00:00:00+00:00'  # osm regards <time> as mandatory. gnaa.
    ext = tree.xpath('//ns:gpx/ns:extensions', namespaces=NS)
    for i in ext:
        par = i.getparent()
        par.remove(i)
    meta = tree.xpath('//ns:gpx/ns:metadata', namespaces=NS)
    for i in meta:
        par = i.getparent()
        par.remove(i)
    ele = tree.xpath('//ns:trkpt/ns:ele', namespaces=NS)
    for i in ele:
        par = i.getparent()
        par.remove(i)
    # test schema on cleaned xml-tree
    # gpx.xsd from http://www.topografix.com/gpx.asp
    #xmlschema = etree.XMLSchema(etree.parse('gpx.xsd'))
    #xmlschema.validate(tree)
    # write new gpx file
    tree.write(anon_gpx_file, pretty_print=False, xml_declaration=True, encoding='UTF-8')
    logging.debug("<<")
    return anon_gpx_file
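The privacy logic above reduces to an axis-aligned point-in-box test on latitude and longitude; a standalone sketch under the same assumptions (the helper name is hypothetical):

def in_private_box(lat, lon, sw_lat, sw_lon, ne_lat, ne_lon):
    # illustrative helper: True when the point falls inside the home box
    return sw_lat < lat < ne_lat and sw_lon < lon < ne_lon

# track points inside the box get dropped from the GPX; manual lap end
# points outside it become anonymous waypoints
print(in_private_box(52.5, 13.4, 52.4, 13.3, 52.6, 13.5))  # True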
def __call__(self, doc):
    """
    Cleans the document.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
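Note a behavioural difference from the variant earlier in this section: this older version overwrites any existing rel attribute with 'nofollow', while the newer one appends to it. A sketch of the rel-preserving approach, using a made-up anchor element:

from lxml import etree

el = etree.fromstring('<a href="http://example.com/" rel="author">x</a>')
rel = el.get('rel')
if rel:
    if 'nofollow' not in rel.split():
        el.set('rel', rel + ' nofollow')
else:
    el.set('rel', 'nofollow')
print(el.get('rel'))  # 'author nofollow'; the older code would emit just 'nofollow'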