def strip_full_message_quoting_html(message_body):
    """Remove the quoted original message from an HTML e-mail body.

    The body is parsed with lxml and checked against a fixed sequence of
    mail-client quoting layouts: GMail, recent Apple Mail, older (French)
    Apple Mail, Outlook with usable markup, Outlook without usable markup
    and Thunderbird.  The first layout that matches has its quote dropped
    from the tree and the document is serialized immediately.

    Assumes any encoding conversions have already been done.

    :param message_body: HTML source of the message.
    :return: the serialized document as unicode text.  ``""`` is returned
        when lxml raises ``ParserError`` for the input; when no layout
        matches, the parsed document is serialized without modification.
    """
    # Handy tools when working on these expressions:
    # http://www.motobit.com/util/quoted-printable-decoder.asp
    # http://www.freeformatter.com/html-formatter.html
    # http://www.freeformatter.com/xpath-tester.html#ad-output
    from lxml import html, etree

    exslt_regexp = {'re': "http://exslt.org/regular-expressions"}

    try:
        doc = html.fromstring(message_body)
    except etree.ParserError:
        # An empty HTML document makes lxml raise "ParserError: Document is
        # empty".  Letting it propagate would block the SourceReader, so the
        # stripped message is simply the empty string.
        return ""

    def serialized():
        return html.tostring(doc, encoding="unicode")

    # --- GMail ---------------------------------------------------------
    gmail_quotes = doc.find_class('gmail_quote')
    if gmail_quotes:
        leading_text = gmail_quotes[0].text
        # A gmail_quote that announces a forwarded message is content, not
        # a reply quote, and is left alone.
        if (not leading_text
                or "---------- Forwarded message ----------" not in leading_text):
            gmail_quotes[0].drop_tree()
            return serialized()

    # --- Modern Apple Mail ---------------------------------------------
    apple_quote = etree.XPath(
        r"//child::blockquote[contains(@type,'cite')]/preceding-sibling::br[contains(@class,'Apple-interchange-newline')]/parent::node()/parent::node()"
    )
    hits = apple_quote(doc)
    if len(hits) == 1:
        hits[0].drop_tree()
        return serialized()

    # --- Old Apple Mail (French) ---------------------------------------
    # Target announce line: Le 6 juin 2011 à 11:02, Jean-Michel Cornu a écrit :
    old_apple_quote = etree.XPath(
        r"//child::div[re:test(text(), '^.*Le .*\d{4} .*:\d{2}, .* a .*crit :.*$', 'i')]/following-sibling::br[contains(@class,'Apple-interchange-newline')]/parent::node()",
        namespaces=exslt_regexp)
    hits = old_apple_quote(doc)
    if len(hits) == 1:
        hits[0].drop_tree()
        return serialized()

    # --- Outlook, usable structure ---------------------------------------
    outlook_quote = etree.XPath(
        r"//body/child::blockquote/child::div[contains(@class,'OutlookMessageHeader')]/parent::node()"
    )
    hits = outlook_quote(doc)
    if len(hits) == 1:
        hits[0].drop_tree()
        return serialized()

    # --- Outlook, NO usable structure ------------------------------------
    # A div is taken as the quote header when it has descendants matching
    # every one of these header lines (English | French alternatives).
    header_alternatives = [
        '|'.join(['^From:.*$', '^De :.*$']),
        '|'.join(['^Sent:.*$', '^Envoy.+ :.*$']),
        # Meant to match "À", but unicode is really problematic in lxml regex
        '|'.join(['^To:.*$', '^.+:.*$']),
        '|'.join(['^Subject:.*$', '^Objet :.*$']),
    ]
    header_tests = [
        r"descendant::*[re:test(text(), '" + alternative + "')]"
        for alternative in header_alternatives
    ]
    header_div = r"//descendant::div[" + " and ".join(header_tests) + "]"
    hits = etree.XPath(header_div, namespaces=exslt_regexp)(doc)
    if len(hits) == 1:
        quote_body = etree.XPath(header_div + "/following-sibling::*",
                                 namespaces=exslt_regexp)
        for sibling in quote_body(doc):
            # drop_tree() moves the sibling's text to the tail of hits[0]
            sibling.drop_tree()
        hits[0].tail = None
        hits[0].drop_tree()
        return serialized()

    # --- Thunderbird -----------------------------------------------------
    thunderbird_quote = "//child::blockquote[contains(@type,'cite') and boolean(@cite)]"
    hits = etree.XPath(thunderbird_quote + "/self::blockquote")(doc)
    if len(hits) == 1:
        announce = doc.xpath(thunderbird_quote + "/preceding-sibling::*")
        if announce:
            # Clear the tail text of the element right before the quote,
            # then remove the quote itself.
            announce[-1].tail = None
            hits[0].drop_tree()
            return serialized()

    # Nothing was stripped...
    return serialized()
def __init__(self, data, xml):
    """Store the XML node, parse ``data`` and populate the update content.

    ``parse_data`` runs before the question-title XPath is compiled, and
    ``set_update_content`` runs last, once that XPath is available on the
    instance.
    """
    self.xml, self.update_key = xml, None
    self.parse_data(data)
    title_query = etree.XPath('update-content/question/title')
    self.question_title_xpath = title_query
    self.set_update_content()
def get_positions(self):
    """Append the parsed result of every ``positions/position`` node.

    Each matching node under ``self.xml`` is serialized, handed to
    ``lixml.LinkedInXMLParser`` and the parser's ``results`` are appended
    to ``self.positions``.
    """
    find_positions = etree.XPath('positions/position')
    for position_node in find_positions(self.xml):
        serialized = etree.tostring(position_node)
        parsed = lixml.LinkedInXMLParser(serialized).results
        self.positions.append(parsed)
import sys
import os
import Image
from StringIO import StringIO
import subprocess
from lxml import etree
import urllib2
import util

# Paths of bundled resources, resolved through util.resource_filename.
SAXON_PATH = util.resource_filename('lib', 'saxon9he.jar')
MATH2SVG_PATH = util.resource_filename('xslt2', 'math2svg-in-docbook.xsl')
# Built by util.makeXsl from 'moduledbk2book.xsl' (presumably a compiled
# stylesheet -- confirm against util.makeXsl).
DOCBOOK_BOOK_XSL = util.makeXsl('moduledbk2book.xsl')
# XPath queries compiled once at import time; the mml/db/svg prefixes are
# resolved through util.NAMESPACES.
# Every mml:math element in the document.
MATH_XPATH = etree.XPath('//mml:math', namespaces=util.NAMESPACES)
# Every db:imagedata element that has an svg:svg child.
DOCBOOK_SVG_IMAGE_XPATH = etree.XPath('//db:imagedata[svg:svg]', namespaces=util.NAMESPACES)
# svg:svg children of the context node (relative expression).
DOCBOOK_SVG_XPATH = etree.XPath('svg:svg', namespaces=util.NAMESPACES)
# Every db:imagedata element carrying a fileref attribute.
DOCBOOK_IMAGE_XPATH = etree.XPath('//db:imagedata[@fileref]', namespaces=util.NAMESPACES)
# -----------------------------
# Transform Structure:
#
# Every transform takes in 3 arguments:
# - xml doc
# - dictionary of files (string name, string bytes)
# - optional dictionary of parameters (string, string)
#
# Every transform returns:
def xpath(self, xml_string, *xpath_args, **xpath_kwargs):
    """Parse ``xml_string`` and yield each result of an XPath query on it.

    ``xpath_args`` and ``xpath_kwargs`` are forwarded unchanged to
    ``etree.XPath``.  This is a generator: parsing and querying only
    happen once iteration starts.
    """
    document = etree.fromstring(xml_string)
    query = etree.XPath(*xpath_args, **xpath_kwargs)
    matches = query(document)
    for match in matches:
        yield match
def count_divs(tree):
    """Record the number of ``<div>`` elements in ``tree``.

    The count is stored on ``TestBlockifier.div_count``; nothing is returned.
    """
    all_divs = etree.XPath("//div")(tree)
    TestBlockifier.div_count = len(all_divs)
def main():
    """Walk every font listed in ``font-family.json`` and emit its glyphs.

    For each SVG font file the single-character glyphs are looked up.  Code
    points in the private use areas are skipped.  For the first block whose
    range contains the code point:

    * a glyph with outline data whose code point is not cached yet is
      passed to ``emit`` and cached under its font family;
    * a code point that is already cached, but not for this font family,
      is passed to ``emit_sql`` and the family is added to the cache entry.

    Entries of the font list whose file name starts with ``#`` are skipped,
    as are files that cannot be opened or parsed.  ``finish`` is called at
    the end, and also before re-raising on KeyboardInterrupt/SystemExit.
    """
    with open('font-family.json') as font_index:
        fontlist = json.load(font_index)

    single_char_glyphs = etree.XPath('//glyph[string-length(@unicode) = 1]')
    blocks = get_blocks()
    font_total = len(fontlist)
    cps = load_cache()
    counter = 0
    progress = " | Glyph {:3d}/{} ({}) of {}"

    for item, (font_file, font_family) in enumerate(fontlist):
        if font_file[0] == "#":
            logger.warning("Skipping font {}".format(font_file[1:]))
            continue

        logger.info("Handling font {} ({}/{})".format(font_file, item, font_total))
        try:
            with open(font_file) as svg:
                tree = etree.parse(svg)
        except IOError:
            logger.warning('Could not open %s' % font_file)
            continue
        except etree.XMLSyntaxError:
            logger.warning('Could not parse %s, no XML' % font_file)
            continue

        glyphs = single_char_glyphs(tree)
        glyph_total = len(glyphs)
        for position, glyph in enumerate(glyphs, 1):
            cp = glyph.get("unicode")
            ocp = ord(cp)
            d = glyph.get("d", False)

            in_private_use = (0xE000 <= ocp <= 0xF8FF
                              or 0xF0000 <= ocp <= 0x10FFFF)
            if in_private_use:
                # private use areas. Skip.
                continue

            for blk in blocks:
                if not (blk[1] <= ocp and blk[2] >= ocp):
                    continue
                try:
                    if d and ocp not in cps:
                        counter += 1
                        logger.info(progress.format(
                            position, glyph_total,
                            cp.encode('utf-8') if ocp >= 32 else '?',
                            font_file))
                        emit(cp, d, font_family, blk, glyph)
                        cps[ocp] = [ font_family ]
                    elif ocp in cps and font_family not in cps[ocp]:
                        logger.debug(progress.format(
                            position, glyph_total,
                            cp.encode('utf-8') if ocp >= 32 else '?',
                            font_file))
                        emit_sql(cp, font_family, 0)
                        cps[ocp].append(font_family)
                except (KeyboardInterrupt, SystemExit):
                    # Forget the code point being processed, write out what
                    # has been collected so far, then let the interrupt through.
                    if cps and ocp and ocp in cps:
                        del cps[ocp]
                    logger.warning('Shutting down, creating fonts and reports.')
                    finish(cps, blocks, counter)
                    raise
                # Only the first containing block is used.
                break
            else:
                logger.warning('No block found for U+{:04X}: not processed.'.format(ocp))

    finish(cps, blocks, counter)
repl_ind_inv = r'[\1]' repl_dot_inv = '.' repl_str_inv = '*' repl_qtm_inv = '?' repl_emm_inv = '!' # Regular expressions to match attributes and indices within valid XPaths re_atr = re.compile(r'\[@' + pttrn_attr_name + "=['\"]" + pttrn_attr_val + "['\"]\]") re_ind = re.compile(r'\[([0-9]+?)\]') # Regular expressions to match attributes and indices within OpenMDAO variables transformed from xpaths re_atr_inv = re.compile(r':_:' + pttrn_attr_val + ':_:' + pttrn_attr_val + r'(?=/|$|:)') re_ind_inv = re.compile(r':_:_([0-9]+?)(?=/|$)') parser = etree.XMLParser(remove_blank_text=True, encoding='utf-8') find_text = etree.XPath('//text()') def xpath_to_param(xpath): # type: (str) -> str """Convert an XML XPath to a valid ``OpenMDAO`` parameter name. Parameters ---------- xpath : str XPath to convert. Returns ------- str Valid ``OpenMDAO`` parameter name.
def XPath(expr):
    """Return the compiled XPath for ``expr``, compiling it at most once.

    Compiled expressions are kept in ``xpath_cache`` keyed by the expression
    text and are compiled with the ``NS_MAP`` namespace prefixes.
    """
    compiled = xpath_cache.get(expr, None)
    if compiled is not None:
        return compiled
    compiled = etree.XPath(expr, namespaces=NS_MAP)
    xpath_cache[expr] = compiled
    return compiled
'{http://www.bibletechnologies.net/2003/OSIS/namespace}transChange', '{http://www.bibletechnologies.net/2003/OSIS/namespace}salute', '{http://www.bibletechnologies.net/2003/OSIS/namespace}signed', '{http://www.bibletechnologies.net/2003/OSIS/namespace}closer', '{http://www.bibletechnologies.net/2003/OSIS/namespace}speech', '{http://www.bibletechnologies.net/2003/OSIS/namespace}speaker', '{http://www.bibletechnologies.net/2003/OSIS/namespace}list', '{http://www.bibletechnologies.net/2003/OSIS/namespace}item', '{http://www.bibletechnologies.net/2003/OSIS/namespace}table', '{http://www.bibletechnologies.net/2003/OSIS/namespace}head', '{http://www.bibletechnologies.net/2003/OSIS/namespace}row', '{http://www.bibletechnologies.net/2003/OSIS/namespace}cell', '{http://www.bibletechnologies.net/2003/OSIS/namespace}caption') # Precompile a few xpath-querys verse_in_chapter = etree.XPath('//ns:chapter[1]/ns:verse', namespaces=NS) text_in_verse = etree.XPath('//ns:verse[1]/text()', namespaces=NS) class OSISBible(BibleImport): """ `OSIS <http://www.bibletechnologies.net/>`_ Bible format importer class. """ def process_books(self, bible_data): """ Extract and create the bible books from the parsed xml :param bible_data: parsed xml :return: None """ # Find books in the bible
# Queue of absolute page URLs collected by a(); drained and printed below.
seenpage = queue.Queue()
# Site root: fetched once, and also used as the prefix for collected hrefs.
url = 'http://www.bttiantang.com'


def a(node):
    """Record ``node``'s href if it matches the page pattern and is new.

    A new href is stored in ``seen`` (mapped to the link text) and the
    absolute URL is put on ``seenpage``.  ``seen`` is defined outside this
    excerpt.
    """
    # NOTE(review): in '/?Page' the '?' makes the slash optional, so this
    # matches any href containing "Page".  If a literal "?Page" query string
    # was intended, the '?' needs escaping -- confirm.
    if re.search('/?Page', node.attrib['href']):
        if node.attrib['href'] not in seen:
            seen[node.attrib['href']] = node.text
            seenpage.put(url + node.attrib['href'])
            # return(node.attrib['href'])


# Download the front page and collect every anchor that has an href.
web = request.urlopen(url)
html = web.read().decode('utf-8')
HTML = etree.HTML(html)
find = etree.XPath("//a[@href]")
nodelist = find(HTML)
# Run a() over the anchors in reverse document order with 4 workers.
# NOTE(review): the import of Pool is not visible here.  If it is the
# process-based multiprocessing.Pool, changes made to `seen`/`seenpage` in
# the workers would not be visible in this process -- presumably a thread
# pool is used; verify.
pool = Pool(4)
pool.map(a, nodelist[::-1])
pool.close()
pool.join()
#print(results)
# Print every collected page URL, then the href -> link text mapping.
while (not seenpage.empty()):
    print(seenpage.get())
print(seen)
''' for node in nodelist: if re.search('/?Page', node.attrib['href']): print(node.attrib['href']) '''
# This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see <http://www.gnu.org/licenses/>. """Helper functions for working with XML.""" import re import six from lxml import etree # some useful xpath expressions xml_preserve_ancestors = etree.XPath( "ancestor-or-self::*[attribute::xml:space='preserve']") """All ancestors with xml:space='preserve'""" xml_space_ancestors = etree.XPath("ancestor-or-self::*/attribute::xml:space") """All xml:space attributes in the ancestors""" string_xpath = etree.XPath("string()") """Return a non-normalized string in the node subtree""" string_xpath_normalized = etree.XPath("normalize-space()") """Return a (space) normalized string in the node subtree""" def getText(node, xml_space="preserve"): """Extracts the plain text content out of the given node.
def rows(table):
    """Yield the cell texts of ``table``, one list per row.

    Each ``<tr>`` child of ``table`` produces one list; each ``<td>`` or
    ``<th>`` child of that row contributes one string (its
    ``text_content()``).

    Both XPath expressions are compiled once per call instead of the cell
    expression being recompiled for every row.
    """
    find_rows = etree.XPath('tr')
    find_cells = etree.XPath('td | th')
    for tr in find_rows(table):
        yield [cell.text_content() for cell in find_cells(tr)]
def retrieve_new_documents(self, limit=100):
    """Retrieves new documents using the EURLex search.

    Result pages are crawled one after another (sorted by the ``DD`` field
    in descending order, as requested in the search URL) and every result
    that has a PDF or HTML link and is not yet in ``self.collection`` is
    inserted.

    Crawling stops when any of the following happens:

    * ``limit`` new documents have been inserted;
    * a result is found that is already stored and whose stored date lies
      before its stored crawl date (everything after it is assumed to be
      known);
    * a result page contains no entries (end of the result list).

    Already stored documents whose date is not before their crawl date are
    skipped without stopping the crawl.

    Args:
        limit (int): the maximium number of documents that should be
            retrieved.

    Returns:
        int: the number of newly found documents
    """
    base_url = "https://eur-lex.europa.eu"
    today = dt.datetime.combine(dt.date.today(), dt.time.min)

    entry_path = etree.XPath("//div[@class = 'SearchResult']")
    date_path = etree.XPath("""
        .//dl/dd[preceding-sibling::dt[contains(text(), 'Date') or
            contains(text(), 'Datum')]]/text()
    """)
    doc_path = etree.XPath("""
        .//ul[contains(@class, 'SearchResultDoc')]/li
            /a[contains(@href, 'PDF') or contains(@href, 'HTML')]/@href
    """)
    title_path = etree.XPath(".//h2/a[@class = 'title']/text()")
    detail_path = etree.XPath(".//h2/a[@class = 'title']/@href")

    timestamp = int(round(time.time() * 1000))
    # The trailing "{}" is left for str.format() and receives the page number.
    url_tmpl = ("https://eur-lex.europa.eu/search.html?lang=de&qid="
                f"{timestamp}&type=quick&scope=EURLEX&sortOneOrder=desc"
                "&sortOne=DD&locale=de&page={}")

    has_unseen_documents = True
    doc_count = 0
    page = 1
    while (doc_count < limit) and has_unseen_documents:
        search_url = url_tmpl.format(page)
        logging.info(f"Crawling page '{search_url}' (page {page})")
        res = _retry_connection(search_url, "get")
        tree = html.fromstring(res.content)

        entries = entry_path(tree)
        if not entries:
            # Without this check a page with no results would leave both
            # loop conditions untouched and the crawl would request pages
            # forever.
            logging.debug(f"No search results on page {page}. Stop.")
            break

        for entry in entries:
            # _flat_map is applied to a list of elements.
            if not isinstance(entry, list):
                entry = [entry]

            # A result without a (parsable) date falls back to datetime.min.
            dates = _flat_map(date_path, entry)
            doc_date = dt.datetime.min
            match = re.search(r"(\d+\/\d+\/\d+)", dates[0]) if dates else None
            if match:
                doc_date = dt.datetime.strptime(match[1], "%d/%m/%Y")

            # Results without a PDF/HTML link are ignored.
            doc_links = _flat_map(doc_path, entry)
            if len(doc_links) == 0:
                continue

            link = _make_resource_path(doc_links[0], base_url)
            detail = _make_resource_path(
                _flat_map(detail_path, entry)[0], base_url)
            title = _flat_map(title_path, entry)[0]
            doc = {
                "url": link,
                "detail_url": detail,
                "date": doc_date,
                "title": title,
                "crawl_date": today
            }

            logging.debug(f"Process Document: {link} - {doc_date.date()}")
            num_docs = self.collection.count_documents({"url": link})
            if num_docs > 0:
                logging.debug(f"Document was crawled before: '{link}'")
                # check whether this document had a date before the crawl
                # date, if not, break.
                duplicate_doc = self.collection.find_one({"url": link})
                if duplicate_doc["date"] >= duplicate_doc["crawl_date"]:
                    logging.debug("Document date lies in the future."
                                  " Continue...")
                    continue
                logging.debug("Break!")
                has_unseen_documents = False
                break

            logging.debug(f"Found new document: {link}.")
            self.collection.insert_one(doc)
            doc_count += 1
            if doc_count >= limit:
                # Enforce the limit inside a page as well, so it really is
                # the maximum number of inserted documents.
                break
        page += 1

    logging.info(f"Found {doc_count} new or potentially modified docs.")
    return doc_count
_resources_dir, "xsl", "iso-schematron-xslt1", "iso_abstract_expand.xsl" ) ) ) iso_svrl_for_xslt1 = _etree.XSLT( _etree.parse( os.path.join( _resources_dir, "xsl", "iso-schematron-xslt1", "iso_svrl_for_xslt1.xsl" ) ) ) # svrl result accessors svrl_validation_errors = _etree.XPath( "//svrl:failed-assert", namespaces={"svrl": SVRL_NS} ) # RelaxNG validator for schematron schemas schematron_schema_valid = _etree.RelaxNG( file=os.path.join(_resources_dir, "rng", "iso-schematron.rng") ) def stylesheet_params(**kwargs): """Convert keyword args to a dictionary of stylesheet parameters. XSL stylesheet parameters must be XPath expressions, i.e.: * string expressions, like "'5'" * simple (number) expressions, like "5"
def XPath(x):
    """Compile ``x`` as an XPath expression using the ``XPNSMAP`` prefixes.

    Raises ``ConversionError`` when ``x`` is not syntactically valid XPath.
    """
    try:
        compiled = etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError:
        message = 'The syntax of the XPath expression %s is invalid.' % repr(x)
        raise ConversionError(message)
    return compiled
class Schematron(_etree._Validator):
    """ISO Schematron validator based on the pure-XSLT skeleton implementation.

    Construct it from a root Element or an ElementTree, or give a filename
    through the ``file`` keyword argument to have the schema parsed from the
    file system.

    Schematron is a lesser known but very powerful schema language: it uses
    XPath to express restrictions on the structure and content of XML
    documents.

    By default only ``failed-assert`` findings make a document invalid
    (``ASSERTS_ONLY``).  Pass another report filter as ``error_finder``
    (e.g. ``ASSERTS_AND_REPORTS`` or a custom ``XPath`` object) to change
    that, or subclass isoschematron.Schematron to take full control of the
    validation process.

    The validator is an XSLT 1.0 stylesheet produced in these steps:

    0) (Extract from XML Schema or RelaxNG schema)
    1) Process inclusions
    2) Process abstract patterns
    3) Compile the schematron schema to XSLT

    Steps 1) and 2) can be switched off with the ``include`` and ``expand``
    keyword arguments.  Parameters for steps 1), 2) and 3) are given as
    dictionaries through ``include_params``, ``expand_params`` and
    ``compile_params``.  The compile-step parameter ``phase`` is additionally
    available as the keyword argument ``phase``, which wins over an entry of
    the same name in ``compile_params``.

    ``store_schematron=True`` keeps the (included-and-expanded) schematron
    document tree, available as the ``schematron`` property.
    ``store_xslt=True`` keeps the validation XSLT document tree, available
    as the ``validator_xslt`` property.  ``store_report=True`` (default:
    False) keeps the report of the most recent validation, available as the
    ``validation_report`` property.

    Usage example::

      >>> from lxml import etree
      >>> from lxml.isoschematron import Schematron

      >>> schematron = Schematron(etree.XML('''
      ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
      ...   <pattern id="id_only_attribute">
      ...     <title>id is the only permitted attribute name</title>
      ...     <rule context="*">
      ...       <report test="@*[not(name()='id')]">Attribute
      ...         <name path="@*[not(name()='id')]"/> is forbidden<name/>
      ...       </report>
      ...     </rule>
      ...   </pattern>
      ... </schema>'''),
      ... error_finder=Schematron.ASSERTS_AND_REPORTS)

      >>> xml = etree.XML('''
      ... <AAA name="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC color="ccc"/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      False

      >>> xml = etree.XML('''
      ... <AAA id="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      True
    """

    # How findings are categorized in the libxml2-style error log
    _domain = _etree.ErrorDomains.SCHEMATRONV
    _level = _etree.ErrorLevels.ERROR
    _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT

    # Ready-made report filters for the common behaviours
    ASSERTS_ONLY = svrl_validation_errors  # Default
    ASSERTS_AND_REPORTS = _etree.XPath(
        "//svrl:failed-assert | //svrl:successful-report",
        namespaces={"svrl": SVRL_NS}
    )

    def _extract(self, element):
        """Pull an embedded schematron schema out of a host schema.

        Only called by ``__init__`` when the given schema document is not a
        schematron schema itself.  Returns a schematron schema document
        tree, or None when the host is neither XML Schema nor RelaxNG.
        """
        if element.tag == _xml_schema_root:
            return self._extract_xsd(element)
        if element.nsmap[element.prefix] == RELAXNG_NS:
            # RelaxNG has no single unique root element, so the namespace
            # of the root is inspected instead
            return self._extract_rng(element)
        return None

    # Customization points: etree.XSLT objects implementing the extract,
    # include, expand and compile steps
    _extract_xsd = extract_xsd
    _extract_rng = extract_rng
    _include = iso_dsdl_include
    _expand = iso_abstract_expand
    _compile = iso_svrl_for_xslt1

    # etree.XPath object applied to the svrl report to decide validity; it
    # must return a list of result elements (empty means valid)
    _validation_errors = ASSERTS_ONLY

    def __init__(
        self,
        etree=None,
        file=None,
        include=True,
        expand=True,
        include_params={},
        expand_params={},
        compile_params={},
        store_schematron=False,
        store_xslt=False,
        store_report=False,
        phase=None,
        error_finder=ASSERTS_ONLY,
    ):
        super(Schematron, self).__init__()

        self._schematron = None
        self._validator_xslt = None
        self._validation_report = None
        self._store_report = store_report
        if error_finder is not self.ASSERTS_ONLY:
            self._validation_errors = error_finder

        # Locate the schema root.  The document may be a schematron schema,
        # or an XML Schema / RelaxNG schema with embedded schematron rules.
        root = None
        try:
            if etree is not None:
                root = etree if _etree.iselement(etree) else etree.getroot()
            elif file is not None:
                root = _etree.parse(file).getroot()
        except Exception as err:
            raise _etree.SchematronParseError(
                "No tree or file given: %s" % err
            )
        if root is None:
            raise ValueError("Empty tree")

        schematron = root if root.tag == _schematron_root else self._extract(root)
        if schematron is None:
            raise _etree.SchematronParseError(
                "Document is not a schematron schema or schematron-extractable"
            )

        # Run the iso-schematron skeleton steps to obtain a validating xslt
        if include:
            schematron = self._include(schematron, **include_params)
        if expand:
            schematron = self._expand(schematron, **expand_params)
        if not schematron_schema_valid(schematron):
            raise _etree.SchematronParseError(
                "invalid schematron schema: %s" % schematron_schema_valid.error_log
            )
        if store_schematron:
            self._schematron = schematron

        # add new compile keyword args here if exposing them
        exposed_kwargs = {"phase": phase}
        merged_params = _stylesheet_param_dict(compile_params, exposed_kwargs)
        validator_xslt = self._compile(schematron, **merged_params)
        if store_xslt:
            self._validator_xslt = validator_xslt
        self._validator = _etree.XSLT(validator_xslt)

    def __call__(self, etree):
        """Validate doc using Schematron.

        Returns true if document is valid, false if not.  Every finding is
        appended to the error log.
        """
        self._clear_error_log()
        report = self._validator(etree)
        if self._store_report:
            self._validation_report = report

        findings = self._validation_errors(report)
        if not findings:
            return True

        tree = etree.getroottree() if _etree.iselement(etree) else etree
        fname = tree.docinfo.URL or "<file>"
        for finding in findings:
            # svrl does not seem to report a line number anywhere, so 0 is
            # logged.
            self._append_log_message(
                domain=self._domain,
                type=self._error_type,
                level=self._level,
                line=0,
                message=_etree.tostring(finding, encoding="unicode"),
                filename=fname,
            )
        return False

    @property
    def schematron(self):
        """ISO-schematron schema document (None unless the object was
        initialized with store_schematron=True).
        """
        return self._schematron

    @property
    def validator_xslt(self):
        """ISO-schematron skeleton implementation XSLT validator document
        (None unless the object was initialized with store_xslt=True).
        """
        return self._validator_xslt

    @property
    def validation_report(self):
        """ISO-schematron validation result report (None if result-storing
        has been turned off).
        """
        return self._validation_report
def XPath(expr):
    """Compile ``expr`` with the prefix ``h`` bound to ``XHTML_NS``."""
    xhtml_prefixes = {'h': XHTML_NS}
    return etree.XPath(expr, namespaces=xhtml_prefixes)
from os.path import dirname, realpath
from lxml import html, etree

# fileDir holds the resolved path of this script itself (not a directory);
# rootDir is therefore the parent of the directory containing the script.
fileDir = realpath(__file__)
rootDir = dirname(dirname(fileDir))
# Saved job page that is parsed below.
dataFile = rootDir + '/pages/data/138222.html'
root = html.parse(dataFile)
# Text nodes of links inside <h1>; the result is a list and is printed as such.
find_text = etree.XPath("//h1/a/text()")
title = find_text(root)
print(title)


def divider():
    """Print a line of 20 dashes followed by an empty line."""
    print("-" * 20 + "\n")


# About Sections
# <section> ancestors of the heading whose text is exactly "About this job".
# The [0] accesses below raise IndexError if nothing matches.
aboutSect = root.xpath('.//h2[text()="About this job"]/ancestor::section')
# print(aboutSect[0])
aboutRoot = etree.ElementTree(aboutSect[0])
# print(etree.tostring(aboutRoot))
# Nearest <div> ancestor of the span containing "Job type".
jobTypeSpan = aboutRoot.xpath(
    './/span[contains(text(),"Job type")]/ancestor::div[position()=1]')
# print(jobTypeSpan)
# print(etree.tostring(jobTypeSpan[0]))
jobTypeRoot = etree.ElementTree(jobTypeSpan[0])
# The second span text is printed as the job type (presumably the first one
# is the "Job type" label -- verify against the page markup).
jobType = jobTypeRoot.xpath('//span/text()')
print("Job Type: " + jobType[1])
import lxml.etree as etree from lxml.etree import iselement from w3lib.html import strip_html5_whitespace from w3lib.url import canonicalize_url from scrapy.link import Link from scrapy.utils.misc import arg_to_iter, rel_has_nofollow from scrapy.utils.python import unique as unique_list, to_native_str from scrapy.utils.response import get_base_url from scrapy.linkextractors import FilteringLinkExtractor # from lxml/src/lxml/html/__init__.py XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" _collect_string_content = etree.XPath("string()") class PartialParserLinkExtract(LxmlParserLinkExtractor): def _extract_links(self, selector, response_url, response_encoding, base_url): links = [] # hacky way to get the underlying lxml parsed document selector_root = selector if not iselement(selector_root): selector_root = selector.root for el, attr, attr_val in self._iter_links(selector_root): # pseudo lxml.html.HtmlElement.make_links_absolute(base_url) try: if self.strip: attr_val = strip_html5_whitespace(attr_val)
class WQXMapper:
    '''
    Converts WQX XML into tabular form (dicts, lists or a pandas.DataFrame).

    All XPath expressions are compiled once, while the class body is
    executed, and shared by every instance:

    - ``context_xpaths_compl`` maps a logical node name ('org', 'station',
      'activity', 'result', plus every key of ``context_xpaths``) to the
      compiled query that selects those nodes;
    - ``val_xpaths_compl`` maps a logical node name to a dictionary of
      column name -> compiled query selecting that column's text values.
    '''

    # ---------- dictionary of precompiled XPath query expressions
    #            for retrieving logical context nodes (matches
    #            wqx_mappings.context_xpaths):
    context_xpaths_compl = {}
    for nodename in context_xpaths:
        context_xpaths_compl[nodename] = etree.XPath(context_xpaths[nodename],
                                                     namespaces=ns)

    # ---------- precompiled XPath query expressions ('nodeq') for retrieving
    #            Logical Node nodesets:

    # relative expression from root
    # organizations
    context_xpaths_compl['org'] = etree.XPath('/wqx:WQX/wqx:Organization',
                                              namespaces=ns)

    # relative expressions from organization node
    # stations
    context_xpaths_compl['station'] = etree.XPath('wqx:MonitoringLocation',
                                                  namespaces=ns)
    # activities
    context_xpaths_compl['activity'] = etree.XPath('wqx:Activity',
                                                   namespaces=ns)

    # relative expression from activity node
    # results
    context_xpaths_compl['result'] = etree.XPath('wqx:Result', namespaces=ns)

    # ---------- dictionaries of precompiled XPath query expressions
    #            for retrieving column values (keys are tabular column names):
    val_xpaths_compl = {}
    for node in context_xpaths_compl.keys():
        # NOTE(review): this prints once per node whenever the class body is
        # executed (i.e. at import time).  Kept because it is existing
        # observable behaviour; consider replacing it with logging.
        print('doing node \'' + node + '\'')
        val_xpaths_compl[node] = {}
        for colname in val_xpaths[node].keys():
            cur_node_dict = val_xpaths[node]
            # smart_strings=False makes the query return plain strings.
            cur_xpath = etree.XPath(cur_node_dict[colname] + '/text()',
                                    namespaces=ns, smart_strings=False)
            val_xpaths_compl[node][colname] = cur_xpath

    def make_rowpart(self, node, valq):
        '''
        Applies the column val XPath query expressions "valq" to the XML
        context node "node" and returns a dictionary whose keys are the
        column names of "valq".

        Each value is the single-space-delimited concatenation of all text
        values the column's query returns for the node.  (If there are
        multiple such values, there were multiple matching nodes with
        non-empty text.)  A column without any match maps to ''.
        '''
        # invoke the compiled XPaths
        return {colname: ' '.join(valq[colname](node)) for colname in valq}

    def determine_table_type(self, response):
        '''
        This method inspects the response .status_code and .url properties
        to determine what should be done.

        - If the status code is not a 2xx, it will raise a BaseException.
        - If the status code is 2xx, it will inspect the response.url to
          determine whether the resultset should be framed as a Station or
          Result table (or, in future, a Biodata or Simplestation table.)

        Returns 'station' or 'result'.  If it cannot determine the correct
        table type, it will raise a BaseException.
        '''
        # NOTE(review): BaseException is not caught by "except Exception".
        # The exception type is kept because callers may depend on it.
        if response.status_code < 200 or response.status_code >= 300:
            raise (BaseException('The response is not OK: status code "' +
                                 str(response.status_code) + ' ' +
                                 response.reason + '".'))

        table_type = ''
        if 'Station/search' in response.url:
            table_type = 'station'
        elif 'Result/search' in response.url:
            table_type = 'result'

        if not table_type:
            raise (BaseException(
                'Unable to determine table type from response URL "' +
                response.url + '".'))

        return table_type

    def _iter_rows(self, table_type, root):
        '''
        Yields one row dictionary (column name -> text value) per row of the
        requested table.

        - 'result': one row per Result node; Organization, Activity and
          Result values are merged in that order, so later levels win on
          equal column names.
        - 'station': one row per MonitoringLocation node; Organization and
          MonitoringLocation values are merged in that order.

        Any other table_type yields nothing.
        '''
        context = self.context_xpaths_compl
        values = self.val_xpaths_compl

        for org in context['org'](root):
            org_rowpart = self.make_rowpart(org, values['org'])

            if table_type == 'result':
                for activity in context['activity'](org):
                    activity_rowpart = self.make_rowpart(activity,
                                                         values['activity'])
                    for result in context['result'](activity):
                        this_row = dict(org_rowpart)
                        this_row.update(activity_rowpart)
                        this_row.update(
                            self.make_rowpart(result, values['result']))
                        yield this_row

            elif table_type == 'station':
                for station in context['station'](org):
                    this_row = dict(org_rowpart)
                    this_row.update(
                        self.make_rowpart(station, values['station']))
                    yield this_row

    def xml_to_dict_of_lists(self, table_type, root):
        '''
        Given a known table_type and an XML root node, this method will
        attempt to construct a tabular WQX representation of the information
        contained in the XML.

        The tabular representation is "column-first": a dictionary whose
        keys are the column names of tabular_defs[table_type].  Each
        column's value is a list of values, one per row, so all lists have
        the same length.  When the XML does not supply a value, an empty
        string is inserted.  A single row is obtained by slicing all of the
        lists at the same index.

        The columns are looked up in tabular_defs before anything else, so
        an unknown table_type fails at that lookup (a KeyError if
        tabular_defs is a plain dict).  A table_type that has column
        definitions but is neither 'result' nor 'station' gives empty lists.

        If the XML root is not valid WQX, behavior is not specified. This
        method does not attempt XML validation.
        '''
        columns = tabular_defs[table_type]
        datadict = {}
        for colname in columns:
            datadict[colname] = []

        for this_row in self._iter_rows(table_type, root):
            for colname in columns:
                # missing and empty values are both stored as ''
                datadict[colname].append(this_row.get(colname) or '')

        return datadict

    def xml_to_list_of_dicts(self, table_type, root):
        '''
        Given a known table_type and an XML root node, this method will
        attempt to construct a tabular WQX representation of the information
        contained in the XML.

        The tabular representation is "row-first": a list of dictionaries.
        Each dict in the list corresponds to a table row. The dictionary
        keys are column names, and the values are the values extracted from
        the XML according to the context and val XPath expressions in
        wqx_mappings.

        The number of rows in the tabular representation is equal to the
        length of the returned list.

        If the table_type is not known, this method returns an empty list.

        If the XML root is not valid WQX, behavior is not specified. This
        method does not attempt XML validation.
        '''
        return list(self._iter_rows(table_type, root))

    def make_dataframe_from_xml(self, table_type, root, columns_first=True):
        '''
        This method accepts a known table_type and an XML root node. It
        returns a pandas.DataFrame containing the tabular representation of
        the data contained in the "root" argument.

        The DataFrame columns are those of tabular_defs[table_type], even if
        a column is not populated in any of the records embodied in the XML
        root.  None is returned when the column definition is empty.

        "columns_first" selects whether the data is collected through
        xml_to_dict_of_lists (True, the default) or xml_to_list_of_dicts.
        '''
        dataframe = None
        col_defs = tabular_defs[table_type]
        if col_defs:
            if columns_first:
                data_rows = self.xml_to_dict_of_lists(table_type, root)
            else:
                data_rows = self.xml_to_list_of_dicts(table_type, root)
            dataframe = pandas.DataFrame(data=data_rows, columns=col_defs)
        return dataframe

    def make_dataframe_from_http_response(self, response, columns_first=True):
        '''
        This method accepts a requests.response HTTP Response object. The
        assumption is that this response was obtained by calling a WQP
        RESTlike service as described at
        http://www.waterqualitydata.us/webservices_documentation.jsp.

        This method
            - checks the status code, raising BaseException if not 2xx
            - attempts to identify the table_type, raising BaseException if
              the response cannot be identified as a known type
            - attempts to parse the XML content, if any
            - attempts to convert the XML content to the correct tabular form
            - returns a pandas.DataFrame containing the tabular data, or
              None when the response has no content
        '''
        retval = None
        table_type = self.determine_table_type(response)
        if table_type and response.content:
            root = etree.fromstring(response.content)
            retval = self.make_dataframe_from_xml(table_type, root,
                                                  columns_first)
        return retval
def main():
    """Entry point of the Ansible ``xml`` module.

    Declares the module's argument specification, verifies that a usable
    lxml is installed, loads the XML document from ``xmlstring`` or ``path``,
    validates the optional XPath expression and then dispatches to the
    helper matching the requested operation (print/count matches, read
    content, delete, set/add children, set value, ensure existence,
    pretty-print).

    NOTE(review): the helpers are presumably responsible for ending the run
    (exit_json/fail_json); the final ``fail_json`` is only reached when none
    of them did.
    """
    module = AnsibleModule(
        argument_spec=dict(
            path=dict(type='path', aliases=['dest', 'file']),
            xmlstring=dict(type='str'),
            xpath=dict(type='str'),
            namespaces=dict(type='dict', default={}),
            state=dict(type='str', default='present', choices=['absent', 'present'], aliases=['ensure']),
            value=dict(type='raw'),
            attribute=dict(type='raw'),
            add_children=dict(type='list'),
            set_children=dict(type='list'),
            count=dict(type='bool', default=False),
            print_match=dict(type='bool', default=False),
            pretty_print=dict(type='bool', default=False),
            content=dict(type='str', choices=['attribute', 'text']),
            input_type=dict(type='str', default='yaml', choices=['xml', 'yaml']),
            backup=dict(type='bool', default=False),
            strip_cdata_tags=dict(type='bool', default=False),
            insertbefore=dict(type='bool', default=False),
            insertafter=dict(type='bool', default=False),
        ),
        supports_check_mode=True,
        required_by=dict(
            add_children=['xpath'],
            attribute=['value'],
            content=['xpath'],
            set_children=['xpath'],
            value=['xpath'],
        ),
        required_if=[
            ['count', True, ['xpath']],
            ['print_match', True, ['xpath']],
            ['insertbefore', True, ['xpath']],
            ['insertafter', True, ['xpath']],
        ],
        required_one_of=[
            ['path', 'xmlstring'],
            ['add_children', 'content', 'count', 'pretty_print', 'print_match', 'set_children', 'value'],
        ],
        mutually_exclusive=[
            ['add_children', 'content', 'count', 'print_match', 'set_children', 'value'],
            ['path', 'xmlstring'],
            ['insertbefore', 'insertafter'],
        ],
    )

    xml_file = module.params['path']
    xml_string = module.params['xmlstring']
    xpath = module.params['xpath']
    namespaces = module.params['namespaces']
    state = module.params['state']
    value = json_dict_bytes_to_unicode(module.params['value'])
    attribute = module.params['attribute']
    set_children = json_dict_bytes_to_unicode(module.params['set_children'])
    add_children = json_dict_bytes_to_unicode(module.params['add_children'])
    pretty_print = module.params['pretty_print']
    content = module.params['content']
    input_type = module.params['input_type']
    print_match = module.params['print_match']
    count = module.params['count']
    backup = module.params['backup']
    strip_cdata_tags = module.params['strip_cdata_tags']
    insertbefore = module.params['insertbefore']
    insertafter = module.params['insertafter']

    # Check if we have lxml 2.3.0 or newer installed
    if not HAS_LXML:
        module.fail_json(msg=missing_required_lib("lxml"), exception=LXML_IMP_ERR)
    elif LooseVersion('.'.join(to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('2.3.0'):
        module.fail_json(msg='The xml ansible module requires lxml 2.3.0 or newer installed on the managed machine')
    elif LooseVersion('.'.join(to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('3.0.0'):
        module.warn('Using lxml version lower than 3.0.0 does not guarantee predictable element attribute order.')

    # Check if the file exists
    if xml_string:
        infile = BytesIO(to_bytes(xml_string, errors='surrogate_or_strict'))
    elif xml_file and os.path.isfile(xml_file):
        # The extra truthiness test keeps an empty xmlstring combined with no
        # path from reaching os.path.isfile(None), which raises TypeError.
        infile = open(xml_file, 'rb')
    else:
        module.fail_json(msg="The target XML source '%s' does not exist." % xml_file)

    # Parse and evaluate xpath expression
    if xpath is not None:
        try:
            etree.XPath(xpath)
        except etree.XPathSyntaxError as e:
            module.fail_json(msg="Syntax error in xpath expression: %s (%s)" % (xpath, e))
        except etree.XPathEvalError as e:
            module.fail_json(msg="Evaluation error in xpath expression: %s (%s)" % (xpath, e))

    # Try to parse in the target XML file
    try:
        parser = etree.XMLParser(remove_blank_text=pretty_print, strip_cdata=strip_cdata_tags)
        doc = etree.parse(infile, parser)
    except etree.XMLSyntaxError as e:
        module.fail_json(msg="Error while parsing document: %s (%s)" % (xml_file or 'xml_string', e))
    finally:
        # The input is not read again after parsing, so release the handle
        # on both the success and the failure path.
        infile.close()

    # Ensure we have the original copy to compare
    global orig_doc
    orig_doc = copy.deepcopy(doc)

    if print_match:
        do_print_match(module, doc, xpath, namespaces)

    if count:
        count_nodes(module, doc, xpath, namespaces)

    if content == 'attribute':
        get_element_attr(module, doc, xpath, namespaces)
    elif content == 'text':
        get_element_text(module, doc, xpath, namespaces)

    # File exists:
    if state == 'absent':
        # - absent: delete xpath target
        delete_xpath_target(module, doc, xpath, namespaces)

    # - present: carry on

    # children && value both set?: should have already aborted by now
    # add_children && set_children both set?: should have already aborted by now

    # set_children set?
    if set_children:
        set_target_children(module, doc, xpath, namespaces, set_children, input_type)

    # add_children set?
    if add_children:
        add_target_children(module, doc, xpath, namespaces, add_children, input_type, insertbefore, insertafter)

    # No?: Carry on

    # Is the xpath target an attribute selector?
    if value is not None:
        set_target(module, doc, xpath, namespaces, attribute, value)

    # If an xpath was provided, we need to do something with the data
    if xpath is not None:
        ensure_xpath_exists(module, doc, xpath, namespaces)

    # Otherwise only reformat the xml data?
    if pretty_print:
        make_pretty(module, doc)

    module.fail_json(msg="Don't know what to do")
def find_id(self, element_id):
    """Return the first element whose ``id`` attribute equals ``element_id``.

    The whole tree under ``self.root`` is searched and the first hit is
    wrapped in a ``FigureElement``. Indexing the result with ``[0]`` means
    an ``IndexError`` propagates when nothing matches.
    """
    id_query = etree.XPath("//*[@id=$id]")
    hits = id_query(self.root, id=element_id)
    first_hit = hits[0]
    return FigureElement(first_hit)
def trans_mat_from_xml(elem): """ Take an xml element that represents a Coord_System_Definition and return a trans matrix. """ r_o = np.array( [float(f) for f in elem.xpath(r"./*[@name = 'position']//@value")]) n_x = np.array( [float(f) for f in elem.xpath(r"./*[@name = 'local_x']//@value")]) n_y = np.array( [float(f) for f in elem.xpath(r"./*[@name = 'local_y']//@value")]) return matrix_from_nx_ny_ro(n_x, n_y, r_o) xpath_val = etree.XPath(r".//*[@Name = $n]/Value/ValueExpression/Value") def trans_mat_from_avm_xml(elem): """ Take an xml element that represents a Coord_System_Definition and return a trans matrix. """ r_o = np.array([ float(f) for f in xpath_val(elem, n="position")[0].text[1:-1].split(",") ]) n_x = np.array([ float(f) for f in xpath_val(elem, n="local_x")[0].text[1:-1].split(",") ]) n_y = np.array([ float(f) for f in xpath_val(elem, n="local_y")[0].text[1:-1].split(",")
def parse_google_card(self, node):
    """Turn a Google "answer card" node into a ``discord.Embed``.

    The card kinds are probed in a fixed order: calculator, unit conversion,
    currency conversion, generic information, translation, "time in" (two
    layouts), dictionary definition and finally weather. The first kind
    whose markers are found decides the outcome.

    :param node: element of the parsed results page; it must support
        ``find``/``xpath``/``get`` (presumably an lxml HTML element).
    :return: the populated embed, or ``None`` when the card is not
        recognised or a recognised card is missing an expected piece.
    """
    e = discord.Embed(colour=discord.Colour.blurple())

    # check if it's a calculator card:
    calculator = node.find(".//span[@class='cwclet']")
    if calculator is not None:
        e.title = 'Calculator'
        result = node.find(".//span[@class='cwcot']")
        if result is not None:
            result = ' '.join((calculator.text, result.text.strip()))
        else:
            result = calculator.text + ' ???'
        e.description = result
        return e

    # check for unit conversion card
    unit_conversions = node.xpath(".//input[contains(@class, '_eif') and @value]")
    if len(unit_conversions) == 2:
        e.title = 'Unit Conversion'

        # the <input> contains our values, first value = second value essentially.
        # these <input> also have siblings with <select> and <option selected=1>
        # that denote what units we're using

        # We will get 2 <option selected="1"> nodes by traversing the parent
        # The first unit being converted (e.g. Miles)
        # The second unit being converted (e.g. Feet)

        xpath = etree.XPath("parent::div/select/option[@selected='1']/text()")
        try:
            first_node = unit_conversions[0]
            first_unit = xpath(first_node)[0]
            first_value = float(first_node.get('value'))
            second_node = unit_conversions[1]
            second_unit = xpath(second_node)[0]
            second_value = float(second_node.get('value'))
            e.description = ' '.join((str(first_value), first_unit, '=', str(second_value), second_unit))
        except Exception:
            return None
        else:
            return e

    # check for currency conversion card
    if 'currency' in node.get('class', ''):
        currency_selectors = node.xpath(".//div[@class='ccw_unit_selector_cnt']")
        if len(currency_selectors) == 2:
            e.title = 'Currency Conversion'
            # Inside this <div> is a <select> with <option selected="1"> nodes
            # just like the unit conversion card.

            first_node = currency_selectors[0]
            first_currency = first_node.find("./select/option[@selected='1']")

            second_node = currency_selectors[1]
            second_currency = second_node.find("./select/option[@selected='1']")

            # The parent of the nodes have a <input class='vk_gy vk_sh ccw_data' value=...>
            xpath = etree.XPath("parent::td/parent::tr/td/input[@class='vk_gy vk_sh ccw_data']")
            try:
                first_value = float(xpath(first_node)[0].get('value'))
                second_value = float(xpath(second_node)[0].get('value'))

                values = (
                    str(first_value),
                    first_currency.text,
                    f'({first_currency.get("value")})',
                    '=',
                    str(second_value),
                    second_currency.text,
                    f'({second_currency.get("value")})'
                )
                e.description = ' '.join(values)
            except Exception:
                return None
            else:
                return e

    # check for generic information card
    info = node.find(".//div[@class='_f2g']")
    if info is not None:
        try:
            e.title = ''.join(info.itertext()).strip()
            actual_information = info.xpath("parent::div/parent::div//div[@class='_XWk' or contains(@class, 'kpd-ans')]")[0]
            e.description = ''.join(actual_information.itertext()).strip()
        except Exception:
            return None
        else:
            return e

    # check for translation card
    translation = node.find(".//div[@id='tw-ob']")
    if translation is not None:
        src_text = translation.find(".//pre[@id='tw-source-text']/span")
        src_lang = translation.find(".//select[@id='tw-sl']/option[@selected='1']")

        dest_text = translation.find(".//pre[@id='tw-target-text']/span")
        dest_lang = translation.find(".//select[@id='tw-tl']/option[@selected='1']")

        # TODO: bilingual dictionary nonsense?

        e.title = 'Translation'
        try:
            e.add_field(name=src_lang.text, value=src_text.text, inline=True)
            e.add_field(name=dest_lang.text, value=dest_text.text, inline=True)
        except Exception:
            return None
        else:
            return e

    # check for "time in" card
    time = node.find("./div[@class='vk_bk vk_ans']")
    if time is not None:
        date = node.find("./div[@class='vk_gy vk_sh']")
        try:
            e.title = node.find('span').text
            e.description = f'{time.text}\n{"".join(date.itertext()).strip()}'
        except Exception:
            return None
        else:
            return e

    # time in has an alternative form without spans
    time = node.find("./div/div[@class='vk_bk vk_ans _nEd']")
    if time is not None:
        converted = "".join(time.itertext()).strip()
        try:
            # remove the in-between text
            parent = time.getparent()
            parent.remove(time)
            original = "".join(parent.itertext()).strip()
            e.title = 'Time Conversion'
            e.description = f'{original}...\n{converted}'
        except Exception:
            return None
        else:
            return e

    # check for definition card
    words = node.xpath(".//span[@data-dobid='hdw']")
    if words:
        lex = etree.XPath(".//div[@class='lr_dct_sf_h']/i/span")

        # this one is derived if we were based on the position from lex
        xpath = etree.XPath("../../../ol[@class='lr_dct_sf_sens']//"
                            "div[not(@class and @class='lr_dct_sf_subsen')]/"
                            "div[@class='_Jig']/div[@data-dobid='dfn']/span")
        for word in words:
            # we must go two parents up to get the root node
            root = word.getparent().getparent()

            pronunciation = root.find(".//span[@class='lr_dct_ph']/span")
            if pronunciation is None:
                continue

            lexical_category = lex(root)

            for category in lexical_category:
                definitions = xpath(category)
                try:
                    descrip = [f'*{category.text}*']
                    for index, value in enumerate(definitions, 1):
                        descrip.append(f'{index}. {value.text}')

                    e.add_field(name=f'{word.text} /{pronunciation.text}/', value='\n'.join(descrip))
                except Exception:
                    # A malformed category is skipped; the other categories
                    # and words are still added.
                    continue

        return e

    # check for weather card
    location = node.find("./div[@id='wob_loc']")
    if location is None:
        return None

    # these units should be metric

    # <img alt="category here" src="cool image">
    category = node.find(".//img[@id='wob_tci']")

    xpath = etree.XPath(".//div[@id='wob_d']//div[contains(@class, 'vk_bk')]//span[@class='wob_t']")
    temperatures = xpath(node)

    misc_info_node = node.find(".//div[@class='vk_gy vk_sh wob-dtl']")

    if misc_info_node is None:
        return None

    precipitation = misc_info_node.find("./div/span[@id='wob_pp']")
    humidity = misc_info_node.find("./div/span[@id='wob_hm']")
    wind = misc_info_node.find("./div/span/span[@id='wob_tws']")

    try:
        e.title = 'Weather for ' + location.text.strip()
        e.description = f'*{category.get("alt")}*'
        e.set_thumbnail(url='https:' + category.get('src'))

        if len(temperatures) == 4:
            first_unit = temperatures[0].text + temperatures[2].text
            second_unit = temperatures[1].text + temperatures[3].text
            units = f'{first_unit} | {second_unit}'
        else:
            units = 'Unknown'

        e.add_field(name='Temperature', value=units, inline=False)

        if precipitation is not None:
            e.add_field(name='Precipitation', value=precipitation.text)

        if humidity is not None:
            e.add_field(name='Humidity', value=humidity.text)

        if wind is not None:
            e.add_field(name='Wind', value=wind.text)
    except Exception:
        # Missing pieces (e.g. no <img> for the category) mean the card
        # cannot be rendered; interpreter-level signals are not swallowed.
        return None

    return e
def validate_and_normalize_data(data, fmt=None):
    """
    This function validates the data for given format (fmt).
    If the fmt is None it tries to guess the data format.
    Currently support data format checks are
    1) xml
    2) json
    3) xpath

    :param data: The data which should be validated and normalised.
    :param fmt: This is an optional argument which indicated the format
                of the data. Valid values are "xml", "json" and "xpath".
                If the value is None the format of the data will be
                guessed and returned in the output.
    :return:
        * If the format identified is XML it parses the xml data and returns
          a tuple of lxml.etree.Element class object and the data format type
          which is "xml" in this case.
        * If the format identified is JSON it parses the json data and returns
          a tuple of dict object and the data format type
          which is "json" in this case.
        * If the format identified is XPATH it parses the XPATH data and
          returns a tuple of etree.XPath class object and the data format type
          which is "xpath" in this case. For this type lxml library is
          required to be installed.
        * ``(None, None)`` when data is None, and ``(data, None)`` when the
          format could not be determined (string input is returned stripped).
    :raises Exception: when the data does not parse as the requested ``fmt``,
        when the detected format contradicts ``fmt``, or when lxml is needed
        but not installed.
    """
    if data is None:
        return None, None

    if isinstance(data, string_types):
        data = data.strip()
        if (data.startswith("<") and data.endswith(">")) or fmt == "xml":
            try:
                result = fromstring(data)
            except XMLSyntaxError as exc:
                # Only fatal when XML was explicitly requested; a failed
                # guess falls through to the final ``return data, None``.
                if fmt == "xml":
                    raise Exception(
                        "'%s' XML validation failed with error '%s'"
                        % (
                            data,
                            to_native(exc, errors="surrogate_then_replace"),
                        ))
            except Exception as exc:
                error = "'%s' recognized as XML but was not valid." % data
                raise Exception(
                    error + to_native(exc, errors="surrogate_then_replace"))
            else:
                # Checked after the parse succeeded and outside the try, so
                # a format mismatch is not re-reported as "not valid" XML.
                if fmt and fmt != "xml":
                    raise Exception(
                        "Invalid format '%s'. Expected format is 'xml' for data '%s'"
                        % (fmt, data))
                return result, "xml"
        elif (data.startswith("{") and data.endswith("}")) or fmt == "json":
            try:
                result = json.loads(data)
            except (
                    TypeError,
                    getattr(json.decoder, "JSONDecodeError", ValueError),
            ) as exc:
                if fmt == "json":
                    raise Exception(
                        "'%s' JSON validation failed with error '%s'"
                        % (
                            data,
                            to_native(exc, errors="surrogate_then_replace"),
                        ))
            except Exception as exc:
                error = "'%s' recognized as JSON but was not valid." % data
                raise Exception(
                    error + to_native(exc, errors="surrogate_then_replace"))
            else:
                if fmt and fmt != "json":
                    raise Exception(
                        "Invalid format '%s'. Expected format is 'json' for data '%s'"
                        % (fmt, data))
                return result, "json"
        else:
            # Checked before the try: the handlers below evaluate
            # ``etree.XPathSyntaxError``, which presumably cannot be resolved
            # when lxml failed to import (TODO confirm against the imports).
            if not HAS_LXML:
                raise Exception(missing_required_lib("lxml"))
            try:
                result = etree.XPath(data)
            except etree.XPathSyntaxError as exc:
                if fmt == "xpath":
                    raise Exception(
                        "'%s' XPath validation failed with error '%s'"
                        % (
                            data,
                            to_native(exc, errors="surrogate_then_replace"),
                        ))
            except Exception as exc:
                error = "'%s' recognized as Xpath but was not valid." % data
                raise Exception(
                    error + to_native(exc, errors="surrogate_then_replace"))
            else:
                if fmt and fmt != "xpath":
                    raise Exception(
                        "Invalid format '%s'. Expected format is 'xpath' for data '%s'"
                        % (fmt, data))
                return result, "xpath"

    elif isinstance(data, dict):
        if fmt and fmt != "json":
            raise Exception(
                "Invalid format '%s'. Expected format is 'json' for data '%s'"
                % (fmt, data))

        try:
            # Round-trip through the encoder to confirm the dict is JSON
            # serialisable and to hand back a normalised copy.
            result = json.loads(json.dumps(data))
            return result, "json"
        except (
                TypeError,
                getattr(json.decoder, "JSONDecodeError", ValueError),
        ) as exc:
            raise Exception(
                "'%s' JSON validation failed with error '%s'"
                % (data, to_native(exc, errors="surrogate_then_replace")))
        except Exception as exc:
            error = "'%s' recognized as JSON but was not valid." % data
            raise Exception(error + to_native(exc, errors="surrogate_then_replace"))

    return data, None
def set_profile_url(self):
    """Populate ``self.profile_url`` from the profile XML, best effort.

    Reads the text of the first ``site-standard-profile-request/url``
    element under ``self.xml`` and stores it stripped. When the element is
    missing, has no text, or the lookup fails for any other ordinary reason,
    ``self.profile_url`` is left untouched and nothing is raised.
    """
    try:
        profile_url_xpath = etree.XPath('site-standard-profile-request/url')
        self.profile_url = profile_url_xpath(self.xml)[0].text.strip()
    except Exception:
        # Deliberately best effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
match = re.search(args.pattern, obj) #debug("object ",obj,ife(match, " matches"," does not match")) if ((args.exclude and (match == None)) or (not args.exclude and (match != None))): exportObject(obj, args, prefix, extension, infile) elif (xpath_mode): from lxml import etree message("exporting from " + infile + " all objects " + ife(args.exclude, 'not ', '') + "matching " + args.xpath) parser = etree.XMLParser() #ns_clean=True) intree = etree.parse(infile, parser) if (len(parser.error_log) > 0): message("Could not parse ", infile, ":") debug(error_log) find = etree.XPath( "(" + args.xpath + ")/@id", namespaces=xpath_namespaces) #find the ids, not the objects objects = find(intree) message("found %i objects matching XPath" % len(objects)) if (not args.exclude): #include mode for obj in objects: exportObject(obj, args, prefix, extension, infile) else: #exclude mode objects_all = subprocess.check_output( [args.inkscape, "--query-all", infile]) #message(objects) for obj in objects_all.splitlines(): obj = obj.split(',')[0] #keep only ID: if not (obj in objects): exportObject(obj, args, prefix, extension, infile)
def get_educations(self):
    """Parse every ``educations/education`` node into ``self.educations``.

    Each matching element under ``self.xml`` is serialised with
    ``etree.tostring``, handed to ``lixml.LinkedInXMLParser`` and the
    parser's ``results`` are appended in document order.
    """
    education_nodes = etree.XPath('educations/education')(self.xml)
    for education_node in education_nodes:
        serialized = etree.tostring(education_node)
        parsed = lixml.LinkedInXMLParser(serialized).results
        self.educations.append(parsed)
import ujson from collections import defaultdict, OrderedDict from lxml import etree import csv from .entry_data import word_dict NANORI = etree.XPath('./reading_meaning//nanori') ON_READINGS = etree.XPath('./reading_meaning//reading[@r_type="ja_on"]') KUN_READINGS = etree.XPath('./reading_meaning//reading[@r_type="ja_kun"]') MEANINGS = etree.XPath('./reading_meaning//meaning[not(@m_lang)]') GRADE = etree.XPath('./misc/grade') STROKE_COUNT = etree.XPath('./misc/stroke_count') CODEPOINT = etree.XPath('.//cp_value[@cp_type="ucs"]') JLPT = etree.XPath('./misc/jlpt') LITERAL = etree.XPath('literal') def nanori(character): readings = NANORI(character) return [reading.text for reading in readings] def on_readings(character): readings = ON_READINGS(character) return [reading.text for reading in readings] def kun_readings(character): readings = KUN_READINGS(character)