Beispiel #1
0
    def strip_full_message_quoting_html(message_body):
        """Strip the quoted (replied-to) message from an HTML email body.

        Assumes any encoding conversions have already been done.

        The body is parsed with lxml, then a series of mail-client specific
        heuristics is tried in order (GMail, modern Apple Mail, old French
        Apple Mail, structured Outlook, unstructured Outlook, Thunderbird).
        The first heuristic that applies removes the quote from the parsed
        tree and the re-serialized document is returned immediately.

        :param message_body: HTML source of the message.
        :returns: the re-serialized HTML as a unicode string, with the quote
            removed when one of the heuristics applied; ``""`` when lxml
            raises ``ParserError`` for the input.
        """
        # Tools that are most useful when developing this:
        #http://www.motobit.com/util/quoted-printable-decoder.asp
        #http://www.freeformatter.com/html-formatter.html
        #http://www.freeformatter.com/xpath-tester.html#ad-output

        # NOTE(review): ``debug`` is assigned but never read in this function.
        debug = True
        from lxml import html, etree

        doc = None
        try:
            doc = html.fromstring(message_body)
        except etree.ParserError:
            # If the parsed HTML document is empty, lxml raises
            # "ParserError: Document is empty". The stripped message is then
            # an empty string (letting the exception propagate would block
            # the SourceReader).
            return ""

        #Strip GMail quotes
        # Only the first element carrying the 'gmail_quote' class is
        # considered; it is left in place when its text announces a forwarded
        # message, in which case the later heuristics are tried instead.
        matches = doc.find_class('gmail_quote')
        if len(matches) > 0:
            if not matches[
                    0].text or "---------- Forwarded message ----------" not in matches[
                        0].text:
                matches[0].drop_tree()
                return html.tostring(doc, encoding="unicode")

        #Strip modern Apple Mail quotes
        # Selects the grandparent of a <br class="Apple-interchange-newline">
        # that is followed by a sibling <blockquote type="cite">.
        find = etree.XPath(
            r"//child::blockquote[contains(@type,'cite')]/preceding-sibling::br[contains(@class,'Apple-interchange-newline')]/parent::node()/parent::node()"
        )
        matches = find(doc)
        #log.debug(len(matches))
        #for index,match in enumerate(matches):
        #    log.debug("Match: %d: %s " % (index, html.tostring(match, encoding="unicode")))
        # The quote is only removed when the match is unambiguous (exactly
        # one node); the same rule applies to every heuristic below.
        if len(matches) == 1:
            matches[0].drop_tree()
            return html.tostring(doc, encoding="unicode")

        #Strip old AppleMail quotes (french)
        # Namespace of the EXSLT regular-expression extension functions.
        regexpNS = "http://exslt.org/regular-expressions"
        ##Trying to match:  Le 6 juin 2011 à 11:02, Jean-Michel Cornu a écrit :
        # Selects the parent of the <div> holding that announcement line when
        # the div is followed by a sibling Apple-interchange-newline <br>.
        find = etree.XPath(
            r"//child::div[re:test(text(), '^.*Le .*\d{4} .*:\d{2}, .* a .*crit :.*$', 'i')]/following-sibling::br[contains(@class,'Apple-interchange-newline')]/parent::node()",
            namespaces={'re': regexpNS})
        matches = find(doc)
        if len(matches) == 1:
            matches[0].drop_tree()
            return html.tostring(doc, encoding="unicode")

        #Strip Outlook quotes (when outlook gives usable structure)
        # Selects a <blockquote> directly under <body> that has a child <div>
        # whose class contains 'OutlookMessageHeader'.
        find = etree.XPath(
            r"//body/child::blockquote/child::div[contains(@class,'OutlookMessageHeader')]/parent::node()"
        )
        matches = find(doc)
        if len(matches) == 1:
            matches[0].drop_tree()
            return html.tostring(doc, encoding="unicode")

        #Strip Outlook quotes (when outlook gives NO usable structure)
        # One alternation per expected header line (English | French); the
        # header block is a <div> having a descendant matching each of them.
        successiveStringsToMatch = [
            '|'.join(['^From:.*$', '^De :.*$']),
            '|'.join(['^Sent:.*$', '^Envoy.+ :.*$']),
            '|'.join(
                ['^To:.*$', '^.+:.*$']
            ),  #Trying to match À, but unicode is really problematic in lxml regex
            '|'.join(['^Subject:.*$', '^Objet :.*$']),
        ]
        regexpNS = "http://exslt.org/regular-expressions"
        # Turn every alternation into a "descendant matches this regex"
        # predicate, then AND the predicates together.
        successiveStringsToMatchRegex = []
        for singleHeaderLanguageRegex in successiveStringsToMatch:
            successiveStringsToMatchRegex.append(
                r"descendant::*[re:test(text(), '" +
                singleHeaderLanguageRegex + "')]")

        regex = " and ".join(successiveStringsToMatchRegex)
        find = etree.XPath(r"//descendant::div[" + regex + "]",
                           namespaces={'re': regexpNS})
        matches = find(doc)
        if len(matches) == 1:
            # Everything that follows the header div (its following siblings)
            # is treated as the quoted body.
            findQuoteBody = etree.XPath(r"//descendant::div[" + regex +
                                        "]/following-sibling::*",
                                        namespaces={'re': regexpNS})
            quoteBodyElements = findQuoteBody(doc)
            for quoteElement in quoteBodyElements:
                #This moves the text to the tail of matches[0]
                quoteElement.drop_tree()
            # Discard the tail text accumulated above, then drop the header
            # div itself.
            matches[0].tail = None
            matches[0].drop_tree()
            return html.tostring(doc, encoding="unicode")

        #Strip Thunderbird quotes
        # A <blockquote> whose type contains 'cite' and that has a cite
        # attribute.
        mainXpathFragment = "//child::blockquote[contains(@type,'cite') and boolean(@cite)]"
        find = etree.XPath(mainXpathFragment + "/self::blockquote")
        matches = find(doc)
        if len(matches) == 1:
            matchQuoteAnnounce = doc.xpath(mainXpathFragment +
                                           "/preceding-sibling::*")
            # The quote is only stripped when the blockquote has at least one
            # preceding sibling; the tail text of the closest one is cleared
            # as well.
            if len(matchQuoteAnnounce) > 0:
                matchQuoteAnnounce[-1].tail = None
                matches[0].drop_tree()
                return html.tostring(doc, encoding="unicode")

        #Nothing was stripped...
        return html.tostring(doc, encoding="unicode")
 def __init__(self, data, xml):
     """Store the XML node, parse ``data`` and populate the update content.

     :param data: passed unchanged to ``self.parse_data``.
     :param xml: stored on the instance as ``self.xml``.
     """
     self.xml = xml
     self.update_key = None
     self.parse_data(data)
     # Pre-compiled XPath for the question title; presumably consumed by
     # set_update_content() right below -- TODO confirm.
     self.question_title_xpath = etree.XPath('update-content/question/title')
     self.set_update_content()
 def get_positions(self):
     """Parse each ``positions/position`` node of ``self.xml``.

     Every matching node is serialized, run through
     ``lixml.LinkedInXMLParser`` and the parser's ``results`` are appended
     to ``self.positions``.
     """
     find_positions = etree.XPath('positions/position')
     for position_node in find_positions(self.xml):
         serialized = etree.tostring(position_node)
         parsed = lixml.LinkedInXMLParser(serialized).results
         self.positions.append(parsed)
Beispiel #4
0
import sys
import os
import Image
from StringIO import StringIO
import subprocess

from lxml import etree
import urllib2
import util

# Locations of bundled resources, resolved through util.resource_filename
# (presumably the Saxon jar and the MathML-to-SVG stylesheet, judging by the
# file names -- confirm against util).
SAXON_PATH = util.resource_filename('lib', 'saxon9he.jar')
MATH2SVG_PATH = util.resource_filename('xslt2', 'math2svg-in-docbook.xsl')

# Built by util.makeXsl from 'moduledbk2book.xsl' (presumably a ready-to-use
# XSLT transform -- confirm against util.makeXsl).
DOCBOOK_BOOK_XSL = util.makeXsl('moduledbk2book.xsl')

# Pre-compiled XPath queries; prefixes are resolved through util.NAMESPACES.
# All mml:math elements anywhere in the document.
MATH_XPATH = etree.XPath('//mml:math', namespaces=util.NAMESPACES)
# db:imagedata elements that have an svg:svg child.
DOCBOOK_SVG_IMAGE_XPATH = etree.XPath('//db:imagedata[svg:svg]',
                                      namespaces=util.NAMESPACES)
# svg:svg children of the context node.
DOCBOOK_SVG_XPATH = etree.XPath('svg:svg', namespaces=util.NAMESPACES)
# db:imagedata elements that carry a fileref attribute.
DOCBOOK_IMAGE_XPATH = etree.XPath('//db:imagedata[@fileref]',
                                  namespaces=util.NAMESPACES)

# -----------------------------
# Transform Structure:
#
# Every transform takes in 3 arguments:
# - xml doc
# - dictionary of files (string name, string bytes)
# - optional dictionary of parameters (string, string)
#
# Every transform returns:
Beispiel #5
0
 def xpath(self, xml_string, *xpath_args, **xpath_kwargs):
     """Parse *xml_string* and yield each result of the given XPath query.

     The remaining positional and keyword arguments are forwarded to
     ``etree.XPath``. Being a generator, nothing is parsed or compiled
     until the first item is requested.
     """
     root = etree.fromstring(xml_string)
     query = etree.XPath(*xpath_args, **xpath_kwargs)
     matches = query(root)
     for match in matches:
         yield match
Beispiel #6
0
 def count_divs(tree):
     """Store the number of ``<div>`` elements of *tree* on ``TestBlockifier.div_count``."""
     matching_divs = etree.XPath("//div")(tree)
     TestBlockifier.div_count = len(matching_divs)
Beispiel #7
0
def main():
    """Walk every SVG font listed in ``font-family.json`` and emit its glyphs.

    For each ``(font_file, font_family)`` pair the SVG font is parsed and all
    ``<glyph>`` elements whose ``unicode`` attribute is exactly one character
    long are visited. Private-use code points are skipped. The first time a
    code point with outline data (a ``d`` attribute) is seen it is handed to
    ``emit``; when an already known code point shows up for a further font
    family it is recorded through ``emit_sql``. ``finish`` is called once at
    the end, and also before re-raising when the run is interrupted.

    Entries whose file name starts with ``#`` are treated as commented out.
    Fonts that cannot be opened or parsed are logged and skipped.
    """
    with open('font-family.json') as _fontlist:
        fontlist = json.load(_fontlist)

    ffGlyphXPath = etree.XPath('//glyph[string-length(@unicode) = 1]')

    blocks = get_blocks()

    len_fontlist = len(fontlist)
    cps = load_cache()
    counter = 0

    # Count fonts from 1 so the progress reads "1/N" .. "N/N", consistent
    # with the 1-based glyph counter logged below.
    for item, (font_file, font_family) in enumerate(fontlist, 1):
        # startswith() rather than font_file[0]: an empty file name must not
        # raise IndexError; it falls through to open(), which reports it.
        if font_file.startswith("#"):
            logger.warning("Skipping font {}".format(font_file[1:]))
            continue

        logger.info("Handling font {} ({}/{})".format(font_file, item, len_fontlist))

        try:
            with open(font_file) as svg:
                r = etree.parse(svg)
        except IOError:
            logger.warning('Could not open %s' % font_file)
            continue
        except etree.XMLSyntaxError:
            logger.warning('Could not parse %s, no XML' % font_file)
            continue
        glyphs = ffGlyphXPath(r)
        len_glyphs = len(glyphs)

        for i, glyph in enumerate(glyphs):
            cp = glyph.get("unicode")
            ocp = ord(cp)
            if 0xE000 <= ocp <= 0xF8FF or \
               0xF0000 <= ocp <= 0x10FFFF:
                # private use areas. Skip.
                continue
            d = glyph.get("d", False)

            for blk in blocks:
                # blk[1] and blk[2] are used as the inclusive code point
                # bounds of the block.
                if not blk[1] <= ocp <= blk[2]:
                    continue

                try:
                    if d and ocp not in cps:
                        counter += 1
                        logger.info("  | Glyph {:3d}/{} ({}) of {}".format(i+1, len_glyphs,
                            cp.encode('utf-8') if ocp >= 32 else '?', font_file))
                        emit(cp, d, font_family, blk, glyph)
                        cps[ocp] = [ font_family ]
                    elif ocp in cps and font_family not in cps[ocp]:
                        logger.debug("  | Glyph {:3d}/{} ({}) of {}".format(i+1, len_glyphs,
                            cp.encode('utf-8') if ocp >= 32 else '?', font_file))
                        emit_sql(cp, font_family, 0)
                        cps[ocp].append(font_family)
                except (KeyboardInterrupt, SystemExit):
                    # Forget the code point that was being processed, write
                    # out what has been collected so far, then propagate.
                    if cps and ocp and ocp in cps:
                        del cps[ocp]
                    logger.warning('Shutting down, creating fonts and reports.')
                    finish(cps, blocks, counter)
                    raise

                # Only the first block containing the code point is used.
                break
            else:
                # The loop ended without ``break``: no block contains ocp.
                logger.warning('No block found for U+{:04X}: not processed.'.format(ocp))

    finish(cps, blocks, counter)
repl_ind_inv = r'[\1]'
repl_dot_inv = '.'
repl_str_inv = '*'
repl_qtm_inv = '?'
repl_emm_inv = '!'

# Regular expressions to match attributes and indices within valid XPaths.
# The closing ``\]`` is kept in a raw string: inside a normal string literal
# ``\]`` is an invalid escape sequence, which newer Python versions warn about.
re_atr = re.compile(r'\[@' + pttrn_attr_name + "=['\"]" + pttrn_attr_val + "['\"]" + r'\]')
re_ind = re.compile(r'\[([0-9]+?)\]')

# Regular expressions to match attributes and indices within OpenMDAO variables transformed from xpaths
re_atr_inv = re.compile(r':_:' + pttrn_attr_val + ':_:' + pttrn_attr_val + r'(?=/|$|:)')
re_ind_inv = re.compile(r':_:_([0-9]+?)(?=/|$)')

# Shared XML parser (drops ignorable whitespace, UTF-8 input) and a
# pre-compiled query selecting every text node of a document.
parser = etree.XMLParser(remove_blank_text=True, encoding='utf-8')
find_text = etree.XPath('//text()')


def xpath_to_param(xpath):
    # type: (str) -> str
    """Convert an XML XPath to a valid ``OpenMDAO`` parameter name.

    Parameters
    ----------
        xpath : str
            XPath to convert.

    Returns
    -------
        str
            Valid ``OpenMDAO`` parameter name.
Beispiel #9
0
def XPath(expr):
    """Return the compiled XPath for *expr*, compiling it at most once.

    Compiled expressions are memoized in ``xpath_cache`` and are created
    with the ``NS_MAP`` namespace mapping.
    """
    compiled = xpath_cache.get(expr, None)
    if compiled is not None:
        return compiled
    compiled = etree.XPath(expr, namespaces=NS_MAP)
    xpath_cache[expr] = compiled
    return compiled
Beispiel #10
0
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}transChange',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}salute',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}signed',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}closer',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}speech',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}speaker',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}list',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}item',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}table',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}head',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}row',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}cell',
    '{http://www.bibletechnologies.net/2003/OSIS/namespace}caption')

# Precompile a few XPath queries (the ``ns`` prefix is resolved through NS).
# ns:verse children of every ns:chapter that is the first ns:chapter child
# of its parent.
verse_in_chapter = etree.XPath('//ns:chapter[1]/ns:verse', namespaces=NS)
# Text nodes directly inside every ns:verse that is the first ns:verse child
# of its parent.
text_in_verse = etree.XPath('//ns:verse[1]/text()', namespaces=NS)


class OSISBible(BibleImport):
    """
    `OSIS <http://www.bibletechnologies.net/>`_ Bible format importer class.
    """
    def process_books(self, bible_data):
        """
        Extract and create the bible books from the parsed xml

        :param bible_data: parsed xml
        :return: None
        """
        # Find books in the bible
Beispiel #11
0
# Queue of absolute page URLs collected by a().
seenpage = queue.Queue()
# Site root; the hrefs found on the page are appended to it.
url = 'http://www.bttiantang.com'


def a(node):
    """Record *node*'s link when its href matches the page pattern and is new.

    A new href is stored in ``seen`` (mapped to the link text) and the
    absolute URL (``url`` + href) is put on the ``seenpage`` queue.
    """
    href = node.attrib['href']
    # NOTE(review): in this regex '?' makes the slash optional, so any href
    # containing "Page" matches; if a literal "/?Page" was intended the '?'
    # would need escaping -- confirm before changing.
    if not re.search('/?Page', href):
        return
    if href in seen:
        return
    seen[href] = node.text
    seenpage.put(url + href)


#        return(node.attrib['href'])

# Fetch the start page and parse it as HTML.
web = request.urlopen(url)
html = web.read().decode('utf-8')
HTML = etree.HTML(html)
# Collect every <a> element that has an href attribute.
find = etree.XPath("//a[@href]")
nodelist = find(HTML)
# Run a() over the links (in reverse order) on a pool of 4 workers.
# NOTE(review): if Pool is multiprocessing.Pool, the updates a() makes to
# `seen` / `seenpage` happen in the worker processes and would not be visible
# here -- confirm which Pool is imported.
pool = Pool(4)
pool.map(a, nodelist[::-1])
pool.close()
pool.join()
#print(results)
# Drain and print the collected page URLs, then the href -> link text map.
while (not seenpage.empty()):
    print(seenpage.get())
print(seen)
'''
for node in nodelist:
    if re.search('/?Page', node.attrib['href']):
        print(node.attrib['href'])
'''
Beispiel #12
0
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
"""Helper functions for working with XML."""

import re
import six

from lxml import etree

# Some useful pre-compiled XPath expressions. Each one is evaluated relative
# to the node it is called with.

# The context node and its ancestors that carry xml:space="preserve".
xml_preserve_ancestors = etree.XPath(
    "ancestor-or-self::*[attribute::xml:space='preserve']")
"""All ancestors with xml:space='preserve'"""

# The xml:space attributes found on the context node and its ancestors.
xml_space_ancestors = etree.XPath("ancestor-or-self::*/attribute::xml:space")
"""All xml:space attributes in the ancestors"""

# XPath string() of the context node.
string_xpath = etree.XPath("string()")
"""Return a non-normalized string in the node subtree"""

# XPath normalize-space() of the context node.
string_xpath_normalized = etree.XPath("normalize-space()")
"""Return a (space) normalized string in the node subtree"""


def getText(node, xml_space="preserve"):
    """Extracts the plain text content out of the given node.
Beispiel #13
0
def rows(table):
    """Yield one list of cell texts per ``<tr>`` child of *table*.

    Each yielded list represents a ``<tr>``; each string in it is the
    ``text_content()`` of one of that row's ``<td>``/``<th>`` children.
    """
    # Compile both queries once per call instead of recompiling the cell
    # query for every single row.
    find_rows = etree.XPath('tr')
    find_cells = etree.XPath('td | th')
    for tr in find_rows(table):
        yield [col.text_content() for col in find_cells(tr)]
Beispiel #14
0
    def retrieve_new_documents(self, limit=100):
        """Retrieves new documents using the EURLex search.

        Search result pages are requested one after the other (the query
        string asks for descending order on the ``DD`` field). Crawling stops
        as soon as ``limit`` new documents were stored, a document that was
        crawled before is reached, or a result page contains no entries.

        Args:
            limit (int): the maximum number of documents that should be
                retrieved.

        Returns:
            int: the number of newly found documents
        """
        today = dt.datetime.combine(dt.date.today(), dt.time.min)

        entry_path = etree.XPath("//div[@class = 'SearchResult']")
        date_path = etree.XPath("""
            .//dl/dd[preceding-sibling::dt[contains(text(), 'Date') or
                                           contains(text(), 'Datum')]]/text()
            """)
        doc_path = etree.XPath("""
            .//ul[contains(@class, 'SearchResultDoc')]/li
            /a[contains(@href, 'PDF') or contains(@href, 'HTML')]/@href
            """)
        title_path = etree.XPath(".//h2/a[@class = 'title']/text()")
        detail_path = etree.XPath(".//h2/a[@class = 'title']/@href")

        timestamp = int(round(time.time() * 1000))
        # Only the middle literal is an f-string; the trailing "{}" survives
        # as a placeholder that is filled with the page number below.
        url_tmpl = ("https://eur-lex.europa.eu/search.html?lang=de&qid="
                    f"{timestamp}&type=quick&scope=EURLEX&sortOneOrder=desc"
                    "&sortOne=DD&locale=de&page={}")

        has_unseen_documents = True
        doc_count = 0
        page = 1

        while (doc_count < limit) and has_unseen_documents:
            search_url = url_tmpl.format(page)
            logging.info(f"Crawling page '{search_url}' (page {page})")
            res = _retry_connection(search_url, "get")
            tree = html.fromstring(res.content)

            entries = entry_path(tree)
            if not entries:
                # Past the last result page. Without this check the loop
                # would keep requesting empty pages forever whenever fewer
                # than ``limit`` new documents exist.
                logging.info(f"No search results on page {page}. Stopping.")
                break

            for entry in entries:
                if not isinstance(entry, list):
                    entry = [entry]

                # Documents without a parsable date keep datetime.min.
                doc_date = dt.datetime.min
                date_strings = _flat_map(date_path, entry)
                if date_strings:
                    match = re.search(r"(\d+\/\d+\/\d+)", date_strings[0])
                    if match:
                        doc_date = dt.datetime.strptime(match[1], "%d/%m/%Y")

                # Entries without a PDF/HTML link are skipped.
                doc_links = _flat_map(doc_path, entry)
                if len(doc_links) == 0:
                    continue
                link = _make_resource_path(
                    doc_links[0], "https://eur-lex.europa.eu")
                detail = _make_resource_path(
                    _flat_map(detail_path, entry)[0],
                    "https://eur-lex.europa.eu")
                title = _flat_map(title_path, entry)[0]

                doc = {
                    "url": link,
                    "detail_url": detail,
                    "date": doc_date,
                    "title": title,
                    "crawl_date": today
                }

                logging.debug(f"Process Document: {link} - {doc_date.date()}")

                # A single lookup both answers "was it crawled before?" and
                # returns the stored document, so the two cannot disagree.
                duplicate_doc = self.collection.find_one({"url": link})

                if duplicate_doc is not None:
                    logging.debug(f"Document was crawled before: '{link}'")
                    # check whether this document had a date before the crawl
                    # date, if not, break.
                    if duplicate_doc["date"] >= duplicate_doc["crawl_date"]:
                        logging.debug("Document date lies in the future."
                                      " Continue...")
                        continue

                    logging.debug("Break!")
                    has_unseen_documents = False
                    break

                logging.debug(f"Found new document: {link}.")
                self.collection.insert_one(doc)
                doc_count += 1
                if doc_count >= limit:
                    # Honour ``limit`` inside a page as well, not only
                    # between pages.
                    break
            page += 1
        logging.info(f"Found {doc_count} new or potentially modified docs.")
        return doc_count
Beispiel #15
0
            _resources_dir, "xsl", "iso-schematron-xslt1", "iso_abstract_expand.xsl"
        )
    )
)
# XSLT transform loaded from the bundled iso_svrl_for_xslt1.xsl stylesheet.
iso_svrl_for_xslt1 = _etree.XSLT(
    _etree.parse(
        os.path.join(
            _resources_dir, "xsl", "iso-schematron-xslt1", "iso_svrl_for_xslt1.xsl"
        )
    )
)


# svrl result accessors
# Selects every svrl:failed-assert element of a result document.
svrl_validation_errors = _etree.XPath(
    "//svrl:failed-assert", namespaces={"svrl": SVRL_NS}
)


# RelaxNG validator for schematron schemas
schematron_schema_valid = _etree.RelaxNG(
    file=os.path.join(_resources_dir, "rng", "iso-schematron.rng")
)


def stylesheet_params(**kwargs):
    """Convert keyword args to a dictionary of stylesheet parameters.
    XSL stylesheet parameters must be XPath expressions, i.e.:

    * string expressions, like "'5'"
    * simple (number) expressions, like "5"
Beispiel #16
0
def XPath(x):
    """Compile *x* with the ``XPNSMAP`` namespaces.

    A syntactically invalid expression is reported as ``ConversionError``
    instead of lxml's ``XPathSyntaxError``.
    """
    try:
        compiled = etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError:
        message = 'The syntax of the XPath expression %s is invalid.' % (
            repr(x),)
        raise ConversionError(message)
    return compiled
Beispiel #17
0
class Schematron(_etree._Validator):
    """An ISO Schematron validator.

    Pass a root Element or an ElementTree to turn it into a validator.
    Alternatively, pass a filename as keyword argument 'file' to parse from
    the file system.

    Schematron is a less well known, but very powerful schema language.
    The main idea is to use the capabilities of XPath to put restrictions on
    the structure and the content of XML documents.

    The standard behaviour is to fail on ``failed-assert`` findings only
    (``ASSERTS_ONLY``).  To change this, you can either pass a report filter
    function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
    or a custom ``XPath`` object), or subclass isoschematron.Schematron for
    complete control of the validation process.

    Built on the Schematron language 'reference' skeleton pure-xslt
    implementation, the validator is created as an XSLT 1.0 stylesheet using
    these steps:

     0) (Extract from XML Schema or RelaxNG schema)
     1) Process inclusions
     2) Process abstract patterns
     3) Compile the schematron schema to XSLT

    The ``include`` and ``expand`` keyword arguments can be used to switch off
    steps 1) and 2).
    To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
    keyword arguments ``include_params``, ``expand_params`` or
    ``compile_params``.
    For convenience, the compile-step parameter ``phase`` is also exposed as a
    keyword argument ``phase``. This takes precedence if the parameter is also
    given in the parameter dictionary.

    If ``store_schematron`` is set to True, the (included-and-expanded)
    schematron document tree is stored and available through the ``schematron``
    property.
    If ``store_xslt`` is set to True, the validation XSLT document tree will be
    stored and can be retrieved through the ``validator_xslt`` property.
    With ``store_report`` set to True (default: False), the resulting validation
    report document gets stored and can be accessed as the ``validation_report``
    property.

    Here is a usage example::

      >>> from lxml import etree
      >>> from lxml.isoschematron import Schematron

      >>> schematron = Schematron(etree.XML('''
      ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
      ...   <pattern id="id_only_attribute">
      ...     <title>id is the only permitted attribute name</title>
      ...     <rule context="*">
      ...       <report test="@*[not(name()='id')]">Attribute
      ...         <name path="@*[not(name()='id')]"/> is forbidden<name/>
      ...       </report>
      ...     </rule>
      ...   </pattern>
      ... </schema>'''),
      ... error_finder=Schematron.ASSERTS_AND_REPORTS)

      >>> xml = etree.XML('''
      ... <AAA name="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC color="ccc"/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      False

      >>> xml = etree.XML('''
      ... <AAA id="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      True
    """

    # libxml2 error categorization for validation errors
    _domain = _etree.ErrorDomains.SCHEMATRONV
    _level = _etree.ErrorLevels.ERROR
    _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT

    # convenience definitions for common behaviours
    ASSERTS_ONLY = svrl_validation_errors  # Default
    ASSERTS_AND_REPORTS = _etree.XPath(
        "//svrl:failed-assert | //svrl:successful-report", namespaces={"svrl": SVRL_NS}
    )

    def _extract(self, element):
        """Extract embedded schematron schema from non-schematron host schema.
        This method will only be called by __init__ if the given schema document
        is not a schematron schema by itself.
        Must return a schematron schema document tree or None.
        """
        schematron = None
        if element.tag == _xml_schema_root:
            schematron = self._extract_xsd(element)
        # .get() instead of indexing: a root element without a namespace has
        # prefix None and no matching nsmap entry.  Indexing would raise
        # KeyError here instead of letting __init__ report that the document
        # is not schematron-extractable.
        elif element.nsmap.get(element.prefix) == RELAXNG_NS:
            # RelaxNG does not have a single unique root element
            schematron = self._extract_rng(element)
        return schematron

    # customization points
    # etree.XSLT objects that provide the extract, include, expand, compile
    # steps
    _extract_xsd = extract_xsd
    _extract_rng = extract_rng
    _include = iso_dsdl_include
    _expand = iso_abstract_expand
    _compile = iso_svrl_for_xslt1

    # etree.xpath object that determines input document validity when applied to
    # the svrl result report; must return a list of result elements (empty if
    # valid)
    _validation_errors = ASSERTS_ONLY

    def __init__(
        self,
        etree=None,
        file=None,
        include=True,
        expand=True,
        include_params={},
        expand_params={},
        compile_params={},
        store_schematron=False,
        store_xslt=False,
        store_report=False,
        phase=None,
        error_finder=ASSERTS_ONLY,
    ):
        """Build the validating XSLT from a schema tree or file.

        See the class docstring for the meaning of the keyword arguments.

        Raises ``SchematronParseError`` if reading the schema fails, if the
        document is neither a schematron schema nor schematron-extractable,
        or if the resulting schematron schema is invalid.  Raises
        ``ValueError`` if neither ``etree`` nor ``file`` yields a root
        element.
        """
        # NOTE: the ``*_params`` dict defaults are only read (unpacked as
        # keyword arguments / passed on) in this method, never modified here.
        super(Schematron, self).__init__()

        self._store_report = store_report
        self._schematron = None
        self._validator_xslt = None
        self._validation_report = None
        if error_finder is not self.ASSERTS_ONLY:
            self._validation_errors = error_finder

        # parse schema document, may be a schematron schema or an XML Schema or
        # a RelaxNG schema with embedded schematron rules
        root = None
        try:
            if etree is not None:
                if _etree.iselement(etree):
                    root = etree
                else:
                    root = etree.getroot()
            elif file is not None:
                root = _etree.parse(file).getroot()
        except Exception:
            raise _etree.SchematronParseError(
                "No tree or file given: %s" % sys.exc_info()[1]
            )
        if root is None:
            raise ValueError("Empty tree")
        if root.tag == _schematron_root:
            schematron = root
        else:
            schematron = self._extract(root)
        if schematron is None:
            raise _etree.SchematronParseError(
                "Document is not a schematron schema or schematron-extractable"
            )
        # perform the iso-schematron skeleton implementation steps to get a
        # validating xslt
        if include:
            schematron = self._include(schematron, **include_params)
        if expand:
            schematron = self._expand(schematron, **expand_params)
        if not schematron_schema_valid(schematron):
            raise _etree.SchematronParseError(
                "invalid schematron schema: %s" % schematron_schema_valid.error_log
            )
        if store_schematron:
            self._schematron = schematron
        # add new compile keyword args here if exposing them
        compile_kwargs = {"phase": phase}
        compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
        validator_xslt = self._compile(schematron, **compile_params)
        if store_xslt:
            self._validator_xslt = validator_xslt
        self._validator = _etree.XSLT(validator_xslt)

    def __call__(self, etree):
        """Validate doc using Schematron.

        Returns true if document is valid, false if not.
        """
        self._clear_error_log()
        result = self._validator(etree)
        if self._store_report:
            self._validation_report = result
        errors = self._validation_errors(result)
        if errors:
            if _etree.iselement(etree):
                fname = etree.getroottree().docinfo.URL or "<file>"
            else:
                fname = etree.docinfo.URL or "<file>"
            for error in errors:
                # Does svrl report the line number, anywhere? Don't think so.
                self._append_log_message(
                    domain=self._domain,
                    type=self._error_type,
                    level=self._level,
                    line=0,
                    message=_etree.tostring(error, encoding="unicode"),
                    filename=fname,
                )
            return False
        return True

    @property
    def schematron(self):
        """ISO-schematron schema document (None if object has been initialized
        with store_schematron=False).
        """
        return self._schematron

    @property
    def validator_xslt(self):
        """ISO-schematron skeleton implementation XSLT validator document (None
        if object has been initialized with store_xslt=False).
        """
        return self._validator_xslt

    @property
    def validation_report(self):
        """ISO-schematron validation result report (None if result-storing has
        been turned off).
        """
        return self._validation_report
Beispiel #18
0
def XPath(expr):
    """Compile *expr* with the ``h`` prefix bound to ``XHTML_NS``."""
    xhtml_namespaces = {'h': XHTML_NS}
    return etree.XPath(expr, namespaces=xhtml_namespaces)
Beispiel #19
0
from os.path import dirname, realpath
from lxml import html, etree

# Despite its name, fileDir is the resolved path of this script itself;
# rootDir is therefore the parent of the directory containing the script.
fileDir = realpath(__file__)
rootDir = dirname(dirname(fileDir))
dataFile = rootDir + '/pages/data/138222.html'

# Parse the saved page and print the text nodes of every <h1><a> link.
root = html.parse(dataFile)
find_text = etree.XPath("//h1/a/text()")
title = find_text(root)
print(title)


def divider():
    """Print a rule of 20 dashes followed by an empty line."""
    rule = "-" * 20
    print(rule + "\n")


# About Sections
# <section> ancestors of the <h2> whose text is exactly "About this job".
aboutSect = root.xpath('.//h2[text()="About this job"]/ancestor::section')
# print(aboutSect[0])
# Wrap the first match in its own ElementTree so that later queries are
# rooted at that section (IndexError if the heading was not found).
aboutRoot = etree.ElementTree(aboutSect[0])
# print(etree.tostring(aboutRoot))
# Nearest <div> ancestor of the <span> whose text contains "Job type".
jobTypeSpan = aboutRoot.xpath(
    './/span[contains(text(),"Job type")]/ancestor::div[position()=1]')
# print(jobTypeSpan)
# print(etree.tostring(jobTypeSpan[0]))

jobTypeRoot = etree.ElementTree(jobTypeSpan[0])

# The second <span> text node is printed as the job type; presumably the
# first one is the "Job type" label itself -- confirm against the page.
jobType = jobTypeRoot.xpath('//span/text()')
print("Job Type: " + jobType[1])
Beispiel #20
0
import lxml.etree as etree
from lxml.etree import iselement
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url

from scrapy.link import Link
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list, to_native_str
from scrapy.utils.response import get_base_url
from scrapy.linkextractors import FilteringLinkExtractor

# from lxml/src/lxml/html/__init__.py
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath ``string()``: evaluates to the string value of the node
# it is called with.
_collect_string_content = etree.XPath("string()")


class PartialParserLinkExtract(LxmlParserLinkExtractor):
    def _extract_links(self, selector, response_url, response_encoding,
                       base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        selector_root = selector
        if not iselement(selector_root):
            selector_root = selector.root
        for el, attr, attr_val in self._iter_links(selector_root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                if self.strip:
                    attr_val = strip_html5_whitespace(attr_val)
Beispiel #21
0
class WQXMapper:
    '''
    Converts WQX XML into tabular structures (a dict of lists, a list of
    dicts, or a pandas.DataFrame).

    All XPath expressions are compiled once, when this class body executes,
    from the module-level "context_xpaths" and "val_xpaths" mappings and the
    namespace map "ns".
    '''

    # ---------- dictionary of precompiled XPath query expressions
    #            for retrieving logical context nodes (matches
    #            wqx_mappings.context_xpaths):
    context_xpaths_compl = {}
    for nodename in context_xpaths:
        context_xpaths_compl[nodename] = etree.XPath(context_xpaths[nodename],
                                                     namespaces=ns)

    # ---------- precompiled XPath query expressions ('nodeq') for retrieving
    #            Logical Node nodesets.
    #            These assignments replace any entries with the same keys
    #            that were compiled from context_xpaths above.

    # relative expression from root
    # organizations
    context_xpaths_compl['org'] = etree.XPath('/wqx:WQX/wqx:Organization',
                                              namespaces=ns)

    # relative expressions from organization node
    # stations
    context_xpaths_compl['station'] = etree.XPath('wqx:MonitoringLocation',
                                                  namespaces=ns)
    # activities
    context_xpaths_compl['activity'] = etree.XPath('wqx:Activity',
                                                   namespaces=ns)

    # relative expression from activity node
    # results
    context_xpaths_compl['result'] = etree.XPath('wqx:Result', namespaces=ns)

    # ---------- dictionaries of precompiled XPath query expressions
    #            for retrieving column values (keys are tabular column names):
    val_xpaths_compl = {}
    for node in context_xpaths_compl.keys():
        # NOTE: this progress message is printed once per context node when
        # the class body is executed (i.e. at import time).
        print('doing node \'' + node + '\'')
        val_xpaths_compl[node] = {}
        for colname in val_xpaths[node].keys():
            cur_node_dict = val_xpaths[node]
            # '/text()' is appended so each query yields text nodes;
            # smart_strings=False makes lxml return plain strings.
            cur_xpath = etree.XPath(cur_node_dict[colname] + '/text()',
                                    namespaces=ns,
                                    smart_strings=False)
            val_xpaths_compl[node][colname] = cur_xpath

    def make_rowpart(self, node, valq):
        '''
        Applies the column val XPath query expressions in "valq" (a dict
        keyed by column name) to the XML context node "node" and returns a
        dictionary keyed by the same column names.

        Each dictionary value is the single-space-delimited concatenation of
        all text values the column's query returned for the node (so several
        matching nodes yield several space-separated values, and no match
        yields an empty string).
        '''
        # invoke the compiled XPaths
        return {colname: ' '.join(valq[colname](node)) for colname in valq}

    def determine_table_type(self, response):
        '''
        Inspects the response .status_code and .url properties to determine
        which table type the response represents.
            - If the status code is not a 2xx, raises a BaseException.
            - If the status code is 2xx, inspects response.url to determine
              whether the resultset should be framed as a 'station' or
              'result' table (or, in future, a Biodata or Simplestation
              table).
        If the table type cannot be determined, raises a BaseException.

        Returns the table type as a string: 'station' or 'result'.
        '''
        if response.status_code < 200 or response.status_code >= 300:
            # %-formatting (rather than concatenation) so a missing reason
            # phrase (None) cannot turn this into a TypeError.
            raise BaseException('The response is not OK: status code "%s %s".'
                                % (response.status_code, response.reason))

        table_type = ''
        if 'Station/search' in response.url:
            table_type = 'station'
        elif 'Result/search' in response.url:
            table_type = 'result'

        if not table_type:
            raise BaseException(
                'Unable to determine table type from response URL "' +
                response.url + '".')
        return table_type

    def _iter_rows(self, table_type, root):
        '''
        Yields one merged row dictionary per logical table row found under
        "root".

        For table_type 'result' a row is produced for every Result of every
        Activity of every Organization; for 'station', for every
        MonitoringLocation of every Organization. Values from the more
        specific node overwrite same-named values from its ancestors.
        Any other table_type yields no rows.
        '''
        context = self.context_xpaths_compl
        values = self.val_xpaths_compl
        for org in context['org'](root):
            org_rowpart = self.make_rowpart(org, values['org'])
            if table_type == 'result':
                for activity in context['activity'](org):
                    activity_rowpart = self.make_rowpart(
                        activity, values['activity'])
                    for result in context['result'](activity):
                        this_row = dict(org_rowpart)
                        this_row.update(activity_rowpart)
                        this_row.update(
                            self.make_rowpart(result, values['result']))
                        yield this_row
            elif table_type == 'station':
                for station in context['station'](org):
                    this_row = dict(org_rowpart)
                    this_row.update(
                        self.make_rowpart(station, values['station']))
                    yield this_row

    def xml_to_dict_of_lists(self, table_type, root):
        '''
        Given a known table_type and an XML root node, this method will attempt
        to construct a tabular WQX representation of the information contained
        in the XML.

        The tabular representation is "column-first": a dictionary whose keys
        are the column names listed in tabular_defs[table_type].

        Each column's value is a list of values. The length of each list is
        equal to the number of rows that will be represented in the table.
        The values in a list are determined by the val XPath expressions,
        based on the Logical Nodes in context when the row was evaluated.

        The Lists are all the same length. When the XML does not supply a
        value, an empty string is inserted.

        A single row is determined by slicing all of the Lists at the same
        index.

        The table_type is looked up in tabular_defs before anything else, so
        a table_type that has no entry there fails on that lookup. A
        table_type that has an entry but is neither 'result' nor 'station'
        produces empty column lists.

        If the XML root is not valid WQX, behavior is not specified. This method
        does not attempt XML validation.
        '''
        columns = tabular_defs[table_type]
        datadict = {}
        for colname in columns:
            datadict[colname] = []
        for this_row in self._iter_rows(table_type, root):
            for colname in columns:
                # missing or empty values are stored as ''
                datadict[colname].append(this_row.get(colname) or '')
        return datadict

    def xml_to_list_of_dicts(self, table_type, root):
        '''
        Given a known table_type and an XML root node, this method will attempt
        to construct a tabular WQX representation of the information contained
        in the XML.

        The tabular representation is "row-first": a list of dictionaries.
        Each dict in the list corresponds to a table row. The dictionary keys
        are column names, and the values are the values extracted from the XML
        according to the context and val XPath expressions.

        The number of rows in the tabular representation is equal to the
        length of the returned list.

        If the table_type is neither 'result' nor 'station', this method
        returns an empty list.

        If the XML root is not valid WQX, behavior is not specified. This method
        does not attempt XML validation.
        '''
        return list(self._iter_rows(table_type, root))

    def make_dataframe_from_xml(self, table_type, root, columns_first=True):
        '''
        This method accepts a known table_type and an XML root node. It returns
        a pandas.DataFrame containing the tabular representation of the data
        contained in the "root" argument. The DataFrame return value will
        have columns defined as being equal to the corresponding member of
        tabular_defs, even if the columns are not populated in any of the
        records embodied in the XML root.

        "columns_first" selects the intermediate representation:
        xml_to_dict_of_lists when true, xml_to_list_of_dicts otherwise.

        Returns None when tabular_defs[table_type] is empty.
        '''
        dataframe = None
        col_defs = tabular_defs[table_type]

        if col_defs:
            if columns_first:
                data_rows = self.xml_to_dict_of_lists(table_type, root)
            else:
                data_rows = self.xml_to_list_of_dicts(table_type, root)
            dataframe = pandas.DataFrame(data=data_rows, columns=col_defs)
        return dataframe

    def make_dataframe_from_http_response(self, response, columns_first=True):
        '''
        This method accepts a requests.response HTTP Response object.
        The assumption is that this response was obtained by calling
        a WQP RESTlike service as described at
        http://www.waterqualitydata.us/webservices_documentation.jsp.

        This method
          - checks the status code, raising BaseException if not 2xx
          - attempts to identify the table_type, raising BaseException if
            the response cannot be identified as a known type
          - attempts to parse the XML content, if any
          - attempts to convert the XML content to the correct tabular form
          - returns a pandas.DataFrame containing the tabular data, or None
            when the response has no content
        '''

        retval = None
        table_type = self.determine_table_type(response)

        if table_type and response.content:
            root = etree.fromstring(response.content)
            retval = self.make_dataframe_from_xml(table_type, root,
                                                  columns_first)

        return retval
Beispiel #22
0
def main():
    """Entry point of the xml module.

    Declares the module's argument spec, checks the installed lxml version,
    loads the XML document from ``path`` or ``xmlstring``, validates the
    ``xpath`` expression, and then dispatches to the helper matching the
    requested operation (print_match, count, content, state=absent,
    set_children, add_children, value, plain xpath, pretty_print).

    NOTE(review): each helper presumably finishes the module run itself
    (exit_json/fail_json); the trailing ``fail_json`` is only reached when
    none of them did -- confirm against the helper implementations.
    """
    module = AnsibleModule(
        argument_spec=dict(
            path=dict(type='path', aliases=['dest', 'file']),
            xmlstring=dict(type='str'),
            xpath=dict(type='str'),
            namespaces=dict(type='dict', default={}),
            state=dict(type='str',
                       default='present',
                       choices=['absent', 'present'],
                       aliases=['ensure']),
            value=dict(type='raw'),
            attribute=dict(type='raw'),
            add_children=dict(type='list'),
            set_children=dict(type='list'),
            count=dict(type='bool', default=False),
            print_match=dict(type='bool', default=False),
            pretty_print=dict(type='bool', default=False),
            content=dict(type='str', choices=['attribute', 'text']),
            input_type=dict(type='str',
                            default='yaml',
                            choices=['xml', 'yaml']),
            backup=dict(type='bool', default=False),
            strip_cdata_tags=dict(type='bool', default=False),
            insertbefore=dict(type='bool', default=False),
            insertafter=dict(type='bool', default=False),
        ),
        supports_check_mode=True,
        required_by=dict(
            add_children=['xpath'],
            attribute=['value'],
            content=['xpath'],
            set_children=['xpath'],
            value=['xpath'],
        ),
        required_if=[
            ['count', True, ['xpath']],
            ['print_match', True, ['xpath']],
            ['insertbefore', True, ['xpath']],
            ['insertafter', True, ['xpath']],
        ],
        required_one_of=[
            ['path', 'xmlstring'],
            [
                'add_children', 'content', 'count', 'pretty_print',
                'print_match', 'set_children', 'value'
            ],
        ],
        mutually_exclusive=[
            [
                'add_children', 'content', 'count', 'print_match',
                'set_children', 'value'
            ],
            ['path', 'xmlstring'],
            ['insertbefore', 'insertafter'],
        ],
    )

    xml_file = module.params['path']
    xml_string = module.params['xmlstring']
    xpath = module.params['xpath']
    namespaces = module.params['namespaces']
    state = module.params['state']
    value = json_dict_bytes_to_unicode(module.params['value'])
    attribute = module.params['attribute']
    set_children = json_dict_bytes_to_unicode(module.params['set_children'])
    add_children = json_dict_bytes_to_unicode(module.params['add_children'])
    pretty_print = module.params['pretty_print']
    content = module.params['content']
    input_type = module.params['input_type']
    print_match = module.params['print_match']
    count = module.params['count']
    backup = module.params['backup']
    strip_cdata_tags = module.params['strip_cdata_tags']
    insertbefore = module.params['insertbefore']
    insertafter = module.params['insertafter']

    # Check if we have lxml 2.3.0 or newer installed
    if not HAS_LXML:
        module.fail_json(msg=missing_required_lib("lxml"),
                         exception=LXML_IMP_ERR)
    else:
        # Build the version object once; it is compared against two bounds.
        lxml_version = LooseVersion('.'.join(
            to_native(f) for f in etree.LXML_VERSION))
        if lxml_version < LooseVersion('2.3.0'):
            module.fail_json(
                msg=
                'The xml ansible module requires lxml 2.3.0 or newer installed on the managed machine'
            )
        elif lxml_version < LooseVersion('3.0.0'):
            module.warn(
                'Using lxml version lower than 3.0.0 does not guarantee predictable element attribute order.'
            )

    # Check if the file exists
    if xml_string:
        infile = BytesIO(to_bytes(xml_string, errors='surrogate_or_strict'))
    elif xml_file and os.path.isfile(xml_file):
        # The truthiness guard keeps os.path.isfile() from being called with
        # None when xmlstring was given but empty and no path was set.
        infile = open(xml_file, 'rb')
    else:
        module.fail_json(msg="The target XML source '%s' does not exist." %
                         xml_file)

    # Parse and evaluate xpath expression
    if xpath is not None:
        try:
            # Compiled only to validate the expression; the result is unused.
            etree.XPath(xpath)
        except etree.XPathSyntaxError as e:
            module.fail_json(msg="Syntax error in xpath expression: %s (%s)" %
                             (xpath, e))
        except etree.XPathEvalError as e:
            module.fail_json(
                msg="Evaluation error in xpath expression: %s (%s)" %
                (xpath, e))

    # Try to parse in the target XML file
    try:
        parser = etree.XMLParser(remove_blank_text=pretty_print,
                                 strip_cdata=strip_cdata_tags)
        doc = etree.parse(infile, parser)
    except etree.XMLSyntaxError as e:
        module.fail_json(msg="Error while parsing document: %s (%s)" %
                         (xml_file or 'xml_string', e))
    finally:
        # The document is fully read by etree.parse(); release the handle
        # whether parsing succeeded or failed.
        infile.close()

    # Ensure we have the original copy to compare
    global orig_doc
    orig_doc = copy.deepcopy(doc)

    if print_match:
        do_print_match(module, doc, xpath, namespaces)

    if count:
        count_nodes(module, doc, xpath, namespaces)

    if content == 'attribute':
        get_element_attr(module, doc, xpath, namespaces)
    elif content == 'text':
        get_element_text(module, doc, xpath, namespaces)

    # File exists:
    if state == 'absent':
        # - absent: delete xpath target
        delete_xpath_target(module, doc, xpath, namespaces)

    # - present: carry on

    # children && value both set?: should have already aborted by now
    # add_children && set_children both set?: should have already aborted by now

    # set_children set?
    if set_children:
        set_target_children(module, doc, xpath, namespaces, set_children,
                            input_type)

    # add_children set?
    if add_children:
        add_target_children(module, doc, xpath, namespaces, add_children,
                            input_type, insertbefore, insertafter)

    # No?: Carry on

    # Is the xpath target an attribute selector?
    if value is not None:
        set_target(module, doc, xpath, namespaces, attribute, value)

    # If an xpath was provided, we need to do something with the data
    if xpath is not None:
        ensure_xpath_exists(module, doc, xpath, namespaces)

    # Otherwise only reformat the xml data?
    if pretty_print:
        make_pretty(module, doc)

    module.fail_json(msg="Don't know what to do")
Beispiel #23
0
 def find_id(self, element_id):
     """Return the first element whose ``id`` attribute equals *element_id*.

     The match is searched from ``self.root`` and wrapped in a
     ``FigureElement``. Raises ``IndexError`` when nothing matches.
     """
     by_id = etree.XPath("//*[@id=$id]")
     matches = by_id(self.root, id=element_id)
     return FigureElement(matches[0])
Beispiel #24
0

def trans_mat_from_xml(elem):
    """
    Build a transformation matrix from a Coord_System_Definition xml element.

    Reads the ``value`` attributes found beneath the children named
    'position', 'local_x' and 'local_y', converts each group to a float
    array, and hands them to ``matrix_from_nx_ny_ro``.
    """
    queries = (
        r"./*[@name = 'position']//@value",
        r"./*[@name = 'local_x']//@value",
        r"./*[@name = 'local_y']//@value",
    )
    # Evaluated in the same order as listed: origin first, then the two axes.
    r_o, n_x, n_y = [
        np.array([float(raw) for raw in elem.xpath(query)])
        for query in queries
    ]
    return matrix_from_nx_ny_ro(n_x, n_y, r_o)


# Precompiled XPath: beneath the context node, find any element whose Name
# attribute equals the $n variable and select its Value/ValueExpression/Value
# descendant path. Call as xpath_val(elem, n="...").
xpath_val = etree.XPath(r".//*[@Name = $n]/Value/ValueExpression/Value")


def trans_mat_from_avm_xml(elem):
    """
    Take an xml element that represents a Coord_System_Definition and return a trans matrix.
    """
    r_o = np.array([
        float(f)
        for f in xpath_val(elem, n="position")[0].text[1:-1].split(",")
    ])
    n_x = np.array([
        float(f) for f in xpath_val(elem, n="local_x")[0].text[1:-1].split(",")
    ])
    n_y = np.array([
        float(f) for f in xpath_val(elem, n="local_y")[0].text[1:-1].split(",")
Beispiel #25
0
    def parse_google_card(self, node):
        """Turn a Google result "card" node into a ``discord.Embed``.

        The node is probed, in order, for these card layouts: calculator,
        unit conversion, currency conversion, generic information,
        translation, "time in" (two variants), definition and weather.
        The first layout that matches decides the result.

        Returns the populated embed, or ``None`` when the node matches no
        known layout or when a matched layout is missing expected pieces.
        """
        e = discord.Embed(colour=discord.Colour.blurple())

        # check if it's a calculator card:
        calculator = node.find(".//span[@class='cwclet']")
        if calculator is not None:
            e.title = 'Calculator'
            result = node.find(".//span[@class='cwcot']")
            if result is not None:
                result = ' '.join((calculator.text, result.text.strip()))
            else:
                result = calculator.text + ' ???'
            e.description = result
            return e

        # check for unit conversion card

        unit_conversions = node.xpath(".//input[contains(@class, '_eif') and @value]")
        if len(unit_conversions) == 2:
            e.title = 'Unit Conversion'

            # the <input> contains our values, first value = second value essentially.
            # these <input> also have siblings with <select> and <option selected=1>
            # that denote what units we're using

            # We will get 2 <option selected="1"> nodes by traversing the parent
            # The first unit being converted (e.g. Miles)
            # The second unit being converted (e.g. Feet)

            xpath = etree.XPath("parent::div/select/option[@selected='1']/text()")
            try:
                first_node = unit_conversions[0]
                first_unit = xpath(first_node)[0]
                first_value = float(first_node.get('value'))
                second_node = unit_conversions[1]
                second_unit = xpath(second_node)[0]
                second_value = float(second_node.get('value'))
                e.description = ' '.join((str(first_value), first_unit, '=', str(second_value), second_unit))
            except Exception:
                return None
            else:
                return e

        # check for currency conversion card
        if 'currency' in node.get('class', ''):
            currency_selectors = node.xpath(".//div[@class='ccw_unit_selector_cnt']")
            if len(currency_selectors) == 2:
                e.title = 'Currency Conversion'
                # Inside this <div> is a <select> with <option selected="1"> nodes
                # just like the unit conversion card.

                first_node = currency_selectors[0]
                first_currency = first_node.find("./select/option[@selected='1']")

                second_node = currency_selectors[1]
                second_currency = second_node.find("./select/option[@selected='1']")

                # The parent of the nodes have a <input class='vk_gy vk_sh ccw_data' value=...>
                xpath = etree.XPath("parent::td/parent::tr/td/input[@class='vk_gy vk_sh ccw_data']")
                try:
                    first_value = float(xpath(first_node)[0].get('value'))
                    second_value = float(xpath(second_node)[0].get('value'))

                    values = (
                        str(first_value),
                        first_currency.text,
                        f'({first_currency.get("value")})',
                        '=',
                        str(second_value),
                        second_currency.text,
                        f'({second_currency.get("value")})'
                    )
                    e.description = ' '.join(values)
                except Exception:
                    return None
                else:
                    return e

        # check for generic information card
        info = node.find(".//div[@class='_f2g']")
        if info is not None:
            try:
                e.title = ''.join(info.itertext()).strip()
                actual_information = info.xpath("parent::div/parent::div//div[@class='_XWk' or contains(@class, 'kpd-ans')]")[0]
                e.description = ''.join(actual_information.itertext()).strip()
            except Exception:
                return None
            else:
                return e

        # check for translation card
        translation = node.find(".//div[@id='tw-ob']")
        if translation is not None:
            src_text = translation.find(".//pre[@id='tw-source-text']/span")
            src_lang = translation.find(".//select[@id='tw-sl']/option[@selected='1']")

            dest_text = translation.find(".//pre[@id='tw-target-text']/span")
            dest_lang = translation.find(".//select[@id='tw-tl']/option[@selected='1']")

            # TODO: bilingual dictionary nonsense?

            e.title = 'Translation'
            try:
                e.add_field(name=src_lang.text, value=src_text.text, inline=True)
                e.add_field(name=dest_lang.text, value=dest_text.text, inline=True)
            except Exception:
                return None
            else:
                return e

        # check for "time in" card
        time = node.find("./div[@class='vk_bk vk_ans']")
        if time is not None:
            date = node.find("./div[@class='vk_gy vk_sh']")
            try:
                e.title = node.find('span').text
                e.description = f'{time.text}\n{"".join(date.itertext()).strip()}'
            except Exception:
                return None
            else:
                return e

        # time in has an alternative form without spans
        time = node.find("./div/div[@class='vk_bk vk_ans _nEd']")
        if time is not None:
            converted = "".join(time.itertext()).strip()
            try:
                # remove the in-between text
                parent = time.getparent()
                parent.remove(time)
                original = "".join(parent.itertext()).strip()
                e.title = 'Time Conversion'
                e.description = f'{original}...\n{converted}'
            except Exception:
                return None
            else:
                return e

        # check for definition card
        words = node.xpath(".//span[@data-dobid='hdw']")
        if words:
            lex = etree.XPath(".//div[@class='lr_dct_sf_h']/i/span")

            # this one is derived if we were based on the position from lex
            xpath = etree.XPath("../../../ol[@class='lr_dct_sf_sens']//"
                                "div[not(@class and @class='lr_dct_sf_subsen')]/"
                                "div[@class='_Jig']/div[@data-dobid='dfn']/span")
            for word in words:
                # we must go two parents up to get the root node
                root = word.getparent().getparent()

                pronunciation = root.find(".//span[@class='lr_dct_ph']/span")
                if pronunciation is None:
                    continue

                lexical_category = lex(root)

                for category in lexical_category:
                    # definitions are looked up relative to each category node
                    definitions = xpath(category)
                    try:
                        descrip = [f'*{category.text}*']
                        for index, value in enumerate(definitions, 1):
                            descrip.append(f'{index}. {value.text}')

                        e.add_field(name=f'{word.text} /{pronunciation.text}/', value='\n'.join(descrip))
                    except Exception:
                        # best effort: skip a category that cannot be rendered
                        continue

            return e

        # check for weather card
        location = node.find("./div[@id='wob_loc']")
        if location is None:
            return None

        # these units should be metric

        # <img alt="category here" src="cool image">
        category = node.find(".//img[@id='wob_tci']")

        xpath = etree.XPath(".//div[@id='wob_d']//div[contains(@class, 'vk_bk')]//span[@class='wob_t']")
        temperatures = xpath(node)

        misc_info_node = node.find(".//div[@class='vk_gy vk_sh wob-dtl']")

        if misc_info_node is None:
            return None

        precipitation = misc_info_node.find("./div/span[@id='wob_pp']")
        humidity = misc_info_node.find("./div/span[@id='wob_hm']")
        wind = misc_info_node.find("./div/span/span[@id='wob_tws']")

        try:
            e.title = 'Weather for ' + location.text.strip()
            e.description = f'*{category.get("alt")}*'
            e.set_thumbnail(url='https:' + category.get('src'))

            # four temperature spans are combined pairwise (0 with 2, 1 with 3)
            if len(temperatures) == 4:
                first_unit = temperatures[0].text + temperatures[2].text
                second_unit = temperatures[1].text + temperatures[3].text
                units = f'{first_unit} | {second_unit}'
            else:
                units = 'Unknown'

            e.add_field(name='Temperature', value=units, inline=False)

            if precipitation is not None:
                e.add_field(name='Precipitation', value=precipitation.text)

            if humidity is not None:
                e.add_field(name='Humidity', value=humidity.text)

            if wind is not None:
                e.add_field(name='Wind', value=wind.text)
        except Exception:
            return None

        return e
Beispiel #26
0
def validate_and_normalize_data(data, fmt=None):
    """
    Validate *data* for the given format and return it in parsed form.

    When *fmt* is None the function tries to guess the format. The formats
    currently checked are:
    1) xml
    2) json
    3) xpath

    :param data: The data which should be validated and normalised.
    :param fmt: Optional format of the data. Valid values are "xml", "json"
        and "xpath". If None, the format is guessed and reported in the
        result.
    :return: A ``(value, format)`` tuple.

        *  XML: the parsed lxml element and "xml".

        *  JSON: the parsed dict object and "json".

        *  XPATH: an etree.XPath object and "xpath". The lxml library is
           required for this type.

        *  ``(None, None)`` when *data* is None, and ``(data, None)`` when
           the format could not be established (string input is returned
           stripped of surrounding whitespace).
    :raises Exception: when *data* fails validation for the requested
        format, or when *fmt* disagrees with the detected format.
    """
    if data is None:
        return None, None

    # json modules without JSONDecodeError signal decode failures as ValueError
    json_errors = (
        TypeError,
        getattr(json.decoder, "JSONDecodeError", ValueError),
    )

    if isinstance(data, string_types):
        data = data.strip()
        xml_shaped = data.startswith("<") and data.endswith(">")
        json_shaped = data.startswith("{") and data.endswith("}")

        if xml_shaped or fmt == "xml":
            try:
                tree = fromstring(data)
                if not fmt or fmt == "xml":
                    return tree, "xml"
                raise Exception(
                    "Invalid format '%s'. Expected format is 'xml' for data '%s'"
                    % (fmt, data))
            except XMLSyntaxError as err:
                # Only fatal when xml was explicitly requested; otherwise
                # fall through to the final return.
                if fmt == "xml":
                    detail = to_native(err, errors="surrogate_then_replace")
                    raise Exception(
                        "'%s' XML validation failed with error '%s'" %
                        (data, detail))
            except Exception as err:
                prefix = "'%s' recognized as XML but was not valid." % data
                raise Exception(
                    prefix + to_native(err, errors="surrogate_then_replace"))
        elif json_shaped or fmt == "json":
            try:
                decoded = json.loads(data)
                if not fmt or fmt == "json":
                    return decoded, "json"
                raise Exception(
                    "Invalid format '%s'. Expected format is 'json' for data '%s'"
                    % (fmt, data))
            except json_errors as err:
                if fmt == "json":
                    detail = to_native(err, errors="surrogate_then_replace")
                    raise Exception(
                        "'%s' JSON validation failed with error '%s'" %
                        (data, detail))
            except Exception as err:
                prefix = "'%s' recognized as JSON but was not valid." % data
                raise Exception(
                    prefix + to_native(err, errors="surrogate_then_replace"))
        else:
            try:
                if not HAS_LXML:
                    raise Exception(missing_required_lib("lxml"))

                compiled = etree.XPath(data)
                if not fmt or fmt == "xpath":
                    return compiled, "xpath"
                raise Exception(
                    "Invalid format '%s'. Expected format is 'xpath' for data '%s'"
                    % (fmt, data))
            except etree.XPathSyntaxError as err:
                if fmt == "xpath":
                    detail = to_native(err, errors="surrogate_then_replace")
                    raise Exception(
                        "'%s' XPath validation failed with error '%s'" %
                        (data, detail))
            except Exception as err:
                prefix = "'%s' recognized as Xpath but was not valid." % data
                raise Exception(
                    prefix + to_native(err, errors="surrogate_then_replace"))

    elif isinstance(data, dict):
        if fmt and fmt != "json":
            raise Exception(
                "Invalid format '%s'. Expected format is 'json' for data '%s'"
                % (fmt, data))

        try:
            # Round-trip through the json module to check serialisability.
            round_tripped = json.loads(json.dumps(data))
            return round_tripped, "json"
        except json_errors as err:
            detail = to_native(err, errors="surrogate_then_replace")
            raise Exception(
                "'%s' JSON validation failed with error '%s'" %
                (data, detail))
        except Exception as err:
            prefix = "'%s' recognized as JSON but was not valid." % data
            raise Exception(
                prefix + to_native(err, errors="surrogate_then_replace"))

    return data, None
 def set_profile_url(self):
     """Set ``self.profile_url`` from the profile XML, if it is present.

     Reads the text of the first ``site-standard-profile-request/url``
     element under ``self.xml`` and stores it stripped of surrounding
     whitespace. This is best effort: when the element is missing or has
     no text, ``self.profile_url`` is left untouched.
     """
     try:
         profile_url_xpath = etree.XPath('site-standard-profile-request/url')
         self.profile_url = profile_url_xpath(self.xml)[0].text.strip()
     except Exception:
         # Deliberately ignored, but no longer a bare ``except`` so that
         # KeyboardInterrupt / SystemExit still propagate.
         pass
            match = re.search(args.pattern, obj)
            #debug("object ",obj,ife(match, " matches"," does not match"))
            if ((args.exclude and (match == None))
                    or (not args.exclude and (match != None))):
                exportObject(obj, args, prefix, extension, infile)
    elif (xpath_mode):
        from lxml import etree
        message("exporting from " + infile + " all objects " +
                ife(args.exclude, 'not ', '') + "matching " + args.xpath)
        parser = etree.XMLParser()  #ns_clean=True)
        intree = etree.parse(infile, parser)
        if (len(parser.error_log) > 0):
            message("Could not parse ", infile, ":")
            debug(error_log)
        find = etree.XPath(
            "(" + args.xpath + ")/@id",
            namespaces=xpath_namespaces)  #find the ids, not the objects
        objects = find(intree)
        message("found %i objects matching XPath" % len(objects))
        if (not args.exclude):  #include mode
            for obj in objects:
                exportObject(obj, args, prefix, extension, infile)
        else:  #exclude mode
            objects_all = subprocess.check_output(
                [args.inkscape, "--query-all", infile])
            #message(objects)
            for obj in objects_all.splitlines():
                obj = obj.split(',')[0]  #keep only ID:
                if not (obj in objects):
                    exportObject(obj, args, prefix, extension, infile)
 def get_educations(self):
     """Parse every ``educations/education`` element under ``self.xml``.

     Each element is serialised, run through ``lixml.LinkedInXMLParser``,
     and the parser's ``results`` are appended to ``self.educations``.
     """
     find_educations = etree.XPath('educations/education')
     for education in find_educations(self.xml):
         serialised = etree.tostring(education)
         parsed = lixml.LinkedInXMLParser(serialised).results
         self.educations.append(parsed)
Beispiel #30
0
import ujson
from collections import defaultdict, OrderedDict
from lxml import etree
import csv

from .entry_data import word_dict


# Precompiled XPath queries, each evaluated relative to the node it is called
# with (presumably a single kanji <character> entry -- confirm against the
# callers).
# <nanori> elements anywhere beneath the <reading_meaning> child.
NANORI = etree.XPath('./reading_meaning//nanori')
# <reading> elements whose r_type attribute is "ja_on" / "ja_kun".
ON_READINGS = etree.XPath('./reading_meaning//reading[@r_type="ja_on"]')
KUN_READINGS = etree.XPath('./reading_meaning//reading[@r_type="ja_kun"]')
# <meaning> elements that carry no m_lang attribute.
MEANINGS = etree.XPath('./reading_meaning//meaning[not(@m_lang)]')
# Children of the <misc> element.
GRADE = etree.XPath('./misc/grade')
STROKE_COUNT = etree.XPath('./misc/stroke_count')
# <cp_value> descendants whose cp_type attribute is "ucs".
CODEPOINT = etree.XPath('.//cp_value[@cp_type="ucs"]')
JLPT = etree.XPath('./misc/jlpt')
# Direct <literal> children of the context node.
LITERAL = etree.XPath('literal')


def nanori(character):
    """Return the text of every node matched by ``NANORI`` under *character*."""
    return [node.text for node in NANORI(character)]


def on_readings(character):
    """Return the text of every node matched by ``ON_READINGS`` under *character*."""
    return [node.text for node in ON_READINGS(character)]


def kun_readings(character):
    readings = KUN_READINGS(character)