Python XMLParserの例、lxml.etree.XMLParser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: pushparser.py プロジェクト: seecr/meresco-xml

class PushParser(object):
    """Feed-style wrapper around an ``XMLParser`` with a ``SubTreesTreeBuilder`` target.

    The builder is created with the given ``elementPath`` and receives
    ``onResultDo`` as its ``onResult`` callback.
    """

    def __init__(self, elementPath, onResultDo):
        """Create the tree builder and the parser that drives it."""
        self._parser = XMLParser(
            target=SubTreesTreeBuilder(
                elementPath=elementPath,
                onResult=onResultDo,
            )
        )

    def feed(self, data):
        """Hand the next chunk of XML to the underlying parser."""
        self._parser.feed(data)

コード例 #2

0

ファイルを表示

ファイル: NOAATideData.py プロジェクト: DanRamage/commonfiles

  def getWaterLevelRawSixMinuteDataExt(self,
                                    beginDate,
                                    endDate,
                                    station,
                                    datum='MLLW',
                                    unit='feet',
                                    shift='GMT'):
    """Call the ``getWaterLevelRawSixMin`` SOAP operation and return the objectified reply.

    ``unit`` is sent as 1 when it equals 'feet' and as 2 otherwise; ``shift``
    is sent as 0 when it equals 'GMT' and as 1 otherwise.  The raw XML reply
    is parsed with ``lxml.objectify`` and de-annotated before being returned.

    Note: this installs the freshly built parser as objectify's default
    parser (``objectify.set_default_parser``).
    """
    if self.logger:
      self.logger.debug("SOAP WSDL: %s" % (self.baseUrl))
    soapClient = Client(self.baseUrl, retxml=True)

    # Translate the symbolic arguments into the numeric codes sent to the service.
    unitCode = 1 if unit == 'feet' else 2
    shiftCode = 0 if shift == 'GMT' else 1

    ret_xml = soapClient.service.getWaterLevelRawSixMin(
      station, beginDate, endDate, datum, unitCode, shiftCode)
    if self.logger:
      self.logger.debug(ret_xml)

    objectifyParser = XMLParser(remove_blank_text=True, huge_tree=True)
    objectifyParser.set_element_class_lookup(objectify.ObjectifyElementClassLookup())
    objectify.set_default_parser(objectifyParser)

    root = objectify.fromstring(ret_xml)
    objectify.deannotate(root, cleanup_namespaces=True)
    return root

コード例 #3

0

ファイルを表示

ファイル: checks.py プロジェクト: nexdatas/configserver

def checknxmls(utest, xml1, xmls):
    """ compare xmls via unittests

    ``xml1`` is compared (via ``checknodes``) with each entry of ``xmls`` in
    order.  The check succeeds as soon as one candidate matches; if none
    matches, the exception raised for the last candidate is propagated.
    All documents are parsed before the first comparison is made.

    :param utest: unittest case object
    :type utest: :obj:`unittest.TestCase`
    :param xml1: first xml
    :type xml1: :obj:`str`
    :param xmls: list of xml to compare
    :type xmls: :obj:`list` < :obj:`str` >
    """

    def _parse(xml):
        # A fresh parser per document, as in the original code.
        return et.fromstring(
            xml, parser=XMLParser(collect_ids=False, remove_blank_text=True))

    n1 = _parse(xml1)
    # Keep each source string next to its parsed tree so that a failure
    # message shows the candidate that actually failed, not the last one parsed.
    candidates = [(xml2, _parse(xml2)) for xml2 in xmls]
    for i, (xml2, n2) in enumerate(candidates):
        try:
            checknodes(utest, n1, n2)
            break
        except Exception:
            print("%s\n!=\n%s" % (xml1, xml2))
            if i + 1 == len(candidates):
                raise

コード例 #4

0

ファイルを表示

ファイル: SvgBinding.py プロジェクト: soedjais/augustus

    def loadXml(data, **parserOptions):
        """Load SVG from an XML string, fileName, or file-like object.

        A string naming an existing path is opened and parsed as a file (the
        file is closed again once parsing finishes); any other string is
        parsed as XML text.  Non-string input is handed to the parser as is.

        @type data: string or file-like object
        @param data: The serialized SVG, fileName, or file-like object that generates SVG as XML.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.  C{huge_tree} defaults to True unless overridden here.
        @rtype: SvgBinding
        @return: An in-memory representation of the SVG.
        """

        options = {"huge_tree": True}
        options.update(parserOptions)

        parser = XMLParser(**options)
        parser.set_element_class_lookup(ElementDefaultClassLookup(element=SvgBinding))

        if isinstance(data, basestring):
            if os.path.exists(data):
                # Parse inside the context manager so the handle we opened
                # is released instead of being left to the garbage collector.
                with open(data) as xmlFile:
                    return parse(xmlFile, parser).getroot()
            data = StringIO(data)

        return parse(data, parser).getroot()

コード例 #5

0

ファイルを表示

ファイル: ValidateFilingText.py プロジェクト: fewang0521/python_dart

def checkfile(modelXbrl, filepath):
    """Scan a document line by line and return its text without the XML declaration.

    Every match of ``docCheckPattern`` is inspected.  Matches starting with
    ``&`` that are not listed in ``xhtmlEntities`` are reported as disallowed
    entity codes.  Any other match is reported as a disallowed character when
    the validation type is "EFM" and no test-case style root element has been
    seen.  All problems go through ``modelXbrl.error``.

    :param modelXbrl: object providing ``modelManager``, ``fileSource`` and ``error``
    :param filepath: path handed to ``modelXbrl.fileSource.file``
    :returns: ``(io.StringIO over the collected text, encoding)`` where
        ``encoding`` is the value returned by ``fileSource.file``
    """
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    file, encoding = modelXbrl.fileSource.file(filepath)
    rootInfo = {}

    class RootElementSniffer(object):
        """Parser target recording whether a start tag is a test-case element."""
        def start(self, tag, attr):
            # Only the local name (after the namespace's closing brace) matters.
            rootInfo["rootIsTestcase"] = tag.rpartition("}")[2] in ("testcases", "documentation", "testSuite", "testcase", "testSet")
        def end(self, tag):
            pass
        def data(self, data):
            pass
        def close(self):
            pass

    sniffer = XMLParser(target=RootElementSniffer())
    rootIsTestcase = False
    declarationRemoved = False
    collected = []

    with file as f:
        # iter() with a sentinel stops at the first empty string, i.e. at EOF.
        for lineNum, line in enumerate(iter(f.readline, ""), 1):
            # check for disallowed characters or entity codes
            for match in docCheckPattern.finditer(line):
                text = match.group()
                if text.startswith("&"):
                    if text not in xhtmlEntities:
                        modelXbrl.error(("EFM.5.02.02.06", "GFM.1.01.02"),
                            _("Disallowed entity code %(text)s in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, text=text, file=os.path.basename(filepath), line=lineNum, column=match.start())
                    continue
                if not isEFM or rootIsTestcase:
                    continue
                if len(text) == 1:
                    modelXbrl.error("EFM.5.02.01.01",
                        _("Disallowed character '%(text)s' (%(unicodeIndex)s) in file %(file)s at line %(line)s col %(column)s"),
                        modelDocument=filepath, text=text, unicodeIndex="U+{:04X}".format(ord(text)),
                        file=os.path.basename(filepath), line=lineNum, column=match.start())
                else:
                    modelXbrl.error("EFM.5.02.01.01",
                        _("Disallowed character '%(text)s' in file %(file)s at line %(line)s col %(column)s"),
                        modelDocument=filepath, text=text, file=os.path.basename(filepath), line=lineNum, column=match.start())

            if lineNum == 1:
                declaration = XMLdeclaration.search(line)
                if declaration:  # remove it for lxml
                    cutFrom, cutTo = declaration.span()
                    line = line[:cutFrom] + line[cutTo:]
                    declarationRemoved = True

            if sniffer:  # feed line after removal of xml declaration
                sniffer.feed(line.encode('utf-8', 'ignore'))
                if "rootIsTestcase" in rootInfo:  # a start tag has been encountered
                    rootIsTestcase = rootInfo["rootIsTestcase"]
                    sniffer = None  # no point to parse past the root element

            collected.append(line)

    document = ''.join(collected)
    if not declarationRemoved:  # may be multiline, try again
        declaration = XMLdeclaration.search(document)
        if declaration:  # remove it for lxml
            document = document[:declaration.start()] + document[declaration.end():]
    return (io.StringIO(initial_value=document), encoding)

コード例 #6

0

ファイルを表示

ファイル: util.py プロジェクト: tobpe/exchangelib

def to_xml(text):
    """Parse ``text`` into an element, retrying with a recovering lxml parser.

    A leading ``BOM`` is dropped before parsing.  If the first ``fromstring``
    call raises ``ParseError``, the text is re-parsed with an lxml
    ``XMLParser(recover=True, resolve_entities=False)``, serialized, and
    parsed again with ``fromstring``.  Failures of the fallback are re-raised
    as ``ParseError``, with an excerpt of the offending line when the line
    number is known.
    """
    def strip_bom(value):
        return value[BOM_LEN:] if value.startswith(BOM) else value

    try:
        if PY2:
            # On python2, fromstring expects an encoded string
            return fromstring(strip_bom(text).encode('utf-8'))
        return fromstring(strip_bom(text))
    except ParseError:
        # Exchange servers may spit out the weirdest XML. lxml is pretty good at recovering from errors
        log.warning('Fallback to lxml processing of faulty XML')
        recovering_parser = XMLParser(recover=True, resolve_entities=False)
        recovering_parser.set_element_class_lookup(ElementDefaultClassLookup(element=RestrictedElement))
        no_bom_text = strip_bom(text)
        try:
            recovered = parse(io.BytesIO(no_bom_text.encode('utf-8')), parser=recovering_parser)
        except AssertionError as e:
            raise ParseError(*e.args)
        try:
            return fromstring(tostring(recovered))
        except ParseError as e:
            if hasattr(e, 'position'):
                e.lineno, e.offset = e.position
            if not e.lineno:
                raise ParseError('%s' % text_type(e))
            try:
                offending_line = no_bom_text.splitlines()[e.lineno - 1]
            except IndexError:
                raise ParseError('%s' % text_type(e))
            # Show up to 20 characters on either side of the reported column.
            window_start = max(0, e.offset - 20)
            offending_excerpt = offending_line[window_start:e.offset + 20]
            raise ParseError('%s\nOffending text: [...]%s[...]' % (text_type(e), offending_excerpt))
        except TypeError:
            raise ParseError('This is not XML: %s' % text)

コード例 #7

0

ファイルを表示

ファイル: ValidateFilingText.py プロジェクト: JTYim/Arelle

def checkfile(modelXbrl, filepath):
    """Check a document's text for disallowed content and strip its XML declaration.

    The file obtained from ``modelXbrl.fileSource.file(filepath)`` is read one
    line at a time.  Each ``docCheckPattern`` match is reported through
    ``modelXbrl.error``: as a disallowed entity code when it starts with
    ``&`` and is not in ``xhtmlEntities``; otherwise as a disallowed character,
    but only for "EFM" validation and while no test-case style root element
    has been detected.

    Returns a tuple of an ``io.StringIO`` holding the text with the first XML
    declaration removed, and the encoding reported by ``fileSource.file``.
    """
    def withoutXmlDeclaration(text):
        """Return ``(text minus the first XML declaration, True)`` or ``(text, False)``."""
        found = XMLdeclaration.search(text)
        if not found:
            return text, False
        first, last = found.span()
        return text[0:first] + text[last:], True

    def report(codes, message, match, atLine, **extra):
        """Forward one finding to modelXbrl.error with the shared location arguments."""
        modelXbrl.error(codes, message,
                        modelDocument=filepath, text=match.group(),
                        file=os.path.basename(filepath), line=atLine,
                        column=match.start(), **extra)

    pieces = []
    lineNum = 1
    foundXmlDeclaration = False
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    file, encoding = modelXbrl.fileSource.file(filepath)
    parserResults = {}

    class checkFileType(object):
        def start(self, tag, attr):  # check root XML element type
            localName = tag.rpartition("}")[2]
            parserResults["rootIsTestcase"] = localName in ("testcases", "documentation", "testSuite", "testcase", "testSet")
        def end(self, tag): pass
        def data(self, data): pass
        def close(self): pass

    _parser = XMLParser(target=checkFileType())
    _isTestcase = False

    with file as f:
        while True:
            line = f.readline()
            if line == "":
                break
            # check for disallowed characters or entity codes
            for match in docCheckPattern.finditer(line):
                text = match.group()
                if text.startswith("&"):
                    if not text in xhtmlEntities:
                        report(("EFM.5.02.02.06", "GFM.1.01.02"),
                               _("Disallowed entity code %(text)s in file %(file)s line %(line)s column %(column)s"),
                               match, lineNum)
                elif isEFM and not _isTestcase:
                    if len(text) == 1:
                        report("EFM.5.02.01.01",
                               _("Disallowed character '%(text)s' (%(unicodeIndex)s) in file %(file)s at line %(line)s col %(column)s"),
                               match, lineNum, unicodeIndex="U+{:04X}".format(ord(text)))
                    else:
                        report("EFM.5.02.01.01",
                               _("Disallowed character '%(text)s' in file %(file)s at line %(line)s col %(column)s"),
                               match, lineNum)
            if lineNum == 1:
                # remove the declaration for lxml
                line, foundXmlDeclaration = withoutXmlDeclaration(line)
            if _parser:  # feed line after removal of xml declaration
                _parser.feed(line.encode('utf-8', 'ignore'))
                if "rootIsTestcase" in parserResults:  # root XML element has been encountered
                    _isTestcase = parserResults["rootIsTestcase"]
                    _parser = None  # no point to parse past the root element
            pieces.append(line)
            lineNum += 1

    joined = ''.join(pieces)
    if not foundXmlDeclaration:  # may be multiline, try again
        joined, foundXmlDeclaration = withoutXmlDeclaration(joined)
    return (io.StringIO(initial_value=joined), encoding)

コード例 #8

0

ファイルを表示

ファイル: XMLConfigurator.py プロジェクト: nexdatas/configserver

    def createConfiguration(self, names):
        """ creates the final configuration string in the xmlstring attribute

        The named components are merged (with variables), instantiated and
        merged again.  A non-blank result is re-parsed and pretty-printed,
        and an XML declaration is prepended when the serialized text does
        not already start with one.  Otherwise ``xmlstring`` is set to ``''``.

        :param names: list of component names
        :type names: :obj:`list` <:obj:`str`>
        """
        cnf = self.__instantiate(self.__mergeVars(names, withVariables=True))
        cnfMerged = self.__merge([cnf])

        hasContent = (cnfMerged and hasattr(cnfMerged, "strip")
                      and cnfMerged.strip())
        if not hasContent:
            self.xmlstring = ''
        else:
            # Python 3 gets bytes, Python 2 gets the merged string unchanged.
            if sys.version_info > (3,):
                source = bytes(cnfMerged, "UTF-8")
            else:
                source = cnfMerged
            reparsed = et.fromstring(
                source, parser=XMLParser(collect_ids=False))
            xmls = _tostr(etree.tostring(reparsed, encoding='unicode',
                                         method='xml', pretty_print=True))
            if not xmls.startswith("<?xml"):
                xmls = "<?xml version='1.0' encoding='utf8'?>" + xmls
            self.xmlstring = xmls

        self._streams.info("XMLConfigurator::createConfiguration() "
                           "- Create configuration")

コード例 #9

0

ファイルを表示

ファイル: SvgBinding.py プロジェクト: Huskyeder/augustus

    def loadXml(data, **parserOptions):
        """Load SVG from an XML string, fileName, or file-like object.

        If C{data} is a string that names an existing path, that file is
        opened for parsing and closed afterwards; other strings are treated
        as the XML text itself.  Anything else is passed to the parser as a
        file-like object and is not closed here.

        @type data: string or file-like object
        @param data: The serialized SVG, fileName, or file-like object that generates SVG as XML.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.  C{huge_tree=True} is used unless the caller overrides it.
        @rtype: SvgBinding
        @return: An in-memory representation of the SVG.
        """

        # Only a handle opened by this function is closed by this function.
        ownedFile = None
        if isinstance(data, basestring):
            if os.path.exists(data):
                data = ownedFile = open(data)
            else:
                data = StringIO(data)

        try:
            newParserOptions = {"huge_tree": True}
            newParserOptions.update(parserOptions)

            parser = XMLParser(**newParserOptions)
            lookup = ElementDefaultClassLookup(element=SvgBinding)
            parser.set_element_class_lookup(lookup)

            return parse(data, parser).getroot()
        finally:
            if ownedFile is not None:
                ownedFile.close()

コード例 #10

0

ファイルを表示

class PushParser(object):
    """Push-style parser: chunks given to ``feed`` go to an ``XMLParser``
    whose target is a ``SubTreesTreeBuilder``."""

    def __init__(self, elementPath, onResultDo):
        """Build the target from ``elementPath`` and the ``onResultDo`` callback."""
        builderArguments = dict(elementPath=elementPath, onResult=onResultDo)
        treeBuilder = SubTreesTreeBuilder(**builderArguments)
        self._parser = XMLParser(target=treeBuilder)

    def feed(self, data):
        """Forward ``data`` to the wrapped parser."""
        parser = self._parser
        parser.feed(data)

コード例 #11

0

ファイルを表示

 def testFilterTag(self):
     """Feeding <aap><mies>noot</mies></aap> in chunks leaves <mies>noot</mies> as the target root."""
     target = Target('mies')
     parser = XMLParser(target=target)
     for chunk in ("<aap><mies>", "noot", "</mies>", "</aap>"):
         parser.feed(chunk)
     serialized = lxmltostring(target.root)
     self.assertEqual("<mies>noot</mies>", serialized)

コード例 #12

0

ファイルを表示

ファイル: ModelLoader.py プロジェクト: soedjais/augustus

    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Parse PMML from a string or file-like object into a PmmlBinding tree.

        String input that starts with the two gzip magic bytes is wrapped in
        a GzipFile; string input containing "<" is treated as XML text; any
        other string is handed to the parser unchanged (presumably a file
        name or URI -- confirm against lxml's C{parse}).

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, parse with this ModelLoader's XSD schema (compiled once and cached in C{preparedSchema}).
        @type postValidate: bool
        @param postValidate: If True, call C{postValidate()} on every PmmlBinding element in the PMML namespace after parsing.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}; they override the default C{schema} and C{huge_tree} settings.
        @rtype: PmmlBinding
        @return: In-memory PMML object, with its C{modelLoader} attribute set to this loader.
        """

        if isinstance(data, basestring):
            isGzipped = len(data) >= 2 and data[0:2] == "\x1f\x8b"
            if isGzipped:
                data = gzip.GzipFile(fileobj=StringIO(data))
            elif "<" in data:
                data = StringIO(data)

        schema = None
        if validate:
            # Compile the schema lazily and reuse it on later calls.
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema

        options = {"schema": schema, "huge_tree": True}
        options.update(parserOptions)
        parser = XMLParser(**options)

        lookup = ElementNamespaceClassLookup()
        pmmlClasses = lookup.get_namespace(defs.PMML_NAMESPACE)
        declaredElements = self.schema.xpath(
            "xs:element", namespaces={"xs": defs.XSD_NAMESPACE})
        for declared in declaredElements:
            pmmlClasses[declared.attrib["name"]] = PmmlBinding
        pmmlClasses.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        # ElementNamespaceClassLookup don't work with iterparse, so we have to parse all at once and then iterwalk
        pmmlBinding = parse(data, parser).getroot()
        pmmlBinding.modelLoader = self

        if not postValidate:
            return pmmlBinding

        pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
        for event, elem in iterwalk(pmmlBinding, events=("end", ), tag=pmmlTags):
            if isinstance(elem, PmmlBinding):
                elem.postValidate()

        return pmmlBinding

コード例 #13

0

ファイルを表示

ファイル: streaminglxmltest.py プロジェクト: seecr/meresco-oai

 def testFilterTag(self):
     """Only the <mies> subtree of the chunk-fed document ends up as the target's root."""
     target = Target('mies')
     p = XMLParser(target = target)
     p.feed("<aap><mies>")
     p.feed("noot")
     p.feed("</mies>")
     p.feed("</aap>")
     # assertEqual: the assertEquals alias is deprecated and no longer
     # exists on unittest.TestCase from Python 3.12 on.
     self.assertEqual("<mies>noot</mies>", lxmltostring(target.root))

コード例 #14

0

ファイルを表示

ファイル: subtreestreebuildertest.py プロジェクト: seecr/meresco-xml

 def testOnResult(self):
     """The onResult callback fires once per <b> fed under r/a/b, receiving the subtree."""
     trees = []
     def onResult(tree):
         trees.append(tree)
     xml = """<a><b>Dit is een tag in een tag</b></a>"""
     builder = SubTreesTreeBuilder(elementPath=['r', 'a', 'b'], onResult=onResult)
     parser = XMLParser(target=builder)
     parser.feed("<r>")
     # The same fragment is fed twice, so two results are expected.
     parser.feed(xml)
     parser.feed(xml)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(2, len(trees))
     self.assertEqual('<b>Dit is een tag in een tag</b>', tostring(trees[0]))

コード例 #15

0

ファイルを表示

ファイル: ModelLoader.py プロジェクト: Huskyeder/augustus

    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model and return its root PmmlBinding.

        Strings beginning with the gzip magic bytes are decompressed on the
        fly, strings containing "<" are parsed as XML text, and every other
        value is given to lxml's C{parse} untouched.

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, the parser is given this ModelLoader's compiled XSD schema; otherwise no schema is used.
        @type postValidate: bool
        @param postValidate: If True, run C{postValidate()} on each PmmlBinding element found while walking the parsed tree.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}, applied on top of the defaults chosen here.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        """

        def asStream(value):
            """Wrap string input in a file-like object where the original code did."""
            if not isinstance(value, basestring):
                return value
            if len(value) >= 2 and value[0:2] == "\x1f\x8b":
                return gzip.GzipFile(fileobj=StringIO(value))
            if value.find("<") != -1:
                return StringIO(value)
            return value

        data = asStream(data)

        if not validate:
            schema = None
        else:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema

        mergedOptions = {"schema": schema, "huge_tree": True}
        mergedOptions.update(parserOptions)

        parser = XMLParser(**mergedOptions)
        classLookup = ElementNamespaceClassLookup()
        namespace = classLookup.get_namespace(defs.PMML_NAMESPACE)
        # Every element declared in the XSD maps to PmmlBinding unless
        # tagToClass names a more specific class below.
        for xsdElement in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            elementName = xsdElement.attrib["name"]
            namespace[elementName] = PmmlBinding
        namespace.update(self.tagToClass)
        parser.set_element_class_lookup(classLookup)

        # ElementNamespaceClassLookup don't work with iterparse, so we have to parse all at once and then iterwalk
        tree = parse(data, parser)
        pmmlBinding = tree.getroot()
        pmmlBinding.modelLoader = self

        if postValidate:
            walker = iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE)
            for event, elem in walker:
                if not isinstance(elem, PmmlBinding):
                    continue
                elem.postValidate()

        return pmmlBinding

コード例 #16

0

ファイルを表示

ファイル: subtreestreebuildertest.py プロジェクト: seecr/meresco-xml

    def testIdentityTransformWithNS(self):
        """A builder keyed on the namespaced root yields one subtree equal to the parsed input."""
        builder = SubTreesTreeBuilder(buildFor={
            'one': lambda stack: [d['tag'] for d in stack] == ['{u:ri/default#}root'],
        })
        parser = XMLParser(target=builder)
        parser.feed(XML_NS)
        parser.close()

        subtrees = list(builder.getSubtrees())
        # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(1, len(subtrees))

        # Local names chosen so they do not shadow the builtin ``id`` or the ``lxml`` package.
        subtreeId, subtree = subtrees[0]
        self.assertEqual('one', subtreeId)
        self.assertEqualsLxml(parseString(XML_NS), subtree)

コード例 #17

0

ファイルを表示

 def testTwoTags(self):
     """Feeding <aap>noot</aap> in three chunks gives a target root that serializes to the same XML."""
     target = Target('aap')
     parser = XMLParser(target=target)
     for piece in ("<aap>", "noot", "</aap>"):
         parser.feed(piece)
     result = lxmltostring(target.root)
     self.assertEqual("<aap>noot</aap>", result)

コード例 #18

0

ファイルを表示

 def egress(self, envelope, http_headers, operation, binding_options):
     """Return the outgoing envelope with escaped angle brackets turned back into markup.

     The envelope is serialized, every ``&lt;`` / ``&gt;`` is replaced by a
     literal ``<`` / ``>``, and the text is re-parsed with a recovering
     parser that keeps CDATA sections.  The original code threw the
     re-parsed tree away and returned the untouched envelope, so the
     replacement never had any effect; the re-parsed tree is now returned.

     ``operation`` and ``binding_options`` are accepted but not used.

     :returns: ``(envelope, http_headers)``
     """
     xmlString = etree.tostring(envelope, encoding='unicode')
     xmlString = xmlString.replace("&lt;", "<").replace("&gt;", ">")
     reparsed = etree.fromstring(
         xmlString, parser=XMLParser(recover=True, strip_cdata=False))
     # A recovering parser can give up and yield None; keep the original
     # envelope in that case rather than sending nothing.
     if reparsed is not None:
         envelope = reparsed
     return envelope, http_headers

コード例 #19

0

ファイルを表示

ファイル: parsing.py プロジェクト: pombreda/calibre-1

def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    """Parse markup as XHTML with lxml, falling back to ``parse_html5``.

    Bytes input is decoded with ``decoder`` when given, else with
    ``xml_to_unicode``.  Line endings are normalized to ``\\n``, anything
    before the opening ``<html`` tag (searched in the first 2048 characters)
    is replaced by an equal number of newlines, and encoding declarations
    are stripped.  The text is then parsed as XML; if that fails, or the
    root is not ``<html>`` in the XHTML namespace, or ``force_html5_parse``
    is set, ``parse_html5`` is used instead.
    """
    if isinstance(raw, bytes):
        raw = decoder(raw) if decoder is not None else xml_to_unicode(raw)[0]
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    html_open = re.search(r'<\s*html', raw[:2048], flags=re.I)
    if html_open is not None:
        cut = html_open.start()
        raw = ('\n' * raw.count('\n', 0, cut)) + raw[cut:]

    raw = strip_encoding_declarations(raw)

    def as_tag_soup():
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)

    if force_html5_parse:
        return as_tag_soup()

    try:
        root = fromstring(raw, parser=XMLParser(no_network=True))
        if root.tag != '{%s}html' % html_ns:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in root.iter(LxmlElement):
                source_line = elem.sourceline
                if source_line is not None:
                    elem.set(linenumber_attribute, str(source_line))
        return root
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return as_tag_soup()

コード例 #20

0

ファイルを表示

ファイル: create_vis_model.py プロジェクト: ericustc/RFC

def create_vis_model(in_file, out_file, num=2, expert_ind=1):
    """Write a visualization variant of the XML model in ``in_file`` to ``out_file``.

    The first ``actuator``, ``contact`` and ``equality`` children of the root
    are removed, a contact-disabling ``flag`` and a ``size`` element are
    added around ``option``, and an ``expert`` default class is added.  The
    first ``body`` under ``worldbody`` is then copied ``num - 1`` times; the
    names of each copy and of its nested ``body``/``joint``/``site`` elements
    get the copy index as prefix, and copy ``expert_ind`` is given
    ``childclass="expert"``.
    """
    tree = parse(in_file, parser=XMLParser(remove_blank_text=True))
    root = tree.getroot()

    for tag in ('actuator', 'contact', 'equality'):
        unwanted = root.find(tag)
        if unwanted is not None:
            unwanted.getparent().remove(unwanted)

    option = root.find('option')
    SubElement(option, 'flag', {'contact': 'disable'})
    option.addnext(Element('size', {'njmax': '1000'}))

    expert_defaults = SubElement(root.find('default'), 'default', {'class': 'expert'})
    SubElement(expert_defaults, 'geom', {'rgba': '0.7 0.0 0.0 1'})

    worldbody = root.find('worldbody')
    template = worldbody.find('body')

    def add_prefix(index, element):
        element.attrib['name'] = '%d_%s' % (index, element.attrib['name'])

    for i in range(1, num):
        clone = deepcopy(template)
        if i == expert_ind:
            clone.attrib['childclass'] = 'expert'
        add_prefix(i, clone)
        # Same order as before: nested bodies, then joints, then sites.
        for path in (".//body", ".//joint", ".//site"):
            for node in clone.findall(path):
                add_prefix(i, node)
        worldbody.append(clone)

    tree.write(out_file, pretty_print=True)

コード例 #21

0

ファイルを表示

ファイル: extract_from_xml_tags.py プロジェクト: mikewheel/spectral_clustering_word_embeddings

    def run(self):
        """Tokenize every ``<doc>`` fragment and write the corpus and parquet outputs.

        The fragment file is wrapped in a synthetic ``<root>`` element and
        parsed with a recovering parser.  For each ``<doc>`` child, the text
        is split into sentences and words; the sentences are appended, one
        per line, to ``config.LINE_SENTENCE_CORPUS_FILE``, and a row of
        ``id``, ``url``, ``title`` and the tokenized text is collected.  The
        rows are finally written as a parquet file to ``self.output().path``.
        """
        with open(self.path_to_xml_fragments, "r") as input_f:
            # Needs a root element to parse as XML
            xml_str = '<root>\n' + input_f.read() + '\n</root>'

        # Handle malformed XML from WikiExtractor: https://stackoverflow.com/a/9050454/8857601
        parser = XMLParser(encoding='utf-8', recover=True, remove_blank_text=True)
        root = parse(StringIO(xml_str), parser=parser).getroot()

        # For dataframe construction below
        doc_rows = []

        # For each child tag under the newly-constructed root tag
        for doc_tag in root.findall('doc'):
            # Extract target information from this tag
            doc_id = doc_tag.attrib.get("id")
            doc_url = doc_tag.attrib.get("url")
            doc_title = doc_tag.attrib.get("title")
            # ``text`` is None for an element without leading text (e.g. an
            # empty <doc/>); treat that as empty text instead of crashing.
            doc_text = (doc_tag.text or "").strip()

            tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(doc_text)]

            with open(config.LINE_SENTENCE_CORPUS_FILE, "a") as corpus_f:
                corpus_f.writelines(" ".join(sent) + "\n" for sent in tokenized_text)

            doc_rows.append([doc_id, doc_url, doc_title, tokenized_text])

        # Construct a dataframe, and then construct parquet file output
        df = pandas.DataFrame(doc_rows, columns=["id", "url", "title", "tokenized_text"])
        # Close the output handle deterministically so the parquet data is
        # flushed before this method returns.
        with open(self.output().path, "wb") as output_f:
            df.to_parquet(output_f)

コード例 #22

0

ファイルを表示

def process_saml_md_about_sps(saml_md: bytes):
    """Normalize SAML metadata through two XSLT passes and exclusive C14N.

    The metadata is parsed and transformed with the stylesheets at
    ``REMOVE_NAMESPACE_PREFIXES_XSL_FILE_PATH`` and then
    ``REMOVE_KEY_WHITESPACE_XSL_FILE_PATH`` (both loaded from the
    ``SPF_SAML_metadata_processor`` package).  The result is serialized with
    exclusive canonicalization, without comments, and re-parsed.

    :param saml_md: serialized SAML metadata
    :returns: element tree of the canonicalized, re-parsed document
    """
    saml_md_tree = XML(saml_md)
    localparser = XMLParser(
        remove_blank_text=True, resolve_entities=False, remove_comments=False)

    def load_transform(xsl_file_path):
        """Compile the packaged stylesheet at ``xsl_file_path`` into an XSLT callable."""
        resource = files('SPF_SAML_metadata_processor').joinpath(xsl_file_path)
        with resource.open('rb') as xsl_file:
            return XSLT(parse(xsl_file, parser=localparser))

    first_pass = load_transform(REMOVE_NAMESPACE_PREFIXES_XSL_FILE_PATH)(saml_md_tree)
    second_pass = load_transform(REMOVE_KEY_WHITESPACE_XSL_FILE_PATH)(first_pass)

    canonical = BytesIO()
    second_pass.write_c14n(canonical, exclusive=True, with_comments=False)

    return XML(canonical.getvalue(), localparser).getroottree()

コード例 #23

0

ファイルを表示

 def _convert(self, anObject):
     """Parse ``anObject`` (str or bytes-like) into an element tree.

     When ``self._parseOptions`` is set, a new ``XMLParser`` built from
     those options is used; otherwise ``parse`` runs with its defaults.
     """
     options = self._parseOptions
     parseKwargs = {} if options is None else {'parser': XMLParser(**options)}
     # str input is encoded as UTF-8; everything else goes through bytes().
     payload = bytes(anObject, encoding='utf-8') if isinstance(anObject, str) else anObject
     return parse(BytesIO(bytes(payload)), **parseKwargs)

コード例 #24

0

ファイルを表示

def xmlpreprocess(fname, output):
    """Resolve XIncludes in ``fname``, validate it and write it to ``output``.

    The document is parsed, its XIncludes are expanded, and it is validated
    against the schema loaded from ``schema_file``.  On success it is
    written pretty-printed, UTF-8 encoded and compressed to ``output``.

    :raises XMLPreprocessError: on an XML syntax error, on any other
        exception during processing, or with the list of schema errors
        when validation fails.
    """
    schema_file = "https://www.linutronix.de/projects/Elbe/dbsfed.xsd"
    parser = XMLParser(huge_tree=True)
    schema_tree = etree.parse(schema_file)
    schema = etree.XMLSchema(schema_tree)

    try:
        xml = parse(fname, parser=parser)
        xml.xinclude()

        if schema.validate(xml):
            xml.write(output,
                      encoding="UTF-8",
                      pretty_print=True,
                      compression=9)
            return

    except etree.XMLSyntaxError as e:
        raise XMLPreprocessError("XML Parse error\n" + str(e))
    except Exception as e:
        # The original built this error without raising it, so unexpected
        # failures fell through to the schema-error report below.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        raise XMLPreprocessError("Unknown Exception during validation\n" +
                                 str(e))

    # We have errors, return them in string form...
    errors = [
        "%s:%d error %s" % (err.filename, err.line, err.message)
        for err in schema.error_log
    ]

    raise XMLPreprocessError(errors)

コード例 #25

0

ファイルを表示

def to_xml(text, encoding):
    """Parse ``text`` with ElementTree, retrying through a recovering lxml parser.

    ``text`` is stripped of leading ``BOM`` characters and encoded with
    ``encoding`` (UTF-8 when falsy).  If ElementTree rejects it, the bytes
    are parsed with ``lxml`` in recover mode, serialized and parsed again.

    :raises ParseError: when the recovered document still cannot be parsed;
        the message includes an excerpt around the error position when that
        position is known.
    """
    from xml.etree.ElementTree import fromstring, ParseError
    # NOTE(review): str.lstrip removes any leading characters contained in
    # BOM, not the BOM as a prefix -- confirm this is intended.
    processed = text.lstrip(BOM).encode(encoding or 'utf-8')
    try:
        return fromstring(processed)
    except ParseError:
        from io import BytesIO
        from lxml.etree import XMLParser, parse, tostring
        # Exchange servers may spit out the weirdest XML. lxml is pretty good at recovering from errors
        log.warning('Fallback to lxml processing of faulty XML')
        magical_parser = XMLParser(encoding=encoding or 'utf-8', recover=True)
        root = parse(BytesIO(processed), magical_parser)
        try:
            return fromstring(tostring(root))
        except ParseError as e:
            # ElementTree reports the location in ``position``; ``lineno`` and
            # ``offset`` can be None there, which broke the arithmetic below.
            if hasattr(e, 'position'):
                line_no, col_no = e.position
            else:
                line_no, col_no = e.lineno, e.offset
            if not line_no or col_no is None:
                raise ParseError('%s' % text_type(e))
            try:
                offending_line = processed.splitlines()[line_no - 1]
            except IndexError:
                # ``processed`` is bytes, so the fallback must be bytes too;
                # a str has no .decode() on Python 3.
                offending_line = b''
            offending_excerpt = offending_line[max(0, col_no - 20):col_no +
                                               20].decode('ascii', 'ignore')
            raise ParseError('%s\nOffending text: [...]%s[...]' %
                             (text_type(e), offending_excerpt))
        except TypeError:
            raise ParseError('This is not XML: %s' % text)

コード例 #26

0

ファイルを表示

def check_xml_parsing(name, mt, raw):
    """Collect XML parsing problems for one file.

    Line endings in ``raw`` are normalised to ``\\n``, entities are rewritten
    through an ``EntitityProcessor`` and the result is parsed with a
    non-recovering ``XMLParser``.

    :param name: file name; passed to every error object that is created
    :param mt: media type; selects ``HTMLParseError`` (when in ``OEB_DOCS``)
        or ``XMLParseError`` and enables the XHTML namespace check
    :param raw: the file contents as bytes
    :return: list of error objects; parsing failures end the check early and
        are appended to the entity errors found so far
    """
    def describe(err):
        # Not every exception type has a ``message`` attribute (plain
        # Python 3 exceptions do not), so fall back to the string form.
        try:
            return err.message
        except AttributeError:
            return str(err)

    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    # Get rid of entities as named entities trip up the XML parser
    eproc = EntitityProcessor(mt)
    eraw = entity_pat.sub(eproc, raw)
    parser = XMLParser(recover=False)
    errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
    errors = []
    if eproc.ok_named_entities:
        errors.append(NamedEntities(name))
    if eproc.bad_entities:
        # Offsets are resolved against the text before entity substitution.
        position = PositionFinder(raw)
        for offset, ent in eproc.bad_entities:
            lnum, col = position(offset)
            errors.append(BadEntity(ent, name, lnum, col))

    try:
        root = fromstring(eraw, parser=parser)
    except UnicodeDecodeError:
        return errors + [DecodeError(name)]
    except XMLSyntaxError as err:
        try:
            line, col = err.position
        except Exception:
            # No usable position on this error; report it without one.
            line = col = None
        return errors + [errcls(describe(err), name, line, col)]
    except Exception as err:
        return errors + [errcls(describe(err), name)]

    if mt in OEB_DOCS:
        root_namespace = root.nsmap.get(root.prefix, None)
        if root_namespace != XHTML_NS:
            errors.append(BadNamespace(name, root_namespace))

    return errors

コード例 #27

0

ファイルを表示

ファイル: testLxml.py プロジェクト: tututu-patch/DTD-Attacks

 def testDOS_entitySize(self):
     """Count the "dos" occurrences in the root text of the entity-size sample.

     The sample file is parsed with a default ``XMLParser``.
     """
     document = parse('../../xml_files_windows/dos/dos_entitySize.xml',
                      XMLParser())
     occurrences = document.getroot().text.count("dos")
     self.assertEqual(3400000, occurrences)

コード例 #28

0

ファイルを表示

ファイル: wordcount_Q5.py プロジェクト: ImScientist/TDI_MapReduce

 def mapper_0(self, _, line):
     """Buffer input lines and emit link counts once a page is complete.

     Every line is decoded as UTF-8 and appended to ``self.buf``. When the
     line contains ``</page>`` the buffered text is parsed with a recovering
     lxml parser and, for each ``revision`` that has both a ``timestamp`` and
     a ``text`` child, one ``(title, (timestamp, link_count))`` pair is
     yielded, where ``link_count`` is the number of distinct wikilink targets
     extracted with ``parselink``. The buffer is emptied afterwards whether
     or not processing succeeded.

     :param _: unused key
     :param line: one input line as a byte string
     """
     line = line.decode("utf-8")    # convert byte string to an unicode string;   unicode(line, encoding) does the same
     self.buf.write(line)
     if '</page>' not in line:
         return
     try:
         magical_parser = XMLParser(encoding='utf-8', recover=True)
         root = etree.parse(StringIO(self.buf.getvalue()), magical_parser).getroot()
         t = root.find("title").text
         for rev in root.findall("revision"):
             time_element = rev.find("timestamp")
             text_element = rev.find("text")
             if time_element is None or text_element is None:
                 continue

             timestamp = datetime.strptime(time_element.text, "%Y-%m-%dT%H:%M:%SZ")
             links_list = mwparserfromhell.parse(text_element.text.encode('ascii', 'ignore')).filter_wikilinks()
             links_set = set()
             for link in links_list:
                 link_stripped = parselink.search(unicode(link)).groups()[0]
                 if link_stripped is not None:
                     links_set.add(link_stripped)

             yield (t, (timestamp, len(links_set)))
     except Exception:
         # Best effort: a page that cannot be processed is dropped. Only
         # ordinary errors are suppressed, so KeyboardInterrupt and
         # SystemExit still propagate.
         pass
     finally:
         # Start the next page with an empty buffer in every case.
         self.buf.truncate(0)
         self.buf.seek(0)

コード例 #29

0

ファイルを表示

ファイル: util.py プロジェクト: wucl202000/exchangelib

def to_xml(text):
    """Parse ``text`` as XML, retrying with lxml's recovering parser on failure.

    A leading BOM is removed first. If the plain ``fromstring`` call raises
    ``ParseError`` the text is run through an lxml ``XMLParser(recover=True)``,
    serialised again and parsed a second time. A failure of that second
    attempt is re-raised as ``ParseError``, with an excerpt of the offending
    line when the error carries a usable position.
    """
    no_bom_text = text[BOM_LEN:] if text.startswith(BOM) else text
    try:
        # On python2, fromstring expects an encoded string
        source = no_bom_text.encode('utf-8') if PY2 else no_bom_text
        return fromstring(source)
    except ParseError:
        from lxml.etree import XMLParser, parse, tostring
        # Exchange servers may spit out the weirdest XML. lxml is pretty good at recovering from errors
        log.warning('Fallback to lxml processing of faulty XML')
        recovering_parser = XMLParser(recover=True)
        tree = parse(io.BytesIO(no_bom_text.encode('utf-8')), recovering_parser)
        try:
            return fromstring(tostring(tree))
        except ParseError as e:
            if hasattr(e, 'position'):
                e.lineno, e.offset = e.position
            if not e.lineno:
                raise ParseError('%s' % text_type(e))
            all_lines = no_bom_text.splitlines()
            try:
                offending_line = all_lines[e.lineno - 1]
            except IndexError:
                raise ParseError('%s' % text_type(e))
            excerpt_start = max(0, e.offset - 20)
            offending_excerpt = offending_line[excerpt_start:e.offset + 20]
            raise ParseError('%s\nOffending text: [...]%s[...]' %
                             (text_type(e), offending_excerpt))
        except TypeError:
            raise ParseError('This is not XML: %s' % text)

コード例 #30

0

ファイルを表示

ファイル: testLxml.py プロジェクト: tututu-patch/DTD-Attacks

 def testInternalSubset_PEReferenceInDTD_resolve_entities(self):
     """With ``resolve_entities=False`` the sample's root element has no text."""
     parser = XMLParser(resolve_entities=False)
     tree = parse(
         '../../xml_files_windows/xxep/internalSubset_PEReferenceInDTD.xml',
         parser)
     root = tree.getroot()
     # assertEquals is a deprecated alias that newer unittest versions no
     # longer provide; assertIsNone states the expectation directly.
     self.assertIsNone(root.text)

コード例 #31

0

ファイルを表示

ファイル: testLxml.py プロジェクト: tututu-patch/DTD-Attacks

 def testParameterEntity_core(self):
     """Parsing the parameterEntity_core sample must raise XMLSyntaxError."""
     # Build the parser outside the assertRaises block so that only the
     # parse call itself is expected to raise.
     parser = XMLParser()
     with self.assertRaises(XMLSyntaxError):
         #gives an XMLSyntaxError when trying to access resource
         parse('../../xml_files_windows/xxep/parameterEntity_core.xml',
               parser)

コード例 #32

0

ファイルを表示

ファイル: _client.py プロジェクト: mgrrx/aioros

def parse_xml(body: bytes,
              method_name: str,
              *,
              huge_tree: bool = False) -> XmlRpcTypes:
    """Decode an XML-RPC response body.

    The body is parsed with an ``XMLParser`` configured with ``huge_tree``
    and checked with ``validate_schema``. A single ``params/param/value``
    element is returned converted by ``xml2py``; several are returned as a
    list of converted values. If there are none but a ``fault/value`` element
    exists, the exception built by ``xml2py_exception`` is raised.

    Raises ``ValueError`` when schema validation fails and ``ParseError``
    when the body contains neither values nor a fault.
    """
    response = fromstring(body, XMLParser(huge_tree=huge_tree))
    if not validate_schema(response):
        raise ValueError("Invalid body")

    values = cast(List[_Element], response.xpath("//params/param/value"))
    if values:
        if len(values) >= 2:
            return [xml2py(value) for value in values]
        return xml2py(values[0])

    faults = cast(List[_Element], response.xpath("//fault/value"))
    if not faults:
        raise ParseError(f'Respond body for method "{method_name}" '
                         "not contains any response.")

    err = cast(XmlRpcStructType, xml2py(faults[0]))
    fault_code = cast(int, err.get("faultCode", XMLRPCSystemError.code))
    fault_string = cast(str, err.get("faultString", "Unknown error"))
    raise xml2py_exception(
        fault_code,
        fault_string,
        default_exc_class=ServerError,
    )

コード例 #33

0

ファイルを表示

ファイル: testLxml.py プロジェクト: tututu-patch/DTD-Attacks

 def testInternalSubset_PEReferenceInDTD(self):
     """With a default parser the sample's root text is "it_works"."""
     parser = XMLParser()
     tree = parse(
         '../../xml_files_windows/xxep/internalSubset_PEReferenceInDTD.xml',
         parser)
     root = tree.getroot()
     # assertEquals is a deprecated alias that newer unittest versions no
     # longer provide.
     self.assertEqual("it_works", root.text)

コード例 #34

0

ファイルを表示

 def root_from_url(cls, browser, url, timeout, log):
     """Fetch ``url`` with ``browser`` and parse the response as XML.

     The raw response is passed through ``clean_ascii_chars`` and
     ``xml_to_unicode`` (with ``strip_encoding_pats=True``) and parsed with a
     recovering parser that has network access disabled.
     """
     log.info('Fetching: %s' % url)
     raw = browser.open_novisit(url, timeout=timeout).read()
     recovering_parser = XMLParser(recover=True, no_network=True)
     cleaned = clean_ascii_chars(raw)
     text = xml_to_unicode(cleaned, strip_encoding_pats=True)[0]
     return fromstring(text, parser=recovering_parser)

コード例 #35

0

ファイルを表示

ファイル: ValidateFilingText.py プロジェクト: fewang0521/python_dart

def referencedFiles(modelXbrl, localFilesOnly=True):
    """Collect the files referenced from ``a``/``img`` elements of a filing.

    Two sources are scanned: the text of text-block facts (including any
    spans matched by ``CDATApattern``), parsed as XML inside a ``<body>``
    wrapper, and the ``a``/``img`` XHTML elements of the model document's
    root element.

    :param modelXbrl: the model whose facts and document are scanned
    :param localFilesOnly: when true, ``href``/``src`` values that are HTTP
        URLs or absolute paths are skipped
    :return: set of ``href``/``src`` values (anchor part removed) whose
        normalized location exists in the file source archive or on disk
    """
    # Recovering parser: fact text is not guaranteed to be well-formed XML.
    _parser = XMLParser(resolve_entities=False, remove_comments=True, remove_pis=True, recover=True)
    referencedFiles = set()
    # add referenced files that are html-referenced image and other files
    def addReferencedFile(docElt, elt):
        # docElt supplies the base URL and model context; elt is the element
        # whose attributes are inspected.
        if elt.tag in ("a", "img", "{http://www.w3.org/1999/xhtml}a", "{http://www.w3.org/1999/xhtml}img"):
            for attrTag, attrValue in elt.items():
                # Skip data:/javascript: references and, when localFilesOnly
                # is set, HTTP URLs and absolute paths.
                if (attrTag in ("href", "src") and 
                    scheme(attrValue) not in ("data", "javascript") and (
                        not localFilesOnly or 
                        (not isHttpUrl(attrValue) and not os.path.isabs(attrValue)))):
                    attrValue = attrValue.partition('#')[0] # remove anchor
                    if attrValue: # ignore anchor references to base document
                        base = docElt.modelDocument.baseForElement(docElt)
                        normalizedUri = docElt.modelXbrl.modelManager.cntlr.webCache.normalizeUrl(attrValue, base)
                        # Anything not inside the archive is mapped to a file name through the web cache.
                        if not docElt.modelXbrl.fileSource.isInArchive(normalizedUri):
                            normalizedUri = docElt.modelXbrl.modelManager.cntlr.webCache.getfilename(normalizedUri)
                        if modelXbrl.fileSource.isInArchive(normalizedUri, checkExistence=True) or os.path.exists(normalizedUri):
                            referencedFiles.add(attrValue) # add file name within source directory
    for fact in modelXbrl.facts:
        if fact.concept is not None and fact.isItem and fact.concept.isTextBlock:
            # check for img and other filing references so that referenced files are included in the zip.
            text = fact.textValue
            # The whole text is parsed first, then each CDATApattern match on its own.
            for xmltext in [text] + CDATApattern.findall(text):
                try:
                    for elt in XML("<body>\n{0}\n</body>\n".format(xmltext), parser=_parser).iter():
                        addReferencedFile(fact, elt)
                except (XMLSyntaxError, UnicodeDecodeError):
                    pass  # TODO: Why ignore UnicodeDecodeError?
    # footnote or other elements
    for elt in modelXbrl.modelDocument.xmlRootElement.iter("{http://www.w3.org/1999/xhtml}a", "{http://www.w3.org/1999/xhtml}img"):
        addReferencedFile(elt, elt)
    return referencedFiles

コード例 #36

0

ファイルを表示

ファイル: ppx.py プロジェクト: peteradrichem/Xul

def main():
    """ppx command line script entry point.

    Pretty prints every XML source given on the command line. When none is
    given, standard input is pretty printed if it is not a terminal;
    otherwise an error message is written to standard error.
    """
    # Logging to the console.
    setup_logger_console()

    # Command line.
    args = parse_cl()

    # Initialise XML parser and remove blank text for 'pretty_print' formatting.
    #   https://lxml.de/FAQ.html#parsing-and-serialisation
    parser = XMLParser(remove_blank_text=True)

    def pretty_print(source):
        # One place for the options shared by every pp_xml call.
        pp_xml(source,
               parser=parser,
               syntax=args.syntax,
               xml_declaration=args.declaration)

    # Pretty print XML sources.
    for xml_source in args.xml_sources:
        pretty_print(xml_source)

    if args.xml_sources:
        return

    # Read from a pipe when no XML source is specified.
    if stdin.isatty():
        stderr.write("Error: no XML source specified\n")
    else:
        pretty_print(stdin)

コード例 #37

0

ファイルを表示

    def from_elements(executed_query, positions, estimates):
        """
        Builds processed results from the XML of a WebMIaS response.

        The response text of the executed query is parsed with a recovering
        UTF-8 parser, and every element matched by XPATH_RESULT is turned into
        a MIaSResult lazily, as the results are consumed.

        Parameters
        ----------
        executed_query : ExecutedQuery
            The executed query with the WebMIaS response.
        positions : dict of (string, double)
            A map from paragraph identifiers to estimated positions of paragraphs in their parent
            documents. The positions are in the range [0; 1].
        estimates : sequence of double
            Estimates of P(relevant | position) in the form of a histogram.

        Returns
        -------
        ExecutedProcessedQuery
            An executed query with processed results.
        """
        assert isinstance(executed_query, ExecutedQuery)

        recovering_parser = XMLParser(encoding="utf-8", recover=True)
        response = etree.fromstring(executed_query.response_text,
                                    parser=recovering_parser)
        result_elements = response.xpath(XPATH_RESULT)
        results = (
            MIaSResult.from_element(executed_query, element, positions,
                                    estimates)
            for element in result_elements
        )
        return ExecutedProcessedQuery(executed_query, results)

コード例 #38

0

ファイルを表示

def extract_geo_data(filename_kml):
    """Extract placemark names and coordinates from a KML file.

    The file ``dl_path/filename_kml`` is parsed, the placemark names and the
    comma-separated placemark coordinates are collected, and both are written
    side by side to ``dl_path/geo_coordinates.csv`` without index or header.

    Returns the pandas Series of placemark names.
    """
    namespaces = {
        "dwd":
        "https://opendata.dwd.de/weather/lib/pointforecast_dwd_extension_V1_0.xsd",
        "kml": "http://www.opengis.net/kml/2.2"
    }

    root = etree.parse(dl_path + '/' + filename_kml,
                       parser=XMLParser(huge_tree=True))

    names = root.xpath("//kml:Document/kml:Placemark/kml:name/text()",
                       namespaces=namespaces)
    s_StationIDs = pd.Series(names)

    coordinates = root.xpath(
        "//kml:Document/kml:Placemark/kml:Point/kml:coordinates/text()",
        namespaces=namespaces)
    s_GeoLocations = pd.Series(coordinates)

    # One column per comma-separated coordinate component.
    coordinate_columns = s_GeoLocations.str.split(",", expand=True)
    df_GeoID = pd.concat([s_StationIDs, coordinate_columns], axis=1)
    df_GeoID.columns = ['StationID', 'lat', 'long', 'height']

    df_GeoID.to_csv(dl_path + '/' + 'geo_coordinates.csv',
                    index=False,
                    header=False)

    return s_StationIDs

コード例 #39

0

ファイルを表示

ファイル: streaminglxmltest.py プロジェクト: seecr/meresco-oai

 def testTwoTags(self):
     """An element fed in three separate chunks is reassembled by the target."""
     target = Target('aap')
     p = XMLParser(target = target)
     p.feed("<aap>")
     p.feed("noot")
     p.feed("</aap>")
     # assertEquals is a deprecated alias that newer unittest versions no
     # longer provide.
     self.assertEqual("<aap>noot</aap>", lxmltostring(target.root))

コード例 #40

0

ファイルを表示

ファイル: subtreestreebuildertest.py プロジェクト: seecr/meresco-xml

def parseIncrementallyBy20(builder, inputXml):
    """Feed ``inputXml`` to a parser in chunks of 20 and collect subtrees.

    After every chunk, and once more after the parser is closed, the pairs
    produced by ``builder.getSubtrees()`` are appended to the result.

    Returns a tuple ``(collected, loops)`` where ``collected`` is the list of
    ``(id, subtree)`` pairs and ``loops`` is the number of chunks fed.
    """
    parser = XMLParser(target=builder)
    stream = StringIO(inputXml)
    collected = []
    loops = 0

    def drain():
        for identifier, subtree in builder.getSubtrees():
            collected.append((identifier, subtree))

    chunk = stream.read(20)
    while chunk:
        loops += 1
        parser.feed(chunk)
        drain()
        chunk = stream.read(20)

    retval = parser.close()
    drain()
    assert retval is None, 'Errr?'
    assert ceil(len(inputXml) / 20.0) == loops, 'Errr?'
    return collected, loops

コード例 #41

0

ファイルを表示

ファイル: subtreestreebuilder.py プロジェクト: seecr/meresco-xml

    def start(self):
        """Parse ``self._stream`` in 4096-sized reads and pass subtrees on.

        A SubTreesTreeBuilder is set up to build subtrees wherever the stack
        of open tags equals ``self._path``. After every chunk, and once more
        after the parser is closed, each available subtree is handed to
        ``self._callback``.
        """
        def isPath(stack):
            return [entry['tag'] for entry in stack] == self._path

        builder = SubTreesTreeBuilder(buildFor={'simple': isPath})
        parser = XMLParser(target=builder)

        def deliver():
            for _, subtree in builder.getSubtrees():
                self._callback(subtree)

        while True:
            chunk = self._stream.read(4096)
            if not chunk:
                break
            parser.feed(chunk)
            deliver()

        parser.close()
        deliver()

コード例 #42

0

ファイルを表示

ファイル: SvgBinding.py プロジェクト: Huskyeder/augustus

def makeElementMaker():
    """Build a factory for creating in-memory SVG objects.

    The factory is an lxml ElementMaker bound to the SVG namespace (with an
    additional C{xlink} prefix) whose elements are created by a parser that
    uses C{SvgBinding} as its default element class.

    The C{SvgBinding} class has an C{elementMaker} attribute that
    should be used instead of calling this function.

    @see: The lxml U{ElementMaker documentation<http://lxml.de/api/lxml.builder.ElementMaker-class.html>}, which explains how to use an ElementMaker factory.
    """

    svgParser = XMLParser(huge_tree=True)
    svgParser.set_element_class_lookup(ElementDefaultClassLookup(element=SvgBinding))

    namespaceMap = {None: defs.SVG_NAMESPACE, "xlink": defs.XLINK_NAMESPACE}
    return ElementMaker(namespace=defs.SVG_NAMESPACE,
                        nsmap=namespaceMap,
                        makeelement=svgParser.makeelement)

コード例 #43

0

ファイルを表示

ファイル: parser.py プロジェクト: invenia/iterparse

def iterparse(source, events=('end',), tag=None, **kwargs):
    """
    Iteratively parse an XML source, yielding the events the parser target
    has completed after each chunk is fed.

    source: The file-like object to read the XML from.
    events: Passed to MinimalTarget as ``events``.
    tag: Passed to MinimalTarget as ``tags``.
    size: (optional keyword, 1024) The argument given to ``source.read``.
    strip_namespace, ignore_namespace, debug: (optional keywords, False)
        Forwarded to MinimalTarget.

    Any remaining keyword arguments are passed to XMLParser.
    """
    # Note: We need to remove all kwargs not supported by XMLParser
    # which but are supported by iterparse: source, events, tag, html,
    # recover, huge_tree.
    #
    # http://lxml.de/api/lxml.etree.XMLParser-class.html
    # http://lxml.de/api/lxml.etree.iterparse-class.html
    chunk_size = kwargs.pop('size', 1024)

    target = MinimalTarget(
        events=events,
        tags=tag,
        strip_namespace=kwargs.pop('strip_namespace', False),
        ignore_namespace=kwargs.pop('ignore_namespace', False),
        debug=kwargs.pop('debug', False),
    )
    parser = XMLParser(target=target, **kwargs)

    while True:
        chunk = source.read(chunk_size)
        if not chunk:
            break
        try:
            parser.feed(chunk)
        finally:
            # Note: When exceptions are raised within the parser the
            # target's close method will be called.
            pending = target.completed_events
            while pending:
                yield pending.pop(0)

コード例 #44

0

ファイルを表示

ファイル: parse.py プロジェクト: campaul/RefactorLib

def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
    """
    Input: A dictionary-based representation of a node tree.
    Output: An lxml representation of the same.

    Each dictionary has these keys:
        name -- The type of node, a string. In html, this would be the tag name.
        text -- The content of the node: <b>text</b>
        tail -- Any content after the end of this node, but before the start of the next: <br/>tail
        attrs -- A dictionary of any extra attributes (optional).
        children -- An ordered list of more node-dictionaries.
    """
    if not node_lookup:
        from node import node_lookup

    from lxml.etree import XMLParser
    parser = XMLParser(encoding=encoding)
    parser.set_element_class_lookup(node_lookup)
    make_element = parser.makeelement

    root = None
    # Depth-first walk; each entry pairs a node dict with its lxml parent.
    pending = [(tree, None)]

    while pending:
        node, parent = pending.pop()

        if parent is None:
            # We use this roundabout method because the encoding is always set
            # to 'UTF8' if we use parser.makeelement()
            parser.feed('<trash></trash>')
            element = parser.close()
            element.tag = node['name']
            element.attrib.update(node.get('attrs', {}))
            root = element
        else:
            element = make_element(node['name'], attrib=node.get('attrs', {}))
            parent.append(element)

        element.text = node['text']
        element.tail = node['tail']

        # Pushed in reverse so children are popped in document order.
        pending.extend((child, element) for child in reversed(node['children']))

    return root

コード例 #45

0

ファイルを表示

ファイル: ModelLoader.py プロジェクト: Huskyeder/augustus

 def makeelement(parserSelf, *args, **kwds):
     """Create an element via ``XMLParser.makeelement`` and tag PmmlBindings.

     When the created element is a ``PmmlBinding`` its ``modelLoader``
     attribute is set to ``self``, a name this function does not define
     itself (presumably the enclosing ModelLoader instance; confirm against
     the surrounding code).
     """
     result = XMLParser.makeelement(parserSelf, *args, **kwds)
     if isinstance(result, PmmlBinding):
         result.modelLoader = self
     return result

コード例 #46

0

ファイルを表示

ファイル: pushparser.py プロジェクト: seecr/meresco-xml

 def __init__(self, elementPath, onResultDo):
     """Set up the parser that backs this object.

     A ``SubTreesTreeBuilder`` is created from ``elementPath`` and
     ``onResultDo`` (passed as its ``onResult`` argument) and used as the
     target of the ``XMLParser`` stored in ``self._parser``.
     """
     builder = SubTreesTreeBuilder(elementPath=elementPath, onResult=onResultDo)
     self._parser = XMLParser(target=builder)

コード例 #47

0

ファイルを表示

ファイル: ModelLoader.py プロジェクト: Huskyeder/augustus

    def loadJson(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as a JSON string, fileName,
        dict, or file-like object.

        There is no standard XML-to-JSON specification, so we define
        our own.  Our specification is very similar to U{this
        proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>},
        which collects subelements of different tagnames into
        different JSON lists, rather than having one long list and
        needing to specify the tag of each element in that list.  This
        has the following advantages, particularly useful for PMML:
          - Frequent tagnames (like <Segment>) are not repeated,
            wasting space.
          - Subelements with a given tagname can be quickly queried,
            without having to iterate over a list that contains
            non-matching tagnames.
        It has the following disadvantages:
          - The relative order of subelements with different tagnames
            is not preserved.
        We therefore additionally include a JSON attribute named "#"
        to specify the ordering of subelements in the XML
        representation.  Also, the specification referenced above
        represents single-child subelements as JSON objects and
        multiple children as JSON lists, but for consistency and ease
        of parsing, we always use lists.  The last difference is that
        we include "#tail" as well as "#text", so that text outside of
        an element is preserved (rarely relevant for PMML, but
        included for completeness).

        A string argument that names an existing file is read from
        that file; any other string is parsed as JSON text.  A dict
        passed in by the caller is not modified.

        @type data: string, dict, or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.
        """

        if hasattr(data, "read"):
            data = json.load(data)
        elif isinstance(data, basestring):
            if os.path.exists(data):
                # Close the file as soon as it has been read.
                with open(data) as jsonFile:
                    data = json.load(jsonFile)
            else:
                data = json.loads(data)

        if not isinstance(data, dict):
            raise ValueError("JSON object must be a mapping at the top level")

        if validate:
            # The compiled schema is cached on the loader after first use.
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema
        else:
            schema = None

        parser = XMLParser(**parserOptions)
        lookup = ElementNamespaceClassLookup()
        namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        # Every element declared in the schema defaults to PmmlBinding;
        # tagToClass then overrides individual tags.
        for xsdElement in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            namespace[xsdElement.attrib["name"]] = PmmlBinding
        namespace.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        # Work on shallow copies so that the caller's dict and its "#nsmap"
        # are left untouched by the key removals below.
        data = dict(data)
        try:
            nsmap = dict(data.pop("#nsmap"))
        except KeyError:
            raise ValueError("JSON object must have a \"#nsmap\" key at the top level")

        # JSON has no None key; "" stands for the default namespace.
        if "" in nsmap:
            nsmap[None] = nsmap.pop("")

        if len(data) != 1:
            raise ValueError("JSON object must have exactly one PMML object at the top level")

        # Exactly one key is left; next(iter(...)) works on Python 2 and 3.
        tag = next(iter(data))
        data = data[tag]
        if not isinstance(data, list) or len(data) != 1:
            raise ValueError("Top-level PMML object must be a list with exactly one item")
        data = data[0]

        pmmlBinding = self._loadJsonItem(tag, data, parser, nsmap)

        if validate:
            schema.assertValid(pmmlBinding)

        if postValidate:
            for event, elem in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
                if isinstance(elem, PmmlBinding):
                    elem.postValidate()

        return pmmlBinding

コード例 #48

0

ファイルを表示

ファイル: tmpl.py プロジェクト: Goldmund-Wyldebeast-Wunderliebe/dm.xmlsec.binding

# Copyright (C) 2012 by Dr. Dieter Maurer <*****@*****.**>; see 'LICENSE.txt' for details
"""Auxiliary classes to construct signature/encryption templates."""

from lxml.etree import (
    ElementBase,
    parse as et_parse,
    fromstring as et_fromstring,
    XML as et_xml,
    XMLParser,
    ElementNamespaceClassLookup,
    ElementDefaultClassLookup,
)
from dm.xmlsec.binding import DSigNs, dsig, EncNs, enc

# set up our own parser and related `etree` infrastructure
parser = XMLParser()
# apparently, `parser` has a `set_element_class_lookup` but not corresponding `get`
# class_lookup = ElementNamespaceClassLookup(parser.get_element_class_lookup())
# so the namespace lookup is built with an explicit default-class fallback instead
class_lookup = ElementNamespaceClassLookup(ElementDefaultClassLookup())
parser.set_element_class_lookup(class_lookup)

# Element factory bound to the module parser, so created elements go
# through `class_lookup`.
Element = parser.makeelement


def SubElement(node, *args, **kw):
    """Create an element with the module's ``Element`` factory and append it.

    :param node: the parent element that receives the new child
    :param args: positional arguments passed on to ``Element``
    :param kw: keyword arguments passed on to ``Element``
    :return: the newly created child element, as ``lxml.etree.SubElement``
        does (previously nothing was returned)
    """
    child = Element(*args, **kw)
    node.append(child)
    return child


def parse(file, parser=parser):
    """Parse ``file`` with lxml's ``parse``, using the module parser by default.

    :param file: the source handed to ``lxml.etree.parse``
    :param parser: parser to use; defaults to this module's ``parser``
    :return: whatever ``lxml.etree.parse`` returns for the source
    """
    return et_parse(file, parser=parser)

コード例 #49

0

ファイルを表示

ファイル: parsing.py プロジェクト: miurahr/calibre

def create_lxml_context():
    """Return an ``XMLParser`` that builds this module's element classes.

    The parser is created with ``no_network=True`` and uses ``Element`` and
    ``Comment`` as the default classes for elements and comments.
    """
    parser = XMLParser(no_network=True)
    parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
    return parser

コード例 #50

0

ファイルを表示

ファイル: ValidateFilingText.py プロジェクト: Arelle/Arelle

def checkfile(modelXbrl, filepath):
    """Scan a source file line by line for disallowed content.

    Each line is checked against ``docCheckPattern``; disallowed entity codes
    and (for EFM validation of non-testcase files) disallowed characters are
    reported through ``modelXbrl.error``. Lines are also fed to an lxml
    parser until the root element has been seen, to learn whether the file
    is a testcase and whether it is (or may be) inline XBRL. For inline
    documents, self-closed elements that are not in ``elementsWithNoContent``
    produce a warning.

    :param modelXbrl: model providing the file source and the error/warning
        reporting methods
    :param filepath: path of the file to check
    :return: tuple ``(stream, encoding)`` where ``stream`` is an
        ``io.StringIO`` over the file text with the XML declaration removed
        and ``encoding`` is the value returned by ``fileSource.file``
    """
    result = []
    lineNum = 1
    foundXmlDeclaration = False
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    file, encoding = modelXbrl.fileSource.file(filepath)
    # Filled in by the parser target below once the root element starts.
    parserResults = {}
    class checkFileType(object):
        # Parser target that only inspects start tags; the other callbacks
        # are no-ops.
        def start(self, tag, attr, nsmap=None): # check root XML element type
            parserResults["rootIsTestcase"] = tag.rpartition("}")[2] in ("testcases", "documentation", "testSuite", "testcase", "testSet")
            if tag in ("{http://www.w3.org/1999/xhtml}html", "{http://www.w3.org/1999/xhtml}xhtml"):
                if nsmap and any(ns in ixbrlAll for ns in nsmap.values()):
                    parserResults["isInline"] = True
                else:
                    parserResults["maybeInline"] = True
        def end(self, tag): pass
        def data(self, data): pass
        def close(self): pass
    _parser = XMLParser(target=checkFileType())
    _isTestcase = False
    mayBeInline = isInline = False
    
    with file as f:
        while True:
            line = f.readline()
            # readline returns an empty string only at end of file
            if line == "":
                break;
            # check for disallowed characters or entity codes
            for match in docCheckPattern.finditer(line):
                text = match.group()
                if text.startswith("&"):
                    if not text in xhtmlEntities:
                        modelXbrl.error(("EFM.5.02.02.06", "GFM.1.01.02"),
                            _("Disallowed entity code %(text)s in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, text=text, file=os.path.basename(filepath), line=lineNum, column=match.start())
                elif isEFM and not _isTestcase:
                    # a single character is reported together with its code point
                    if len(text) == 1:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' (%(unicodeIndex)s) in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text, unicodeIndex="U+{:04X}".format(ord(text)), 
                            file=os.path.basename(filepath), line=lineNum, column=match.start())
                    else:
                        modelXbrl.error("EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text, file=os.path.basename(filepath), line=lineNum, column=match.start())
            if lineNum == 1:
                xmlDeclarationMatch = XMLdeclaration.search(line)
                if xmlDeclarationMatch: # remove it for lxml
                    start,end = xmlDeclarationMatch.span()
                    line = line[0:start] + line[end:]
                    foundXmlDeclaration = True
            if _parser: # feed line after removal of xml declaration
                _parser.feed(line.encode('utf-8','ignore'))
                if "rootIsTestcase" in parserResults: # root XML element has been encountered
                    _isTestcase = parserResults["rootIsTestcase"]
                    if "isInline" in parserResults:
                        isInline = True
                    elif "maybeInline" in parserResults:
                        mayBeInline = True
                    _parser = None # no point to parse past the root element
            # an undecided document becomes inline once a line matches inlinePattern
            if mayBeInline and inlinePattern.search(line):
                mayBeInline = False
                isInline = True
            if isInline:
                for match in inlineSelfClosedElementPattern.finditer(line):
                    selfClosedLocalName = match.group(3)
                    if selfClosedLocalName not in elementsWithNoContent:
                        modelXbrl.warning("ixbrl:selfClosedTagWarning",
                                          _("Self-closed element \"%(element)s\" may contain text or other elements and should not use self-closing tag syntax (/>) when empty; change these to end-tags in file %(file)s line %(line)s column %(column)s"),
                                          modelDocument=filepath, element=match.group(1), file=os.path.basename(filepath), line=lineNum, column=match.start())
            result.append(line)
            lineNum += 1
    result = ''.join(result)
    if not foundXmlDeclaration: # may be multiline, try again
        xmlDeclarationMatch = XMLdeclaration.search(result)
        if xmlDeclarationMatch: # remove it for lxml
            start,end = xmlDeclarationMatch.span()
            result = result[0:start] + result[end:]
            foundXmlDeclaration = True

    return (io.StringIO(initial_value=result), encoding)

コード例 #51

0

ファイルを表示

ファイル: io.py プロジェクト: GandaG/fomod-designer

#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from os import listdir, makedirs
from os.path import join
from lxml.etree import (PythonElementClassLookup, XMLParser, tostring, fromstring, CommentBase, Comment,
                        Element, SubElement, parse, ParseError, ElementTree, CustomElementClassLookup)
from .exceptions import MissingFileError, ParserError, TagNotFound

# Module-level parser that drops processing instructions and blank text.
module_parser = XMLParser(remove_pis=True, remove_blank_text=True)


class _CommentLookup(CustomElementClassLookup):
    """Element class lookup that maps comment nodes to ``NodeComment``."""

    def lookup(self, elem_type, doc, namespace, name):
        # Imported at call time rather than at module level.
        from .nodes import NodeComment

        # Returning None hands every other node type to the fallback lookup.
        return NodeComment if elem_type == "comment" else None


class _NodeClassLookup(PythonElementClassLookup):
    """
    Class that handles the custom lookup for the element factories.

コード例 #52

0

ファイルを表示

ファイル: lxmlclient.py プロジェクト: liboz/suds-lxml

import logging
import time

from lxml import objectify
from lxml.etree import XMLParser
from suds import WebFault, TypeNotFound
from suds.client import Client as sudsClient
from suds.plugin import MessagePlugin
from suds.xsd.query import TypeQuery, ElementQuery

# Module-wide lxml setup: a parser that drops blank text, allows huge trees
# and produces objectify elements is installed as objectify's default parser.
parser = XMLParser(remove_blank_text=True, huge_tree=True)
parser.set_element_class_lookup(objectify.ObjectifyElementClassLookup())
objectify.set_default_parser(parser)

# Logger for this module; the 'suds.client' logger is raised to CRITICAL.
logger = logging.getLogger('suds.client.lxml')
logging.getLogger('suds.client').setLevel(logging.CRITICAL)  # Don't show suds messages!


class SoapObject:
    """Plain attribute container identified by a name.

    The name is stored as ``__name__`` and is what ``str()`` and ``repr()``
    return; ``len()`` counts every other instance attribute.
    """

    def __init__(self, name):
        self.__name__ = name

    def __str__(self):
        return self.__name__

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        # ignore the __name__ property
        attribute_count = len(self.__dict__)
        return attribute_count - 1