Python XML.getiteratorの例、xml.etree.ElementTree.XML.getiterator Pythonの例

コード例 #1

0

ファイルを表示

ファイル: TestTextApplicationBackup.py プロジェクト: LManX/TextSummarizer

def openFile():
    """Ask the user for a .txt or .docx file and load its text into the editor.

    A file-open dialog restricted to plain-text and Office Open XML files is
    shown.  For a ``.txt`` file the contents are read and placed in the
    module-level ``text`` widget.  For a ``.docx`` file the text runs of each
    paragraph in ``word/document.xml`` are concatenated and the non-empty
    paragraphs are inserted separated by blank lines.  Any other extension
    only prints a notice.  Nothing happens when the dialog returns ``None``.
    """
    f = askopenfile(mode="r",
                    filetypes=[("Text Files", '.txt'),
                               ("Office Open XML", '.docx')])
    if f is None:
        return

    # The dialog hands back an already-open file object; close it on every
    # path, including the .docx one that only needs its name.
    with f:
        ext = splitext(f.name)[1]
        if ext == ".txt":
            print(ext)
            t = f.read()
            text.delete(0.0, END)
            text.insert(0.0, t)
        elif ext == ".docx":
            # The archive is reopened by name; the context manager closes it
            # even when the document part is missing.
            with zipfile.ZipFile(f.name) as doc:
                xml_Content = doc.read('word/document.xml')
            tree = XML(xml_Content)

            paragraphs = []
            # Element.iter() replaces getiterator(), which was removed in
            # Python 3.9.
            for paragraph in tree.iter(PARA):
                texts = [
                    node.text for node in paragraph.iter(TEXT)
                    if node.text
                ]
                if texts:
                    paragraphs.append(''.join(texts))

            text.delete(0.0, END)
            text.insert(0.0, '\n\n'.join(paragraphs))
        else:
            print("Oops! not a supported file type.")

コード例 #2

0

ファイルを表示

ファイル: viddler.py プロジェクト: GunioRobot/python-viddler

 def _process_response(self, response, returntype=dict):
   """Parse an XML response string into a dict or a list of dicts.

   Args:
       response: XML document text to parse.
       returntype: ``dict`` (default) to convert the whole document with
           ``xmldict.XmlDictConfig``, or ``list`` to split the document
           into flat ``{tag: text}`` dicts, one per record.

   Returns:
       The parsed data in the shape selected by ``returntype``.

   Raises:
       RemoteError: if the parsed data contains an ``error`` entry.
       InvalidParameterError: if ``returntype`` is neither ``dict`` nor
           ``list``.
   """
   if DEBUG:
       # Single-argument print() gives the same output on Python 2 and 3.
       print(response)
   # Parse XML
   root = XML(response)
   if returntype == dict:
       # Dict response
       response_data = xmldict.XmlDictConfig(root)
       if "error" in response_data:
           raise RemoteError(response_data)
   elif returntype == list:
       # List response: walk every element in document order and collect
       # tag -> text pairs into the current record.
       response_data = []
       return_dict = {}
       # iter() replaces getiterator(), which was removed in Python 3.9.
       for elem in root.iter():
           # NOTE(review): root_tag is not defined in this function; it has
           # to come from the enclosing scope.
           if elem.tag == root_tag and elem.tag in return_dict:
               # A repeated root_tag marks the start of the next record.
               response_data.append(return_dict)
               return_dict = {elem.tag: elem.text}
           else:
               return_dict[elem.tag] = elem.text
           if "error" in return_dict:
               raise RemoteError(return_dict)
       # add final dict to the list
       response_data.append(return_dict)
   else:
       raise InvalidParameterError("unkown datatype: %s" % (returntype))
   if DEBUG:
       print(response_data)
   return response_data

コード例 #3

0

ファイルを表示

ファイル: utils.py プロジェクト: aarora08/PaaP

def parse_docx(file_path: Path) -> str:
    """
    Extract the plain text of a .docx file.

    NOTE:
        http://xmlstackoverflow.blogspot.com/2014/09/reading-doc-extension-file-elementtree.html
        All microsoft files are zipped xml documents.

    Args:
        file_path: Location of the .docx archive.

    Returns:
        The text of all non-empty paragraphs joined by single spaces.
        Non-breaking spaces are replaced by ordinary spaces and each text
        run is stripped of surrounding spaces.  ``None`` is returned
        (despite the annotation) when ``word/document.xml`` cannot be read
        from the archive.
    """
    WORD_NAMESPACE = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    PARA = WORD_NAMESPACE + "p"
    TEXT = WORD_NAMESPACE + "t"

    with ZipFile(file_path) as document:
        try:
            xml_content = document.read("word/document.xml")
        except Exception as e:
            # Best-effort: report the failure and let the caller carry on.
            print(f"FAILED:{document}")
            # The exception is passed as a lazy %-argument with a matching
            # placeholder; an extra argument with no placeholder makes the
            # logging module report a formatting error instead of the message.
            logging.error("Failed to parse document %s: %s", file_path, e)
            return None
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text.replace("\xa0", " ").strip("  ")
            for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append("".join(texts))
    return " ".join(paragraphs)

コード例 #4

0

ファイルを表示

    def get_docx_text(self, path):
        """
        Take the path of a docx file as argument, return the text in unicode.

        The text runs of every paragraph found in ``word/document.xml`` are
        concatenated; paragraphs without any text are dropped and the
        remaining ones are joined with a blank line between them.
        """
        # The context manager closes the archive even if the document part
        # is missing and read() raises.
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        # Element.iter() replaces getiterator(), which was removed in
        # Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [
                node.text for node in paragraph.iter(TEXT) if node.text
            ]
            if texts:
                paragraphs.append(''.join(texts))
        return '\n\n'.join(paragraphs)

コード例 #5

0

ファイルを表示

ファイル: IOfile.py プロジェクト: leksuss/APITextIndexer

def read_docx_file(file, block_size=4096):
    """Extract the paragraph text of a .docx file.

    Args:
        file: Path or file object accepted by ``zipfile.ZipFile``.
        block_size: Kept for interface compatibility; not used here.

    Returns:
        ``(True, text)`` on success, where ``text`` holds the non-empty
        paragraphs joined by newlines.  ``(False, message)`` when the
        archive cannot be opened, lacks ``word/document.xml`` or contains
        malformed XML; ``message`` is the error text with "zip" replaced by
        "docx".
    """
    word_namespace = \
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = word_namespace + 'p'
    text = word_namespace + 't'
    try:
        with zipfile.ZipFile(file) as document:
            xml_content = document.read('word/document.xml')
            tree = XML(xml_content)
    except Exception as e:
        # Deliberate catch-all: every failure is reported through the
        # (False, message) return value rather than raised.
        return False, str(e).replace('zip', 'docx')
    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(par):
        texts = [
            node.text for node in paragraph.iter(text) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    return True, '\n'.join(paragraphs)

コード例 #6

0

ファイルを表示

ファイル: DocFile.py プロジェクト: azraelrabbit/cv-data-extractor

    def word2text(self):
        """Print the normalised text runs of the .docx file at ``self.location``.

        Every non-empty text run of every paragraph in ``word/document.xml``
        is NFKD-normalised, stripped of surrounding whitespace and printed.

        Returns:
            list: ``paragraphs``, which is never filled and is therefore
            always empty.
        """
        # The context manager closes the archive even if read() raises.
        with zipfile.ZipFile(self.location) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = list()
        # Element.iter() replaces getiterator(), which was removed in
        # Python 3.9.
        for paragraph in tree.iter(PARA):
            for node in paragraph.iter(TEXT):
                if not node.text:
                    continue
                # str.strip() returns a new string; the result has to be
                # kept for the stripping to take effect.
                word = unicodedata.normalize("NFKD", node.text).strip()
                print(word)
        # NOTE(review): the code that appended ''.join(text) to `paragraphs`
        # was commented out, so callers always get an empty list. Confirm
        # whether the joined paragraph text should be returned instead.
        return paragraphs

コード例 #7

0

ファイルを表示

#Response from fuzzwork:
#{"3532":{"buy":{"weightedAverage":"7522844.02163","max":"11500000.06","min":"343368.68","stddev":"3969289.36861","median":"9750000.015","volume":"104.0","orderCount":"10","percentile":"11500000.0538"},"sell":{"weightedAverage":"29857450.9729","max":"68499999.98","min":"15399998.88","stddev":"13610076.8519","median":"18900000.01","volume":"120.0","orderCount":"21","percentile":"15399998.8933"}}}
#END RESPONSE

        if (contents[0:5] == "<?xml"):
            # Parse the XML response from EVE-Central
            # See usage here for ElementTree:  http://effbot.org/zone/element-index.htm
            tree = XML(contents)  # From a string
            if (verbose == 1):
                print "Data received from fuzzwork:"
                print tree
            #print "XML Element count = ", len(tree)

            # the tree root is the toplevel html element
            tree_map = dict((c, p) for p in tree.getiterator() for c in p)
            root = tree

            for c in root.getchildren():
                if (verbose == 1):
                    print c.tag
                for d in c.getchildren():
                    if (verbose == 1):
                        print "    ", d.tag
                    for e in d.getchildren():
                        if (verbose == 1):
                            print "        ", e.tag
                        for f in e.getchildren():
                            if (verbose == 1):
                                print "            ", f.tag, " = ", f.text
                            if ((e.tag == "sell")

コード例 #8

0

ファイルを表示

ファイル: ElementTree_XML.py プロジェクト: robertopauletto/PyMOTW-it_2.0

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

from xml.etree.ElementTree import XML

# Parse an XML document held in a string literal; XML() returns the root
# element.
parsed = XML('''
<root>
  <group>
    <child id="a">Questo è il figlio "a".</child>
    <child id="b">Questo è il figlio "b".</child>
  </group>
  <group>
    <child id="c">Questo è il figlio "c".</child>
  </group>
</root>
''')

# Every print below takes one pre-formatted string, so the output is the
# same whether print is a statement (Python 2) or a function (Python 3).
print('parsed = %s' % (parsed,))

# Walk the whole tree in document order.  Element.iter() replaces
# getiterator(), which was removed in Python 3.9.
for elem in parsed.iter():
    print(elem.tag)
    # Report text and tail only when they hold more than whitespace.
    if elem.text is not None and elem.text.strip():
        print('  text: "%s"' % elem.text)
    if elem.tail is not None and elem.tail.strip():
        print('  tail: "%s"' % elem.tail)
    for name, value in sorted(elem.attrib.items()):
        print('  %-4s = "%s"' % (name, value))
    # Blank line between elements.
    print('')

コード例 #9

0

ファイルを表示

ファイル: nordea.py プロジェクト: scred/django-payments

    def refund(self, payment):
        """
        Refunds one Nordea payment in full.

        Note that the refund operation executes another payment at the
        bank and that those two payments are not explicitly linked. As
        a result, if you query the original payment after refund, OK
        is still returned.

        More properly implemented refunds would actually return an
        another Payment instance that describes the refund.

        FIXME: Refund works in the sense that it executes the
        refund. However, the response MAC calculation fails and the
        calculated MAC doesn't match the returned one. No idea why
        this is. As a result an exception is thrown even though
        everything went a ok.

        Args:
            payment: object whose ``get_value`` supplies the "code",
                "amount" and "currency" values of the payment to refund.

        Returns:
            dict mapping every ``SOLOPMT_*`` element tag found in the parsed
            response to its text.

        Raises:
            KeyError: if the parsed response has no ``SOLOPMT_MAC`` element.
            PaymentInvalidMacError: if the locally computed MAC differs from
                the ``SOLOPMT_MAC`` value in the response.
        """

        # FIXME: we need for parameters flexible forward and backwards
        # parameter marshalling methods

        print "refund() called"
        
        # Request fields posted to the refund URL.
        data = {}

        data["SOLOPMT_VERSION"] = "0001"
        data["SOLOPMT_TIMESTMP"] = "199911161024590003" # FIXME
        data["SOLOPMT_RCV_ID"] = self.get_setting("merchant_key")
        #data["SOLOPMT_LANGUAGE"] = payment.get_value("language")
        data["SOLOPMT_LANGUAGE"] = "1" # FIXME
        data["SOLOPMT_RESPTYPE"] = "xml" # "html" or "xml"
        #data["SOLOPMT_RESPDATA"] = "http://158.233.9.9/hsmok.htm" # FIXME
        data["SOLOPMT_RESPDATA"] = "text/xml" # FIXME
        #data["SOLOPMT_RESPDETL"] = "Y"
        data["SOLOPMT_STAMP"] = payment.get_value("code")
        #data["SOLOPMT_REF"] = payment.get_value("fi_reference")
        data["SOLOPMT_AMOUNT"] = payment.get_value("amount")
        data["SOLOPMT_CUR"] = payment.get_value("currency")
        #data["SOLOPMT_REF2"] = returned payment ref
        data["SOLOPMT_KEYVERS"] = "0001" # FIXME
        data["SOLOPMT_ALG"] = "01"

        # Order in which field values are concatenated for the request MAC.
        # Names without a matching entry in `data` are skipped by the loop
        # below; "secret" stands for the merchant secret.
        order = ("VERSION", "TIMESTMP", "RCV_ID", "LANGUAGE", "RESPTYPE",
                 "RESPDATA", "RESPDETL", "STAMP", "REF", "AMOUNT", "CUR",
                 "REF2", "KEYVERS", "ALG", "secret")

#        order = ("VERSION", "TIMESTMP", "RCV_ID", "LANGUAGE", "RESPTYPE",
#                 "RESPDATA", "RESPDETL", "STAMP",
#                 #"REF", "AMOUNT", "CUR", 
#                 "KEYVERS", "ALG", "secret")

        # Build the MAC input: each present value followed by "&".
        s = ""
        for p in order:
            if p == "secret":
                s += self.get_setting("merchant_secret") + "&"
            else:
                key = "SOLOPMT_%s" % p
                if key in data:
                    s += data[key] + "&"

        #print "MAC:", s
        #print "WANT:", "0001&199911161024590001&12345678&1&html&http://158.233.9.9/hsmok.htm&Y&501&0001&01&LEHTI&"

        # The request MAC is the upper-case MD5 hex digest of that string.
        import md5
        m = md5.new(s)
        data["SOLOPMT_MAC"] = m.hexdigest().upper()

        #print "pp.get_query_form() called"

        print "MAC-0:", data["SOLOPMT_MAC"]

        import urllib2
        from urllib import urlencode
        
        # Send the url-encoded fields to the refund URL and read the reply.
        con = urllib2.urlopen(self.REFUND_URL, urlencode(data))
        resp = con.read()
        #print resp

        # NOTE(review): with the condition hard-wired to False, the reply
        # just read is discarded and replaced by the contents of the local
        # file "nordea.xml". This looks like a debugging fixture; possibly
        # it is related to the MAC mismatch described in the docstring --
        # confirm before relying on this method.
        if False:
            fh = open("nordea.xml", "w")
            fh.write(resp)
            fh.close()
        else:
            fh = open("nordea.xml", "r")
            resp = fh.read()
            fh.close()

        #print resp

        # https:///cgi-bin/SOLOPM10
        #http = httplib.HTTPSConnection("solo3.nordea.fi", 443)
        #http.putrequest("POST", "/cgi-bin/SOLOPM10")

        # SubElement is imported but not used in this method.
        from xml.etree.ElementTree import XML, SubElement

        # Response fields, in the order their values are concatenated for
        # the response MAC.
        REFUND_RESP_PARAMS = (
            "SOLOPMT_VERSION",
            "SOLOPMT_TIMESTMP",
            "SOLOPMT_RCV_ID",
            "SOLOPMT_RESPCODE",
            "SOLOPMT_STAMP",
            "SOLOPMT_RCV_ACCOUNT",
            "SOLOPMT_REF",
            "SOLOPMT_DATE",
            "SOLOPMT_AMOUNT",
            "SOLOPMT_PAID",
            "SOLOPMT_CUR", 
            "SOLOPMT_STATUS",
            "SOLOPMT_KEYVERS",
            "SOLOPMT_ALG",
        )

        # Collect the text of every SOLOPMT_* element of the response.
        macs = ""
        respdata = {}
        xml = XML(resp)
        for e in xml.getiterator():
            if e.tag.startswith("SOLOPMT_"):
                respdata[e.tag] = e.text

        # Concatenate the values of the fields that are present, in
        # REFUND_RESP_PARAMS order, then append the merchant secret.
        for p in REFUND_RESP_PARAMS:
            if p in respdata:
                macs += respdata[p] + "&"
                print "USE:", p
            else:
                print "NOT:", p
            # print "%s = %s" % (e.tag, e.text)
        macs += self.get_setting("merchant_secret") + "&"

        # Raises KeyError when the response carries no SOLOPMT_MAC element.
        respmac = respdata["SOLOPMT_MAC"]

        # Second import of md5 in this method; it was already imported above.
        import md5
        m = md5.new(macs)
        print "macs", macs
        print "MAC-A", m.hexdigest().upper()
        print "MAC-B", respmac

        from exceptions import PaymentInvalidMacError

        # Reject the response when the computed and returned MACs differ.
        if respmac != m.hexdigest().upper():
            raise PaymentInvalidMacError("Response MAC is invalid.")

        return respdata

コード例 #10

0

ファイルを表示

ファイル: nordea.py プロジェクト: scred/django-payments

    def query(self, payment):
        """
        Run the automated (XML) status query for one Nordea payment.

        The Nordea query interface has two modes, selected by RESPTYPE.
        With 'html' a human operator posts a form and gets back an HTML
        page with the payment details; if RESPDATA is set, that page also
        carries a form that posts the details to the given URL. With 'xml'
        the post returns an XML document that can be processed freely.

        Only the 'xml' mode is implemented here; it is meant for background
        queries.

        Returns a ``(ok, fields)`` tuple: ``ok`` is True when the response's
        SOLOPMT_RESPCODE is "OK", and ``fields`` maps each recognised
        response field to its text. Raises PaymentInvalidMacError when the
        MAC computed from the response differs from the one it carries.
        """

        # FIXME: we need for parameters flexible forward and backwards
        # parameter marshalling methods

        import urllib2
        from urllib import urlencode
        import md5
        from xml.etree.ElementTree import XML

        # Request fields, inserted one by one in this fixed order.
        request = {}
        for field, value in (
            ("SOLOPMT_VERSION", "0001"),
            ("SOLOPMT_TIMESTMP", "199911161024590001"),  # FIXME
            ("SOLOPMT_RCV_ID", self.get_setting("merchant_key")),
            ("SOLOPMT_LANGUAGE", "1"),  # FIXME
            ("SOLOPMT_RESPTYPE", "xml"),  # "html" or "xml"
            ("SOLOPMT_RESPDATA", "text/xml"),  # FIXME
            ("SOLOPMT_STAMP", payment.get_value("code")),
            ("SOLOPMT_KEYVERS", "0001"),  # FIXME
            ("SOLOPMT_ALG", "01"),
        ):
            request[field] = value

        # The request MAC covers the fields that are present, in this order,
        # each followed by "&", with the merchant secret appended last.
        mac_order = ("VERSION", "TIMESTMP", "RCV_ID", "LANGUAGE", "RESPTYPE",
                     "RESPDATA", "RESPDETL", "STAMP", "REF", "AMOUNT", "CUR",
                     "KEYVERS", "ALG")
        pieces = []
        for suffix in mac_order:
            name = "SOLOPMT_%s" % suffix
            if name in request:
                pieces.append(request[name] + "&")
        pieces.append(self.get_setting("merchant_secret") + "&")
        request["SOLOPMT_MAC"] = md5.new("".join(pieces)).hexdigest().upper()

        body = urllib2.urlopen(self.QUERY_URL, urlencode(request)).read()

        QUERY_RESP_PARAMS = (
            "SOLOPMT_VERSION",
            "SOLOPMT_TIMESTMP",
            "SOLOPMT_RCV_ID",
            "SOLOPMT_RESPCODE",
            "SOLOPMT_STAMP",
            "SOLOPMT_RCV_ACCOUNT",
            "SOLOPMT_REF",
            "SOLOPMT_DATE",
            "SOLOPMT_AMOUNT",
            "SOLOPMT_CUR",
            "SOLOPMT_PAID",
            "SOLOPMT_STATUS",
            "SOLOPMT_KEYVERS",
            "SOLOPMT_ALG",
        )

        # Walk the response in document order: recognised fields feed both
        # the result dict and the MAC input; the MAC element is remembered.
        signed = []
        fields = {}
        received_mac = ""
        for node in XML(body).getiterator():
            tag = node.tag
            if tag in QUERY_RESP_PARAMS:
                signed.append(node.text + "&")
                fields[tag] = node.text
            if tag == "SOLOPMT_MAC":
                received_mac = node.text
        signed.append(self.get_setting("merchant_secret") + "&")

        expected_mac = md5.new("".join(signed)).hexdigest().upper()
        if received_mac != expected_mac:
            raise PaymentInvalidMacError("Response MAC is invalid.")

        return (fields["SOLOPMT_RESPCODE"] == "OK", fields)

コード例 #11

0

ファイルを表示

ファイル: toolkit.py プロジェクト: eachanjohnson/labnoteify

 def htmlify(self):
     """Render this file as an HTML fragment chosen by ``self.type``.

     Branches:
         image: an ``<img>`` tag for ``self.filename``; files whose name
             contains both '.noteify' and '.png' yield an empty string.
         code: the file contents inside a ``<pre><code>`` block.
         PDF: renders the PDF to PNG unless a rendered file already
             exists, then emits one ``<img>`` per matching PNG.  This
             branch rebinds ``self.filename`` to the sorted list of those
             PNG names.
         Markdown: output of ``markdown.Converter().convert``.
         Powerpoint PPTX: exports the presentation to PNG through COM;
             the returned HTML stays empty.
         Excel XLSX: dumps each sheet to '.xltemp.csv' and converts that
             file with ``self.csv_to_html``.
         Word DOCX: one ``<p>`` per non-empty paragraph.
         CSV: ``self.csv_to_html`` on the file itself.
         anything else: an empty string.

     Returns:
         str: the HTML fragment (possibly empty).
     """
     #print 'HTMLifying ', self.path
     html = ''
     if self.type == 'image':
         # presumably the '.noteify' PNGs are the pages produced by the PDF
         # branch and are emitted there -- TODO confirm
         if '.noteify' in self.filename and '.png' in self.filename:
             html = ''
         else:
             html = '<img src="../img/{}" height="750" />'.format(self.filename)
     elif self.type == 'code':
         # Close the source file deterministically instead of leaking the
         # handle.
         with open(self.path, 'rU') as source:
             html = '<pre style="background-color:rgba(0, 0, 255, 0.2);"><code>{}</code></pre>'.format(
                 source.read()
             )
     elif self.type == 'PDF':
         from wand.image import Image
         import os
         filename_root = self.filename.split('.pdf')[0]
         new_filename = filename_root + '.noteify.png'
         # Directory prefix of the PDF; computed before self.filename is
         # rebound below.
         directory = self.path.split(self.filename)[0]
         already_rendered = (
             os.path.exists(directory + new_filename)
             or os.path.exists(directory + filename_root + '-1.png')
         )
         if not already_rendered:
             with Image(filename=self.path, resolution=300) as img:
                 try:
                     img.save(filename=directory + new_filename)
                 except Exception:
                     # Best-effort rendering: a failed save just means no
                     # images are listed below.  Unlike a bare except this
                     # lets KeyboardInterrupt/SystemExit propagate.
                     pass
         self.filename = sorted([filename for filename in os.listdir(directory)
                          if filename_root in filename and '.noteify' in filename and '.png' in filename])
         for filename in self.filename:
             html += '<img src="../pdf/{}" height="750" />'.format(filename)
     elif self.type == 'Markdown':
         import markdown
         converter = markdown.Converter()
         html = converter.convert(self.path)
     elif self.type == 'Powerpoint PPTX':
         from comtypes import client
         powerpoint = client.CreateObject('Powerpoint.Application')
         powerpoint.Presentations.Open(self.path)
         powerpoint.ActivePresentation.Export(self.path, 'PNG')
         powerpoint.ActivePresentation.Close()
         powerpoint.Quit()
     elif self.type == 'Excel XLSX':
         import xlrd
         import csv
         try:
             workbook = xlrd.open_workbook(self.path)
         except xlrd.biffh.XLRDError:
             html = ''
         else:
             sheet_names = workbook.sheet_names()
             # NOTE(review): every sheet overwrites the same temp file, so
             # only the last sheet reaches csv_to_html -- confirm intent.
             for sheet_name in sheet_names:
                 sheet = workbook.sheet_by_name(sheet_name)
                 with open('.xltemp.csv', 'w') as f:
                     c = csv.writer(f, quoting=csv.QUOTE_ALL)
                     for rownum in xrange(sheet.nrows):
                         try:
                             c.writerow(sheet.row_values(rownum))
                         except UnicodeEncodeError:
                             c.writerow([''])
             html = self.csv_to_html(filename='.xltemp.csv')
     elif self.type == 'Word DOCX':
         from xml.etree.ElementTree import XML
         import zipfile
         WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
         PARA = WORD_NAMESPACE + 'p'
         TEXT = WORD_NAMESPACE + 't'
         # The context manager closes the archive even if read() raises.
         with zipfile.ZipFile(self.path) as docx:
             xml_content = docx.read('word/document.xml')
         tree = XML(xml_content)
         html = '<p>'
         paragraphs = []
         # Element.iter() replaces getiterator(), which was removed in
         # Python 3.9.
         for paragraph in tree.iter(PARA):
             texts = [node.text
                      for node in paragraph.iter(TEXT)
                      if node.text]
             if texts:
                 paragraphs.append(''.join(texts))
         html += '</p><p>'.join(paragraphs)
         html += '</p>'
     elif self.type == 'CSV':
         html = self.csv_to_html(filename=self.path)
     else:
         html = ''
     return html