Ejemplo n.º 1
0
def storeRecordWithID (record, ID, collectedRecords):
	"""Register a record under its ID and emit it in the configured formats.

	The record is always stored in ``collectedRecords``. Depending on the
	entries of the global ``config.format`` it is additionally written to
	an XML file, to a JSON file, or -- when no format is configured --
	printed to standard output. Progress information is written to
	standard output as well.

	Args:
		record: element that can be serialised with ``ET.tounicode``.
		ID: identifier of the record; used as dictionary key, for building
			the file paths (via ``pathForID``) and as ``_id`` of the JSON.
		collectedRecords: dictionary that is updated in place.
	"""
	global config

	sys.stdout.write("ID: " + str(ID) + u"… ")

	collectedRecords[ID] = record

	# Write XML file for record.
	if 'xml' in config.format:
		filePath = pathForID(ID, 'xml')
		XMLString = ET.tounicode(record).encode('UTF-8')
		# The context manager closes the file even if writing fails.
		with open(filePath, 'w') as XMLFile:
			XMLFile.write(XMLString)
		sys.stdout.write(' ./' + filePath)

	# Convert to JSON and write file.
	if 'json' in config.format:
		JSONInternal = elem_to_internal(record, strip=1)
		# Unwrap a single top-level entry so its content becomes the document.
		if len(JSONInternal) == 1:
			JSONInternal = JSONInternal.values()[0]
		JSONInternal['_id'] = ID
		filePath = pathForID(ID, 'json')
		JSONString = simplejson.dumps(JSONInternal)
		with open(filePath, "w") as JSONFile:
			JSONFile.write(JSONString)
		sys.stdout.write(' ./' + filePath)

	# If no format is given, print the record.
	if not config.format:
		print(ET.tounicode(record))

	print("")
Ejemplo n.º 2
0
 def testFilteringStyle(self):
     """Print the result of running ``filter_style`` over a sample document.

     The sample is an image-heavy HTML fragment whose elements carry inline
     ``style`` attributes. The method makes no assertions; the filtered
     markup is only printed.
     """
     html = u"""
         <div style="color: #ffff00;"><p style="float: left;"><img src="http://cdnimg.visualizeus.com/thumbs/3e/37/hairstyles-3e37929b6847d0216b0aabe296ed9a76_h.jpg?ts=93246" alt="" width="248" height="400" style="width: 500px; color: blue;"><a href="http://www.hairstyles123.com/hairstylepics/faces/hairstyles_for_heart_shaped_faces/heart_shaped_faces_hairstyle_4.jpg" class="clb cboxElement"><img src="http://www.hairstyles123.com/hairstylepics/faces/hairstyles_for_heart_shaped_faces/heart_shaped_faces_hairstyle_4.jpg" alt="" width="400" height="544"></a><a href="http://womeninfashion.net/wp-content/uploads/2013/11/heart-face-shape-hairstyles-jennifer-aniston.jpg" class="clb cboxElement"><img src="http://womeninfashion.net/wp-content/uploads/2013/11/heart-face-shape-hairstyles-jennifer-aniston.jpg" alt="" width="420" height="560"></a><a href="http://thisgirlscity.com/wp-content/uploads/2013/07/reese.jpg" class="clb cboxElement"><img src="http://thisgirlscity.com/wp-content/uploads/2013/07/reese.jpg" alt="" width="420" height="560"></a><a href="http://images.beautyriot.com/photos/200/hairstyles_heart_shape_face-200.jpg" class="clb cboxElement"><img src="http://images.beautyriot.com/photos/200/hairstyles_heart_shape_face-200.jpg" alt="" width="200" height="272"></a></p>
         <p><a href="http://www.youbeauty.com/p/482031/thumbnail/entry_id/0_hmc2pi25/width/0/height/0/quality/90" class="clb cboxElement"><img src="http://www.youbeauty.com/p/482031/thumbnail/entry_id/0_hmc2pi25/width/0/height/0/quality/90" alt="" width="200" height="290"></a><a href="http://www.hairstyles123.com/hairstylepics/faces/hairstyles_for_heart_shaped_faces/heart_shaped_faces_hairstyle_11.jpg" class="clb cboxElement"><img src="http://www.hairstyles123.com/hairstylepics/faces/hairstyles_for_heart_shaped_faces/heart_shaped_faces_hairstyle_11.jpg" alt="" width="520" height="780"></a><a href="http://www.allure.com/images/hair-ideas/2012/05/heart-face-shape-hairstyles-reese-witherspoon.jpg" class="clb cboxElement"><img src="http://www.allure.com/images/hair-ideas/2012/05/heart-face-shape-hairstyles-reese-witherspoon.jpg" alt="" width="420" height="560"></a><a href="http://slodive.com/wp-content/uploads/2012/03/hairstyles-for-heart-shaped-faces/heart-shape-glasses.jpg" class="clb cboxElement"><img src="http://slodive.com/wp-content/uploads/2012/03/hairstyles-for-heart-shaped-faces/heart-shape-glasses.jpg" alt=""></a><a href="http://www.beautifulhairstyle.net/wp-content/uploads/2014/02/Long-Hairstyles-For-Heart-Shaped-Faces.jpg" class="clb cboxElement"><img src="http://www.beautifulhairstyle.net/wp-content/uploads/2014/02/Long-Hairstyles-For-Heart-Shaped-Faces.jpg" alt=""></a><a href="http://beautyhairtotoe.com/wp-content/uploads/2013/08/rby-heart-shaped-reese-marked-mdn.jpg" class="clb cboxElement"><img src="http://beautyhairtotoe.com/wp-content/uploads/2013/08/rby-heart-shaped-reese-marked-mdn.jpg" alt=""></a><a href="http://www.prettydesigns.com/wp-content/uploads/2013/09/Hairstyle-for-Oval-shaped-Women.jpg" class="clb cboxElement"><img src="http://www.prettydesigns.com/wp-content/uploads/2013/09/Hairstyle-for-Oval-shaped-Women.jpg" alt="" width="550" height="775"></a><a 
href="http://www.hairnext.com/wp-content/uploads/2014/05/Heart-Shaped-Face-Best-Short-Bangs-Hairstyle-For-Fine-Hair.jpg" class="clb cboxElement"><img src="http://www.hairnext.com/wp-content/uploads/2014/05/Heart-Shaped-Face-Best-Short-Bangs-Hairstyle-For-Fine-Hair.jpg" alt="Heart Shaped Face Best Short Bangs Hairstyle For Fine Hair"></a><a href="http://www.hairnext.com/wp-content/uploads/2014/05/Short-Bob-Side-Swept-For-Long-Face-Shape.jpg" class="clb cboxElement"><img src="http://www.hairnext.com/wp-content/uploads/2014/05/Short-Bob-Side-Swept-For-Long-Face-Shape.jpg" alt="Short Bob Side Swept  For Long Face Shape"></a></p>
         <p>&nbsp; <img src="http://www.hairnext.com/wp-content/uploads/2014/05/Short-blonde-Curly-hairstyle.jpg" alt="Short blonde Curly hairstyle:"></p></div>
         """
     # No assertion is made: the filtered markup is printed for inspection.
     print etree.tounicode(filter_style(fromstring(html)))
Ejemplo n.º 3
0
    def htmltopost(self, html, pagetime):
        """Build a ``post.Post`` from the node holding a post's content.

        Comments are removed from ``html`` first. The post number, post id,
        title, poster name and timestamp are looked up relative to that node
        with XPath expressions.

        Args:
            html: lxml element containing the post body.
            pagetime: reference time handed to ``misc.parseitemtime`` together
                with the time string found on the page.

        Returns:
            The ``post.Post`` built from the extracted values. When the page
            has no post-number link, ``postnumber`` is ``0`` and ``postid``
            is ``None``.

        Raises:
            IndexError: if the title, poster or time node cannot be found.
        """
        self.removecomments(html)
        c = etree.tounicode(html, method='html', pretty_print=True)

        # Defaults used when the page carries no post-number link. Without
        # the ``postid`` default the Post construction below would fail with
        # an UnboundLocalError.
        postnumber = 0
        postid = None
        postnumbernode = html.xpath(r"../../../tr[1]/td[2]/a[last()]")
        if postnumbernode:
            postnumber = int(etree.tounicode(postnumbernode[-1], method="text"))
            # The post id is the ``p`` query parameter of the link target.
            postlinknode = postnumbernode[-1].attrib['href']
            parsed = urlparse(postlinknode)
            postid = int(parse_qs(parsed.query)['p'][0])

        titlenode = html.xpath(r"../div[@class='smallfont']/strong")
        title = etree.tounicode(titlenode[-1], method="text").strip()

        posternode = html.xpath(r"../../td[1]/div/a[starts-with(@class,'bigusername')]")
        poster = etree.tounicode(posternode[-1], method="text").strip()

        timenode = html.xpath(r"../../../tr[1]/td[1]")
        timestring = etree.tounicode(timenode[-1], method="text").strip()
        ts = misc.parseitemtime(pagetime, timestring)

        p = post.Post(content=c, postnumber=postnumber, title=title, postername=poster,
                      postid=postid, ts=ts)
        print(postnumber, postid, poster, title)
        return p
Ejemplo n.º 4
0
def main():
    """Command-line entry point: extend the vcard of a LOM into an xcard.

    The input (a file path or a URL) is run through the extraction
    stylesheet and then through the duplication stylesheet. The result is
    written either as pretty-printed XML (``rawxml``) or, for any other
    format, parsed as RDF/XML and re-serialised in the requested format.
    """
    extensions = {(URL_MLR_EXT, 'vcard_uuid'): utils.vcard_uuid}
    converterExtract = XMLTransform(STYLESHEET_EXTRACT, extensions)
    converterDup = XMLTransform(STYLESHEET_DUP)
    parser = argparse.ArgumentParser(
        description='Extend the vcard of a lom into a xcard')
    parser.add_argument('-f', '--format', default='rawxml',
                        help="output format: one of 'rawxml', 'xml', 'n3',"
                             " 'turtle', 'nt', 'pretty-xml', 'trix'")
    parser.add_argument('-o', '--output', help="Output file",
                        type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('infile', help="input file or url", nargs="?")
    converterExtract.populate_argparser(parser)
    args = parser.parse_args()
    # ``infile`` is declared optional, but nothing below works without it:
    # report a usage error instead of failing inside urlparse()/open().
    if args.infile is None:
        parser.error('an input file or url is required')
    converterExtract.set_options_from_dict(vars(args))

    # Anything with a URL scheme is fetched, everything else is a local path.
    if urlparse(args.infile).scheme:
        opener = urlopen
    else:
        opener = open

    with opener(args.infile) as infile:
        xml = converterExtract.convertfile(infile)
    if xml:
        xml = converterDup.convertxml(xml)
    if xml:
        if args.format == "rawxml":
            args.output.write(etree.tounicode(xml, pretty_print=True).encode('utf-8'))
        else:
            rdf = Graph().parse(data=etree.tounicode(xml), format="xml")
            if rdf:
                args.output.write(rdf.serialize(format=args.format, encoding='utf-8'))
    # Only close files this function opened; sys.stdout belongs to the process.
    if args.output is sys.stdout:
        args.output.flush()
    else:
        args.output.close()
Ejemplo n.º 5
0
 def create_xml(self, p):
     """Append the entries of ``p`` as a new ``<item>`` to ``log.xml``.

     The log file is created with an empty ``<searchedItems>`` root when it
     does not exist yet. Every key/value pair of ``p`` becomes a child
     element of the new item; keys containing ``buyNow`` or ``bid`` are
     nested inside a shared ``<price>`` element instead. ``p`` is emptied
     in the process, and the tree is printed before each entry is added.
     """
     log_path = "log.xml"
     if not os.path.isfile(log_path):
         # No log yet: seed the file with just the root element.
         empty_root = etree.Element("searchedItems")
         with open(log_path, mode="w", encoding="utf-8") as handle:
             handle.write(etree.tounicode(empty_root, pretty_print=True))

     tree = etree.parse(log_path)
     item = etree.SubElement(tree.getroot(), "item", attrib={"id": p["id"]})

     while p:
         print(etree.tounicode(tree, pretty_print=True))
         key, value = p.popitem()
         if "buyNow" in key or "bid" in key:
             # Buy-now and bid prices live under one nested <price> element.
             if item.find("price") is None:
                 price = etree.SubElement(item, "price")
             container = price
         else:
             # Everything else hangs directly off the item.
             container = item
         field = etree.SubElement(container, key)
         field.text = value

     with open(log_path, mode="w", encoding="utf-8") as handle:
         handle.write(etree.tounicode(tree, pretty_print=True))
Ejemplo n.º 6
0
 def serialized(self, stripped=True):
     """
     Return the definition node serialized as a string.

     With ``stripped`` true (the default) the result of
     ``self.node_stripped()`` is serialized, otherwise ``self.node``.
     """
     source = self.node_stripped() if stripped else self.node
     return etree.tounicode(source)
Ejemplo n.º 7
0
def get_single_content(element, data_type):
    """Return the content of ``element`` rendered according to ``data_type``.

    String-like values (plain strings and lxml string results) are returned
    unchanged. For elements, ``'text'`` yields the stripped text content and
    ``'html'`` the stripped, pretty-printed markup; any other ``data_type``
    yields ``None``.
    """
    already_text = (
        isinstance(element, basestring)
        or isinstance(element, etree._ElementStringResult)
        or isinstance(element, etree._ElementUnicodeResult)
    )
    if already_text:
        return element

    if data_type == 'text':
        rendered = etree.tounicode(element, method='text')
        return rendered.strip()
    if data_type == 'html':
        rendered = etree.tounicode(element, pretty_print=True)
        return rendered.strip()
    return None
Ejemplo n.º 8
0
def main ():
	"""Download all records matching the configured SRU query in chunks.

	The global ``config`` supplies the SRU base URL, record schema, query,
	chunk size and the XSL transformations. Each response is parsed, every
	record is run through all transformations and stored with
	``storeRecordWithID``; the records of one response are finally passed
	to ``storeBatches``. Requests continue until a response contains no
	records or the reported number of records has been reached.
	"""
	global config

	loadXSLs()

	SRUParameters = [
		'operation=searchRetrieve',
		'version=1.1',
		'recordPacking=xml',
		'recordSchema=' + urllib.quote(config.schema),
		'maximumRecords=' + str(config.chunksize),
		'query=' + urllib.quote(config.query),
	]
	SRUBaseURL = config.url + '?' + '&'.join(SRUParameters)

	recordCount = 1
	done = False

	while not done:
		firstRecord = recordCount
		SRUURL = SRUBaseURL + '&' + 'startRecord=' + str(recordCount)
		print(SRUURL)
		SRUResponse = urllib.urlopen(SRUURL).read()

		XML = ET.fromstring(SRUResponse)
		records = XML.findall('.//{http://www.loc.gov/zing/srw/}recordData/*')
		numberOfRecords = XML.findall('.//{http://www.loc.gov/zing/srw/}numberOfRecords')
		resultCount = 0
		if len(numberOfRecords) > 0:
			resultCount = int(numberOfRecords[0].text)

		# A chunk starting at recordCount ends at recordCount + chunksize - 1.
		lastRecord = min(recordCount + config.chunksize - 1, resultCount)
		print(u"Loaded " + str(len(records)) + " records: " + str(recordCount) + "-" + str(lastRecord) + " of " + str(resultCount))

		collectedRecords = {}
		for record in records:
			ID = recordID(record, recordCount)

			# Transform record. Stop at the first transformation without a
			# result instead of feeding None into the next stylesheet.
			transformed = record
			for XSL in config.XSLs:
				transformed = XSL(transformed).getroot()
				if transformed is None:
					break

			if transformed is None:
				# Show the untransformed record; the failed result is None
				# and cannot be serialised.
				print(u"Record transformation failed for ID »%s«" % (ID,))
				print(ET.tounicode(record))
			else:
				storeRecordWithID(transformed, ID, collectedRecords)

			recordCount += 1

		storeBatches(collectedRecords, firstRecord)

		done = (len(records) == 0 or recordCount > resultCount)
Ejemplo n.º 9
0
 def full_description(self, url):
     """Fetch ``url`` and extract image, description and technical details.

     Returns a dictionary with the keys ``image``, ``desc`` and ``tech``
     taken from the first ``div#item-full`` element, or an empty dictionary
     (after printing a message) when one of the expected parts is missing.
     """
     page = etree.parse(retrieve(url=url), self.parser)
     candidates = page.xpath("//div[@id='item-full']")
     assert len(candidates), _(u'No elements found.')
     for block in candidates:
         try:
             image = block.xpath(".//div[@class='item-pic']//img/@src")[0]
             desc = tounicode(block.xpath(".//div[@id='item-details']/node()")[0])
             tech = tounicode(block.xpath(".//div[@id='item-tech']/node()")[0])
         except IndexError:
             print('Bad structure in %s !' % url)
             return {}
         # Only the first matching block is ever evaluated.
         return {
             'image': image,
             'desc': desc,
             'tech': tech,
             }
Ejemplo n.º 10
0
 def comment_stripped_text(self):
     """
     Return the plain text of the node with comments removed.

     When the serialized node contains ``<cm``, it is passed through
     ``COMMENT_STRIPPER`` and re-parsed; if the edited markup is not
     well-formed, the original node is used instead.
     """
     markup = etree.tounicode(self.node)
     text_source = None
     if '<cm' in markup:
         cleaned = COMMENT_STRIPPER.edit(markup)
         try:
             text_source = etree.XML(cleaned)
         except etree.XMLSyntaxError:
             # Fall back to the untouched node below.
             text_source = None
     if text_source is None:
         text_source = self.node
     return etree.tounicode(text_source, method='text') or ''
Ejemplo n.º 11
0
 def success(self, template_name, transaction_number):
     """Render the success page of a sale for a completed transaction.

     Responds with a 404 when ``transaction_number`` is not present in
     ``g.success_data``. Otherwise the cached values are written into the
     elements of the template's success page whose id matches the key, the
     cached mailer gets to adjust the e-mail notice, and the page is
     returned as a string prefixed with ``XHTML11_DTD``.
     """
     # Unknown transaction numbers are answered with a 404.
     if transaction_number not in g.success_data:
         return abort(
             status_code=404,
             comment='Transaction number expired or invalid.',
             )
     # Work on a copy: the mailer entry is popped from it below.
     cached = g.success_data[transaction_number].copy()
     template = SaleTemplate(template_name)
     page = template.success_xml()
     self._apply_commerce_notice(page)
     # The mailer is an object, not a text value for the page.
     mailer = cached.pop('mailer_instance')
     for element_id, text in cached.items():
         for node in CSSSelector('#' + element_id)(page):
             node.text = text
     # Give the mailer the chance to manipulate its notice.
     for node in CSSSelector('#simplsale-email-notice')(page):
         mailer.apply_notice(node)
     return XHTML11_DTD + tounicode(page, method='html')
Ejemplo n.º 12
0
    def tag_words_in(cls, elem, tag='w'):
        """Return a copy of ``elem`` with every word wrapped in a ``tag`` element.

        A word is a run of non-whitespace characters in the text or tail of
        any element. The copy is made by serialising and re-parsing, so
        ``elem`` itself is left untouched.

        Words are first marked with ``{tag}...{/tag}`` placeholders, which
        are turned into real markup on the serialised string.
        NOTE(review): text that already contains such placeholders would be
        converted as well -- confirm that inputs never do.
        """
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        word_pattern = re.compile(r"([^\s]+)")
        placeholder = r'{%s}\1{/%s}' % (tag, tag)
        # Tags whose subtree is skipped (their tail is still processed).
        omit_elems = []

        def tag_words(e):
            e.text = word_pattern.sub(placeholder, e.text or '')
            for ch in e:
                if ch.tag not in omit_elems:
                    tag_words(ch)
                ch.tail = word_pattern.sub(placeholder, ch.tail or '')

        new_elem = XML.fromstring(etree.tounicode(elem))
        tag_words(new_elem)
        s = etree.tounicode(new_elem)
        s = s.replace('{%s}' % tag, '<%s>' % tag).replace('{/%s}' % tag, '</%s>' % tag)
        return XML.fromstring(s)
Ejemplo n.º 13
0
    def __init__(self, datapath):
        """Set up a test client for one WMS source and load its capabilities.

        Builds the WSGI test app, the base pydap configuration and the base
        query dictionaries, then issues a GetCapabilities request and parses
        the answer with ``WebMapService``. If parsing fails, the path and the
        pretty-printed capabilities document are printed before the error is
        re-raised.
        """
        self.datapath = datapath
        self.handler = webtest.TestApp(NetCDFHandler(datapath))
        self.path_info = '/' + datapath + '.wms'

        # Directory containing this file; the colour and style files live here.
        here = os.path.dirname(os.path.realpath(__file__))

        wms_config = {
            'pydap.responses.wms.fill_method': 'contourf',
            'pydap.responses.wms.paletted': True,
            'pydap.responses.wms.allow_eval': True,
            'pydap.responses.wms.colorfile': here + '/colors.json',
            'pydap.responses.wms.styles_file': here + '/styles.json',
            'pydap.responses.wms.max_age': 600,
            'pydap.responses.wms.s_maxage': 93600,
            'pydap.responses.wms.max_image_size': 16777216,
            'pydap.responses.wms.localCache': True,
            'pydap.responses.wms.redis': False,
            'pydap.responses.wms.redis.host': 'localhost',
            'pydap.responses.wms.redis.port': 6379,
            'pydap.responses.wms.redis.db': 0,
            'pydap.responses.wms.redis.redis_expiration_time': 604800,
            'pydap.responses.wms.redis.distributed_lock': True
        }
        self.base_env = {'pydap.config': wms_config}

        self.base_query_map = {
            'SERVICE': 'WMS',
            'REQUEST': 'GetMap',
            'VERSION': '1.3.0',
            'STYLES': '',
            'FORMAT': 'image/png',
            'TRANSPARENT': 'TRUE',
            'HEIGHT': 512,
            'WIDTH': 512,
            'BBOX': '-180.0,-90.0,180.0,90.0',
            'CRS': 'EPSG:4326',
        }
        self.base_query_cap = {
            'SERVICE': 'WMS',
            'REQUEST': 'GetCapabilities',
            'VERSION': '1.3.0',
        }

        # Fetch the capabilities document for this source.
        environ = self.base_env.copy()
        environ['QUERY_STRING'] = urllib.parse.urlencode(self.base_query_cap)
        response = self.get(params=self.base_query_cap,
                            extra_environ=environ, status=200)
        self.xml = response.normal_body
        try:
            self.wms = WebMapService(self.path_info, xml=self.xml,
                                     version='1.3.0')
        except:
            # Dump what the server sent to make the failure diagnosable.
            print('PATH_INFO', self.path_info)
            tree = etree.parse(io.BytesIO(self.xml),
                               etree.XMLParser(remove_blank_text=True))
            print('XML', etree.tounicode(tree, pretty_print=True))
            raise
def evaluateXPath(path, element):
    """Evaluate the XPath expression ``path`` against ``element``.

    ``xml.xpath`` is used when it can be imported; attribute nodes in an
    iterable result are replaced by their values. Otherwise lxml is used on
    a re-parsed serialisation of ``element`` and matching elements are
    converted back with ``parseXMLString``. Boolean results are returned
    as they are, other single results are wrapped in a list.
    """
    try:
        import xml.dom
        from xml.xpath import Evaluate
        result = Evaluate(path, element)
        if hasattr(result, '__iter__'):
            for index, node in enumerate(result):
                is_attribute = (isinstance(node, xml.dom.Node)
                                and node.nodeType == xml.dom.Node.ATTRIBUTE_NODE)
                if is_attribute:
                    result[index] = node.value
            return result
        if type(result) == bool:
            return result
        return [result]
    except ImportError:
        # Implementation for etree
        from lxml.etree import XPath, fromstring, tounicode
        serialized = toPrettyXML(element)
        matches = XPath(path).evaluate(fromstring(serialized))
        if hasattr(matches, '__iter__'):
            # Matches are either strings or elements; elements are converted
            # back through parseXMLString.
            return [
                match if isinstance(match, basestring)
                else parseXMLString(tounicode(match)).documentElement
                for match in matches
            ]
        if type(matches) == bool:
            return matches
        return [matches]
Ejemplo n.º 15
0
    def __init__(self, tracks, filename=None):
        """Create a downloadable XSPF playlist response for ``tracks``.

        Every track contributes its title, its duration (``track.length``
        multiplied by 1000) and its file URL with ``https:`` replaced by
        ``http:``. ``filename`` is the name offered for the download and
        defaults to ``playlist.xspf``.
        """
        if filename is None:
            filename = 'playlist.xspf'

        root = etree.Element(
            'playlist',
            nsmap={None: 'http://xspf.org/ns/0/'},
            attrib={'version': '1'},
        )
        entries = etree.SubElement(root, 'trackList')

        for track in tracks:
            entry = etree.SubElement(entries, 'track')

            title_node = etree.SubElement(entry, 'title')
            title_node.text = track.get_parent_instance().title

            duration_node = etree.SubElement(entry, 'duration')
            duration_node.text = unicode(track.length * 1000)

            location_node = etree.SubElement(entry, 'location')
            location_node.text = track.file.url().replace('https:', 'http:')

        body = etree.tounicode(root)
        super(XSPFResponse, self).__init__(
            body,
            content_type='application/xspf+xml',
        )

        self['Content-Disposition'] = 'attachment; filename=%s' % filename
Ejemplo n.º 16
0
 def Element(cls, s, *args):
     """Build an element from the template ``s`` and its arguments.

     ``s`` and each of ``args`` may be a string or an lxml element; elements
     are serialised first. When ``args`` are given, they are interpolated
     into the template with the ``%`` operator before parsing.

     Returns:
         The element produced by ``XML.fromstring``.
     """
     def as_text(value):
         # isinstance (not an exact type comparison) so that subclasses of
         # etree._Element, such as HTML elements, are serialised as well.
         if isinstance(value, etree._Element):
             return etree.tounicode(value)
         return value

     template = as_text(s)
     if not args:
         return XML.fromstring(template)
     return XML.fromstring(template % tuple(as_text(arg) for arg in args))
Ejemplo n.º 17
0
    def serialized(self):
        """
        Serialize ``self.node`` and return the resulting string.

        Thin convenience wrapper around ``etree.tounicode()``.
        """
        node = self.node
        return etree.tounicode(node)
Ejemplo n.º 18
0
    def test_all_basic_feed_with_one_item(self):
        response = self.app.get('/results/all-basic.atom')
        root = etree.XML(response.content)
        xml_pretty = etree.tounicode(root, pretty_print=True)

        result_event = ResultEvent.objects.first()
        expected = '''<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-gb">
  <title>Election results from example.com</title>
  <link href="http://example.com/" rel="alternate"/>
  <link href="http://example.com/results/all-basic.atom" rel="self"/>
  <id>http://example.com/</id>
  <updated>{updated}</updated>
  <entry>
    <title>Tessa Jowell (Labour Party) won in Member of Parliament for Dulwich and West Norwood</title>
    <link href="http://example.com/#{item_id}" rel="alternate"/>
    <published>{updated}</published>
    <updated>{updated}</updated>
    <author>
      <name>john</name>
    </author>
    <id>http://example.com/#{item_id}</id>
    <summary type="html">A example.com volunteer recorded at {space_separated} that Tessa Jowell (Labour Party) won the ballot in Member of Parliament for Dulwich and West Norwood, quoting the source 'Seen on the BBC news').</summary>
  </entry>
</feed>
'''.format(
    updated=rfc3339_date(result_event.created),
    space_separated=result_event.created.strftime("%Y-%m-%d %H:%M:%S"),
    item_id=result_event.id,
)
        self.compare_xml(expected, xml_pretty)
Ejemplo n.º 19
0
 def remove_range(cls, elem, end_elem, delete_end=True):
     """Delete everything from ``elem`` up to ``end_elem``, including ``elem``.

     Siblings are removed one by one until ``end_elem`` itself or an
     element containing it is reached. When the siblings run out, removal
     continues after the parent; when an element contains ``end_elem``,
     its text is cleared and removal continues with its first child.

     Args:
         elem: first element to delete; ``None`` means nothing to delete.
         end_elem: element at which the removal stops.
         delete_end: if true, ``end_elem`` is removed as well (its tail is
             kept); otherwise it is left in place. The flag is passed on
             to the recursive calls so that it also applies when
             ``end_elem`` is reached in a later step.
     """
     parent = None
     while elem is not None and elem != end_elem and end_elem not in elem.xpath("descendant::*"):
         parent = elem.getparent()
         # Fetch the sibling before removal detaches elem from the tree.
         nxt = elem.getnext()
         parent.remove(elem)
         if DEBUG == True:
             print(etree.tounicode(elem))
         elem = nxt
     if elem == end_elem:
         if delete_end == True:
             cls.remove(end_elem, leave_tail=True)
     elif elem is None:
         if parent is None:
             # Entered without a start element (for example because the
             # caller's parent had no following sibling): nothing to do.
             return
         if parent.tail not in [None, '']:
             parent.tail = ''
         cls.remove_range(parent.getnext(), end_elem, delete_end=delete_end)
         XML.remove_if_empty(parent)
     elif end_elem in elem.xpath("descendant::*"):
         if DEBUG == True:
             print(elem.text)
         elem.text = ''
         cls.remove_range(elem.getchildren()[0], end_elem, delete_end=delete_end)
         XML.remove_if_empty(elem)
     else:
         print("LOGIC ERROR", file=sys.stderr)
Ejemplo n.º 20
0
def trans_entry(data):
    """Render one dictionary entry, given as XML, into a UTF-8 HTML page.

    The page consists of the header built from the entry title and the
    pronunciation metadata, the serialised result of ``_trans_assets``,
    the body produced by ``body2html`` and the closing tags.

    Args:
        data: XML source of the entry.

    Returns:
        The result of ``enc_utf8`` applied to the assembled page.

    Raises:
        Whatever ``et.fromstring`` raises for unparsable ``data``. Nothing
        below can work without a tree, so the parse error is reported
        directly instead of surfacing later as a NameError on ``root``.
    """
    r = []
    meta = {}

    root = et.fromstring(data)
    head = root.find('Head')

    # Title: headword plus the parts of speech, best effort.
    try:
        title = _get_text_nr(head.find('HWD/BASE'))
        poslist = head.findall('POS')
        if poslist:
            title += ' ({0})'.format(
                    ', '.join(_get_text_nr(pos) for pos in poslist))
    except Exception:
        title = ""

    # Pronunciation audio is optional: keep whatever could be extracted.
    try:
        pron_gb = head.find('Audio[@resource="GB_HWD_PRON"]')
        if pron_gb is not None:
            meta['gb_pron'] = pron_gb.get('topic').split('/')[-1]
        pron_us = head.find('Audio[@resource="US_HWD_PRON"]')
        if pron_us is not None:
            meta['us_pron'] = pron_us.get('topic').split('/')[-1]
    except Exception:
        pass

    r.append(_build_header(['entry'], title=title, meta=meta))

    r.append(et.tounicode(_trans_assets(root),
        pretty_print=True, method='html'))

    r.append(body2html(root))
    r.append('</body></html>')
    return enc_utf8(''.join(r))
Ejemplo n.º 21
0
def transform_misused_divs_into_paragraphs(doc):
    """Replace every div without a direct div child by a paragraph.

    Such a div is treated as a leaf: it is serialised, its opening and
    closing ``div`` are rewritten to ``p`` with regular expressions, and
    the re-parsed result takes the place of the original element.

    Returns the same ``doc`` that was passed in.
    """
    for div in doc.iter(tag='div'):
        has_nested_div = any(child.tag == 'div' for child in div.getchildren())
        if has_nested_div:
            continue

        LNODE.log(div, 1, 'Turning leaf <div> into <p>')
        markup = tounicode(div).strip()
        # Swap the first and the last tag name on the serialised form.
        markup = re.sub(r'^<\s*div', '<p', markup)
        markup = re.sub(r'div>$', 'p>', markup)
        div.getparent().replace(div, fromstring(markup))
    return doc
Ejemplo n.º 22
0
def build_section_by_section(sxs, fr_start_page, previous_label):
    """Given a list of xml nodes in the section by section analysis, pull
    out hierarchical data into a structure. Previous label is carried along to
    merge analyses of the same section.

    Returns a list of dictionaries with the keys ``page``, ``title``,
    ``paragraphs``, ``children`` and ``footnote_refs``; ``labels`` is added
    when the title yields labels that are not all a repeat of, or a
    backtrack from, the previous label."""
    structures = []
    while len(sxs):  # while sxs: is deprecated
        # The CFR part is the first dash-separated piece of the label.
        cfr_part = previous_label.split('-')[0]
        # The last returned value replaces sxs, which drives the loop.
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs, cfr_part)

        page = find_page(title, title.sourceline, fr_start_page)
        # Work on copies: the paragraphs are modified below.
        paragraph_xmls = [deepcopy(el) for el in text_els
                          if el.tag == 'P' or el.tag == 'FP']
        footnotes = []
        for p_idx, paragraph_xml in enumerate(paragraph_xmls):
            spaces_then_remove(paragraph_xml, 'PRTPAGE')
            spaces_then_remove(paragraph_xml, 'FTREF')
            swap_emphasis_tags(paragraph_xml)
            # Anything inside a SU can also be ignored
            for su in paragraph_xml.xpath('./SU'):
                su_text = etree.tounicode(su)
                # Record where the serialised SU occurs in the paragraph
                # string before the element is removed.
                footnotes.append({
                    'paragraph': p_idx,
                    'reference': su.text,
                    'offset': body_to_string(paragraph_xml).find(su_text)})
                # Keep the text following the SU: append it to the previous
                # sibling's tail, or to the parent's text if there is none.
                if su.tail and su.getprevious() is not None:
                    su.getprevious().tail = (su.getprevious().tail or '')
                    su.getprevious().tail += su.tail
                elif su.tail:
                    su.getparent().text = (su.getparent().text or '')
                    su.getparent().text += su.tail
                su.getparent().remove(su)

        paragraphs = [body_to_string(el) for el in paragraph_xmls]
        # Children inherit the last label of this title, if it has any.
        label_for_children = previous_label
        labels = parse_into_labels(title.text, cfr_part)
        if labels:
            label_for_children = labels[-1]

        # recursively build children. Be sure to give them the proper label
        children = build_section_by_section(sub_sections, page,
                                            label_for_children)

        next_structure = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnotes
        }

        if (labels and  # No label => subheader
                # Concatenate if repeat label or backtrack
                not all(label == previous_label or
                        is_backtrack(previous_label, label)
                        for label in labels)):
            previous_label = labels[-1]
            next_structure['labels'] = labels
        structures.append(next_structure)

    return structures
Ejemplo n.º 23
0
    def _le_xml(self, arquivo):
        """Load XML into ``self._xml`` from an element, XML text or a file path.

        Args:
            arquivo: ``None``, an object that ``etree.tounicode`` can
                serialise, a string containing XML (recognised by a ``<``)
                or the path of a UTF-8 encoded file.

        Returns:
            ``False`` when ``arquivo`` is ``None``, ``True`` once the XML
            has been parsed.
        """
        if arquivo is None:
            return False

        if not isinstance(arquivo, basestring):
            # Not text: serialise it so that one code path handles all input.
            arquivo = etree.tounicode(arquivo)

        if isinstance(arquivo, str):
            # Byte strings are decoded; calling encode() on them, as done
            # before, fails for any non-ASCII content.
            arquivo = arquivo.decode('utf-8')

        if '<' in arquivo:
            # The text itself is the document.
            self._xml = etree.fromstring(tira_abertura(arquivo).encode('utf-8'))
        else:
            # Otherwise the text names a file; the context manager closes it
            # even when reading fails.
            with open(arquivo) as arq:
                txt = arq.read()
            txt = tira_abertura(txt.decode('utf-8'))
            self._xml = etree.fromstring(txt)
        return True
Ejemplo n.º 24
0
 def write(self):
     """
         Print the converted rules, followed by a marker comment and the
         entries that could not be converted (one per line).
     """
     rendered_rules = etree.tounicode(self.new_rules, pretty_print=True)
     print(rendered_rules)
     print("<!-- Failed convert. Please, handle it manually-->\n")
     failed_lines = "\n".join(self.fails)
     print(failed_lines)
Ejemplo n.º 25
0
def check(proxy, timeout=None):
    """Check a proxy by fetching a geo-IP page through it.

    Args:
        proxy: proxy address as ``host:port``; used for both HTTP and HTTPS.
        timeout: timeout passed to ``requests.get``. The default ``None``
            keeps the previous behaviour of waiting without a limit.

    Returns:
        The markup of the page section describing the detected address
        (images and the ``hidden-xs hidden-sm`` class removed), or the
        message '当前代理已经失效' when the request fails or the section is
        missing.
    """
    # url = 'https://ip.cn/' 
    # url = 'https://httpbin.org/ip'
    url = 'https://geoiptool.com/zh/'
    proxies = {
        'http': 'http://{}'.format(proxy),
        'https': 'http://{}'.format(proxy)
    }
    try:
        res = requests.get(url, proxies=proxies, verify=False,
                           timeout=timeout).text
        data = etree.HTML(res)
    except Exception as e:
        print(e)
        return '当前代理已经失效'

    if url == 'https://ip.cn/':
        result = data.xpath('//div[@id="result"]')[0]
        content = html.tostring(result)
        return content

    # Compare with None explicitly: the truth value of an lxml element only
    # reflects whether it has children.
    if url == 'https://geoiptool.com/zh/' and data is not None:
        # Evaluate the XPath once and reuse the result.
        matches = data.xpath('//div[contains(@class, "sidebar-data")]')
        if not matches:
            return '当前代理已经失效'
        content = etree.tounicode(matches[0])
        content = re.sub(r'<img.*?>', '', content)
        content = re.sub(r'hidden-xs hidden-sm', '', content)
        return content

    return '当前代理已经失效'
Ejemplo n.º 26
0
def html_to_plaintext(text):
        """Try to get readable plaintext from the G+ html.

        Line breaks are replaced by spaces before parsing because the text
        serialisation of lxml drops them without leaving a separator. All
        spellings of the tag (``<br>``, ``<br/>``, ``<br />``, any case)
        are handled. Blank input yields an empty string.
        """
        import re

        text = re.sub(r'(?i)<br\s*/?>', ' ', text)
        if not text.strip():
            # No tree can be built from an empty document.
            return ''
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(text), parser)
        return etree.tounicode(tree.getroot(), method="text")
Ejemplo n.º 27
0
def test_parentheses_cleanup(original, new_text):
    """Assert that ``parentheses_cleanup`` turns ``<P>original</P>`` into
    ``<P>new_text</P>``."""
    with XMLBuilder("PART") as ctx:
        source_paragraph = u"<P>{0}</P>".format(original)
        ctx.child_from_string(source_paragraph)
    preprocessors.parentheses_cleanup(ctx.xml)
    actual = etree.tounicode(ctx.xml[0])
    expected = "<P>{0}</P>".format(new_text)
    assert actual == expected
Ejemplo n.º 28
0
    def as_stringio(self):
        """Return the ``ResultDocument`` as an in-memory text buffer.

        ``self._document`` is pretty-printed to a unicode string, which is
        then wrapped in a ``StringIO``.

        """
        return StringIO(etree.tounicode(self._document, pretty_print=True))
Ejemplo n.º 29
0
 def cross_references(self):
     """
     Return the list of CrossReference objects found in the definition.

     The list is built on first access and stored on ``self._xrefs``;
     later calls return the stored list. Each cross-reference receives a
     ``type`` attribute ('equals', 'see', 'cf', 'opposite' or None)
     chosen from the text that precedes it.
     """
     try:
         return self._xrefs
     except AttributeError:
         xref_nodes = self.node_stripped().findall('.//xr')
         self._xrefs = [CrossReference(node) for node in xref_nodes]
         if self._xrefs:
             # Every cross-reference starts out untyped.
             for xref in self._xrefs:
                 xref.type = None
             # Cut the serialized definition after each closing </xr> so
             # that section i ends with cross-reference i; the sections
             # then line up with self._xrefs.
             serialized = etree.tounicode(self.node_stripped())
             sections = [XREF_STRIPPER.edit(chunk.lower())
                         for chunk in serialized.split('</xr>')]
             # Checked in order; the first matching marker wins.
             marker_types = (
                 (('see <xr',), 'see'),
                 (('also <xr', 'cf. <xr'), 'cf'),
                 (('opp. <xr',), 'opposite'),
             )
             for section, xref in zip(sections, self._xrefs):
                 if EQUALS_XREF.search(section):
                     xref.type = 'equals'
                     continue
                 for markers, kind in marker_types:
                     if any(marker in section for marker in markers):
                         xref.type = kind
                         break
         return self._xrefs
Ejemplo n.º 30
0
 def tostring(self, root=None, doctype=None, pretty_print=True):
     """Serialize the XML document to a unicode string.

     ``root`` defaults to ``self.root``; a falsy ``doctype`` falls back to
     ``self.info.doctype``.
     """
     target = self.root if root is None else root
     effective_doctype = doctype or self.info.doctype
     return etree.tounicode(
         target, doctype=effective_doctype, pretty_print=pretty_print
     )
Ejemplo n.º 31
0
    def parse_nodes(self, xml):
        """Derive a flat list of nodes from this xml chunk. This does nothing
        to determine node depth.

        Each child of ``xml`` is handed to the first entry of
        ``self.MATCHERS`` whose ``matches`` accepts it; children that no
        matcher accepts are logged and skipped.
        """
        nodes = []

        # Iterate the element directly; lxml documents ``getchildren()`` as
        # deprecated in favour of plain iteration.
        for child in xml:
            tag_matcher = next(
                (m for m in self.MATCHERS if m.matches(child)), None)
            if tag_matcher:
                nodes.extend(tag_matcher.derive_nodes(child, processor=self))
            else:
                logger.warning("No tag match\n%s", etree.tounicode(child))

        # Trailing stars don't matter; slightly more efficient to ignore them.
        # Pop in place instead of re-slicing the list on every iteration.
        while nodes and nodes[-1].label[0] in mtypes.stars:
            nodes.pop()

        return nodes
Ejemplo n.º 32
0
def get_mo_fail_response(text, ack, status_code):
    """
    Build the XML returned in case of error (status_code <> 200).

    :param text: error description
    :param ack: message's arrival acknowledgement
    :param status_code: request status code
    :return: xml
    """
    body = etree.Element('smsmo_response', ack=str(ack))

    # These children are emitted empty, in this order.
    for tag in ("message_id", "source", "large_account", "request_datetime"):
        etree.SubElement(body, tag)

    description = etree.SubElement(body, "description", code=str(status_code))
    description.text = str(text)

    return etree.tounicode(body)
Ejemplo n.º 33
0
 def build_yang_response(self,
                         root,
                         request,
                         yang_options=None,
                         custom_rpc=False):
     """Convert ``root`` to YANG XML and build the XML response from it.

     On any exception the failure is logged, ``self.rpc_response`` is
     flagged as an error carrying ``ncerror.BadMsg(request)``, and None is
     returned.
     """
     try:
         self.custom_rpc = custom_rpc
         yang_xml = self.to_yang_xml(root, request, yang_options,
                                     custom_rpc)
         pretty_yang = etree.tounicode(yang_xml, pretty_print=True)
         log.info('yang-xml', yang_xml=pretty_yang)
         return self.build_xml_response(request, yang_xml, custom_rpc)
     except Exception:
         log.exception('error-building-yang-response',
                       request=request,
                       xml=etree.tostring(root))
         self.rpc_response.is_error = True
         self.rpc_response.node = ncerror.BadMsg(request)
         return None
Ejemplo n.º 34
0
def xls2xml(xls_name):
    """Convert the first sheet of an Excel workbook into ``student1.xml``.

    Every row becomes one table entry: the key is the first cell converted
    with ``int``, the value is ``str`` of the remaining cells. The table is
    embedded as indented JSON text inside a ``<students>`` element, followed
    by an explanatory XML comment.

    :param xls_name: path of the workbook to read.
    """
    import io  # local import: explicit-encoding open on Python 2 and 3

    with xlrd.open_workbook(xls_name) as wb:
        ws = wb.sheet_by_index(0)
    table = OrderedDict()
    for i in range(ws.nrows):
        row = ws.row_values(i)  # read each row once
        table[int(row[0])] = str(row[1:])

    root = etree.Element("root")
    e_students = etree.SubElement(root, 'students')
    e_students.text = '\n' + str(
        json.dumps(table, indent=4, ensure_ascii=False)) + '\n'
    e_students.append(
        etree.Comment('\n    学生信息表\n    "id" : [名字,数学,语文,英语]\n'))

    # The XML declaration promises UTF-8, so the file must be written with
    # that encoding rather than the platform default.
    with io.open("student1.xml", 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>' +
                etree.tounicode(root))
Ejemplo n.º 35
0
def test_parse_colletion_metdata_without_print_style(tmpdir,
                                                     litezip_valid_litezip):
    """Parsing collection metadata succeeds when print-style is absent."""
    working_dir = tmpdir.mkdir('col')
    collection_file = working_dir.join('collection.xml')
    # Write a copy of collection.xml with the print-style param removed.
    source_path = litezip_valid_litezip / 'collection.xml'
    with source_path.open() as origin:
        xml = etree.parse(origin)
        print_style_params = xml.xpath('//col:param[@name="print-style"]',
                                       namespaces=COLLECTION_NSMAP)
        elm = print_style_params[0]
        elm.getparent().remove(elm)
        serialized = etree.tounicode(xml).encode('utf8')
        collection_file.write(serialized)
    assert 'print-style' not in collection_file.read()

    # The parser must not error when a print-style is missing:
    # build a Collection object from the directory,
    model = parse_collection(Path(working_dir))
    # then parse its metadata into a CollectionMetadata.
    md = parse_collection_metadata(model)
    assert md.print_style is None
Ejemplo n.º 36
0
    def to_svg(self):
        '''
        Serialize the SVG file with the modified channel lists applied.

        Returns:

            unicode : SVG XML source with up-to-date electrode channel lists.
        '''
        xml_root = etree.parse(self.svg_filepath)
        evaluator = XPathEvaluator(xml_root, namespaces=INKSCAPE_NSMAP)

        # Only electrodes whose channel list changed are rewritten; each row
        # unpacks to the original and the new channel list.
        changed = self.diff_electrode_channels()
        for electrode_id, (orig_i, new_i) in changed.iterrows():
            channels = ','.join(map(str, new_i))
            query = '//svg:path[@id="%s"]' % electrode_id
            # Update every `svg:path` element carrying this electrode id.
            for path_element in evaluator.evaluate(query):
                path_element.attrib['data-channels'] = channels
        return etree.tounicode(xml_root)
Ejemplo n.º 37
0
def test_mets_dnx():
    """Build a METS DNX from the test_batch_1 fixture and print it."""
    here = os.path.dirname(os.path.realpath(__file__))
    batch_dir = os.path.join(here, 'data', 'test_batch_1')
    characteristics = [{
        'submissionReason': 'bornDigitalContent',
        'IEEntityType': 'periodicIE'
    }]
    mets = mdf.build_mets(
        ie_dmd_dict={"dc:title": "test title"},
        pres_master_dir=os.path.join(batch_dir, 'pm'),
        modified_master_dir=os.path.join(batch_dir, 'mm'),
        input_dir=batch_dir,
        generalIECharacteristics=characteristics,
    )
    print(ET.tounicode(mets, pretty_print=True))
Ejemplo n.º 38
0
def xls_xml(file_name):
    """Export the 'city' sheet of an Excel workbook to ``city.xml``.

    Each row is stored in a dict: the key is the first cell converted with
    ``int`` then ``str``, the value is ``str`` of the remaining cells. The
    dict's string form becomes the text of a ``<citys>`` element.

    :param file_name: path of the workbook to read.
    """
    # Read the rows from the xls file.
    excel = xlrd.open_workbook(file_name)
    table = excel.sheet_by_name('city')
    data = {}
    for i in range(table.nrows):
        row = table.row_values(i)  # read each row once
        data[str(int(row[0]))] = str(row[1:])

    # Build the XML document.
    root = etree.Element('root')
    citys = etree.SubElement(root, 'citys')
    citys.append(etree.Comment('城市信息'))
    citys.text = str(data)

    # Write to the xml file; the context manager closes it even when
    # serialization or writing raises.
    with codecs.open('city.xml', 'w', 'utf-8') as output:
        output.write(etree.tounicode(root))
Ejemplo n.º 39
0
    def get_body(self, configs, channel_id, msisdn, interface,
                 subscription_type):
        """
        Build the XML request body.

        :param configs: Cache object application.settings['config']
        :param channel_id: Channel id number
        :param msisdn: Telephone number
        :param interface: Interface number
        :param subscription_type: Subscription type number
        :return: xml
        """
        request = etree.Element(
            'tangram_request',
            company_id=str(configs['company_id']),
            service_id=str(configs['service_id']),
            user=str(configs['user']))
        # The interface attribute is only emitted for a truthy value.
        if interface:
            request.set('interface', str(interface))

        provisioning = etree.SubElement(request, "provisioning")

        operation_code = str(configs['operation_code'])
        operation = etree.SubElement(
            provisioning, "operation", code=operation_code)
        operation.text = configs['operation_description']

        etree.SubElement(provisioning, "channel_id").text = str(channel_id)

        destination = etree.SubElement(provisioning, "destination")
        destination.text = str(msisdn)
        # Likewise, subscription_type is only emitted for a truthy value.
        if subscription_type:
            destination.set('subscription_type', str(subscription_type))

        notification_type = str(configs['notification_type'])
        notification_calltype = str(configs['notification_calltype'])
        notification = etree.SubElement(
            provisioning,
            "notification",
            type=notification_type,
            calltype=notification_calltype)
        notification.text = configs['notification_callback']

        # Timestamp in whole seconds since the epoch.
        timestamp = etree.SubElement(provisioning, "request_datetime")
        timestamp.text = str(int(time.time()))

        return etree.tounicode(request)
Ejemplo n.º 40
0
def xls2xml(filename, outfile):
    """Convert the first sheet of an Excel workbook into an XML file.

    Each row is stored in an ordered table: the key is the first cell
    converted with ``int`` then ``str``, the value is ``str`` of the
    remaining cells. The table is embedded as indented JSON text inside a
    ``<students>`` element that also carries an explanatory comment.

    :param filename: path of the workbook to read.
    :param outfile: path of the UTF-8 XML file to write.
    """
    with xlrd.open_workbook(filename) as excel:
        # Alternative: table = excel.sheet_by_name('student')
        table = excel.sheet_by_index(0)

    data = OrderedDict()
    for i in range(table.nrows):
        row = table.row_values(i)  # read each row once
        data[str(int(row[0]))] = str(row[1:])

    root = etree.Element('root')
    students = etree.SubElement(root, 'students')
    students.append(etree.Comment('\n\t学生信息表\n\t"d" :[名字, 数学, 语文, 英语]\n'))
    # The original assigned the comment text to students.text first and
    # immediately overwrote it; only the JSON payload is ever written.
    students.text = '\n'+str(json.dumps(data, indent=4, ensure_ascii=False))+'\n'

    # The context manager closes the file even when writing raises.
    with codecs.open(outfile, 'w', 'utf-8') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n' + etree.tounicode(root))
    def __add_certainty(self, text, certainty):
        """Append ``certainty`` to the uncertainty classCode of ``text``.

        ``text`` is parsed as XML. When the teiHeader has no matching
        classCode, ``__create_annotation_list`` is called on the tree and
        the lookup is repeated. Returns the serialized tree.
        """
        class_code_query = (
            '//default:teiHeader'
            '//default:classCode[@scheme="http://providedh.eu/uncertainty/ns/1.0"]'
        )
        tree = etree.fromstring(text)

        certainties = tree.xpath(class_code_query, namespaces=NAMESPACES)
        if not certainties:
            tree = self.__create_annotation_list(tree)
            certainties = tree.xpath(class_code_query, namespaces=NAMESPACES)

        certainties[0].append(certainty)

        return etree.tounicode(tree)
    def __add_annotator(self, text, annotator):
        """Append ``annotator`` to the annotators listPerson of ``text``.

        ``text`` is parsed as XML. When the teiHeader has no matching
        listPerson, ``__create_list_person`` is called on the tree and the
        lookup is repeated. Returns the serialized tree.
        """
        list_person_query = (
            '//default:teiHeader'
            '//default:listPerson[@type="PROVIDEDH Annotators"]'
        )
        tree = etree.fromstring(text)

        list_person = tree.xpath(list_person_query, namespaces=NAMESPACES)
        if not list_person:
            tree = self.__create_list_person(tree)
            list_person = tree.xpath(list_person_query, namespaces=NAMESPACES)

        list_person[0].append(annotator)

        return etree.tounicode(tree)
Ejemplo n.º 43
0
 def send_notification(self, data, *params):
     """Wrap ``data`` in a notification element and send it to all sessions.

     The message gets an ``eventTime`` leaf with the current time, is
     serialized once, and is sent to every session of every server socket,
     skipping sockets whose ``running`` is False and sessions whose
     ``session_open`` is False.
     """
     msg = etree.Element("{{{}}}notification".format(NSMAP['ncEvent']))
     timestamp = date_time_string(datetime.datetime.now())
     msg.append(util.leaf_elm('eventTime', timestamp))
     msg.append(data)
     msg_unicode = etree.tounicode(msg, pretty_print=True)
     logger.debug("notification msg is:\n%s", str(msg_unicode))
     for socket in self.server.sockets:
         if socket.running is not False:
             for session in socket.sessions:
                 if session.session_open is not False:
                     logger.debug(
                         "Sending to client, session id: %d, ip:%s, port:%d",
                         session.session_id, socket.client_addr[0],
                         socket.client_addr[1])
                     session.send_message(msg_unicode)
     return
def xls_xml(file_name):
	"""Export the 'student' sheet of an Excel workbook to ``students.xml``.

	Each row is stored in a dict: the key is the first cell converted with
	``int`` then ``str``, the value is ``str`` of the remaining cells. The
	dict's string form becomes the text of a ``<students>`` element.

	:param file_name: path of the workbook to read.
	"""
	excel = xlrd.open_workbook(file_name)
	table = excel.sheet_by_name('student')
	data = {}
	for i in range(table.nrows):
		row = table.row_values(i)  # read each row once
		data[str(int(row[0]))] = str(row[1:])

	root = etree.Element('root')
	students = etree.SubElement(root, 'students')
	students.append(etree.Comment('学生信息表\n\"id\": [名字,数学,语文,英语]'))
	students.text = str(data)

	# The context manager closes the file even when writing raises.
	with codecs.open('students.xml','w','utf-8') as output:
		output.write(etree.tounicode(root))
Ejemplo n.º 45
0
 def getProvince(self):
     """Collect the URL of every province.

     The collected hrefs are also appended to ``self.hrefs``.

     :return: [[href, province_detail_name], [...], ...]
     """
     url = 'http://www.cc10000.cn/0/'
     options = {
         'method': 'get',
         'url': url,
         'headers': self.headers,
         'timeout': _time_out
     }
     response = Request.basic(options, resend_times=4)
     selector = etree.HTML(response.text)
     content = etree.tounicode(selector.xpath('//body/div[6]')[0])
     # Raw string: "\d" in a plain literal is an invalid escape sequence.
     href_and_name = re.findall(r'href="(/\d.*?)">(.*?)<', content)
     # Keep only provinces, replacing the short province name with the
     # detailed one found at the same index.
     seq = [
         [href, config.ROOT_DETAIL_NAMES[config.ROOT_SHORT_NAMES.index(name)]]
         for href, name in href_and_name
         if name in config.ROOT_SHORT_NAMES
     ]
     self.hrefs.extend([entry[0] for entry in seq])
     return seq
Ejemplo n.º 46
0
def replace_id_and_version(model, id, version):
    """Rewrite the given model's file in place with a new id and version.

    :param model: module
    :type model: :class:`litezip.Collection` or :class:`litezip.Module`
    :param id: id
    :type id: str
    :param version: major and minor version tuple
    :type version: tuple of int

    """
    # Load the current content.
    with model.file.open('rb') as source:
        xml = etree.parse(source)

    # Overwrite the first content-id and version elements.
    content_id_elm = xml.xpath('//md:content-id',
                               namespaces=COLLECTION_NSMAP)[0]
    content_id_elm.text = id
    version_elm = xml.xpath('//md:version', namespaces=COLLECTION_NSMAP)[0]
    version_elm.text = convert_version_to_legacy_version(version)

    # Write the modified document back as UTF-8 bytes.
    with model.file.open('wb') as target:
        target.write(etree.tounicode(xml).encode('utf8'))
Ejemplo n.º 47
0
    def log(self, node, action, description):
        """Print a short log entry for the given node and event.

        Nothing happens unless ``self._active`` is truthy. The entry shows
        the first 8 hex digits of the MD5 of the serialized node, the
        description, and the first 202 characters of the node with
        newlines removed.

        """
        if not self._active:
            return

        content = tounicode(node)
        hashed = md5()
        try:
            hashed.update(content.encode('utf-8', errors="replace"))
        except Exception as exc:
            LOG.error("Cannot hash the current node." + str(exc))
        hash_id = hashed.hexdigest()[0:8]
        snippet = content.replace("\n", "")[0:202]
        print("{0} :: {1}\n{2}".format(hash_id, description, snippet))
Ejemplo n.º 48
0
def get_clinical_document(access_token, hie_profile):
    """Get a member's clinical data from HIXNY (CDA XML), convert it to FHIR
    (JSON), and return both.

    :param access_token: bearer token sent in the Authorization header.
    :param hie_profile: object providing ``mrn`` and ``data_requestor``.
    :return: dict with ``cda_content`` and ``fhir_content``; both are None
        when the response has no ClinicalDocument child.
    """
    from xml.sax.saxutils import escape

    # Escape the interpolated values so that '&', '<' or '>' in them cannot
    # produce a malformed (or altered) request document.
    request_xml = """
        <GETDOCUMENTPAYLOAD>
            <MRN>%s</MRN>
            <DATAREQUESTOR>%s</DATAREQUESTOR>
        </GETDOCUMENTPAYLOAD>
        """ % (
        escape("%s" % hie_profile.mrn),
        escape("%s" % hie_profile.data_requestor),
    )
    # NOTE(review): this prints the member's MRN to stdout — confirm that is
    # acceptable outside of debugging.
    print(request_xml)

    # NOTE(review): verify=False disables TLS certificate validation for
    # this request; left unchanged here, confirm it is intentional.
    response = requests.post(
        settings.HIE_GETDOCUMENT_API_URI,
        verify=False,
        headers={
            'Content-Type': 'application/xml',
            'Authorization': "Bearer %s" % (access_token)
        },
        data=request_xml,
    )
    response_xml = etree.XML(response.content)
    print(response_xml)

    cda_element = response_xml.find("{%(hl7)s}ClinicalDocument" % NAMESPACES)
    if cda_element is None:
        return {
            'cda_content': None,
            'fhir_content': None,
        }

    cda_content = etree.tounicode(cda_element)
    fhir_content = cda2fhir(cda_content).decode('utf-8')
    return {
        'cda_content': cda_content,
        'fhir_content': fhir_content,
    }
Ejemplo n.º 49
0
def ocrdata():
    """Import OCR line texts from the posted JSON into the stored pages.

    The request body is JSON, gzip-compressed when the Content-Encoding
    header is "gzip". ``data["ocrdata"]`` maps book name -> page name ->
    line id -> text. For each known line the ``Unicode`` element of the
    ``TextEquiv`` with index ``data["index"]`` is created if missing and
    set to the text. Unknown line ids are skipped.

    :return: a message with the number of imported lines.
    """
    if request.headers.get("Content-Encoding") == "gzip":
        payload = gzip.decompress(request.data).decode("utf-8")
        data = json.loads(payload)
    else:
        data = request.get_json()

    imported = 0
    for book_name, pages in data["ocrdata"].items():
        book = Book.query.filter_by(name=book_name).one()
        for page_name, lines in pages.items():
            page = Page.query.filter_by(book_id=book.id, name=page_name).one()
            root = etree.fromstring(page.data)
            # Address elements through the document's default namespace.
            ns = {"ns": root.nsmap[None]}
            for lid, text in lines.items():
                linexml = root.find('.//ns:TextLine[@id="' + lid + '"]',
                                    namespaces=ns)
                if linexml is None:
                    continue
                # Find or create the TextEquiv with the requested index.
                textequivxml = linexml.find(
                    './ns:TextEquiv[@index="{}"]'.format(data["index"]),
                    namespaces=ns)
                if textequivxml is None:
                    textequivxml = etree.SubElement(
                        linexml,
                        "{{{}}}TextEquiv".format(ns["ns"]),
                        attrib={"index": str(data["index"])})
                # Find or create its Unicode child, then store the text.
                unicodexml = textequivxml.find('./ns:Unicode', namespaces=ns)
                if unicodexml is None:
                    unicodexml = etree.SubElement(
                        textequivxml, "{{{}}}Unicode".format(ns["ns"]))
                unicodexml.text = text
                imported += 1
            # Count the lines that have a TextEquiv with index > 0.
            ocr_line_count = root.xpath(
                'count(//ns:TextLine'
                '[count(./ns:TextEquiv'
                '[@index>0])>0])',
                namespaces=ns)
            page.no_lines_ocr = int(ocr_line_count)
            page.data = etree.tounicode(root.getroottree())
    db_session.commit()
    return "Imported {} lines.".format(imported)
Ejemplo n.º 50
0
def storeBatches(collectedRecords, firstRecord):
    global config

    if len(collectedRecords) > 0:
        if 'xml-batch' in config.format:
            XMLContainer = ET.XML('<records/>')
            for (ID, record) in collectedRecords.iteritems():
                XMLContainer.append(record)
            filePath = pathForBatch(firstRecord, 'xml')
            XMLFile = open(filePath, 'w')
            XMLString = ET.tounicode(XMLContainer).encode('UTF-8')
            XMLFile.write(XMLString)
            XMLFile.close()
            print u"XML-Batch: " + str(
                len(collectedRecords)) + u" records to »" + filePath + u"«"

        if 'json-batch' in config.format or 'couchdb-batch' in config.format:
            JSONContainer = []
            for (ID, record) in collectedRecords.iteritems():
                JSONInternal = elem_to_internal(record, strip=1)
                if len(JSONInternal) == 1:
                    JSONInternal = JSONInternal.values()[0]
                JSONInternal['_id'] = ID
                JSONContainer += [JSONInternal]

            if 'json-batch' in config.format:
                filePath = pathForBatch(firstRecord, 'json')
                JSONFile = open(filePath, "w")
                JSONFile.write(simplejson.dumps(JSONContainer))
                JSONFile.close()
                print u"JSON-Batch: " + str(
                    len(collectedRecords)) + u" records to »" + filePath + u"«"

            if 'couchdb-batch' in config.format:
                filePath = pathForBatch(firstRecord, 'couch.json')
                JSONContainer = {'docs': JSONContainer}
                JSONFile = open(filePath, "w")
                JSONFile.write(simplejson.dumps(JSONContainer))
                JSONFile.close()
                print u"CouchDB JSON-Batch: " + str(
                    len(collectedRecords)) + u" records to »" + filePath + u"«"
Ejemplo n.º 51
0
    def detail_page(self, response):
        """Extract announcement fields from a detail page response.

        The visible part initializes the result fields, then reads the
        ``sswy_article_m`` div: its serialized HTML, its whitespace-free
        text, the last date-like match in that text, and the party text
        that follows a case number or the word '公告'.

        NOTE(review): this snippet is truncated. The ``try`` block below has
        no visible ``except``/``finally`` and nothing is returned in the
        visible part.
        """
        # Initialize the fields
        _id = ''
        _id_ = ''
        ann_type = '送达公告'
        announcer = '普洱市中级人民法院'
        defendant = ''
        defendant_origin = ''
        ann_date = ''
        ann_content = ''
        ann_html = ''
        content_url = response.url
        pdf_url = ''
        case_no = ''
        source = '普洱市中级人民法院'

        try:
            html = self.xml_xpath(response, 0)
            content_text = html.xpath('//div[@class="sswy_article_m"]//text()')

            # Raw HTML of the article container.
            ann_html = etree.tounicode(html.xpath('//div[@class="sswy_article_m"]')[0])
            # print(ann_html)
            # Join the text nodes and strip non-breaking spaces, CRLFs and
            # all remaining whitespace.
            content_p = ''.join(content_text).replace('\xa0', '')
            content_p = content_p.replace('\r\n', '')
            ann_content = ''.join(content_p.split())

            # Take the last date-like match in the text.
            ann_date = re.findall(r'(.{4}[年].{1,2}[月].{1,3}[日号])', ann_content)[-1]

            ann_date = self.parse_time(ann_date)

            # Keep only the text before the first ':' (after replacing
            # '本院' with ':'), then re-append the ':' terminator.
            text = ann_content.replace('本院', ':')
            text = text.split(':')[0] + ":"
            case = re.findall(r'((.{4}).*?)号', text)
            print(len(case))
            print(text)
            if len(case) > 0:
                print("*******")
                # Text between '号' and ':' when a case number was found.
                defendant_origin_list = re.findall(r'号(.*?):', text)[0]
                # print(defendant_origin_list)
            elif '公告' in text:
                # Otherwise the text between '公告' and ':'.
                defendant_origin_list = re.findall(r'公告(.*?):', text)[0]
Ejemplo n.º 52
0
def extractExamples(directory):
    """Collect and render the documented examples of the RML files in
    ``directory``.

    Every ``.rml`` file is parsed, and each element that carries a
    ``doc:example`` attribute is recorded under its demo tag (the value of
    ``EXAMPLE_ATTR_NAME`` or, if empty, the element tag). Each record gets a
    rendered ``code`` entry.

    :param directory: directory scanned (non-recursively) for ``.rml`` files.
    :return: dict mapping demo tag -> list of example dicts.
    """
    examples = {}
    for filename in os.listdir(directory):
        if not filename.endswith('.rml'):
            continue
        # The context manager closes the file even when parsing raises.
        with open(os.path.join(directory, filename), 'rb') as rmlFile:
            root = etree.parse(rmlFile).getroot()
        elements = root.xpath('//@doc:example/parent::*',
                              namespaces={'doc': EXAMPLE_NS})
        # Phase 1: Collect all elements of this file
        fileExamples = []
        for elem in elements:
            demoTag = elem.get(EXAMPLE_ATTR_NAME) or elem.tag
            example = {
                'filename': filename,
                'line': elem.sourceline,
                'element': elem,
                'rmlurl': INPUT_URL % filename,
                'pdfurl': EXPECTED_URL % (filename[:-4] + '.pdf')
            }
            examples.setdefault(demoTag, []).append(example)
            fileExamples.append(example)
        # Phase 2: Render the elements of this file. Examples from earlier
        # files were already rendered in their own iteration, so they are
        # not rendered again.
        removeDocAttributes(root)
        for example in fileExamples:
            xml = etree.tounicode(example['element']).strip()
            xml = re.sub(
                ' ?xmlns:doc="http://namespaces.zope.org/rml/doc"', '',
                xml)
            xml = dedent(xml)
            xml = enforceColumns(xml, 80)
            xml = highlightRML(xml)
            example['code'] = xml

    return examples
Ejemplo n.º 53
0
def test_make_instructions():
    """``make_instructions`` turns the token stream into the expected
    EREGS_INSTRUCTIONS XML."""
    put = tokens.Verb(tokens.Verb.PUT, active=True)
    delete = tokens.Verb(tokens.Verb.DELETE, active=True)
    move = tokens.Verb(tokens.Verb.MOVE, active=True)

    def paragraph(part):
        return tokens.Paragraph.make(part=part)

    tokenized = [
        paragraph('111'),
        put,
        paragraph('222'),
        paragraph('333'),
        paragraph('444'),
        delete,
        paragraph('555'),
        move,
        paragraph('666'),
        paragraph('777'),
    ]
    with XMLBuilder("EREGS_INSTRUCTIONS") as ctx:
        for label in (222, 333, 444):
            ctx.PUT(label=label)
        ctx.DELETE(label=555)
        ctx.MOVE(label=666, destination=777)
    expected = ctx.xml_str
    actual = etree.tounicode(amdparser.make_instructions(tokenized))
    assert expected == actual
Ejemplo n.º 54
0
def test_digtial_original_dnx_single_file():
    """For a single-file METS, a boolean ``digital_original`` input must be
    written as the lower-case string 'true' or 'false' in the
    DigitalOriginal key."""
    here = os.path.dirname(os.path.realpath(__file__))
    master_path = os.path.join(
        here, 'data', 'test_batch_1', 'pm', 'presmaster.jpg')
    characteristics = [{
        'submissionReason': 'bornDigitalContent',
        'IEEntityType': 'periodicIE'
    }]
    mets = mdf.build_single_file_mets(
        ie_dmd_dict={"dc:title": "test title"},
        filepath=master_path,
        generalIECharacteristics=characteristics,
        digital_original=True)
    # First generalRepCharacteristics section, then its DigitalOriginal key.
    grc = mets.findall('.//section[@id="generalRepCharacteristics"]')[0]
    do = grc.findall('.//key[@id="DigitalOriginal"]')[0]
    assert (do.text == 'true')
    print(ET.tounicode(mets, pretty_print=True))
Ejemplo n.º 55
0
 def parse_by_br(self, response):
     """Split the selected elements of the response on their line breaks.

     Each element matched by a query in ``self.querySelectorList_br`` is
     serialized and split on ``<br>`` (or, failing that, ``<br/>``). Every
     piece is stripped, has newlines and remaining tags removed, and is
     added to the returned list.
     """
     html = etree.HTML(text=response.text)
     pieces = []
     for query in self.querySelectorList_br:
         for element in html.xpath(query):
             markup = etree.tounicode(element)
             # The first break marker present decides how to split.
             for marker in ('<br>', '<br/>'):
                 if marker in markup:
                     fragments = markup.split(marker)
                     break
             else:
                 print('text_l.split by br error, maybe not found br')
                 fragments = []
             for fragment in fragments:
                 cleaned = fragment.strip().replace('\n', '').replace('\r', '')
                 pieces.append(re.sub('<.*?>', '', cleaned))
     return pieces
Ejemplo n.º 56
0
def _ogc_filter_to_expression(prop):
    if 'And' in prop.tag:
        return ' and '.join(map(_ogc_filter_to_expression,
                                prop.iterchildren()))
    elif 'Or' in prop.tag:
        return ' or '.join(map(_ogc_filter_to_expression, prop.iterchildren()))
    elif 'PropertyIsGreaterThan' in prop.tag:
        return _compile_bin_op('>', prop.iterchildren())
    elif 'PropertyIsLessThan' in prop.tag:
        return _compile_bin_op('<', prop.iterchildren())
    elif 'PropertyIsEqualTo' in prop.tag:
        return _compile_bin_op('=', prop.iterchildren())
    elif 'PropertyIsNotEqualTo' in prop.tag:
        return _compile_bin_op('!=', prop.iterchildren())
    elif 'PropertyIsBetween' in prop.tag:
        name = prop.PropertyName
        cql_lo = _compile_bin_op('>', [name, prop.LowerBoundary.Literal])
        cql_hi = _compile_bin_op('<', [name, prop.UpperBoundary.Literal])
        return cql_lo + 'and ' + cql_hi

    raise AssertionError(etree.tounicode(prop, pretty_print=True))
Ejemplo n.º 57
0
 def send_rpc_reply(self, rpc_reply, origmsg):
     """Wrap ``rpc_reply`` in an ``rpc-reply`` element and send it.

     The wrapper copies the attributes and namespace map of ``origmsg``.
     ``rpc_reply`` is appended as a single child; if that raises
     AttributeError, its items are added with ``extend`` instead. Progress
     markers are printed along the way.
     """
     reply_tag = qmap('nc') + "rpc-reply"
     reply = etree.Element(reply_tag,
                           attrib=origmsg.attrib,
                           nsmap=origmsg.nsmap)
     print('step 10')
     print(etree.tostring(reply, pretty_print=True))
     try:
         reply.append(rpc_reply)
         print('step 11')
         print(reply)
     except AttributeError:
         # Fallback: treat rpc_reply as a sequence of elements.
         reply.extend(rpc_reply)
         print('stpe 12')
         print(reply)
     serialized = etree.tounicode(reply, pretty_print=True)
     if self.debug:
         logger.debug("%s: Sending RPC-Reply: %s", str(self), str(serialized))
     print('step 13')
     print(serialized)
     self.send_message(serialized)
Ejemplo n.º 58
0
def xslt(request):
    """Render the XSLT administration page.

    On a POST with a valid form, the selected transformer is applied to the
    submitted XML and both the plain and the pretty-printed results are
    passed to the template; otherwise both results are empty strings.
    """
    transform_result = ''
    transform_result_pretty = ''
    if request.method != 'POST':
        form = forms.TransformForm()
    else:
        form = forms.TransformForm(request.POST)
        if form.is_valid():
            transformer = transformers.get(form.cleaned_data['transformer'])
            xml_bytes = form.cleaned_data['xml'].encode('utf-8')
            record_tree = etree.fromstring(xml_bytes)
            transformed = transformer(record_tree, abstract='0')
            transform_result = unicode(transformed)
            transform_result_pretty = etree.tounicode(transformed,
                                                      pretty_print=True)
    context = {
        'form': form,
        'transform_result': transform_result,
        'transform_result_pretty': transform_result_pretty
    }
    return render(
        request, 'transformers_pool/administration/xslt.html', context)
Ejemplo n.º 59
0
            def write_to_file(verses, f):
                """Write the cleaned text of each verse to ``f``, one per line.

                The verses come from ``get_text_format(verses[0])``. Notes
                and ``add`` nodes are removed from each verse before its
                text is extracted. Non-blank verses are also printed in
                double quotes and get no trailing newline when they are the
                last verse. A verse whose text is blank is written as an
                empty line.
                """
                if len(verses) == 0:
                    return

                tf = get_text_format(verses[0])
                last_index = len(tf["verses"]) - 1

                for i, v in enumerate(tf["verses"]):
                    # Drop the notes on the serialized verse, then re-parse.
                    without_notes = get_rid_of_notes(ET.tounicode(v))
                    parsed = ET.fromstring(without_notes)
                    without_add = remove_nodes(parsed, "add", NS_TI["ti"])

                    verse = clean_entities(stringify_children(without_add))
                    if len(verse.strip()) == 0:
                        f.write("\n")
                        continue

                    print('"'+verse+'"')
                    eol = "" if i == last_index else "\n"
                    f.write("{verse}{eol}".format(verse=verse, eol=eol))
Ejemplo n.º 60
0
    def dump_to_xml(self, tree_name='Item'):
        """Replace this object's entry in the XML file at ``file_path``.

        Existing ``tree_name`` elements whose ``login`` attribute equals
        ``self.login`` are removed. A new element is then appended with one
        child per instance attribute, and the file is rewritten
        pretty-printed as UTF-8.
        """
        tree = etree.parse(file_path)
        root = tree.getroot()

        # Remove the stale entries for this login.
        stale_query = "//%s[@login=\'%s\']" % (tree_name, self.login)
        for stale in root.xpath(stale_query):
            stale.getparent().remove(stale)

        # NOTE(review): the new entry is always tagged "Item", regardless of
        # tree_name — confirm this is intended.
        item_branch = etree.Element("Item", login=self.login)
        for attr_name in self.__dict__:
            child = etree.SubElement(item_branch, attr_name)
            child.text = unicode(self.__dict__[attr_name])
        root.append(item_branch)

        # Serialize and re-parse the tree before writing it back.
        xml = etree.tounicode(root, pretty_print=True)
        reparsed = etree.fromstring(xml)
        etree.ElementTree(reparsed).write(
            file_path, pretty_print=True, encoding="UTF-8")