def _parse(self, source, encoding):
    """Turn `source` into a list of template events.

    `source` is wrapped in an `XMLParser` unless it already is a `Stream`.
    Text events are passed through `interpolate`, ``<?python ?>``
    processing instructions are compiled into `EXEC` events, and comments
    whose text starts with ``!`` are dropped.

    Raises `TemplateSyntaxError` when a Python code block is found while
    `self.allow_exec` is false, or when such a block fails to compile.

    NOTE(review): the visible snippet ends after the COMMENT branch; no
    fall-through branch for other event kinds and no ``return`` are shown
    here -- confirm against the full definition.
    """
    if not isinstance(source, Stream):
        source = XMLParser(source, filename=self.filename, encoding=encoding)
    stream = []
    for kind, data, pos in source:
        if kind is TEXT:
            # One text event may expand into several events; the loop
            # variables are deliberately rebound to the expanded events.
            for kind, data, pos in interpolate(data, self.filepath, pos[1],
                                               pos[2], lookup=self.lookup):
                stream.append((kind, data, pos))
        elif kind is PI and data[0] == 'python':
            if not self.allow_exec:
                raise TemplateSyntaxError('Python code blocks not allowed',
                                          self.filepath, *pos[1:])
            try:
                suite = Suite(data[1], self.filepath, pos[1],
                              lookup=self.lookup)
            except SyntaxError, err:
                # Offset the error position by the position of the
                # processing instruction (pos[1] and pos[2]).
                raise TemplateSyntaxError(err, self.filepath,
                                          pos[1] + (err.lineno or 1) - 1,
                                          pos[2] + (err.offset or 0))
            stream.append((EXEC, suite, pos))
        elif kind is COMMENT:
            # Comments starting with "!" (after leading whitespace) are
            # not added to the stream.
            if not data.lstrip().startswith('!'):
                stream.append((kind, data, pos))
def test_text_node_pos_single_line(self):
    """The text node of a one-line element carries position (None, 1, 6)."""
    parsed = list(XMLParser(StringIO('<elem>foo bar</elem>')))
    # Event 0 is the start tag; event 1 is the text node under test.
    event_kind, payload, position = parsed[1]
    self.assertEqual(Stream.TEXT, event_kind)
    self.assertEqual('foo bar', payload)
    self.assertEqual((None, 1, 6), position)
def test_undefined_entity_with_dtd(self):
    """An undefined entity raises ParseError even when a DOCTYPE is given."""
    markup = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> '
        '<html>&junk;</html> '
    )
    parser = XMLParser(StringIO(markup))
    # The error is expected while the events are consumed, not when the
    # parser object is created.
    self.assertRaises(ParseError, list, parser)
def subtemplates(self, tpl=None):
    """Collect the templates that `tpl` includes through XInclude.

    The file at ``tpl.path`` is scanned for ``include`` start tags in the
    XInclude namespace; each ``href`` is resolved with ``self.loader``.
    The includes of every template found are then gathered recursively.

    :param tpl: template to inspect; defaults to `self`
    :return: list of loaded templates, direct includes first
    """
    if tpl is None:
        tpl = self

    found = []
    source = open(tpl.path, 'r')
    try:
        for kind, data, pos in XMLParser(source, filename=tpl.path):
            if kind is not START:
                continue
            tag, attrib = data
            is_include = (
                tag.namespace == 'http://www.w3.org/2001/XInclude'
                and tag.localname == 'include'
            )
            if not is_include:
                continue
            href = attrib.get('href')
            try:
                included = self.loader.load(href)
            except TemplateNotFound:
                # Dynamically computed hrefs cannot be resolved here; they
                # are ignored now and will fail later if really missing.
                continue
            found.append(included)
    finally:
        source.close()

    # Templates appended inside this loop are visited by it as well.
    for direct in found:
        for nested in self.subtemplates(direct):
            if nested not in found:
                found.append(nested)
    return found
def test_html_entity_in_attribute(self):
    """An HTML entity inside an attribute value is resolved to its character.

    The input must contain the ``&nbsp;`` entity reference itself;
    otherwise the test does not exercise entity handling at all.
    """
    text = '<p title="&nbsp;"/>'
    events = list(XMLParser(StringIO(text)))

    kind, data, pos = events[0]
    self.assertEqual(Stream.START, kind)
    # data is (tag, attrs); the entity must come back as U+00A0.
    self.assertEqual(u'\xa0', data[1].get('title'))

    kind, data, pos = events[1]
    self.assertEqual(Stream.END, kind)
def test_latin1_encoded_xmldecl(self):
    """Latin-1 bytes are decoded using the encoding named in the XML declaration."""
    markup = (
        u'<?xml version="1.0" encoding="iso-8859-1" ?> '
        u'<div>\xf6</div> '
    )
    raw = markup.encode('iso-8859-1')
    parsed = list(XMLParser(StringIO(raw)))
    # The text node is the third event of the stream.
    event_kind, payload, position = parsed[2]
    self.assertEqual(Stream.TEXT, event_kind)
    self.assertEqual(u'\xf6', payload)
def test_xmldecl_encoding(self):
    """The XML declaration event exposes version and encoding; standalone is -1 when absent."""
    markup = '<?xml version="1.0" encoding="utf-8" ?><root />'
    first_event = list(XMLParser(StringIO(markup)))[0]
    event_kind, (version, encoding, standalone), position = first_event
    self.assertEqual(Stream.XML_DECL, event_kind)
    self.assertEqual(u'1.0', version)
    self.assertEqual(u'utf-8', encoding)
    self.assertEqual(-1, standalone)
def test_xmldecl_standalone(self):
    """standalone="yes" is reported as 1 and a missing encoding as None."""
    markup = '<?xml version="1.0" standalone="yes" ?><root />'
    first_event = list(XMLParser(StringIO(markup)))[0]
    event_kind, (version, encoding, standalone), position = first_event
    self.assertEqual(Stream.XML_DECL, event_kind)
    self.assertEqual(u'1.0', version)
    self.assertEqual(None, encoding)
    self.assertEqual(1, standalone)
def test_text_node_pos_multi_line(self):
    """A text node spanning two lines keeps its newline and reports its position.

    The input must really contain a line break inside the element,
    because the assertions expect the text ``u'foo\\nbar'``.
    """
    text = '<elem>foo\nbar</elem>'
    events = list(XMLParser(StringIO(text)))
    kind, data, pos = events[1]
    self.assertEqual(Stream.TEXT, kind)
    self.assertEqual(u'foo\nbar', data)
    # The position is only checked on Python 2.4 and later.
    if sys.version_info[:2] >= (2, 4):
        self.assertEqual((None, 1, -1), pos)
def test_html_entity_with_dtd(self):
    """An HTML entity in element text is resolved when a DOCTYPE is declared.

    The input must contain the ``&nbsp;`` entity reference itself;
    otherwise the test does not exercise entity handling at all.
    """
    text = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> '
        '<html>&nbsp;</html> '
    )
    events = list(XMLParser(StringIO(text)))
    # The text node is the third event of the stream.
    kind, data, pos = events[2]
    self.assertEqual(Stream.TEXT, kind)
    self.assertEqual(u'\xa0', data)
def test_element_attribute_order(self):
    """Attributes are reported in the order they appear in the source."""
    markup = '<elem title="baz" id="foo" class="bar" />'
    event_kind, payload, position = list(XMLParser(StringIO(markup)))[0]
    self.assertEqual(Stream.START, event_kind)

    tag, attrib = payload
    self.assertEqual(u'elem', tag)
    expected_pairs = [
        (u'title', u'baz'),
        (u'id', u'foo'),
        (u'class', u'bar'),
    ]
    for index, pair in enumerate(expected_pairs):
        self.assertEqual(pair, attrib[index])
def extract_javascript_script(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript embedded in <script> tags.

    The ``<script type="text/javascript">`` elements of `fileobj` are
    rendered into an in-memory buffer, which is then handed to
    `extract_javascript` together with the remaining arguments.
    """
    from genshi.core import Stream
    from genshi.input import XMLParser

    rendered = StringIO()
    document = Stream(XMLParser(fileobj))
    scripts = document.select('//script[@type="text/javascript"]')
    scripts.render(out=rendered)
    # Rewind so the delegate reads the rendered scripts from the start.
    rendered.seek(0)
    return extract_javascript(rendered, keywords, comment_tags, options)
class XHTMLParser(object):
    """Parse an XHTML fragment.

    The fragment is wrapped in a ``<div>`` so it forms a single-rooted
    document; iteration yields the parser events with the events of that
    outermost element filtered out.
    """

    def __init__(self, text):
        wrapped = StringIO("<div>%s</div>" % text)
        self.parser = XMLParser(wrapped)
        self.depth = 0

    def __iter__(self):
        self.iter = self.parser.__iter__()
        return self

    def next(self):
        """Return the next event that lies inside the outermost element."""
        while True:
            event = self.iter.next()
            kind = event[0]
            if kind == 'END':
                self.depth -= 1
            # Depth after handling an END but before handling a START: it
            # is zero exactly for the events of the outermost element.
            enclosing_depth = self.depth
            if kind == 'START':
                self.depth += 1
            if enclosing_depth:
                return event
def test_undefined_entity_without_dtd(self):
    """An undefined entity raises ParseError when no DOCTYPE is given."""
    parser = XMLParser(StringIO('<html>&junk;</html>'))
    # The error is expected while the events are consumed, not when the
    # parser object is created.
    self.assertRaises(ParseError, list, parser)
def test_html_entity_without_dtd(self):
    """An HTML entity in element text is resolved even without a DOCTYPE.

    The input must contain the ``&nbsp;`` entity reference itself;
    otherwise the test does not exercise entity handling at all.
    """
    text = '<html>&nbsp;</html>'
    events = list(XMLParser(StringIO(text)))
    # Event 0 is the start tag; event 1 is the text node under test.
    kind, data, pos = events[1]
    self.assertEqual(Stream.TEXT, kind)
    self.assertEqual(u'\xa0', data)
def __init__(self, text):
    """Wrap `text` in a ``<div>`` element and create a parser over it."""
    wrapped = StringIO("<div>%s</div>" % text)
    self.parser = XMLParser(wrapped)
    # Starts at zero; nothing has been read from the parser yet.
    self.depth = 0
def test_latin1_encoded(self):
    """Latin-1 bytes are decoded when the encoding is passed explicitly."""
    raw = u'<div>\xf6</div>'.encode('iso-8859-1')
    parser = XMLParser(StringIO(raw), encoding='iso-8859-1')
    # Event 0 is the start tag; event 1 is the text node under test.
    event_kind, payload, position = list(parser)[1]
    self.assertEqual(Stream.TEXT, event_kind)
    self.assertEqual(u'\xf6', payload)
def test_unicode_input(self):
    """Unicode input comes back unchanged as the text node's data."""
    markup = u'<div>\u2013</div>'
    parsed = list(XMLParser(StringIO(markup)))
    # Event 0 is the start tag; event 1 is the text node under test.
    event_kind, payload, position = parsed[1]
    self.assertEqual(Stream.TEXT, event_kind)
    self.assertEqual(u'\u2013', payload)
def run(script, doc, output_file=None, options=None):
    """Process a Genshi template.

    :param script: path of the Genshi template file
    :param doc: document text; parsed as XML in filter mode and with
        feedparser in template mode
    :param output_file: when given, the rendered output is written to this
        path (template mode); otherwise the output is returned (filter mode)
    :param options: optional mapping passed to `Context` as keyword
        arguments
    :return: the rendered XML when `output_file` is not given, else None
    """
    # Use a fresh dict per call instead of a shared mutable default.
    if options is None:
        options = {}
    context = Context(**options)

    # Close the template file even if parsing the template fails.
    tmpl_fileobj = open(script)
    try:
        tmpl = MarkupTemplate(tmpl_fileobj, script)
    finally:
        tmpl_fileobj.close()

    if not output_file:
        # filter
        context.push({'input': XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config, feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data = feedparser.parse(filename(sources, sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name', ''), data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name, value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
            entry.source.config = find_config(config, entry.source)

            # add new_feed and new_date fields; an updated date takes
            # precedence over a published date
            entry.new_feed = entry.source.id
            entry.new_date = date = None
            if entry.has_key('published_parsed'):
                date = entry.published_parsed
            if entry.has_key('updated_parsed'):
                date = entry.updated_parsed
            if date:
                entry.new_date = time.strftime(new_date_format, date)

            # remove new_feed and new_date fields if not "new"
            if entry.new_date == last_date:
                entry.new_date = None
                if entry.new_feed == last_feed:
                    entry.new_feed = None
                else:
                    last_feed = entry.new_feed
            elif entry.new_date:
                last_date = entry.new_date
                last_feed = None

            # add streams for all text constructs
            for key in entry.keys():
                if key.endswith("_detail") and entry[key].has_key('type') and \
                        entry[key].has_key('value'):
                    streamify(entry[key], entry.source.planet_bozo)
            if entry.has_key('content'):
                for content in entry.content:
                    streamify(content, entry.source.planet_bozo)

        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet', True))
        context.push(vars)

    # apply template
    output = tmpl.generate(context).render('xml')

    if output_file:
        # Close the output file even if the write fails.
        out_file = open(output_file, 'w')
        try:
            out_file.write(output)
        finally:
            out_file.close()
    else:
        return output
#!/usr/bin/python
"""Pretty-print the Genshi XMLParser event stream of an XML file.

Usage: pass the path of the XML file as the first command-line argument.
"""
import sys
from pprint import pprint

from genshi.input import XMLParser


def main(argv):
    """Parse the file named by ``argv[1]`` and pretty-print its events.

    Returns a process exit status: 0 on success, 2 when the file
    argument is missing.
    """
    if len(argv) < 2:
        # Report the missing argument instead of failing with IndexError.
        sys.stderr.write('usage: %s FILE\n' % argv[0])
        return 2
    with open(argv[1]) as f:
        parser = XMLParser(f)
        pprint(list(parser))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))