def test_parslepy_init_selector_handler_error():
    """Using an abstract SelectorHandler subclass must fail.

    ``MyHandler`` overrides none of the abstract make/select/extract
    hooks, so either instantiation (TypeError, if SelectorHandler is an
    ABC) or Parselet compilation (NotImplementedError) must raise.

    NOTE(review): a different test with this exact name also appears
    later in the file; only one of the two is ever collected.
    """
    parselet_script = {
        "title": "h1",
        "subtitle": "//h2"
    }

    class MyHandler(parslepy.selectors.SelectorHandler):
        _dummy = True

    # The original test asserted nothing; any raised exception would have
    # failed it.  Assert the failure explicitly instead.
    try:
        mh = MyHandler()
        parslepy.Parselet(parselet_script, selector_handler=mh)
    except (TypeError, NotImplementedError):
        pass  # expected: the dummy handler cannot actually be used
    else:
        raise AssertionError(
            "using an abstract SelectorHandler subclass should have raised")
def compare_extracted_output(root, input_parselet, expected_output, debug=False):
    """Extract from *root* using *input_parselet* (strict mode) and
    assert the result equals *expected_output*."""
    extraction = parslepy.Parselet(
        input_parselet, strict=True, debug=debug).extract(root)
    assert_dict_equal(extraction, expected_output)
def test_parslepy_parse_html_file():
    """Parselet.parse() accepts an open file object holding HTML."""
    parselet = parslepy.Parselet({"title": "h1"})
    expected = {'title': 'Markup Validation Service'}
    dirname = os.path.dirname(os.path.abspath(__file__))
    # use a context manager so the fixture file is closed deterministically
    # (the original leaked the handle)
    with open(os.path.join(dirname, 'data/validator.w3.org.html')) as fp:
        extracted = parselet.parse(fp)
    assert_dict_equal(extracted, expected)
def test_broken(self):
    """A broken snippet must raise an Exception.

    In strict mode, a required key whose selector matches nothing is an
    error; the original version called extract() without catching the
    exception, so it could never pass as written.
    """
    input_parselet = {"stuff": {"broken": "spanner"}}
    parselet = parslepy.Parselet(input_parselet, strict=True, debug=self.debug)
    try:
        extracted = parselet.extract(self.root)
    except Exception:
        pass  # expected: strict mode rejects the non-matching required key
    else:
        if self.debug:
            pprint.pprint(extracted)
        raise AssertionError(
            "strict extraction of a broken parselet should have raised")
def test_parslepy_defaultparse_xml_file():
    """DefaultSelectorHandler with an 'atom' namespace parses an XML file."""
    parselet_script = {"id": "//atom:id"}
    dsh = parslepy.selectors.DefaultSelectorHandler(
        namespaces={'atom': 'http://www.w3.org/2005/Atom'}
    )
    dirname = os.path.dirname(os.path.abspath(__file__))
    expected = {
        'id': 'https://itunes.apple.com/us/rss/topalbums/limit=10/explicit=true/xml'
    }
    parselet = parslepy.Parselet(parselet_script, selector_handler=dsh)
    # close the fixture file deterministically (``fp`` was leaked before)
    with open(os.path.join(dirname, 'data/itunes.topalbums.rss')) as fp:
        extracted = parselet.parse(fp, parser=lxml.etree.XMLParser())
    assert_dict_equal(extracted, expected)
def test_broken_but_optional(self):
    """Empty dict if optional keys have broken inner-content.

    An inner object might be broken (no selector match), but when it
    hangs off an optional key the result is simply an empty dict.
    """
    script = {"stuff?": {"perhaps": "spanner"}}
    expected = {'stuff': {}}
    result = parslepy.Parselet(
        script, strict=True, debug=self.debug).extract(self.root)
    if self.debug:
        pprint.pprint(result)
    assert_dict_equal(result, expected)
def test_parslepy_parse_html_fromstring():
    """parse_fromstring() handles an XHTML document supplied as a string."""
    htmldoc = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> <meta http-equiv="Content-Type" content="text/html;charset=utf-8" /> <title>The W3C Markup Validation Service</title> <link rev="made" href="mailto:[email protected]" /> <link rel="shortcut icon" href="http://www.w3.org/2008/site/images/favicon.ico" type="image/x-icon" /> <link rev="start" href="./" title="Home Page" /> <style type="text/css" media="all"> @import "./style/base"; </style> <script type="text/javascript" src="scripts/combined"></script> <meta name="keywords" content="HTML, HyperText Markup Language, Validation, W3C Markup Validation Service" /> <meta name="description" content="W3C's easy-to-use markup validation service, based on SGML and XML parsers." /> <link rel="alternate" type="application/atom+xml" href="http://www.w3.org/QA/Tools/validator-whatsnew.atom" /> </head> <body> <div id="banner"> <h1 id="title"> <a href="http://www.w3.org/"><img alt="W3C" width="110" height="61" id="logo" src="./images/w3c.png" /></a> <a href="./"><span>Markup Validation Service</span></a> </h1> <p id="tagline">Check the markup (HTML, XHTML, ...) of Web documents</p> </div> </body> </html> """
    expected = {
        'title': 'Markup Validation Service',
        'pid': 'tagline',
    }
    # "p[id] @id" selects the @id attribute of the <p> carrying an id
    rules = parslepy.Parselet({"title": "h1", "pid": "p[id] @id"})
    assert_dict_equal(rules.parse_fromstring(htmldoc), expected)
def test_one_required_broken_one_matching(self):
    """Broken content with 1 non-matching selector.

    One required inner key matches nothing, so the whole inner object
    collapses to an empty dict even though the other key would match.
    """
    script = {"stuff": {
        "nothing": "paragraph",
        "title": "h1",
    }}
    expected = {'stuff': {}}
    result = parslepy.Parselet(
        script, strict=True, debug=self.debug).extract(self.root)
    if self.debug:
        pprint.pprint(result)
    assert_dict_equal(result, expected)
def test_parslepy_init_default():
    """A Parselet built without a handler compiles the script and falls
    back to DefaultSelectorHandler."""
    parselet_script = {
        "title": "h1",
        "subtitle": "//h2"
    }
    parselet = parslepy.Parselet(parselet_script)
    assert_dict_equal(parselet.parselet, parselet_script)
    assert_is_instance(parselet.parselet_tree, parslepy.base.ParsleyNode)
    assert_equal(len(parselet.parselet_tree), len(parselet_script),
                 "not the same number of keys")
    # no need to materialize items() into a list just to iterate it
    for k, v in parselet.parselet_tree.items():
        assert_is_instance(k, parslepy.base.ParsleyContext)
        assert_is_instance(v, parslepy.selectors.Selector)

    # since we did not provide a selector handler
    assert_is_instance(parselet.selector_handler,
                       parslepy.base.DefaultSelectorHandler)
def test_one_required_broken(self):
    """Broken content mixing required and optional keys.

    The single required key matches nothing, so the inner object
    collapses to an empty dict regardless of the optional keys.
    """
    script = {"stuff": {
        "nothing": "paragraph",
        "nothing2?": "spanner",
        "nothing3?": "bodyboard",
    }}
    expected = {'stuff': {}}
    result = parslepy.Parselet(
        script, strict=True, debug=self.debug).extract(self.root)
    if self.debug:
        pprint.pprint(result)
    assert_dict_equal(result, expected)
def test_parslepy_init_selector_handler_error():
    """A Parselet accepts a user-provided concrete SelectorHandler.

    NOTE(review): a test function with this exact name also appears
    earlier in the file; Python keeps only the last definition bound to
    a name, so only one of the two is ever collected by the test runner.
    One of them should be renamed.
    """
    parselet_script = {
        "title": "h1",
        "subtitle": "//h2"
    }

    class MyHandler(parslepy.selectors.SelectorHandler):
        # minimal concrete implementation: every handler hook is overridden
        def make(self, selection):
            return parslepy.selectors.Selector(lxml.etree.XPath("body"))

        def select(self, document, selector):
            return None

        def extract(self, document, selector):
            return None

    mh = MyHandler()
    parselet = parslepy.Parselet(parselet_script, selector_handler=mh)
    # the handler instance is stored as-is on the Parselet
    assert_is_instance(parselet.selector_handler, MyHandler)
def test_complicated(self):
    """Nested mix of required/optional keys: matching required keys keep
    their values, broken optional sub-objects become empty dicts."""
    script = {"stuff": {
        "nothing?": "paragraph",
        "title": {
            "value": "h1",
            "novalue?": {
                "maybe": "h47",
            }
        }
    }}
    expected = {
        'stuff': {
            'title': {
                'novalue': {},
                'value': 'Creative Commons License Deed',
            }
        }
    }
    result = parslepy.Parselet(
        script, strict=True, debug=self.debug).extract(self.root)
    if self.debug:
        pprint.pprint(result)
    assert_dict_equal(result, expected)
def test_all_optional(self):
    """When no selector matches anything for optional keys, we should end
    up with an empty dict, even if the parent key is required."""
    script = {"stuff": {
        "nothing1?": "h24",
        "nothing2?": "spanner",
        "nothing3?": "bodyboard",
    }}
    expected = {'stuff': {}}
    result = parslepy.Parselet(
        script, strict=True, debug=self.debug).extract(self.root)
    if self.debug:
        pprint.pprint(result)
    assert_dict_equal(result, expected)
def test_one_required_exists(self):
    """One matching required key plus non-matching optional keys.

    When optional-key selectors match nothing, only the non-empty
    key/values remain in the output.
    """
    script = {"stuff": {
        "nothing": "h1",
        "nothing2?": "spanner",
        "nothing3?": "bodyboard",
    }}
    expected = {'stuff': {'nothing': 'Creative Commons License Deed'}}
    result = parslepy.Parselet(
        script, strict=True, debug=self.debug).extract(self.root)
    if self.debug:
        pprint.pprint(result)
    assert_dict_equal(result, expected)
def test_parslepy_xpathparse_xml_fromstring():
    """XPathSelectorHandler with a merged "--" scope parses an XML string.

    Relies on a module-level ``xmldoc`` fixture (not defined in this
    block) holding the iTunes top-albums RSS document.
    """
    # NOTE(review): the parselet extracts "releasedate", but ``expected``
    # below has no 'releasedate' key -- confirm whether that key is
    # deliberately omitted or missing by mistake.
    parselet_script = {
        # "--" merges the matched entry's sub-results into the top level
        "--(//atom:feed/atom:entry)": {
            "title": "atom:title",
            "name": "im:name",
            "id": "atom:id/@im:id",
            "images(im:image)": [{
                "height": "@height",
                "url": ".",
            }],
            "releasedate": "im:releaseDate",
        }
    }
    xsh = parslepy.selectors.XPathSelectorHandler(
        namespaces={
            'atom': 'http://www.w3.org/2005/Atom',
            'im': 'http://itunes.apple.com/rss',
        }
    )
    expected = {
        'id': '647928068',
        'images': [
            {
                'height': '55',
                'url': 'http://a815.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.55x55-70.jpg'
            },
            {
                'height': '60',
                'url': 'http://a1537.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.60x60-50.jpg'
            },
            {
                'height': '170',
                'url': 'http://a976.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.170x170-75.jpg'
            }
        ],
        'name': 'The Gifted',
        'title': 'The Gifted - Wale',
    }
    parselet = parslepy.Parselet(parselet_script, selector_handler=xsh)
    extracted = parselet.parse_fromstring(xmldoc, parser=lxml.etree.XMLParser())
    assert_dict_equal(extracted, expected)
def test_to_xml():
    """The parslepy:xml() extension serializes a matched node back to XML."""
    parselets = (
        ({
            "first": "parslepy:xml(//atom:feed/atom:entry[1]/im:contentType)"
        }, {
            'first': '<im:contentType xmlns:im="http://itunes.apple.com/rss" xmlns="http://www.w3.org/2005/Atom" term="Music" label="Music"><im:contentType term="Album" label="Album"/></im:contentType>'
        }),
    )
    dirname = os.path.dirname(os.path.abspath(__file__))
    # close the fixture file deterministically: the original passed an
    # anonymous open() into lxml.etree.parse() and leaked the handle
    with open(os.path.join(dirname, 'data/itunes.topalbums.rss')) as fp:
        root = lxml.etree.parse(fp, parser=lxml.etree.XMLParser()).getroot()
    xsh = parslepy.selectors.XPathSelectorHandler(
        namespaces={
            'atom': 'http://www.w3.org/2005/Atom',
            'im': 'http://itunes.apple.com/rss'
        })
    for input_parselet, expected_output in parselets:
        parselet = parslepy.Parselet(
            input_parselet, selector_handler=xsh, strict=True)
        extracted = parselet.extract(root)
        assert_dict_equal(extracted, expected_output)
def get_details(self):
    """Scrape one listing page of question documents and queue the next page.

    Fetches ``self.next_list_url``, appends one dict per 11-cell row to
    ``self.details``, and updates ``self.next_list_url`` with the URL of
    the 'Next' link (or None when there is no further page).
    """
    # print() call form works on both Python 2 and 3; the original used
    # the Python-2-only ``print`` statement.
    print('Questions (%s)' % self.next_list_url)
    contents = self.url_get(self.next_list_url)
    p = parslepy.Parselet(self.question_parsing_rules)
    page = p.parse_fromstring(contents)
    for row in page['papers']:
        if len(row['cell']) == 11:
            url = row['cell'][8]['url']
            root, ext = os.path.splitext(os.path.split(url)[1])
            self.details.append({
                "name": row['cell'][0]['contents'],
                "language": row['cell'][6]['contents'],
                "url": self.base_url + url,
                "house": row['cell'][4]['contents'],
                "date": row['cell'][2]['contents'],
                "type": ext[1:],
                # This is also in the pdf's metadata, but it's easier to
                # get it from here
                "document_number": int(root.split('_')[0]),
            })
    # check for next page of links (or None if not found)
    self.next_list_url = None
    for cell in page['next']:
        if cell['contents'] == 'Next':
            next_url = self.base_url + cell['url']
            # NOTE(review): self.next_list_url is always None at this point,
            # so this loop-detection check can never fire -- it probably
            # meant to compare against the URL that was just fetched.
            # Behavior kept as-is.
            if self.next_list_url == next_url:
                raise Exception(
                    "Possible url loop detected, next url '{0}' has not changed."
                    .format(next_url))
            self.next_list_url = next_url
            break
def test_parslepy_init_wrong_selector_handler():
    """Passing an object that is not a SelectorHandler must be rejected.

    The original version asserted nothing, so a raised exception would
    simply have failed the test (a ``@raises`` decorator was evidently
    lost); assert the rejection explicitly instead.
    """
    parselet_script = {
        "title": "h1",
        "subtitle": "//h2"
    }
    try:
        parslepy.Parselet(parselet_script, selector_handler=lambda s: s)
    except (TypeError, ValueError):
        pass  # expected: a bare callable is not a SelectorHandler instance
    else:
        raise AssertionError(
            "Parselet should reject a selector_handler that is not "
            "a SelectorHandler instance")
def test_userdefined_extensions():
    """User-registered XPath extension functions are usable via a
    DefaultSelectorHandler with custom namespaces/extensions."""

    # extension: map a node-set of attributes to their attribute names
    def myattrnames(ctx, xpctx, attributes, *args):
        return [a.attrname for a in attributes]

    # extension to built full URLs from @href or @src attributes
    try:
        import urlparse  # Python 2.x
    except ImportError:
        import urllib.parse as urlparse

    def absurl(ctx, xpctx, attributes, *args):
        # ``ctx`` is the user-supplied context (the base URL here)
        return [urlparse.urljoin(ctx, u) for u in attributes]

    cases = ((
        {
            "head_meta(head/meta)": [{
                "attrnames": ["myext:attrnames(@*)"],
                "attrvals": ["@*"],
            }],
            "img_links": ["//img/@src"],
            "img_abslinks": ["myext:absurl(//img/@src)"],
        },
        {
            'head_meta': [
                {'attrnames': ['http-equiv', 'content'],
                 'attrvals': ['Content-Type', 'text/html;charset=utf-8']},
                {'attrnames': ['name', 'content'],
                 'attrvals': [
                     'keywords',
                     'HTML, HyperText Markup Language, Validation,\n W3C Markup Validation Service'
                 ]},
                {'attrnames': ['name', 'content'],
                 'attrvals': [
                     'description',
                     "W3C's easy-to-use\n markup validation service, based on SGML and XML parsers."
                 ]},
            ],
            'img_abslinks': [
                'http://validator.w3.org/images/w3c.png',
                'http://validator.w3.org/images/arrow-closed.png',
                'http://validator.w3.org/images/arrow-closed.png',
                'http://validator.w3.org/images/arrow-closed.png',
                'http://www.w3.org/Icons/VSlogo',
                'http://www.w3.org/Icons/WWW/w3c_home_nb',
                'http://validator.w3.org/images/opensource-55x48.png',
                'http://www.w3.org/QA/Tools/I_heart_validator'
            ],
            'img_links': [
                './images/w3c.png',
                './images/arrow-closed.png',
                './images/arrow-closed.png',
                './images/arrow-closed.png',
                'http://www.w3.org/Icons/VSlogo',
                'http://www.w3.org/Icons/WWW/w3c_home_nb',
                './images/opensource-55x48.png',
                'http://www.w3.org/QA/Tools/I_heart_validator'
            ]
        }),
    )

    namespaces = {"myext": "myextension"}
    extensions = {
        ("myextension", "absurl"): absurl,
        ("myextension", "attrnames"): myattrnames,
    }
    handler = parslepy.DefaultSelectorHandler(
        namespaces=namespaces, extensions=extensions)
    here = os.path.dirname(os.path.abspath(__file__))
    for script, wanted in cases:
        parselet = parslepy.Parselet(
            script, selector_handler=handler, strict=True)
        # the context becomes the base URL seen by the absurl() extension
        result = parselet.parse(
            os.path.join(here, 'data/validator.w3.org.html'),
            context='http://validator.w3.org/')
        assert_dict_equal(result, wanted)
def test_parslepy_keys():
    """Parselet.keys() reports only visible output keys; "--" scoping
    constructs contribute their inner keys, not themselves."""
    cases = [
        # plain flat script
        ({"title": "h1", "subtitle": "//h2"},
         ["title", "subtitle"]),
        # anonymous "--" scopes nested all the way down
        ({"--": {"--(#banner)": {"--(#title)": {"--(a span)": {"title": "."}}}}},
         ["title"]),
        # selector-bearing "--" scopes nested all the way down
        ({"--(#header)": {"--(#banner)": {"--(#title)": {"--(a span)": {"title": "."}}}}},
         ["title"]),
        # "--" scope plus a sibling list key
        ({"--": {"--(#banner)": {"--(#title)": {"--(a span)": {"title": "."}}}},
          "links": [".//a/@href"]},
         ["title", "links"]),
        # key inside a scope is still reported
        ({"title": "h1", "--(.content)": {"subtitle": ".//h2"}},
         ["title", "subtitle"]),
        # a scoped key that collides with a top-level key is not duplicated
        ({"title": "h1",
          "--(.content)": {"title": ".//h2"},
          "footer": "parslepy:html(.//div[@class='footer'])"},
         ["title", "footer"]),
    ]
    for script, wanted_keys in cases:
        assert_equal(set(parslepy.Parselet(script).keys()), set(wanted_keys))
# NOTE(review): the two lines below appear to be unrelated fragments pasted
# together, not runnable code from this test module:
#   (1) a README/REPL usage example (its printed output literal is malformed
#       -- the 'questions' list is never closed -- and it imports the
#       Python-2-only ``urllib2``), followed by a second example's setup, and
#   (2) the top of a subito.it scraper script that starts mid-``rules``-dict,
#       uses a Python 2 ``print`` statement, references an undefined
#       ``requests`` import, and is cut off inside its ``while`` loop.
# Left byte-identical: this span should be removed or restored from its
# original sources rather than patched in place.
import parslepy, urllib2 rules = {"questions(//div[contains(@class,'question-summary')])": [{"title": ".//h3/a", "votes": "div.votes div.mini-counts"}]} parslepy.Parselet(rules).parse(urllib2.urlopen('http://stackoverflow.com')) {'questions': [{'title': u'node.js RSS memory grows over time despite fairly consistent heap sizes', 'votes': u'0'}, {'title': u'SQL query for count of predicate applied on rows of subquery', 'votes': u'3'}, } import lxml.etree import parslepy import pprint html = """ <!DOCTYPE html> <html> <head> <title>Sample document to test parslepy</title> <meta http-equiv="content-type" content="text/html;charset=utf-8" /> </head> <body> <h1 id="main">What’s new</h1> <ul> <li class="newsitem"><a href="/article-001.html">This is the first article</a></li> <li class="newsitem"><a href="/article-002.html">A second report on something</a></li> <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li> </ul> </body> </html>""" rules = { "heading": "h1#main", "news(li.newsitem)": [{
"annuncio_url": "div[class=th_box] a @href", "annuncio_desc": "div[class=descr] p a strong", "annuncio_ora": "div[class=date]", }], "next_page_url": ".//a[contains(., 'Avanti')]/@href", } detrules = { "info(div.annuncio_info li)": [{ "item": ".", }], "coord": ".//script[contains(., 'loadMapQuest')]", } parselet = parslepy.Parselet(rules) detparselet = parslepy.Parselet(detrules) next_url = "http://www.subito.it/annunci-emilia-romagna/vendita/appartamenti/" while next_url: print "fetching", next_url current_url = next_url # ottiene il contenuto della pagina html = requests.get(next_url) extracted = parselet.parse_fromstring(html.content) for release in extracted.get("annunci"):
def get_details(self):
    """Scrape one listing page of answer documents and queue the next page.

    Parses ``self.next_list_url`` with ``self.answer_parsing_rules``,
    appends one metadata dict per valid 11-cell row to ``self.details``,
    and updates ``self.next_list_url`` with the 'Next' link (or None).
    """
    sys.stdout.write('Answers {0}\n'.format(self.next_list_url))
    contents = self.url_get(self.next_list_url)
    page = parslepy.Parselet(
        self.answer_parsing_rules).parse_fromstring(contents)
    for row in page['papers']:
        if len(row['cell']) != 11:
            continue
        url = row['cell'][8]['url']
        types = url.partition(".")
        date_published = row['cell'][2]['contents'].strip()
        try:
            date_published = datetime.datetime.strptime(
                date_published, '%d %B %Y').date()
        # was a bare ``except:``; strptime signals failure with ValueError
        except ValueError:
            warnings.warn("Failed to parse date (%s)" % date_published)
            continue

        document_name = row['cell'][0]['contents'].strip().upper()
        try:
            document_data = self.document_name_regex.match(
                document_name).groupdict()
        # was a bare ``except:``; a non-matching regex returns None, so the
        # only expected failure is AttributeError on .groupdict()
        except AttributeError:
            if document_name not in self.known_bad_document_names:
                sys.stdout.write(
                    'SKIPPING bad document_name {0}\n'.format(
                        document_name))
            continue

        # FIXME - Temporary fix for launch
        # drop anything which doesn't have a written_number
        if not document_data['written_number']:
            continue

        # The President and vice Deputy President have their own
        # oral question sequences.
        president = document_data.pop('president')
        if president == 'P':
            document_data['president_number'] = document_data.pop(
                'oral_number')
        if president == 'DP':
            document_data['dp_number'] = document_data.pop(
                'oral_number')

        document_data.update(
            dict(
                document_name=document_name,
                date_published=date_published,
                language=row['cell'][6]['contents'],
                url=self.base_url + url,
                type=types[2],
            ))
        try:
            document_data['date'] = datetime.datetime.strptime(
                document_data.pop('date_string'),
                '%y%m%d',
            ).date()
        # was a bare ``except:``; cover a missing group (KeyError) and an
        # unparsable value (ValueError)
        except (KeyError, ValueError):
            sys.stdout.write(
                "BAILING on {0} - problem converting date\n".format(
                    document_name))
            continue

        # We don't want anything from before the 2009 election.
        if document_data['date'] < datetime.date(2009, 4, 22):
            continue
        document_data['year'] = document_data['date'].year
        self.details.append(document_data)

    # check for next page of links (or None if not found)
    self.next_list_url = None
    for cell in page['next']:
        if cell['contents'] == 'Next':
            next_url = self.base_url + cell['url']
            # NOTE(review): self.next_list_url is always None here, so this
            # loop-detection check can never fire -- it probably meant to
            # compare against the URL just fetched.  Behavior kept as-is.
            if self.next_list_url == next_url:
                raise Exception(
                    "Possible url loop detected, next url '{0}' has not changed."
                    .format(next_url))
            self.next_list_url = next_url
            break
# Full-feed extraction test: an XPathSelectorHandler with atom/im
# namespaces pulls every entry (title, artist, images, release date) from
# the iTunes top-albums fixture and compares against the literal snapshot
# below.  Relies on ``self.docroot`` (parsed feed root) and ``self.debug``
# being set by the enclosing test fixture.  The expected-output literal is
# deliberately left untouched: any edit risks corrupting the snapshot.
def test_itunes_top_albums(self): input_parselet, expected_output = ( {"entries(//atom:feed/atom:entry)": [{ "title": "atom:title", "name": "im:name", "id": "atom:id/@im:id", "artist(im:artist)": { "name": ".", "href": "@href", }, "images(im:image)": [{ "height": "@height", "url": ".", }], #"content": "atom:content[@type='html']" "releasedate": "im:releaseDate", }] }, {'entries': [{'artist': {'href': 'https://itunes.apple.com/us/artist/wale/id129335935?uo=2', 'name': 'Wale'}, 'id': '647928068', 'images': [{'height': '55', 'url': 'http://a815.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.55x55-70.jpg'}, {'height': '60', 'url': 'http://a1537.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.60x60-50.jpg'}, {'height': '170', 'url': 'http://a976.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.170x170-75.jpg'}], 'name': 'The Gifted', 'releasedate': '2013-06-24T00:00:00-07:00', 'title': 'The Gifted - Wale'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/kanye-west/id2715720?uo=2', 'name': 'Kanye West'}, 'id': '662392801', 'images': [{'height': '55', 'url': 'http://a697.phobos.apple.com/us/r1000/033/Music4/v4/b8/fc/be/b8fcbe49-510d-8afe-7c34-fa268da339f2/UMG_cvrart_00602537439317_01_RGB72_1500x1500_13UAAIM08444.55x55-70.jpg'}, {'height': '60', 'url': 'http://a1419.phobos.apple.com/us/r1000/033/Music4/v4/b8/fc/be/b8fcbe49-510d-8afe-7c34-fa268da339f2/UMG_cvrart_00602537439317_01_RGB72_1500x1500_13UAAIM08444.60x60-50.jpg'}, {'height': '170', 'url': 'http://a1930.phobos.apple.com/us/r1000/033/Music4/v4/b8/fc/be/b8fcbe49-510d-8afe-7c34-fa268da339f2/UMG_cvrart_00602537439317_01_RGB72_1500x1500_13UAAIM08444.170x170-75.jpg'}], 'name': 'Yeezus', 'releasedate': '2013-06-18T00:00:00-07:00', 'title': 'Yeezus - Kanye West'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/j-cole/id73705833?uo=2', 'name': 'J Cole'}, 'id': 
'651105499', 'images': [{'height': '55', 'url': 'http://a537.phobos.apple.com/us/r30/Music2/v4/c5/03/68/c5036883-38b9-702c-baf0-876db639b1f9/886444025935.55x55-70.jpg'}, {'height': '60', 'url': 'http://a1259.phobos.apple.com/us/r30/Music2/v4/c5/03/68/c5036883-38b9-702c-baf0-876db639b1f9/886444025935.60x60-50.jpg'}, {'height': '170', 'url': 'http://a1354.phobos.apple.com/us/r30/Music2/v4/c5/03/68/c5036883-38b9-702c-baf0-876db639b1f9/886444025935.170x170-75.jpg'}], 'name': 'Born Sinner (Deluxe Version)', 'releasedate': '2013-06-14T00:00:00-07:00', 'title': 'Born Sinner (Deluxe Version) - J Cole'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/august-burns-red/id47796394?uo=2', 'name': 'August Burns Red'}, 'id': '655052532', 'images': [{'height': '55', 'url': 'http://a854.phobos.apple.com/us/r30/Music2/v4/05/81/64/05816462-e832-80e4-9fa1-554d9bdd2542/886443989689.55x55-70.jpg'}, {'height': '60', 'url': 'http://a1576.phobos.apple.com/us/r30/Music2/v4/05/81/64/05816462-e832-80e4-9fa1-554d9bdd2542/886443989689.60x60-50.jpg'}, {'height': '170', 'url': 'http://a359.phobos.apple.com/us/r30/Music2/v4/05/81/64/05816462-e832-80e4-9fa1-554d9bdd2542/886443989689.170x170-75.jpg'}], 'name': 'Rescue & Restore', 'releasedate': '2013-06-25T00:00:00-07:00', 'title': 'Rescue & Restore - August Burns Red'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/mac-miller/id419944559?uo=2', 'name': 'Mac Miller'}, 'id': '650864146', 'images': [{'height': '55', 'url': 'http://a1599.phobos.apple.com/us/r30/Music/v4/7c/03/68/7c03681e-3cb6-23cb-5584-5b9dd42e54f7/040232021398_Cover.55x55-70.jpg'}, {'height': '60', 'url': 'http://a321.phobos.apple.com/us/r30/Music/v4/7c/03/68/7c03681e-3cb6-23cb-5584-5b9dd42e54f7/040232021398_Cover.60x60-50.jpg'}, {'height': '170', 'url': 'http://a1696.phobos.apple.com/us/r30/Music/v4/7c/03/68/7c03681e-3cb6-23cb-5584-5b9dd42e54f7/040232021398_Cover.170x170-75.jpg'}], 'name': 'Watching Movies With the Sound Off (Deluxe Edition)', 'releasedate': 
'2013-06-18T00:00:00-07:00', 'title': 'Watching Movies With the Sound Off (Deluxe Edition) - Mac Miller'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/daft-punk/id5468295?uo=2', 'name': 'Daft Punk'}, 'id': '617154241', 'images': [{'height': '55', 'url': 'http://a1849.phobos.apple.com/us/r1000/096/Music2/v4/52/aa/50/52aa5008-4934-0c27-a08d-8ebd7d13c030/886443919266.55x55-70.jpg'}, {'height': '60', 'url': 'http://a923.phobos.apple.com/us/r1000/096/Music2/v4/52/aa/50/52aa5008-4934-0c27-a08d-8ebd7d13c030/886443919266.60x60-50.jpg'}, {'height': '170', 'url': 'http://a1450.phobos.apple.com/us/r1000/096/Music2/v4/52/aa/50/52aa5008-4934-0c27-a08d-8ebd7d13c030/886443919266.170x170-75.jpg'}], 'name': 'Random Access Memories', 'releasedate': '2013-05-21T00:00:00-07:00', 'title': 'Random Access Memories - Daft Punk'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/skillet/id1750802?uo=2', 'name': 'Skillet'}, 'id': '655774977', 'images': [{'height': '55', 'url': 'http://a545.phobos.apple.com/us/r1000/050/Music/v4/b8/3f/7b/b83f7b74-4e7a-6b06-9385-667dc1288d7d/075679954787.55x55-70.jpg'}, {'height': '60', 'url': 'http://a1267.phobos.apple.com/us/r1000/050/Music/v4/b8/3f/7b/b83f7b74-4e7a-6b06-9385-667dc1288d7d/075679954787.60x60-50.jpg'}, {'height': '170', 'url': 'http://a114.phobos.apple.com/us/r1000/050/Music/v4/b8/3f/7b/b83f7b74-4e7a-6b06-9385-667dc1288d7d/075679954787.170x170-75.jpg'}], 'name': 'Rise', 'releasedate': '2013-06-21T00:00:00-07:00', 'title': 'Rise - Skillet'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/skillet/id1750802?uo=2', 'name': 'Skillet'}, 'id': '662457451', 'images': [{'height': '55', 'url': 'http://a399.phobos.apple.com/us/r1000/022/Music/v4/87/3e/eb/873eebf6-618c-d8e1-b8df-4d0b60f6729b/075679954749.55x55-70.jpg'}, {'height': '60', 'url': 'http://a1473.phobos.apple.com/us/r1000/022/Music/v4/87/3e/eb/873eebf6-618c-d8e1-b8df-4d0b60f6729b/075679954749.60x60-50.jpg'}, {'height': '170', 'url': 
'http://a880.phobos.apple.com/us/r1000/022/Music/v4/87/3e/eb/873eebf6-618c-d8e1-b8df-4d0b60f6729b/075679954749.170x170-75.jpg'}], 'name': 'Rise (Deluxe Version)', 'releasedate': '2013-06-21T00:00:00-07:00', 'title': 'Rise (Deluxe Version) - Skillet'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/attila/id46893195?uo=2', 'name': 'Attila'}, 'id': '649587514', 'images': [{'height': '55', 'url': 'http://a608.phobos.apple.com/us/r30/Music/v4/ee/7d/b2/ee7db2ad-e783-2c3a-2ad3-6549868315e7/793018342834.55x55-70.jpg'}, {'height': '60', 'url': 'http://a1682.phobos.apple.com/us/r30/Music/v4/ee/7d/b2/ee7db2ad-e783-2c3a-2ad3-6549868315e7/793018342834.60x60-50.jpg'}, {'height': '170', 'url': 'http://a1297.phobos.apple.com/us/r30/Music/v4/ee/7d/b2/ee7db2ad-e783-2c3a-2ad3-6549868315e7/793018342834.170x170-75.jpg'}], 'name': 'About That Life', 'releasedate': '2013-06-25T00:00:00-07:00', 'title': 'About That Life - Attila'}, {'artist': {'href': 'https://itunes.apple.com/us/artist/india.arie/id92325?uo=2', 'name': 'India.Arie'}, 'id': '659585460', 'images': [{'height': '55', 'url': 'http://a1694.phobos.apple.com/us/r30/Music/v4/d5/65/b2/d565b212-4463-6486-7ee2-eeab22ff3d87/UMG_cvrart_00602537429486_01_RGB72_1500x1500_13UAAIM06584.55x55-70.jpg'}, {'height': '60', 'url': 'http://a768.phobos.apple.com/us/r30/Music/v4/d5/65/b2/d565b212-4463-6486-7ee2-eeab22ff3d87/UMG_cvrart_00602537429486_01_RGB72_1500x1500_13UAAIM06584.60x60-50.jpg'}, {'height': '170', 'url': 'http://a63.phobos.apple.com/us/r30/Music/v4/d5/65/b2/d565b212-4463-6486-7ee2-eeab22ff3d87/UMG_cvrart_00602537429486_01_RGB72_1500x1500_13UAAIM06584.170x170-75.jpg'}], 'name': 'SongVersation (Deluxe Edition)', 'releasedate': '2013-06-25T00:00:00-07:00', 'title': 'SongVersation (Deluxe Edition) - India.Arie'}]} ) xsh = parslepy.selectors.XPathSelectorHandler( namespaces={ 'atom': 'http://www.w3.org/2005/Atom', 'im': 'http://itunes.apple.com/rss' }) parselet = parslepy.Parselet( input_parselet, selector_handler=xsh, 
strict=True, debug=self.debug) extracted = parselet.extract(self.docroot) if self.debug: pprint.pprint(extracted) assert_dict_equal(extracted, expected_output)
def init_parselet_expect_syntax_error(self, parselet):
    """Helper: construct a Parselet from *parselet* and discard it.

    Presumably invoked via assert_raises by callers that expect the
    construction to fail with a syntax error -- confirm at call sites.
    """
    parslepy.Parselet(parselet)
def test_parslepy_init_invalid_parselet():
    """A non-dict parselet description must be rejected at init time.

    The script here is a *string* that merely looks like a dict.  The
    original test asserted nothing, so the expected rejection would
    simply have failed it; assert the rejection explicitly.
    """
    try:
        parslepy.Parselet("{ 'title': 'h1'}")
    except (TypeError, ValueError):
        pass  # expected: only a dict is a valid parselet script
    else:
        raise AssertionError(
            "Parselet should raise for a non-dict parselet script")