def parse_parsley(self, response):
    """Parse ``response.body`` with the ``T1`` parselet and return the item.

    A ``PyParsley`` is built from ``T1``, the response body is run through
    it, and the parsed data is wrapped (together with ``T1``) in a
    ``UturnItem``.  The item is pretty-printed and an end marker is written
    to stdout before the item is returned.

    :param response: object exposing the document to parse as ``.body``.
    :returns: the ``UturnItem`` built from the parse result.
    """
    # NOTE: no early ``exit()`` here -- an unconditional exit would abort
    # the whole process and leave everything below unreachable.
    parslet = PyParsley(T1, output='python')
    res = UturnItem(T1, parslet.parse(string=response.body))
    pprint.pprint(res)
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("parsley end")
    return res
def __init__(self, parseletfile=None):
    """Set ``self.parselet`` from a file or from the built-in default.

    :param parseletfile: optional path.  When truthy, the file is opened
        and the open file object is handed to ``PyParsley``; otherwise
        the parselet is built from ``FAP_PARSELET``.
    """
    if not parseletfile:
        # No file supplied: fall back to the module-level parselet.
        self.parselet = PyParsley(FAP_PARSELET)
        return

    with open(parseletfile) as jsonfp:
        self.parselet = PyParsley(jsonfp)
def f(dictionary, ten_range):
    """Mutate both arguments in place; nothing is returned.

    ``ten_range`` is reversed.  ``dictionary`` receives, in this order, the
    keys ``1``, ``'sub'``, ``'2'`` and ``0.25``; ``'sub'`` holds the JSON
    output of a one-field parselet run against a hard-coded HTML file.
    """
    dictionary[1] = '1'
    ten_range.reverse()

    # Hard-coded fixture location; the parse fails if the file is absent.
    yelp_page = '/home/ubuntu/Programs/drupal/scrapy-parsley_wrappers/scrapy_parsley/tests/yelp/yelp.html'
    title_parselet = PyParsley({"title": "/div/div/div"})
    dictionary['sub'] = title_parselet.parse(file=yelp_page, output='json')

    dictionary['2'] = 2
    dictionary[0.25] = None
def setUp(self): self.parsley = PyParsley({'title': 'title'}) self.alt_parsley = PyParsley('{"title": "title"}') self.a_parsley = PyParsley({'links': ['regexp:match(a @href, ".*sign.*")']}) self.__file__ = currentframe().f_code.co_filename self.__dir__ = dirname(self.__file__) self.file = self.__dir__ + '/yelp.html' self.json = '{ "title": "\\t\\tNick\'s Crispy Tacos - Russian Hill - San Francisco, CA\\n" }' self.native = { "title": "\t\tNick's Crispy Tacos - Russian Hill - San Francisco, CA\n" } self.links = '{ "links": [ "\\/signup?return_url=%2Fuser_details", "\\/signup?return_url=%2Fwriteareview", "\\/signup?return_url=%2Finvite_friends", "\\/signup?return_url=%2Fmail", "\\/signup?return_url=%2Fprofile", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup" ] }' self.unicode_string = u'\u2019blah blah blah\u2019' self.unicode_document = u'<html><title>\u2019blah blah blah\u2019</title></html>'
def parse(self, response):
    """Yield a ``GalleryItem`` for each entry under ``"galls"``.

    Nothing is yielded when ``self.parselet`` is falsy.  Otherwise a new
    ``PyParsley`` is built from ``FAP_PARSELET`` and run against the
    hard-coded gallery URL with ``output="python"``; every element of the
    result's ``"galls"`` value is wrapped in a ``GalleryItem``.

    NOTE(review): ``response`` is never read and ``self.parselet`` serves
    only as a guard -- confirm that this is intentional.
    """
    if not self.parselet:
        return

    gallery_parselet = PyParsley(FAP_PARSELET)
    parsed_page = gallery_parselet.parse(
        file="http://www.imagefap.com/gallery.php",
        output="python",
    )
    for gallery in parsed_page["galls"]:
        yield GalleryItem(gallery)
def parse(self, response):
    """Generate ``GalleryItem`` objects from the parsed gallery page.

    When ``self.parselet`` is truthy, a ``PyParsley`` built from
    ``FAP_PARSELET`` parses the hard-coded gallery URL into Python
    objects, and one ``GalleryItem`` is yielded per element of the
    ``"galls"`` entry.  When it is falsy the generator is empty.

    NOTE(review): the ``response`` argument is unused, and the parselet
    held on ``self`` is only tested for truthiness -- verify this is the
    intended behaviour.
    """
    if self.parselet:
        galleries = PyParsley(FAP_PARSELET).parse(
            file="http://www.imagefap.com/gallery.php",
            output="python",
        )["galls"]
        for entry in galleries:
            yield GalleryItem(entry)
class TestPyParsley(unittest.TestCase): def setUp(self): self.parsley = PyParsley({'title': 'title'}) self.alt_parsley = PyParsley('{"title": "title"}') self.a_parsley = PyParsley({'links': ['regexp:match(a @href, ".*sign.*")']}) self.__file__ = currentframe().f_code.co_filename self.__dir__ = dirname(self.__file__) self.file = self.__dir__ + '/yelp.html' self.json = '{ "title": "\\t\\tNick\'s Crispy Tacos - Russian Hill - San Francisco, CA\\n" }' self.native = { "title": "\t\tNick's Crispy Tacos - Russian Hill - San Francisco, CA\n" } self.links = '{ "links": [ "\\/signup?return_url=%2Fuser_details", "\\/signup?return_url=%2Fwriteareview", "\\/signup?return_url=%2Finvite_friends", "\\/signup?return_url=%2Fmail", "\\/signup?return_url=%2Fprofile", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup", "\\/signup" ] }' self.unicode_string = u'\u2019blah blah blah\u2019' self.unicode_document = u'<html><title>\u2019blah blah blah\u2019</title></html>' def test_unicode(self): parsed = self.parsley.parse(string = self.unicode_document.encode("utf-8"), output = "python", utf8 = 1) self.assertEquals(parsed['title'].decode("utf-8"), self.unicode_string) def test_file_xml(self): parsed = self.parsley.parse(file = self.file, output = "json") self.assertEquals(self.json, parsed) def test_pruning(self): parsed = self.a_parsley.parse(file = self.file, output = "json") self.assertEquals(self.links, parsed) def test_json_file_xml(self): parsed = self.alt_parsley.parse(file = self.file, output = "json") 
self.assertEquals(self.json, parsed) def test_native(self): parsed = self.alt_parsley.parse(file = self.file, output = "python") self.assertEquals(self.native, parsed) parsed = self.alt_parsley.parse(file = self.file) self.assertEquals(self.native, parsed)
def parse_parsley(self, response):
    """Run this object's parselet over ``response.body`` and wrap the result.

    :param response: object exposing the document to parse as ``.body``.
    :returns: a ``ParsleyItem`` built from ``self.parslet_code`` and the
        data parsed out of the body.
    """
    code = self.parslet_code
    extracted = PyParsley(code, output='python').parse(string=response.body)
    return ParsleyItem(code, extracted)