Exemple #1
0
 def test_dupefilter(self):
     """Check DropMetaPipeline output for a stored books.toscrape.com result.

     The first stored result of template ``3617-44af-a2f0`` is passed through
     ``DropMetaPipeline.process_item`` and compared with the expected item.

     NOTE(review): the name says "dupefilter" but only DropMetaPipeline is
     exercised here; the name is kept so the test id does not change.
     """
     spider_name = 'books.toscrape.com'
     template_id = '3617-44af-a2f0'
     manager = SlybotSpiderManager("%s/data/SampleProject" % PATH)
     spider = manager.create(spider_name)
     spider_spec = manager._specs["spiders"][spider_name]
     pipeline = DropMetaPipeline(Settings({"SLYDROPMETA_ENABLED": True}))

     # Expected item, assembled field by field.
     description = u"WICKED above her hipbone, GIRL across her heart Words are like a road map to reporter Camille Preaker’s troubled past. Fresh from a brief stay at a psych hospital, Camille’s first assignment from the second-rate daily paper where she works brings her reluctantly back to her hometown to cover the murders of two preteen girls. NASTY on her kneecap, BABYDOLL on her leg Since WICKED above her hipbone, GIRL across her heart Words are like a road map to reporter Camille Preaker’s troubled past. Fresh from a brief stay at a psych hospital, Camille’s first assignment from the second-rate daily paper where she works brings her reluctantly back to her hometown to cover the murders of two preteen girls. NASTY on her kneecap, BABYDOLL on her leg Since she left town eight years ago, Camille has hardly spoken to her neurotic, hypochondriac mother or to the half-sister she barely knows: a beautiful thirteen-year-old with an eerie grip on the town. Now, installed again in her family’s Victorian mansion, Camille is haunted by the childhood tragedy she has spent her whole life trying to cut from her memory. HARMFUL on her wrist, W***E on her ankle As Camille works to uncover the truth about these violent crimes, she finds herself identifying with the young victims—a bit too strongly. Clues keep leading to dead ends, forcing Camille to unravel the psychological puzzle of her own past to get at the story. Dogged by her own demons, Camille will have to confront what happened to her years before if she wants to survive this homecoming.With its taut, crafted writing, Sharp Objects is addictive, haunting, and unforgettable. ...more"
     info = {
         "price": ["47.82"],
         "stock": ["20"],
         "tax": ["0.00"],
         "type": ["Books"],
         "upc": ["e00eb4fd7b871a48"],
     }
     expected = {
         "breadcrumbs": ["Home", "Books", "Mystery"],
         "description": [description],
         "image": [
             "http://books.toscrape.com/media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg",
         ],
         "info": info,
         "url": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html",
     }

     matching = (tpl for tpl in spider_spec["templates"]
                 if tpl['page_id'] == template_id)
     template = next(matching)
     actual = pipeline.process_item(template['results'][0], spider)
     self.assertEqual(expected, actual)
 def test_dupefilter(self):
     """Check DropMetaPipeline output for a stored books.toscrape.com result.

     Looks up template ``3617-44af-a2f0`` in the spider spec, feeds its first
     stored result to ``DropMetaPipeline.process_item`` and compares the
     outcome with the expected item.

     NOTE(review): despite the name, no dupe filter is involved in this test.
     """
     spider_name = 'books.toscrape.com'
     wanted_page = '3617-44af-a2f0'
     manager = SlybotSpiderManager("%s/data/SampleProject" % PATH)
     spider = manager.create(spider_name)
     spider_spec = manager._specs["spiders"][spider_name]
     pipeline = DropMetaPipeline(Settings({"SLYDROPMETA_ENABLED": True}))

     description = (
         u"WICKED above her hipbone, GIRL across her heart Words are "
         u"like a road map to reporter Camille Preaker’s troubled past."
         u" Fresh from a brief stay at a psych hospital, Camille’s "
         u"first assignment from the second-rate daily paper where she "
         u"works brings her reluctantly back to her hometown to cover "
         u"the murders of two preteen girls. NASTY on her kneecap, "
         u"BABYDOLL on her leg Since WICKED above her hipbone, GIRL "
         u"across her heart Words are like a road map to reporter "
         u"Camille Preaker’s troubled past. Fresh from a brief stay at "
         u"a psych hospital, Camille’s first assignment from the "
         u"second-rate daily paper where she works brings her "
         u"reluctantly back to her hometown to cover the murders of "
         u"two preteen girls. NASTY on her kneecap, BABYDOLL on her leg"
         u" Since she left town eight years ago, Camille has hardly "
         u"spoken to her neurotic, hypochondriac mother or to the "
         u"half-sister she barely knows: a beautiful thirteen-year-old "
         u"with an eerie grip on the town. Now, installed again in her "
         u"family’s Victorian mansion, Camille is haunted by the "
         u"childhood tragedy she has spent her whole life trying to cut"
         u" from her memory. HARMFUL on her wrist, W***E on her ankle "
         u"As Camille works to uncover the truth about these violent "
         u"crimes, she finds herself identifying with the young "
         u"victims—a bit too strongly. Clues keep leading to dead ends,"
         u" forcing Camille to unravel the psychological puzzle of her "
         u"own past to get at the story. Dogged by her own demons, "
         u"Camille will have to confront what happened to her years "
         u"before if she wants to survive this homecoming.With its "
         u"taut, crafted writing, Sharp Objects is addictive, haunting,"
         u" and unforgettable. ...more"
     )
     image_url = ("http://books.toscrape.com/media/cache/c0/59/c05972805aa720117"
                  "1b8fc71a5b00292.jpg")
     page_url = ("http://books.toscrape.com/catalogue/"
                 "sharp-objects_997/index.html")
     expected = {
         "breadcrumbs": ["Home", "Books", "Mystery"],
         "description": [description],
         "image": [image_url],
         "info": {
             "price": ["47.82"],
             "stock": ["20"],
             "tax": ["0.00"],
             "type": ["Books"],
             "upc": ["e00eb4fd7b871a48"],
         },
         "url": page_url,
     }

     template = next(
         tpl for tpl in spider_spec["templates"]
         if tpl['page_id'] == wanted_page)
     actual = pipeline.process_item(template['results'][0], spider)
     self.assertEqual(expected, actual)
 def test_nested_items(self):
     """Parse the stored page of template 3617-44af-a2f0 and compare the
     indexable objects yielded by the spider with the stored results."""
     spider_name = 'books.toscrape.com'
     manager = SlybotSpiderManager("%s/data/SampleProject" % PATH)
     spider = manager.create(spider_name)
     spider_spec = manager._specs["spiders"][spider_name]
     matching = [tpl for tpl in spider_spec["templates"]
                 if tpl['page_id'] == '3617-44af-a2f0']
     template = matching[0]
     page_body = template['original_body'].encode('utf-8')
     response = HtmlResponse(template['url'], body=page_body)
     # Keep only outputs that support item access; anything else is skipped.
     extracted = []
     for output in spider.parse(response):
         if hasattr(output, '__getitem__'):
             extracted.append(output)
     self.assertEqual(extracted, template['results'])
 def test_nested_items(self):
     """Compare what the books.toscrape.com spider yields for the stored
     page of template 3617-44af-a2f0 with that template's stored results."""
     spider_name = 'books.toscrape.com'
     page_id = '3617-44af-a2f0'
     manager = SlybotSpiderManager("%s/data/SampleProject" % PATH)
     spider = manager.create(spider_name)
     spider_spec = manager._specs["spiders"][spider_name]
     template = [candidate for candidate in spider_spec["templates"]
                 if candidate['page_id'] == page_id][0]
     response = HtmlResponse(
         template['url'],
         body=template['original_body'].encode('utf-8'))
     # Only outputs exposing __getitem__ are compared.
     indexable = [output for output in spider.parse(response)
                  if hasattr(output, '__getitem__')]
     self.assertEqual(indexable, template['results'])
Exemple #5
0
class DupeFilterTest(TestCase):
    """Tests for DupeFilterPipeline using the "seedsofchange2" sample spider."""

    smanager = SlybotSpiderManager("%s/data/SampleProject" % _PATH)

    def _first_item(self, spider, template):
        """Return the first DictItem the spider yields for a template's page.

        Fails the test explicitly when no DictItem is produced, instead of
        silently handing back whatever object happened to be yielded last.
        """
        response = HtmlResponse(
            url=template["url"],
            body=template["original_body"].encode('utf-8'))
        for output in spider.handle_html(response):
            if isinstance(output, DictItem):
                return output
        self.fail("no DictItem extracted from %s" % template["url"])

    def test_dupefilter(self):
        """Items pass through once; a repeated item raises DropItem."""
        name = "seedsofchange2"
        spider = self.smanager.create(name)
        spec = self.smanager._specs["spiders"][name]
        # Unpacking also checks that the spec holds exactly two templates.
        t1, t2 = spec["templates"]

        dupefilter = DupeFilterPipeline(Settings({"SLYDUPEFILTER_ENABLED": True}))

        item1 = self._first_item(spider, t1)
        item2 = self._first_item(spider, t2)

        # First sighting of each item: returned unchanged.
        self.assertEqual(item1, dupefilter.process_item(item1, spider))
        self.assertEqual(item2, dupefilter.process_item(item2, spider))

        # Second sighting of item1: dropped.
        self.assertRaises(DropItem, dupefilter.process_item, item1, spider)
Exemple #6
0
class SpiderTest(TestCase):
    """Spider plugin tests run against the SampleProject fixtures."""

    smanager = SlybotSpiderManager("%s/data/SampleProject" % _PATH)

    def test_spider_with_selectors(self):
        """The Selectors plugin fills 'breadcrumbs' and extends an existing
        'image' value on the item it processes."""
        spider_name = "seedsofchange"
        spider = self.smanager.create(spider_name)
        spider_spec = self.smanager._specs["spiders"][spider_name]
        template = spider_spec["templates"][1]
        page_body = template["original_body"].encode('utf-8')
        response = HtmlResponse(template["url"], body=page_body)

        # Item as it looks before the plugin runs; 'image' already has data.
        item = {
            '_template': u'4fac3b47688f920c7800000f',
            '_type': u'default',
            'image': u'previous data',
        }

        selectors = spider.plugins['Selectors']
        selectors.process_item(item, response)

        expected_breadcrumbs = [
            u'Seeds & Supplies', u'Seeds', u'Vegetables', u'Squash & Pumpkins'
        ]
        expected_image = [
            u'previous data', u'/images/product_shots/PPS14165B.jpg'
        ]
        self.assertEqual(item['breadcrumbs'], expected_breadcrumbs)
        self.assertEqual(item['image'], expected_image)
Exemple #7
0
class SpiderTest(TestCase):
    """Selector-related spider tests run against the SampleProject fixtures."""

    smanager = SlybotSpiderManager("%s/data/SampleProject" % PATH)

    def test_spider_with_selectors(self):
        """The Selectors plugin fills 'breadcrumbs' and extends an existing
        'image' value on the item it processes."""
        spider_name = "seedsofchange"
        spider = self.smanager.create(spider_name)
        spider_spec = self.smanager._specs["spiders"][spider_name]
        template = spider_spec["templates"][1]
        page_body = template["original_body"].encode('utf-8')
        response = HtmlResponse(template["url"], body=page_body)

        # Item as it looks before the plugin runs; 'image' already has data.
        item = {
            '_template': u'4fac3b47688f920c7800000f',
            '_type': u'default',
            'image': u'previous data',
        }

        selectors = spider.plugins['Selectors']
        selectors.process_item(item, response)

        expected_breadcrumbs = [
            u'Seeds & Supplies', u'Seeds', u'Vegetables', u'Squash & Pumpkins'
        ]
        expected_image = [
            u'previous data', u'/images/product_shots/PPS14165B.jpg'
        ]
        self.assertEqual(item['breadcrumbs'], expected_breadcrumbs)
        self.assertEqual(item['image'], expected_image)

    def test_spider_with_inbuilt_selectors(self):
        """Parse the first books.toscrape.com template page and compare the
        indexable outputs with the template's stored '_results'."""
        spider_name = 'books.toscrape.com'
        spider = self.smanager.create(spider_name)
        spider_spec = self.smanager._specs["spiders"][spider_name]
        template = spider_spec["templates"][0]
        page_body = template['original_body'].encode('utf-8')
        response = HtmlResponse(template['url'], body=page_body)

        extracted = []
        for output in spider.parse(response):
            if hasattr(output, '__getitem__'):
                extracted.append(output)

        # Replace the first 'posted' value by its formatted text before
        # comparing with the stored results.
        for entry in extracted:
            posted_at = entry['posted'][0]
            entry['posted'] = [posted_at.strftime('%Y-%m-%d %H:%M')]
        self.assertEqual(extracted, template['_results'])

    def test_spider_with_surrounded_selectors(self):
        """Non-Request outputs for the 'cs-cart.json' fixture equal its
        expected results."""
        spider, page, results = open_spider_page_and_results('cs-cart.json')
        items = []
        for output in spider.parse(page):
            if isinstance(output, Request):
                continue
            items.append(output)
        self.assertEqual(items, results)
Exemple #8
0
class SpiderTest(TestCase):
    smanager = SlybotSpiderManager("%s/data/SampleProject" % _PATH)

    def test_list(self):
        """The spider manager lists exactly the expected spider names."""
        expected_names = set([
            "seedsofchange",
            "seedsofchange2",
            "seedsofchange.com",
            "pinterest.com",
            "ebay",
            "ebay2",
            "ebay3",
            "ebay4",
            "cargurus",
            "networkhealth.com",
            "allowed_domains",
            "any_allowed_domains",
            "example.com",
            "example2.com",
            "example3.com",
            "sitemaps",
        ])
        listed_names = set(self.smanager.list())
        self.assertEqual(listed_names, expected_names)

    def test_spider_with_link_template(self):
        """For "seedsofchange": the first template page yields no items but
        104 processed link regions; the second yields one item and none."""
        spider_name = "seedsofchange"
        spider = self.smanager.create(spider_name)
        spider_spec = self.smanager._specs["spiders"][spider_name]
        # Unpacking doubles as a check that there are exactly two templates.
        first_tpl, second_tpl = spider_spec["templates"]
        first_page = HtmlPage(url=first_tpl["url"], body=first_tpl["original_body"])
        second_page = HtmlPage(url=second_tpl["url"], body=second_tpl["original_body"])

        # First page: no items, only link regions.
        items, link_regions = spider.plugins['Annotations'].extract_items(first_page)
        self.assertEqual(items, [])
        processed = list(
            spider.plugins['Annotations']._process_link_regions(first_page, link_regions))
        self.assertEqual(len(processed), 104)

        # Second page: one item, no link regions.
        items, link_regions = spider.plugins['Annotations'].extract_items(second_page)
        expected_item = {
            '_template': u'4fac3b47688f920c7800000f',
            '_type': u'default',
            u'category': [u'Winter Squash'],
            u'days': [None],
            u'description': [u'1-2 lbs. (75-95 days) This early, extremely productive, compact bush variety is ideal for small gardens.  Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh.  Great for stuffing, soups and pies.'],
            u'lifecycle': [u'Tender Annual'],
            u'name': [u'Gold Nugget'],
            u'price': [u'3.49'],
            u'product_id': [u'01593'],
            u'species': [u'Cucurbita maxima'],
            'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
            u'weight': [None],
        }
        self.assertEqual(items[0], expected_item)
        self.assertEqual(link_regions, [])
        processed = list(
            spider.plugins['Annotations']._process_link_regions(second_page, link_regions))
        self.assertEqual(len(processed), 0)

    def test_spider_with_link_region_but_not_link_template(self):
        """For "seedsofchange2": both template pages yield an item; only the
        second one also yields a link region."""
        spider_name = "seedsofchange2"
        spider = self.smanager.create(spider_name)
        spider_spec = self.smanager._specs["spiders"][spider_name]
        # Unpacking doubles as a check that there are exactly two templates.
        first_tpl, second_tpl = spider_spec["templates"]
        first_page = HtmlPage(url=first_tpl["url"], body=first_tpl["original_body"])
        second_page = HtmlPage(url=second_tpl["url"], body=second_tpl["original_body"])

        # First page: one item, no link regions.
        items, link_regions = spider.plugins['Annotations'].extract_items(first_page)
        expected_onion = {
            '_template': u'4fad6a7c688f922437000014',
            '_type': u'default',
            u'category': [u'Onions'],
            u'days': [None],
            u'description': [u'(110-120 days)  Midsized Italian variety.  Long to intermediate day red onion that tolerates cool climates.  Excellent keeper.  We have grown out thousands of bulbs and re-selected this variety to be the top quality variety that it once was.  4-5" bulbs are top-shaped, uniformly colored, and have tight skins.'],
            u'lifecycle': [u'Heirloom/Rare'],
            u'name': [u'Rossa Di Milano Onion'],
            u'price': [u'3.49'],
            u'species': [u'Alium cepa'],
            u'type': [u'Heirloom/Rare'],
            'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS15978',
        }
        self.assertEqual(items[0], expected_onion)
        self.assertEqual(link_regions, [])

        # Second page: one item plus a single link region.
        items, link_regions = spider.plugins['Annotations'].extract_items(second_page)
        expected_squash = {
            '_template': u'4fad6a7d688f922437000017',
            '_type': u'default',
            u'category': [u'Winter Squash'],
            u'days': [None],
            u'description': [u'1-2 lbs. (75-95 days) This early, extremely productive, compact bush variety is ideal for small gardens.  Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh.  Great for stuffing, soups and pies.'],
            u'lifecycle': [u'Tender Annual'],
            u'name': [u'Gold Nugget'],
            u'price': [u'3.49'],
            u'species': [u'Cucurbita maxima'],
            'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
            u'weight': [None],
        }
        self.assertEqual(items[0], expected_squash)
        self.assertEqual(len(link_regions), 1)
        # NOTE(review): the regions come from the second page but are
        # processed against the first page; kept as-is because the expected
        # count of 25 was presumably recorded with this pairing -- confirm.
        processed = list(
            spider.plugins['Annotations']._process_link_regions(first_page, link_regions))
        self.assertEqual(len(processed), 25)

    def test_login_requests(self):
        """Check the login form request built by the "pinterest.com" spider
        and the URLs it requests after a simulated login response."""
        name = "pinterest.com"
        spider = self.smanager.create(name)
        login_request = list(spider.start_requests())[0]

        # Read the fixture inside ``with`` so the file handle is closed
        # deterministically instead of being left to garbage collection.
        with open(join(_PATH, "data", "pinterest.html")) as login_page:
            login_body = login_page.read()
        response = HtmlResponse(url="https://pinterest.com/login/", body=login_body)
        response.request = login_request
        form_request = login_request.callback(response)
        expected = {
            '_encoding': 'utf-8',
            'body': 'email=test&password=testpass&csrfmiddlewaretoken=nLZy3NMzhTswZvweHJ4KVmq9UjzaZGn3&_ch=ecnwmar2',
            'callback': 'after_login',
            'cookies': {},
            'dont_filter': True,
            'errback': None,
            'headers': {'Content-Type': ['application/x-www-form-urlencoded']},
            'meta': {},
            'method': 'POST',
            'priority': 0,
            'url': u'https://pinterest.com/login/?next=%2F',
        }

        self.assertEqual(request_to_dict(form_request, spider), expected)

        # simulate a simple response to login post from which extract a link
        response = HtmlResponse(url="http://pinterest.com/", body="<html><body><a href='http://pinterest.com/categories'></body></html>")
        result = list(spider.after_login(response))
        self.assertEqual([r.url for r in result], ['http://pinterest.com/categories', 'http://pinterest.com/popular/'])

    def test_generic_form_requests(self):
        """Check the requests the "ebay" spider generates from the advanced
        search form: one per ``_in_kw`` value 1-4, then the form page itself."""

        def expected_request(url, callback):
            # Serialized request with the fields shared by every expected entry.
            return {'body': '', '_encoding': 'utf-8', 'cookies': {},
                    'meta': {}, 'headers': {}, 'url': url,
                    'dont_filter': True, 'priority': 0, 'callback': callback,
                    'method': 'GET', 'errback': None}

        name = "ebay"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Read the fixture inside ``with`` so the file handle is closed.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as form_page:
            form_body = form_page.read()
        response = HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=form_body)
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                        for req in generic_form_request.callback(response)]

        # The expected search URLs differ only in the ``_in_kw`` value.
        search_url = u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw={in_kw}&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow='
        expected = [expected_request(search_url.format(in_kw=in_kw), 'after_form_page')
                    for in_kw in range(1, 5)]
        expected.append(
            expected_request(u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'parse'))
        self.assertEqual(request_list, expected)

    def test_generic_form_requests_with_file_field(self):
        """Follow the "ebay2" spider through its three request stages.

        Stage 1 and 2 answer the ``file://`` parameter requests; stage 3
        answers the form page and expects one search request per combination
        of ``_nkw``, ``_nkw2`` and ``_in_kw``, then the form page itself.
        """

        def read_fixture(filename):
            # ``with`` closes the handle instead of leaking it.
            with open(join(_PATH, "data", filename)) as handle:
                return handle.read()

        def expected_request(url, callback, meta=None):
            # Serialized request with the fields shared by every expected entry.
            return {'body': '', '_encoding': 'utf-8', 'cookies': {},
                    'meta': {} if meta is None else meta, 'headers': {},
                    'url': url, 'dont_filter': True, 'priority': 0,
                    'callback': callback, 'method': 'GET', 'errback': None}

        name = "ebay2"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Stage 1: the start request targets the parameter file.
        self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt')
        response = HtmlResponse(url='file://tmp/test_params.txt', body=read_fixture("test_params.txt"))
        response.request = generic_form_request
        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [expected_request(
            u'file://tmp/test_params.txt',
            'parse_field_url_page',
            meta={
                u'xpath': u"//form[@name='adv_search_from']",
                u'form_url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
                u'type': u'form',
                'field_index': 1,
                u'fields': [
                    {u'xpath': u".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'value': u'file://tmp/test_params.txt'},
                    {u'type': u'inurl', u'name': u'_nkw2', u'value': u'file://tmp/test_params.txt'},
                    {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'},
                ],
            })]
        self.assertEqual(request_list, expected)

        # Stage 2: the second field's parameter file is answered.
        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt')
        response = HtmlResponse(url='file://tmp/test_params.txt', body=read_fixture("test_params.txt"))
        response.request = generic_form_request

        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [expected_request(
            u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
            'parse_form_page',
            meta={
                u'xpath': u"//form[@name='adv_search_from']",
                u'fields': [
                    {u'xpath': u".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'value': u'file://tmp/test_params.txt'},
                    {'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'name': u'_nkw2', u'value': u'file://tmp/test_params.txt'},
                    {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'},
                ],
                u'type': u'form',
                'field_index': 1,
            })]
        self.assertEqual(request_list, expected)

        # Stage 3: the form page itself is answered.
        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url, 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc')
        response = HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=read_fixture("ebay_advanced_search.html"))
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                        for req in generic_form_request.callback(response)]

        # The expected search URLs differ only in three query values.
        # Expected order: ``_nkw`` varies slowest, then ``_nkw2``, then
        # ``_in_kw`` (1-4) fastest.
        search_url = u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2={nkw2}&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw={in_kw}&_nkw={nkw}&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow='
        expected = []
        for nkw in ('Cars', 'Boats'):
            for nkw2 in ('Cars', 'Boats'):
                for in_kw in range(1, 5):
                    url = search_url.format(nkw=nkw, nkw2=nkw2, in_kw=in_kw)
                    expected.append(expected_request(url, 'after_form_page'))
        expected.append(
            expected_request(u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'parse'))
        self.assertEqual(request_list, expected)

    def test_generic_form_requests_with_spider_args(self):
        """Check the form requests of the "ebay3" spider when it is created
        with ``search_string='Cars'``: one per ``_in_kw`` value 1-4, then the
        form page itself."""

        def expected_request(url, callback):
            # Serialized request with the fields shared by every expected entry.
            return {'body': '', '_encoding': 'utf-8', 'cookies': {},
                    'meta': {}, 'headers': {}, 'url': url,
                    'dont_filter': True, 'priority': 0, 'callback': callback,
                    'method': 'GET', 'errback': None}

        name = "ebay3"
        args = {'search_string': 'Cars'}
        spider = self.smanager.create(name, **args)
        generic_form_request = list(spider.start_requests())[0]

        # Read the fixture inside ``with`` so the file handle is closed.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as form_page:
            form_body = form_page.read()
        response = HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=form_body)
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                        for req in generic_form_request.callback(response)]

        # The expected search URLs differ only in the ``_in_kw`` value.
        search_url = u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw={in_kw}&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow='
        expected = [expected_request(search_url.format(in_kw=in_kw), 'after_form_page')
                    for in_kw in range(1, 5)]
        expected.append(
            expected_request(u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'parse'))
        self.assertEqual(request_list, expected)

    def test_allowed_domains(self):
        # The spider built from the "allowed_domains" spec must report
        # exactly these two domains, in this order.
        spider = self.smanager.create("allowed_domains")
        self.assertEqual(spider.allowed_domains,
                         ['www.ebay.com', 'www.yahoo.com'])

    def test_allowed_domains_all(self):
        # For the "any_allowed_domains" spider, allowed_domains is expected
        # to compare equal to None.
        spider = self.smanager.create("any_allowed_domains")
        self.assertEqual(spider.allowed_domains, None)

    def test_allowed_domains_previous_behavior(self):
        # The "cargurus" spider must expose a single allowed domain.
        spider = self.smanager.create("cargurus")
        self.assertEqual(spider.allowed_domains, ['www.cargurus.com'])

    def test_links_from_rss(self):
        # Feed an RSS fixture to the "cargurus" spider and check that parse()
        # produces requests for exactly the three expected URLs.

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to garbage collection.
        with open(join(_PATH, "data", "rss_sample.xml")) as f:
            body = f.read()
        response = XmlResponse(url="http://example.com/sample.xml", body=body,
                headers={'Content-Type': 'application/rss+xml;charset=ISO-8859-1'})

        name = "cargurus"
        spider = self.smanager.create(name)

        urls = [r.url for r in spider.parse(response)]
        # Length is checked separately so duplicates are not hidden by the
        # set comparison below.
        self.assertEqual(len(urls), 3)
        self.assertEqual(set(urls), set([
                "http://www.cargurus.com/Cars/2004-Alfa-Romeo-GT-Reviews-c10012",
                "http://www.cargurus.com/Cars/2005-Alfa-Romeo-GT-Reviews-c10013",
                "http://www.cargurus.com/Cars/2007-Alfa-Romeo-GT-Reviews-c10015"]))

    def test_links_from_atom(self):
        # Feed an Atom fixture to the "sitemaps" spider and check that parse()
        # produces requests for exactly the three expected URLs.

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to garbage collection.
        with open(join(_PATH, "data", "atom_sample.xml")) as f:
            body = f.read()
        response = XmlResponse(url="http://example.com/sample.xml", body=body,
                headers={'Content-Type': "application/atom+xml; charset=UTF-8"})

        name = "sitemaps"
        spider = self.smanager.create(name)

        urls = [r.url for r in spider.parse(response)]
        # Length is checked separately so duplicates are not hidden by the
        # set comparison below.
        self.assertEqual(len(urls), 3)
        self.assertEqual(set(urls), set([
                "http://www.webupd8.org/sitemap.xml?page=1",
                "http://www.webupd8.org/sitemap.xml?page=2",
                "http://www.webupd8.org/sitemap.xml?page=3"]))

    def test_links_from_sitemap(self):
        # Feed a sitemap fixture to the "sitemaps" spider and check that
        # parse() produces requests for exactly the three expected URLs.

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to garbage collection.
        with open(join(_PATH, "data", "sitemap_sample.xml")) as f:
            body = f.read()
        response = XmlResponse(url="http://example.com/sample.xml", body=body,
                headers={'Content-Type': "text/xml; charset=UTF-8"})

        name = "sitemaps"
        spider = self.smanager.create(name)

        urls = [r.url for r in spider.parse(response)]
        # Length is checked separately so duplicates are not hidden by the
        # set comparison below.
        self.assertEqual(len(urls), 3)
        self.assertEqual(set(urls), set([
                "https://www.siliconrepublic.com/post-sitemap1.xml",
                "https://www.siliconrepublic.com/post-sitemap2.xml",
                "https://www.siliconrepublic.com/post-sitemap3.xml"]))

    def test_empty_content_type(self):
        # Parsing a plain Response built without any headers (hence no
        # Content-Type) must complete without raising.
        name = "ebay4"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to garbage collection.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as f:
            body = f.read()
        response = Response(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
                            body=body)
        response.request = generic_form_request
        # must not raise an error; the results themselves are not inspected,
        # the generator only has to be exhausted.
        for result in spider.parse(response):
            pass

    def test_variants(self):
        """Ensure variants are extracted as list of dicts"""

        spider_name = "networkhealth.com"
        spider = self.smanager.create(spider_name)
        spider_spec = self.smanager._specs["spiders"][spider_name]
        # Single-element unpacking: raises unless the spec holds exactly
        # one template.
        (template,) = spider_spec["templates"]
        page = HtmlPage(url=template["url"], body=template["original_body"])
        annotations = spider.plugins['Annotations']
        extracted, _regions = annotations.extract_items(page)
        for record in extracted:
            for entry in record["variants"]:
                # Exact type check: each variant must be a plain dict.
                self.assertEqual(type(entry), dict)

    def test_start_requests(self):
        name = "example.com"
        spider = self.smanager.create(name)
        spec = self.smanager._specs["spiders"][name]
        start_requests = list(spider.start_requests())
        self.assertEqual(len(start_requests), 2)
        self.assertEqual(start_requests[0].url, 'http://www.example.com/products.csv')
        self.assertEqual(start_requests[1].url, 'http://www.example.com/index.html')

        csv = """
My feed

name,url,id
Product A,http://www.example.com/path,A
Product B,http://www.example.com/path2,B"""
        response = TextResponse(url="http://www.example.com/products.csv", body=csv)
        requests = list(start_requests[0].callback(spider, response))
        self.assertEqual(len(requests), 2)
        self.assertEqual(requests[0].url, 'http://www.example.com/path')
        self.assertEqual(requests[1].url, 'http://www.example.com/path2')

    def test_start_requests_allowed_domains(self):
        # The "example2.com" spider must report www.example.com as its only
        # allowed domain.
        spider = self.smanager.create("example2.com")
        expected_domains = ['www.example.com']
        self.assertEqual(spider.allowed_domains, expected_domains)

    def test_override_start_urls(self):
        # A start_urls argument passed to create() must show up as the URL of
        # the second start request.
        override_url = 'http://www.example.com/override.html'
        spider = self.smanager.create("example2.com", start_urls=[override_url])
        generated = list(spider.start_requests())
        self.assertEqual(generated[1].url,
                         'http://www.example.com/override.html')

    def test_links_to_follow(self):
        # With links_to_follow='none', the start request's callback must not
        # yield any request, even though the page contains a link.
        page_markup = "<html><body><a href='http://www.example.com/link.html'>Link</a></body></html>"
        response = HtmlResponse(url='http://www.example.com/index.html', body=page_markup)

        spider = self.smanager.create("example3.com", links_to_follow='none')
        first_request = list(spider.start_requests())[0]

        followed = list(first_request.callback(response))
        self.assertEqual(len(followed), 0)

    def test_js_enable_patterns(self):
        # With only an enable pattern configured, a URL matching it gets
        # Splash meta while a non-matching URL gets none.
        with splash_spider_manager() as manager:
            spider = manager.create("example3.com", js_enabled=True,
                                    js_enable_patterns=['/products/'])
        matching_url = 'http://www.example.com/products/1234'
        other_url = 'http://www.example.com/aboutus'

        rendered = spider._add_splash_meta(Request(matching_url))
        self.assertEqual(rendered.meta['splash']['args']['url'], matching_url)

        untouched = spider._add_splash_meta(Request(other_url))
        self.assertEqual(untouched.meta.get('splash'), None)

    def test_js_disable_patterns(self):
        # With only a disable pattern configured, a URL matching it gets no
        # Splash meta while a non-matching URL does.
        with splash_spider_manager() as manager:
            spider = manager.create("example3.com", js_enabled=True,
                                    js_disable_patterns=['/products/'])
        excluded_url = 'http://www.example.com/products/1234'
        rendered_url = 'http://www.example.com/aboutus'

        untouched = spider._add_splash_meta(Request(excluded_url))
        self.assertEqual(untouched.meta.get('splash'), None)

        rendered = spider._add_splash_meta(Request(rendered_url))
        self.assertEqual(rendered.meta['splash']['args']['url'], rendered_url)

    def test_js_enable_and_disable_patterns(self):
        # When a URL matches both the enable and the disable pattern it gets
        # no Splash meta; a URL matching only the enable pattern does.
        with splash_spider_manager() as manager:
            spider = manager.create("example3.com", js_enabled=True,
                                    js_enable_patterns=['/products/'],
                                    js_disable_patterns=['/products/[a-zA-Z]'])
        listing_url = 'http://www.example.com/products/cameras'
        detail_url = 'http://www.example.com/products/1234'

        listing_request = spider._add_splash_meta(Request(listing_url))
        self.assertEqual(listing_request.meta.get('splash'), None)

        detail_request = spider._add_splash_meta(Request(detail_url))
        self.assertEqual(detail_request.meta['splash']['args']['url'], detail_url)
Exemple #9
0
def splash_spider_manager(splash_url='http://localhost:8050'):
    """Yield a SampleProject spider manager configured with a Splash URL.

    The project settings are loaded, SPLASH_URL is set to ``splash_url``,
    and a single SlybotSpiderManager built with those settings is yielded.
    """
    # NOTE(review): callers use this in a ``with`` statement, so a
    # context-manager decorator is presumably applied outside this view --
    # confirm against the full file.
    project_settings = get_project_settings()
    project_settings.set('SPLASH_URL', splash_url)
    project_dir = "%s/data/SampleProject" % _PATH
    yield SlybotSpiderManager(project_dir, settings=project_settings)
 def setUp(self):
     # Build the spider under test from the SampleProject data directory.
     project_dir = "%s/data/SampleProject" % PATH
     manager = SlybotSpiderManager(project_dir)
     self.spider = manager.create('books.toscrape.com_1')
Exemple #11
0
 def setUp(self):
     # Each test gets a freshly created 'books.toscrape.com_1' spider.
     sample_project = "%s/data/SampleProject" % PATH
     self.spider = SlybotSpiderManager(sample_project).create(
         'books.toscrape.com_1')
Exemple #12
0
class SpiderTest(TestCase):
    smanager = SlybotSpiderManager("%s/data/Plants" % _PATH)

    def test_list(self):
        # The listing order is irrelevant, so both sides are compared as sets.
        expected_names = set(["seedsofchange", "seedsofchange2",
                              "seedsofchange.com", "pinterest.com", "ebay"])
        listed_names = set(self.smanager.list())
        self.assertEqual(listed_names, expected_names)

    def test_spider_with_link_template(self):
        # The "seedsofchange" spec has two templates: the first is expected to
        # yield no items but link regions producing 104 results, the second a
        # single fully extracted item and no link regions.
        name = "seedsofchange"
        spider = self.smanager.create(name)
        with open(join(self.smanager.datadir, 'spiders', '%s.json' % name)) as f:
            spec = json.load(f)
        # Two-element unpacking doubles as a check that the spec holds exactly
        # two templates.
        target1, target2 = [HtmlPage(url=t["url"], body=t["original_body"]) for t in spec["templates"]]

        items, link_regions = spider.extract_items(target1)
        self.assertEqual(items, [])
        self.assertEqual(len(list(spider._process_link_regions(target1, link_regions))), 104)

        items, link_regions = spider.extract_items(target2)
        self.assertEqual(items[0], {
                '_template': u'4fac3b47688f920c7800000f',
                '_type': u'default',
                u'category': [u'Winter Squash'],
                u'days': [None],
                u'description': [u'1-2 lbs. (75-95 days)&nbsp;This early, extremely productive, compact bush variety is ideal for small gardens.&nbsp; Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh.&nbsp; Great for stuffing, soups and pies.'],
                u'lifecycle': [u'Tender Annual'],
                u'name': [u'Gold Nugget'],
                u'price': [u'3.49'],
                u'product_id': [u'01593'],
                u'species': [u'Cucurbita maxima'],
                'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
                u'weight': [None]}
        )
        self.assertEqual(link_regions, [])
        self.assertEqual(len(list(spider._process_link_regions(target2, link_regions))), 0)

    def test_spider_with_link_region_but_not_link_template(self):
        # Both templates of the "seedsofchange2" spec are expected to yield an
        # item; only the second is expected to also yield a link region.
        name = "seedsofchange2"
        spider = self.smanager.create(name)
        with open(join(self.smanager.datadir, 'spiders', '%s.json' % name)) as f:
            spec = json.load(f)

        # Two-element unpacking doubles as a check that the spec holds exactly
        # two templates.
        target1, target2 = [HtmlPage(url=t["url"], body=t["original_body"]) for t in spec["templates"]]
        items, link_regions = spider.extract_items(target1)
        self.assertEqual(items[0], {
                '_template': u'4fad6a7c688f922437000014',
                '_type': u'default',
                u'category': [u'Onions'],
                u'days': [None],
                u'description': [u'(110-120 days)&nbsp; Midsized Italian variety.&nbsp; Long to intermediate day red onion that tolerates cool climates.&nbsp; Excellent keeper.&nbsp; We have grown out thousands of bulbs and re-selected this variety to be the top quality variety that it once was.&nbsp; 4-5&quot; bulbs are top-shaped, uniformly colored, and have tight skins.'],
                u'lifecycle': [u'Heirloom/Rare'],
                u'name': [u'Rossa Di Milano Onion'],
                u'price': [u'3.49'],
                u'species': [u'Alium cepa'],
                u'type': [u'Heirloom/Rare'],
                'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS15978'}
        )
        self.assertEqual(link_regions, [])

        items, link_regions = spider.extract_items(target2)
        self.assertEqual(items[0], {
                '_template': u'4fad6a7d688f922437000017',
                '_type': u'default',
                u'category': [u'Winter Squash'],
                u'days': [None],
                u'description': [u'1-2 lbs. (75-95 days)&nbsp;This early, extremely productive, compact bush variety is ideal for small gardens.&nbsp; Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh.&nbsp; Great for stuffing, soups and pies.'],
                u'lifecycle': [u'Tender Annual'],
                u'name': [u'Gold Nugget'],
                u'price': [u'3.49'],
                u'species': [u'Cucurbita maxima'],
                'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
                u'weight': [None]}
        )
        self.assertEqual(len(link_regions), 1)
        # NOTE(review): the link regions extracted from target2 are processed
        # against target1 here -- confirm this pairing is intentional.
        self.assertEqual(len(list(spider._process_link_regions(target1, link_regions))), 25)

    def test_login_requests(self):
        # The first start request of the "pinterest.com" spider is a login
        # request; its callback must build the expected login form POST, and
        # after_login must yield follow-up requests from the landing page.
        name = "pinterest.com"
        spider = self.smanager.create(name)
        login_request = list(spider.start_requests())[0]

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to garbage collection.
        with open(join(_PATH, "data", "pinterest.html")) as f:
            login_page = f.read()
        response = HtmlResponse(url="https://pinterest.com/login/", body=login_page)
        response.request = login_request
        form_request = login_request.callback(response)
        expected = {'_encoding': 'utf-8',
            'body': 'email=test&password=testpass&csrfmiddlewaretoken=nLZy3NMzhTswZvweHJ4KVmq9UjzaZGn3&_ch=ecnwmar2',
            'callback': 'after_login',
            'cookies': {},
            'dont_filter': False,
            'errback': None,
            'headers': {'Content-Type': ['application/x-www-form-urlencoded']},
            'meta': {},
            'method': 'POST',
            'priority': 0,
            'url': u'https://pinterest.com/login/?next=%2F'}

        self.assertEqual(request_to_dict(form_request, spider), expected)

        # simulate a simple response to login post from which extract a link
        response = HtmlResponse(url="http://pinterest.com/", body="<html><body><a href='http://pinterest.com/categories'></body></html>")
        result = list(spider.after_login(response))
        self.assertEqual([r.url for r in result], ['http://pinterest.com/categories', 'http://pinterest.com/popular/'])

    def test_generic_form_requests(self):
        # The first start request of the "ebay" spider is a generic form
        # request; its callback must yield four form submissions (one per
        # _in_kw value 1-4) followed by a request for the form page itself.
        name = "ebay"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to garbage collection.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as f:
            form_page = f.read()
        # NOTE(review): the doubled "http://http://" scheme is mirrored in the
        # last expected URL below, so it is kept as is -- confirm it is
        # intentional.
        response = HtmlResponse(url="http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=form_page)
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                             for req in generic_form_request.callback(response)]

        expected = [{'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_in_kw=1&_udlo=&_ex_kw=&_nkw=Cars&_ipg=50&_adv=1&_salic=1&_dmd=1&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_sop=12&_sasl=', 'dont_filter': False, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None},
                    {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_in_kw=2&_udlo=&_ex_kw=&_nkw=Cars&_ipg=50&_adv=1&_salic=1&_dmd=1&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_sop=12&_sasl=', 'dont_filter': False, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None},
                    {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_in_kw=3&_udlo=&_ex_kw=&_nkw=Cars&_ipg=50&_adv=1&_salic=1&_dmd=1&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_sop=12&_sasl=', 'dont_filter': False, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None},
                    {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_in_kw=4&_udlo=&_ex_kw=&_nkw=Cars&_ipg=50&_adv=1&_salic=1&_dmd=1&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_sop=12&_sasl=', 'dont_filter': False, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None},
                    {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'dont_filter': True, 'priority': 0, 'callback': 'parse', 'method': 'GET', 'errback': None}]
        self.assertEqual(request_list, expected)