Beispiel #1
0
 def test_init(self):
     """Response() rejects bad arguments and accepts a url/request pair."""
     url = 'http://www.example.com/'

     # Neither argument, url only, or request only: construction fails.
     with self.assertRaises(Exception):
         Response()
     with self.assertRaises(Exception):
         Response(url=url)
     request = Request(url)
     with self.assertRaises(Exception):
         Response(request=request)

     # A malformed url or a non-numeric status is reported as ValueError.
     request = Request(url)
     with self.assertRaises(ValueError):
         Response(url='foo', request=request)
     request = Request(url)
     with self.assertRaises(ValueError):
         Response(url, status='foo', request=request)

     # A request argument of the wrong type is reported as TypeError.
     with self.assertRaises(TypeError):
         Response(url, request='foo')

     # Well-formed construction, first without and then with headers.
     response = Response(url, Request(url))
     assert response.url
     assert not response.body
     response = Response(url,
                         Request(url),
                         headers={'Content-Type': 'text/html',
                                  'Content-Length': 1234})
Beispiel #2
0
    def test_dynamic_request_browser_actions(self):
        """Submit a sign-in form via browser actions, then fetch again.

        The first dynamic request runs ``_actions`` inside the browser; the
        second request uses the same ``cookiejar`` key. Both requests and
        responses are passed through CookiesMiddleware.

        The browser is closed in a ``finally`` clause so that a failure in
        any step does not leave the Chrome window open.
        """
        cm = CookiesMiddleware(self.spider, self.spider.settings)
        self.driver = webdriver.Chrome()
        try:
            dh = DownloadHandler(self.spider, self.driver, self.driver_sem)

            def _actions(driver):
                # Fill in the (placeholder) credentials and press the button.
                driver.find_element_by_name('account').send_keys("username")
                driver.find_element_by_name('password').send_keys("pwd")
                driver.find_element_by_xpath(
                    '/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button'
                ).click()
                # NOTE(review): presumably waits for the login to complete.
                gevent.sleep(5)

            request = Request(
                'https://www.zhihu.com/#signin',
                dynamic=True,
                meta={'cookiejar': 'test'},
                browser_actions=[_actions],
            )
            cm.process_request(request)
            response = dh.fetch(request)
            cm.process_response(request, response)

            request = Request('https://www.zhihu.com',
                              dynamic=True,
                              meta={'cookiejar': 'test'})
            cm.process_request(request)
            response = dh.fetch(request)
            cm.process_response(request, response)
            # Parenthesised single-argument form is valid on Python 2 and 3.
            print(response.body)
        finally:
            self.driver.close()
Beispiel #3
0
    def test_body(self):
        """Response.body is exposed as bytes, encoded with ``encoding``."""
        url = "http://www.example.com/"

        # No body given: an empty byte string.
        no_body = Response(url=url, request=Request('http://www.example.com/'))
        assert no_body.body == b''

        # Empty bytes body; the encoding falls back to the default.
        empty = Response(url=url,
                         body=b"",
                         request=Request('http://www.example.com/'))
        assert isinstance(empty.body, bytes)
        self.assertEqual(empty.encoding, 'utf-8')  # default encoding

        # Unicode body encoded as UTF-8.
        as_utf8 = Response(url=url,
                           body=u"Price: \xa3100",
                           encoding='utf-8',
                           request=Request('http://www.example.com/'))
        assert isinstance(as_utf8.body, bytes)
        self.assertEqual(as_utf8.body, b"Price: \xc2\xa3100")

        # Same unicode body encoded as Latin-1.
        as_latin1 = Response(url=url,
                             request=Request('http://www.example.com/'),
                             body=u"Price: \xa3100",
                             encoding='latin1')
        assert isinstance(as_latin1.body, bytes)
        self.assertEqual(as_latin1.body, b"Price: \xa3100")
 def test_url(self):
     """Request.url is a str, matches safe_url_string(), rejects non-strings."""
     plain = Request('http://www.example.com/')
     self.assertIsInstance(plain.url, str)
     self.assertEqual(plain.url, 'http://www.example.com/')

     # A URL with non-ASCII characters must equal its safe_url_string() form.
     non_ascii = Request(u'http://www.example.com?content=测试')
     expected = safe_url_string('http://www.example.com?content=测试')
     self.assertEqual(non_ascii.url, expected)

     # A non-string url is a TypeError.
     with self.assertRaises(TypeError):
         Request(123)
 def test_copy(self):
     """Request.copy() yields an equal Request that is a distinct object."""
     headers = {'Content-Type': 'text/html', 'Content-Length': 1234}
     original = Request('http://www.example.com/',
                        headers=headers,
                        method='get')
     duplicate = original.copy()

     # Same state and equality ...
     assert original.__dict__ == duplicate.__dict__
     self.assertEqual(original.headers, duplicate.headers)
     self.assertEqual(original, duplicate)
     # ... but not the same object.
     self.assertIsNot(original, duplicate)
    def test_process_request_interval(self):
        """Two process_request() calls on one proxy are spaced over 3 seconds.

        The clock starts just before the first fetch and is read after the
        second ``process_request`` returns.
        """
        settings = self.spider.settings
        settings.set("PROXY_LIST", ['218.76.106.78:3128'])
        first = Request('http://httpbin.org/get')
        proxy_mw = ProxyMiddleware(settings, self.spider.logger)
        handler = DownloadHandler(self.spider, None, BoundedSemaphore(1))
        proxy_mw.process_request(first)
        started = time.time()
        handler.fetch(first)

        second = Request('http://httpbin.org/get')
        proxy_mw.process_request(second)
        elapsed = time.time() - started
        self.assertGreater(elapsed, 3)
 def test_init(self):
     """Request() validates its url and reports method 'get' as 'GET'."""
     # A missing url fails; a malformed url is a ValueError.
     with self.assertRaises(Exception):
         Request()
     with self.assertRaises(ValueError):
         Request('foo')

     url = 'http://www.example.com/'
     request = Request(url)
     assert request.url
     assert not request.body

     headers = {'Content-Type': 'text/html', 'Content-Length': 1234}
     request = Request(url, headers=headers, method='get')
     self.assertEqual(request.method, 'GET')
Beispiel #8
0
 def test_timeout_dynamic(self):
     """A dynamic request slower than TIMEOUT raises TimeoutException.

     TIMEOUT is set to 5 while the endpoint delays for 10. The browser is
     closed in a ``finally`` clause so a failed assertion does not leave
     the PhantomJS window open.
     """
     self.driver = webdriver.PhantomJS()
     try:
         self.spider.settings.set('TIMEOUT', 5)
         dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
         self.assertRaises(TimeoutException, dh.fetch,
                           Request(HTTPBIN_URL + '/delay/10', dynamic=True))
     finally:
         self.driver.close()
    def test_process_response(self):
        """RetryMiddleware honours ``dont_retry`` and gives up after 3 retries."""
        url = 'http://httpbin.org/'
        request = Request(url)
        response = Response(url, request, status=500)
        retry_mw = RetryMiddleware(self.spider.settings, self.spider.logger)

        # With dont_retry set, the failing response is handed straight back.
        request.meta["dont_retry"] = True
        self.assertEqual(retry_mw.process_response(request, response),
                         response)

        # Otherwise the first three calls each return a Request to retry.
        request.meta["dont_retry"] = False
        for attempt in (1, 2, 3):
            request = retry_mw.process_response(request, response)
            self.assertIsInstance(request, Request)
            if attempt == 1:
                self.assertEqual(request.meta.get("retry_count"), 1)

        # The fourth call returns the response instead of another retry.
        outcome = retry_mw.process_response(request, response)
        self.assertIsInstance(outcome, Response)
 def test_process_request(self):
     """A request processed by ProxyMiddleware fetches a non-empty body."""
     settings = self.spider.settings
     settings.set("PROXY_LIST", ['124.88.67.54:80'])
     request = Request('http://httpbin.org/get')
     proxy_mw = ProxyMiddleware(settings, self.spider.logger)
     handler = DownloadHandler(self.spider, None, BoundedSemaphore(1))
     proxy_mw.process_request(request)
     fetched = handler.fetch(request)
     assert fetched.body
 def after_login(self, response):
     """Yield a Request for every ``a.question_link`` anchor in the response.

     Each request targets ``https://www.zhihu.com`` + the anchor's href,
     uses the "zhihu" cookiejar and is handled by ``self.get_item``.
     """
     tree = etree.HTML(response.body)
     for anchor in tree.xpath('//a[@class="question_link"]'):
         question_url = 'https://www.zhihu.com' + anchor.attrib["href"]
         yield Request(question_url,
                       meta={"cookiejar": "zhihu"},
                       callback=self.get_item)
Beispiel #12
0
 def test_dynamic_request_cookie_between_static_and_dynamic(self):
     """Cookies set by a dynamic request are sent with a later static one.

     A dynamic request sets two cookies; a static request with the same
     ``cookiejar`` key must then report both of them. The browser is
     closed in a ``finally`` clause so a failure does not leave the
     PhantomJS window open.
     """
     cm = CookiesMiddleware(self.spider, self.spider.settings)
     self.driver = webdriver.PhantomJS()
     try:
         dh = DownloadHandler(self.spider, self.driver, self.driver_sem)

         # Dynamic (browser) request that sets the cookies.
         request = Request(HTTPBIN_URL + '/cookies/set?key1=val1&key2=val2',
                           dynamic=True,
                           meta={'cookiejar': 'test'})
         response = dh.fetch(request)
         cm.process_response(request, response)

         # Static request using the same cookiejar key.
         request = Request(HTTPBIN_URL + '/cookies',
                           meta={'cookiejar': 'test'})
         cm.process_request(request)
         response = dh.fetch(request)
         self.assertEqual(
             json.loads(response.body)['cookies'], {
                 u'key1': u'val1',
                 u'key2': u'val2'
             })
     finally:
         self.driver.close()
Beispiel #13
0
    def test_post_data_content_static(self):
        """POST bodies (a dict and two text strings) are echoed by httpbin."""
        handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
        post_url = HTTPBIN_URL + '/post'

        # A dict body is echoed under the 'form' key.
        form_request = Request(post_url,
                               method='POST',
                               body={'text': 'pycreeper'})
        response = handler.fetch(form_request)
        self.assertIsInstance(response, Response)
        echoed_form = json.loads(response.body)['form']
        self.assertEqual(echoed_form, {'text': 'pycreeper'})

        # Text bodies are echoed under the 'data' key.
        unicode_request = Request(post_url, method='POST', body=u'Unicode测试')
        response = handler.fetch(unicode_request)
        self.assertEqual(json.loads(response.body)['data'], 'Unicode测试')

        plain_request = Request(post_url, method='POST', body='中文测试')
        response = handler.fetch(plain_request)
        self.assertEqual(json.loads(response.body)['data'], '中文测试')
        self.assertEqual(response.status, 200)
 def test_process_request(self):
     """UserAgentMiddleware sets a User-Agent header that the server sees."""
     request = Request('http://httpbin.org/user-agent')
     # Before the middleware runs there is no User-Agent header.
     self.assertIsNone(request.headers.get("User-Agent"))

     ua_mw = UserAgentMiddleware(self.spider.settings, self.spider.logger)
     handler = DownloadHandler(self.spider, None, BoundedSemaphore(1))
     ua_mw.process_request(request)
     response = handler.fetch(request)

     # httpbin echoes the user agent it received.
     reported = json.loads(response.body)['user-agent']
     self.assertEqual(reported, request.headers['User-Agent'])
Beispiel #15
0
 def test_concurrency_with_delayed_url(self):
     """n static fetches of a 1-second endpoint finish in under n seconds."""
     handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
     n = 5
     pool = Pool(n)
     urls = [HTTPBIN_URL + '/delay/1'] * n

     started = time.time()
     pool.map(handler.fetch, [Request(url) for url in urls])
     elapsed = time.time() - started

     # Finishing in under n seconds means the fetches overlapped.
     self.assertLess(elapsed, n)
    def start_requests(self):
        """Yield the seed request: open jd.com and run a search in the browser.

        The request is dynamic, uses the "jd" cookiejar and is handled by
        ``self.parse_list``.
        """
        def _search(driver):
            # Type the query into the element with id 'key' and press ENTER.
            query_box = driver.find_element_by_id('key')
            query_box.send_keys(u"联想笔记本", Keys.ENTER)
            gevent.sleep(3)
            self._jump_guide(driver)
            gevent.sleep(3)

        seed = Request(url='https://www.jd.com/',
                       meta={"cookiejar": "jd"},
                       callback=self.parse_list,
                       dynamic=True,
                       browser_actions=[_search])
        yield seed
Beispiel #17
0
 def test_dynamic_request_concurrency(self):
     """n dynamic fetches of a 1-second endpoint take longer than n seconds.

     The requests are submitted through a pool of size n and share one
     browser. The browser is closed in a ``finally`` clause so a failed
     assertion does not leave the PhantomJS window open.
     """
     self.driver = webdriver.PhantomJS()
     try:
         dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
         n = 5
         pool = Pool(n)
         urls = [HTTPBIN_URL + '/delay/1'] * n
         time1 = time.time()
         pool.map(dh.fetch,
                  [Request(url, dynamic=True, wait=5) for url in urls])
         self.assertGreater(time.time() - time1, n)
     finally:
         self.driver.close()
    def start_requests(self):
        """Yield the seed request: open the zhihu sign-in page and log in.

        The request is dynamic, uses the "zhihu" cookiejar and is handled
        by ``self.after_login``.
        """
        def _login(driver):
            # Fill in the (placeholder) credentials, then press the button.
            account_field = driver.find_element_by_name('account')
            account_field.send_keys("username")
            password_field = driver.find_element_by_name('password')
            password_field.send_keys("password")
            submit = driver.find_element_by_xpath(
                '/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button'
            )
            submit.click()
            gevent.sleep(5)

        seed = Request(url='https://www.zhihu.com/#signin',
                       meta={"cookiejar": "zhihu"},
                       callback=self.after_login,
                       dynamic=True,
                       browser_actions=[_login])
        yield seed
Beispiel #19
0
    def test_copy(self):
        """Response.copy() yields an equal Response sharing no sub-objects."""
        url = 'http://www.example.com/'
        original = Response(url,
                            headers={'Content-Type': 'text/html',
                                     'Content-Length': 1234},
                            request=Request(url))
        duplicate = original.copy()

        # Equal in state, headers, request and as a whole ...
        assert original.__dict__ == duplicate.__dict__
        self.assertEqual(original.headers, duplicate.headers)
        self.assertEqual(original.request, duplicate.request)
        self.assertEqual(original, duplicate)

        # ... yet headers, request and the response are distinct objects.
        self.assertIsNot(original.headers, duplicate.headers)
        self.assertIsNot(original.request, duplicate.request)
        self.assertIsNot(original, duplicate)
    def parse_list(self, response):
        """Parse a product-list page and schedule the following page.

        Yields one dict per product with the keys ``path``, ``title``,
        ``img`` and ``price``, followed by a dynamic Request that re-opens
        ``response.url`` and clicks the next-page link in the browser.

        Products whose fields cannot be extracted are skipped (best effort).
        If the page contains no product links, nothing is yielded and no
        follow-up request is scheduled.
        """
        def _with_scheme(link):
            # Links that do not start with a scheme (e.g. "//host/path")
            # get an explicit "http:" prefix.
            return link if link.startswith('http') else 'http:' + link

        # Bound up front: the follow-up request must not depend on the
        # product loop having executed at least once.
        url = response.url

        selector = etree.HTML(response.body)
        links = selector.xpath('//div[@class="p-img"]/a')
        titles = selector.xpath('//div[@class="p-name p-name-type-2"]/a/em')
        imgs = selector.xpath('//div[@class="p-img"]/a/img')
        prices = selector.xpath('//div[@class="p-price"]/strong/i')

        if not links:
            # Nothing to parse: stop here instead of failing at the final
            # yield with an unbound ``url``.
            return

        # zip() stops at the shortest list, so products lacking a title,
        # image or price element are dropped, as they were before.
        for link, title, img, price in zip(links, titles, imgs, prices):
            try:
                item = {
                    'path': _with_scheme(link.attrib["href"]),
                    'title': parser.unescape(
                        etree.tostring(title, pretty_print=True)),
                    'img': _with_scheme(img.attrib["src"]),
                    'price': price.text,
                }
            except Exception:
                # Best effort: skip a malformed product entry rather than
                # abort the whole page.
                continue
            # Yield outside the try block so that exceptions thrown into
            # the generator by the consumer are not swallowed.
            yield item

        def _next_page(driver):
            self._jump_guide(driver)
            driver.find_element_by_xpath(
                '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
            self._jump_guide(driver)

        yield Request(url=url,
                      meta={"cookiejar": "jd"},
                      callback=self.parse_list,
                      dynamic=True,
                      browser_actions=[_next_page])
from pycreeper.http.request import Request
from pycreeper.spider import Spider
from Queue import Empty

# Names the scheduler module.
# NOTE(review): presumably read by a doctest runner; the consumer is not shown here.
__doctests__ = ['pycreeper.utils.scheduler']

# Sample URLs used as fixtures: the same page with and without a fragment,
# pages with query strings, and one ftp:// URL.
URLS = [
    'http://www.example.com/index.html#print',
    'http://www.example.com/index.html',
    'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1',
    'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1',
    'http://www.xxxxx.com/index.html?test123123',
    'http://www.xxxxx.com/index.html', 'ftp://www.xxxxx.com/index.html'
]

# One Request per entry of URLS, in the same order.
REQUEST = [Request(url) for url in URLS]


class RequestTest(unittest.TestCase):
    """Checks how RequestFilter reports requests it has or has not seen."""

    def test_basic(self):
        """request_seen() is False for a new request and True once seen."""
        seen = RequestFilter().request_seen
        first, second = REQUEST[0], REQUEST[1]

        # Prime the filter; the first return value is not checked.
        seen(first)
        self.assertEqual(seen(first), True)
        # The second fixture counts as a different, not-yet-seen request.
        self.assertEqual(seen(second), False)
        self.assertEqual(seen(second), True)
        # Passing None is expected to fail with AttributeError.
        self.assertRaises(AttributeError, seen, None)


class SchedulerTest(unittest.TestCase):
    """Test case that builds a Spider fixture before each test."""
    def setUp(self):
        """Create the Spider instance stored on ``self.spider``."""
        self.spider = Spider()
Beispiel #22
0
 def test_dynamic_request_wait(self):
     """A dynamic request with ``wait=3`` can be fetched without error.

     The browser is closed in a ``finally`` clause so that a failing
     fetch does not leave the PhantomJS window open.
     """
     self.driver = webdriver.PhantomJS()
     try:
         dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
         request = Request(HTTPBIN_URL + '/get', dynamic=True, wait=3)
         dh.fetch(request)
     finally:
         self.driver.close()
Beispiel #23
0
 def test_timeout_static(self):
     """A static request slower than TIMEOUT raises TimeoutException."""
     self.spider.settings.set('TIMEOUT', 5)
     handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
     # The endpoint delays for 10, longer than the TIMEOUT of 5.
     slow_request = Request(HTTPBIN_URL + '/delay/10')
     with self.assertRaises(TimeoutException):
         handler.fetch(slow_request)
Beispiel #24
0
 def test_post_data_static(self):
     """A body-less static POST returns a Response with status 200."""
     handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
     post_request = Request(HTTPBIN_URL + '/post', method='POST')
     result = handler.fetch(post_request)
     self.assertIsInstance(result, Response)
     self.assertEqual(result.status, 200)
Beispiel #25
0
 def test_get_data(self):
     """A static GET returns a Response with status 200."""
     handler = DownloadHandler(self.spider, self.driver, self.driver_sem)
     get_request = Request(HTTPBIN_URL + '/get')
     result = handler.fetch(get_request)
     self.assertIsInstance(result, Response)
     self.assertEqual(result.status, 200)
Beispiel #26
0
 def start_requests(self):
     """Yield one Request for each URL in ``self.start_urls``, in order."""
     for start_url in self.start_urls:
         seed = Request(start_url)
         yield seed
Beispiel #27
0
 def test_request(self):
     """Response.request is a Request equal to one built from the same url."""
     url = 'http://www.example.com/'
     response = Response(url, request=Request(url))
     attached = response.request
     self.assertIsInstance(attached, Request)
     self.assertEqual(attached, Request(url))