Beispiel #1
0
def run_fetcher(g=g):
    """Build a Fetcher wired to the global queues and run it.

    The XML-RPC interface is served from a background thread while the
    fetcher's main loop blocks the calling thread.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    instance = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    instance.phantomjs_proxy = g.phantomjs_proxy

    # RPC endpoint in the background; run() blocks until shutdown.
    run_in_thread(instance.xmlrpc_run, port=g.fetcher_xmlrpc_port, bind=g.webui_host)
    instance.run()
Beispiel #2
0
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port):
    """Run the Fetcher component for the click context *ctx*.

    Registers the instance on g.instances and, when *xmlrpc* is truthy,
    serves the XML-RPC interface from a background thread.
    """
    g = ctx.obj
    from pyspider.fetcher.tornado_fetcher import Fetcher

    component = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    component.phantomjs_proxy = g.phantomjs_proxy
    g.instances.append(component)

    if xmlrpc:
        # Background thread so the main loop below can block.
        run_in_thread(component.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    component.run()
Beispiel #3
0
 def setUpClass(self):
     """Start a Fetcher with in/out queues plus its XML-RPC server.

     xmlrpc_run and run() execute in background threads; the ServerProxy
     on port 24444 lets tests drive fetches over RPC.
     """
     self.inqueue = Queue(10)   # tasks into the fetcher
     self.outqueue = Queue(10)  # results out of the fetcher
     self.fetcher = Fetcher(self.inqueue, self.outqueue)
     self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
     self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
     self.thread = utils.run_in_thread(self.fetcher.run)
Beispiel #4
0
    def setUpClass(self):
        """Boot the full fixture: local httpbin, Fetcher (+RPC), pyproxy, phantomjs."""
        import tests.data_test_webpage
        import httpbin

        # httpbin runs in a subprocess so it can be terminated cleanly later.
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)   # tasks into the fetcher
        self.outqueue = Queue(10)  # results out of the fetcher
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # Authenticating HTTP proxy used by proxy-related tests.
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen(['phantomjs',
                os.path.join(os.path.dirname(__file__),
                    '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'])
        except OSError:
            # phantomjs binary not installed; JS-based tests should skip.
            self.phantomjs = None
        time.sleep(0.5)  # give subprocesses a moment to bind their ports
Beispiel #5
0
    def setUpClass(self):
        """Boot the full fixture: local httpbin, Fetcher (+RPC), pyproxy, phantomjs."""
        import tests.data_test_webpage
        import httpbin

        # httpbin runs in a subprocess so it can be terminated cleanly later.
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = "http://127.0.0.1:14887"

        self.inqueue = Queue(10)   # tasks into the fetcher
        self.outqueue = Queue(10)  # results out of the fetcher
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = "127.0.0.1:25555"
        self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # Authenticating HTTP proxy used by proxy-related tests.
        self.proxy_thread = subprocess.Popen(
            ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"], close_fds=True
        )
        self.proxy = "127.0.0.1:14830"
        try:
            self.phantomjs = subprocess.Popen(
                [
                    "phantomjs",
                    os.path.join(os.path.dirname(__file__), "../pyspider/fetcher/phantomjs_fetcher.js"),
                    "25555",
                ]
            )
        except OSError:
            # phantomjs binary not installed; JS-based tests should skip.
            self.phantomjs = None
        time.sleep(0.5)  # give subprocesses a moment to bind their ports
Beispiel #6
0
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc,
          max_rate, max_burst, username, password):
    """Run the WebUI.

    Wires the web app to the databases from the click context, builds a
    synchronous fetch function (local Fetcher or remote fetcher RPC) and a
    scheduler RPC proxy, then serves the app (or returns it in testing mode).
    """
    g = ctx.obj
    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password

    # 'basestring' exists only on Python 2; fall back to str on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    # fetcher rpc
    if isinstance(fetcher_rpc, string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        from pyspider.fetcher.tornado_fetcher import Fetcher
        # 'async' became a reserved keyword in Python 3.7, so it cannot be
        # written as a literal keyword argument; pass it via dict unpacking.
        fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data)

    if isinstance(scheduler_rpc, string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # Docker-style link: SCHEDULER_PORT_23333_TCP looks like 'tcp://host:port'.
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://%s/' % (
            os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://localhost:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
Beispiel #7
0
 def setUpClass(self):
     """Start the Fetcher (+RPC on 24444) and a phantomjs helper on 25555."""
     self.inqueue = Queue(10)   # tasks into the fetcher
     self.outqueue = Queue(10)  # results out of the fetcher
     self.fetcher = Fetcher(self.inqueue, self.outqueue)
     self.fetcher.phantomjs_proxy = 'localhost:25555'
     self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
     self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
     self.thread = utils.run_in_thread(self.fetcher.run)
     # NOTE(review): no try/except here -- a missing phantomjs binary raises
     # OSError and fails class setup outright.
     self.phantomjs = subprocess.Popen(['phantomjs',
         os.path.join(os.path.dirname(__file__),
             '../pyspider/fetcher/phantomjs_fetcher.js'),
         '25555'])
Beispiel #8
0
def run_webui(g=g):
    """Configure and run the WebUI against the globals object *g*."""
    from pyspider.fetcher.tornado_fetcher import Fetcher
    # 'async' is a reserved keyword since Python 3.7, so it cannot be a
    # literal keyword argument; pass it via dict unpacking.
    fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    # The WebUI previews pages through its own synchronous fetcher.
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc
    #app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/'
    if g.demo_mode:
        # Throttle hard in demo mode.
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0
    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')
    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug
    app.run(host=g.webui_host, port=g.webui_port)
Beispiel #9
0
    def setUpClass(self):
        """Boot fixture for Splash tests: httpbin, Fetcher (+RPC) and pyproxy."""
        import tests.data_test_webpage
        import httpbin

        # Bind on 0.0.0.0 and address httpbin via the host's IP so an
        # external (possibly containerised) Splash can reach it.
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'

        self.inqueue = Queue(10)   # tasks into the fetcher
        self.outqueue = Queue(10)  # results out of the fetcher
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # Authenticating HTTP proxy used by proxy-related tests.
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
Beispiel #10
0
class TestResponse(unittest.TestCase):
    """Tests for rebuild_response() using a synchronous Fetcher and local httpbin."""

    # Template task; get() deep-copies it before filling in the URL.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
    }

    @classmethod
    def setUpClass(self):
        # 'async' is a reserved keyword since Python 3.7; it must be passed
        # via dict unpacking rather than as a literal keyword argument.
        self.fetcher = Fetcher(None, None, **{'async': False})
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = 'http://127.0.0.1:14887'
        time.sleep(0.5)  # let httpbin bind its port

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()

    def get(self, url, **kwargs):
        """Fetch *url* (absolute, or a path on the local httpbin), assert 200
        and return the rebuilt Response object."""
        if not url.startswith('http://'):
            url = self.httpbin + url
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = url
        request.update(kwargs)
        task, result = self.fetcher.fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        return response

    def test_10_html(self):
        response = self.get('/html')
        self.assertIsNotNone(response.doc('h1'))

    def test_20_xml(self):
        response = self.get('/xml')
        self.assertIsNotNone(response.doc('item'))

    def test_30_gzip(self):
        response = self.get('/gzip')
        self.assertIn('gzipped', response.text)

    def test_40_deflate(self):
        response = self.get('/deflate')
        self.assertIn('deflated', response.text)
Beispiel #11
0
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, Fetcher=Fetcher):
    """Run the Fetcher component.

    Builds a Fetcher on the shared queues, applies the optional user-agent
    and timeout overrides, registers it on g.instances, and either returns
    it (testing mode) or serves it, optionally with an XML-RPC endpoint.
    """
    g = ctx.obj
    instance = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
        poolsize=poolsize,
        proxy=proxy,
    )
    instance.phantomjs_proxy = g.phantomjs_proxy
    if user_agent:
        instance.user_agent = user_agent
    if timeout:
        # Copy before mutating so the class-level defaults stay untouched.
        instance.default_options = dict(instance.default_options)
        instance.default_options['timeout'] = timeout

    g.instances.append(instance)
    if g.get('testing_mode'):
        return instance

    if xmlrpc:
        run_in_thread(instance.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    instance.run()
Beispiel #12
0
class TestFetcher(unittest.TestCase):
    """Fetcher integration tests: sync fetch, queues, XML-RPC, base64 POST."""

    # Class-level template task; tests must take a DEEP copy before mutating.
    sample_task_http = {
        "taskid": "taskid",
        "project": "project",
        "url": "http://echo.opera.com/",
        "fetch": {"method": "GET", "headers": {"Cookie": "a=b", "a": "b"}, "timeout": 60, "save": "abc"},
        "process": {"callback": "callback", "save": [1, 2, 3]},
    }

    def _task(self):
        """Return a deep copy of sample_task_http.

        The original tests used dict(...), a shallow copy whose nested
        'fetch' dict was shared with the class attribute, so mutations
        (method, data, cookies) leaked between tests.
        """
        import copy
        return copy.deepcopy(self.sample_task_http)

    @classmethod
    def setUpClass(self):
        """Start the fetcher, its worker thread, and the XML-RPC server."""
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.rpc = xmlrpclib.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

    @classmethod
    def tearDownClass(self):
        """Shut down the RPC server and join the fetcher thread."""
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)

        content = result["content"]
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        self.assertIn("a=b", content)

    def test_10_http_post(self):
        request = self._task()
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "binux"
        request["fetch"]["cookies"] = {"c": "d"}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)

        content = result["content"]
        self.assertIn("<h2>POST", content)
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        # FIXME: cookies in headers not supported
        self.assertNotIn("a=b", content)
        self.assertIn("c=d", content)
        self.assertIn("binux", content)

    def test_20_dataurl_get(self):
        data = self._task()
        data["url"] = "data:,hello"
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_30_with_queue(self):
        data = self._task()
        data["url"] = "data:,hello"
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_40_with_rpc(self):
        data = self._task()
        data["url"] = "data:,hello"
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_50_base64_data(self):
        request = self._task()
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 200)
        self.assertIn(" d6 ", result["content"])
        self.assertIn(" d0 ", result["content"])
        self.assertIn(" ce ", result["content"])
        self.assertIn(" c4 ", result["content"])
Beispiel #13
0
class TestFetcher(unittest.TestCase):
    """Fetcher integration tests including phantomjs ('js') fetch_type.

    Requires network access (echo.opera.com, httpbin.org) and a phantomjs
    binary on PATH; phantomjs is started unconditionally in setUpClass.
    """

    # Template task; tests take copy.deepcopy() before mutating.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start the fetcher (+RPC on 24444) and a phantomjs helper on 25555."""
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = 'localhost:25555'
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.phantomjs = subprocess.Popen(['phantomjs',
            os.path.join(os.path.dirname(__file__),
                '../pyspider/fetcher/phantomjs_fetcher.js'),
            '25555'])

    @classmethod
    def tearDownClass(self):
        """Stop phantomjs, the RPC server and the fetcher thread."""
        self.phantomjs.kill()
        self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        """Plain GET echoes headers and cookies back in the page body."""
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<b>A:', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_10_http_post(self):
        """POST sends body data and per-request cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        """data: URLs are decoded locally without a network round trip."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        """Tasks put on inqueue produce results on outqueue."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        """The XML-RPC fetch endpoint returns an umsgpack-encoded result."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_50_base64_data(self):
        """[BASE64-DATA]...[/BASE64-DATA] payloads are decoded before POSTing."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn(' d6 ', result['content'])
        self.assertIn(' d0 ', result['content'])
        self.assertIn(' ce ', result['content'])
        self.assertIn(' c4 ', result['content'])

    def test_60_timeout(self):
        """A 3s timeout aborts a 10s-delay page in roughly 3 seconds."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_70_phantomjs_url(self):
        """fetch_type='js' routes the request through phantomjs."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<b>a:</b>', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_80_phantomjs_timeout(self):
        """The timeout also applies to phantomjs fetches."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_90_phantomjs_js_script(self):
        """An injected js_script executes in the page context."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])
Beispiel #14
0
class TestFetcher(unittest.TestCase):
    """Fetcher integration tests; phantomjs-based tests skip when the
    phantomjs binary is unavailable (see setUpClass)."""

    # Template task; tests take copy.deepcopy() before mutating.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start the fetcher (+RPC on 24444); phantomjs is optional."""
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = 'localhost:25555'
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run,
                                                 port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        try:
            self.phantomjs = subprocess.Popen([
                'phantomjs',
                os.path.join(os.path.dirname(__file__),
                             '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'
            ])
        except OSError:
            # No phantomjs binary on PATH; JS tests will raise SkipTest.
            self.phantomjs = None

    @classmethod
    def tearDownClass(self):
        """Stop phantomjs (if started), the RPC server and fetcher thread."""
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        """Plain GET echoes headers and cookies back in the page body."""
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'],
                         self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<b>A:', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_10_http_post(self):
        """POST sends body data and per-request cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'],
                         self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        """data: URLs are decoded locally without a network round trip."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        """Tasks put on inqueue produce results on outqueue."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        """The XML-RPC fetch endpoint returns an umsgpack-encoded result."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_50_base64_data(self):
        """[BASE64-DATA]...[/BASE64-DATA] payloads are decoded before POSTing."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn(' d6 ', result['content'])
        self.assertIn(' d0 ', result['content'])
        self.assertIn(' ce ', result['content'])
        self.assertIn(' c4 ', result['content'])

    def test_60_timeout(self):
        """A 3s timeout aborts a 10s-delay page in roughly 3 seconds."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_70_phantomjs_url(self):
        """fetch_type='js' routes the request through phantomjs."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'],
                         self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<b>a:</b>', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_80_phantomjs_timeout(self):
        """The timeout also applies to phantomjs fetches."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_90_phantomjs_js_script(self):
        """An injected js_script executes in the page context."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        request['fetch'][
            'js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        """Fragment ('#!') URLs are rendered by phantomjs, not truncated."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://bbs.byr.cn/#!article/WWWTechnology/28163'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['headers']['User-Agent'] = 'Mozilla/5.0'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])
Beispiel #15
0
class TestSplashFetcher(unittest.TestCase):
    """Fetcher integration tests for the 'splash' fetch_type.

    Assumes a Splash instance at 127.0.0.1:8050 plus the local httpbin and
    pyproxy processes started in setUpClass.
    """

    @property
    def sample_task_http(self):
        # A property (not a class attribute) so every access yields a fresh
        # dict and tests cannot pollute each other via shared mutation.
        return {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': {
                'method': 'GET',
                'headers': {
                    'Cookie': 'a=b',
                    'a': 'b'
                },
                'cookies': {
                    'c': 'd',
                },
                'timeout': 60,
                'save': 'abc',
            },
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        }

    @classmethod
    def setUpClass(self):
        """Start httpbin (subprocess), the fetcher (+RPC) and pyproxy."""
        import tests.data_test_webpage
        import httpbin

        # Bind on 0.0.0.0 and address httpbin via the host's IP so an
        # external (possibly containerised) Splash can reach it.
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'

    @classmethod
    def tearDownClass(self):
        """Stop all helper processes and verify their ports are released."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        self.rpc._quit()
        self.thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_69_no_splash(self):
        """Without a configured endpoint a splash fetch answers 501."""
        splash_endpoint = self.fetcher.splash_endpoint
        self.fetcher.splash_endpoint = None

        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 501, result)

        self.fetcher.splash_endpoint = splash_endpoint

    def test_70_splash_url(self):
        """Headers and cookies survive the round trip through Splash."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        data = json.loads(response.doc('pre').text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data['headers'].get('A'), 'b', response.json)
        self.assertEqual(data['headers'].get('Cookie'), 'c=d', response.json)

    def test_75_splash_robots(self):
        """robots_txt=True turns a disallowed URL into a 403."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_80_splash_timeout(self):
        """A 3s timeout aborts a 5s-delay page with status 599."""
        request = self.sample_task_http
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        # self.assertIn('js_script_result', result) TODO: lua nil is not exists

    def test_90_splash_js_script(self):
        """An injected js_script executes in the page context."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_95_splash_js_script_2(self):
        """js_script can click; its return value lands in js_script_result."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/pyspider/ajax_click.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        request['fetch']['js_script'] = 'function() { document.querySelector("a").click(); return "abc" }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])
        self.assertIn('abc', result['js_script_result'])

    def test_a100_splash_sharp_url(self):
        """An AJAX page rendered via Splash shows its final ('done') content."""
        request = self.sample_task_http
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])
Beispiel #16
0
class TestFetcher(unittest.TestCase):
    """Integration tests for the tornado fetcher.

    setUpClass boots a local httpbin (port 14887), the fetcher's XML-RPC
    server (port 24444), an authenticating HTTP proxy (port 14830) and, if
    the binary is installed, a phantomjs render server (port 25555).  Tests
    cover plain HTTP, data: URLs, queue and RPC transports, timeouts,
    proxying, redirects, robots.txt and phantomjs rendering.  Status code
    599 is pyspider's generic fetch-error code (timeouts, DNS failures,
    unreachable backends).
    """

    # Template task; tests deepcopy this and tweak url / fetch options.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        # Registers extra test pages (e.g. /pyspider/ajax.html) on httpbin.
        import tests.data_test_webpage
        import httpbin

        # passthrough_errors=False keeps werkzeug from re-raising into the
        # subprocess and killing it on a single bad request.
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # Authenticating proxy (binux/123456) used by the test_a12x/a13x tests.
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen(['phantomjs',
                os.path.join(os.path.dirname(__file__),
                    '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'])
        except OSError:
            # phantomjs binary not installed; phantomjs tests will be skipped.
            self.phantomjs = None
        # Give the subprocesses a moment to start listening.
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Shut everything down and verify no test port is left open."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()

        # Sanity check: none of the suite's ports (presumably 5000/23333
        # belong to other components of the test suite) is still listening.
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_10_http_get(self):
        """Plain GET: headers and both cookie sources reach the server."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        # Cookies from both fetch['cookies'] and the Cookie header are merged.
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_15_http_post(self):
        """POST with a raw body; httpbin echoes it back as a form key."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)

        self.assertEqual(response.json['form'].get('binux'), '')
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_20_dataurl_get(self):
        """data: URLs are served directly by the fetcher, no network."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_30_with_queue(self):
        """Same fetch, but through the inqueue/outqueue transport."""
        request= copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_40_with_rpc(self):
        """Same fetch through the XML-RPC interface (umsgpack payload)."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # POST body wrapped in [BASE64-DATA] markers: the UTF-8 bytes of
        # the text "中文" ("Chinese"), base64-encoded.
        request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u'中文', response.json['form'], response.json)

    def test_55_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # Same text "中文" but GBK-encoded bytes; only checks the request
        # round-trips, not the decoded form content.
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        """A 3s timeout against a 5s-delay endpoint must abort in ~3s."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        # Loose bounds around the 3s timeout to tolerate scheduling jitter.
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

        response = rebuild_response(result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

    def test_65_418(self):
        """Non-2xx statuses are passed through, not turned into errors."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 418)
        self.assertIn('teapot', response.text)

    def test_69_no_phantomjs(self):
        """fetch_type=phantomjs without a phantomjs_proxy yields 501."""
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = None

        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 501, result)

        # Restore for the remaining phantomjs tests.
        self.fetcher.phantomjs_proxy = phantomjs_proxy

    def test_70_phantomjs_url(self):
        """phantomjs fetch forwards headers and fetch['cookies']."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        # The JSON echo is rendered inside a <pre> element by the browser.
        data = json.loads(response.doc('pre').text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data['headers'].get('A'), 'b', response.json)
        self.assertEqual(data['headers'].get('Cookie'), 'c=d', response.json)

    def test_75_phantomjs_robots(self):
        """robots_txt=True blocks a disallowed URL with 403 even via phantomjs."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_80_phantomjs_timeout(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        # Even on timeout the result carries a js_script_result key.
        self.assertIn('js_script_result', result)

    def test_90_phantomjs_js_script(self):
        """Injected js_script runs in the page; its writes show in content."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        """Ajax page renders fully (no 'loading' left) with the custom UA."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a110_dns_error(self):
        """DNS failure maps to status 599 with a 'resolve' error message,
        via both the sync and the queue transports."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

    def test_a120_http_get_with_proxy_fail(self):
        """Proxy without credentials in the URL: proxy rejects with 403."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """Proxy with matching username/password query params succeeds."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get?username=binux&password=123456'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a140_redirect(self):
        """Redirects are followed; url reflects the final location while
        orig_url keeps the requested one."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect-to?url=/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.url, self.httpbin+'/get')

    def test_a150_too_much_redirect(self):
        """Exceeding the default redirect limit yields a 599 error."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)
        self.assertIn('redirects followed', response.error)

    def test_a160_cookie(self):
        """Set-Cookie responses are merged into the task's cookie jar."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies, {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result)

    def test_a170_validate_cert(self):
        """validate_cert=False is accepted for plain-http fetches too."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['validate_cert'] = False
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        """Raising max_redirects lets a 10-hop chain complete."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['max_redirects'] = 10
        request['url'] = self.httpbin+'/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        """robots_txt toggles enforcement: off -> 200, on -> 403 for /deny."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['robots_txt'] = False
        request['url'] = self.httpbin+'/deny'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        """Regression for issue 375: an unreachable phantomjs proxy must
        produce a 599 error instead of hanging."""
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        # Point at a port nothing listens on.
        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'

        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)

        self.fetcher.phantomjs_proxy = phantomjs_proxy
Beispiel #17
0
class TestFetcher(unittest.TestCase):
    """Fetcher tests against live external services.

    NOTE(review): these tests need outbound network access — they hit
    echo.opera.com, httpbin.org and groups.google.com directly rather
    than a local httpbin instance.  Status code 599 is pyspider's
    generic fetch-error code.
    """

    # Template task; tests deepcopy this and tweak url / fetch options.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start the fetcher, its XML-RPC server (24444) and, if the
        binary is available, a phantomjs render server (25555)."""
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        try:
            self.phantomjs = subprocess.Popen(['phantomjs',
                os.path.join(os.path.dirname(__file__),
                    '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'])
        except OSError:
            # phantomjs not installed; phantomjs tests will be skipped.
            self.phantomjs = None

    @classmethod
    def tearDownClass(self):
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        """GET against the echo service; headers/cookies echoed in HTML."""
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn(b'<b>A:', content)
        self.assertIn(b'<b>Cookie:</b>', content)
        self.assertIn(b'c=d</td>', content)

    def test_10_http_post(self):
        """POST with a raw body; echo page shows method, headers, body."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn(b'<h2>POST', content)
        self.assertIn(b'A:', content)
        self.assertIn(b'Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn(b'a=b', content)
        self.assertIn(b'c=d', content)
        self.assertIn(b'binux', content)

    def test_20_dataurl_get(self):
        """data: URLs are served by the fetcher itself, no network."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        """Same data: fetch through the inqueue/outqueue transport."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        """Same data: fetch through XML-RPC (umsgpack-packed result)."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        # POST body in [BASE64-DATA] markers: GBK bytes d6 d0 ce c4 of the
        # text "中文" ("Chinese"); the echo page shows a hex dump.
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn(b' d6 ', result['content'])
        self.assertIn(b' d0 ', result['content'])
        self.assertIn(b' ce ', result['content'])
        self.assertIn(b' c4 ', result['content'])

    def test_60_timeout(self):
        """A 3s timeout against a 10s-delay endpoint aborts in ~3s."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        # Loose bounds around the 3s timeout to tolerate network jitter.
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

    def test_65_418(self):
        """Non-2xx status codes pass through unchanged."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 418)
        self.assertIn(b'teapot', result['content'])

    def test_70_phantomjs_url(self):
        """fetch_type='js' renders via phantomjs; headers/cookies forwarded."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<b>a:</b>', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_80_phantomjs_timeout(self):
        """phantomjs fetch honors fetch['timeout'] (~3s, loose bounds)."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)

    def test_90_phantomjs_js_script(self):
        """Injected js_script runs; its document.write shows in content."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    @unittest.skipIf(os.environ.get('IGNORE_GOOGLE'), "can't connect to google.")
    def test_a100_phantomjs_sharp_url(self):
        """URLs with a '#!' fragment must survive the phantomjs round-trip."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'https://groups.google.com/forum/#!forum/pyspider-users'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['headers']['User-Agent'] = 'Mozilla/5.0'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('pyspider-users', result['content'])

    def test_a110_dns_error(self):
        """DNS failure maps to 599 with a 'resolve' error message, via both
        the sync and the queue transports."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])
Beispiel #18
0
class TestResponse(unittest.TestCase):
    """Tests for the rebuilt Response object (doc/text/ok/raise_for_status)
    using a synchronous Fetcher against a local httpbin on port 14887."""

    # Minimal task template; get() deepcopies and fills in the url.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
    }

    @classmethod
    def setUpClass(self):
        # async_mode=False makes fetcher.fetch() return synchronously.
        self.fetcher = Fetcher(None, None, async_mode=False)
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                      port=14887,
                                                      passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        time.sleep(0.5)  # let httpbin start listening

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()

    def get(self, url, **kwargs):
        """Fetch *url* (relative paths go to the local httpbin), passing
        *kwargs* through to the task dict; return a rebuilt Response."""
        if not url.startswith('http://'):
            url = self.httpbin + url
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = url
        request.update(kwargs)
        result = self.fetcher.fetch(request)
        response = rebuild_response(result)
        return response

    def test_10_html(self):
        response = self.get('/html')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('h1'))

    def test_20_xml(self):
        response = self.get('/xml')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('item'))

    def test_30_gzip(self):
        # gzip-encoded bodies must be transparently decompressed.
        response = self.get('/gzip')
        self.assertEqual(response.status_code, 200)
        self.assertIn('gzipped', response.text)

    def test_40_deflate(self):
        response = self.get('/deflate')
        self.assertEqual(response.status_code, 200)
        self.assertIn('deflated', response.text)

    def test_50_ok(self):
        response = self.get('/status/200')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        response = self.get('/status/302')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        # BUG FIX: was `self.raise_for_status(...)` — an AttributeError on
        # the TestCase itself, which assertRaises(Exception) swallowed,
        # making the check vacuous.  Call it on the response: a 302 with
        # allow_redirects=False is expected to raise.
        with self.assertRaises(Exception):
            response.raise_for_status(allow_redirects=False)

    def test_60_not_ok(self):
        response = self.get('/status/400')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/500')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/600')
        self.assertFalse(response.ok)
        self.assertFalse(response)

    def test_70_reraise_exception(self):
        # Unsupported scheme -> fetch error; re-raised as an HTTP 599.
        # assertRaisesRegexp kept (not assertRaisesRegex) for py2 compat,
        # matching the file's py2/py3-compatible style.
        response = self.get('file://abc')
        with self.assertRaisesRegexp(Exception, 'HTTP 599'):
            response.raise_for_status()
Beispiel #19
0
def run_fetcher(g=g):
    """Start the fetcher component.

    Serves the fetcher's XML-RPC interface on a background thread and
    runs the fetch loop in the current thread (blocks until it exits).
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    instance = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    # Expose the instance on the shared context so other components can reach it.
    g.fetcher = instance

    run_in_thread(instance.xmlrpc_run)
    instance.run()
Beispiel #20
0
class TestFetcher(unittest.TestCase):
    """Fetcher tests against the external echo service at echo.opera.com.

    NOTE(review): needs outbound network access; the echo service reflects
    the request's method, headers and body back in its HTML response.
    """

    # Shared task template.  Tests must deepcopy it before mutating:
    # BUG FIX below — the old dict(...) shallow copies shared the nested
    # 'fetch' dict, so test_10_http_post's method/data/cookies edits leaked
    # into this template and corrupted every later test.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start the fetcher loop and its XML-RPC server on port 24444."""
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

    @classmethod
    def tearDownClass(self):
        self.rpc._quit()
        self.thread.join()

    def test_10_http_get(self):
        """GET: custom header and Cookie header appear in the echo page."""
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        self.assertIn('a=b', content)

    def test_10_http_post(self):
        """POST with body and fetch-level cookies echoed back."""
        # deepcopy (not dict()) so the nested 'fetch' dict isn't shared.
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        """data: URLs are served by the fetcher itself, no network."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        """Same data: fetch through the inqueue/outqueue transport."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        """Same data: fetch through XML-RPC (pickle-serialized result)."""
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = pickle.loads(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')
Beispiel #21
0
class TestFetcher(unittest.TestCase):
    """Integration tests for the tornado fetcher.

    Spins up a local httpbin server (port 14887), the fetcher's XML-RPC
    endpoint (port 24444), an authenticating test proxy (port 14830) and,
    when the binary is available, a phantomjs helper (port 25555).
    Phantomjs-dependent tests skip themselves when the process failed to
    start.
    """

    # Template task description; each test deep-copies it and fills in
    # 'url' plus any extra fetch options.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start httpbin, the fetcher (queue + XML-RPC), a proxy and phantomjs."""
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # Auth-protected proxy used by the proxy_fail/proxy_ok tests.
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen(['phantomjs',
                os.path.join(os.path.dirname(__file__),
                    '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'])
        except OSError:
            # phantomjs binary not installed; dependent tests will skip.
            self.phantomjs = None
        time.sleep(0.5)  # give the subprocesses a moment to bind their ports

    @classmethod
    def tearDownClass(self):
        """Tear everything down and assert all test ports were released."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()

        # Verify no service leaked its port to the next test class.
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_10_http_get(self):
        """GET: headers and both cookie sources reach the server."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_15_http_post(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)

        # body 'binux' without '=' is parsed by httpbin as a key with empty value
        self.assertEqual(response.json['form'].get('binux'), '')
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_20_dataurl_get(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_30_with_queue(self):
        request= copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_40_with_rpc(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        # RPC results travel as msgpack-packed binary blobs
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # request data is "中文" (Chinese) encoded as utf8 inside the
        # fetcher's [BASE64-DATA] wrapper
        request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u'中文', response.json['form'], response.json)

    def test_55_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # request data is "中文" (Chinese) encoded as gbk — not valid utf8,
        # so only delivery is checked, not the decoded form content
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        """A 3s timeout against a 5s-delay endpoint must abort within the window."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        # elapsed must be about the timeout: longer than a trivial failure,
        # shorter than the endpoint's full 5s delay
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

        response = rebuild_response(result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

    def test_65_418(self):
        # httpbin's teapot easter-egg: non-standard status must pass through
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 418)
        self.assertIn('teapot', response.text)

    def test_69_no_phantomjs(self):
        """With phantomjs_proxy unset, a phantomjs fetch answers 501."""
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = None

        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 501, result)

        self.fetcher.phantomjs_proxy = phantomjs_proxy

    def test_70_phantomjs_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        # phantomjs renders the JSON body inside a <pre> element
        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)

    def test_75_phantomjs_robots(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        # /deny is disallowed by httpbin's robots.txt -> fetcher reports 403
        self.assertEqual(response.status_code, 403, result)

    def test_80_phantomjs_timeout(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        # 599 is the fetcher's internal "fetch error" status
        self.assertEqual(result['status_code'], 599)
        self.assertIn('js_script_result', result)

    def test_90_phantomjs_js_script(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        """Ajax page: phantomjs must wait until the page has replaced 'loading'."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a110_dns_error(self):
        """Unresolvable host yields status 599 via both sync and queue paths."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site-binux.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

    def test_a120_http_get_with_proxy_fail(self):
        # no credentials in the URL -> the auth proxy rejects with 403
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        # this test proxy authenticates via query-string credentials
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get?username=binux&password=123456'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a140_redirect(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect-to?url=/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        # response.url reflects the final URL after following the redirect
        self.assertEqual(response.url, self.httpbin+'/get')

    def test_a150_too_much_redirect(self):
        # 10 hops exceeds the default redirect limit -> fetch error 599
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)
        self.assertIn('redirects followed', response.error)

    def test_a160_cookie(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        # final cookie jar merges the task's cookies with the server-set ones
        self.assertEqual(response.cookies, {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result)

    def test_a170_validate_cert(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['validate_cert'] = False
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        # raising max_redirects to 10 lets the same 10-hop chain succeed
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['max_redirects'] = 10
        request['url'] = self.httpbin+'/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        """robots_txt flag off -> 200; on -> 403 for a disallowed path."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['robots_txt'] = False
        request['url'] = self.httpbin+'/deny'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        """Regression for issue #375: unreachable phantomjs endpoint -> 599, no hang."""
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        # deliberately point at a port where nothing is listening
        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'

        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)

        self.fetcher.phantomjs_proxy = phantomjs_proxy
Beispiel #22
0
class TestResponse(unittest.TestCase):
    """Tests for rebuild_response() / Response helpers against a local httpbin.

    Uses a synchronous Fetcher (no ioloop thread) so each fetch returns its
    result directly.
    """

    # Minimal task template; get() deep-copies it and fills in the URL.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
    }

    @classmethod
    def setUpClass(self):
        # 'async' became a reserved keyword in Python 3.7, so the original
        # `Fetcher(None, None, async=False)` is a SyntaxError there; pass the
        # keyword through ** to keep the module importable while preserving
        # the exact argument name.
        self.fetcher = Fetcher(None, None, **{'async': False})
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        time.sleep(0.5)  # let httpbin bind its port before tests run

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()

    def get(self, url, **kwargs):
        """Fetch *url* (relative paths resolve against local httpbin) and
        return the rebuilt Response; extra kwargs are merged into the task."""
        if not url.startswith('http://'):
            url = self.httpbin + url
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = url
        request.update(kwargs)
        result = self.fetcher.fetch(request)
        response = rebuild_response(result)
        return response

    def test_10_html(self):
        response = self.get('/html')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('h1'))

    def test_20_xml(self):
        response = self.get('/xml')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('item'))

    def test_30_gzip(self):
        # transparently decompressed body
        response = self.get('/gzip')
        self.assertEqual(response.status_code, 200)
        self.assertIn('gzipped', response.text)

    def test_40_deflate(self):
        response = self.get('/deflate')
        self.assertEqual(response.status_code, 200)
        self.assertIn('deflated', response.text)

    def test_50_ok(self):
        response = self.get('/status/200')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        response = self.get('/status/302')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        with self.assertRaises(Exception):
            # bug fix: this was `self.raise_for_status(...)`, which passed
            # only because the missing attribute raised AttributeError.
            # Exercise the response so a redirect status actually raises
            # when redirects are disallowed.
            response.raise_for_status(allow_redirects=False)

    def test_60_not_ok(self):
        response = self.get('/status/400')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/500')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/600')
        self.assertFalse(response.ok)
        self.assertFalse(response)

    def test_70_reraise_exception(self):
        # unsupported scheme -> fetch error recorded as HTTP 599
        response = self.get('file://abc')
        # assertRaisesRegexp was removed in Python 3.12; use the modern name.
        with self.assertRaisesRegex(Exception, 'HTTP 599'):
            response.raise_for_status()
Beispiel #23
0
class TestSplashFetcher(unittest.TestCase):
    """Integration tests for fetch_type='splash'.

    Requires an external Splash service listening on 127.0.0.1:8050 (e.g. the
    official docker image). httpbin and the test proxy bind 0.0.0.0 and are
    addressed by the host's LAN IP so the Splash container can reach them.
    """

    @property
    def sample_task_http(self):
        # A property (unlike the dict attribute in TestFetcher) so every
        # access returns a fresh task and tests cannot leak mutations.
        return {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': {
                'method': 'GET',
                'headers': {
                    'Cookie': 'a=b',
                    'a': 'b'
                },
                'cookies': {
                    'c': 'd',
                },
                'timeout': 60,
                'save': 'abc',
            },
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        }

    @classmethod
    def setUpClass(self):
        """Start httpbin, the fetcher (queue + XML-RPC) and the auth proxy."""
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        # use the machine's routable address, not 127.0.0.1, so the external
        # Splash service can reach this httpbin instance
        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux', '--bind=0.0.0.0',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = socket.gethostbyname(socket.gethostname()) + ':14830'

    @classmethod
    def tearDownClass(self):
        """Tear everything down and assert all test ports were released."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        self.rpc._quit()
        self.thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_69_no_splash(self):
        """With splash_endpoint unset, a splash fetch answers 501."""
        splash_endpoint = self.fetcher.splash_endpoint
        self.fetcher.splash_endpoint = None

        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 501, result)

        self.fetcher.splash_endpoint = splash_endpoint

    def test_70_splash_url(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

        # splash renders the JSON body inside a <pre> element
        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)

    def test_75_splash_robots(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        # /deny is disallowed by httpbin's robots.txt -> fetcher reports 403
        self.assertEqual(response.status_code, 403, result)

    def test_80_splash_timeout(self):
        """A 3s timeout against a 5s-delay endpoint must abort with 599."""
        request = self.sample_task_http
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        # self.assertIn('js_script_result', result) TODO: lua nil is not exists

    def test_90_splash_js_script(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_95_splash_js_script_2(self):
        """js_script runs after the page's click handler and its return value
        is surfaced in js_script_result."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/pyspider/ajax_click.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        request['fetch']['js_script'] = 'function() { document.querySelector("a").click(); return "abc" }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])
        self.assertIn('abc', result['js_script_result'])

    def test_a100_splash_sharp_url(self):
        """Ajax page: splash must wait until the page has replaced 'loading'."""
        request = self.sample_task_http
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a120_http_get_with_proxy_fail_1(self):
        # plain (non-splash) fetch without credentials -> proxy rejects 403
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a120_http_get_with_proxy_fail(self):
        # splash fetch without credentials -> proxy rejects 403
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok_1(self):
        # NOTE(review): the proxy credentials appear redacted ('*****') by the
        # source this snippet was scraped from; the original is presumably
        # 'username:password' matching the pyproxy flags above — confirm.
        self.fetcher.proxy = 'http://*****:*****@%s/' % self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        # NOTE(review): credentials redacted as above — confirm before running.
        self.fetcher.proxy = 'http://*****:*****@%s/' % self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)
        self.fetcher.proxy = None
Beispiel #24
0
 def setUpClass(self):
     """Create a synchronous Fetcher and start a local httpbin on port 14887."""
     # 'async' became a reserved keyword in Python 3.7, so the original
     # `Fetcher(None, None, async=False)` is a SyntaxError there; pass the
     # keyword through ** to stay parseable while preserving the argument name.
     self.fetcher = Fetcher(None, None, **{'async': False})
     self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
     self.httpbin = 'http://127.0.0.1:14887'
     time.sleep(0.5)  # let httpbin bind its port before tests run
Beispiel #25
0
class TestFetcher(unittest.TestCase):
    """Integration tests for pyspider's tornado fetcher.

    ``setUpClass`` boots real backing services: a local httpbin HTTP server
    (subprocess), a pyproxy authenticating proxy, an XML-RPC front end for
    the fetcher, and — when the binary is on PATH — a phantomjs instance
    for the ``fetch_type='js'`` tests (which are skipped otherwise).
    """

    # Template task; individual tests deep-copy it and adjust url/fetch.
    sample_task_http = {
        "taskid": "taskid",
        "project": "project",
        "url": "",
        "fetch": {
            "method": "GET",
            "headers": {"Cookie": "a=b", "a": "b"},
            "cookies": {"c": "d"},
            "timeout": 60,
            "save": "abc",
        },
        "process": {"callback": "callback", "save": [1, 2, 3]},
    }

    @classmethod
    def setUpClass(cls):
        """Start httpbin, the fetcher (queue + RPC), pyproxy and phantomjs."""
        import tests.data_test_webpage
        import httpbin

        # passthrough_errors=False keeps Flask serving error responses
        # inside the subprocess instead of re-raising them (consistent
        # with the project's other fetcher test suites).
        cls.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        cls.httpbin = "http://127.0.0.1:14887"

        cls.inqueue = Queue(10)
        cls.outqueue = Queue(10)
        cls.fetcher = Fetcher(cls.inqueue, cls.outqueue)
        cls.fetcher.phantomjs_proxy = "127.0.0.1:25555"
        cls.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        cls.xmlrpc_thread = utils.run_in_thread(cls.fetcher.xmlrpc_run, port=24444)
        cls.thread = utils.run_in_thread(cls.fetcher.run)
        # Authenticating proxy used by the proxy tests below.
        cls.proxy_thread = subprocess.Popen(
            ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"], close_fds=True
        )
        cls.proxy = "127.0.0.1:14830"
        try:
            cls.phantomjs = subprocess.Popen(
                [
                    "phantomjs",
                    os.path.join(os.path.dirname(__file__), "../pyspider/fetcher/phantomjs_fetcher.js"),
                    "25555",
                ]
            )
        except OSError:
            # phantomjs binary not installed; js tests will be skipped.
            cls.phantomjs = None
        # Give the subprocesses a moment to bind their ports.
        time.sleep(0.5)

    @classmethod
    def tearDownClass(cls):
        """Shut down every service started in setUpClass and verify ports closed."""
        cls.proxy_thread.terminate()
        cls.proxy_thread.wait()
        cls.httpbin_thread.terminate()
        cls.httpbin_thread.join()

        if cls.phantomjs:
            cls.phantomjs.kill()
            cls.phantomjs.wait()
        cls.rpc._quit()
        cls.thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_10_http_get(self):
        """Basic GET: status, save payload, merged header + cookie forwarding."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json["headers"].get("A"), "b", response.json)
        self.assertIn("c=d", response.json["headers"].get("Cookie"), response.json)
        self.assertIn("a=b", response.json["headers"].get("Cookie"), response.json)

    def test_15_http_post(self):
        """POST with a raw body; httpbin echoes it back as a form field."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/post"
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "binux"
        request["fetch"]["cookies"] = {"c": "d"}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        self.assertIsNotNone(response.json, response.content)

        self.assertEqual(response.json["form"].get("binux"), "")
        self.assertEqual(response.json["headers"].get("A"), "b", response.json)
        self.assertIn("c=d", response.json["headers"].get("Cookie"), response.json)
        self.assertIn("a=b", response.json["headers"].get("Cookie"), response.json)

    def test_20_dataurl_get(self):
        """data: URLs are served directly without any network round trip."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "data:,hello"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, "hello")

    def test_30_with_queue(self):
        """The async path: task goes in through inqueue, result out through outqueue."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "data:,hello"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, "hello")

    def test_40_with_rpc(self):
        """Fetching over the XML-RPC interface; payload is msgpack-encoded."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "data:,hello"
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, "hello")

    def test_50_base64_data(self):
        """[BASE64-DATA] wrapped POST body (utf8-encoded 中文) is decoded before send."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/post"
        request["fetch"]["method"] = "POST"
        # utf8 encoding 中文
        request["fetch"]["data"] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u"中文", response.json["form"], response.json)

    def test_55_base64_data(self):
        """[BASE64-DATA] wrapped POST body with non-utf8 (gbk) payload still posts."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/post"
        request["fetch"]["method"] = "POST"
        # gbk encoding 中文
        request["fetch"]["data"] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        """A 3s timeout aborts a 5s-delay endpoint within a sane time window."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/delay/5"
        request["fetch"]["timeout"] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

    def test_65_418(self):
        """Non-2xx statuses are reported as-is, body included."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/status/418"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 418)
        self.assertIn("teapot", response.text)

    def test_70_phantomjs_url(self):
        """fetch_type='js' renders through phantomjs and forwards headers/cookies."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        request["fetch"]["fetch_type"] = "js"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        # phantomjs returns rendered HTML; the JSON sits inside <pre>.
        data = json.loads(response.doc("pre").text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data["headers"].get("A"), "b", response.json)
        self.assertEqual(data["headers"].get("Cookie"), "c=d", response.json)

    def test_80_phantomjs_timeout(self):
        """Timeouts are honoured on the phantomjs path too."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/delay/5"
        request["fetch"]["fetch_type"] = "js"
        request["fetch"]["timeout"] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)

    def test_90_phantomjs_js_script(self):
        """A user-supplied js_script runs in the page and can rewrite its content."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/html"
        request["fetch"]["fetch_type"] = "js"
        request["fetch"]["js_script"] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("binux", result["content"])

    def test_a100_phantomjs_sharp_url(self):
        """An ajax page is fetched after its async content loads; UA is forwarded."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/pyspider/ajax.html"
        request["fetch"]["fetch_type"] = "js"
        request["fetch"]["headers"]["User-Agent"] = "pyspider-test"
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertNotIn("loading", result["content"])
        self.assertIn("done", result["content"])
        self.assertIn("pyspider-test", result["content"])

    def test_a110_dns_error(self):
        """DNS failures surface as status 599 with a resolve error, on both paths."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "http://www.not-exists-site.com/"
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 599)
        self.assertIn("error", result)
        self.assertIn("resolve", result["error"])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 599)
        self.assertIn("error", result)
        self.assertIn("resolve", result["error"])

    def test_a120_http_get_with_proxy_fail(self):
        """Without credentials the authenticating proxy rejects with 403."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """Credentials passed in the query string satisfy pyproxy's auth."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get?username=binux&password=123456"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json["headers"].get("A"), "b", response.json)
        self.assertIn("c=d", response.json["headers"].get("Cookie"), response.json)
        self.assertIn("a=b", response.json["headers"].get("Cookie"), response.json)
        self.fetcher.proxy = None

    def test_a140_redirect(self):
        """Redirects are followed; orig_url is preserved, url is the final target."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/redirect-to?url=/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.url, self.httpbin + "/get")

    def test_a150_too_much_redirect(self):
        """Exceeding the default redirect limit yields 599 with an explanation."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/redirect/10"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)
        self.assertIn("redirects followed", response.error)

    def test_a160_cookie(self):
        """Server-set cookies merge with the request's header/cookie values."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/cookies/set?k1=v1&k2=v2"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies, {"a": "b", "k1": "v1", "k2": "v2", "c": "d"}, result)

    def test_a170_validate_cert(self):
        """validate_cert=False is accepted and the fetch still succeeds."""
        request = copy.deepcopy(self.sample_task_http)
        request["fetch"]["validate_cert"] = False
        request["url"] = self.httpbin + "/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        """A raised max_redirects allows a 10-hop redirect chain to complete."""
        request = copy.deepcopy(self.sample_task_http)
        request["fetch"]["max_redirects"] = 10
        request["url"] = self.httpbin + "/redirect/10"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        """robots_txt=True blocks a disallowed path; False fetches it anyway."""
        request = copy.deepcopy(self.sample_task_http)
        request["fetch"]["robots_txt"] = False
        request["url"] = self.httpbin + "/deny"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

        request["fetch"]["robots_txt"] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        """A dead phantomjs proxy must produce 599, not hang (issue #375)."""
        # Check the skip condition BEFORE touching fetcher state, and restore
        # the real proxy in a finally block so neither a skip nor a failed
        # assertion leaves the shared fetcher pointing at a dead port.
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")

        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = "127.0.0.1:20000"
        try:
            request = copy.deepcopy(self.sample_task_http)
            request["url"] = self.httpbin + "/get"
            request["fetch"]["fetch_type"] = "js"
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 599, result)
        finally:
            self.fetcher.phantomjs_proxy = phantomjs_proxy