def run_fetcher(g=g):
    """Start a fetcher wired to the global queues and serve its XML-RPC API."""
    from pyspider.fetcher.tornado_fetcher import Fetcher

    fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    # Serve the XML-RPC interface in a background thread, then block on
    # the fetch loop itself.
    run_in_thread(fetcher.xmlrpc_run, port=g.fetcher_xmlrpc_port, bind=g.webui_host)
    fetcher.run()
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port):
    """Run the Fetcher component (CLI subcommand)."""
    g = ctx.obj
    from pyspider.fetcher.tornado_fetcher import Fetcher

    fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    fetcher.phantomjs_proxy = g.phantomjs_proxy
    # Register so the umbrella command can manage this instance.
    g.instances.append(fetcher)

    if xmlrpc:
        run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()
def setUpClass(self):
    # Small bounded queues feeding the fetcher under test.
    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    # Client proxy pointed at the fetcher's XML-RPC port.
    self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)
def setUpClass(self):
    import tests.data_test_webpage
    import httpbin

    # Local httpbin instance the tests fetch against.
    self.httpbin_thread = utils.run_in_subprocess(
        httpbin.app.run, port=14887, passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'

    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)

    # Authenticated HTTP proxy used by proxy-related tests.
    self.proxy_thread = subprocess.Popen(
        ['pyproxy', '--username=binux', '--password=123456',
         '--port=14830', '--debug'], close_fds=True)
    self.proxy = '127.0.0.1:14830'

    # phantomjs is optional; leave None so dependent tests can skip.
    try:
        self.phantomjs = subprocess.Popen([
            'phantomjs',
            os.path.join(os.path.dirname(__file__),
                         '../pyspider/fetcher/phantomjs_fetcher.js'),
            '25555'])
    except OSError:
        self.phantomjs = None
    time.sleep(0.5)
def setUpClass(self):
    import tests.data_test_webpage
    import httpbin

    # Local httpbin instance the tests fetch against.
    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
    self.httpbin = "http://127.0.0.1:14887"

    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = "127.0.0.1:25555"
    self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)

    # Authenticated HTTP proxy used by proxy-related tests.
    self.proxy_thread = subprocess.Popen(
        ["pyproxy", "--username=binux", "--password=123456",
         "--port=14830", "--debug"], close_fds=True)
    self.proxy = "127.0.0.1:14830"

    # phantomjs is optional; leave None so dependent tests can skip.
    try:
        self.phantomjs = subprocess.Popen([
            "phantomjs",
            os.path.join(os.path.dirname(__file__),
                         "../pyspider/fetcher/phantomjs_fetcher.js"),
            "25555",
        ])
    except OSError:
        self.phantomjs = None
    time.sleep(0.5)
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc, max_rate, max_burst, username, password): g = ctx.obj from pyspider.webui.app import app app.config['taskdb'] = g.taskdb app.config['projectdb'] = g.projectdb app.config['resultdb'] = g.resultdb app.config['cdn'] = cdn if max_rate: app.config['max_rate'] = max_rate if max_burst: app.config['max_burst'] = max_burst if username: app.config['webui_username'] = username if password: app.config['webui_password'] = password # fetcher rpc if isinstance(fetcher_rpc, basestring): fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc) if fetcher_rpc is None: from pyspider.fetcher.tornado_fetcher import Fetcher fetcher = Fetcher(inqueue=None, outqueue=None, async=False) fetcher.phantomjs_proxy = g.phantomjs_proxy app.config['fetch'] = lambda x: fetcher.fetch(x)[1] else: import umsgpack app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data) if isinstance(scheduler_rpc, basestring): scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc) if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'): app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://%s/' % ( os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):])) elif scheduler_rpc is None: app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://localhost:23333/') else: app.config['scheduler_rpc'] = scheduler_rpc app.debug = g.debug if g.get('testing_mode'): return app app.run(host=host, port=port)
def setUpClass(self):
    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = 'localhost:25555'
    self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)
    # Launch the phantomjs helper that serves js fetches on port 25555.
    self.phantomjs = subprocess.Popen([
        'phantomjs',
        os.path.join(os.path.dirname(__file__),
                     '../pyspider/fetcher/phantomjs_fetcher.js'),
        '25555'])
def run_webui(g=g): import cPickle as pickle from pyspider.fetcher.tornado_fetcher import Fetcher fetcher = Fetcher(inqueue=None, outqueue=None, async=False) fetcher.phantomjs_proxy = g.phantomjs_proxy from pyspider.webui.app import app app.config['taskdb'] = g.taskdb app.config['projectdb'] = g.projectdb app.config['resultdb'] = g.resultdb app.config['fetch'] = lambda x: fetcher.fetch(x)[1] app.config['scheduler_rpc'] = g.scheduler_rpc #app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/' if g.demo_mode: app.config['max_rate'] = 0.2 app.config['max_burst'] = 3.0 if 'WEBUI_USERNAME' in os.environ: app.config['webui_username'] = os.environ['WEBUI_USERNAME'] app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '') if not getattr(g, 'all_in_one', False): app.debug = g.debug app.run(host=g.webui_host, port=g.webui_port)
def setUpClass(self):
    import tests.data_test_webpage
    import httpbin

    # Bind httpbin on all interfaces so an external Splash instance can
    # reach it via the host's resolvable address.
    self.httpbin_thread = utils.run_in_subprocess(
        httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
    self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'

    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)

    # Authenticated HTTP proxy used by proxy-related tests.
    self.proxy_thread = subprocess.Popen(
        ['pyproxy', '--username=binux', '--password=123456',
         '--port=14830', '--debug'], close_fds=True)
    self.proxy = '127.0.0.1:14830'
class TestResponse(unittest.TestCase): sample_task_http = { 'taskid': 'taskid', 'project': 'project', 'url': '', } @classmethod def setUpClass(self): self.fetcher = Fetcher(None, None, async=False) self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887) self.httpbin = 'http://127.0.0.1:14887' time.sleep(0.5) @classmethod def tearDownClass(self): self.httpbin_thread.terminate() def get(self, url, **kwargs): if not url.startswith('http://'): url = self.httpbin + url request = copy.deepcopy(self.sample_task_http) request['url'] = url request.update(kwargs) task, result = self.fetcher.fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) return response def test_10_html(self): response = self.get('/html') self.assertIsNotNone(response.doc('h1')) def test_20_xml(self): response = self.get('/xml') self.assertIsNotNone(response.doc('item')) def test_30_gzip(self): response = self.get('/gzip') self.assertIn('gzipped', response.text) def test_40_deflate(self): response = self.get('/deflate') self.assertIn('deflated', response.text)
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent,
            timeout, Fetcher=Fetcher):
    """Run the Fetcher component (CLI subcommand)."""
    g = ctx.obj
    fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor,
                      poolsize=poolsize, proxy=proxy)
    fetcher.phantomjs_proxy = g.phantomjs_proxy
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        # Copy before mutating so class-level defaults stay untouched.
        fetcher.default_options = dict(fetcher.default_options)
        fetcher.default_options['timeout'] = timeout

    g.instances.append(fetcher)
    if g.get('testing_mode'):
        return fetcher

    if xmlrpc:
        run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()
class TestFetcher(unittest.TestCase):
    """End-to-end Fetcher tests against a live echo service."""

    sample_task_http = {
        "taskid": "taskid",
        "project": "project",
        "url": "http://echo.opera.com/",
        "fetch": {
            "method": "GET",
            "headers": {"Cookie": "a=b", "a": "b"},
            "timeout": 60,
            "save": "abc",
        },
        "process": {"callback": "callback", "save": [1, 2, 3]},
    }

    @classmethod
    def setUpClass(self):
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.rpc = xmlrpclib.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

    @classmethod
    def tearDownClass(self):
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)
        content = result["content"]
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        self.assertIn("a=b", content)

    def test_10_http_post(self):
        request = dict(self.sample_task_http)
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "binux"
        request["fetch"]["cookies"] = {"c": "d"}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)
        content = result["content"]
        self.assertIn("<h2>POST", content)
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        # FIXME: cookies in headers not supported
        self.assertNotIn("a=b", content)
        self.assertIn("c=d", content)
        self.assertIn("binux", content)

    def test_20_dataurl_get(self):
        data = dict(self.sample_task_http)
        data["url"] = "data:,hello"
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_30_with_queue(self):
        data = dict(self.sample_task_http)
        data["url"] = "data:,hello"
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_40_with_rpc(self):
        data = dict(self.sample_task_http)
        data["url"] = "data:,hello"
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_50_base64_data(self):
        request = dict(self.sample_task_http)
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 200)
        self.assertIn(" d6 ", result["content"])
        self.assertIn(" d0 ", result["content"])
        self.assertIn(" ce ", result["content"])
        self.assertIn(" c4 ", result["content"])
class TestFetcher(unittest.TestCase):
    """End-to-end Fetcher tests, including phantomjs-backed 'js' fetches."""

    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = 'localhost:25555'
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # phantomjs helper serving js fetches on port 25555.
        self.phantomjs = subprocess.Popen([
            'phantomjs',
            os.path.join(os.path.dirname(__file__),
                         '../pyspider/fetcher/phantomjs_fetcher.js'),
            '25555'])

    @classmethod
    def tearDownClass(self):
        self.phantomjs.kill()
        self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<b>A:', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_10_http_post(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn(' d6 ', result['content'])
        self.assertIn(' d0 ', result['content'])
        self.assertIn(' ce ', result['content'])
        self.assertIn(' c4 ', result['content'])

    def test_60_timeout(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_70_phantomjs_url(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<b>a:</b>', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_80_phantomjs_timeout(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_90_phantomjs_js_script(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])
class TestFetcher(unittest.TestCase):
    """End-to-end Fetcher tests; js tests skip when phantomjs is missing."""

    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = 'localhost:25555'
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # phantomjs is optional; record None so dependent tests can skip.
        try:
            self.phantomjs = subprocess.Popen([
                'phantomjs',
                os.path.join(os.path.dirname(__file__),
                             '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'
            ])
        except OSError:
            self.phantomjs = None

    @classmethod
    def tearDownClass(self):
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<b>A:', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_10_http_post(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn(' d6 ', result['content'])
        self.assertIn(' d0 ', result['content'])
        self.assertIn(' ce ', result['content'])
        self.assertIn(' c4 ', result['content'])

    def test_60_timeout(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_70_phantomjs_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<b>a:</b>', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_80_phantomjs_timeout(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 4)

    def test_90_phantomjs_js_script(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://bbs.byr.cn/#!article/WWWTechnology/28163'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['headers']['User-Agent'] = 'Mozilla/5.0'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])
class TestSplashFetcher(unittest.TestCase):
    """Fetcher tests that render pages through a local Splash endpoint."""

    @property
    def sample_task_http(self):
        # Fresh dict per access so tests can mutate it freely.
        return {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': {
                'method': 'GET',
                'headers': {
                    'Cookie': 'a=b',
                    'a': 'b'
                },
                'cookies': {
                    'c': 'd',
                },
                'timeout': 60,
                'save': 'abc',
            },
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        }

    @classmethod
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        # Bind httpbin on all interfaces so Splash can reach it via the
        # host's resolvable address.
        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

        self.proxy_thread = subprocess.Popen(
            ['pyproxy', '--username=binux', '--password=123456',
             '--port=14830', '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()
        self.rpc._quit()
        self.thread.join()
        # Verify every helper service released its port.
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)
        time.sleep(1)

    def test_69_no_splash(self):
        # Without an endpoint a splash fetch must report 501 Not Implemented.
        splash_endpoint = self.fetcher.splash_endpoint
        self.fetcher.splash_endpoint = None
        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 501, result)
        self.fetcher.splash_endpoint = splash_endpoint

    def test_70_splash_url(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        data = json.loads(response.doc('pre').text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data['headers'].get('A'), 'b', response.json)
        self.assertEqual(data['headers'].get('Cookie'), 'c=d', response.json)

    def test_75_splash_robots(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)

    def test_80_splash_timeout(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/delay/5'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        # self.assertIn('js_script_result', result) TODO: lua nil is not exists

    def test_90_splash_js_script(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_95_splash_js_script_2(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/pyspider/ajax_click.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        request['fetch']['js_script'] = 'function() { document.querySelector("a").click(); return "abc" }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])
        self.assertIn('abc', result['js_script_result'])

    def test_a100_splash_sharp_url(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])
class TestFetcher(unittest.TestCase):
    """End-to-end tests for the tornado Fetcher against a local httpbin.

    setUpClass starts, for the whole class: a local httpbin subprocess
    (port 14887), the fetcher's run loop and XML-RPC server (port 24444) on
    threads, an authenticating ``pyproxy`` subprocess (port 14830), and -- if
    the binary is available -- a phantomjs rendering process (port 25555).
    Tests run in method-name order (the numeric prefixes encode the order).
    """

    # Canonical task; tests deepcopy it before mutating.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        # NOTE: 'self' here is actually the class object (classmethod); kept
        # as-is to avoid touching code in a doc-only pass.
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(
            ['pyproxy', '--username=binux', '--password=123456', '--port=14830', '--debug'],
            close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen(
                ['phantomjs',
                 os.path.join(os.path.dirname(__file__), '../pyspider/fetcher/phantomjs_fetcher.js'),
                 '25555'])
        except OSError:
            # phantomjs binary not installed; phantomjs tests will self-skip.
            self.phantomjs = None
        # Give the helper processes a moment to bind their ports.
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        # All helper ports must be released after shutdown.
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)
        time.sleep(1)

    def test_10_http_get(self):
        # Plain GET: headers and cookies from the task must be echoed back.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_15_http_post(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        # Raw body 'binux' is parsed by httpbin as a form field with empty value.
        self.assertEqual(response.json['form'].get('binux'), '')
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_20_dataurl_get(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_30_with_queue(self):
        # Same fetch, but driven through the in/out queues instead of sync_fetch.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_40_with_rpc(self):
        # Fetch over XML-RPC; the result comes back umsgpack-encoded.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        # utf8 encoding 中文
        request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u'中文', response.json['form'], response.json)

    def test_55_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        # gbk encoding 中文
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        # 3s timeout against a 5s delay: must give up in roughly 3 seconds.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/delay/5'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)
        response = rebuild_response(result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

    def test_65_418(self):
        # Non-2xx statuses are passed through, not mapped to errors.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 418)
        self.assertIn('teapot', response.text)

    def test_69_no_phantomjs(self):
        # With phantomjs_proxy unset, a phantomjs fetch must fail with 501.
        # NOTE(review): the proxy is cleared *before* the SkipTest check; if
        # skipped it is never restored -- harmless only because later
        # phantomjs tests skip too, but worth confirming.
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = None
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 501, result)
        self.fetcher.phantomjs_proxy = phantomjs_proxy

    def test_70_phantomjs_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        data = json.loads(response.doc('pre').text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data['headers'].get('A'), 'b', response.json)
        self.assertEqual(data['headers'].get('Cookie'), 'c=d', response.json)

    def test_75_phantomjs_robots(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)

    def test_80_phantomjs_timeout(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/delay/5'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('js_script_result', result)

    def test_90_phantomjs_js_script(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a110_dns_error(self):
        # Unresolvable host: synthetic 599 with a 'resolve' error, both via
        # sync_fetch and via the queue path.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

    def test_a120_http_get_with_proxy_fail(self):
        # Proxy requires credentials; an uncredentialed fetch is rejected (403).
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get?username=binux&password=123456'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a140_redirect(self):
        # Redirects are followed; response.url reflects the final location.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/redirect-to?url=/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.url, self.httpbin + '/get')

    def test_a150_too_much_redirect(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 599, result)
        self.assertIn('redirects followed', response.error)

    def test_a160_cookie(self):
        # Server-set cookies merge with the task's own cookies.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/cookies/set?k1=v1&k2=v2'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies, {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result)

    def test_a170_validate_cert(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['validate_cert'] = False
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        # Raising max_redirects to 10 lets a 10-hop chain succeed.
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['max_redirects'] = 10
        request['url'] = self.httpbin + '/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['robots_txt'] = False
        request['url'] = self.httpbin + '/deny'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        # Regression for issue #375: a dead phantomjs proxy endpoint must yield
        # a 599, not hang. Runs last (zzzz) so it cannot poison earlier tests.
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 599, result)
        self.fetcher.phantomjs_proxy = phantomjs_proxy
class TestFetcher(unittest.TestCase):
    """Older Fetcher test suite hitting live external services.

    NOTE(review): depends on the public network (echo.opera.com, httpbin.org,
    groups.google.com) and will fail offline. This class re-uses the name
    TestFetcher; in this concatenated file a later definition of the same name
    shadows an earlier one at import time.
    """

    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        # 'self' is really the class object here (classmethod convention quirk).
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        try:
            self.phantomjs = subprocess.Popen(
                ['phantomjs',
                 os.path.join(os.path.dirname(__file__), '../pyspider/fetcher/phantomjs_fetcher.js'),
                 '25555'])
        except OSError:
            # No phantomjs binary; dependent tests self-skip.
            self.phantomjs = None

    @classmethod
    def tearDownClass(self):
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        # Uses the shared task directly (read-only here).
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn(b'<b>A:', content)
        self.assertIn(b'<b>Cookie:</b>', content)
        self.assertIn(b'c=d</td>', content)

    def test_10_http_post(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn(b'<h2>POST', content)
        self.assertIn(b'A:', content)
        self.assertIn(b'Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn(b'a=b', content)
        self.assertIn(b'c=d', content)
        self.assertIn(b'binux', content)

    def test_20_dataurl_get(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        # XML-RPC fetch path; result payload is umsgpack-encoded.
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        # gbk-encoded bytes wrapped in the BASE64-DATA marker; the echo page
        # shows the raw hex bytes (d6 d0 ce c4).
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn(b' d6 ', result['content'])
        self.assertIn(b' d0 ', result['content'])
        self.assertIn(b' ce ', result['content'])
        self.assertIn(b' c4 ', result['content'])

    def test_60_timeout(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

    def test_65_418(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 418)
        self.assertIn(b'teapot', result['content'])

    def test_70_phantomjs_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        # phantomjs path returns text (str), unlike the byte asserts above.
        self.assertIn('<b>a:</b>', content)
        self.assertIn('<b>Cookie:</b>', content)
        self.assertIn('c=d</td>', content)

    def test_80_phantomjs_timeout(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://httpbin.org/delay/10'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)

    def test_90_phantomjs_js_script(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    @unittest.skipIf(os.environ.get('IGNORE_GOOGLE'), "can't connect to google.")
    def test_a100_phantomjs_sharp_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'https://groups.google.com/forum/#!forum/pyspider-users'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['headers']['User-Agent'] = 'Mozilla/5.0'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('pyspider-users', result['content'])

    def test_a110_dns_error(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])
class TestResponse(unittest.TestCase): sample_task_http = { 'taskid': 'taskid', 'project': 'project', 'url': '', } @classmethod def setUpClass(self): self.fetcher = Fetcher(None, None, async_mode=False) self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) self.httpbin = 'http://127.0.0.1:14887' time.sleep(0.5) @classmethod def tearDownClass(self): self.httpbin_thread.terminate() def get(self, url, **kwargs): if not url.startswith('http://'): url = self.httpbin + url request = copy.deepcopy(self.sample_task_http) request['url'] = url request.update(kwargs) result = self.fetcher.fetch(request) response = rebuild_response(result) return response def test_10_html(self): response = self.get('/html') self.assertEqual(response.status_code, 200) self.assertIsNotNone(response.doc('h1')) def test_20_xml(self): response = self.get('/xml') self.assertEqual(response.status_code, 200) self.assertIsNotNone(response.doc('item')) def test_30_gzip(self): response = self.get('/gzip') self.assertEqual(response.status_code, 200) self.assertIn('gzipped', response.text) def test_40_deflate(self): response = self.get('/deflate') self.assertEqual(response.status_code, 200) self.assertIn('deflated', response.text) def test_50_ok(self): response = self.get('/status/200') self.assertTrue(response.ok) self.assertTrue(response) response = self.get('/status/302') self.assertTrue(response.ok) self.assertTrue(response) with self.assertRaises(Exception): self.raise_for_status(allow_redirects=False) def test_60_not_ok(self): response = self.get('/status/400') self.assertFalse(response.ok) self.assertFalse(response) response = self.get('/status/500') self.assertFalse(response.ok) self.assertFalse(response) response = self.get('/status/600') self.assertFalse(response.ok) self.assertFalse(response) def test_70_reraise_exception(self): response = self.get('file://abc') with self.assertRaisesRegexp(Exception, 'HTTP 599'): response.raise_for_status()
def run_fetcher(g=g): from pyspider.fetcher.tornado_fetcher import Fetcher fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor) g.fetcher = fetcher run_in_thread(fetcher.xmlrpc_run) fetcher.run()
class TestFetcher(unittest.TestCase):
    """Early Fetcher test suite against the live http://echo.opera.com/ echo
    service, driving the fetcher via sync_fetch, the queues, and XML-RPC.

    NOTE(review): depends on the public network and will fail offline.
    """

    # Canonical task shared by all tests; tests must deep-copy before mutating.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

    @classmethod
    def tearDownClass(self):
        self.rpc._quit()
        self.thread.join()

    def test_10_http_get(self):
        # Read-only use of the shared task is safe without a copy.
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        self.assertIn('a=b', content)

    def test_10_http_post(self):
        # BUGFIX: dict(...) only shallow-copied the task, so mutating
        # request['fetch'] leaked 'method'/'data'/'cookies' into the shared
        # class-level sample_task_http and polluted other tests. Deep-copy
        # instead (consistent with the newer test classes in this file).
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        # Deep copy for symmetry with the POST test (only a top-level key is
        # replaced here, but sharing the nested 'fetch' dict is fragile).
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        # XML-RPC fetch path; result payload is pickle-encoded in this version.
        data = copy.deepcopy(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = pickle.loads(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')
class TestFetcher(unittest.TestCase): sample_task_http = { 'taskid': 'taskid', 'project': 'project', 'url': '', 'fetch': { 'method': 'GET', 'headers': { 'Cookie': 'a=b', 'a': 'b' }, 'cookies': { 'c': 'd', }, 'timeout': 60, 'save': 'abc', }, 'process': { 'callback': 'callback', 'save': [1, 2, 3], }, } @classmethod def setUpClass(self): import tests.data_test_webpage import httpbin self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) self.httpbin = 'http://127.0.0.1:14887' self.inqueue = Queue(10) self.outqueue = Queue(10) self.fetcher = Fetcher(self.inqueue, self.outqueue) self.fetcher.phantomjs_proxy = '127.0.0.1:25555' self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444) self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444) self.thread = utils.run_in_thread(self.fetcher.run) self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux', '--password=123456', '--port=14830', '--debug'], close_fds=True) self.proxy = '127.0.0.1:14830' try: self.phantomjs = subprocess.Popen(['phantomjs', os.path.join(os.path.dirname(__file__), '../pyspider/fetcher/phantomjs_fetcher.js'), '25555']) except OSError: self.phantomjs = None time.sleep(0.5) @classmethod def tearDownClass(self): self.proxy_thread.terminate() self.proxy_thread.wait() self.httpbin_thread.terminate() self.httpbin_thread.join() if self.phantomjs: self.phantomjs.kill() self.phantomjs.wait() self.rpc._quit() self.thread.join() assert not utils.check_port_open(5000) assert not utils.check_port_open(23333) assert not utils.check_port_open(24444) assert not utils.check_port_open(25555) assert not utils.check_port_open(14887) time.sleep(1) def test_10_http_get(self): request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/get' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.orig_url, request['url']) 
self.assertEqual(response.save, request['fetch']['save']) self.assertIsNotNone(response.json, response.content) self.assertEqual(response.json['headers'].get('A'), 'b', response.json) self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json) self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json) def test_15_http_post(self): request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/post' request['fetch']['method'] = 'POST' request['fetch']['data'] = 'binux' request['fetch']['cookies'] = {'c': 'd'} result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200) self.assertEqual(response.orig_url, request['url']) self.assertEqual(response.save, request['fetch']['save']) self.assertIsNotNone(response.json, response.content) self.assertEqual(response.json['form'].get('binux'), '') self.assertEqual(response.json['headers'].get('A'), 'b', response.json) self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json) self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json) def test_20_dataurl_get(self): request = copy.deepcopy(self.sample_task_http) request['url'] = 'data:,hello' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200) self.assertEqual(response.text, 'hello') def test_30_with_queue(self): request= copy.deepcopy(self.sample_task_http) request['url'] = 'data:,hello' self.inqueue.put(request) task, result = self.outqueue.get() response = rebuild_response(result) self.assertEqual(response.status_code, 200) self.assertEqual(response.text, 'hello') def test_40_with_rpc(self): request = copy.deepcopy(self.sample_task_http) request['url'] = 'data:,hello' result = umsgpack.unpackb(self.rpc.fetch(request).data) response = rebuild_response(result) self.assertEqual(response.status_code, 200) self.assertEqual(response.text, 'hello') def 
def test_50_base64_data(self):
    """POST body wrapped in [BASE64-DATA] markers is decoded before sending."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/post'
    request['fetch']['method'] = 'POST'
    # utf8 encoding 中文
    request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
    self.inqueue.put(request)
    task, result = self.outqueue.get()
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, response.error)
    self.assertIsNotNone(response.json, response.content)
    self.assertIn(u'中文', response.json['form'], response.json)

def test_55_base64_data(self):
    """Same as test_50 but the payload is gbk-encoded, so no form-key assert."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/post'
    request['fetch']['method'] = 'POST'
    # gbk encoding 中文
    request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
    self.inqueue.put(request)
    task, result = self.outqueue.get()
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, response.error)
    self.assertIsNotNone(response.json, response.content)

def test_60_timeout(self):
    """A 3s task timeout against /delay/5 must abort within the timeout window."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/delay/5'
    request['fetch']['timeout'] = 3
    start_time = time.time()
    self.inqueue.put(request)
    task, result = self.outqueue.get()
    end_time = time.time()
    # loose bounds: must take noticeably longer than an instant fetch,
    # but finish well before the server's 5s delay
    self.assertGreater(end_time - start_time, 1.5)
    self.assertLess(end_time - start_time, 4.5)
    response = rebuild_response(result)
    self.assertEqual(response.orig_url, request['url'])
    self.assertEqual(response.save, request['fetch']['save'])

def test_65_418(self):
    """Non-standard status codes (418 teapot) pass through with their body."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/status/418'
    self.inqueue.put(request)
    task, result = self.outqueue.get()
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 418)
    self.assertIn('teapot', response.text)

def test_69_no_phantomjs(self):
    """fetch_type=phantomjs with no phantomjs_proxy configured yields 501."""
    phantomjs_proxy = self.fetcher.phantomjs_proxy
    self.fetcher.phantomjs_proxy = None
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin + '/get'
    request['fetch']['fetch_type'] = 'phantomjs'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 501, result)
    # restore shared fetcher state for later tests
    self.fetcher.phantomjs_proxy = phantomjs_proxy

def test_70_phantomjs_url(self):
    """Task headers and cookies survive a phantomjs fetch."""
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin + '/get'
    request['fetch']['fetch_type'] = 'phantomjs'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, result)
    self.assertEqual(response.orig_url, request['url'])
    self.assertEqual(response.save, request['fetch']['save'])
    # httpbin's /get echo is rendered inside a <pre> element by the browser
    data = json.loads(response.doc('pre').text())
    self.assertEqual(data['headers'].get('A'), 'b', response.content)
    self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
    self.assertIn('a=b', data['headers'].get('Cookie'), response.content)

def test_75_phantomjs_robots(self):
    """robots_txt=True must block /deny with 403 for phantomjs fetches too."""
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin + '/deny'
    request['fetch']['fetch_type'] = 'phantomjs'
    request['fetch']['robots_txt'] = True
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 403, result)

def test_80_phantomjs_timeout(self):
    """phantomjs fetches honour the task timeout and report 599."""
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/delay/5'
    request['fetch']['fetch_type'] = 'phantomjs'
    request['fetch']['timeout'] = 3
    start_time = time.time()
    result = self.fetcher.sync_fetch(request)
    end_time = time.time()
    self.assertGreater(end_time - start_time, 2)
    self.assertLess(end_time - start_time, 5)
    self.assertEqual(result['status_code'], 599)
    self.assertIn('js_script_result', result)

def test_90_phantomjs_js_script(self):
    """An injected js_script runs in-page; its writes appear in the content."""
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin + '/html'
    request['fetch']['fetch_type'] = 'phantomjs'
    request['fetch']['js_script'] = 'function() { document.write("binux") }'
    result = self.fetcher.sync_fetch(request)
    self.assertEqual(result['status_code'], 200)
    self.assertIn('binux', result['content'])

def test_a100_phantomjs_sharp_url(self):
    """The ajax test page finishes loading (no 'loading' marker) under phantomjs."""
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/pyspider/ajax.html'
    request['fetch']['fetch_type'] = 'phantomjs'
    request['fetch']['headers']['User-Agent'] = 'pyspider-test'
    result = self.fetcher.sync_fetch(request)
    self.assertEqual(result['status_code'], 200)
    self.assertNotIn('loading', result['content'])
    self.assertIn('done', result['content'])
    self.assertIn('pyspider-test', result['content'])

def test_a110_dns_error(self):
    """Unresolvable hosts yield 599 + 'resolve' error, both sync and queued."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = 'http://www.not-exists-site-binux.com/'
    result = self.fetcher.sync_fetch(request)
    self.assertEqual(result['status_code'], 599)
    self.assertIn('error', result)
    self.assertIn('resolve', result['error'])
    self.inqueue.put(request)
    task, result = self.outqueue.get()
    self.assertEqual(result['status_code'], 599)
    self.assertIn('error', result)
    self.assertIn('resolve', result['error'])

def test_a120_http_get_with_proxy_fail(self):
    """The auth-protected proxy rejects an unauthenticated fetch with 403."""
    self.fetcher.proxy = self.proxy
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/get'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 403, result)
    self.fetcher.proxy = None

def test_a130_http_get_with_proxy_ok(self):
    """Fetch succeeds through the proxy when credentials ride in the query."""
    self.fetcher.proxy = self.proxy
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/get?username=binux&password=123456'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, result)
    self.assertEqual(response.orig_url, request['url'])
    self.assertEqual(response.save, request['fetch']['save'])
    self.assertIsNotNone(response.json, response.content)
    self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
    self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
    self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
    self.fetcher.proxy = None

def test_a140_redirect(self):
    """Redirects are followed; response.url reflects the final location."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/redirect-to?url=/get'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, result)
    self.assertEqual(response.orig_url, request['url'])
    self.assertEqual(response.url, self.httpbin+'/get')

def test_a150_too_much_redirect(self):
    """Exceeding the default redirect limit yields 599."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/redirect/10'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 599, result)
    self.assertIn('redirects followed', response.error)

def test_a160_cookie(self):
    """Set-Cookie responses merge with the task's preset cookies."""
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, result)
    self.assertEqual(response.cookies, {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result)

def test_a170_validate_cert(self):
    """validate_cert=False is accepted (no-op for plain http)."""
    request = copy.deepcopy(self.sample_task_http)
    request['fetch']['validate_cert'] = False
    request['url'] = self.httpbin+'/get'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, result)

def test_a180_max_redirects(self):
    """Raising max_redirects to 10 lets /redirect/10 complete with 200."""
    request = copy.deepcopy(self.sample_task_http)
    request['fetch']['max_redirects'] = 10
    request['url'] = self.httpbin+'/redirect/10'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, result)

def test_a200_robots_txt(self):
    """robots_txt toggles /deny between 200 (ignored) and 403 (enforced)."""
    request = copy.deepcopy(self.sample_task_http)
    request['fetch']['robots_txt'] = False
    request['url'] = self.httpbin+'/deny'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 200, result)
    request['fetch']['robots_txt'] = True
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 403, result)

def test_zzzz_issue375(self):
    """Regression for issue #375: an unreachable phantomjs proxy gives 599."""
    phantomjs_proxy = self.fetcher.phantomjs_proxy
    self.fetcher.phantomjs_proxy = '127.0.0.1:20000'
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin + '/get'
    request['fetch']['fetch_type'] = 'phantomjs'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 599, result)
    self.fetcher.phantomjs_proxy = phantomjs_proxy
class TestResponse(unittest.TestCase):
    """Tests for the Response object rebuilt from raw fetch results.

    Uses a synchronous Fetcher against a local httpbin subprocess; no queues
    or xmlrpc are involved.
    """

    # minimal task template; the get() helper deepcopies and fills it in
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
    }

    @classmethod
    def setUpClass(self):
        # BUG FIX: `async` is a reserved keyword since Python 3.7, so
        # `Fetcher(None, None, async=False)` is a SyntaxError on modern
        # interpreters.  Passing it via dict-unpacking keeps the Fetcher
        # parameter name unchanged while letting this file parse.
        self.fetcher = Fetcher(None, None, **{'async': False})
        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        time.sleep(0.5)  # give the httpbin subprocess time to start listening

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()

    def get(self, url, **kwargs):
        """Fetch *url* synchronously and return the rebuilt Response.

        Relative paths are resolved against the local httpbin instance;
        extra keyword arguments are merged into the task dict.
        """
        if not url.startswith('http://'):
            url = self.httpbin + url
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = url
        request.update(kwargs)
        result = self.fetcher.fetch(request)
        response = rebuild_response(result)
        return response

    def test_10_html(self):
        response = self.get('/html')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('h1'))

    def test_20_xml(self):
        response = self.get('/xml')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('item'))

    def test_30_gzip(self):
        # gzip-encoded bodies must be transparently decompressed
        response = self.get('/gzip')
        self.assertEqual(response.status_code, 200)
        self.assertIn('gzipped', response.text)

    def test_40_deflate(self):
        # deflate-encoded bodies must be transparently decompressed
        response = self.get('/deflate')
        self.assertEqual(response.status_code, 200)
        self.assertIn('deflated', response.text)

    def test_50_ok(self):
        """2xx/3xx responses are ok and truthy; a followed 302 only raises
        from raise_for_status when redirects are disallowed."""
        response = self.get('/status/200')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        response = self.get('/status/302')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        with self.assertRaises(Exception):
            # BUG FIX: was `self.raise_for_status(...)` — TestCase has no
            # such method, so the AttributeError satisfied assertRaises
            # without ever exercising the Response object.
            response.raise_for_status(allow_redirects=False)

    def test_60_not_ok(self):
        # 4xx, 5xx and out-of-range codes are all falsy / not ok
        response = self.get('/status/400')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/500')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/600')
        self.assertFalse(response.ok)
        self.assertFalse(response)

    def test_70_reraise_exception(self):
        """A failed fetch (bad scheme) re-raises as an HTTP 599 error."""
        response = self.get('file://abc')
        # assertRaisesRegexp is a deprecated alias removed in Python 3.12
        with self.assertRaisesRegex(Exception, 'HTTP 599'):
            response.raise_for_status()
class TestSplashFetcher(unittest.TestCase):
    """Fetcher tests against a Splash rendering endpoint (fetch_type='splash').

    Requires a Splash instance at 127.0.0.1:8050 plus a local httpbin and an
    auth-protected pyproxy subprocess.  httpbin/proxy are bound to 0.0.0.0 and
    addressed by the host's LAN IP so the (possibly containerised) Splash can
    reach them.
    """

    @property
    def sample_task_http(self):
        # A property so every access returns a fresh dict: tests mutate the
        # task freely without a deepcopy (though some still deepcopy it).
        return {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': {
                'method': 'GET',
                'headers': {
                    'Cookie': 'a=b',
                    'a': 'b'
                },
                'cookies': {
                    'c': 'd',
                },
                'timeout': 60,
                'save': 'abc',
            },
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        }

    @classmethod
    def setUpClass(self):
        # importing registers the /pyspider/* test pages on the httpbin app
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # auth-protected proxy used by the proxy tests below
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux', '--bind=0.0.0.0', '--password=123456', '--port=14830', '--debug'], close_fds=True)
        self.proxy = socket.gethostbyname(socket.gethostname()) + ':14830'

    @classmethod
    def tearDownClass(self):
        # tear everything down, then verify no test ports were left open
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()
        self.rpc._quit()
        self.thread.join()
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)
        time.sleep(1)

    def test_69_no_splash(self):
        """fetch_type=splash with no endpoint configured yields 501."""
        splash_endpoint = self.fetcher.splash_endpoint
        self.fetcher.splash_endpoint = None
        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 501, result)
        # restore shared fetcher state for the remaining tests
        self.fetcher.splash_endpoint = splash_endpoint

    def test_70_splash_url(self):
        """Task headers and cookies survive a splash fetch."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        # httpbin's /get echo is rendered inside a <pre> element
        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)

    def test_75_splash_robots(self):
        """robots_txt=True must block /deny with 403 for splash fetches too."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)

    def test_80_splash_timeout(self):
        """splash fetches honour the task timeout and report 599."""
        request = self.sample_task_http
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        # self.assertIn('js_script_result', result) TODO: lua nil is not exists
        # (Splash's lua returns nil here, so the key is absent from result)

    def test_90_splash_js_script(self):
        """An injected js_script runs in-page; its writes appear in the content."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_95_splash_js_script_2(self):
        """js_script can interact with the page (click) and return a value."""
        request = self.sample_task_http
        request['url'] = self.httpbin + '/pyspider/ajax_click.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        request['fetch']['js_script'] = 'function() { document.querySelector("a").click(); return "abc" }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])
        self.assertIn('abc', result['js_script_result'])

    def test_a100_splash_sharp_url(self):
        """The ajax test page finishes loading (no 'loading' marker) under splash."""
        request = self.sample_task_http
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a120_http_get_with_proxy_fail_1(self):
        """Plain (non-splash) fetch through the auth proxy without creds: 403."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a120_http_get_with_proxy_fail(self):
        """splash fetch through the auth proxy without creds: 403."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok_1(self):
        """Plain fetch succeeds when proxy credentials are in the proxy URL."""
        self.fetcher.proxy = 'http://*****:*****@%s/' % self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """splash fetch succeeds when proxy credentials are in the proxy URL."""
        self.fetcher.proxy = 'http://*****:*****@%s/' % self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)
        self.fetcher.proxy = None
def setUpClass(self):
    """Start a synchronous Fetcher and a local httpbin for this test class.

    Raises no exceptions itself; the sleep gives the httpbin subprocess
    time to start listening before the first test fires.
    """
    # BUG FIX: `async` is a reserved keyword since Python 3.7, so
    # `Fetcher(None, None, async=False)` is a SyntaxError on modern
    # interpreters.  Dict-unpacking keeps the Fetcher parameter name
    # unchanged while letting this file parse.
    self.fetcher = Fetcher(None, None, **{'async': False})
    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'
    time.sleep(0.5)  # let the httpbin subprocess start listening
class TestFetcher(unittest.TestCase):
    """End-to-end Fetcher tests over queues, xmlrpc, a local httpbin,
    an auth-protected pyproxy subprocess, and (optionally) phantomjs."""

    # Shared task template; tests deepcopy it before mutating.
    sample_task_http = {
        "taskid": "taskid",
        "project": "project",
        "url": "",
        "fetch": {
            "method": "GET",
            "headers": {"Cookie": "a=b", "a": "b"},
            "cookies": {"c": "d"},
            "timeout": 60,
            "save": "abc",
        },
        "process": {"callback": "callback", "save": [1, 2, 3]},
    }

    @classmethod
    def setUpClass(self):
        # importing registers the /pyspider/* test pages on the httpbin app
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = "http://127.0.0.1:14887"
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = "127.0.0.1:25555"
        self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # auth-protected proxy used by the proxy tests below
        self.proxy_thread = subprocess.Popen(
            ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"], close_fds=True
        )
        self.proxy = "127.0.0.1:14830"
        try:
            # phantomjs is optional; tests that need it skip when absent
            self.phantomjs = subprocess.Popen(
                [
                    "phantomjs",
                    os.path.join(os.path.dirname(__file__), "../pyspider/fetcher/phantomjs_fetcher.js"),
                    "25555",
                ]
            )
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)  # let the subprocesses start listening

    @classmethod
    def tearDownClass(self):
        # tear everything down, then verify no test ports were left open
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)
        time.sleep(1)

    def test_10_http_get(self):
        """Basic GET: headers/cookies from the task are echoed by httpbin."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json["headers"].get("A"), "b", response.json)
        self.assertIn("c=d", response.json["headers"].get("Cookie"), response.json)
        self.assertIn("a=b", response.json["headers"].get("Cookie"), response.json)

    def test_15_http_post(self):
        """POST with a raw string body lands as an empty-valued form key."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/post"
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "binux"
        request["fetch"]["cookies"] = {"c": "d"}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json["form"].get("binux"), "")
        self.assertEqual(response.json["headers"].get("A"), "b", response.json)
        self.assertIn("c=d", response.json["headers"].get("Cookie"), response.json)
        self.assertIn("a=b", response.json["headers"].get("Cookie"), response.json)

    def test_20_dataurl_get(self):
        """data: URLs are decoded locally without any network fetch."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "data:,hello"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, "hello")

    def test_30_with_queue(self):
        """Same fetch via the in/out queues instead of sync_fetch."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "data:,hello"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, "hello")

    def test_40_with_rpc(self):
        """Same fetch via the xmlrpc interface (umsgpack-packed result)."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "data:,hello"
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, "hello")

    def test_50_base64_data(self):
        """POST body wrapped in [BASE64-DATA] markers is decoded before sending."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/post"
        request["fetch"]["method"] = "POST"
        # utf8 encoding 中文
        request["fetch"]["data"] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u"中文", response.json["form"], response.json)

    def test_55_base64_data(self):
        """Same as test_50 but the payload is gbk-encoded, so no form-key assert."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/post"
        request["fetch"]["method"] = "POST"
        # gbk encoding 中文
        request["fetch"]["data"] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        """A 3s task timeout against /delay/5 must abort within the timeout window."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/delay/5"
        request["fetch"]["timeout"] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

    def test_65_418(self):
        """Non-standard status codes (418 teapot) pass through with their body."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/status/418"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 418)
        self.assertIn("teapot", response.text)

    def test_70_phantomjs_url(self):
        """Task headers and cookies survive a phantomjs ('js') fetch."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        request["fetch"]["fetch_type"] = "js"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        # httpbin's /get echo is rendered inside a <pre> element by the browser
        data = json.loads(response.doc("pre").text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data["headers"].get("A"), "b", response.json)
        self.assertEqual(data["headers"].get("Cookie"), "c=d", response.json)

    def test_80_phantomjs_timeout(self):
        """phantomjs fetches honour the task timeout."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/delay/5"
        request["fetch"]["fetch_type"] = "js"
        request["fetch"]["timeout"] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)

    def test_90_phantomjs_js_script(self):
        """An injected js_script runs in-page; its writes appear in the content."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/html"
        request["fetch"]["fetch_type"] = "js"
        request["fetch"]["js_script"] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("binux", result["content"])

    def test_a100_phantomjs_sharp_url(self):
        """The ajax test page finishes loading (no 'loading' marker) under phantomjs."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/pyspider/ajax.html"
        request["fetch"]["fetch_type"] = "js"
        request["fetch"]["headers"]["User-Agent"] = "pyspider-test"
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertNotIn("loading", result["content"])
        self.assertIn("done", result["content"])
        self.assertIn("pyspider-test", result["content"])

    def test_a110_dns_error(self):
        """Unresolvable hosts yield 599 + 'resolve' error, both sync and queued."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = "http://www.not-exists-site.com/"
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 599)
        self.assertIn("error", result)
        self.assertIn("resolve", result["error"])
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 599)
        self.assertIn("error", result)
        self.assertIn("resolve", result["error"])

    def test_a120_http_get_with_proxy_fail(self):
        """The auth-protected proxy rejects an unauthenticated fetch with 403."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """Fetch succeeds through the proxy when credentials ride in the query."""
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get?username=binux&password=123456"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json["headers"].get("A"), "b", response.json)
        self.assertIn("c=d", response.json["headers"].get("Cookie"), response.json)
        self.assertIn("a=b", response.json["headers"].get("Cookie"), response.json)
        self.fetcher.proxy = None

    def test_a140_redirect(self):
        """Redirects are followed; response.url reflects the final location."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/redirect-to?url=/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.url, self.httpbin + "/get")

    def test_a150_too_much_redirect(self):
        """Exceeding the default redirect limit yields 599."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/redirect/10"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 599, result)
        self.assertIn("redirects followed", response.error)

    def test_a160_cookie(self):
        """Set-Cookie responses merge with the task's preset cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/cookies/set?k1=v1&k2=v2"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies, {"a": "b", "k1": "v1", "k2": "v2", "c": "d"}, result)

    def test_a170_validate_cert(self):
        """validate_cert=False is accepted (no-op for plain http)."""
        request = copy.deepcopy(self.sample_task_http)
        request["fetch"]["validate_cert"] = False
        request["url"] = self.httpbin + "/get"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        """Raising max_redirects to 10 lets /redirect/10 complete with 200."""
        request = copy.deepcopy(self.sample_task_http)
        request["fetch"]["max_redirects"] = 10
        request["url"] = self.httpbin + "/redirect/10"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        """robots_txt toggles /deny between 200 (ignored) and 403 (enforced)."""
        request = copy.deepcopy(self.sample_task_http)
        request["fetch"]["robots_txt"] = False
        request["url"] = self.httpbin + "/deny"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        request["fetch"]["robots_txt"] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        """Regression for issue #375: an unreachable phantomjs proxy gives 599."""
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = "127.0.0.1:20000"
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        request["fetch"]["fetch_type"] = "js"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 599, result)
        self.fetcher.phantomjs_proxy = phantomjs_proxy