def setUpClass(self):
    """Start every service the fetcher tests talk to.

    Brings up, in order: an httpbin server in a subprocess, the Fetcher with
    its XML-RPC endpoint and main loop in threads, a ``pyproxy`` process and,
    when the binary can be launched, a PhantomJS process. Handles to all of
    them are stored on ``self`` for the tests and for teardown.
    """
    # NOTE(review): imported only for its side effects -- presumably it
    # registers the extra test pages on the httpbin app; confirm.
    import tests.data_test_webpage
    import httpbin

    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
    self.httpbin = 'http://127.0.0.1:14887'

    self.inqueue, self.outqueue = Queue(10), Queue(10)
    fetcher = self.fetcher = Fetcher(self.inqueue, self.outqueue)
    fetcher.phantomjs_proxy = '127.0.0.1:25555'

    # The RPC client and the server thread must agree on the port.
    xmlrpc_port = 24444
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % xmlrpc_port)
    self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run,
                                             port=xmlrpc_port)
    self.thread = utils.run_in_thread(fetcher.run)

    proxy_command = [
        'pyproxy',
        '--username=binux',
        '--password=123456',
        '--port=14830',
        '--debug',
    ]
    self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
    self.proxy = '127.0.0.1:14830'

    phantomjs_script = os.path.join(os.path.dirname(__file__),
                                    '../pyspider/fetcher/phantomjs_fetcher.js')
    try:
        self.phantomjs = subprocess.Popen(
            ['phantomjs', phantomjs_script, '25555'])
    except OSError:
        # PhantomJS could not be launched; tests check this attribute.
        self.phantomjs = None

    # Give the freshly started services a moment to come up.
    time.sleep(0.5)
def setUpClass(self):
    """Reset the test data directory and start a Processor in a thread.

    Creates the project database, the four queues wired into the Processor,
    and a background thread running ``Processor.run``. The Processor itself
    is published as ``self.processor`` from inside that thread.
    """
    data_dir = './data/tests/'
    shutil.rmtree(data_dir, ignore_errors=True)
    os.makedirs(data_dir)

    def open_projectdb():
        # A fresh handle per caller; the worker thread opens its own.
        return projectdb.ProjectDB(self.projectdb_path)

    self.projectdb = open_projectdb()
    self.in_queue = Queue(10)
    self.status_queue = Queue(10)
    self.newtask_queue = Queue(10)
    self.result_queue = Queue(10)

    def processor_main():
        processor = self.processor = Processor(
            open_projectdb(),
            self.in_queue,
            self.status_queue,
            self.newtask_queue,
            self.result_queue,
        )
        # Poll for project changes quickly so the tests need not wait long.
        processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
        processor.run()

    self.process = run_in_thread(processor_main)
    # Let the thread construct the Processor before any test touches it.
    time.sleep(1)
def setUpClass(self):
    """Reset the test data directory and start a Scheduler in a thread.

    Creates task/project/result database handles for the tests, the three
    queues wired into the Scheduler, an XML-RPC client pointing at
    ``self.scheduler_xmlrpc_port``, and a background thread that builds the
    Scheduler, starts its XML-RPC server and runs its main loop.
    """
    shutil.rmtree('./data/tests', ignore_errors=True)
    os.makedirs('./data/tests')

    def get_taskdb():
        return taskdb.TaskDB(self.taskdb_path)

    def get_projectdb():
        return projectdb.ProjectDB(self.projectdb_path)

    def get_resultdb():
        return resultdb.ResultDB(self.resultdb_path)

    self.taskdb = get_taskdb()
    self.projectdb = get_projectdb()
    self.resultdb = get_resultdb()

    self.newtask_queue = Queue(10)
    self.status_queue = Queue(10)
    self.scheduler2fetcher = Queue(10)
    self.rpc = xmlrpc_client.ServerProxy(
        'http://localhost:%d' % self.scheduler_xmlrpc_port)

    def run_scheduler():
        # The scheduler thread opens its own database handles rather than
        # sharing the ones the tests use.
        scheduler = Scheduler(taskdb=get_taskdb(),
                              projectdb=get_projectdb(),
                              newtask_queue=self.newtask_queue,
                              status_queue=self.status_queue,
                              out_queue=self.scheduler2fetcher,
                              data_path="./data/tests/",
                              resultdb=get_resultdb())
        # Shorten intervals and limits so the tests run quickly. All
        # overrides are set on the instance: the original assigned
        # DELETE_TIME on the Scheduler class, which leaked the override to
        # every other Scheduler created in the same process.
        scheduler.UPDATE_PROJECT_INTERVAL = 0.1
        scheduler.LOOP_INTERVAL = 0.1
        scheduler.INQUEUE_LIMIT = 10
        scheduler.DELETE_TIME = 0
        scheduler._last_tick = int(time.time())  # not dispatch cronjob
        run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
        scheduler.run()

    self.process = run_in_thread(run_scheduler)
    # Give the scheduler and its XML-RPC server time to start.
    time.sleep(1)
def setUpClass(self):
    """Reset the test data directory and start a ResultWorker in a thread.

    Creates the result database handle used by the tests, the input queue,
    and a background thread running ``ResultWorker.run``. The worker is
    published as ``self.result_worker`` from inside that thread.
    """
    data_dir = './data/tests/'
    shutil.rmtree(data_dir, ignore_errors=True)
    os.makedirs(data_dir)

    def open_resultdb():
        # A fresh handle per caller; the worker thread opens its own.
        return resultdb.ResultDB(self.resultdb_path)

    self.resultdb = open_resultdb()
    self.inqueue = Queue(10)

    def worker_main():
        worker = self.result_worker = ResultWorker(open_resultdb(),
                                                   self.inqueue)
        worker.run()

    self.process = run_in_thread(worker_main)
    # Let the thread construct the worker before any test touches it.
    time.sleep(1)
class TestProcessor(unittest.TestCase):
    """End-to-end tests for Processor running in a background thread.

    The tests share state through the queues and the project database, and
    their numeric name prefixes fix the order in which unittest runs them.
    """

    projectdb_path = './data/tests/project.db'

    @classmethod
    def setUpClass(self):
        """Reset the data directory and start the Processor thread."""
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)

        self.projectdb = get_projectdb()
        self.in_queue = Queue(10)
        self.status_queue = Queue(10)
        self.newtask_queue = Queue(10)
        self.result_queue = Queue(10)

        def run_processor():
            # The worker thread opens its own database handle.
            self.processor = Processor(get_projectdb(), self.in_queue,
                                       self.status_queue, self.newtask_queue,
                                       self.result_queue)
            self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            self.processor.run()

        self.process = run_in_thread(run_processor)
        # Let the thread construct the Processor before tests touch it.
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        """Stop the Processor thread and remove the data directory."""
        if self.process.is_alive():
            self.processor.quit()
            self.process.join(2)
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def _clear_queues(self):
        """Discard everything pending in the new-task and status queues."""
        while not self.newtask_queue.empty():
            self.newtask_queue.get()
        while not self.status_queue.empty():
            self.status_queue.get()

    def test_10_update_project(self):
        """A newly inserted project is loaded; unknown projects fail."""
        self.assertIsNone(self.processor.project_manager.get('test_project'))
        self.projectdb.insert('test_project', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(
            self.processor.project_manager.get('test_project'))

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "not_exists",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        self.in_queue.put((task, {}))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['process']['ok'], False)
        self.assertIsNone(self.processor.project_manager.get('not_exists'))

    def test_20_broken_project(self):
        """A project whose script is truncated loads with an exception."""
        self.assertIsNone(
            self.processor.project_manager.get('test_broken_project'))
        self.projectdb.insert('test_broken_project', {
            'name': 'test_broken_project',
            'group': 'group',
            'status': 'DEBUG',
            # Only the first 10 characters: not a usable script.
            'script': inspect.getsource(sample_handler)[:10],
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(
            self.processor.project_manager.get('test_broken_project'))
        project_data = self.processor.project_manager.get(
            'test_broken_project')
        self.assertIsNotNone(project_data.get('exception'))

    def test_30_new_task(self):
        """Processing ``on_start`` emits a status and at least one new task."""
        self.assertTrue(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())
        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            self.status_queue.get()
        # Left in the queue on purpose: test_40_index_page consumes it.
        self.assertFalse(self.newtask_queue.empty())

    def test_40_index_page(self):
        """Links on an index page become follow-up tasks, de-duplicated."""
        task = None
        while not self.newtask_queue.empty():
            task = self.newtask_queue.get()[0]
        self.assertIsNotNone(task)

        fetch_result = {
            "orig_url": task['url'],
            "content": ("<html><body>"
                        "<a href='http://binux.me'>binux</a>"
                        "<a href='http://binux.me/中文'>binux</a>"
                        "<a href='http://binux.me/1'>1</a>"
                        "<a href='http://binux.me/1'>2</a>"
                        "</body></html>"),
            "headers": {
                'a': 'b',
                'etag': 'tag'
            },
            "status_code": 200,
            "url": task['url'],
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        self.assertFalse(self.newtask_queue.empty())

        status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['fetch']['time'], 0)
        self.assertEqual(status['track']['fetch']['status_code'], 200)
        self.assertEqual('tag', status['track']['fetch']['headers']['etag'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertEqual(status['track']['process']['ok'], True)
        self.assertGreater(status['track']['process']['time'], 0)
        self.assertEqual(status['track']['process']['follows'], 3)
        self.assertIsNone(status['track']['process']['result'])
        self.assertEqual(status['track']['process']['logs'], '')
        self.assertIsNone(status['track']['process']['exception'])

        tasks = self.newtask_queue.get()
        self.assertEqual(len(tasks), 3)
        self.assertEqual(tasks[0]['url'], 'http://binux.me/')
        # Report the URL under test; the original message printed the
        # parent task's URL, which is useless when this assertion fails.
        self.assertTrue(tasks[1]['url'].startswith('http://binux.me/%'),
                        tasks[1]['url'])

    def test_50_fetch_error(self):
        """A failed fetch yields a failed status and no follow-up tasks."""
        self._clear_queues()

        task = {
            "process": {
                "callback": "index_page"
            },
            "project": "test_project",
            "taskid": "data:,test_fetch_error",
            "url": "data:,test_fetch_error"
        }
        fetch_result = {
            "orig_url": task['url'],
            "content": "test_fetch_error",
            "error": "test_fetch_error",
            "headers": {
                'a': 'b',
                'last-modified': '123'
            },
            "status_code": 598,
            "url": task['url'],
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], False)
        self.assertEqual(status['track']['fetch']['time'], 0)
        self.assertEqual(status['track']['fetch']['status_code'], 598)
        self.assertEqual('123',
                         status['track']['fetch']['headers']['last-modified'])
        self.assertIsNotNone(status['track']['fetch']['content'])
        self.assertEqual(status['track']['process']['ok'], False)
        self.assertGreater(status['track']['process']['time'], 0)
        self.assertEqual(status['track']['process']['follows'], 0)
        self.assertIsNone(status['track']['process']['result'])
        self.assertGreater(len(status['track']['process']['logs']), 0)
        self.assertIsNotNone(status['track']['process']['exception'])

    def test_60_call_broken_project(self):
        """A task for the broken project fails in the process stage."""
        self._clear_queues()

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_broken_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start",
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)
            self.assertGreater(len(status['track']['process']['logs']), 0)
            self.assertIsNotNone(status['track']['process']['exception'])
        self.assertTrue(self.newtask_queue.empty())

    def test_70_update_project(self):
        """Project reloads are triggered by task update-time / md5 hints."""
        project_manager = self.processor.project_manager
        # Effectively disable periodic reloading so that only the hints
        # carried by the task can trigger one. The cleanup restores the
        # fast interval even when an assertion below fails; the original
        # restored it only on the success path.
        project_manager.CHECK_PROJECTS_INTERVAL = 1000000
        self.addCleanup(setattr, project_manager,
                        'CHECK_PROJECTS_INTERVAL', 0.1)
        project_manager._check_projects()
        self.assertIsNotNone(project_manager.get('test_broken_project'))

        self._clear_queues()

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_broken_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler),
        })

        # not update
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)

        # updated
        task['project_updatetime'] = time.time()
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], True)

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler)[:10],
        })

        # update with md5
        task['project_md5sum'] = 'testmd5'
        del task['project_updatetime']
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)

    @unittest.skipIf(six.PY3, "deprecated feature, not work for PY3")
    def test_80_import_project(self):
        """Stored projects can be imported from the ``projects`` package."""
        self.projectdb.insert('test_project2', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.projectdb.insert('test_project3', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })

        # Exercise the three import spellings.
        from projects import test_project
        self.assertIsNotNone(test_project)
        self.assertIsNotNone(test_project.Handler)

        from projects.test_project2 import Handler
        self.assertIsNotNone(Handler)

        import projects.test_project3
        self.assertIsNotNone(projects.test_project3.Handler)
class TestProcessor(unittest.TestCase):
    """Tests for ResultWorker running in a background thread.

    NOTE(review): despite its name this class exercises ResultWorker, not
    Processor. Results are pushed through ``inqueue`` and read back from the
    result database; the numeric test-name prefixes fix the running order.
    """

    resultdb_path = './data/tests/result.db'

    @classmethod
    def setUpClass(self):
        """Reset the data directory and start the ResultWorker thread."""
        data_dir = './data/tests/'
        shutil.rmtree(data_dir, ignore_errors=True)
        os.makedirs(data_dir)

        def open_resultdb():
            # A fresh handle per caller; the worker thread opens its own.
            return resultdb.ResultDB(self.resultdb_path)

        self.resultdb = open_resultdb()
        self.inqueue = Queue(10)

        def worker_main():
            worker = self.result_worker = ResultWorker(open_resultdb(),
                                                       self.inqueue)
            worker.run()

        self.process = run_in_thread(worker_main)
        # Let the thread construct the worker before tests touch it.
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        """Stop the worker thread and remove the data directory."""
        worker_thread = self.process
        if worker_thread.is_alive():
            self.result_worker.quit()
            worker_thread.join(2)
        assert not worker_thread.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def test_10_bad_result(self):
        """A task without taskid/url and an empty result stores nothing."""
        incomplete_task = {'project': 'test_project'}
        self.inqueue.put((incomplete_task, {}))
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_20_insert_result(self):
        """A dict result is stored and can be read back unchanged."""
        data = {'a': 'b'}
        task = {
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }
        self.inqueue.put((task, data))
        time.sleep(0.5)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 1)
        self.assertEqual(self.resultdb.count('test_project'), 1)

        stored = self.resultdb.get('test_project', 'id1')
        self.assertEqual(stored['result'], data)

    def test_30_overwrite(self):
        """A second result for the same taskid replaces the first."""
        task = {
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }
        self.inqueue.put((task, "abc"))
        time.sleep(0.1)
        stored = self.resultdb.get('test_project', 'id1')
        self.assertEqual(stored['result'], "abc")

    def test_40_insert_list(self):
        """A list result is stored and read back as a list."""
        task = {
            'project': 'test_project',
            'taskid': 'id2',
            'url': 'url1'
        }
        self.inqueue.put((task, ['a', 'b']))
        time.sleep(0.1)
        stored = self.resultdb.get('test_project', 'id2')
        self.assertEqual(stored['result'], ['a', 'b'])
class TestFetcher(unittest.TestCase):
    """Integration tests for Fetcher against local helper services.

    ``setUpClass`` starts an httpbin server, the Fetcher (main loop plus
    XML-RPC endpoint), a ``pyproxy`` process and, if available, PhantomJS.
    Tests reach the Fetcher synchronously, through its queues, or via RPC.
    """

    # Template request; every test works on a deep copy of it.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start httpbin, the Fetcher, the proxy and (optionally) PhantomJS."""
        # NOTE(review): imported only for its side effects -- presumably it
        # registers the extra test pages on the httpbin app; confirm.
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                      port=14887)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run,
                                                 port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

        self.proxy_thread = subprocess.Popen(['pyproxy',
                                              '--username=binux',
                                              '--password=123456',
                                              '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'

        try:
            self.phantomjs = subprocess.Popen([
                'phantomjs',
                os.path.join(os.path.dirname(__file__),
                             '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'])
        except OSError:
            # PhantomJS could not be launched; the JS tests skip themselves.
            self.phantomjs = None
        # Give the freshly started services a moment to come up.
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Shut down every helper process and the Fetcher thread."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        """GET sends the configured headers and merges both cookie sources."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b',
                         response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'),
                      response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'),
                      response.json)

    def test_15_http_post(self):
        """POST sends the body along with headers and cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)

        self.assertEqual(response.json['form'].get('binux'), '')
        self.assertEqual(response.json['headers'].get('A'), 'b',
                         response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'),
                      response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'),
                      response.json)

    def test_20_dataurl_get(self):
        """A ``data:`` URL is answered with its payload and status 200."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_30_with_queue(self):
        """A request put on the in-queue comes back on the out-queue."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_40_with_rpc(self):
        """The XML-RPC ``fetch`` call returns a msgpack-encoded result."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_50_base64_data(self):
        """A base64-wrapped UTF-8 body is decoded before being posted."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # The payload is the UTF-8 encoding of u'中文', base64-encoded.
        request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u'中文', response.json['form'], response.json)

    def test_55_base64_data(self):
        """A base64-wrapped GBK body is posted without error."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # The payload is the GBK encoding of u'中文', base64-encoded.
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        """A 3s timeout cuts a 5s-delayed response short."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

    def test_65_418(self):
        """A non-2xx status and its body are passed through unchanged."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 418)
        self.assertIn('teapot', response.text)

    def test_70_phantomjs_url(self):
        """A ``js`` fetch goes through PhantomJS with headers and cookies."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        # The JSON payload is read out of the page's <pre> element here, so
        # the parsed ``data`` (not ``response.json``) is what the failure
        # messages below should show.
        data = json.loads(response.doc('pre').text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data['headers'].get('A'), 'b', data)
        self.assertEqual(data['headers'].get('Cookie'), 'c=d', data)

    def test_80_phantomjs_timeout(self):
        """The fetch timeout also applies to PhantomJS fetches."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)

    def test_90_phantomjs_js_script(self):
        """A user-supplied ``js_script`` runs inside the fetched page."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        """PhantomJS returns the page after its script replaced 'loading'."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'js'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a110_dns_error(self):
        """An unresolvable host yields status 599, sync and via the queue."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

    def test_a120_http_get_with_proxy_fail(self):
        """Going through the proxy without credentials is rejected (403)."""
        self.fetcher.proxy = self.proxy
        try:
            request = copy.deepcopy(self.sample_task_http)
            request['url'] = self.httpbin+'/get'
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 403, result)
        finally:
            # Always reset: a failure here must not leave later tests
            # routed through the proxy.
            self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """Going through the proxy with credentials in the URL succeeds."""
        self.fetcher.proxy = self.proxy
        try:
            request = copy.deepcopy(self.sample_task_http)
            # NOTE(review): the credentials are passed as query parameters;
            # presumably that is how pyproxy authenticates -- confirm.
            request['url'] = self.httpbin+'/get?username=binux&password=123456'
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 200, result)
            self.assertEqual(response.orig_url, request['url'])
            self.assertEqual(response.save, request['fetch']['save'])
            self.assertIsNotNone(response.json, response.content)
            self.assertEqual(response.json['headers'].get('A'), 'b',
                             response.json)
            self.assertIn('c=d', response.json['headers'].get('Cookie'),
                          response.json)
            self.assertIn('a=b', response.json['headers'].get('Cookie'),
                          response.json)
        finally:
            # Always reset: a failure here must not leave later tests
            # routed through the proxy.
            self.fetcher.proxy = None

    def test_a140_redirect(self):
        """Redirects are followed; ``orig_url`` keeps the requested URL."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect-to?url=/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.url, self.httpbin+'/get')

    def test_a150_too_much_redirect(self):
        """A chain of 10 redirects is reported as a 599 error."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)
        self.assertIn('redirects followed', response.error)

    def test_a160_cookie(self):
        """Cookies set by the server are merged with the request's cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies,
                         {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'},
                         result)
class TestScheduler(unittest.TestCase):
    """End-to-end tests for Scheduler running in a background thread.

    Tasks and status reports are fed in through queues; dispatched tasks
    are read from ``scheduler2fetcher`` and scheduler state is inspected
    over XML-RPC. The tests share state, and their name prefixes fix the
    order in which unittest runs them.
    """

    taskdb_path = './data/tests/task.db'
    projectdb_path = './data/tests/project.db'
    resultdb_path = './data/tests/result.db'
    check_project_time = 1
    scheduler_xmlrpc_port = 23333

    @classmethod
    def setUpClass(self):
        """Reset the data directory and start the Scheduler thread."""
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        def get_taskdb():
            return taskdb.TaskDB(self.taskdb_path)
        self.taskdb = get_taskdb()

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        self.rpc = xmlrpc_client.ServerProxy(
            'http://localhost:%d' % self.scheduler_xmlrpc_port)

        def run_scheduler():
            # The scheduler thread opens its own database handles rather
            # than sharing the ones the tests use.
            scheduler = Scheduler(taskdb=get_taskdb(),
                                  projectdb=get_projectdb(),
                                  newtask_queue=self.newtask_queue,
                                  status_queue=self.status_queue,
                                  out_queue=self.scheduler2fetcher,
                                  data_path="./data/tests/",
                                  resultdb=get_resultdb())
            # Shorten intervals and limits so the tests run quickly. All
            # overrides are set on the instance: the original assigned
            # DELETE_TIME on the Scheduler class, which leaked the override
            # to every other Scheduler created in the same process.
            scheduler.UPDATE_PROJECT_INTERVAL = 0.1
            scheduler.LOOP_INTERVAL = 0.1
            scheduler.INQUEUE_LIMIT = 10
            scheduler.DELETE_TIME = 0
            scheduler._last_tick = int(time.time())  # not dispatch cronjob
            run_in_thread(scheduler.xmlrpc_run,
                          port=self.scheduler_xmlrpc_port)
            scheduler.run()

        self.process = run_in_thread(run_scheduler)
        # Give the scheduler and its XML-RPC server time to start.
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        """Stop the Scheduler thread and remove the data directory."""
        if self.process.is_alive():
            self.rpc._quit()
            self.process.join(5)
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests', ignore_errors=True)
        time.sleep(1)

    def _assert_nothing_dispatched(self, timeout):
        """Assert that no task reaches the fetcher queue within *timeout*."""
        # Imported locally, as in the original tests, to reach the ``Empty``
        # exception exposed on the project's Queue.
        from pyspider.libs.multiprocessing_queue import Queue
        with self.assertRaises(Queue.Empty):
            self.scheduler2fetcher.get(timeout=timeout)

    def test_10_new_task_ignore(self):
        """A task for a project that does not exist yet is ignored."""
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url'
        })
        self.assertEqual(self.rpc.size(), 0)
        self.assertEqual(len(self.rpc.get_active_tasks()), 0)

    def test_20_new_project(self):
        """Insert the project used by the following tests (status TODO)."""
        self.projectdb.insert('test_project', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })

    def test_30_update_project(self):
        """Switching the project to DEBUG makes the scheduler dispatch."""
        self._assert_nothing_dispatched(timeout=1)
        self.projectdb.update('test_project', status="DEBUG")
        time.sleep(0.1)
        self.rpc.update_project()

        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)
        self.assertEqual(task['url'], 'data:,_on_get_info')

    def test_34_new_not_used_project(self):
        """A newly inserted RUNNING project triggers an ``_on_get_info``."""
        self.projectdb.insert('test_project_not_started', {
            'name': 'test_project_not_started',
            'group': 'group',
            'status': 'RUNNING',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        task = self.scheduler2fetcher.get(timeout=1)
        self.assertEqual(task['taskid'], '_on_get_info')

    def test_35_new_task(self):
        """A new task is dispatched with all of its sections intact."""
        time.sleep(0.2)
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
            },
        })

        time.sleep(0.5)
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertGreater(len(self.rpc.get_active_tasks()), 0)
        self.assertIsNotNone(task)
        self.assertEqual(task['project'], 'test_project')
        self.assertIn('schedule', task)
        self.assertIn('fetch', task)
        self.assertIn('process', task)
        self.assertIn('track', task)
        self.assertEqual(task['fetch']['data'], 'abc')

    def test_37_force_update_processing_task(self):
        """A force-update for an in-flight task must not block later tests."""
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url_force_update',
            'schedule': {
                'age': 0,
                'force_update': True,
            },
        })
        time.sleep(0.2)
        # it should not block next

    def test_40_taskdone_error_no_project(self):
        """A status report for an unknown project changes nothing."""
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'no_project',
            'url': 'url'
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_50_taskdone_error_no_track(self):
        """Status reports with a missing or empty ``track`` change nothing."""
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url'
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {}
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_60_taskdone_failed_retry(self):
        """A task whose processing failed is dispatched again."""
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': False
                },
            }
        })
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

    def test_70_taskdone_ok(self):
        """A fully successful status report empties the scheduler."""
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_80_newtask_age_ignore(self):
        """Re-submitting a finished task within its ``age`` is ignored."""
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 30,
            },
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_82_newtask_via_rpc(self):
        """The same ``age`` rule applies to tasks submitted over RPC."""
        self.rpc.newtask({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 30,
            },
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_90_newtask_with_itag(self):
        """A task carrying an ``itag`` is dispatched again."""
        time.sleep(0.1)
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'itag': "abc",
                'retries': 1
            },
        })

        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

        # Mark the task done again so the following tests start clean.
        self.test_70_taskdone_ok()

    def test_a10_newtask_restart_by_age(self):
        """A task submitted with ``age`` 0 is dispatched again."""
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1
            },
        })
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

    def test_a20_failed_retry(self):
        """With ``retries`` 1 the task is retried once and then dropped."""
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': False
                },
            }
        })
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': False
                },
                'process': {
                    'ok': False
                },
            }
        })
        self._assert_nothing_dispatched(timeout=5)

    def test_a30_task_verify(self):
        """Tasks missing a required field or naming no project are refused."""
        # Missing 'taskid'.
        self.assertFalse(self.rpc.newtask({
            'project': 'test_project',
            'url': 'url',
        }))
        # Missing 'project'.
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            'url': 'url',
        }))
        # Missing 'url'.
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'test_project',
        }))
        # Unknown project.
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'not_exist_project',
            'url': 'url',
        }))
        # Complete task for an existing project.
        self.assertTrue(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'test_project',
            'url': 'url',
        }))

    def test_a40_success_recrawl(self):
        """With ``auto_recrawl`` a successful task is dispatched again."""
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
        })
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

    def test_a50_failed_recrawl(self):
        """With ``auto_recrawl`` a failing task keeps being dispatched."""
        for i in range(3):
            self.status_queue.put({
                'taskid': 'taskid',
                'project': 'test_project',
                'url': 'url',
                'schedule': {
                    'age': 0,
                    'retries': 1,
                    'auto_recrawl': True,
                },
                'track': {
                    'fetch': {
                        'ok': True
                    },
                    'process': {
                        'ok': False
                    },
                }
            })
            task = self.scheduler2fetcher.get(timeout=10)
            self.assertIsNotNone(task)

    def test_a60_disable_recrawl(self):
        """A success report without ``auto_recrawl`` stops the recrawling."""
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
            },
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })
        self._assert_nothing_dispatched(timeout=5)

    def test_x10_inqueue_limit(self):
        """Only INQUEUE_LIMIT (10) of 20 submitted tasks are queued."""
        self.projectdb.insert('test_inqueue_project', {
            'name': 'test_inqueue_project',
            'group': 'group',
            'status': 'DEBUG',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            # Zero rate/burst, presumably so nothing is dispatched and the
            # tasks pile up in the scheduler's queue -- confirm.
            'rate': 0,
            'burst': 0,
        })
        time.sleep(0.1)
        pre_size = self.rpc.size()
        for i in range(20):
            self.newtask_queue.put({
                'taskid': 'taskid%d' % i,
                'project': 'test_inqueue_project',
                'url': 'url',
                'schedule': {
                    'age': 3000,
                    'force_update': True,
                },
            })
        time.sleep(1)
        self.assertEqual(self.rpc.size() - pre_size, 10)

    def test_x20_delete_project(self):
        """A stopped project in the ``delete`` group is removed with its tasks."""
        self.assertIsNotNone(self.projectdb.get('test_inqueue_project'))
        self.projectdb.update('test_inqueue_project', status="STOP",
                              group="lock,delete")
        time.sleep(1)
        self.assertIsNone(self.projectdb.get('test_inqueue_project'))
        self.taskdb._list_project()
        self.assertIsNone(
            self.taskdb.get_task('test_inqueue_project', 'taskid1'))

    def test_z10_startup(self):
        """The scheduler thread survived all previous tests."""
        self.assertTrue(self.process.is_alive())

    def test_z20_quit(self):
        """``_quit`` over RPC stops the thread; the task ends as SUCCESS."""
        self.rpc._quit()
        time.sleep(0.2)
        self.assertFalse(self.process.is_alive())
        self.assertEqual(
            self.taskdb.get_task('test_project', 'taskid')['status'],
            self.taskdb.SUCCESS
        )