def setUpClass(self):
    import tests.data_test_webpage
    import httpbin

    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0',
                                                  port=14887, passthrough_errors=False)
    self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'
    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)
    self.proxy_thread = subprocess.Popen([
        'pyproxy', '--username=binux', '--bind=0.0.0.0',
        '--password=123456', '--port=14830', '--debug'
    ], close_fds=True)
    self.proxy = '127.0.0.1:14830'
def setUpClass(self):
    import tests.data_test_webpage
    import httpbin

    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887,
                                                  passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'
    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)
    self.proxy_thread = subprocess.Popen([
        'pyproxy', '--username=binux', '--password=123456', '--port=14830', '--debug'
    ], close_fds=True)
    self.proxy = '127.0.0.1:14830'
    try:
        self.phantomjs = subprocess.Popen([
            'phantomjs',
            os.path.join(os.path.dirname(__file__),
                         '../pyspider/fetcher/phantomjs_fetcher.js'),
            '25555'
        ])
    except OSError:
        self.phantomjs = None
    time.sleep(0.5)
def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    name:
        name of message queue

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    redis:
        redis://host:6379/db
        redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None
    """
    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)
    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        if ',' in parsed.netloc:
            # redis in cluster mode (there is no concept of 'db' in cluster mode)
            # ex. redis://host1:port1,host2:port2,...,hostn:portn
            cluster_nodes = []
            for netloc in parsed.netloc.split(','):
                cluster_nodes.append({'host': netloc.split(':')[0],
                                      'port': int(netloc.split(':')[1])})
            return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit,
                         cluster_nodes=cluster_nodes)
        else:
            db = parsed.path.lstrip('/').split('/')
            try:
                db = int(db[0])
            except ValueError:
                logging.warning('redis DB must be a zero-based numeric index, using 0 instead')
                db = 0
            password = parsed.password or None
            return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db,
                         maxsize=maxsize, password=password, lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        raise Exception('unknown connection url: %s' % url)
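# Usage sketch for the cluster form handled above (an illustration, not part of
# the module): each host:port pair in the netloc is parsed into one
# cluster_nodes entry. The node addresses here are placeholders, not real
# services.
cluster_queue = connect_message_queue(
    'newtask_queue',
    'redis://10.0.0.1:7000,10.0.0.2:7001,10.0.0.3:7002',
    maxsize=1000,
)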
def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    name:
        name of message queue

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    beanstalk:
        beanstalk://host:11300/
    redis:
        redis://host:6379/db
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None
    """
    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)
    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'beanstalk':
        from .beanstalk import Queue
        return Queue(name, host=parsed.netloc, maxsize=maxsize)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        db = parsed.path.lstrip('/').split('/')
        try:
            db = int(db[0])
        except ValueError:
            logging.warning('redis DB must be a zero-based numeric index, using 0 instead')
            db = 0
        password = parsed.password or None
        return Queue(name, parsed.hostname, parsed.port, db=db, maxsize=maxsize,
                     password=password, lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        raise Exception('unknown connection url: %s' % url)
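# Usage sketch for this variant (an illustration, not part of the module); the
# broker URLs follow the docstring formats above and are placeholders, not real
# services. With url=None the builtin multiprocessing queue is returned, so no
# external broker is needed.
local_q = connect_message_queue('status_queue')  # builtin
amqp_q = connect_message_queue('status_queue',
                               'amqp://guest:guest@localhost:5672/%2F')  # rabbitmq
beanstalk_q = connect_message_queue('status_queue',
                                    'beanstalk://localhost:11300/')  # beanstalk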
def setUpClass(self): shutil.rmtree("./data/tests", ignore_errors=True) os.makedirs("./data/tests") def get_taskdb(): return taskdb.TaskDB(self.taskdb_path) self.taskdb = get_taskdb() def get_projectdb(): return projectdb.ProjectDB(self.projectdb_path) self.projectdb = get_projectdb() def get_resultdb(): return resultdb.ResultDB(self.resultdb_path) self.resultdb = get_resultdb() self.newtask_queue = Queue(10) self.status_queue = Queue(10) self.scheduler2fetcher = Queue(10) self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % self.scheduler_xmlrpc_port) def run_scheduler(): scheduler = Scheduler( taskdb=get_taskdb(), projectdb=get_projectdb(), newtask_queue=self.newtask_queue, status_queue=self.status_queue, out_queue=self.scheduler2fetcher, data_path="./data/tests/", resultdb=get_resultdb(), ) scheduler.UPDATE_PROJECT_INTERVAL = 0.1 scheduler.LOOP_INTERVAL = 0.1 scheduler.INQUEUE_LIMIT = 10 scheduler.DELETE_TIME = 0 scheduler.DEFAULT_RETRY_DELAY = {"": 5} scheduler._last_tick = int(time.time()) # not dispatch cronjob self.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port) scheduler.run() self.process = run_in_thread(run_scheduler) time.sleep(1)
class TestScheduler(unittest.TestCase):
    taskdb_path = './data/tests/task.db'
    projectdb_path = './data/tests/project.db'
    resultdb_path = './data/tests/result.db'
    check_project_time = 1
    scheduler_xmlrpc_port = 23333

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        def get_taskdb():
            return taskdb.TaskDB(self.taskdb_path)
        self.taskdb = get_taskdb()

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % self.scheduler_xmlrpc_port)

        def run_scheduler():
            scheduler = Scheduler(taskdb=get_taskdb(), projectdb=get_projectdb(),
                                  newtask_queue=self.newtask_queue,
                                  status_queue=self.status_queue,
                                  out_queue=self.scheduler2fetcher,
                                  data_path="./data/tests/",
                                  resultdb=get_resultdb())
            scheduler.UPDATE_PROJECT_INTERVAL = 0.1
            scheduler.LOOP_INTERVAL = 0.1
            scheduler.INQUEUE_LIMIT = 10
            scheduler.DELETE_TIME = 0
            scheduler.DEFAULT_RETRY_DELAY = {'': 5}
            scheduler._last_tick = int(time.time())  # not dispatch cronjob
            self.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
            scheduler.run()

        self.process = run_in_thread(run_scheduler)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        if self.process.is_alive():
            self.rpc._quit()
            self.process.join(5)
        self.xmlrpc_thread.join()
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests', ignore_errors=True)
        time.sleep(1)

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(self.scheduler_xmlrpc_port)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)

    def test_10_new_task_ignore(self):
        '''
        task_queue = [ ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
        })  # unknown project: test_project
        self.assertEqual(self.rpc.size(), 0)
        self.assertEqual(len(self.rpc.get_active_tasks()), 0)

    def test_20_new_project(self):
        '''
        task_queue = [ ]
        '''
        self.projectdb.insert('test_project', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })

    def test_30_update_project(self):
        '''
        task_queue = [ ]
        '''
        from six.moves import queue as Queue
        with self.assertRaises(Queue.Empty):
            task = self.scheduler2fetcher.get(timeout=1)
        self.projectdb.update('test_project', status="DEBUG")
        time.sleep(0.1)
        self.rpc.update_project()

        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], '_on_get_info')  # select test_project:_on_get_info data:,_on_get_info

    def test_32_get_info(self):
        self.status_queue.put({
            'taskid': '_on_get_info',
            'project': 'test_project',
            'track': {
                'save': {}
            },
        })  # test_project on_get_info {}

    def test_34_new_not_used_project(self):
        '''
        task_queue = []
        '''
        self.projectdb.insert('test_project_not_started', {
            'name': 'test_project_not_started',
            'group': 'group',
            'status': 'RUNNING',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project_not_started:_on_get_info data:,_on_get_info
        self.assertEqual(task['taskid'], '_on_get_info')

    def test_35_new_task(self):
        '''
        task_queue = [ ]
        '''
        time.sleep(0.2)
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
            },
        })  # new task test_project:taskid url

        # task_queue = [ test_project:taskid ]
        time.sleep(0.5)
        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid
        self.assertGreater(len(self.rpc.get_active_tasks()), 0)
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')
        self.assertEqual(task['project'], 'test_project')
        self.assertIn('schedule', task)
        self.assertIn('fetch', task)
        self.assertIn('process', task)
        self.assertIn('track', task)
        self.assertEqual(task['fetch']['data'], 'abc')

    def test_37_force_update_processing_task(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url_force_update',
            'schedule': {
                'age': 10,
                'force_update': True,
            },
        })  # restart task test_project:taskid url_force_update
        time.sleep(0.2)
        # it should not block next

    def test_40_taskdone_error_no_project(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'no_project',
            'url': 'url',
        })  # unknown project: no_project
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_50_taskdone_error_no_track(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
        })  # Bad status pack: 'track'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {},
        })  # Bad status pack: 'process'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_60_taskdone_failed_retry(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': False},
            },
        })  # task retry 0/3 test_project:taskid url

        from six.moves import queue as Queue
        # with self.assertRaises(Queue.Empty):
        #     task = self.scheduler2fetcher.get(timeout=4)
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(task)

    def test_70_taskdone_ok(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': True},
            },
        })  # task done test_project:taskid url
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_75_on_finished_msg(self):
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:on_finished data:,on_finished
        self.assertEqual(task['taskid'], 'on_finished')
        self.status_queue.put({
            'taskid': 'on_finished',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': True},
            },
        })  # task done test_project:on_finished url
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_80_newtask_age_ignore(self):
        '''
        processing = [ ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 30,
            },
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_82_newtask_via_rpc(self):
        '''
        processing = [ ]
        '''
        self.rpc.newtask({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 30,
            },
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_90_newtask_with_itag(self):
        '''
        task_queue = [ ]
        processing = [ ]
        '''
        time.sleep(0.1)
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'itag': "abc",
                'retries': 1,
            },
        })  # restart task test_project:taskid url

        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')

        self.test_70_taskdone_ok()  # task done test_project:taskid url
        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

    def test_a10_newtask_restart_by_age(self):
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1,
            },
        })  # restart task test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')

    def test_a20_failed_retry(self):
        '''
        processing: [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': False},
            },
        })  # task retry 0/1 test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')

        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {'ok': False},
                'process': {'ok': False},
            },
        })  # task failed test_project:taskid url

        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

        from six.moves import queue as Queue
        with self.assertRaises(Queue.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_a30_task_verify(self):
        self.assertFalse(self.rpc.newtask({
            # 'taskid': 'taskid#',
            'project': 'test_project',
            'url': 'url',
        }))  # taskid not in task: {'project': 'test_project', 'url': 'url'}
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            # 'project': 'test_project',
            'url': 'url',
        }))  # project not in task: {'url': 'url', 'taskid': 'taskid#'}
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'test_project',
            # 'url': 'url',
        }))  # url not in task: {'project': 'test_project', 'taskid': 'taskid#'}
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'not_exist_project',
            'url': 'url',
        }))  # unknown project: not_exist_project
        self.assertTrue(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'test_project',
            'url': 'url',
        }))  # new task test_project:taskid# url

    def test_a40_success_recrawl(self):
        '''
        task_queue = [ test_project:taskid# ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
        })  # restart task test_project:taskid url
        task1 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid# url
        task2 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task1)
        self.assertIsNotNone(task2)
        self.assertTrue(task1['taskid'] == 'taskid#' or task2['taskid'] == 'taskid#')

        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': True},
            },
        })  # task done test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

    def test_a50_failed_recrawl(self):
        '''
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        '''
        for i in range(3):
            self.status_queue.put({
                'taskid': 'taskid',
                'project': 'test_project',
                'url': 'url',
                'schedule': {
                    'age': 0,
                    'retries': 1,
                    'auto_recrawl': True,
                },
                'track': {
                    'fetch': {'ok': True},
                    'process': {'ok': False},
                },
            })
            # not processing pack: test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            task = self.scheduler2fetcher.get(timeout=10)
            self.assertIsNotNone(task)
            self.assertEqual(task['taskid'], 'taskid')

    def test_a60_disable_recrawl(self):
        '''
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
            },
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': True},
            },
        })  # task done test_project:taskid url

        from six.moves import queue as Queue
        with self.assertRaises(Queue.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_38_cancel_task(self):
        current_size = self.rpc.size()
        self.newtask_queue.put({
            'taskid': 'taskid_to_cancel',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'exetime': time.time() + 30,
            },
        })  # new task test_project:taskid_to_cancel url

        # task_queue = [ test_project:taskid_to_cancel ]
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), current_size + 1)

        self.newtask_queue.put({
            'taskid': 'taskid_to_cancel',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'force_update': True,
                'age': 0,
                'cancel': True,
            },
        })  # new cancel test_project:taskid_to_cancel url

        # task_queue = [ ]
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), current_size)

    def test_x10_inqueue_limit(self):
        self.projectdb.insert('test_inqueue_project', {
            'name': 'test_inqueue_project',
            'group': 'group',
            'status': 'DEBUG',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 0,
            'burst': 0,
        })
        time.sleep(0.1)
        pre_size = self.rpc.size()
        for i in range(20):
            self.newtask_queue.put({
                'taskid': 'taskid%d' % i,
                'project': 'test_inqueue_project',
                'url': 'url',
                'schedule': {
                    'age': 3000,
                    'force_update': True,
                },
            })
        time.sleep(1)
        self.assertEqual(self.rpc.size() - pre_size, 10)

    def test_x20_delete_project(self):
        self.assertIsNotNone(self.projectdb.get('test_inqueue_project'))
        # self.assertIsNotNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))
        self.projectdb.update('test_inqueue_project', status="STOP", group="lock,delete")
        time.sleep(1)
        self.assertIsNone(self.projectdb.get('test_inqueue_project'))
        self.taskdb._list_project()
        self.assertIsNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))
        self.assertNotIn('test_inqueue_project', self.rpc.counter('5m', 'sum'))

    def test_z10_startup(self):
        self.assertTrue(self.process.is_alive())

    def test_z20_quit(self):
        self.rpc._quit()
        time.sleep(0.2)
        self.assertFalse(self.process.is_alive())
        self.assertEqual(
            self.taskdb.get_task('test_project', 'taskid')['status'],
            self.taskdb.SUCCESS
        )
class TestProcessor(unittest.TestCase):
    resultdb_path = './data/tests/result.db'

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()
        self.inqueue = Queue(10)

        def run_result_worker():
            self.result_worker = ResultWorker(get_resultdb(), self.inqueue)
            self.result_worker.run()

        self.process = run_in_thread(run_result_worker)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        if self.process.is_alive():
            self.result_worker.quit()
            self.process.join(2)
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def test_10_bad_result(self):
        self.inqueue.put(({'project': 'test_project'}, {}))
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_10_bad_result_2(self):
        self.inqueue.put(({'project': 'test_project'}, {'a': 'b'}))
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_20_insert_result(self):
        data = {
            'a': 'b'
        }
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1',
        }, data))
        time.sleep(0.5)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 1)
        self.assertEqual(self.resultdb.count('test_project'), 1)

        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], data)

    def test_30_overwrite(self):
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1',
        }, "abc"))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], "abc")

    def test_40_insert_list(self):
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id2',
            'url': 'url1',
        }, ['a', 'b']))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id2')
        self.assertEqual(result['result'], ['a', 'b'])
class TestScheduler(unittest.TestCase): taskdb_path = "./data/tests/task.db" projectdb_path = "./data/tests/project.db" resultdb_path = "./data/tests/result.db" check_project_time = 1 scheduler_xmlrpc_port = 23333 @classmethod def setUpClass(self): shutil.rmtree("./data/tests", ignore_errors=True) os.makedirs("./data/tests") def get_taskdb(): return taskdb.TaskDB(self.taskdb_path) self.taskdb = get_taskdb() def get_projectdb(): return projectdb.ProjectDB(self.projectdb_path) self.projectdb = get_projectdb() def get_resultdb(): return resultdb.ResultDB(self.resultdb_path) self.resultdb = get_resultdb() self.newtask_queue = Queue(10) self.status_queue = Queue(10) self.scheduler2fetcher = Queue(10) self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % self.scheduler_xmlrpc_port) def run_scheduler(): scheduler = Scheduler( taskdb=get_taskdb(), projectdb=get_projectdb(), newtask_queue=self.newtask_queue, status_queue=self.status_queue, out_queue=self.scheduler2fetcher, data_path="./data/tests/", resultdb=get_resultdb(), ) scheduler.UPDATE_PROJECT_INTERVAL = 0.1 scheduler.LOOP_INTERVAL = 0.1 scheduler.INQUEUE_LIMIT = 10 scheduler.DELETE_TIME = 0 scheduler.DEFAULT_RETRY_DELAY = {"": 5} scheduler._last_tick = int(time.time()) # not dispatch cronjob self.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port) scheduler.run() self.process = run_in_thread(run_scheduler) time.sleep(1) @classmethod def tearDownClass(self): if self.process.is_alive(): self.rpc._quit() self.process.join(5) self.xmlrpc_thread.join() assert not self.process.is_alive() shutil.rmtree("./data/tests", ignore_errors=True) time.sleep(1) assert not utils.check_port_open(5000) assert not utils.check_port_open(self.scheduler_xmlrpc_port) assert not utils.check_port_open(24444) assert not utils.check_port_open(25555) def test_10_new_task_ignore(self): """ task_queue = [ ] """ self.newtask_queue.put( {"taskid": "taskid", "project": "test_project", "url": "url"} ) # unknown project: test_project self.assertEqual(self.rpc.size(), 0) self.assertEqual(len(self.rpc.get_active_tasks()), 0) def test_20_new_project(self): """ task_queue = [ ] """ self.projectdb.insert( "test_project", { "name": "test_project", "group": "group", "status": "TODO", "script": "import time\nprint(time.time())", "comments": "test project", "rate": 1.0, "burst": 10, }, ) def test_30_update_project(self): """ task_queue = [ ] """ from six.moves import queue as Queue with self.assertRaises(Queue.Empty): task = self.scheduler2fetcher.get(timeout=1) self.projectdb.update("test_project", status="DEBUG") time.sleep(0.1) self.rpc.update_project() task = self.scheduler2fetcher.get(timeout=10) self.assertIsNotNone(task) self.assertEqual(task["taskid"], "_on_get_info") # select test_project:_on_get_info data:,_on_get_info def test_32_get_info(self): self.status_queue.put({"taskid": "_on_get_info", "project": "test_project", "track": {"save": {}}}) # test_project on_get_info {} def test_34_new_not_used_project(self): """ task_queue = [] """ self.projectdb.insert( "test_project_not_started", { "name": "test_project_not_started", "group": "group", "status": "RUNNING", "script": "import time\nprint(time.time())", "comments": "test project", "rate": 1.0, "burst": 10, }, ) task = self.scheduler2fetcher.get(timeout=1) # select test_project_not_started:_on_get_info data:,_on_get_info self.assertEqual(task["taskid"], "_on_get_info") def test_35_new_task(self): """ task_queue = [ ] """ time.sleep(0.2) self.newtask_queue.put( { "taskid": "taskid", 
"project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"age": 0}, } ) # new task test_project:taskid url # task_queue = [ test_project:taskid ] time.sleep(0.5) task = self.scheduler2fetcher.get(timeout=10) # select test_project:taskid self.assertGreater(len(self.rpc.get_active_tasks()), 0) self.assertIsNotNone(task) self.assertEqual(task["taskid"], "taskid") self.assertEqual(task["project"], "test_project") self.assertIn("schedule", task) self.assertIn("fetch", task) self.assertIn("process", task) self.assertIn("track", task) self.assertEqual(task["fetch"]["data"], "abc") def test_37_force_update_processing_task(self): """ processing = [ test_project:taskid ] """ self.newtask_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url_force_update", "schedule": {"age": 10, "force_update": True}, } ) # restart task test_project:taskid url_force_update time.sleep(0.2) # it should not block next def test_40_taskdone_error_no_project(self): """ processing = [ test_project:taskid ] """ self.status_queue.put( {"taskid": "taskid", "project": "no_project", "url": "url"} ) # unknown project: no_project time.sleep(0.1) self.assertEqual(self.rpc.size(), 1) def test_50_taskdone_error_no_track(self): """ processing = [ test_project:taskid ] """ self.status_queue.put({"taskid": "taskid", "project": "test_project", "url": "url"}) # Bad status pack: 'track' time.sleep(0.1) self.assertEqual(self.rpc.size(), 1) self.status_queue.put( {"taskid": "taskid", "project": "test_project", "url": "url", "track": {}} ) # Bad status pack: 'process' time.sleep(0.1) self.assertEqual(self.rpc.size(), 1) def test_60_taskdone_failed_retry(self): """ processing = [ test_project:taskid ] """ self.status_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "track": {"fetch": {"ok": True}, "process": {"ok": False}}, } ) # task retry 0/3 test_project:taskid url from six.moves import queue as Queue # with self.assertRaises(Queue.Empty): # task = self.scheduler2fetcher.get(timeout=4) task = self.scheduler2fetcher.get(timeout=5) # select test_project:taskid url self.assertIsNotNone(task) def test_70_taskdone_ok(self): """ processing = [ test_project:taskid ] """ self.status_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "track": {"fetch": {"ok": True}, "process": {"ok": True}}, } ) # task done test_project:taskid url time.sleep(0.2) self.assertEqual(self.rpc.size(), 0) def test_75_on_finished_msg(self): task = self.scheduler2fetcher.get(timeout=5) # select test_project:on_finished data:,on_finished self.assertEqual(task["taskid"], "on_finished") def test_80_newtask_age_ignore(self): """ processing = [ ] """ self.newtask_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"age": 30}, } ) time.sleep(0.1) self.assertEqual(self.rpc.size(), 0) def test_82_newtask_via_rpc(self): """ processing = [ ] """ self.rpc.newtask( { "taskid": "taskid", "project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"age": 30}, } ) time.sleep(0.1) self.assertEqual(self.rpc.size(), 0) def test_90_newtask_with_itag(self): """ task_queue = [ ] processing = [ ] """ time.sleep(0.1) self.newtask_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"itag": "abc", "retries": 1}, } ) # restart task test_project:taskid url task = 
self.scheduler2fetcher.get(timeout=10) # select test_project:taskid url self.assertIsNotNone(task) self.assertEqual(task["taskid"], "taskid") self.test_70_taskdone_ok() # task done test_project:taskid url self.test_75_on_finished_msg() # select test_project:on_finished data:,on_finished def test_a10_newtask_restart_by_age(self): self.newtask_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"age": 0, "retries": 1}, } ) # restart task test_project:taskid url task = self.scheduler2fetcher.get(timeout=10) # select test_project:taskid url self.assertIsNotNone(task) self.assertEqual(task["taskid"], "taskid") def test_a20_failed_retry(self): """ processing: [ test_project:taskid ] """ self.status_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "track": {"fetch": {"ok": True}, "process": {"ok": False}}, } ) # task retry 0/1 test_project:taskid url task = self.scheduler2fetcher.get(timeout=5) # select test_project:taskid url self.assertIsNotNone(task) self.assertEqual(task["taskid"], "taskid") self.status_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "track": {"fetch": {"ok": False}, "process": {"ok": False}}, } ) # task failed test_project:taskid url self.test_75_on_finished_msg() # select test_project:on_finished data:,on_finished from six.moves import queue as Queue with self.assertRaises(Queue.Empty): self.scheduler2fetcher.get(timeout=5) def test_a30_task_verify(self): self.assertFalse( self.rpc.newtask( { #'taskid': 'taskid#', "project": "test_project", "url": "url", } ) ) # taskid not in task: {'project': 'test_project', 'url': 'url'} self.assertFalse( self.rpc.newtask( { "taskid": "taskid#", #'project': 'test_project', "url": "url", } ) ) # project not in task: {'url': 'url', 'taskid': 'taskid#'} self.assertFalse( self.rpc.newtask( { "taskid": "taskid#", "project": "test_project", #'url': 'url', } ) ) # url not in task: {'project': 'test_project', 'taskid': 'taskid#'} self.assertFalse( self.rpc.newtask({"taskid": "taskid#", "project": "not_exist_project", "url": "url"}) ) # unknown project: not_exist_project self.assertTrue( self.rpc.newtask({"taskid": "taskid#", "project": "test_project", "url": "url"}) ) # new task test_project:taskid# url def test_a40_success_recrawl(self): """ task_queue = [ test_project:taskid# ] """ self.newtask_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"age": 0, "retries": 1, "auto_recrawl": True}, } ) # restart task test_project:taskid url task1 = self.scheduler2fetcher.get(timeout=10) # select test_project:taskid# url task2 = self.scheduler2fetcher.get(timeout=10) # select test_project:taskid url self.assertIsNotNone(task1) self.assertIsNotNone(task2) self.assertTrue(task1["taskid"] == "taskid#" or task2["taskid"] == "taskid#") self.status_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "schedule": {"age": 0, "retries": 1, "auto_recrawl": True}, "track": {"fetch": {"ok": True}, "process": {"ok": True}}, } ) # task done test_project:taskid url task = self.scheduler2fetcher.get(timeout=10) self.assertIsNotNone(task) def test_a50_failed_recrawl(self): """ time_queue = [ test_project:taskid ] scheduler2fetcher = [ test_project:taskid# ] processing = [ test_project:taskid# ] """ for i in range(3): self.status_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "schedule": {"age": 0, 
"retries": 1, "auto_recrawl": True}, "track": {"fetch": {"ok": True}, "process": {"ok": False}}, } ) # not processing pack: test_project:taskid url # select test_project:taskid url # task retry 0/1 test_project:taskid url # select test_project:taskid url # task retry 0/1 test_project:taskid url # select test_project:taskid url task = self.scheduler2fetcher.get(timeout=10) self.assertIsNotNone(task) self.assertEqual(task["taskid"], "taskid") def test_a60_disable_recrawl(self): """ time_queue = [ test_project:taskid ] scheduler2fetcher = [ test_project:taskid# ] processing = [ test_project:taskid# ] """ self.status_queue.put( { "taskid": "taskid", "project": "test_project", "url": "url", "schedule": {"age": 0, "retries": 1}, "track": {"fetch": {"ok": True}, "process": {"ok": True}}, } ) # task done test_project:taskid url from six.moves import queue as Queue with self.assertRaises(Queue.Empty): self.scheduler2fetcher.get(timeout=5) def test_38_cancel_task(self): current_size = self.rpc.size() self.newtask_queue.put( { "taskid": "taskid_to_cancel", "project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"age": 0, "exetime": time.time() + 30}, } ) # new task test_project:taskid_to_cancel url # task_queue = [ test_project:taskid_to_cancel ] time.sleep(0.2) self.assertEqual(self.rpc.size(), current_size + 1) self.newtask_queue.put( { "taskid": "taskid_to_cancel", "project": "test_project", "url": "url", "fetch": {"data": "abc"}, "process": {"data": "abc"}, "schedule": {"force_update": True, "age": 0, "cancel": True}, } ) # new cancel test_project:taskid_to_cancel url # task_queue = [ ] time.sleep(0.2) self.assertEqual(self.rpc.size(), current_size) def test_x10_inqueue_limit(self): self.projectdb.insert( "test_inqueue_project", { "name": "test_inqueue_project", "group": "group", "status": "DEBUG", "script": "import time\nprint(time.time())", "comments": "test project", "rate": 0, "burst": 0, }, ) time.sleep(0.1) pre_size = self.rpc.size() for i in range(20): self.newtask_queue.put( { "taskid": "taskid%d" % i, "project": "test_inqueue_project", "url": "url", "schedule": {"age": 3000, "force_update": True}, } ) time.sleep(1) self.assertEqual(self.rpc.size() - pre_size, 10) def test_x20_delete_project(self): self.assertIsNotNone(self.projectdb.get("test_inqueue_project")) # self.assertIsNotNone(self.taskdb.get_task('test_inqueue_project', 'taskid1')) self.projectdb.update("test_inqueue_project", status="STOP", group="lock,delete") time.sleep(1) self.assertIsNone(self.projectdb.get("test_inqueue_project")) self.taskdb._list_project() self.assertIsNone(self.taskdb.get_task("test_inqueue_project", "taskid1")) self.assertNotIn("test_inqueue_project", self.rpc.counter("5m", "sum")) def test_z10_startup(self): self.assertTrue(self.process.is_alive()) def test_z20_quit(self): self.rpc._quit() time.sleep(0.2) self.assertFalse(self.process.is_alive()) self.assertEqual(self.taskdb.get_task("test_project", "taskid")["status"], self.taskdb.SUCCESS)
class TestProcessor(unittest.TestCase):
    projectdb_path = './data/tests/project.db'

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()

        self.in_queue = Queue(10)
        self.status_queue = Queue(10)
        self.newtask_queue = Queue(10)
        self.result_queue = Queue(10)

        def run_processor():
            self.processor = Processor(get_projectdb(), self.in_queue,
                                       self.status_queue, self.newtask_queue,
                                       self.result_queue)
            self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            self.processor.run()
        self.process = run_in_thread(run_processor)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        if self.process.is_alive():
            self.processor.quit()
            self.process.join(2)
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    # The numeric prefixes (test_10, test_20, ...) make unittest's
    # alphabetical collection run these tests in the intended order.
    def test_10_update_project(self):
        self.assertIsNone(self.processor.project_manager.get('test_project'))
        self.projectdb.insert('test_project', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(self.processor.project_manager.get('test_project'))

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "not_exists",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        self.in_queue.put((task, {}))
        time.sleep(1)

        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['process']['ok'], False)
        self.assertIsNone(self.processor.project_manager.get('not_exists'))

    def test_20_broken_project(self):
        self.assertIsNone(self.processor.project_manager.get('test_broken_project'))
        self.projectdb.insert('test_broken_project', {
            'name': 'test_broken_project',
            'group': 'group',
            'status': 'DEBUG',
            'script': inspect.getsource(sample_handler)[:10],
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(self.processor.project_manager.get('test_broken_project'))
        project_data = self.processor.project_manager.get('test_broken_project')
        self.assertIsNotNone(project_data.get('exception'))

    def test_30_new_task(self):
        self.assertTrue(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)

        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            self.status_queue.get()
        self.assertFalse(self.newtask_queue.empty())

    def test_40_index_page(self):
        task = None
        while not self.newtask_queue.empty():
            task = self.newtask_queue.get()[0]
        self.assertIsNotNone(task)

        fetch_result = {
            "orig_url": task['url'],
            "content": (
                "<html><body>"
                "<a href='http://binux.me'>binux</a>"
                "<a href='http://binux.me/中文'>binux</a>"
                "<a href='http://binux.me/1'>1</a>"
                "<a href='http://binux.me/1'>2</a>"
                "</body></html>"
            ),
            "headers": {'a': 'b', 'etag': 'tag'},
            "status_code": 200,
            "url": task['url'],
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)

        self.assertFalse(self.status_queue.empty())
        self.assertFalse(self.newtask_queue.empty())

        status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['fetch']['time'], 0)
        self.assertEqual(status['track']['fetch']['status_code'], 200)
        self.assertEqual('tag', status['track']['fetch']['headers']['etag'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertEqual(status['track']['process']['ok'], True)
        self.assertGreater(status['track']['process']['time'], 0)
        # four links in the page, but the duplicate 'http://binux.me/1'
        # collapses into a single follow
        self.assertEqual(status['track']['process']['follows'], 3)
        self.assertIsNone(status['track']['process']['result'])
        self.assertEqual(status['track']['process']['logs'], '')
        self.assertIsNone(status['track']['process']['exception'])

        tasks = self.newtask_queue.get()
        self.assertEqual(len(tasks), 3)
        self.assertEqual(tasks[0]['url'], 'http://binux.me/')
        # the non-ASCII link should come out percent-encoded
        self.assertTrue(tasks[1]['url'].startswith('http://binux.me/%'),
                        tasks[1]['url'])

    def test_50_fetch_error(self):
        # clear new task queue
        while not self.newtask_queue.empty():
            self.newtask_queue.get()
        # clear status queue
        while not self.status_queue.empty():
            self.status_queue.get()

        task = {
            "process": {
                "callback": "index_page"
            },
            "project": "test_project",
            "taskid": "data:,test_fetch_error",
            "url": "data:,test_fetch_error"
        }
        fetch_result = {
            "orig_url": task['url'],
            "content": "test_fetch_error",
            "error": "test_fetch_error",
            "headers": {'a': 'b', 'last-modified': '123'},
            "status_code": 598,
            "url": task['url'],
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)

        self.assertFalse(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], False)
        self.assertEqual(status['track']['fetch']['time'], 0)
        self.assertEqual(status['track']['fetch']['status_code'], 598)
        self.assertEqual('123', status['track']['fetch']['headers']['last-modified'])
        self.assertIsNotNone(status['track']['fetch']['content'])
        self.assertEqual(status['track']['process']['ok'], False)
        self.assertGreater(status['track']['process']['time'], 0)
        self.assertEqual(status['track']['process']['follows'], 0)
        self.assertIsNone(status['track']['process']['result'])
        self.assertGreater(len(status['track']['process']['logs']), 0)
        self.assertIsNotNone(status['track']['process']['exception'])

    def test_60_call_broken_project(self):
        # clear new task queue
        while not self.newtask_queue.empty():
            self.newtask_queue.get()
        # clear status queue
        while not self.status_queue.empty():
            self.status_queue.get()

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_broken_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start",
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)

        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)
            self.assertGreater(len(status['track']['process']['logs']), 0)
            self.assertIsNotNone(status['track']['process']['exception'])
        self.assertTrue(self.newtask_queue.empty())

    def test_70_update_project(self):
        self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 1000000
        self.processor.project_manager._check_projects()
        self.assertIsNotNone(self.processor.project_manager.get('test_broken_project'))

        # clear new task queue
        while not self.newtask_queue.empty():
            self.newtask_queue.get()
        # clear status queue
        while not self.status_queue.empty():
            self.status_queue.get()

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_broken_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler),
        })

        # not updated: the task carries no reload signal, so the cached
        # (broken) script is still used
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)

        # updated: a newer project_updatetime forces a reload of the fixed script
        task['project_updatetime'] = time.time()
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], True)

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler)[:10],
        })

        # update signaled by md5: a mismatched project_md5sum also forces a
        # reload, now picking up the broken script again
        task['project_md5sum'] = 'testmd5'
        del task['project_updatetime']
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)

        self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1

    @unittest.skipIf(six.PY3, "deprecated feature, does not work on PY3")
    def test_80_import_project(self):
        self.projectdb.insert('test_project2', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.projectdb.insert('test_project3', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })

        from projects import test_project
        self.assertIsNotNone(test_project)
        self.assertIsNotNone(test_project.Handler)

        from projects.test_project2 import Handler
        self.assertIsNotNone(Handler)

        import projects.test_project3
        self.assertIsNotNone(projects.test_project3.Handler)
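# For reference: the 'script' inserted into projectdb above is the source of
# sample_handler, a fixture imported elsewhere in this module. The sketch
# below shows the general shape such a pyspider handler has; it is
# illustrative only -- the class name and link selector are hypothetical,
# not the actual fixture.
from pyspider.libs.base_handler import BaseHandler


class _SampleHandlerSketch(BaseHandler):
    def on_start(self):
        # seed the crawl; test_30_new_task expects on_start to emit new tasks
        self.crawl('data:,on_start', callback=self.index_page)

    def index_page(self, response):
        # follow every link on the page; duplicate URLs collapse into a single
        # follow, which is why test_40_index_page sees 3 follows from 4 links
        for each in response.doc('a[href]').items():
            self.crawl(each.attr.href, callback=self.index_page)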
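# Reference for the assertions above: every (task, fetch_result) pair consumed
# by the processor yields one status message whose 'track' section has this
# shape (reconstructed from the assertions in TestProcessor, not a full schema):
#
#   status['track']['fetch']   -> {'ok': bool, 'time': float,
#                                  'status_code': int, 'headers': dict or None,
#                                  'content': str or None}
#   status['track']['process'] -> {'ok': bool, 'time': float, 'follows': int,
#                                  'result': object or None, 'logs': str,
#                                  'exception': object or None}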
class TestFetcher(unittest.TestCase):
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen([
            'pyproxy', '--username=binux', '--password=123456',
            '--port=14830', '--debug'
        ], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen([
                'phantomjs',
                os.path.join(os.path.dirname(__file__),
                             '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'
            ])
        except OSError:
            # no phantomjs binary available; phantomjs tests will be skipped
            self.phantomjs = None
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)
        time.sleep(1)

    def test_10_http_get(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_15_http_post(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['form'].get('binux'), '')
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_20_dataurl_get(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_30_with_queue(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_40_with_rpc(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        # utf8 encoding 中文
        request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u'中文', response.json['form'], response.json)

    def test_55_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        # gbk encoding 中文
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/delay/5'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

        response = rebuild_response(result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

    def test_65_418(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 418)
        self.assertIn('teapot', response.text)

    def test_69_no_phantomjs(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        # skip first, so a skipped run never leaves phantomjs_proxy unset
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = None

        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 501, result)

        self.fetcher.phantomjs_proxy = phantomjs_proxy

    def test_70_phantomjs_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        data = json.loads(response.doc('pre').text())
        self.assertIsNotNone(data, response.content)
        self.assertEqual(data['headers'].get('A'), 'b', response.json)
        self.assertEqual(data['headers'].get('Cookie'), 'c=d', response.json)

    def test_75_phantomjs_robots(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)

    def test_80_phantomjs_timeout(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/delay/5'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('js_script_result', result)

    def test_90_phantomjs_js_script(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a110_dns_error(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

    def test_a120_http_get_with_proxy_fail(self):
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get?username=binux&password=123456'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a140_redirect(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/redirect-to?url=/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.url, self.httpbin + '/get')

    def test_a150_too_much_redirect(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 599, result)
        self.assertIn('redirects followed', response.error)

    def test_a160_cookie(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/cookies/set?k1=v1&k2=v2'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies,
                         {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result)

    def test_a170_validate_cert(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['validate_cert'] = False
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['max_redirects'] = 10
        request['url'] = self.httpbin + '/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['robots_txt'] = False
        request['url'] = self.httpbin + '/deny'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)

        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        # point phantomjs_proxy at a port nothing listens on; the fetch
        # should fail cleanly with 599 instead of hanging
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'

        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 599, result)

        self.fetcher.phantomjs_proxy = phantomjs_proxy
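# Usage sketch (not part of the suite): the minimal round trip the tests above
# exercise -- build a task dict, fetch it synchronously, rebuild a Response.
# It reuses the imports this module already has; the queues here are only
# constructor arguments, since sync_fetch does not go through them. The
# function name and defaults are illustrative.
def _demo_sync_fetch(url='data:,hello'):
    demo_fetcher = Fetcher(Queue(1), Queue(1))
    task = {
        'taskid': 'demo',
        'project': 'demo',
        'url': url,
        'fetch': {'method': 'GET', 'timeout': 10, 'save': 'abc'},
        'process': {'callback': 'on_start'},
    }
    result = demo_fetcher.sync_fetch(task)   # blocking fetch, returns a dict
    response = rebuild_response(result)      # dict -> Response object
    return response.status_code, response.text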