def test_task_join_crawl_config(self):
    """crawl_config must merge into the task without clobbering task-level values."""
    task = dict(self.sample_task_http)
    crawl_config = {
        'taskid': 'xxxx',  # must not override the final task's own taskid
        'proxy': 'username:password@hostname:port',  # proxy should be added
        'headers': {  # headers should be merged
            'Cookie': 'abc',  # task's existing Cookie header wins
            'c': 'd',  # new header c should be added
        }
    }
    ret = BaseHandler.task_join_crawl_config(task, crawl_config)
    expected = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'proxy': 'username:password@hostname:port',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b',
                'c': 'd'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }
    self.assertDictEqual(ret, expected)
def on_select_task(self, task):
    """Annotate a selected task with project metadata and hand it to the fetcher."""
    logger.info('select %(project)s:%(taskid)s %(url)s', task)
    project_info = self.projects.get(task['project'])
    assert project_info, 'no such project'
    # stamp pack type and project bookkeeping onto the task before dispatch
    task.update({
        'type': self.TASK_PACK,
        'group': project_info.group,
        'project_md5sum': project_info.md5sum,
        'project_updatetime': project_info.updatetime,
    })
    crawl_config = getattr(project_info, 'crawl_config', None)
    if crawl_config:
        task = BaseHandler.task_join_crawl_config(task, crawl_config)
    project_info.active_tasks.appendleft((time.time(), task))
    self.send_task(task)
    return task
def on_select_task(self, task):
    '''Called when a task is selected to fetch & process'''
    # inject informations about project
    logger.info('select %(project)s:%(taskid)s %(url)s', task)
    project_info = self.projects.get(task['project'])
    assert project_info, 'no such project'
    # copy project bookkeeping fields onto the outgoing task
    for key, value in (
        ('group', project_info.group),
        ('project_md5sum', project_info.md5sum),
        ('project_updatetime', project_info.updatetime),
    ):
        task[key] = value
    # lazy join project.crawl_config
    crawl_config = getattr(project_info, 'crawl_config', None)
    if crawl_config:
        task = BaseHandler.task_join_crawl_config(task, crawl_config)
    project_info.active_tasks.appendleft((time.time(), task))
    self.send_task(task)
    return task
def test_task_join_crawl_config(self):
    """Merging crawl_config into a task adds proxy/headers without touching task-owned keys."""
    task = dict(self.sample_task_http)
    crawl_config = {
        'taskid': 'xxxx',  # must not leak into the final task
        'proxy': 'username:password@hostname:port',  # proxy should be added
        'headers': {  # headers should be merged
            'Cookie': 'abc',  # task's own cookie takes precedence
            'c': 'd',  # header c should be added
        }
    }
    # the expected fetch section after the merge
    expected_fetch = {
        'method': 'GET',
        'proxy': 'username:password@hostname:port',
        'headers': {
            'Cookie': 'a=b',
            'a': 'b',
            'c': 'd'
        },
        'cookies': {
            'c': 'd',
        },
        'timeout': 60,
        'save': 'abc',
    }
    result = BaseHandler.task_join_crawl_config(task, crawl_config)
    self.assertDictEqual(
        result, {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': expected_fetch,
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        })