def _crawl(self, url, **kwargs):
    """Build a follow-up task dict from *url* and crawl options.

    Resolves ``callback`` (method name or bound method of ``self``),
    layers in per-callback ``_config`` and class-level ``crawl_config``
    defaults, encodes params/data/files into the URL and body, then
    splits the remaining options into the ``schedule`` / ``fetch`` /
    ``process`` sections of the task.  The task is appended to
    ``self._follows`` and also returned.

    Raises:
        NotImplementedError: if ``callback`` does not resolve to a
            method of ``self``.

    NOTE: uses ``basestring``, ``iteritems`` and ``im_self`` — Python 2
    only as written.
    """
    task = {}
    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, basestring) and hasattr(self, callback):
            # callback given by name
            func = getattr(self, callback)
        elif hasattr(callback, 'im_self') and callback.im_self is self:
            # bound method of this handler; store its name in the task
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        if hasattr(func, '_config'):
            # per-callback defaults (set by the @config decorator) — only
            # fill options the caller did not pass explicitly
            for k, v in func._config.iteritems():
                kwargs.setdefault(k, v)

    if hasattr(self, 'crawl_config'):
        # class-wide defaults, lowest priority
        for k, v in self.crawl_config.iteritems():
            kwargs.setdefault(k, v)

    url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
    if kwargs.get('files'):
        assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                        kwargs.get('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        # any request with a body defaults to POST
        kwargs.setdefault('method', 'POST')

    # scheduling-related options
    schedule = {}
    for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'):
        if key in kwargs and kwargs[key] is not None:
            schedule[key] = kwargs[key]
    if schedule:
        task['schedule'] = schedule

    # fetcher-related options
    # NOTE: 'last_modifed' spelling is kept as-is; renaming the key would
    # change the task schema the rest of the system reads.
    fetch = {}
    for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies',
                'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script',
                'load_images', 'fetch_type'):
        if key in kwargs and kwargs[key] is not None:
            fetch[key] = kwargs[key]
    if fetch:
        task['fetch'] = fetch

    # processor-related options
    process = {}
    for key in ('callback', ):
        if key in kwargs and kwargs[key] is not None:
            process[key] = kwargs[key]
    if process:
        task['process'] = process

    task['project'] = self.project_name
    task['url'] = url
    # default taskid is md5 of the final URL
    task['taskid'] = task.get('taskid') or md5string(url)

    self._follows.append(task)
    return task
def _update_project(self, project):
    '''update one project'''
    name = project['name']
    if name not in self.projects:
        self.projects[name] = {}
    record = self.projects[name]
    record.update(project)
    record['md5sum'] = utils.md5string(project['script'])
    if not record.get('active_tasks', None):
        record['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

    # load task queue when project is running and delete task_queue when
    # project is stopped
    if project['status'] in ('RUNNING', 'DEBUG'):
        if name not in self.task_queue:
            self._load_tasks(name)
        self.task_queue[name].rate = project['rate']
        self.task_queue[name].burst = project['burst']

        # ask the processor for project runtime info via a _on_get_info
        # task; the answer comes back in status_page.track.save
        self.on_select_task({
            'taskid': '_on_get_info',
            'project': name,
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', ],
            },
            'process': {
                'callback': '_on_get_info',
            },
        })
    elif name in self.task_queue:
        self.task_queue[name].rate = 0
        self.task_queue[name].burst = 0
        del self.task_queue[name]
def _update_project(self, project):
    '''update one project'''
    name = project['name']
    record = self.projects.setdefault(name, {})
    record.update(project)
    record['md5sum'] = utils.md5string(project['script'])
    if not record.get('active_tasks', None):
        record['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

    running = project['status'] in ('RUNNING', 'DEBUG')
    # load task queue when project is running and delete task_queue when
    # project is stopped
    if running:
        if name not in self.task_queue:
            self._load_tasks(name)
        queue = self.task_queue[name]
        queue.rate = project['rate']
        queue.burst = project['burst']

        # request project runtime info from the processor; the result
        # arrives in status_page.track.save
        self.on_select_task({
            'taskid': '_on_get_info',
            'project': name,
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', 'retry_delay'],
            },
            'process': {
                'callback': '_on_get_info',
            },
        })
    elif name in self.task_queue:
        queue = self.task_queue[name]
        queue.rate = 0
        queue.burst = 0
        del self.task_queue[name]
def crawl_list_page(self, response):
    """Handle a sogou-weixin list page.

    When a captcha is detected, refresh the stored cookies (unless a
    newer cookie already arrived from another worker); otherwise enqueue
    every article anchor on the page as a detail-page task.

    Raises:
        Exception: when captcha verification fails to produce cookies.
    """
    # FIX: removed Python-2-only debug `print` statements (syntax errors
    # under Python 3) and commented-out dead code.
    db_cookie = self.get_cookie() or {}
    r_cookie = response.cookies
    db_ctime = int(db_cookie.get('ctime', 0))
    r_ctime = int(r_cookie.get('ctime', 0))
    if self.check_captcha(response):
        # only re-verify when the stored cookie is not newer than the one
        # carried by this response
        if db_ctime <= r_ctime:
            db_cookie = self.verify_vcode(response)
            if not db_cookie:
                raise Exception('sougou_weixin refresh cookies fail!')
    else:
        # deliberately NOT merging response.cookies into the stored
        # cookies: updating them breaks the redirect to the detail page
        for each in response.doc(self.LIST_ANCHOR_SEL).items():
            taskid = md5string(each.text())
            self.crawl(each.attr.href, taskid=taskid, callback=self.detail_page,
                       save=response.save, cookies=response.cookies)
def update(self, project_info):
    """Refresh cached project info and throttle settings from *project_info*."""
    self.project_info = project_info
    self.name = project_info['name']
    self.group = project_info['group']
    self.db_status = project_info['status']
    self.updatetime = project_info['updatetime']

    new_sum = utils.md5string(project_info['script'])
    if new_sum != self.md5sum:
        # script changed: remember we owe the processor a get-info round
        self.waiting_get_info = True
        self.md5sum = new_sum
    if self.waiting_get_info and self.active:
        self._send_on_get_info = True

    # inactive projects are throttled to zero
    rate, burst = ((project_info['rate'], project_info['burst'])
                   if self.active else (0, 0))
    self.task_queue.rate = rate
    self.task_queue.burst = burst

    logger.info('project %s updated, status:%s, paused:%s, %d tasks',
                self.name, self.db_status, self.paused, len(self.task_queue))
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send Message to project from command line
    """
    rpc = scheduler_rpc
    if isinstance(rpc, six.string_types):
        rpc = connect_rpc(ctx, None, rpc)
    if rpc is None and os.environ.get('SCHEDULER_NAME'):
        # docker-link style address exported by the scheduler container
        address = os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]
        rpc = connect_rpc(ctx, None, 'http://%s/' % address)
    if rpc is None:
        # last resort: local scheduler on the default port
        rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    message_task = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        },
    }
    return rpc.send_task(message_task)
def _update_project(self, project):
    """update one project"""
    name = project["name"]
    record = self.projects.setdefault(name, {})
    record.update(project)
    record["md5sum"] = utils.md5string(project["script"])
    if not record.get("active_tasks", None):
        record["active_tasks"] = deque(maxlen=self.ACTIVE_TASKS)

    # load task queue when project is running and delete task_queue when
    # project is stopped
    if project["status"] in ("RUNNING", "DEBUG"):
        if name not in self.task_queue:
            self._load_tasks(name)
        self.task_queue[name].rate = project["rate"]
        self.task_queue[name].burst = project["burst"]
        # ask the processor for runtime info; the reply lands in
        # status_page.track.save
        self.on_select_task(
            {
                "taskid": "_on_get_info",
                "project": name,
                "url": "data:,_on_get_info",
                "status": self.taskdb.SUCCESS,
                "fetch": {"save": ["min_tick"]},
                "process": {"callback": "_on_get_info"},
            }
        )
    elif name in self.task_queue:
        self.task_queue[name].rate = 0
        self.task_queue[name].burst = 0
        del self.task_queue[name]
def save(self, project, taskid, url, result, capture_phase):
    """Persist one parsed result row for *project*.

    *result* is a ``(primary_key_name, values_dict)`` pair.  When the
    named key is usable it identifies the row; otherwise a synthetic
    ``suid`` (md5 over every value) is used.  Existing rows are updated,
    new ones are inserted via replace.  Best-effort: database errors are
    logged, not raised.  ``capture_phase`` is accepted for interface
    compatibility but unused here.
    """
    # FIX: Python-2-only `except Exception, e` syntax, `== None`
    # comparison, an unused `where_values` local, and leftover debug
    # `print` statements (Py2-only syntax) removed.
    import logging

    tablename = self._tablename(project)
    key = result[0]    # primary-key column name (may be falsy)
    items = result[1]  # the result dictionary
    if project not in self.projects:
        self._create_project(project, key, items)
        self._list_project()

    obj_result = result[1]
    if key and key not in ['taskid', 'url']:
        obj = {
            key: str(obj_result[key][0] if isinstance(obj_result[key], list)
                     else obj_result[key]),
            'taskid': taskid,
            'url': url,
            'updatetime': time.time(),
        }
        obj_result.update(obj)
        key_value = obj_result[key]
    else:
        obj = {
            'taskid': taskid,
            'url': url,
            'updatetime': time.time(),
        }
        # no natural key: derive a synthetic id from every value
        suid = ''
        for k in obj_result.keys():
            suid += str(obj_result[k][0] if isinstance(obj_result[k], list)
                        else obj_result[k])
        suid = utils.md5string(suid)
        obj_result.update(obj)
        obj_result['suid'] = suid
        key_value = suid
        key = 'suid'

    # flatten one-element lists and normalise None to '' so every column
    # stores a plain string
    for o in obj_result.keys():
        if obj_result[o] is None:
            obj_result[o] = ''
        else:
            obj_result[o] = str(obj_result[o][0] if isinstance(obj_result[o], list)
                                else obj_result[o])

    try:
        obj_result_copy = copy.deepcopy(obj_result)
        del obj_result_copy['updatetime']  # never match/compare on timestamp
        fields = tuple(obj_result_copy.keys())
        tasks = self.get(project, key, key_value, fields)
        if tasks:
            where = "%s = %s" % (self.escape(key), self.placeholder)
            return self._update(tablename, where=where,
                                where_values=[obj_result_copy[key]],
                                **self._stringify(obj_result_copy))
        return self._replace(tablename, **self._stringify(obj_result))
    except Exception:
        # best-effort persistence: log and swallow, as before
        logging.exception('save result for project %s failed', project)
def test_put(n):
    """Benchmark: put *n* synthetic tasks onto the message queue."""
    logger.info("message queue put %d", n)
    began = time.time()
    for seq in range(n):
        task['url'] = 'http://bench.pyspider.org/?l=%d' % seq
        task['taskid'] = md5string(task['url'])
        queue.put(task, block=True, timeout=1)
    spent = time.time() - began
    logger.info("cost %.2fs, %.2f/s %.2fms",
                spent, n * 1.0 / spent, spent / n * 1000)
def test_insert(n, start=0):
    """Benchmark: insert *n* fresh tasks into the taskdb, starting at *start*."""
    logger.info("taskdb insert %d", n)
    began = time.time()
    for offset in range(n):
        task['url'] = 'http://bench.pyspider.org/?l={0:d}'.format((offset + start))
        task['taskid'] = md5string(task['url'])
        task['track'] = {}
        taskdb.insert(task['project'], task['taskid'], task)
    spent = time.time() - began
    logger.info("cost %.2fs, %.2f/s %.2fms",
                spent, n * 1.0 / spent, spent / n * 1000)
def test_update(n, start=0):
    """Benchmark: update *n* existing tasks in the taskdb.

    Args:
        n: number of tasks to update.
        start: offset added to the synthetic URL sequence.
    """
    # FIX: lazy %-args instead of eager "..." % n — consistent with the
    # sibling benchmarks and formatted only if the level is enabled.
    logger.info("taskdb update %d", n)
    start_time = time.time()
    for i in range(n):
        task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
        task['taskid'] = md5string(task['url'])
        task['track'] = track
        taskdb.update(task['project'], task['taskid'], task)
    end_time = time.time()
    cost_time = end_time - start_time
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, n * 1.0 / cost_time, cost_time / n * 1000)
def _update_project(self, project):
    """update one project

    Rebuilds the project record, refreshes its md5sum and active-task
    buffer, and starts or tears down its task queue according to status.
    """
    if project['name'] not in self.projects:
        self.projects[project['name']] = {}
    try:
        project = rebuild_project(project)
    except Exception as e:
        # lazy %-args: formatted only when the record is emitted
        logger.error("update project fail: %s", str(e))
        return
    self.projects[project['name']].update(project)
    self.projects[project['name']]['md5sum'] = utils.md5string(
        project['script'])
    if not self.projects[project['name']].get('active_tasks', None):
        # ACTIVE_TASKS == 0 means "unbounded history" in this code base
        if self.ACTIVE_TASKS == 0:
            self.projects[project['name']]['active_tasks'] = deque()
        else:
            self.projects[project['name']]['active_tasks'] = deque(
                maxlen=self.ACTIVE_TASKS)

    # load task queue when project is running and delete task_queue when project is stoped
    if project['status'] in ('RUNNING', 'DEBUG'):
        if project['name'] not in self.task_queue:
            self._load_tasks(project['name'])
        self.task_queue[project['name']].rate = project['rate']
        self.task_queue[project['name']].burst = project['burst']

        # update project runtime info from processor by sending a _on_get_info
        # request, result is in status_page.track.save
        self.on_select_task({
            'taskid': '_on_get_info',
            'project': project['name'],
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', 'retry_delay'],
            },
            'process': {
                'callback': '_on_get_info',
            },
        })
        logger.info('get info of project: %s', project['name'])
    else:
        if project['name'] in self.task_queue:
            self.task_queue[project['name']].rate = 0
            self.task_queue[project['name']].burst = 0
            del self.task_queue[project['name']]
            logger.info('delete queue of project: %s', project['name'])

    # BUG FIX: compare the project *name* against the counter keys.
    # The original tested `project not in self._cnt['all']` with the
    # project dict itself, which is unhashable and can never be a member,
    # so _update_project_cnt ran on every update.
    if project['name'] not in self._cnt['all']:
        self._update_project_cnt(project['name'])
def test_get(n, start=0, random=True, fields=request_task_fields):
    """Benchmark: fetch *n* tasks from the taskdb.

    Args:
        n: number of tasks to read.
        start: offset added to the synthetic URL sequence.
        random: shuffle the access order when True.
            (NOTE: this parameter shadows the stdlib ``random`` module
            inside the function body; kept for interface compatibility.)
        fields: columns to request from the taskdb.
    """
    # FIX: lazy %-args instead of eager "..." % (...) formatting,
    # consistent with the other benchmarks.
    logger.info("taskdb get %d %s", n, "randomly" if random else "")
    range_n = list(range(n))
    if random:
        from random import shuffle
        shuffle(range_n)
    start_time = time.time()
    for i in range_n:
        task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
        task['taskid'] = md5string(task['url'])
        task['track'] = track
        taskdb.get_task(task['project'], task['taskid'], fields=fields)
    end_time = time.time()
    cost_time = end_time - start_time
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, n * 1.0 / cost_time, cost_time / n * 1000)
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send Message to project from command line
    """
    rpc = scheduler_rpc
    if isinstance(rpc, six.string_types):
        rpc = connect_rpc(ctx, None, rpc)
    if rpc is None and os.environ.get("SCHEDULER_NAME"):
        # docker-link style address exported by the scheduler container
        address = os.environ["SCHEDULER_PORT_23333_TCP"][len("tcp://"):]
        rpc = connect_rpc(ctx, None, "http://%s/" % address)
    if rpc is None:
        rpc = connect_rpc(ctx, None, "http://127.0.0.1:23333/")

    payload = {
        "taskid": utils.md5string("data:,on_message"),
        "project": project,
        "url": "data:,on_message",
        "fetch": {"save": ("__command__", message)},
        "process": {"callback": "_on_message"},
    }
    return rpc.send_task(payload)
def update(self, project_info):
    """Apply fresh project info; flag an info refresh when the script changed."""
    self.project_info = project_info
    self.name = project_info['name']
    self.group = project_info['group']
    self.db_status = project_info['status']
    self.updatetime = project_info['updatetime']

    new_sum = utils.md5string(project_info['script'])
    needs_info = self.md5sum != new_sum or self.waiting_get_info
    if needs_info and self.active:
        self._send_on_get_info = True
        self.waiting_get_info = True
    self.md5sum = new_sum

    # inactive projects are throttled to zero
    if self.active:
        self.task_queue.rate = project_info['rate']
        self.task_queue.burst = project_info['burst']
    else:
        self.task_queue.rate = 0
        self.task_queue.burst = 0
def send_message(ctx, scheduler_rpc, project, message):
    """Send *message* to *project* through the scheduler RPC endpoint."""
    rpc = scheduler_rpc
    if isinstance(rpc, six.string_types):
        rpc = connect_rpc(ctx, None, rpc)
    if rpc is None and os.environ.get('SCHEDULER_NAME'):
        # docker-link style address exported by the scheduler container
        address = os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]
        rpc = connect_rpc(ctx, None, 'http://%s/' % address)
    if rpc is None:
        rpc = connect_rpc(ctx, None, 'http://localhost:23333/')

    payload = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        },
    }
    return rpc.send_task(payload)
def _load_project(self, project):
    """Build *project*'s module and cache it in self.projects.

    Returns True on success; on failure caches an error record and
    returns False.
    """
    try:
        project['md5sum'] = utils.md5string(project['script'])
        self.projects[project['name']] = self.build_module(project, self.env)
    except Exception as err:
        logger.exception("load project %s error", project.get('name', None))
        # cache a placeholder carrying the failure details
        self.projects[project['name']] = {
            'loader': None,
            'module': None,
            'class': None,
            'instance': None,
            'exception': err,
            'exception_log': traceback.format_exc(),
            'info': project,
            'load_time': time.time(),
        }
        return False
    logger.debug('project: %s updated.', project.get('name', None))
    return True
def _load_project(self, project):
    '''Load project into self.projects from project info dict'''
    try:
        project['md5sum'] = utils.md5string(project['script'])
        built = self.build_module(project, self.env)
        self.projects[project['name']] = built
    except Exception as err:
        logger.exception("load project %s error", project.get('name', None))
        # record the failure so callers can surface exception/exception_log
        failure = {
            'loader': None,
            'module': None,
            'class': None,
            'instance': None,
            'exception': err,
            'exception_log': traceback.format_exc(),
            'info': project,
            'load_time': time.time(),
        }
        self.projects[project['name']] = failure
        return False
    logger.debug('project: %s updated.', project.get('name', None))
    return True
def add_seed(self, seed_path, project, callback):
    """Read seed URLs (one per line) from *seed_path* and enqueue them as
    force-updated tasks for *project*, to be processed by *callback*.

    Tasks are pushed to the new-task queue in chunks of 1000.
    """
    from pyspider.libs.utils import md5string
    new_tasks = []
    with open(seed_path) as fi:
        for line in fi:
            # NOTE(review): str.decode is Python 2 only — confirm interpreter
            url = line.strip().decode('utf-8')
            if not url:
                # FIX: skip blank lines instead of enqueueing empty-URL tasks
                continue
            # FIX: build the task as one literal; the original used
            # setdefault('schedule', ...) on a freshly created dict,
            # which is just an assignment.
            new_tasks.append({
                'url': url,
                'project': project,
                'status': 1,
                'schedule': {'force_update': True},
                'taskid': md5string(url),
                'fetch': {},
                'process': {'callback': callback},
                'depth': 0,
            })
    # chunk the queue puts to bound message size
    for each in (new_tasks[x:x + 1000] for x in range(0, len(new_tasks), 1000)):
        self.newtask_queue.put(
            [utils.unicode_obj(newtask) for newtask in each])
    # lazy %-args, consistent with library logging style
    logger.info("add seed success for project %s!", project)
def update(self, project_info): self.project_info = project_info self.name = project_info['name'] self.group = project_info['group'] self.db_status = project_info['status'] self.updatetime = project_info['updatetime'] md5sum = utils.md5string(project_info['script']) if (self.md5sum != md5sum or self.waiting_get_info) and self.active: self._send_on_get_info = True self.waiting_get_info = True self.md5sum = md5sum if self.active: self.task_queue.rate = project_info['rate'] self.task_queue.burst = project_info['burst'] else: self.task_queue.rate = 0 self.task_queue.burst = 0 logger.info('project %s updated, status:%s, paused:%s, %d tasks', self.name, self.db_status, self.paused, len(self.task_queue))
def get_taskid(self, task):
    """Derive the task id: md5 of the task URL.  Override for custom
    de-duplication keys."""
    url = task['url']
    return md5string(url)
def on_task(self, task, response):
    """Run one fetched task through its project handler and fan out the
    results: a status pack, follow-up tasks, and cross-project messages.

    Returns True when the handler ran (handler exceptions are reported
    inside the track), False when setup failed before the handler ran.
    """
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        if project not in self.projects:
            raise LookupError("no such project: %s" % project)
        project_data = self.projects[project]
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'time': response.time,
                    'status_code': response.status_code,
                    'headers': dict(response.headers),
                    'encoding': response.encoding,
                    # keep a content sample only when something went wrong
                    'content': (
                        response.content[:500]
                        if not response.isok() or ret.exception
                        else None
                    ),
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    # NOTE: `unicode` is Python 2 only
                    'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
                    'follows': len(ret.follows),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
            },
        }
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    for newtask in ret.follows:
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.newtask_queue.put(utils.unicode_obj(newtask))

    # route messages to other projects as synthetic `_on_message` tasks
    for project, msg, url in ret.messages:
        self.inqueue.put(({
            'taskid': utils.md5string(url),
            'project': project,
            'url': url,
            'process': {
                'callback': '_on_message',
            }
        }, {
            'status_code': 200,
            'url': url,
            'save': (task['project'], msg),
        }))

    if response.error or ret.exception:
        logger_func = logger.error
    else:
        logger_func = logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))
    return True
'time': process_time, 'follows': len(ret.follows), 'result': unicode(ret.result)[:100], 'logs': ret.logstr()[-200:], 'exception': ret.exception, }, }, }) self.status_queue.put(status_pack) for newtask in ret.follows: self.newtask_queue.put(newtask) for project, msg, url in ret.messages: self.inqueue.put(({ 'taskid': utils.md5string(url), 'project': project, 'url': url, 'process': { 'callback': '_on_message', } }, { 'status_code': 200, 'url': url, 'save': (task['project'], msg), })) if response.error or ret.exception: logger_func = logger.error else: logger_func = logger.info
def on_task(self, task, response):
    '''Deal one task

    Runs the task through its project handler (or reports the project's
    build error as the result), then emits a status pack, chunked
    follow-up tasks, and recursively-delivered messages.  Always returns
    True; handler failures travel inside the track.
    '''
    start_time = time.time()
    response = rebuild_response(response)

    try:
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('project_updatetime', None)
        md5sum = task.get('project_md5sum', None)
        project_data = self.project_manager.get(project, updatetime, md5sum)
        assert project_data, "no such project!"
        if project_data.get('exception'):
            # the project script failed to build — surface that error
            ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                  exception=project_data['exception'])
        else:
            ret = project_data['instance'].run_task(
                project_data['module'], task, response)
    except Exception as e:
        logstr = traceback.format_exc()
        ret = ProcessorResult(logs=(logstr, ), exception=e)
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        if ret.exception:
            # failed run: keep every header for debugging
            track_headers = dict(response.headers)
        else:
            # success: keep only the cache-validation headers
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': getattr(response, '_encoding', None),
                    'headers': track_headers,
                    # keep a content sample only on failure
                    'content': response.text[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (None if ret.result is None
                               else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
                'save': ret.save,
            },
        }
        if 'schedule' in task:
            status_pack['schedule'] = task['schedule']

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    if ret.follows:
        # chunk follow-ups 1000 at a time to bound queue message size
        for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
            self.newtask_queue.put(
                [utils.unicode_obj(newtask) for newtask in each])

    # deliver messages by recursing with synthetic `_on_message` tasks
    for project, msg, url in ret.messages:
        try:
            self.on_task(
                {
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                    }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                })
        except Exception as e:
            logger.exception('Sending message error.')
            continue

    if ret.exception:
        logger_func = logger.error
    else:
        logger_func = logger.info
    logger_func(
        'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
        % (task['project'], task['taskid'], task.get('url'),
           response.status_code, len(response.content),
           ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True
def on_task(self, task, response):
    """Run one fetched task through its project handler; push status,
    follow-up tasks and messages to the downstream queues.

    Returns False when the project is unknown or the run raised before
    producing a result, True otherwise.
    """
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('updatetime', None)
        project_data = self.project_manager.get(project, updatetime)
        if not project_data:
            logger.error("no such project: %s", project)
            return False
        ret = project_data['instance'].run(project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'time': response.time,
                    'status_code': response.status_code,
                    'headers': dict(response.headers),
                    'encoding': response.encoding,
                    # keep a content sample only when the handler failed
                    'content': (response.content[:500] if ret.exception else None),
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': utils.text(ret.result)[:self.RESULT_RESULT_LIMIT],
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
            },
        }
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    self.newtask_queue.put(
        [utils.unicode_obj(newtask) for newtask in ret.follows])

    # cross-project messages become synthetic `_on_message` tasks
    for project, msg, url in ret.messages:
        self.inqueue.put(({
            'taskid': utils.md5string(url),
            'project': project,
            'url': url,
            'process': {
                'callback': '_on_message',
            }
        }, {
            'status_code': 200,
            'url': url,
            'save': (task['project'], msg),
        }))

    if response.error or ret.exception:
        logger_func = logger.error
    else:
        logger_func = logger.info
    logger_func(
        'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
        % (task['project'], task['taskid'], task.get('url'),
           response.status_code, len(response.content),
           ret.result, len(ret.follows), len(ret.messages), ret.exception))
    return True
def save(self, project, taskid, url, result, capture_phase): key = result[0] key_value = None items = result[1] if project not in self.projects: self._create_project(project, key, items) self._list_project() self.tables[project] = copy.deepcopy(self.table) #if self.table is None: if not self.tables.has_key(project): self._init_table(project, key, items) self.tables[project] = copy.deepcopy(self.table) table = self.tables[project] table.name = self._tablename(project) obj_result = result[1] if key and key not in ['taskid', 'url']: obj = { key: result[1][key][0] if isinstance(result[1][key], list) else result[1][key], 'taskid': taskid, 'url': url, 'updatetime': time.time(), } obj_result.update(obj) key_value = result[1][key][0] if isinstance( result[1][key], list) else result[1][key] else: key = 'suid' obj = { 'taskid': taskid, 'url': url, 'updatetime': time.time(), } #计算suid suid = '' for k in obj_result.keys(): suid += str(obj_result[k][0] if isinstance( obj_result[k], list) else obj_result[k]) suid = utils.md5string(suid) obj_result.update(obj) obj_result['suid'] = suid key_value = suid #将unicode转换成utf-8 for o in obj_result.keys(): if obj_result[o] == None: obj_result[o] = '' else: obj_result[o] = str(obj_result[o][0] if isinstance( obj_result[o], list) else obj_result[o]) obj_result['recently'] = 0 try: obj_result_copy = copy.deepcopy(obj_result) del obj_result_copy['updatetime'] fields = tuple(obj_result_copy.keys()) tasks = self.get(project, key, key_value, fields) if tasks: need_update = False for key in fields: if str(obj_result_copy[key]) != str(tasks[key.upper()]): #need update need_update = True column = None for x in table.c: if key in str(x): column = x if column is not None and need_update: try: sql = table.update().where(column == key_value).values( **self._stringify(obj_result)) return self.engine.execute(sql) except Exception, e: logging.error('update data failed. error: %s %s' % (e, str(obj_result))) else:
def get_taskid(self, task):
    """Task id = md5(url + serialized POST data), so identical URLs with
    different payloads schedule as distinct tasks."""
    post_data = task['fetch'].get('data', '')
    seed = task['url'] + json.dumps(post_data)
    return md5string(seed)
def get_taskid(self, task):
    """Default task id: md5 digest of the task URL (override for custom keys)."""
    return md5string(task["url"])
def on_task(self, task, response):
    '''Deal one task

    Runs the task through its project handler (or reports the project's
    build error), then emits the status pack, chunked follow-up tasks,
    and recursively-delivered messages.  Always returns True; handler
    failures travel inside the track.
    '''
    start_time = time.time()
    response = rebuild_response(response)
    try:
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('project_updatetime', None)
        md5sum = task.get('project_md5sum', None)
        project_data = self.project_manager.get(project, updatetime, md5sum)
        assert project_data, "no such project!"
        if project_data.get('exception'):
            # the project script failed to build — surface that error
            ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                  exception=project_data['exception'])
        else:
            ret = project_data['instance'].run_task(
                project_data['module'], task, response)
    except Exception as e:
        logstr = traceback.format_exc()
        ret = ProcessorResult(logs=(logstr, ), exception=e)
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        if ret.exception:
            # failed run: keep every header for debugging
            track_headers = dict(response.headers)
        else:
            # success: keep only the cache-validation headers
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': response.encoding,
                    'headers': track_headers,
                    # keep a content sample only on failure
                    'content': response.text[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (
                        None if ret.result is None
                        else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                    ),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
                'save': ret.save,
            },
        }
        if 'schedule' in task:
            status_pack['schedule'] = task['schedule']

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    if ret.follows:
        # chunk follow-ups 1000 at a time to bound queue message size
        for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
            self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])

    # deliver messages by recursing with synthetic `_on_message` tasks
    for project, msg, url in ret.messages:
        try:
            self.on_task({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            })
        except Exception as e:
            logger.exception('Sending message error.')
            continue

    if ret.exception:
        logger_func = logger.error
    else:
        logger_func = logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))
    return True
'time': process_time, 'follows': len(ret.follows), 'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT], 'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:], 'exception': ret.exception, }, }, }) self.status_queue.put(status_pack) for newtask in ret.follows: self.newtask_queue.put(newtask) for project, msg, url in ret.messages: self.inqueue.put(({ 'taskid': utils.md5string(url), 'project': project, 'url': url, 'process': { 'callback': '_on_message', } }, { 'status_code': 200, 'url': url, 'save': (task['project'], msg), })) if response.error or ret.exception: logger_func = logger.error else: logger_func = logger.info
def on_task(self, task, response):
    """Process one fetched task; emit status, follow-ups and messages.

    Returns False when the project is unknown or the run raised before
    producing a result, True otherwise.
    """
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('updatetime', None)
        project_data = self.project_manager.get(project, updatetime)
        if not project_data:
            logger.error("no such project: %s", project)
            return False
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        if ret.exception:
            # failed run: keep every header for debugging
            track_headers = dict(response.headers)
        else:
            # success: keep only the cache-validation headers
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': response.encoding,
                    'headers': track_headers,
                    # keep a content sample only on failure
                    'content': response.content[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (
                        None if ret.result is None
                        else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                    ),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
            },
        }
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    if ret.follows:
        self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in ret.follows])

    # cross-project messages become synthetic `_on_message` tasks
    for project, msg, url in ret.messages:
        self.inqueue.put(({
            'taskid': utils.md5string(url),
            'project': project,
            'url': url,
            'process': {
                'callback': '_on_message',
            }
        }, {
            'status_code': 200,
            'url': url,
            'save': (task['project'], msg),
        }))

    if response.error or ret.exception:
        logger_func = logger.error
    else:
        logger_func = logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))
    return True
def get_taskid(self, task):
    """md5 over url + POST data + today's date + a version tag, so the
    same request re-crawls once per day and old ids retire on version
    bumps."""
    parts = (
        task['url'],
        json.dumps(task['fetch'].get('data', '')),
        str(datetime.date.today()),
        'v7.0',
    )
    return md5string(''.join(parts))
def get_taskid(self, task):
    """md5 over the url plus serialized POST data and query params, so
    requests differing only in payload or params get distinct ids."""
    fetch = task['fetch']
    seed = (task['url']
            + json.dumps(fetch.get('data', ''))
            + json.dumps(fetch.get('params', '')))
    return md5string(seed)