def _parse(data):
    """Normalise a raw mapping so that keys and byte-string values are text.

    On Python 3 every key is passed through ``utils.text`` and every value
    that is a ``six.binary_type`` is decoded with ``utils.text``; other
    values are kept as they are. On Python 2 the mapping is returned
    untouched.

    :param data: mutable mapping to normalise; it is updated in place.
    :return: the same ``data`` object.
    """
    if not six.PY3:
        return data

    decoded = {}
    for key, value in six.iteritems(data):
        if isinstance(value, six.binary_type):
            value = utils.text(value)
        decoded[utils.text(key)] = value

    # Writing the text keys straight into ``data`` would leave every original
    # bytes key behind next to its decoded twin, so the mapping is rebuilt.
    # It is rebuilt in place so callers holding a reference to ``data`` still
    # observe the normalised content.
    data.clear()
    data.update(decoded)
    return data
def _parse(self, data):
    """Convert binary values of a row to text and deserialize ``result``.

    Every value that is a ``bytearray`` or ``six.binary_type`` is replaced by
    ``utils.text(value)``. If the row has a ``result`` key, its value is then
    parsed with ``json.loads``.

    :param data: mutable mapping holding one row; modified in place.
    :return: the same ``data`` object.
    """
    binary_types = (bytearray, six.binary_type)
    # Iterate over a snapshot because values are reassigned inside the loop.
    for column, raw in list(six.iteritems(data)):
        if not isinstance(raw, binary_types):
            continue
        data[column] = utils.text(raw)

    if "result" not in data:
        return data
    data["result"] = json.loads(data["result"])
    return data
def load_tasks(self, status, project=None, fields=None):
    """Yield the parsed tasks that are registered under ``status``.

    :param status: status value handed to ``self._gen_status_key``.
    :param project: a single project, a list of projects, or ``None`` to use
        ``self.projects``.
    :param fields: optional list of hash fields to fetch with ``hmget``; when
        falsy the whole hash is fetched with ``hgetall``.
    :return: generator of ``self._parse(obj)`` results, one per task whose
        stored hash is non-empty.
    """
    if project is None:
        project_names = self.projects
    else:
        project_names = project if isinstance(project, list) else [project]

    members_of = self.redis.sscan_iter if self.scan_available else self.redis.smembers

    def fetch_selected(key):
        # Fetch only the requested fields; a task with none of them set is
        # reported as missing.
        values = self.redis.hmget(key, fields)
        if any(value is not None for value in values):
            return dict(zip(fields, values))
        return None

    fetch = fetch_selected if fields else self.redis.hgetall

    for name in project_names:
        status_key = self._gen_status_key(name, status)
        for raw_id in members_of(status_key):
            task = fetch(self._gen_key(name, utils.text(raw_id)))
            # Ids whose hash is missing or empty are skipped; they are left
            # in the status set rather than removed.
            if task:
                yield self._parse(task)
def _parse(data):
    """Decode byte-string values of a row and deserialize its ``result``.

    Every ``six.binary_type`` value is replaced by ``utils.text(value)``. If
    the row has a ``result`` key, its value is parsed with ``json.loads``; a
    ``bytearray`` value is decoded as UTF-8 first.

    :param data: mutable mapping holding one row; modified in place.
    :return: the same ``data`` object.
    :raises: whatever ``json.loads`` raises for an invalid ``result`` value.
    """
    # Iterate over a snapshot because values are reassigned inside the loop.
    for key, value in list(six.iteritems(data)):
        if isinstance(value, six.binary_type):
            data[key] = utils.text(value)

    if 'result' in data:
        result = data['result']
        if isinstance(result, bytearray):
            # str(bytearray) only yields the payload on Python 2; on Python 3
            # it is the repr "bytearray(b'...')", which is not valid JSON.
            # bytearray.decode behaves the same on both versions.
            result = result.decode('utf-8')
        data['result'] = json.loads(result)
    return data
def _parse(self, data):
    """Convert binary values of a task row to text and load its JSON columns.

    Every value that is a ``bytearray`` or ``six.binary_type`` is replaced by
    ``utils.text(value)``. Each of the ``schedule``, ``fetch``, ``process``
    and ``track`` keys that is present is then replaced by its parsed JSON
    value, or by an empty dict when the stored value is falsy.

    :param data: mutable mapping holding one row; modified in place.
    :return: the same ``data`` object.
    """
    binary_types = (bytearray, six.binary_type)
    # Iterate over a snapshot because values are reassigned inside the loop.
    for column, raw in list(six.iteritems(data)):
        if isinstance(raw, binary_types):
            data[column] = utils.text(raw)

    for section in ('schedule', 'fetch', 'process', 'track'):
        if section not in data:
            continue
        stored = data[section]
        data[section] = json.loads(stored) if stored else {}
    return data
def _parse(self, data):
    """Turn a raw task hash into a dict with text keys and typed fields.

    On Python 3 a new dict is built whose keys went through ``utils.text``
    and whose ``bytes`` values were decoded with ``utils.text``. Afterwards:

    * ``schedule``, ``fetch``, ``process`` and ``track`` are JSON-decoded, or
      set to ``{}`` when the stored value is falsy;
    * ``status`` is converted with ``int``;
    * ``lastcrawltime`` and ``updatetime`` are converted with ``float``, a
      falsy stored value becoming ``0.0``.

    Only keys that are present are touched.

    :param data: mapping holding the raw task fields.
    :return: the normalised mapping (a new dict on Python 3, ``data`` itself
        otherwise).
    """
    if six.PY3:
        data = dict(
            (utils.text(key), utils.text(value) if isinstance(value, bytes) else value)
            for key, value in data.items()
        )

    for section in ('schedule', 'fetch', 'process', 'track'):
        if section not in data:
            continue
        stored = data[section]
        data[section] = json.loads(stored) if stored else {}

    if 'status' in data:
        data['status'] = int(data['status'])
    for stamp in ('lastcrawltime', 'updatetime'):
        if stamp in data:
            data[stamp] = float(data[stamp] or 0)
    return data
def _parse(data):
    """Decode byte-string values of a task row and load its JSON columns.

    Every ``six.binary_type`` value is replaced by ``utils.text(value)``.
    Each of the ``schedule``, ``fetch``, ``process`` and ``track`` keys that
    is present is then replaced by its parsed JSON value, or by an empty dict
    when the stored value is falsy. A ``bytearray`` value is decoded as UTF-8
    before parsing.

    :param data: mutable mapping holding one row; modified in place.
    :return: the same ``data`` object.
    :raises: whatever ``json.loads`` raises for an invalid column value.
    """
    # Iterate over a snapshot because values are reassigned inside the loop.
    for key, value in list(six.iteritems(data)):
        if isinstance(value, six.binary_type):
            data[key] = utils.text(value)

    for each in ("schedule", "fetch", "process", "track"):
        if each not in data:
            continue
        stored = data[each]
        if not stored:
            data[each] = {}
            continue
        if isinstance(stored, bytearray):
            # str(bytearray) only yields the payload on Python 2; on Python 3
            # it is the repr "bytearray(b'...')", which is not valid JSON.
            # bytearray.decode behaves the same on both versions.
            stored = stored.decode("utf-8")
        data[each] = json.loads(stored)
    return data
def endWrite(self, withErrors):
    """Persist the buffered script to the project database after a write.

    When ``withErrors`` is truthy the error is logged and handling is
    delegated to the parent class. Otherwise nothing happens unless there is
    a write buffer and a configured ``projectdb``. The buffer's ``content``
    attribute (empty string if absent) becomes the project's ``script``; a
    project whose status is ``DEBUG`` or ``RUNNING`` is additionally moved to
    ``CHECKING``. A new project is inserted, an existing one is updated.

    :param withErrors: error information passed by the caller; falsy on
        success.
    :return: the parent's result on error, ``None`` when there is nothing to
        store, otherwise the result of ``projectdb.insert``/``update``.
    """
    if withErrors:
        self.app.logger.error('webdav.endWrite error: %r', withErrors)
        return super(ScriptResource, self).endWrite(withErrors)

    buffered = self.writebuffer
    if not buffered:
        return None
    db = self.app.config['projectdb']
    if not db:
        return None

    changes = {'script': text(getattr(buffered, 'content', ''))}
    if self.project.get('status') in ('DEBUG', 'RUNNING'):
        changes['status'] = 'CHECKING'

    if not self.new_project:
        return db.update(self.project_name, changes)

    # First write of a new project: store the whole record, then clear the
    # flag before inserting.
    self.project.update(changes)
    self.new_project = False
    return db.insert(self.project_name, self.project)
def readonly(self):
    """Tell whether the current request may only read this project.

    The resource is read-only when no ``projectdb`` is configured. It is
    writable when the project's group does not contain ``lock`` or when no
    ``webui_username``/``webui_password`` pair is configured. Otherwise the
    request must carry an ``HTTP_AUTHORIZATION`` header whose base64 payload
    decodes to ``username:password`` matching the configured credentials; a
    missing, malformed or non-matching header makes the resource read-only.

    :return: ``True`` if writes must be refused, ``False`` otherwise.
    """
    config = self.app.config
    projectdb = config['projectdb']
    if not projectdb:
        return True

    groups = projectdb.split_group(self.project.get('group'))
    protected = (
        'lock' in groups
        and config.get('webui_username')
        and config.get('webui_password')
    )
    if not protected:
        return False

    authheader = self.environ.get("HTTP_AUTHORIZATION")
    if not authheader:
        return True

    # Drop the leading scheme prefix; the remainder is the base64 payload.
    authheader = authheader[len("Basic "):]
    try:
        username, password = text(base64.b64decode(authheader)).split(':', 1)
    except Exception as e:
        self.app.logger.error('wrong api key: %r, %r', authheader, e)
        return True

    credentials_match = (
        username == config['webui_username']
        and password == config['webui_password']
    )
    return not credentials_match
def projects(self):
    """Return the set of known project names, refreshed from Redis on expiry.

    The set is reloaded from the ``<prefix>projects`` Redis set when more
    than ``self.UPDATE_PROJECTS_TIME`` seconds have passed since the last
    refresh (or when it has never been loaded); otherwise the cached set is
    returned.

    :return: set of project names as text.
    """
    now = time.time()
    if now - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME:
        self._projects = set(
            utils.text(x)
            for x in self.redis.smembers(self.__prefix__ + 'projects')
        )
        # Remember when the refresh happened; without this the timestamp
        # keeps its default of 0 and every access queries Redis again.
        self._last_update_projects = now
    return self._projects
def on_task(self, task, response):
    """Run the project script for one fetched task and publish the outcome.

    The response is rebuilt with ``rebuild_response``, the project is looked
    up through ``self.project_manager`` and its instance runs the task. Any
    exception raised on the way is captured in a ``ProcessorResult`` instead
    of propagating. Unless the result's ``extinfo`` sets ``not_send_status``,
    a status pack is put on ``self.status_queue``. Follow-up tasks go to
    ``self.newtask_queue`` and each message is delivered by a recursive
    ``on_task`` call targeting the ``_on_message`` callback.

    :param task: task dict; ``taskid`` and ``project`` are read, ``url``,
        ``schedule``, ``project_updatetime`` and ``project_md5sum`` are
        optional.
    :param response: fetch result accepted by ``rebuild_response``.
    :return: ``True``.
    """
    started_at = time.time()
    response = rebuild_response(response)

    try:
        assert "taskid" in task, "need taskid in task"
        project_name = task["project"]
        updatetime = task.get("project_updatetime", None)
        md5sum = task.get("project_md5sum", None)
        project_data = self.project_manager.get(project_name, updatetime, md5sum)
        assert project_data, "no such project!"
        if not project_data.get("exception"):
            outcome = project_data["instance"].run_task(
                project_data["module"], task, response
            )
        else:
            # The project itself is broken: report the stored exception
            # instead of running the task.
            outcome = ProcessorResult(
                logs=(project_data.get("exception_log"),),
                exception=project_data["exception"],
            )
    except Exception as e:
        outcome = ProcessorResult(logs=(traceback.format_exc(),), exception=e)
    elapsed = time.time() - started_at

    if not outcome.extinfo.get("not_send_status", False):
        if outcome.exception:
            # Keep every header on failure.
            track_headers = dict(response.headers)
        else:
            track_headers = dict(
                (name, response.headers[name])
                for name in ("etag", "last-modified")
                if name in response.headers
            )

        status_pack = {
            "taskid": task["taskid"],
            "project": task["project"],
            "url": task.get("url"),
            "track": {
                "fetch": {
                    "ok": response.isok(),
                    "redirect_url": (
                        response.url if response.url != response.orig_url else None
                    ),
                    "time": response.time,
                    "error": response.error,
                    "status_code": response.status_code,
                    "encoding": response.encoding,
                    "headers": track_headers,
                    "content": response.text[:500] if outcome.exception else None,
                },
                "process": {
                    "ok": not outcome.exception,
                    "time": elapsed,
                    "follows": len(outcome.follows),
                    "result": (
                        None
                        if outcome.result is None
                        else utils.text(outcome.result)[: self.RESULT_RESULT_LIMIT]
                    ),
                    "logs": outcome.logstr()[-self.RESULT_LOGS_LIMIT:],
                    "exception": outcome.exception,
                },
                "save": outcome.save,
            },
        }
        if "schedule" in task:
            status_pack["schedule"] = task["schedule"]

        # FIXME: unicode_obj should be applied in the scheduler before the
        # data is stored; it is done here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should be applied in the scheduler before the data
    # is stored; it is done here for performance.
    if outcome.follows:
        self.newtask_queue.put(
            [utils.unicode_obj(newtask) for newtask in outcome.follows]
        )

    for target_project, msg, url in outcome.messages:
        message_task = {
            "taskid": utils.md5string(url),
            "project": target_project,
            "url": url,
            "process": {"callback": "_on_message"},
        }
        message_response = {
            "status_code": 200,
            "url": url,
            "save": (task["project"], msg),
        }
        try:
            self.on_task(message_task, message_response)
        except Exception:
            # A failing message must not stop the remaining ones.
            logger.exception("Sending message error.")

    log = logger.error if outcome.exception else logger.info
    log(
        "process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r"
        % (
            task["project"],
            task["taskid"],
            task.get("url"),
            response.status_code,
            len(response.content),
            outcome.result,
            len(outcome.follows),
            len(outcome.messages),
            outcome.exception,
        )
    )
    return True