Beispiel #1
0
 def _parse(data):
     """Convert the keys and binary values of ``data`` to text on Python 3.

     The mapping is modified in place and the same object is returned.
     On Python 2 the mapping is returned untouched.

     Keys are passed through ``utils.text``; values are converted only when
     they are ``six.binary_type``.  When a converted key differs from the
     original one, the original entry is removed so the result does not carry
     both the binary and the text spelling of the same key.
     """
     if six.PY3:
         # Iterate over a snapshot: the mapping is mutated inside the loop.
         for key, value in list(six.iteritems(data)):
             text_key = utils.text(key)
             if isinstance(value, six.binary_type):
                 value = utils.text(value)
             if text_key != key:
                 # The key was re-spelled; do not leave the stale entry behind.
                 del data[key]
             data[text_key] = value
     return data
Beispiel #2
0
 def _parse(self, data):
     """Decode a stored row in place and return it.

     Every value that is a ``bytearray`` or ``six.binary_type`` is replaced
     by ``utils.text(value)``.  If the row has a ``"result"`` entry it is
     then replaced by its JSON-decoded form.
     """
     binary_types = (bytearray, six.binary_type)
     # Snapshot the items because entries are reassigned while looping.
     for name, raw in list(six.iteritems(data)):
         if not isinstance(raw, binary_types):
             continue
         data[name] = utils.text(raw)

     if "result" not in data:
         return data
     data["result"] = json.loads(data["result"])
     return data
Beispiel #3
0
    def load_tasks(self, status, project=None, fields=None):
        """Yield the parsed tasks that are listed under ``status``.

        ``project`` may be a list of project names, a single name, or ``None``
        to use ``self.projects``.  When ``fields`` is given only those hash
        fields are fetched (via ``hmget``); otherwise the whole hash is read
        (via ``hgetall``).  Task ids whose record is missing or empty are
        skipped; they are not removed from the status set.
        """
        if project is None:
            targets = self.projects
        elif isinstance(project, list):
            targets = project
        else:
            targets = [project]

        # Prefer incremental scanning when it is available.
        list_ids = (
            self.redis.sscan_iter if self.scan_available
            else self.redis.smembers
        )

        if fields:
            def fetch(key):
                values = self.redis.hmget(key, fields)
                if any(item is not None for item in values):
                    return dict(zip(fields, values))
                # every requested field is missing
                return None
        else:
            fetch = self.redis.hgetall

        for name in targets:
            status_key = self._gen_status_key(name, status)
            for taskid in list_ids(status_key):
                record = fetch(self._gen_key(name, utils.text(taskid)))
                if record:
                    yield self._parse(record)
Beispiel #4
0
 def _parse(data):
     """Decode a stored result row in place and return it.

     Values that are ``six.binary_type`` are replaced by
     ``utils.text(value)``.  If the row has a ``'result'`` entry it is
     JSON-decoded; a ``bytearray`` payload is first decoded as UTF-8.
     """
     # Snapshot the items because entries are reassigned while looping.
     for key, value in list(six.iteritems(data)):
         if isinstance(value, six.binary_type):
             data[key] = utils.text(value)
     if 'result' in data:
         if isinstance(data['result'], bytearray):
             # str(bytearray) yields "bytearray(b'...')" on Python 3, which
             # is not valid JSON; decode the raw bytes instead.
             data['result'] = data['result'].decode('utf-8')
         data['result'] = json.loads(data['result'])
     return data
Beispiel #5
0
 def _parse(self, data):
     """Decode a stored task row in place and return it.

     ``bytearray`` / ``six.binary_type`` values are replaced by
     ``utils.text(value)``.  Each of the ``schedule``, ``fetch``,
     ``process`` and ``track`` entries, when present, is JSON-decoded; a
     falsy entry becomes an empty dict.
     """
     binary_types = (bytearray, six.binary_type)
     # Snapshot the items because entries are reassigned while looping.
     for name, raw in list(six.iteritems(data)):
         if isinstance(raw, binary_types):
             data[name] = utils.text(raw)

     for section in ('schedule', 'fetch', 'process', 'track'):
         if section not in data:
             continue
         data[section] = json.loads(data[section]) if data[section] else {}
     return data
Beispiel #6
0
    def _parse(self, data):
        """Turn a raw task hash into a task dict with native types.

        On Python 3 a new dict is built whose keys went through
        ``utils.text`` and whose ``bytes`` values were converted to text.
        Afterwards the ``schedule``/``fetch``/``process``/``track`` entries
        are JSON-decoded (falsy ones become ``{}``), ``status`` is cast to
        ``int`` and the two timestamps to ``float`` (falsy values become 0).
        """
        if six.PY3:
            decoded = {}
            for raw_key, raw_value in data.items():
                if isinstance(raw_value, bytes):
                    raw_value = utils.text(raw_value)
                decoded[utils.text(raw_key)] = raw_value
            data = decoded

        for section in ('schedule', 'fetch', 'process', 'track'):
            if section not in data:
                continue
            data[section] = json.loads(data[section]) if data[section] else {}

        if 'status' in data:
            data['status'] = int(data['status'])

        for stamp in ('lastcrawltime', 'updatetime'):
            if stamp in data:
                data[stamp] = float(data[stamp] or 0)
        return data
Beispiel #7
0
 def _parse(data):
     """Decode a stored task row in place and return it.

     Values that are ``six.binary_type`` are replaced by
     ``utils.text(value)``.  Each of the ``schedule``, ``fetch``,
     ``process`` and ``track`` entries, when present, is JSON-decoded
     (a ``bytearray`` payload is first decoded as UTF-8); a falsy entry
     becomes an empty dict.
     """
     # Snapshot the items because entries are reassigned while looping.
     for key, value in list(six.iteritems(data)):
         if isinstance(value, six.binary_type):
             data[key] = utils.text(value)
     for each in ("schedule", "fetch", "process", "track"):
         if each in data:
             if data[each]:
                 if isinstance(data[each], bytearray):
                     # str(bytearray) yields "bytearray(b'...')" on Python 3,
                     # which is not valid JSON; decode the raw bytes instead.
                     data[each] = data[each].decode("utf-8")
                 data[each] = json.loads(data[each])
             else:
                 data[each] = {}
     return data
Beispiel #8
0
    def endWrite(self, withErrors):
        """Store the uploaded script in the project database.

        When ``withErrors`` is truthy the error is logged and handling is
        delegated to the parent class.  Nothing is stored when there is no
        write buffer or no configured ``projectdb``.  A project whose status
        is ``DEBUG`` or ``RUNNING`` is switched to ``CHECKING``.  New
        projects are inserted, existing ones are updated.
        """
        if withErrors:
            self.app.logger.error('webdav.endWrite error: %r', withErrors)
            return super(ScriptResource, self).endWrite(withErrors)
        if not self.writebuffer:
            return
        projectdb = self.app.config['projectdb']
        if not projectdb:
            return

        changes = {}
        changes['script'] = text(getattr(self.writebuffer, 'content', ''))
        if self.project.get('status') in ('DEBUG', 'RUNNING'):
            changes['status'] = 'CHECKING'

        if not self.new_project:
            return projectdb.update(self.project_name, changes)

        # First write of a new project: persist the full record.
        self.project.update(changes)
        self.new_project = False
        return projectdb.insert(self.project_name, self.project)
Beispiel #9
0
    def readonly(self):
        """Return True when the resource must not be written to.

        The resource is read-only when no ``projectdb`` is configured.  If
        the project's group contains ``lock`` and both ``webui_username``
        and ``webui_password`` are configured, the request must carry an
        ``HTTP_AUTHORIZATION`` header whose base64 payload decodes to the
        configured ``username:password``; otherwise it is read-only.  In
        every other case the resource is writable.
        """
        config = self.app.config
        projectdb = config['projectdb']
        if not projectdb:
            return True

        needs_auth = (
            'lock' in projectdb.split_group(self.project.get('group'))
            and config.get('webui_username')
            and config.get('webui_password')
        )
        if not needs_auth:
            return False

        authheader = self.environ.get("HTTP_AUTHORIZATION")
        if not authheader:
            return True
        # NOTE(review): the scheme itself is not checked; the first
        # len("Basic ") characters are dropped whatever they are.
        encoded = authheader[len("Basic "):]
        try:
            username, password = text(base64.b64decode(encoded)).split(':', 1)
        except Exception as e:
            self.app.logger.error('wrong api key: %r, %r', encoded, e)
            return True

        authorized = (username == config['webui_username']
                      and password == config['webui_password'])
        return not authorized
Beispiel #10
0
 def projects(self):
     """Return the set of known project names, cached for a limited time.

     The names are read from the Redis set ``self.__prefix__ + 'projects'``
     and converted with ``utils.text``.  The result is reused until more
     than ``self.UPDATE_PROJECTS_TIME`` seconds have passed since the last
     refresh.

     NOTE(review): with the refresh time now recorded, names added to the
     Redis set by other code paths show up only after the cache expires --
     confirm that callers do not rely on an immediate refresh.
     """
     now = time.time()
     if now - getattr(self, '_last_update_projects', 0) \
             > self.UPDATE_PROJECTS_TIME:
         self._projects = set(utils.text(x) for x in self.redis.smembers(
             self.__prefix__ + 'projects'))
         # Remember when the cache was filled; without this the check above
         # is always true and Redis is queried on every access.
         self._last_update_projects = now
     return self._projects
Beispiel #11
0
    def on_task(self, task, response):
        """Process one fetched task and publish its outcome.

        The response is rebuilt with ``rebuild_response``, the project is
        looked up through ``self.project_manager`` and its instance runs the
        task.  Any exception raised during lookup or execution is captured
        in a ``ProcessorResult`` instead of propagating.

        Afterwards, in this order:

        * a status pack is put on ``self.status_queue`` unless the result's
          ``extinfo`` sets ``not_send_status``;
        * follow-up tasks, if any, are put on ``self.newtask_queue``;
        * each message in ``ret.messages`` is delivered by calling
          ``on_task`` recursively with an ``_on_message`` callback;
        * a summary line is logged (error level when the run failed).

        Args:
            task: mapping describing the task.  ``"taskid"`` and
                ``"project"`` are read unconditionally; ``"url"``,
                ``"project_updatetime"``, ``"project_md5sum"`` and
                ``"schedule"`` are optional.
            response: raw response data accepted by ``rebuild_response``.

        Returns:
            Always ``True``.
        """
        start_time = time.time()
        response = rebuild_response(response)

        try:
            assert "taskid" in task, "need taskid in task"
            project = task["project"]
            updatetime = task.get("project_updatetime", None)
            md5sum = task.get("project_md5sum", None)
            project_data = self.project_manager.get(project, updatetime, md5sum)
            assert project_data, "no such project!"
            if project_data.get("exception"):
                # The project itself is in an error state: report that
                # instead of running the task.
                ret = ProcessorResult(logs=(project_data.get("exception_log"),), exception=project_data["exception"])
            else:
                ret = project_data["instance"].run_task(project_data["module"], task, response)
        except Exception as e:
            # Turn any failure into a result carrying the traceback.
            logstr = traceback.format_exc()
            ret = ProcessorResult(logs=(logstr,), exception=e)
        process_time = time.time() - start_time

        if not ret.extinfo.get("not_send_status", False):
            if ret.exception:
                # On failure keep every response header.
                track_headers = dict(response.headers)
            else:
                # On success keep only the etag / last-modified headers.
                track_headers = {}
                for name in ("etag", "last-modified"):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            # NOTE(review): "taskid" and "project" are indexed directly here,
            # outside the try block above, so a task lacking either key
            # raises KeyError at this point.
            status_pack = {
                "taskid": task["taskid"],
                "project": task["project"],
                "url": task.get("url"),
                "track": {
                    "fetch": {
                        "ok": response.isok(),
                        "redirect_url": response.url if response.url != response.orig_url else None,
                        "time": response.time,
                        "error": response.error,
                        "status_code": response.status_code,
                        "encoding": response.encoding,
                        "headers": track_headers,
                        # Only failed runs keep a (truncated) copy of the body.
                        "content": response.text[:500] if ret.exception else None,
                    },
                    "process": {
                        "ok": not ret.exception,
                        "time": process_time,
                        "follows": len(ret.follows),
                        # The result text is cut to its first
                        # RESULT_RESULT_LIMIT characters, the log text to
                        # its last RESULT_LOGS_LIMIT characters.
                        "result": (None if ret.result is None else utils.text(ret.result)[: self.RESULT_RESULT_LIMIT]),
                        "logs": ret.logstr()[-self.RESULT_LOGS_LIMIT :],
                        "exception": ret.exception,
                    },
                    "save": ret.save,
                },
            }
            if "schedule" in task:
                status_pack["schedule"] = task["schedule"]

            # FIXME: unicode_obj should be applied in the scheduler before
            # storing to the database; it is done here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should be applied in the scheduler before
        # storing to the database; it is done here for performance.
        if ret.follows:
            self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in ret.follows])

        # Deliver messages by processing a synthetic task for the target
        # project; its "save" carries the sending project and the message.
        for project, msg, url in ret.messages:
            try:
                self.on_task(
                    {
                        "taskid": utils.md5string(url),
                        "project": project,
                        "url": url,
                        "process": {"callback": "_on_message"},
                    },
                    {"status_code": 200, "url": url, "save": (task["project"], msg)},
                )
            except Exception as e:
                # A failing message must not stop the remaining ones.
                logger.exception("Sending message error.")
                continue

        # Log the summary at error level when the run raised, info otherwise.
        if ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func(
            "process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r"
            % (
                task["project"],
                task["taskid"],
                task.get("url"),
                response.status_code,
                len(response.content),
                ret.result,
                len(ret.follows),
                len(ret.messages),
                ret.exception,
            )
        )
        return True