Esempio n. 1
0
    def _update_project(self, project):
        """Merge one project record into ``self.projects`` and sync its task queue.

        The record is merged into the stored entry for ``project['name']``
        (created on first sight), the entry's ``md5sum`` is recomputed from
        ``project['script']``, and an ``active_tasks`` deque bounded by
        ``self.ACTIVE_TASKS`` is installed when the entry has none (or an
        empty one).

        For a project whose status is ``RUNNING`` or ``DEBUG`` the task queue
        is loaded if missing, its ``rate``/``burst`` are refreshed, and an
        ``_on_get_info`` task is submitted through ``self.on_select_task``.
        For any other status the project's queue, if present, has its limits
        zeroed and is then removed from ``self.task_queue``.
        """
        name = project['name']
        info = self.projects.setdefault(name, {})
        info.update(project)
        info['md5sum'] = utils.md5string(project['script'])
        if not info.get('active_tasks', None):
            info['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

        if project['status'] not in ('RUNNING', 'DEBUG'):
            # Project is not running: zero the limits, then drop its queue.
            if name in self.task_queue:
                stale_queue = self.task_queue[name]
                stale_queue.rate = 0
                stale_queue.burst = 0
                del self.task_queue[name]
            return

        # Project is running: make sure its queue exists and carries the
        # current rate limits.
        if name not in self.task_queue:
            self._load_tasks(name)
        live_queue = self.task_queue[name]
        live_queue.rate = project['rate']
        live_queue.burst = project['burst']

        # Ask the processor for the project's runtime info by sending an
        # _on_get_info request; the original note says the result arrives in
        # status_page.track.save.
        info_request = {
            'taskid': '_on_get_info',
            'project': name,
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', ],
            },
            'process': {
                'callback': '_on_get_info',
            },
        }
        self.on_select_task(info_request)
Esempio n. 2
0
 def test_put(n):
     """Benchmark putting ``n`` tasks onto the message queue and log the timing.

     ``task``, ``queue``, ``logger`` and ``md5string`` are not defined in this
     block; they are taken from the enclosing scope. The same ``task`` dict is
     mutated in place on every iteration (new ``url`` and ``taskid``) and then
     handed to ``queue.put`` with a one second timeout.

     :param n: number of tasks to put; ``0`` is allowed and simply logs zeros.
     """
     logger.info("message queue put %d", n)
     start_time = time.time()
     for i in range(n):
         task['url'] = 'http://bench.spider.org/?l=%d' % i
         task['taskid'] = md5string(task['url'])
         queue.put(task, block=True, timeout=1)
     cost_time = time.time() - start_time
     # Guard both ratios: n may be 0, and a coarse clock can report 0 elapsed
     # time. An unmeasurable ratio is reported as 0 instead of raising.
     per_second = n * 1.0 / cost_time if cost_time else 0.0
     per_item_ms = cost_time / n * 1000 if n else 0.0
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 cost_time, per_second, per_item_ms)
Esempio n. 3
0
 def test_update(n, start=0):
     """Benchmark ``n`` calls to ``taskdb.update`` and log the timing.

     ``task``, ``track``, ``taskdb``, ``logger`` and ``md5string`` are not
     defined in this block; they are taken from the enclosing scope. The shared
     ``task`` dict is mutated in place on every iteration before being written.

     :param n: number of updates to perform; ``0`` is allowed and logs zeros.
     :param start: offset added to the loop index when building each URL.
     """
     # Pass the argument to the logger instead of pre-formatting the message.
     logger.info("taskdb update %d", n)
     start_time = time.time()
     for i in range(n):
         task['url'] = 'http://bench.spider.org/?l=%d' % (i + start)
         task['taskid'] = md5string(task['url'])
         task['track'] = track
         taskdb.update(task['project'], task['taskid'], task)
     cost_time = time.time() - start_time
     # Guard both ratios: n may be 0, and a coarse clock can report 0 elapsed
     # time. An unmeasurable ratio is reported as 0 instead of raising.
     per_second = n * 1.0 / cost_time if cost_time else 0.0
     per_item_ms = cost_time / n * 1000 if n else 0.0
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 cost_time, per_second, per_item_ms)
Esempio n. 4
0
 def test_get(n, start=0, random=True, fields=request_task_fields):
     """Benchmark ``n`` calls to ``taskdb.get_task`` and log the timing.

     ``task``, ``track``, ``taskdb``, ``logger``, ``md5string`` and
     ``request_task_fields`` are not defined in this block; they are taken from
     the enclosing scope. The shared ``task`` dict is mutated in place on every
     iteration to compute the task id being fetched.

     :param n: number of lookups to perform; ``0`` is allowed and logs zeros.
     :param start: offset added to each index when building the URL.
     :param random: when true, the indices are visited in shuffled order.
     :param fields: forwarded to ``taskdb.get_task`` as ``fields``.
     """
     # Pass the arguments to the logger instead of pre-formatting the message.
     logger.info("taskdb get %d %s", n, "randomly" if random else "")
     range_n = list(range(n))
     if random:
         # Imported here by name: the ``random`` parameter shadows the module.
         from random import shuffle
         shuffle(range_n)
     start_time = time.time()
     for i in range_n:
         task['url'] = 'http://bench.spider.org/?l=%d' % (i + start)
         task['taskid'] = md5string(task['url'])
         task['track'] = track
         taskdb.get_task(task['project'], task['taskid'], fields=fields)
     cost_time = time.time() - start_time
     # Guard both ratios: n may be 0, and a coarse clock can report 0 elapsed
     # time. An unmeasurable ratio is reported as 0 instead of raising.
     per_second = n * 1.0 / cost_time if cost_time else 0.0
     per_item_ms = cost_time / n * 1000 if n else 0.0
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 cost_time, per_second, per_item_ms)
Esempio n. 5
0
 def _load_project(self, project):
     """Build a project from its info dict and register it in ``self.projects``.

     The dict gets an ``md5sum`` entry computed from ``project["script"]`` and
     is then passed to ``self.build_module`` together with ``self.env``; the
     result is stored under ``project["name"]``.

     If any of that raises, the error is logged and a placeholder entry is
     stored instead, carrying the exception, its formatted traceback, the
     original info dict and the current time.

     :return: ``True`` when the project was built, ``False`` when the
         placeholder was stored.
     """
     try:
         project["md5sum"] = utils.md5string(project["script"])
         built = self.build_module(project, self.env)
         self.projects[project["name"]] = built
     except Exception as err:
         logger.exception("load project %s error", project.get("name", None))
         # Keep a record with the same keys as a loaded project so the
         # failure can be looked up later via "exception"/"exception_log".
         failed = {
             "loader": None,
             "module": None,
             "class": None,
             "instance": None,
             "exception": err,
             "exception_log": traceback.format_exc(),
             "info": project,
             "load_time": time.time(),
         }
         self.projects[project["name"]] = failed
         return False
     else:
         logger.debug("project: %s updated.", project.get("name", None))
         return True
Esempio n. 6
0
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send a message to a project from the command line.

    ``scheduler_rpc`` is resolved in three sequential steps, each one only
    applying if the previous left it unset:

    1. a string value is passed to ``connect_rpc`` to obtain a client;
    2. if it is ``None`` and ``SCHEDULER_NAME`` is set in the environment, the
       address is read from ``SCHEDULER_PORT_23333_TCP`` with the leading
       ``tcp://`` stripped;
    3. if it is still ``None``, ``http://127.0.0.1:23333/`` is used.

    The message is delivered as an ``on_message`` task handled by the
    ``_on_message`` callback; the value returned by ``send_task`` is returned.
    """
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)

    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        tcp_address = os.environ['SCHEDULER_PORT_23333_TCP']
        host_and_port = tcp_address[len('tcp://'):]
        scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % (host_and_port))

    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    message_task = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    }
    return scheduler_rpc.send_task(message_task)
Esempio n. 7
0
 def get_taskid(self, task):
     """Return the id for ``task``: ``md5string`` of its ``'url'`` entry.

     Per the original note, this is the default and is meant to be overridden
     when a different id scheme is wanted.
     """
     url = task['url']
     return md5string(url)
Esempio n. 8
0
    def on_task(self, task, response):
        """Process one fetched task and publish its results.

        Steps, in order:

        1. ``response`` is normalised with ``rebuild_response``.
        2. The project is looked up through ``self.project_manager.get`` and
           its instance's ``run_task`` is called. Any exception raised here,
           or a project that was stored with an ``"exception"``, is turned
           into a ``ProcessorResult`` carrying that exception and its log.
        3. Unless ``ret.extinfo["not_send_status"]`` is truthy, a status pack
           describing the fetch and process outcome is put on
           ``self.status_queue``.
        4. Follow-up tasks in ``ret.follows`` are put on
           ``self.newtask_queue`` as one list.
        5. Each ``(project, msg, url)`` in ``ret.messages`` is delivered by
           calling ``on_task`` again with a synthetic ``_on_message`` task;
           failures are logged and do not stop the remaining messages.
        6. A summary line is logged at error level when the result carries an
           exception, otherwise at info level.

        :param task: task dict; ``"taskid"`` and ``"project"`` are required,
            ``"url"``, ``"schedule"``, ``"project_updatetime"`` and
            ``"project_md5sum"`` are read when present.
        :param response: fetch result, in whatever form ``rebuild_response``
            accepts.
        :return: always ``True``.
        :raises KeyError: if ``task`` lacks ``"taskid"`` or ``"project"``
            when the status pack or the summary line is built (those lookups
            happen outside the ``try`` block).
        """
        start_time = time.time()
        response = rebuild_response(response)

        try:
            # Explicit raises instead of ``assert`` so the checks still run
            # under ``python -O``; type and message are unchanged.
            if "taskid" not in task:
                raise AssertionError("need taskid in task")
            project = task["project"]
            updatetime = task.get("project_updatetime", None)
            md5sum = task.get("project_md5sum", None)
            project_data = self.project_manager.get(project, updatetime, md5sum)
            if not project_data:
                raise AssertionError("no such project!")
            if project_data.get("exception"):
                # The project itself failed to load: report that failure
                # instead of running the task.
                ret = ProcessorResult(logs=(project_data.get("exception_log"),), exception=project_data["exception"])
            else:
                ret = project_data["instance"].run_task(project_data["module"], task, response)
        except Exception as e:
            logstr = traceback.format_exc()
            ret = ProcessorResult(logs=(logstr,), exception=e)
        process_time = time.time() - start_time

        if not ret.extinfo.get("not_send_status", False):
            if ret.exception:
                # On failure keep every response header for diagnosis.
                track_headers = dict(response.headers)
            else:
                # On success only the two cache-validation headers are kept.
                track_headers = {}
                for name in ("etag", "last-modified"):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                "taskid": task["taskid"],
                "project": task["project"],
                "url": task.get("url"),
                "track": {
                    "fetch": {
                        "ok": response.isok(),
                        "redirect_url": response.url if response.url != response.orig_url else None,
                        "time": response.time,
                        "error": response.error,
                        "status_code": response.status_code,
                        "encoding": response.encoding,
                        "headers": track_headers,
                        # Only the first 500 characters, and only on failure.
                        "content": response.text[:500] if ret.exception else None,
                    },
                    "process": {
                        "ok": not ret.exception,
                        "time": process_time,
                        "follows": len(ret.follows),
                        "result": (None if ret.result is None else utils.text(ret.result)[: self.RESULT_RESULT_LIMIT]),
                        # Keep the tail of the log, where the latest output is.
                        "logs": ret.logstr()[-self.RESULT_LOGS_LIMIT :],
                        "exception": ret.exception,
                    },
                    "save": ret.save,
                },
            }
            if "schedule" in task:
                status_pack["schedule"] = task["schedule"]

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in ret.follows])

        for project, msg, url in ret.messages:
            # Deliver each message by re-entering on_task with a synthetic
            # task and a minimal response dict; the sender's project name
            # travels in "save" alongside the message.
            try:
                self.on_task(
                    {
                        "taskid": utils.md5string(url),
                        "project": project,
                        "url": url,
                        "process": {"callback": "_on_message"},
                    },
                    {"status_code": 200, "url": url, "save": (task["project"], msg)},
                )
            except Exception:
                # Best effort: log and move on to the next message.
                logger.exception("Sending message error.")

        if ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        # Arguments are handed to the logger rather than pre-formatted, so
        # the message is only built when the record is actually emitted.
        logger_func(
            "process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r",
            task["project"],
            task["taskid"],
            task.get("url"),
            response.status_code,
            len(response.content),
            ret.result,
            len(ret.follows),
            len(ret.messages),
            ret.exception,
        )
        return True