Exemple #1
0
    def start(self):
        """Start this worker; an already started worker is left untouched.

        On a fresh start the method:

        * marks the worker as started and stamps
          ``worker_statistic.start_time`` with the current time;
        * passes a ``record(...)`` entry (worker name, formatted start time,
          schedule class path and kwargs, spider class path and kwargs) to
          ``RecorderManager.instance().record_doing`` -- best effort, a
          failure is only logged;
        * hands the spider's ``start_tasks`` and its crawl schedule to
          ``_move_start_tasks_to_crawl_schedule``;
        * schedules ``loop_get_and_execute`` on the IOLoop after
          ``crawl_schedule.interval`` milliseconds.
        """
        if self.is_started:
            self.logger.warn("duplicate start")
            return

        self.is_started = True
        self.worker_statistic.start_time = datetime.datetime.now()
        try:
            RecorderManager.instance().record_doing(
                record(
                    self._worker_name,
                    self.worker_statistic.start_time.strftime(
                        "%Y-%m-%d %H:%M:%S"),
                    get_class_path(self.spider.crawl_schedule.__class__),
                    self.spider.crawl_schedule.schedule_kwargs,
                    get_class_path(self.spider.__class__),
                    self.spider.spider_kwargs))
        except Exception as e:
            # Recording is bookkeeping only; it must not prevent the start.
            self.logger.warn("record worker failed:%s" % e)

        _move_start_tasks_to_crawl_schedule(self.spider.start_tasks,
                                            self.spider.crawl_schedule)

        ioloop.IOLoop.instance().add_timeout(
            datetime.timedelta(
                milliseconds=self.spider.crawl_schedule.interval),
            self.loop_get_and_execute)
        self.logger.info("start worker")
Exemple #2
0
def api_recover_worker(params):
    """Start a previously failed worker in recovery mode.

    Args:
        params: dict of request parameters; must contain ``worker_name``.
            The ``worker_name`` entry is popped from the dict.

    Returns:
        A ``result(400, ...)`` value when the parameters are invalid, when
        no fail-worker record exists for the name, or when building the
        schedule / spider raises ``ScheduleError`` / ``SpiderError``.
        NOTE(review): nothing is returned explicitly on the success path in
        the code visible here -- presumably the excerpt is truncated;
        confirm against the full file.
    """
    is_ok, errors = check_params(params, 'worker_name')
    if not is_ok:
        return result(400, "params error", str(errors))

    try:
        worker_name = params.pop('worker_name')
        fail_record = RecorderManager.instance().get_fail_worker_record(
            worker_name)
        if not fail_record:
            return result(400, "not exist this fail worker", worker_name)

        schedule_params = fail_record.get('schedule_kwargs')
        spider_params = fail_record.get('spider_kwargs')
        schedule_path = fail_record.get('schedule_class')
        spider_path = fail_record.get('spider_class')

        # Rebuild the schedule and spider from the stored class paths/kwargs.
        schedule = get_schedule_class(schedule_path)(**schedule_params)
        spider = get_spider_class(spider_path)(schedule, **spider_params)
        recover_worker(spider)
        # The fail record is dropped only after recover_worker() returned.
        RecorderManager.instance().remove_last_fail_worker(worker_name)
    except ScheduleError as e:
        return result(400, message="init schedule failed", result=str(e))
    except SpiderError as e:
        return result(400, message="init spider failed", result=str(e))
Exemple #3
0
def api_remove_fail_worker(params):
    """Clear a fail-worker record together with its related queues.

    Args:
        params: dict of request parameters; must contain ``worker_name``.
            The ``worker_name`` entry is popped from the dict.

    Returns:
        A ``result(400, ...)`` value when the parameters are invalid, when
        no fail-worker record exists for the name, or when
        ``ScheduleError`` / ``SpiderError`` / ``WorkerError`` is raised.
        NOTE(review): nothing is returned explicitly on the success path in
        the code visible here -- presumably the excerpt is truncated;
        confirm against the full file.
    """
    is_ok, error = check_params(params, "worker_name")
    if not is_ok:
        return result(400, "params error", str(error))

    worker_name = params.pop("worker_name")
    fail_record = RecorderManager.instance().get_fail_worker_record(
        worker_name)
    if not fail_record:
        return result(400, "not exist this fail worker", worker_name)

    try:
        schedule_params = fail_record.get('schedule_kwargs')
        spider_params = fail_record.get('spider_kwargs')
        schedule_path = fail_record.get('schedule_class')
        spider_path = fail_record.get('spider_class')

        # Rebuild the spider so that its own clear_all() can be invoked.
        schedule = get_schedule_class(schedule_path)(**schedule_params)
        spider = get_spider_class(spider_path)(schedule, **spider_params)
        spider.clear_all()
        RecorderManager.instance().remove_last_fail_worker(worker_name)
    except ScheduleError as e:
        return result(400, message="init schedule failed", result=str(e))
    except SpiderError as e:
        return result(400, message="init spider failed", result=str(e))
    except WorkerError as e:
        # NOTE(review): the message says "recover" although this is the
        # remove endpoint -- looks copied from the recover API; confirm.
        return result(400, message="recover worker failed", result=str(e))
Exemple #4
0
    def start(self):
        """Start this worker; an already started worker is left untouched.

        On a fresh start the method:

        * marks the worker as started and stamps
          ``worker_statistic.start_time`` with the current time;
        * passes a ``record(...)`` entry (worker name, formatted start time,
          schedule class path and kwargs, spider class path and kwargs) to
          ``RecorderManager.instance().record_doing`` -- best effort, a
          failure is only logged;
        * hands the spider's ``start_tasks`` and its crawl schedule to
          ``_move_start_tasks_to_crawl_schedule``;
        * schedules ``loop_get_and_execute`` on the IOLoop after
          ``crawl_schedule.interval`` milliseconds.
        """
        if self.is_started:
            self.logger.warn("duplicate start")
            return

        self.is_started = True
        self.worker_statistic.start_time = datetime.datetime.now()
        try:
            RecorderManager.instance().record_doing(
                record(
                    self._worker_name,
                    self.worker_statistic.start_time.strftime(
                        "%Y-%m-%d %H:%M:%S"),
                    get_class_path(self.spider.crawl_schedule.__class__),
                    self.spider.crawl_schedule.schedule_kwargs,
                    get_class_path(self.spider.__class__),
                    self.spider.spider_kwargs))
        except Exception as e:
            # Recording is bookkeeping only; it must not prevent the start.
            self.logger.warn("record worker failed:%s" % e)

        _move_start_tasks_to_crawl_schedule(self.spider.start_tasks,
                                            self.spider.crawl_schedule)

        ioloop.IOLoop.instance().add_timeout(
            datetime.timedelta(
                milliseconds=self.spider.crawl_schedule.interval),
            self.loop_get_and_execute)
        self.logger.info("start worker")
Exemple #5
0
def api_remove_all_fail_worker(params):
    """Remove every recorded fail worker and clear its spider data.

    For each record returned by
    ``RecorderManager.instance().get_last_fail_worker()`` the schedule and
    spider are rebuilt from the stored class paths and kwargs,
    ``spider.clear_all()`` is called and the fail record is removed. The
    per-worker outcome ("success" or "fail" plus the error text) is
    collected in ``remove_rs``.

    Args:
        params: dict of request parameters; not read by this function.

    Returns:
        ``result(500, ...)`` when an exception escapes the loop itself.
        NOTE(review): ``remove_rs`` is built but never returned in the code
        visible here -- presumably the excerpt is truncated; confirm against
        the full file.
    """
    fail_records = RecorderManager.instance().get_last_fail_worker()
    remove_rs = []

    try:
        for fail_record in fail_records:
            worker_name = fail_record.get('worker_name')
            schedule_params = fail_record.get('schedule_kwargs')
            spider_params = fail_record.get('spider_kwargs')
            schedule_path = fail_record.get('schedule_class')
            spider_path = fail_record.get('spider_class')
            try:
                schedule = get_schedule_class(schedule_path)(**schedule_params)
                spider = get_spider_class(spider_path)(schedule,
                                                       **spider_params)
                spider.clear_all()
                RecorderManager.instance().remove_last_fail_worker(worker_name)
            except Exception as e:
                # One broken record must not stop the remaining removals.
                outcome, error_text = "fail", str(e)
            else:
                outcome, error_text = "success", ""
            remove_rs.append({
                "worker_name": worker_name,
                "result": outcome,
                "error": error_text,
            })
    except Exception as e:
        return result(500, "unsupported exception", result=str(e))
Exemple #6
0
def api_remove_fail_worker(params):
    """Clear a fail-worker record together with its related queues.

    Args:
        params: dict of request parameters; must contain ``worker_name``.
            The ``worker_name`` entry is popped from the dict.

    Returns:
        A ``result(400, ...)`` value when the parameters are invalid, when
        no fail-worker record exists for the name, or when
        ``ScheduleError`` / ``SpiderError`` / ``WorkerError`` is raised.
        NOTE(review): nothing is returned explicitly on the success path in
        the code visible here -- presumably the excerpt is truncated;
        confirm against the full file.
    """
    is_ok, error = check_params(params, "worker_name")
    if not is_ok:
        return result(400, "params error", str(error))

    worker_name = params.pop("worker_name")
    fail_record = RecorderManager.instance().get_fail_worker_record(
        worker_name)
    if not fail_record:
        return result(400, "not exist this fail worker", worker_name)

    try:
        schedule_params = fail_record.get('schedule_kwargs')
        spider_params = fail_record.get('spider_kwargs')
        schedule_path = fail_record.get('schedule_class')
        spider_path = fail_record.get('spider_class')

        # Rebuild the spider so that its own clear_all() can be invoked.
        schedule = get_schedule_class(schedule_path)(**schedule_params)
        spider = get_spider_class(spider_path)(schedule, **spider_params)
        spider.clear_all()
        RecorderManager.instance().remove_last_fail_worker(worker_name)
    except ScheduleError as e:
        return result(400, message="init schedule failed", result=str(e))
    except SpiderError as e:
        return result(400, message="init spider failed", result=str(e))
    except WorkerError as e:
        # NOTE(review): the message says "recover" although this is the
        # remove endpoint -- looks copied from the recover API; confirm.
        return result(400, message="recover worker failed", result=str(e))
Exemple #7
0
def api_recover_worker(params):
    """Start a previously failed worker in recovery mode.

    Args:
        params: dict of request parameters; must contain ``worker_name``.
            The ``worker_name`` entry is popped from the dict.

    Returns:
        A ``result(400, ...)`` value when the parameters are invalid, when
        no fail-worker record exists for the name, or when building the
        schedule / spider raises ``ScheduleError`` / ``SpiderError``.
        NOTE(review): nothing is returned explicitly on the success path in
        the code visible here -- presumably the excerpt is truncated;
        confirm against the full file.
    """
    is_ok, errors = check_params(params, 'worker_name')
    if not is_ok:
        return result(400, "params error", str(errors))

    try:
        worker_name = params.pop('worker_name')
        fail_record = RecorderManager.instance().get_fail_worker_record(
            worker_name)
        if not fail_record:
            return result(400, "not exist this fail worker", worker_name)

        schedule_params = fail_record.get('schedule_kwargs')
        spider_params = fail_record.get('spider_kwargs')
        schedule_path = fail_record.get('schedule_class')
        spider_path = fail_record.get('spider_class')

        # Rebuild the schedule and spider from the stored class paths/kwargs.
        schedule = get_schedule_class(schedule_path)(**schedule_params)
        spider = get_spider_class(spider_path)(schedule, **spider_params)
        recover_worker(spider)
        # The fail record is dropped only after recover_worker() returned.
        RecorderManager.instance().remove_last_fail_worker(worker_name)
    except ScheduleError as e:
        return result(400, message="init schedule failed", result=str(e))
    except SpiderError as e:
        return result(400, message="init spider failed", result=str(e))
Exemple #8
0
def api_remove_all_fail_worker(params):
    """Remove every recorded fail worker and clear its spider data.

    For each record returned by
    ``RecorderManager.instance().get_last_fail_worker()`` the schedule and
    spider are rebuilt from the stored class paths and kwargs,
    ``spider.clear_all()`` is called and the fail record is removed. The
    per-worker outcome ("success" or "fail" plus the error text) is
    collected in ``remove_rs``.

    Args:
        params: dict of request parameters; not read by this function.

    Returns:
        ``result(500, ...)`` when an exception escapes the loop itself.
        NOTE(review): ``remove_rs`` is built but never returned in the code
        visible here -- presumably the excerpt is truncated; confirm against
        the full file.
    """
    fail_records = RecorderManager.instance().get_last_fail_worker()
    remove_rs = []

    try:
        for fail_record in fail_records:
            worker_name = fail_record.get('worker_name')
            schedule_params = fail_record.get('schedule_kwargs')
            spider_params = fail_record.get('spider_kwargs')
            schedule_path = fail_record.get('schedule_class')
            spider_path = fail_record.get('spider_class')
            try:
                schedule = get_schedule_class(schedule_path)(**schedule_params)
                spider = get_spider_class(spider_path)(schedule,
                                                       **spider_params)
                spider.clear_all()
                RecorderManager.instance().remove_last_fail_worker(worker_name)
            except Exception as e:
                # One broken record must not stop the remaining removals.
                outcome, error_text = "fail", str(e)
            else:
                outcome, error_text = "success", ""
            remove_rs.append({
                "worker_name": worker_name,
                "result": outcome,
                "error": error_text,
            })
    except Exception as e:
        return result(500, "unsupported exception", result=str(e))
Exemple #9
0
def get_worker(params):
    """Collect the data needed to render the worker page.

    Args:
        params: dict of request parameters; not read by this function.

    Returns:
        A ``(template_path, context)`` tuple: the template name
        ``"worker.html"`` and a dict holding the current workers under
        ``'workers'`` and the recorded fail workers under ``'fail_workers'``.
    """
    context = {
        'workers': get_all_workers(),
        'fail_workers': RecorderManager.instance().get_last_fail_worker(),
    }
    return "worker.html", context
Exemple #10
0
def get_worker(params):
    """Collect the data needed to render the worker page.

    Args:
        params: dict of request parameters; not read by this function.

    Returns:
        A ``(template_path, context)`` tuple: the template name
        ``"worker.html"`` and a dict holding the current workers under
        ``'workers'`` and the recorded fail workers under ``'fail_workers'``.
    """
    context = {
        'workers': get_all_workers(),
        'fail_workers': RecorderManager.instance().get_last_fail_worker(),
    }
    return "worker.html", context
Exemple #11
0
def api_get_all_fail_worker(params):
    """Fetch the records of previously failed workers as a JSON string.

    Args:
        params: dict of request parameters; not read by this function.

    Returns:
        ``result(500, "get fail worker failed", ...)`` when fetching or
        serialising the records raises.
        NOTE(review): the serialised string is not returned in the code
        visible here -- presumably the excerpt is truncated; confirm against
        the full file.
    """
    try:
        fail_worker_records = RecorderManager.instance().get_last_fail_worker()
        # ``encoding="utf-8"`` is already the default on Python 2 and the
        # keyword is rejected on Python 3, so it is left out.
        last_fail_worker_str = json.dumps(fail_worker_records,
                                          ensure_ascii=False)
    except Exception as e:
        return result(500, "get fail worker failed", str(e))
Exemple #12
0
    def recover(self):
        """Start this worker in recovery mode; a started worker is left alone.

        Stamps ``worker_statistic.start_time``, passes a ``record(...)``
        entry to ``RecorderManager.instance().record_doing``, marks the
        worker as started and schedules ``loop_get_and_execute`` on the
        IOLoop after ``crawl_schedule.interval`` milliseconds. Unlike the
        bookkeeping in ``stop``, the record call is not guarded here, so an
        exception from it propagates before the worker is marked started.
        """
        if self.is_started:
            self.logger.warn("duplicate start")
            return

        self.worker_statistic.start_time = datetime.datetime.now()
        recorder = RecorderManager.instance()
        doing_entry = record(
            self._worker_name,
            self.worker_statistic.start_time.strftime("%Y-%m-%d %H:%M:%S"),
            get_class_path(self.spider.crawl_schedule.__class__),
            self.spider.crawl_schedule.schedule_kwargs,
            get_class_path(self.spider.__class__),
            self.spider.spider_kwargs)
        recorder.record_doing(doing_entry)

        # Flag the worker as started only after the record call returned.
        self.is_started = True
        delay = datetime.timedelta(
            milliseconds=self.spider.crawl_schedule.interval)
        ioloop.IOLoop.instance().add_timeout(delay, self.loop_get_and_execute)
        self.logger.info("recover worker")
Exemple #13
0
    def stop(self):
        """Stop this worker; a worker that is not started is left untouched.

        Marks the worker as stopped, stamps ``worker_statistic.end_time``,
        asks ``output_fail_http_task_file`` to write the schedule's failed
        tasks to ``WORKER_FAIL_PATH + "<SpiderClass>-<start time>.csv"`` and
        reports the worker as done to ``RecorderManager``. Both follow-up
        steps are best effort: a failure is only logged.

        NOTE(review): the original docstring also states that all schedule
        queues and intermediate pipeline data are cleared on stop; that is
        not part of the code visible here -- confirm against the full file.
        """
        if not self.is_started:
            self.logger.warn("duplicate stop")
            return

        self.is_started = False
        self.worker_statistic.end_time = datetime.datetime.now()
        fail_task_file_name = (
            self.spider.__class__.__name__ + "-" +
            self.worker_statistic.start_time.strftime("%Y-%m-%d %H:%M:%S"))
        try:
            output_fail_http_task_file(
                WORKER_FAIL_PATH + fail_task_file_name + ".csv",
                self.spider.crawl_schedule)
        except Exception as e:
            self.logger.warn("output fail task failed error:%s" % e)

        try:
            RecorderManager.instance().record_done(self._worker_name)
        except Exception as e:
            self.logger.warn("record done failed error:%s" % e)
Exemple #14
0
    def stop(self):
        """Stop this worker; a worker that is not started is left untouched.

        Marks the worker as stopped, stamps ``worker_statistic.end_time``,
        asks ``output_fail_http_task_file`` to write the schedule's failed
        tasks to ``WORKER_FAIL_PATH + "<SpiderClass>-<start time>.csv"`` and
        reports the worker as done to ``RecorderManager``. Both follow-up
        steps are best effort: a failure is only logged.

        NOTE(review): the original docstring also states that all schedule
        queues and intermediate pipeline data are cleared on stop; that is
        not part of the code visible here -- confirm against the full file.
        """
        if not self.is_started:
            self.logger.warn("duplicate stop")
            return

        self.is_started = False
        self.worker_statistic.end_time = datetime.datetime.now()
        fail_task_file_name = (
            self.spider.__class__.__name__ + "-" +
            self.worker_statistic.start_time.strftime("%Y-%m-%d %H:%M:%S"))
        try:
            output_fail_http_task_file(
                WORKER_FAIL_PATH + fail_task_file_name + ".csv",
                self.spider.crawl_schedule)
        except Exception as e:
            self.logger.warn("output fail task failed error:%s" % e)

        try:
            RecorderManager.instance().record_done(self._worker_name)
        except Exception as e:
            self.logger.warn("record done failed error:%s" % e)
Exemple #15
0
def api_get_all_fail_worker(params):
    """Fetch the records of previously failed workers as a JSON string.

    Args:
        params: dict of request parameters; not read by this function.

    Returns:
        ``result(500, "get fail worker failed", ...)`` when fetching or
        serialising the records raises.
        NOTE(review): the serialised string is not returned in the code
        visible here -- presumably the excerpt is truncated; confirm against
        the full file.
    """
    try:
        fail_worker_records = RecorderManager.instance().get_last_fail_worker()
        # ``encoding="utf-8"`` is already the default on Python 2 and the
        # keyword is rejected on Python 3, so it is left out.
        last_fail_worker_str = json.dumps(fail_worker_records,
                                          ensure_ascii=False)
    except Exception as e:
        return result(500, "get fail worker failed", str(e))
Exemple #16
0
    def recover(self):
        """Start this worker in recovery mode; a started worker is left alone.

        Stamps ``worker_statistic.start_time``, passes a ``record(...)``
        entry to ``RecorderManager.instance().record_doing``, marks the
        worker as started and schedules ``loop_get_and_execute`` on the
        IOLoop after ``crawl_schedule.interval`` milliseconds. The record
        call is not guarded, so an exception from it propagates before the
        worker is marked started.
        """
        if self.is_started:
            self.logger.warn("duplicate start")
            return

        self.worker_statistic.start_time = datetime.datetime.now()
        recorder = RecorderManager.instance()
        doing_entry = record(
            self._worker_name,
            self.worker_statistic.start_time.strftime("%Y-%m-%d %H:%M:%S"),
            get_class_path(self.spider.crawl_schedule.__class__),
            self.spider.crawl_schedule.schedule_kwargs,
            get_class_path(self.spider.__class__),
            self.spider.spider_kwargs)
        recorder.record_doing(doing_entry)

        # Flag the worker as started only after the record call returned.
        self.is_started = True
        delay = datetime.timedelta(
            milliseconds=self.spider.crawl_schedule.interval)
        ioloop.IOLoop.instance().add_timeout(delay, self.loop_get_and_execute)
        self.logger.info("recover worker")