Code example #1
import gevent


def load_projects():
    # init() performs the remote handshake and echo_err() logs errors;
    # both are assumed to be defined elsewhere in this module.
    # Keep retrying every 10 seconds until the server returns a
    # non-empty project list.
    res = init()
    while not res or 'projects' not in res or not res['projects']:
        res = init()
        msg = '. Cannot reach the remote server!'
        if res:
            msg = res.get('msg', '')
        echo_err(' Initialization failed, retrying in 10 seconds' + msg)
        gevent.sleep(10)

    return res['projects']
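
Neither init() nor echo_err() appears in these excerpts. For local testing, a minimal stub of the response shape load_projects() expects; the field layout is an assumption inferred from how this function and example #3 consume it (res['projects'], res.get('msg'), item['name'], item['code']):

def init():
    # hypothetical stub -- the real init() presumably performs the remote
    # handshake; this response shape is an inferred assumption
    return {
        'msg': 'ok',
        'projects': [
            {'name': 'demo', 'code': "def parse(s):\n    return {}\nstart(parse)"},
        ],
    }
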
Code example #2
    def run(self, func, current_url, project_name, init_url, gevent_id):
        """Fetch one URL and feed the result to the user-defined callback.

        :param func: parse callback the project code registered via start()
        :param current_url: the URL to crawl in this call
        :param project_name: name of the project the URL belongs to
        :param init_url: the project's initial (seed) URL
        :param gevent_id: id of the greenlet running this spider
        """
        self.handle_method = func

        # TODO: needs a rate-limiting mechanism, e.g. gevent.sleep
        self.current_url = current_url

        print 'gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name + ' -- ' + self.current_url
        if not self.current_url:
            return

        # mark the URL as taken, then fetch it;
        # http_helper.get() returns (body, status_code, runtime_ms)
        self.put_data(urls_parsed=[self.current_url, ])
        crawl_result = self.http_helper.get(self.current_url)
        if not str(crawl_result[1]).startswith('20') \
                and not str(crawl_result[1]).startswith('30'):  # report any status outside the 2xx/3xx range as an error
            echo_err('gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name +
                     ' -- URL: ' + self.current_url + ' fetch failed, HTTP code: ' + str(crawl_result[1]) +
                     ' Runtime: ' + str(crawl_result[2]) + 'ms')
            self.put_data(
                urls_fail=(
                    get_domain(self.current_url),
                    int(crawl_result[1] if str(crawl_result[1]).isdigit() else 0),
                    int(time.time()),
                ),
            )
            return

        # if the user-defined crawl function returns a dict, push it to the server
        try:
            parse_result = self.handle_method(
                S(self, crawl_result[0], get_urls_form_html(self.current_url, crawl_result[0]),
                  project_name, init_url))
        except Exception:
            print traceback.format_exc()
            return

        if not isinstance(parse_result, dict):
            return

        if 'url' not in parse_result:
            parse_result['url'] = self.current_url

        self.put_data(save=parse_result)
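
The callback that run() invokes as self.handle_method is whatever the project code registered through start() (see example #3). A minimal sketch of such a callback; the S attribute names used here (s.html, s.urls) are assumptions, since the excerpts only show S being constructed from the spider, the page body, the extracted URLs, the project name and the seed URL:

def parse(s):
    # return a dict so run() pushes it to the server via put_data(save=...);
    # 'url' is optional -- run() falls back to the current URL
    return {
        'title': s.html[:60],      # placeholder "extraction"
        'link_count': len(s.urls),
    }

start(parse)
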
Code example #3
import gevent
import traceback


def run(gevent_id, project_name, source_code, init_url):
    # Spider, QueueSleepCtrl, echo_err and load_projects are assumed to be
    # defined elsewhere in this module.
    run.restart = False
    context = {}

    def start(callback):
        # the project code calls start() to register its parse callback
        context['callback'] = callback

    def context_rebuild(new_name, new_code):
        # compile and exec the project source so it can call start(),
        # then build a fresh Spider around it
        code = compile(new_code, 'test_mode_file', 'exec')
        exec code in {'start': start}
        return Spider(new_name)

    try:
        spider = context_rebuild(project_name, source_code)

        while True:
            # the project was modified: recompile and re-execute its code
            if spider.has_project_change():
                for item in load_projects():
                    if item['name'] == project_name:
                        spider = context_rebuild(item['name'], item['code'])

                        print 'gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' reload !!!!!!!!!!'
                        break

                continue

            response = spider.get_data()
            if not response:
                echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name +
                         ' - bad response from the remote, retrying in 60 seconds')
                gevent.sleep(60)
                continue

            # prepare to restart
            if run.restart or response.get('restart'):
                run.restart = True
                echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' - preparing to restart...')
                return  # once every greenlet in this round has returned, the caller starts a new round; that is the restart

            if 'urls' not in response or not response['urls']:
                echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name +
                         ' - cannot fetch the url queue from the remote, retrying in 10 seconds' +
                         (response.get('msg') or ''))
                gevent.sleep(10)
                continue

            spider.pre_url_queue += response['urls']

            # run the crawl
            while spider.pre_url_queue:
                url = spider.pre_url_queue.pop(0)  # pop the head of the queue
                sleep = QueueSleepCtrl.get_sleep_times(url)
                gevent.sleep(sleep)
                spider.run(context['callback'], url, project_name, init_url, gevent_id)

    except Exception:
        print traceback.format_exc()

    echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' stop !!!!!!!!!!!!!!!')
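
The caller that spawns these greenlets is not part of the excerpts, but the restart comment in run() implies a driver roughly like the one below. One greenlet per project, gevent.joinall, and an 'init_url' field on the project dict are all assumptions, not confirmed by the source:

import gevent

def main():
    while True:  # each pass is one "round"; a restart ends the round
        jobs = []
        for gevent_id, project in enumerate(load_projects()):
            jobs.append(gevent.spawn(
                run, gevent_id, project['name'], project['code'],
                project.get('init_url')))  # 'init_url' field is assumed
        gevent.joinall(jobs)  # once every greenlet returns, loop and start a new round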