Ejemplo n.º 1
0
    def run(self, func, project_name, init_url):
        """Fetch ``self.current_url`` once and hand the response to ``func``.

        :param func: parse callback; stored on ``self.handle_method`` and
            called with a ``helper.S`` object built from this instance, the
            first element of the fetch result, the URLs extracted from it,
            ``project_name`` and ``init_url``.
        :param project_name: forwarded unchanged to ``helper.S``.
        :param init_url: forwarded unchanged to ``helper.S``.
        :return: ``{'error': message}`` when the status code does not start
            with ``20`` or ``30``; otherwise ``self.rest_result`` with
            ``current_url`` and ``http_code`` filled in, plus ``result`` when
            the callback returned a dict.
        """
        self.handle_method = func

        fetched = self.http_helper.get(self.current_url)
        status_text = str(fetched[1])

        # Anything whose status code does not start with "20" or "30" is
        # reported as an error.
        if not status_text.startswith(('20', '30')):
            message = 'URL: ' + self.current_url + ' 获取失败 HTTP code: ' + status_text
            message = message + ' Runtime: ' + str(fetched[2]) + 'ms'
            return {'error': message}

        found_urls = get_urls_form_html(self.current_url, fetched[0])
        summary = self.rest_result
        summary['current_url'] = self.current_url
        summary['http_code'] = fetched[1]

        # Remember the URL now: self.current_url may be rewritten while the
        # callback below runs.
        crawled_url = self.current_url

        # When the user-supplied callback returns a dict, that dict is
        # attached to the result (to be pushed to the server).
        parsed = self.handle_method(
            helper.S(self, fetched[0], found_urls, project_name, init_url))

        if not isinstance(parsed, dict):
            return self.rest_result

        # Fill in defaults only for keys the callback did not provide.
        if 'url' not in parsed:
            parsed['url'] = crawled_url
        if 'runtime' not in parsed:
            parsed['runtime'] = fetched[2]

        self.rest_result['result'] = parsed
        return self.rest_result
Ejemplo n.º 2
0
    def run(self, func, project_name, init_url):
        """Crawl ``self.current_url`` and pass the outcome to the callback.

        :param func: parse callback; saved as ``self.handle_method`` and
            invoked with a ``helper.S`` object built from this instance, the
            first element of the fetch result, the URLs extracted from it,
            ``project_name`` and ``init_url``.
        :param project_name: passed through to ``helper.S``.
        :param init_url: passed through to ``helper.S``.
        :return: a dict with a single ``error`` key when the status code
            starts with neither ``20`` nor ``30``; otherwise
            ``self.rest_result``, which gains a ``result`` entry only when
            the callback returned a dict.
        """
        self.handle_method = func

        response = self.http_helper.get(self.current_url)
        code_text = str(response[1])
        accepted = code_text.startswith('20') or code_text.startswith('30')
        if not accepted:
            # Status codes outside the "20x"/"30x" prefixes are reported as
            # an error.
            failure = 'URL: ' + self.current_url + ' 获取失败 HTTP code: ' + code_text
            failure += ' Runtime: ' + str(response[2]) + 'ms'
            return {'error': failure}

        links = get_urls_form_html(self.current_url, response[0])
        self.rest_result['current_url'] = self.current_url
        self.rest_result['http_code'] = response[1]

        # Keep a copy: self.current_url may be rewritten by the callback.
        url_before_parse = self.current_url

        # A dict returned by the user-supplied callback is attached to the
        # result (to be pushed to the server).
        outcome = self.handle_method(
            helper.S(self, response[0], links, project_name, init_url))

        if isinstance(outcome, dict):
            # Only fill in the keys the callback left out.
            if 'url' not in outcome:
                outcome['url'] = url_before_parse
            if 'runtime' not in outcome:
                outcome['runtime'] = response[2]
            self.rest_result['result'] = outcome

        return self.rest_result
Ejemplo n.º 3
0
    def run(self, func, current_url, project_name, init_url, gevent_id):
        """Crawl ``current_url`` once and report the outcome via ``self.put_data``.

        :param func: parse callback; stored as ``self.handle_method`` and
            called with an ``S`` object built from this instance, the first
            element of the fetch result, the URLs extracted from it,
            ``project_name`` and ``init_url``.
        :param current_url: URL to fetch. A falsy value (``None`` or ``''``)
            only produces the log line and then returns.
        :param project_name: forwarded to ``S``. Note that log lines use
            ``self.project_name``, not this argument.
        :param init_url: forwarded to ``S``.
        :param gevent_id: used only as a prefix in log output.
        :return: always ``None``; everything is delivered through
            ``self.put_data`` (``urls_parsed``, then ``urls_fail`` or
            ``save``).
        """
        self.handle_method = func

        # TODO: needs some form of rate limiting (e.g. gevent.sleep).
        self.current_url = current_url

        log_prefix = 'gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name
        # ``or ''`` keeps this log line from raising TypeError on a None URL
        # before the guard below gets the chance to reject it.
        print(log_prefix + ' -- ' + (self.current_url or ''))
        if not self.current_url:
            return

        self.put_data(urls_parsed=[self.current_url, ])
        crawl_result = self.http_helper.get(self.current_url)
        status_text = str(crawl_result[1])

        # Anything whose status code does not start with "20" or "30" is
        # logged and recorded as a failed URL.
        if not status_text.startswith(('20', '30')):
            echo_err(log_prefix + ' -- URL: ' + self.current_url + ' 获取失败 HTTP code: ' +
                     status_text + ' Runtime: ' + str(crawl_result[2]) + 'ms')
            self.put_data(
                urls_fail=(
                    get_domain(self.current_url),
                    # Non-numeric status values are stored as 0.
                    int(crawl_result[1] if status_text.isdigit() else 0),
                    int(time.time()),
                ),
            )
            return

        # When the user-supplied callback returns a dict, that dict is pushed
        # to the server via put_data(save=...).
        try:
            parse_result = self.handle_method(
                S(self, crawl_result[0], get_urls_form_html(self.current_url, crawl_result[0]), project_name, init_url))
        except Exception:
            # Best-effort: a failing callback is logged and this URL is
            # skipped. Only Exception is caught, so KeyboardInterrupt,
            # SystemExit and other BaseException subclasses (such as gevent's
            # GreenletExit, if this runs inside a greenlet) still propagate.
            print(traceback.format_exc())
            return

        if not isinstance(parse_result, dict):
            return

        # Only 'url' gets a default here; no 'runtime' value is added.
        if 'url' not in parse_result:
            parse_result['url'] = self.current_url

        self.put_data(save=parse_result)