Example #1
def doPostProcessTask(res, task, key):
    
    logger.debug(current_log_tag() + '[验证页面开始上传]')

    handler = multipartuploadufile.MultipartUploadUFile(g_config.ucloud_public_key, g_config.ucloud_private_key)
    stream = BytesIO(zlib.compress(res))
    ret, resp = handler.uploadstream(g_config.ucloud_bucket, key, stream)
    # Retry up to 2 times on failure
    retry_times = 2
    if resp.status_code == 200:
        logger.debug(current_log_tag() + '[验证页面上传结束] md5:{0}'.format(key))
        return True
    while resp.status_code != 200 and retry_times:
        retry_times -= 1
        ret, resp = handler.resumeuploadstream()
        if resp.status_code == 200:
            logger.debug(current_log_tag() + '[验证页面上传结束] md5:{0}'.format(key))
            return True
    else:   
        except_logger = ExceptionLogger()
        except_logger.qid = task.req_qid
        except_logger.type = "PUSH_MD5_ERROR"
        except_logger.debug = json.dumps({"task_id": task.new_task_id, "source": task.source})
        logger.debug("\n" + except_logger.logger_info)
        logger.debug(current_log_tag() + '[验证页面上传失败] md5:{0}'.format(key))

        return False
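
The upload above follows a simple bounded-retry pattern: one initial attempt, then up to two resume attempts before giving up. A minimal, generic sketch of the same pattern (the upload and resume callables are hypothetical stand-ins, not part of the framework):

def upload_with_retry(upload, resume, retries=2):
    # One initial attempt, then up to `retries` resume attempts on non-200.
    resp = upload()
    while resp.status_code != 200 and retries:
        retries -= 1
        resp = resume()
    return resp.status_code == 200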
Example #2
        def parse_result(parser):
            # Check whether this parser is needed and is among the required targets
            parser_name = parser.__name__.split('_', 1)[1]
            if parser_name in required:
                logger.debug(current_log_tag() + 'parse target %s',
                             parser_name)

                per_result = parser(request_template, converted_data)
                if per_result is not None:
                    if per_result:
                        start = datetime.datetime.now()
                        if isinstance(per_result, list):
                            # Add guest_info
                            store_utils.add_index_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result, page_index)
                            # Add stop-by info
                            store_utils.add_stop_by_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result,
                                self.spider.task)
                            result[parser_name].extend(per_result)
                        elif isinstance(per_result, dict):
                            result[parser_name].append(per_result)
                        logger.debug(current_log_tag() +
                                     '[结果保存][不使用压缩][用时: {0} ]'.format(
                                         datetime.datetime.now() - start))
Example #3
 def logging(*args, **kw):
     func_count_dict[fun.__name__] += 1
     begin = datetime.now()
     logger.debug(current_log_tag() +
                  '函数 {0} call start'.format(fun.__name__))
     result = fun(*args, **kw)
     end = datetime.now()
     logger.debug(current_log_tag() +
                  '函数 {0} call end'.format(fun.__name__))
     # logger.debug(current_log_tag() + ',函数,%s,耗时,%s,当前运行,%s,个此函数,当前,%s,协程', fun.__name__, (end - begin),
     #              func_count_dict[fun.__name__], mioji.common.pool.pool.size)
     func_count_dict[fun.__name__] -= 1
     return result
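
Example #3 only shows the inner wrapper of a timing decorator; the enclosing decorator (referred to as func_time_logger in Example #21) is not included. A minimal sketch of how such a decorator could be assembled, assuming logger and current_log_tag are the module's existing helpers:

import functools
from collections import defaultdict
from datetime import datetime

func_count_dict = defaultdict(int)  # number of in-flight calls per function name

def func_time_logger(fun):
    # Sketch only: wraps `fun`, logs start/end and tracks concurrent calls.
    @functools.wraps(fun)
    def logging(*args, **kw):
        func_count_dict[fun.__name__] += 1
        begin = datetime.now()
        logger.debug(current_log_tag() + '函数 {0} call start'.format(fun.__name__))
        try:
            return fun(*args, **kw)
        finally:
            logger.debug(current_log_tag() + '函数 {0} call end, cost {1}'.format(
                fun.__name__, datetime.now() - begin))
            func_count_dict[fun.__name__] -= 1
    return logging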
Example #4
    def crawl(self):
        """
        """
        if hasattr(self.task, 'new_task_id'):
            cur_id = self.task.new_task_id
        else:
            cur_id = str(uuid.uuid1())
        self.spider_taskinfo = {'task_id': cur_id}
        getcurrent().spider_taskinfo = self.spider_taskinfo
        # Print task info
        for k, v in self.task.__dict__.items():
            self.spider_taskinfo[k] = v
            try:
                logger.info(current_log_tag() + '[任务信息][%s][%s]' %
                            (k, json.dumps(v)))
            except Exception:
                continue

        chains = self.targets_request()
        try:
            self.code = self.__crawl_by_chain(chains)
        except parser_except.ParserException as e:
            logger.exception(e)
            self.code = e.code
            self.exception = e.msg
            if e.retry_from_first:
                raise e
        return self.code
Example #5
def add_stop_by_info(versions, result, task):
    # Locate the column that holds the stop-by info
    stop_by_index = TICKET_INFO_INDEX.get(versions, None)
    if stop_by_index is None:
        logger.debug('[未找到 stop_by_index][versions: {0}]'.format(versions))
        # try:
        #     from common import db
        #     sql = 'REPLACE INTO new_frame_not_replace_stop_by (ip, versions) VALUES (%s, %s)'
        #     db.execute_into_spider_db(sql, (get_local_ip(), versions))
        # except Exception as e:
        #     logger.warning('[未成功入 未找到 stop_by 库][ERROR: {0}]'.format(e))
        return

    logger.debug(current_log_tag() +
                 '[修改 stop_by_info][versions: {0}][位置 {1}]'.format(
                     versions, stop_by_index))

    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])

            # Cabin classes E, P, B, F: E economy, P premium economy, B business, F first
            task_stop_by_info = task.ticket_info.get('v_seat_type',
                                                     None) or 'E'
            if versions == 'InsertMultiFlight':
                result[__i][stop_by_index] = '{0}&NULL'.format(
                    task_stop_by_info)
            else:
                result[__i][stop_by_index] = task_stop_by_info

            result[__i] = tuple(result[__i])
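
To make the in-place rewrite concrete, here is a small illustrative call. The version name, rows and task object are made up, and it assumes the module's TICKET_INFO_INDEX maps that version name to column 3:

# Illustration only: hypothetical rows and task; assumes TICKET_INFO_INDEX
# maps 'SomeFlightVersion' to column index 3.
class FakeTask(object):
    ticket_info = {'v_seat_type': 'F'}  # F = first class

rows = [('CA123', 'PEK', 'LAX', 'OLD'), ('CA124', 'PEK', 'SFO', 'OLD')]
add_stop_by_info('SomeFlightVersion', rows, FakeTask())
# rows is modified in place; column 3 of every non-empty row now holds 'F':
# [('CA123', 'PEK', 'LAX', 'F'), ('CA124', 'PEK', 'SFO', 'F')]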
Example #6
    def crawl(self):
        """
        外部启动爬虫的入口方法
        当调用这个方法时才能开始爬虫工作~
        :return:
        """
        # todo
        self.__create_browser()
        cur_id = str(uuid.uuid1())
        if hasattr(self.task, 'new_task_id'):
            cur_id = self.task.new_task_id
        self.spider_taskinfo = {'task_id': cur_id}
        for k, v in self.task.__dict__.items():
            self.spider_taskinfo[k] = v
            try:
                logger.info(current_log_tag() + '[任务信息][%s][%s]' %
                            (k, json.dumps(v)))
            except Exception:
                continue
        chains = self.targets_request()
        try:
            self.code = self.__crawl_by_chain(chains)
        except parser_except.ParserException as e:
            logger.exception(e)
            self.code = e.code
            self.exception = e.msg
            if e.retry_from_first:
                raise e

        # Derive the error code from all returned results
        self.check_all_result()
        return self.code
Example #7
    def __crawl_list(self, reqParse, browser, req_list):
        """
        串行抓取分页
        """
        result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        total_count = 0
        success_count = 0
        error_req = []
        for req in req_list:
            # Serial mode: the page-flip limit has been removed
            # if NEED_FLIP_LIMIT:
            #     if total_count >= MAX_FLIP:
            #         break
            total_count += 1
            try:
                res = self.__single_crawl(reqParse,
                                          browser,
                                          req,
                                          page_count=total_count)
                self.__target_append_result(result, res)
                success_count += 1
                all_except = False
            except Exception as e:
                all_ok = False
                one_exception = e
                error_req.append((req, one_exception.message))
                logger.exception(
                    current_log_tag() + '[新框架][页面解析异常][ {0} ]'.format(
                        traceback.format_exc().replace('\n', '\t')))

                # Re-raise exceptions coming from the generator part
                if isinstance(req, types.GeneratorType):
                    raise e
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[翻页抓取][串行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[翻页抓取][串行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
        return result, all_except, all_ok, one_exception
Example #8
    def __async_crawl_list(self, reqParse, browser, req_list):
        """
        并行抓取分页
        丢到协程池里
        """

        a_result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        params = []
        total_count = 0
        for req in req_list:
            total_count += 1
            params.append((reqParse, browser, req, total_count))

        result = block_async(pool, self.__single_crawl, params)

        success_count = 0
        error_req = []
        for a_res in result:
            err_or_data, is_data = a_res
            if is_data:
                success_count += 1
                all_except = False
                self.__target_append_result(a_result, err_or_data)
            else:
                all_ok = False
                args, kwargs, one_exception = err_or_data
                if hasattr(
                        one_exception,
                        'retry_from_first') and one_exception.retry_from_first:
                    raise one_exception
                error_req.append((args[2], one_exception.message))
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[翻页抓取][并行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[翻页抓取][并行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
        return a_result, all_except, all_ok, one_exception
Example #9
 def result(self):
     try:
         for k, v in self._result.items():
             logger.debug(
                 current_log_tag() +
                 '[抓取结果][key: {0}][value_len: {1}]'.format(k, len(v)))
     except Exception:
         pass
     return self._result
Example #10
    def crawl_data(self, request_template, browser, source_name):
        """
        页面抓取函数
        :param request_template: 请求字典
        :param browser: 抓取浏览器
        :param source_name: 源名称
        :return: 返回抓取结果 response 对象
        """
        try:
            logger.debug(current_log_tag() + 'crawl %s, retry_count: %s',
                         self.__request_func.__name__, self.req_count)
            # Attach the proxy
            self.browser_set_proxy(browser, source_name)

            resp, self.content_length = self.__crawl_data_str(
                request_template, browser)

            # todo: adjust the result returned by user_retry
            if self.user_retry:
                try:
                    user_check = self.spider.user_retry_err_or_resp(
                        resp, self.req_count, request_template, False)
                except Exception as e:
                    self.user_exc = True
                    raise e

                # When the user check returns True
                if user_check:
                    return resp
                else:
                    raise parser_except.ParserException(
                        parser_except.PROXY_INVALID, '代理异常')
            else:
                return resp
        except parser_except.ParserException as e:
            self.is_forbidden = e.code in (parser_except.PROXY_FORBIDDEN,
                                           parser_except.REQ_ERROR)
            self.req_exception = e
        except Exception as e:
            self.req_exception = parser_except.ParserException(
                parser_except.REQ_ERROR, 'req exception:{0}'.format(e))

            # If a user exception occurred, set the user-retry flag
            if self.user_exc:
                if isinstance(e, parser_except.ParserException):
                    self.req_exception = e

        finally:
            if self.req_exception:
                code = self.req_exception.code
            else:
                code = 0

        if self.req_exception:
            raise self.req_exception
Example #11
    def __crawl_by_chain(self, chains):
        """
        根据请求链的类型,进入不同的抓取顺序进行抓取
        :param chains:
        :return:
        """
        code = 0
        try:
            for reqParse in chains:
                # gevent.sleep(0)
                browser = self.__create_browser(reqParse.new_session)
                reqParse.spider = self
                t_req = reqParse.request()

                if isinstance(t_req, dict):  # single request
                    new_result = self.__single_crawl(reqParse, browser, t_req,
                                                     0)

                elif isinstance(t_req, list):
                    # The spider may return an empty list!
                    if t_req:
                        if reqParse.asynchronous:  # parallel crawl
                            list_result = self.__async_crawl_list(
                                reqParse, browser, t_req)
                        else:  # serial requests
                            list_result = self.__crawl_list(
                                reqParse, browser, t_req)
                        new_result, code = self.check_list_result(
                            list_result, code)  # $$$ could be optimized

                elif isinstance(t_req,
                                types.GeneratorType):  # requests produced by generator (yield) methods
                    list_result = self.__crawl_list(reqParse, browser, t_req)
                    new_result, code = self.check_list_result(
                        list_result, code)

                self.__spider_append_result(new_result)

            if self.use_selenium and browser.br:
                browser.close()
        except parser_except.ParserException as e:
            if self.use_selenium and browser.br:
                browser.close()
            logger.error(e)
            raise e
        except Exception:
            if self.use_selenium and browser.br:
                browser.close()
            logger.exception(current_log_tag() +
                             '[新框架 持续请求 未知问题][ {0} ]'.format(
                                 traceback.format_exc().replace('\n', '\t')))
            raise parser_except.ParserException(
                parser_except.UNKNOWN_ERROR,
                'e:{0}'.format(traceback.format_exc()))

        return code
Example #12
 def __target_append_result(result, new_result):
     """
     向 result 中添加数据
     :param result: 被添加量
     :param new_result: 添加量
     :return: None
     : 此处用了字典的单例。
     """
     for k, v in new_result.items():
         if not v:
             continue
         logger.debug(current_log_tag() + "%s, length=%s, all=%s", k,
                      len(v), len(result.get(k, [])))
         result[k] += v
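
The docstring's note about the shared dict means results are merged into one caller-owned mapping rather than returned. A standalone illustration of the same merge (plain Python, no framework objects):

from collections import defaultdict

result = defaultdict(list)
page_1 = {'hotel': [{'id': 1}], 'room': []}
page_2 = {'hotel': [{'id': 2}], 'room': [{'id': 9}]}

for new_result in (page_1, page_2):
    for k, v in new_result.items():
        if not v:        # empty lists are skipped, as in the method above
            continue
        result[k] += v

# result == {'hotel': [{'id': 1}, {'id': 2}], 'room': [{'id': 9}]}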
Example #13
    def __spider_append_result(self, new_result):
        """
        向 self.result 中添加解析结果
        :param new_result: 必须为解析结果
        :return: None
        :调用回调方法
        """

        for k, v in new_result.items():
            if not v:
                continue
            data_bind = self.targets[k].get('bind', None)
            if data_bind:
                logger.debug(
                    current_log_tag() +
                    "[ 抓取绑定 {0} ][ 数据绑定 {1} ]".format(k, data_bind))
                self._result[data_bind] += v
                logger.debug(current_log_tag() + "%s, length=%s, all=%s", k,
                             len(v), len(self._result.get(k, [])))
            else:
                self._result[k] += v
                logger.debug(current_log_tag() + "%s, length=%s, all=%s", k,
                             len(v), len(self._result.get(k, [])))
Example #14
 def convert(self, request_template, data):
     data_con = request_template.get('data', {})
     c_type = data_con.get('content_type', 'string')
     logger.debug(current_log_tag() + 'Converter got content_type: %s',
                  c_type)
      if c_type == 'html':
          return HTML.fromstring(data)
      elif c_type == 'json':
         return json.loads(data)
     elif isinstance(c_type, types.MethodType):
         try:
             return c_type(request_template, data)
         except:
             raise parser_except.ParserException(
                  -1, 'convert func error: {0}, func: {1}'.format(
                     traceback.format_exc(), c_type))
     else:
         return data
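
The converter dispatches on data.content_type in the request template: the string 'html' or 'json', a bound method, or anything else (returned unchanged). A hedged usage sketch, assuming converter is an instance of the class that defines convert and that HTML is lxml.html, as the HTML.fromstring call suggests:

import json
from lxml import html as HTML  # assumed backend for HTML.fromstring above

# Hypothetical request templates, for illustration only.
json_template = {'data': {'content_type': 'json'}}
html_template = {'data': {'content_type': 'html'}}
raw_template = {}  # no 'data' section: the payload is returned as-is

price = converter.convert(json_template, '{"price": 42}')        # -> {'price': 42}
tree = converter.convert(html_template, '<html><body/></html>')  # -> lxml element
text = converter.convert(raw_template, 'plain text')             # -> 'plain text'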
Example #15
    def __crawl_list(self, reqParse, browser, req_list):
        """
        串行抓取分页
        """
        result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        total_count = 0
        success_count = 0
        error_req = []
        for req in req_list:
            gevent.sleep(0)
            # Serial mode: the page-flip limit has been removed
            # if NEED_FLIP_LIMIT:
            #     if total_count >= MAX_FLIP:
            #         break
            total_count += 1
            try:
                res = self.__single_crawl(reqParse,
                                          browser,
                                          req,
                                          page_count=total_count)
                self.__target_append_result(result, res)
                success_count += 1
                all_except = False
            except Exception as e:
                all_ok = False
                one_exception = e
                error_req.append((req, one_exception.message))
                logger.exception(
                    current_log_tag() + '[新框架][页面解析异常][ {0} ]'.format(
                        traceback.format_exc().replace('\n', '\t')))

                # Re-raise exceptions coming from the generator part
                if isinstance(req, types.GeneratorType):
                    raise e
Example #16
def add_index_info(versions, result, page_index):
    # Locate the column that holds the guest info
    index_info_index = INDEX_INFO_INDEX.get(versions, None)
    if index_info_index is None:
        return

    logger.debug(current_log_tag() +
                 '[修改 index_info][versions: {0}][位置 {1}]'.format(
                     versions, index_info_index))
    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])
            old_index_info = result[__i][index_info_index]
            try:
                old_index_info = json.loads(old_index_info)
                if not isinstance(old_index_info, dict):
                    raise Exception('Type Is Not Dict')
            except Exception:
                old_index_info = {'unparse_info': old_index_info}
            index_info = {k: v for k, v in old_index_info.items()}
            index_info['page_index'] = page_index
            index_info['item_index'] = __i
            result[__i][index_info_index] = json.dumps(index_info)
            result[__i] = tuple(result[__i])
Example #17
    def req(self,
            url,
            method='get',
            params=None,
            data=None,
            json=None,
            timeout=60,
            verify=False,
            **kw):
        httpLogger = HttpLogger()
        httpLogger.qid = self.qid
        httpLogger.task_id = self.task_id
        httpLogger.req_type = method
        httpLogger.source = self.source
        httpLogger.url = url
        httpLogger.proxy_out = str(self.out_ip)
        httpLogger.proxy = str(self.proxy)
        httpLogger.proxy_inf = str(self.proxy_inf)
        httpLogger.retry_count = self.req_count
        for k in kw.keys():
            if k not in [
                    'method', 'url', 'params', 'data', 'headers', 'cookies',
                    'files', 'auth', 'timeout', 'allow_redirects', 'proxies',
                    'hooks', 'stream', 'verify', 'cert', 'json'
            ]:
                logger.warning(current_log_tag() +
                               '[出现不能解析的 req 请求参数][{0}]'.format(k))
        new_kw = {
            k: v
            for k, v in kw.items() if k in [
                'method', 'url', 'params', 'data', 'headers', 'cookies',
                'files', 'auth', 'timeout', 'allow_redirects', 'proxies',
                'hooks', 'stream', 'verify', 'cert', 'json'
            ]
        }
        ts = int(1000 * time.time())
        if data:
            httpLogger.data = data
            if isinstance(data, dict):
                httpLogger.data = _json.dumps(data, ensure_ascii=False)
        if json:
            httpLogger.data = json
            if isinstance(json, dict):
                httpLogger.data = _json.dumps(json, ensure_ascii=False)

        req_func = self.req_bind.get(method.lower())
        httpLogger.cookie = str(req_func.__self__.cookies._cookies)
        httpLogger.source = self.source
        httpLogger.headers = str(new_kw.get('headers', ""))
        try:
            logger.debug(current_log_tag() +
                         'browser req start {1} {0}'.format(url, method))
            logger.debug(current_log_tag() +
                         'browser req data {0}'.format(data))
            logger.debug(current_log_tag() +
                         'browser req json {0}'.format(json))
            logger.debug(current_log_tag() +
                         'browser req params {0}'.format(params))
            logger.debug(current_log_tag() +
                         'browser req other_data {0}'.format(new_kw))
            logger.debug(current_log_tag() +
                         'browser req session_cookie {0}'.format(
                             req_func.__self__.cookies._cookies))
        except:
            logger.debug(current_log_tag() + '请求前获取部分参数失败')
        try:
            local_resp = None
            # todo: API QPS limit
            # try:
            #     logger.debug(current_log_tag() + 'queue and qps config:{0}'.format(str(self.queue_info)))
            #     if not self.queue_info.get('source_name'):
            #         pass
            #     elif self.queue_info['source_name'] in limit_config.keys():
            #         try:
            #             cango = self.new_limit(self.queue_info, self.task_id)
            #         except Exception as why:
            #             logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
            #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit排队超时&reqError')
            #         if not cango:
            #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit排队超时')
            # except Exception as why:

            # logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
            self.resp = local_resp = req_func(url,
                                              params=params,
                                              data=data,
                                              json=json,
                                              timeout=timeout,
                                              verify=verify,
                                              **new_kw)
            logger.debug(
                current_log_tag() +
                'browser response headers:{0}'.format(self.resp.headers))
            ts = int(1000 * time.time()) - ts
            httpLogger.last_time = ts
            logger.debug(
                current_log_tag() +
                'browser req end {1} {0} proxy[{4}] ms[{2}] status[{3}] length[{5}]'
                .format(url, method, ts, local_resp.status_code, self.proxy,
                        resp_content_lenght(local_resp)))
            httpLogger.resp_code = local_resp.status_code
            if len(str(local_resp.content)) > 1000:
                content = str(local_resp.content)[:1000]
            else:
                content = str(local_resp.content)
            httpLogger.resp_content = content
            httpLogger.proxy_out = str(self.out_ip)
            httpLogger.proxy = str(self.proxy)
        except:
            httpLogger.exception = str(traceback.format_exc())
            logger.debug(current_log_tag() +
                         'browser req end {1} {0} proxy[{2}] error:{3}'.format(
                             url, method, self.proxy, traceback.format_exc()))
            try:
                logger.debug('\n' + httpLogger.logger_info)
            except Exception as why:
                logger.debug(str(why))
            raise
        try:
            logger.debug('\n' + httpLogger.logger_info)
        except Exception as why:
            logger.debug(str(why))
        return local_resp
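
Callers pass ordinary requests keyword arguments through req; anything outside the whitelist above is logged as unparseable and dropped. A hypothetical call (the URL, payload and headers are invented for illustration) might look like:

# Illustration only: browser is assumed to be an instance of the class
# that defines req(); the URL and payload are made up.
resp = browser.req('https://example.com/api/search',
                   method='post',
                   json={'city': 'PEK', 'page': 1},
                   headers={'User-Agent': 'Mozilla/5.0'},
                   timeout=30)
print(resp.status_code, len(resp.content))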
Example #18
    def parse(self,
              request_template,
              targets_bind,
              converted_data,
              page_index,
              required=None,
              multi_last=False):
        result = defaultdict(list)
        parsed = set()
        if not multi_last:
            parser_list = request_template.get('user_handler', [])
            for parser in parser_list:
                if parser not in parsed:
                    logger.debug(current_log_tag() + 'user parser %s', parser)
                    parser(request_template, converted_data)

        # Update result via the parsers
        def parse_result(parser):
            # Check whether this parser is needed and is among the required targets
            parser_name = parser.__name__.split('_', 1)[1]
            if parser_name in required:
                logger.debug(current_log_tag() + 'parse target %s',
                             parser_name)

                per_result = parser(request_template, converted_data)
                if per_result is not None:
                    if per_result:
                        start = datetime.datetime.now()
                        if isinstance(per_result, list):
                            # Add guest_info
                            store_utils.add_index_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result, page_index)
                            # Add stop-by info
                            store_utils.add_stop_by_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result,
                                self.spider.task)
                            result[parser_name].extend(per_result)
                        elif isinstance(per_result, dict):
                            result[parser_name].append(per_result)
                        logger.debug(current_log_tag() +
                                     '[结果保存][不使用压缩][用时: {0} ]'.format(
                                         datetime.datetime.now() - start))

        # Parse targets: hotels, rooms, etc.
        # for target, parser in targets_bind.items():
        if isinstance(self.binding,
                      Iterable) and not isinstance(self.binding, (str, bytes)):
            for binding in self.binding:
                # Handle the different binding types
                if binding is None:
                    continue
                elif isinstance(binding, (str, bytes)):
                    parser = targets_bind.get(binding, '')
                    if parser == '':
                        raise TypeError('无法从 targets 中获取 parser {0}'.format(binding))
                elif callable(binding):
                    parser = binding
                else:
                    raise TypeError('不支持绑定类型 {0} 的 {1}'.format(
                        type(binding), repr(binding)))
                # Update result
                parse_result(parser)

        elif isinstance(self.binding, (str, bytes)):
            parser = targets_bind.get(self.binding, '')
            if parser == '':
                raise TypeError('无法从 targets 中获取 parser {0}'.format(self.binding))

            # Update result
            parse_result(parser)

        elif callable(self.binding):
            parser = self.binding
            # Update result
            parse_result(parser)

        return result
Example #19
    def __crawl_data_str(self, request_template, browser):
        resp = None
        try:
            # Usage change: the user modifies values in request_template directly
            self.spider.prepare_request(request_template)

            # Get the req from request_template
            req = request_template['req']

            # Used for QPS control
            if hasattr(self.spider, 'queue_info'):
                browser.queue_info = self.spider.queue_info

            if hasattr(self.spider.task, 'req_qid'):
                browser.qid = self.spider.task.req_qid
            else:
                browser.qid = ""
            browser.task_id = self.spider.task.task_id
            browser.source = self.spider.task.source
            browser.tid = self.spider.task.tid
            browser.ori_type = self.spider.task.ori_type

            resp = browser.req(**req)
            # Network errors raise an exception here
            resp.raise_for_status()

            content_length = len(resp.content)
            if isinstance(self.need_content_length, int):
                logger.debug(current_log_tag() +
                             '[爬虫 content_length={1} 检测][页面长度需要大于 {0}]'.format(
                                 self.need_content_length, content_length))
                if content_length <= self.need_content_length:
                    raise parser_except.ParserException(
                        parser_except.PROXY_INVALID, msg='data is empty')
            elif self.need_content_length is None:
                logger.debug(current_log_tag() + '[爬虫无需 content_length 检测]')
            else:
                logger.debug(current_log_tag() +
                             '[未知 content_length 检测类型][type: {0}]'.format(
                                 str(type(self.need_content_length))))
            return resp, content_length
        # timeout
        except requests.exceptions.SSLError as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_SSL,
                                                msg=str(e),
                                                error=e)
        except requests.exceptions.ProxyError as e:  # proxy invalid
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg='Proxy Error',
                                                error=e)

        except requests.exceptions.ConnectTimeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request connect Timeout',
                                                error=e)
        except requests.exceptions.ReadTimeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request read Timeout',
                                                error=e)
        except requests.exceptions.Timeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request Timeout',
                                                error=e)

        except requests.exceptions.ConnectionError as err:
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err))

        except requests.exceptions.HTTPError as err:  # 4xx/5xx status codes are caught here
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err),
                                                error=err)

        except requests.exceptions.RequestException as err:  # the umbrella requests error
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err),
                                                error=err)
        except Exception as e:  # the final catch-all error
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=traceback.format_exc())
Example #20
 def user_append_reslut(self, target, result_items):
     self._result[target] += result_items
     logger.debug(current_log_tag() + "%s, length=%s, all=%s", target,
                  len(result_items), len(self._result.get(target, [])))
Example #21
class Spider(object):
    """
    """
    __metaclass__ = abc.ABCMeta
    # Source type
    source_type = ''
    # Crawl targets, e.g.: {'hotel':{}, 'room':{'version':'InsertNewFlight'}}
    targets = {}
    # Association with the old spider, e.g.: {'pricelineFlight': {'required': ['Flight']}}
    old_spider_tag = {}
    # Disable flag; enabled by default
    unable = False
    # Queueing
    queue_info = {}
    # Retry configuration
    retry_info = {'max_try': 1, 'retry_codes': []}

    def __init__(self, task=None):
        assert self.source_type != '', '缺失正确的抓取类型'
        assert self.targets != {}, '缺失正确的抓取 parser'
        assert len(self.targets) > 0, parser_except.ParserException(
            1, '必须指明解析目标')
        self.task = task
        self.task_id = ""
        self.spider_taskinfo = {}
        self.is_verify = False
        self.need_proxy = True
        self.use_selenium = False
        self.browser = None
        self.__cpu_time = 0
        self.debug = True
        self.extra = {}
        self.user_datas = dict()
        self.verify_data = {'data': []}
        self._asy_temp_result = defaultdict(list)
        self.task_post_process_queue = None
        self.code = -1
        self.cost_crawl_time = None

        self._result = defaultdict(list)
        self.__targets_parser_func_dict = {}
        self.targets_required = self.targets
        self._crawl_targets_required = self.targets_required
        self.debug_info = {'pages': []}
        self.process_callback = None
        # Used to save one async callback
        self.spider_frame_status = 0
        self.exception = None

        self.machine_type = None
        self.local_ip = None
        self.env = None

        for t in self.targets.keys():
            func_name = 'parse_' + t
            parse_func = getattr(self, func_name)
            self.__targets_parser_func_dict[t] = parse_func

    @func_time_logger
    def crawl(self):
        """
        """
        if hasattr(self.task, 'new_task_id'):
            cur_id = self.task.new_task_id
        else:
            cur_id = str(uuid.uuid1())
        self.spider_taskinfo = {'task_id': cur_id}
        getcurrent().spider_taskinfo = self.spider_taskinfo
        # Print task info
        for k, v in self.task.__dict__.items():
            self.spider_taskinfo[k] = v
            try:
                logger.info(current_log_tag() + '[任务信息][%s][%s]' %
                            (k, json.dumps(v)))
            except Exception:
                continue

        chains = self.targets_request()
        try:
            self.code = self.__crawl_by_chain(chains)
        except parser_except.ParserException as e:
            logger.exception(e)
            self.code = e.code
            self.exception = e.msg
            if e.retry_from_first:
                raise e
        return self.code

    @abc.abstractmethod
    def targets_request(self):
        """
        目标请求链:酒店列表、酒店详情、酒店评论等
        """

    def response_error(self, req, resp, error):
        """ 
        请求异常
        :param resp requests response
        :param error 异常
        """

        pass

    @property
    def task(self):
        return self._task

    @task.setter
    def task(self, task):
        if self.source_type.endswith(
                'Hotel') and task and "List" not in self.source_type:
            task = task_change_sass(task)
        self._task = task

    @func_time_logger
    def __crawl_by_chain(self, chains):
        """ 
        从请求链中辨别请求类型,分别丢入不同的请求方法
        单一请求,并行请求,串行请求
        """
        code = 0
        try:
            for reqParse in chains:
                gevent.sleep(0)
                browser = self.__create_browser(reqParse.new_session)
                reqParse.spider = self
                t_req = reqParse.request()

                if isinstance(t_req, types.DictType):  # single request
                    new_result = self.__single_crawl(reqParse, browser, t_req,
                                                     0)

                elif isinstance(t_req, types.ListType):
                    # The spider may return an empty list!
                    if t_req:
                        if reqParse.async:  # parallel crawl
                            list_result = self.__async_crawl_list(
                                reqParse, browser, t_req)
                        else:  # serial requests
                            list_result = self.__crawl_list(
                                reqParse, browser, t_req)
                        new_result, code = self.check_list_result(
                            list_result, code)  # $$$ could be optimized

                elif isinstance(t_req,
                                types.GeneratorType):  # requests produced by generator (yield) methods
                    list_result = self.__crawl_list(reqParse, browser, t_req)
                    new_result, code = self.check_list_result(
                        list_result, code)

                self.__spider_append_result(new_result)

            if self.use_selenium and browser.br:
                browser.close()
        except parser_except.ParserException as e:
            if self.use_selenium and browser.br:
                browser.close()
            logger.error(e)
            raise e
        except Exception:
            if self.use_selenium and browser.br:
                browser.close()
            logger.exception(current_log_tag() +
                             '[新框架 持续请求 未知问题][ {0} ]'.format(
                                 traceback.format_exc().replace('\n', '\t')))
            raise parser_except.ParserException(
                parser_except.UNKNOWN_ERROR,
                'e:{0}'.format(traceback.format_exc()))

        return code

    def __async_crawl_list(self, reqParse, browser, req_list):
        """
        并行抓取分页
        丢到协程池里
        """

        a_result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        params = []
        total_count = 0
        for req in req_list:
            total_count += 1
            params.append((reqParse, browser, req, total_count))

        result = block_async(pool, self.__single_crawl, params)

        success_count = 0
        error_req = []
        for a_res in result:
            err_or_data, is_data = a_res
            if is_data:
                success_count += 1
                all_except = False
                self.__target_append_result(a_result, err_or_data)
            else:
                all_ok = False
                args, kwargs, one_exception = err_or_data
                if hasattr(
                        one_exception,
                        'retry_from_first') and one_exception.retry_from_first:
                    raise one_exception
                error_req.append((args[2], one_exception.message))
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[翻页抓取][并行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[翻页抓取][并行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
        return a_result, all_except, all_ok, one_exception

    def __crawl_list(self, reqParse, browser, req_list):
        """
        串行抓取分页
        """
        result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        total_count = 0
        success_count = 0
        error_req = []
        for req in req_list:
            gevent.sleep(0)
            # Serial mode: the page-flip limit has been removed
            # if NEED_FLIP_LIMIT:
            #     if total_count >= MAX_FLIP:
            #         break
            total_count += 1
            try:
                res = self.__single_crawl(reqParse,
                                          browser,
                                          req,
                                          page_count=total_count)
                self.__target_append_result(result, res)
                success_count += 1
                all_except = False
            except Exception as e:
                all_ok = False
                one_exception = e
                error_req.append((req, one_exception.message))
                logger.exception(
                    current_log_tag() + '[新框架][页面解析异常][ {0} ]'.format(
                        traceback.format_exc().replace('\n', '\t')))

                # Re-raise exceptions coming from the generator part
                if isinstance(req, types.GeneratorType):
                    raise e
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[翻页抓取][串行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[翻页抓取][串行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
        return result, all_except, all_ok, one_exception
Example #22
    def __single_crawl(self, reqParse, browser, request_template, page_count):
        """ 用于请求的基本方法
        """
        # Headers from the request chain can be reused
        headers = request_template['req'].get('headers', None)
        use_headers = request_template['req'].get('use_headers', False)
        if headers:
            browser.add_header(headers, use_headers)

        # Set the default value of res
        res = defaultdict(list)

        # Initialize the request parameters

        local_req_count = 0
        reqParse.req_count = 0
        reqParse.is_forbidden = False
        reqParse.req_exception = None
        reqParse.proxy = None
        reqParse.content_length = 0

        self.__cpu_time += time.time() * 1000

        while local_req_count < reqParse.retry_count:
            # Increment the retry counter
            local_req_count += 1
            logger.debug(
                current_log_tag() +
                '[开始抓取][ {0} ]'.format(request_template['req'].get('url', '')))
            # The request count is passed in from outside so that proxy exceptions raised during parse can trigger a re-crawl
            try:
                resp = reqParse.crawl_data(request_template, browser,
                                           self.task.source)
            except parser_except.ParserException as e:
                traceback.print_exc()
                if reqParse.user_exc:
                    # Re-raise the error the user raised in their function
                    raise e
                # Error codes 21/22/23, or the developer explicitly asked for a retry
                if e.code in (parser_except.PROXY_FORBIDDEN,
                              parser_except.PROXY_INVALID,
                              parser_except.REQ_ERROR,
                              parser_except.PROXY_SSL) or e.need_retry:
                    reqParse.is_forbidden = True
                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() +
                                     traceback.format_exc())
                        logger.debug(current_log_tag() +
                                     '[准备重试][错误由框架抛出][错误码:{0}][count:{1}]'.
                                     format(e.code, reqParse.req_count))
                        continue
                else:
                    raise e
            except Exception as e:
                if reqParse.user_exc:
                    # Re-raise the error the user raised in their function
                    raise e
                if local_req_count >= reqParse.retry_count:
                    raise e
                else:
                    continue

            # Attach resp to the request
            request_template['resp'] = resp
            # Log and store the crawl result
            self.response_callback(request_template, resp)
            if reqParse.res_text == 'text':
                res = resp.text
            else:
                res = resp.content
            try:
                logger.debug(current_log_tag() +
                             '[抓取结果][ {2} ][ {0} ... ... {1} ]'.format(
                                 res[:100], res[-100:], request_template['req']
                                 ['url']).replace('\n', '').replace('\t', ''))
            except Exception:
                pass
            # When running locally, skip the upload step
            # import pdb; pdb.set_trace()
            if not self.debug and self.env != "local":
                md5_key = get_md5(res)
                verify_task_info = {
                    'func_name': reqParse.request_func.__name__,
                    'page_index': page_count,
                    'retry_count': local_req_count - 1,
                    'md5_key': md5_key
                }
                # Upload the crawled page to UCloud
                self.task_post_process_queue.put((res, self.task, md5_key))
                self.verify_data['data'].append(verify_task_info)

            point_time = time.time() * 1000
            try:
                convert_data = reqParse.convert(request_template, res)
            except Exception:
                if local_req_count >= reqParse.retry_count:
                    logger.debug(current_log_tag() + traceback.format_exc())
                    raise parser_except.ParserException(
                        parser_except.DATA_FORMAT_ERROR,
                        '[traceback: {0}]'.format(traceback.format_exc()))
                else:
                    continue
            finally:
                self.__cpu_time += time.time() * 1000 - point_time

            # Data parsing section
            point_time = time.time() * 1000
            try:
                res = reqParse.parse(request_template,
                                     self.__targets_parser_func_dict,
                                     convert_data, page_count,
                                     self._crawl_targets_required)

                break
            except parser_except.ParserException as e:
                if e.code in (parser_except.PROXY_FORBIDDEN,
                              parser_except.PROXY_INVALID):
                    reqParse.is_forbidden = True

                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() +
                                     '[准备重试][错误由爬虫抛出][错误码:{0}]'.format(e.code))
                        convert_data = None
                        continue
                else:
                    raise e
            except Exception:
                raise parser_except.ParserException(
                    parser_except.PARSE_ERROR,
                    '[traceback:{0}]'.format(traceback.format_exc()))
            finally:
                self.__cpu_time += time.time() * 1000 - point_time
                self.response_callback(request_template, resp)