def doPostProcessTask(res, task, key):
    # Upload the compressed verification page to UCloud, retrying on failure.
    logger.debug(current_log_tag() + '[验证页面开始上传]')
    handler = multipartuploadufile.MultipartUploadUFile(g_config.ucloud_public_key,
                                                        g_config.ucloud_private_key)
    stream = BytesIO(zlib.compress(res))
    ret, resp = handler.uploadstream(g_config.ucloud_bucket, key, stream)
    if resp.status_code == 200:
        logger.debug(current_log_tag() + '[验证页面上传结束] md5:{0}'.format(key))
        return True
    # Retry up to 2 times on failure.
    retry_times = 2
    while resp.status_code != 200 and retry_times:
        retry_times -= 1
        ret, resp = handler.resumeuploadstream()
        if resp.status_code == 200:
            logger.debug(current_log_tag() + '[验证页面上传结束] md5:{0}'.format(key))
            return True
    # All attempts failed: record the exception and give up.
    except_logger = ExceptionLogger()
    except_logger.qid = task.req_qid
    except_logger.type = "PUSH_MD5_ERROR"
    except_logger.debug = json.dumps({"task_id": task.new_task_id, "source": task.source})
    logger.debug("\n" + except_logger.logger_info)
    logger.debug(current_log_tag() + '[验证页面上传失败] md5:{0}'.format(key))
    return False

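# A minimal sketch (assumption, not part of the framework source) of how a background
# worker might drain task_post_process_queue and feed doPostProcessTask. The spider
# enqueues (res, task, md5_key) tuples in __single_crawl; everything else here
# (worker name, queue type, stop sentinel) is hypothetical.
def post_process_worker(task_post_process_queue):
    while True:
        item = task_post_process_queue.get()
        if item is None:  # hypothetical stop sentinel
            break
        res, task, md5_key = item
        try:
            doPostProcessTask(res, task, md5_key)
        except Exception:
            logger.exception('[验证页面上传 worker 异常]')
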
def parse_result(parser):
    # Only run the parser if it is actually among the required parse targets.
    parser_name = parser.__name__.split('_', 1)[1]
    if parser_name in required:
        logger.debug(current_log_tag() + 'parse target %s', parser_name)
        per_result = parser(request_template, converted_data)
        if per_result is not None:
            if per_result:
                start = datetime.datetime.now()
                if isinstance(per_result, list):
                    # Attach guest_info.
                    store_utils.add_index_info(
                        self.spider.targets.get(parser_name, {}).get('version', None),
                        per_result, page_index)
                    # Attach stop_by info.
                    store_utils.add_stop_by_info(
                        self.spider.targets.get(parser_name, {}).get('version', None),
                        per_result, self.spider.task)
                    result[parser_name].extend(per_result)
                elif isinstance(per_result, dict):
                    result[parser_name].append(per_result)
                logger.debug(current_log_tag() +
                             '[结果保存][不使用压缩][用时: {0} ]'.format(
                                 datetime.datetime.now() - start))

def logging(*args, **kw):
    func_count_dict[fun.__name__] += 1
    begin = datetime.now()
    logger.debug(current_log_tag() + '函数 {0} call start'.format(fun.__name__))
    result = fun(*args, **kw)
    end = datetime.now()
    logger.debug(current_log_tag() + '函数 {0} call end'.format(fun.__name__))
    # logger.debug(current_log_tag() + ',函数,%s,耗时,%s,当前运行,%s,个此函数,当前,%s,协程',
    #              fun.__name__, (end - begin), func_count_dict[fun.__name__],
    #              mioji.common.pool.pool.size)
    func_count_dict[fun.__name__] -= 1
    return result

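# A minimal sketch (assumption) of the decorator this wrapper likely belongs to.
# Spider.crawl is decorated with @func_time_logger, and the wrapper above closes over
# `fun` and a module-level func_count_dict; the actual decorator body is not shown in
# these excerpts, so the sketch below is an illustration only and is given a distinct
# name to avoid shadowing the real one.
from collections import defaultdict
from functools import wraps

_sketch_func_count_dict = defaultdict(int)  # stand-in for the module's func_count_dict

def _func_time_logger_sketch(fun):
    @wraps(fun)
    def logging(*args, **kw):
        _sketch_func_count_dict[fun.__name__] += 1
        logger.debug(current_log_tag() + '函数 {0} call start'.format(fun.__name__))
        try:
            return fun(*args, **kw)
        finally:
            logger.debug(current_log_tag() + '函数 {0} call end'.format(fun.__name__))
            _sketch_func_count_dict[fun.__name__] -= 1
    return logging
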
def crawl(self):
    """ Entry point that starts the crawl and returns the final status code. """
    if hasattr(self.task, 'new_task_id'):
        cur_id = self.task.new_task_id
    else:
        cur_id = str(uuid.uuid1())
    self.spider_taskinfo = {'task_id': cur_id}
    getcurrent().spider_taskinfo = self.spider_taskinfo

    # Log the task info.
    for k, v in self.task.__dict__.items():
        self.spider_taskinfo[k] = v
        try:
            logger.info(current_log_tag() + '[任务信息][%s][%s]' % (k, json.dumps(v)))
        except Exception:
            continue

    chains = self.targets_request()
    try:
        self.code = self.__crawl_by_chain(chains)
    except parser_except.ParserException as e:
        logger.exception(e)
        self.code = e.code
        self.exception = e.msg
        if e.retry_from_first:
            raise e
    return self.code

def add_stop_by_info(versions, result, task):
    # Locate the column that holds the stop-by info.
    stop_by_index = TICKET_INFO_INDEX.get(versions, None)
    if stop_by_index is None:
        logger.debug('[未找到 stop_by_index][versions: {0}]'.format(versions))
        # try:
        #     from common import db
        #     sql = 'REPLACE INTO new_frame_not_replace_stop_by (ip, versions) VALUES (%s, %s)'
        #     db.execute_into_spider_db(sql, (get_local_ip(), versions))
        # except Exception as e:
        #     logger.warning('[未成功入 未找到 stop_by 库][ERROR: {0}]'.format(e))
        return
    logger.debug(current_log_tag() +
                 '[修改 stop_by_info][versions: {0}][位置 {1}]'.format(versions, stop_by_index))
    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])
            # Seat classes: E economy, P premium economy, B business, F first.
            task_stop_by_info = task.ticket_info.get('v_seat_type', None) or 'E'
            if versions == 'InsertMultiFlight':
                result[__i][stop_by_index] = '{0}&NULL'.format(task_stop_by_info)
            else:
                result[__i][stop_by_index] = task_stop_by_info
            result[__i] = tuple(result[__i])

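# Illustrative usage sketch (assumptions: TICKET_INFO_INDEX is taken to map
# 'InsertMultiFlight' to column 3, and _FakeTask is a stand-in for the real task
# object; neither comes from the source).
def _demo_add_stop_by_info():
    class _FakeTask(object):
        ticket_info = {'v_seat_type': 'B'}

    rows = [('MU123', 'SHA', 'PEK', 'old')]
    add_stop_by_info('InsertMultiFlight', rows, _FakeTask())
    # rows is now [('MU123', 'SHA', 'PEK', 'B&NULL')]: InsertMultiFlight rows get the
    # '<seat>&NULL' form, every other version key would get just 'B'.
    return rows
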
def crawl(self):
    """
    External entry point that starts the spider.
    Crawling only begins once this method is called.
    :return: the final status code
    """
    # todo self.__create_browser()
    cur_id = str(uuid.uuid1())
    if hasattr(self.task, 'new_task_id'):
        cur_id = self.task.new_task_id
    self.spider_taskinfo = {'task_id': cur_id}

    for k, v in self.task.__dict__.items():
        self.spider_taskinfo[k] = v
        try:
            logger.info(current_log_tag() + '[任务信息][%s][%s]' % (k, json.dumps(v)))
        except Exception:
            continue

    chains = self.targets_request()
    try:
        self.code = self.__crawl_by_chain(chains)
    except parser_except.ParserException as e:
        logger.exception(e)
        self.code = e.code
        self.exception = e.msg
        if e.retry_from_first:
            raise e

    # Derive the final error code from all collected results.
    self.check_all_result()
    return self.code

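# A minimal usage sketch (assumption): MySpider and the task object are hypothetical;
# only crawl() and the attributes it sets (code, exception) come from the framework
# excerpts shown here.
def run_spider(task):
    spider = MySpider(task)      # hypothetical Spider subclass
    code = spider.crawl()        # blocks until the whole request chain finishes
    if spider.exception:
        logger.debug('crawl failed, code={0}, msg={1}'.format(code, spider.exception))
    return spider
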
def __crawl_list(self, reqParse, browser, req_list):
    """ Crawl paginated requests serially. """
    result = defaultdict(list)
    all_except = True
    all_ok = True
    one_exception = None
    total_count = 0
    success_count = 0
    error_req = []
    for req in req_list:
        # The page-count limit for serial crawling is disabled:
        # if NEED_FLIP_LIMIT:
        #     if total_count >= MAX_FLIP:
        #         break
        total_count += 1
        try:
            res = self.__single_crawl(reqParse, browser, req, page_count=total_count)
            self.__target_append_result(result, res)
            success_count += 1
            all_except = False
        except Exception as e:
            all_ok = False
            one_exception = e
            error_req.append((req, one_exception.message))
            logger.exception(
                current_log_tag() + '[新框架][页面解析异常][ {0} ]'.format(
                    traceback.format_exc().replace('\n', '\t')))
            # Re-raise exceptions coming from a generator request.
            if isinstance(req, types.GeneratorType):
                raise e
    if reqParse.binding:
        self.success_count = success_count
        self.all_count = total_count
    logger.debug(
        current_log_tag() +
        '[翻页抓取][串行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
    if error_req:
        logger.debug(current_log_tag() +
                     '[翻页抓取][串行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
    return result, all_except, all_ok, one_exception

def __async_crawl_list(self, reqParse, browser, req_list):
    """ Crawl paginated requests in parallel on the coroutine pool. """
    a_result = defaultdict(list)
    all_except = True
    all_ok = True
    one_exception = None
    params = []
    total_count = 0
    for req in req_list:
        total_count += 1
        params.append((reqParse, browser, req, total_count))
    result = block_async(pool, self.__single_crawl, params)
    success_count = 0
    error_req = []
    for a_res in result:
        err_or_data, is_data = a_res
        if is_data:
            success_count += 1
            all_except = False
            self.__target_append_result(a_result, err_or_data)
        else:
            all_ok = False
            args, kwargs, one_exception = err_or_data
            if hasattr(one_exception, 'retry_from_first') and one_exception.retry_from_first:
                raise one_exception
            error_req.append((args[2], one_exception.message))
    if reqParse.binding:
        self.success_count = success_count
        self.all_count = total_count
    logger.debug(
        current_log_tag() +
        '[翻页抓取][并行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
    if error_req:
        logger.debug(current_log_tag() +
                     '[翻页抓取][并行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
    return a_result, all_except, all_ok, one_exception

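# A minimal sketch (assumption) of the block_async helper this method relies on.
# The real implementation lives elsewhere in the framework; from the usage above it
# must run func over a gevent pool and return, per call, either (result, True) or
# ((args, kwargs, exception), False). Pool type, join semantics and the name
# _block_async_sketch are guesses made only to illustrate that contract.
import gevent

def _block_async_sketch(pool, func, params):
    def _run(args):
        try:
            return func(*args), True
        except Exception as e:
            return (args, {}, e), False

    greenlets = [pool.spawn(_run, args) for args in params]
    gevent.joinall(greenlets)
    return [g.value for g in greenlets]
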
def result(self):
    try:
        for k, v in self._result.items():
            logger.debug(current_log_tag() +
                         '[抓取结果][key: {0}][value_len: {1}]'.format(k, len(v)))
    except Exception:
        pass
    return self._result

def crawl_data(self, request_template, browser, source_name):
    """
    Fetch one page.
    :param request_template: request dict
    :param browser: browser used for the fetch
    :param source_name: source name
    :return: the response object of the fetch
    """
    try:
        logger.debug(current_log_tag() + 'crawl %s, retry_count: %s',
                     self.__request_func.__name__, self.req_count)
        # Attach a proxy to the browser.
        self.browser_set_proxy(browser, source_name)
        resp, self.content_length = self.__crawl_data_str(request_template, browser)
        # TODO: rework what user_retry returns.
        if self.user_retry:
            try:
                user_check = self.spider.user_retry_err_or_resp(
                    resp, self.req_count, request_template, False)
            except Exception as e:
                self.user_exc = True
                raise e
            # The user hook returned True: accept the response.
            if user_check:
                return resp
            else:
                raise parser_except.ParserException(parser_except.PROXY_INVALID, '代理异常')
        else:
            return resp
    except parser_except.ParserException as e:
        self.is_forbidden = e.code in (parser_except.PROXY_FORBIDDEN,
                                       parser_except.REQ_ERROR)
        self.req_exception = e
    except Exception as e:
        self.req_exception = parser_except.ParserException(
            parser_except.REQ_ERROR, 'req exception:{0}'.format(e))
        # If the exception came from the user hook, keep the user's exception.
        if self.user_exc:
            if isinstance(e, parser_except.ParserException):
                self.req_exception = e
    finally:
        if self.req_exception:
            code = self.req_exception.code
        else:
            code = 0
        if self.req_exception:
            raise self.req_exception

def __crawl_by_chain(self, chains):
    """
    Dispatch every request in the chain to the matching crawl strategy.
    :param chains: list of request parsers
    :return: the resulting status code
    """
    code = 0
    try:
        for reqParse in chains:
            # gevent.sleep(0)
            browser = self.__create_browser(reqParse.new_session)
            reqParse.spider = self
            new_result = defaultdict(list)
            t_req = reqParse.request()
            if isinstance(t_req, dict):
                # A single request.
                new_result = self.__single_crawl(reqParse, browser, t_req, 0)
            elif isinstance(t_req, list):
                # A spider may legitimately return an empty list!
                if t_req:
                    if reqParse.asynchronous:
                        # Parallel crawl.
                        list_result = self.__async_crawl_list(reqParse, browser, t_req)
                    else:
                        # Serial crawl.
                        list_result = self.__crawl_list(reqParse, browser, t_req)
                    new_result, code = self.check_list_result(list_result, code)
            # $$$ could be optimized
            elif isinstance(t_req, types.GeneratorType):
                # Requests produced by a yield-based (generator) request method.
                list_result = self.__crawl_list(reqParse, browser, t_req)
                new_result, code = self.check_list_result(list_result, code)
            self.__spider_append_result(new_result)
            if self.use_selenium and browser.br:
                browser.close()
    except parser_except.ParserException as e:
        if self.use_selenium and browser.br:
            browser.close()
        logger.error(e)
        raise e
    except Exception:
        if self.use_selenium and browser.br:
            browser.close()
        logger.exception(current_log_tag() +
                         '[新框架 持续请求 未知问题][ {0} ]'.format(
                             traceback.format_exc().replace('\n', '\t')))
        raise parser_except.ParserException(
            parser_except.UNKNOWN_ERROR, 'e:{0}'.format(traceback.format_exc()))
    return code

def __target_append_result(result, new_result):
    """
    Merge new_result into result.
    :param result: the dict being appended to
    :param new_result: the dict being merged in
    :return: None; the dict is shared and mutated in place.
    """
    for k, v in new_result.items():
        if not v:
            continue
        logger.debug(current_log_tag() + "%s, length=%s, all=%s",
                     k, len(v), len(result.get(k, [])))
        result[k] += v

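# Illustrative behaviour of the merge above (assumptions: result is a defaultdict(list),
# as at every call site in these excerpts, and the helper is reachable here as a plain
# function rather than a private method).
def _demo_target_append_result():
    result = defaultdict(list)
    __target_append_result(result, {'hotel': [('h1',)], 'room': []})
    __target_append_result(result, {'hotel': [('h2',)]})
    # result == {'hotel': [('h1',), ('h2',)]}: the empty 'room' list is skipped and
    # existing keys are extended, not replaced.
    return result
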
def __spider_append_result(self, new_result):
    """
    Merge a parse result into self._result, honouring any target binding.
    :param new_result: must be a parse result
    :return: None; invokes the callback method
    """
    for k, v in new_result.items():
        if not v:
            continue
        data_bind = self.targets[k].get('bind', None)
        if data_bind:
            logger.debug(current_log_tag() +
                         "[ 抓取绑定 {0} ][ 数据绑定 {1} ]".format(k, data_bind))
            self._result[data_bind] += v
            logger.debug(current_log_tag() + "%s, length=%s, all=%s",
                         k, len(v), len(self._result.get(k, [])))
        else:
            self._result[k] += v
            logger.debug(current_log_tag() + "%s, length=%s, all=%s",
                         k, len(v), len(self._result.get(k, [])))

def convert(self, request_template, data):
    data_con = request_template.get('data', {})
    c_type = data_con.get('content_type', 'string')
    logger.debug(current_log_tag() + 'Converter got content_type: %s', c_type)
    if c_type == 'html':
        return HTML.fromstring(data)
    elif c_type == 'json':
        return json.loads(data)
    elif isinstance(c_type, types.MethodType):
        try:
            return c_type(request_template, data)
        except Exception:
            raise parser_except.ParserException(
                -1, 'convert func error: {0}, func: {1}'.format(
                    traceback.format_exc(), c_type))
    else:
        return data

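# A minimal sketch (assumption) of how a spider could plug in its own converter: per
# the MethodType branch above, content_type may be a bound method that receives
# (request_template, data) and returns the converted object. The class name, URL and
# JSONP example below are illustrative only.
class _JsonpConverterSketch(object):
    def strip_jsonp(self, request_template, data):
        # e.g. turn 'callback({"a": 1})' into {'a': 1}
        body = data[data.index('(') + 1:data.rindex(')')]
        return json.loads(body)

    def list_request(self):
        return {
            'req': {'url': 'http://example.com/api', 'method': 'get'},
            'data': {'content_type': self.strip_jsonp},
        }
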
def __crawl_list(self, reqParse, browser, req_list):
    """ Crawl paginated requests serially. """
    result = defaultdict(list)
    all_except = True
    all_ok = True
    one_exception = None
    total_count = 0
    success_count = 0
    error_req = []
    for req in req_list:
        gevent.sleep(0)
        # The page-count limit for serial crawling is disabled:
        # if NEED_FLIP_LIMIT:
        #     if total_count >= MAX_FLIP:
        #         break
        total_count += 1
        try:
            res = self.__single_crawl(reqParse, browser, req, page_count=total_count)
            self.__target_append_result(result, res)
            success_count += 1
            all_except = False
        except Exception as e:
            all_ok = False
            one_exception = e
            error_req.append((req, one_exception.message))
            logger.exception(
                current_log_tag() + '[新框架][页面解析异常][ {0} ]'.format(
                    traceback.format_exc().replace('\n', '\t')))
            # Re-raise exceptions coming from a generator request.
            if isinstance(req, types.GeneratorType):
                raise e

def add_index_info(versions, result, page_index):
    # Locate the column that holds the guest info.
    index_info_index = INDEX_INFO_INDEX.get(versions, None)
    if index_info_index is None:
        return
    logger.debug(current_log_tag() +
                 '[修改 index_info][versions: {0}][位置 {1}]'.format(versions, index_info_index))
    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])
            old_index_info = result[__i][index_info_index]
            try:
                old_index_info = json.loads(old_index_info)
                if not isinstance(old_index_info, dict):
                    raise Exception('Type Is Not Dict')
            except Exception:
                old_index_info = {'unparse_info': old_index_info}
            index_info = {k: v for k, v in old_index_info.items()}
            index_info['page_index'] = page_index
            index_info['item_index'] = __i
            result[__i][index_info_index] = json.dumps(index_info)
            result[__i] = tuple(result[__i])

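# Illustrative usage sketch (assumption: INDEX_INFO_INDEX is taken to map the
# hypothetical version 'InsertNewFlight' to column 2; the row below is made up).
def _demo_add_index_info():
    rows = [('MU123', 'SHA', '{"fare": "Y"}')]
    add_index_info('InsertNewFlight', rows, page_index=3)
    # rows[0][2] now serialises {'fare': 'Y', 'page_index': 3, 'item_index': 0}
    # (key order may vary); a non-JSON column value would instead be preserved
    # under the 'unparse_info' key.
    return rows
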
def req(self, url, method='get', params=None, data=None, json=None,
        timeout=60, verify=False, **kw):
    httpLogger = HttpLogger()
    httpLogger = copy.deepcopy(httpLogger)
    httpLogger.qid = self.qid
    httpLogger.task_id = self.task_id
    httpLogger.req_type = method
    httpLogger.source = self.source
    httpLogger.url = url
    httpLogger.proxy_out = str(self.out_ip)
    httpLogger.proxy = str(self.proxy)
    httpLogger.proxy_inf = str(self.proxy_inf)
    httpLogger.retry_count = self.req_count

    allowed_keys = [
        'method', 'url', 'params', 'data', 'headers', 'cookies', 'files',
        'auth', 'timeout', 'allow_redirects', 'proxies', 'hooks', 'stream',
        'verify', 'cert', 'json'
    ]
    for k in kw.keys():
        if k not in allowed_keys:
            logger.warning(current_log_tag() + '[出现不能解析的 req 请求参数][{0}]'.format(k))
    new_kw = {k: v for k, v in kw.items() if k in allowed_keys}

    ts = int(1000 * time.time())
    if data:
        httpLogger.data = data
        if isinstance(data, dict):
            httpLogger.data = _json.dumps(data, ensure_ascii=False)
    if json:
        httpLogger.data = json
        if isinstance(json, dict):
            httpLogger.data = _json.dumps(json, ensure_ascii=False)

    req_func = self.req_bind.get(method.lower())
    httpLogger.cookie = str(req_func.__self__.cookies._cookies)
    httpLogger.headers = str(new_kw.get('headers', ""))

    try:
        logger.debug(current_log_tag() + 'browser req start {1} {0}'.format(url, method))
        logger.debug(current_log_tag() + 'browser req data {0}'.format(data))
        logger.debug(current_log_tag() + 'browser req json {0}'.format(json))
        logger.debug(current_log_tag() + 'browser req params {0}'.format(params))
        logger.debug(current_log_tag() + 'browser req other_data {0}'.format(new_kw))
        logger.debug(current_log_tag() + 'browser req session_cookie {0}'.format(
            req_func.__self__.cookies._cookies))
    except Exception:
        logger.debug(current_log_tag() + '请求前获取部分参数失败')

    try:
        local_resp = None
        # todo API qps 限制
        # try:
        #     logger.debug(current_log_tag() + 'queue and qps config:{0}'.format(str(self.queue_info)))
        #     if not self.queue_info.get('source_name'):
        #         pass
        #     elif self.queue_info['source_name'] in limit_config.keys():
        #         try:
        #             cango = self.new_limit(self.queue_info, self.task_id)
        #         except Exception as why:
        #             logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
        #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit排队超时&reqError')
        #         if not cango:
        #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit排队超时')
        # except Exception as why:
        #     logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
        self.resp = local_resp = req_func(url, params=params, data=data, json=json,
                                          timeout=timeout, verify=verify, **new_kw)
        logger.debug(current_log_tag() +
                     'browser response headers:{0}'.format(self.resp.headers))
        ts = int(1000 * time.time()) - ts
        httpLogger.last_time = ts
        logger.debug(
            current_log_tag() +
            'browser req end {1} {0} proxy[{4}] ms[{2}] status[{3}] length[{5}]'.format(
                url, method, ts, local_resp.status_code, self.proxy,
                resp_content_lenght(local_resp)))
        httpLogger.resp_code = local_resp.status_code
        if len(str(local_resp.content)) > 1000:
            content = str(local_resp.content)[:1000]
        else:
            content = str(local_resp.content)
        httpLogger.resp_content = content
        httpLogger.proxy_out = str(self.out_ip)
        httpLogger.proxy = str(self.proxy)
    except Exception:
        httpLogger.exception = str(traceback.format_exc())
        logger.debug(current_log_tag() +
                     'browser req end {1} {0} proxy[{2}] error:{3}'.format(
                         url, method, self.proxy, traceback.format_exc()))
        try:
            logger.debug('\n' + httpLogger.logger_info)
        except Exception as why:
            logger.debug(str(why))
        raise
    try:
        logger.debug('\n' + httpLogger.logger_info)
    except Exception as why:
        logger.debug(str(why))
    return local_resp

def parse(self, request_template, targets_bind, converted_data, page_index,
          required=None, multi_last=False):
    result = defaultdict(list)
    parsed = set()
    if not multi_last:
        parser_list = request_template.get('user_handler', [])
        for parser in parser_list:
            if parser not in parsed:
                logger.debug(current_log_tag() + 'user parser %s', parser)
                parser(request_template, converted_data)

    # Update result through the bound parser.
    def parse_result(parser):
        # Only run the parser if it is among the required parse targets.
        parser_name = parser.__name__.split('_', 1)[1]
        if parser_name in required:
            logger.debug(current_log_tag() + 'parse target %s', parser_name)
            per_result = parser(request_template, converted_data)
            if per_result is not None:
                if per_result:
                    start = datetime.datetime.now()
                    if isinstance(per_result, list):
                        # Attach guest_info.
                        store_utils.add_index_info(
                            self.spider.targets.get(parser_name, {}).get('version', None),
                            per_result, page_index)
                        # Attach stop_by info.
                        store_utils.add_stop_by_info(
                            self.spider.targets.get(parser_name, {}).get('version', None),
                            per_result, self.spider.task)
                        result[parser_name].extend(per_result)
                    elif isinstance(per_result, dict):
                        result[parser_name].append(per_result)
                    logger.debug(current_log_tag() +
                                 '[结果保存][不使用压缩][用时: {0} ]'.format(
                                     datetime.datetime.now() - start))

    # Parse the bound targets: hotel, room, etc.
    # for target, parser in targets_bind.items():
    if isinstance(self.binding, Iterable) and not isinstance(self.binding, (str, bytes)):
        for binding in self.binding:
            # Tolerate the different kinds of binding entries.
            if binding is None:
                continue
            elif isinstance(binding, (str, bytes)):
                parser = targets_bind.get(binding, '')
                if parser == '':
                    raise TypeError('无法从 targets 中获取 parser {0}'.format(binding))
            elif callable(binding):
                parser = binding
            else:
                raise TypeError('不支持绑定类型 {0} 的 {1}'.format(
                    type(binding), repr(binding)))
            # Update result.
            parse_result(parser)
    elif isinstance(self.binding, (str, bytes)):
        parser = targets_bind.get(self.binding, '')
        if parser == '':
            raise TypeError('无法从 targets 中获取 parser {0}'.format(self.binding))
        # Update result.
        parse_result(parser)
    elif callable(self.binding):
        parser = self.binding
        # Update result.
        parse_result(parser)
    return result

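# Illustrative binding values accepted by the dispatch above (assumption: the 'hotel'
# target and parse_room parser are made up; only the three accepted shapes come from
# the branches above).
def _example_bindings(spider):
    return [
        'hotel',                             # a target name, resolved via targets_bind
        spider.parse_room,                   # any callable is used as the parser directly
        ['hotel', spider.parse_room, None],  # a list mixing both; None entries are skipped
    ]
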
def __crawl_data_str(self, request_template, browser):
    resp = None
    try:
        # The user hook mutates request_template in place before the request goes out.
        self.spider.prepare_request(request_template)
        # Pull the actual requests kwargs out of the template.
        req = request_template['req']
        # Used for QPS control.
        if hasattr(self.spider, 'queue_info'):
            browser.queue_info = self.spider.queue_info
        if hasattr(self.spider.task, 'req_qid'):
            browser.qid = self.spider.task.req_qid
        else:
            browser.qid = ""
        browser.task_id = self.spider.task.task_id
        browser.source = self.spider.task.source
        browser.tid = self.spider.task.tid
        browser.ori_type = self.spider.task.ori_type
        resp = browser.req(**req)
        # Network-level errors raise here.
        resp.raise_for_status()
        content_length = len(resp.content)
        if isinstance(self.need_content_length, int):
            logger.debug(current_log_tag() +
                         '[爬虫 content_length={1} 检测][页面长度需要大于 {0}]'.format(
                             self.need_content_length, content_length))
            if content_length <= self.need_content_length:
                raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                    msg='data is empty')
        elif self.need_content_length is None:
            logger.debug(current_log_tag() + '[爬虫无需 content_length 检测]')
        else:
            logger.debug(current_log_tag() +
                         '[未知 content_length 检测类型][type: {0}]'.format(
                             str(type(self.need_content_length))))
        return resp, content_length
    except requests.exceptions.SSLError as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_SSL, msg=str(e), error=e)
    except requests.exceptions.ProxyError as e:
        # The proxy is no longer usable.
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg='Proxy Error', error=e)
    except requests.exceptions.ConnectTimeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request connect Timeout', error=e)
    except requests.exceptions.ReadTimeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request read Timeout', error=e)
    except requests.exceptions.Timeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request Timeout', error=e)
    except requests.exceptions.ConnectionError as err:
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID, msg=str(err))
    except requests.exceptions.HTTPError as err:
        # 4xx / 5xx status codes are caught here.
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=str(err), error=err)
    except requests.exceptions.RequestException as err:
        # Catch-all for every other requests error.
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=str(err), error=err)
    except Exception as e:
        # Final catch-all.
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=traceback.format_exc())

def user_append_reslut(self, target, result_items):
    self._result[target] += result_items
    logger.debug(current_log_tag() + "%s, length=%s, all=%s",
                 target, len(result_items), len(self._result.get(target, [])))

class Spider(object):
    """ """
    __metaclass__ = abc.ABCMeta

    # Source type.
    source_type = ''
    # Crawl targets, e.g. {'hotel': {}, 'room': {'version': 'InsertNewFlight'}}
    targets = {}
    # Mapping to the legacy spiders, e.g. {'pricelineFlight': {'required': ['Flight']}}
    old_spider_tag = {}
    # Disabled flag; spiders are enabled by default.
    unable = False
    # Queueing configuration.
    queue_info = {}
    # Retry configuration.
    retry_info = {'max_try': 1, 'retry_codes': []}

    def __init__(self, task=None):
        assert self.source_type != '', '缺失正确的抓取类型'
        assert self.targets != {}, '缺失正确的抓取 parser'
        assert len(self.targets) > 0, parser_except.ParserException(1, '必须指明解析目标')
        self.task = task
        self.task_id = ""
        self.spider_taskinfo = {}
        self.is_verify = False
        self.need_proxy = True
        self.use_selenium = False
        self.browser = None
        self.__cpu_time = 0
        self.debug = True
        self.extra = {}
        self.user_datas = dict()
        self.verify_data = {'data': []}
        self._asy_temp_result = defaultdict(list)
        self.task_post_process_queue = None
        self.code = -1
        self.cost_crawl_time = None
        self._result = defaultdict(list)
        self.__targets_parser_func_dict = {}
        self.targets_required = self.targets
        self._crawl_targets_required = self.targets_required
        self.debug_info = {'pages': []}
        self.process_callback = None  # used to avoid one extra asynchronous callback
        self.spider_frame_status = 0
        self.exception = None
        self.machine_type = None
        self.local_ip = None
        self.env = None
        for t in self.targets.keys():
            func_name = 'parse_' + t
            parse_func = getattr(self, func_name)
            self.__targets_parser_func_dict[t] = parse_func

    @func_time_logger
    def crawl(self):
        """ Entry point that starts the crawl and returns the final status code. """
        if hasattr(self.task, 'new_task_id'):
            cur_id = self.task.new_task_id
        else:
            cur_id = str(uuid.uuid1())
        self.spider_taskinfo = {'task_id': cur_id}
        getcurrent().spider_taskinfo = self.spider_taskinfo

        # Log the task info.
        for k, v in self.task.__dict__.items():
            self.spider_taskinfo[k] = v
            try:
                logger.info(current_log_tag() + '[任务信息][%s][%s]' % (k, json.dumps(v)))
            except Exception:
                continue

        chains = self.targets_request()
        try:
            self.code = self.__crawl_by_chain(chains)
        except parser_except.ParserException as e:
            logger.exception(e)
            self.code = e.code
            self.exception = e.msg
            if e.retry_from_first:
                raise e
        return self.code

    @abc.abstractmethod
    def targets_request(self):
        """ Request chain for the targets: hotel list, hotel detail, hotel reviews, etc. """

    def response_error(self, req, resp, error):
        """
        Hook called when a request fails.
        :param resp: requests response
        :param error: the exception raised
        """
        pass

    @property
    def task(self):
        return self._task

    @task.setter
    def task(self, task):
        if self.source_type.endswith('Hotel') and task and "List" not in self.source_type:
            task = task_change_sass(task)
        self._task = task

    @func_time_logger
    def __crawl_by_chain(self, chains):
        """
        Work out the kind of each request in the chain and dispatch it to the
        matching crawl strategy: single, parallel or serial.
        """
        code = 0
        try:
            for reqParse in chains:
                gevent.sleep(0)
                browser = self.__create_browser(reqParse.new_session)
                reqParse.spider = self
                new_result = defaultdict(list)
                t_req = reqParse.request()
                if isinstance(t_req, types.DictType):
                    # A single request.
                    new_result = self.__single_crawl(reqParse, browser, t_req, 0)
                elif isinstance(t_req, types.ListType):
                    # A spider may legitimately return an empty list!
                    if t_req:
                        if reqParse.async:
                            # Parallel crawl.
                            list_result = self.__async_crawl_list(reqParse, browser, t_req)
                        else:
                            # Serial crawl.
                            list_result = self.__crawl_list(reqParse, browser, t_req)
                        new_result, code = self.check_list_result(list_result, code)
                # $$$ could be optimized
                elif isinstance(t_req, types.GeneratorType):
                    # Requests produced by a yield-based (generator) request method.
                    list_result = self.__crawl_list(reqParse, browser, t_req)
                    new_result, code = self.check_list_result(list_result, code)
                self.__spider_append_result(new_result)
                if self.use_selenium and browser.br:
                    browser.close()
        except parser_except.ParserException as e:
            if self.use_selenium and browser.br:
                browser.close()
            logger.error(e)
            raise e
        except Exception:
            if self.use_selenium and browser.br:
                browser.close()
            logger.exception(current_log_tag() +
                             '[新框架 持续请求 未知问题][ {0} ]'.format(
                                 traceback.format_exc().replace('\n', '\t')))
            raise parser_except.ParserException(
                parser_except.UNKNOWN_ERROR, 'e:{0}'.format(traceback.format_exc()))
        return code

    def __async_crawl_list(self, reqParse, browser, req_list):
        """ Crawl paginated requests in parallel on the coroutine pool. """
        a_result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None
        params = []
        total_count = 0
        for req in req_list:
            total_count += 1
            params.append((reqParse, browser, req, total_count))
        result = block_async(pool, self.__single_crawl, params)
        success_count = 0
        error_req = []
        for a_res in result:
            err_or_data, is_data = a_res
            if is_data:
                success_count += 1
                all_except = False
                self.__target_append_result(a_result, err_or_data)
            else:
                all_ok = False
                args, kwargs, one_exception = err_or_data
                if hasattr(one_exception, 'retry_from_first') and one_exception.retry_from_first:
                    raise one_exception
                error_req.append((args[2], one_exception.message))
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[翻页抓取][并行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[翻页抓取][并行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
        return a_result, all_except, all_ok, one_exception

    def __crawl_list(self, reqParse, browser, req_list):
        """ Crawl paginated requests serially. """
        result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None
        total_count = 0
        success_count = 0
        error_req = []
        for req in req_list:
            gevent.sleep(0)
            # The page-count limit for serial crawling is disabled:
            # if NEED_FLIP_LIMIT:
            #     if total_count >= MAX_FLIP:
            #         break
            total_count += 1
            try:
                res = self.__single_crawl(reqParse, browser, req, page_count=total_count)
                self.__target_append_result(result, res)
                success_count += 1
                all_except = False
            except Exception as e:
                all_ok = False
                one_exception = e
                error_req.append((req, one_exception.message))
                logger.exception(
                    current_log_tag() + '[新框架][页面解析异常][ {0} ]'.format(
                        traceback.format_exc().replace('\n', '\t')))
                # Re-raise exceptions coming from a generator request.
                if isinstance(req, types.GeneratorType):
                    raise e
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[翻页抓取][串行抓取][ 成功 {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[翻页抓取][串行抓取][ 失败页请求 {0} ]'.format(str(error_req)))
        return result, all_except, all_ok, one_exception

    def __single_crawl(self, reqParse, browser, request_template, page_count):
        """ The basic method every single request goes through. """
        # Headers from the request chain can be carried over.
        headers = request_template['req'].get('headers', None)
        use_headers = request_template['req'].get('use_headers', False)
        if headers:
            browser.add_header(headers, use_headers)
        # Default value for res.
        res = defaultdict(list)
        # Initialise the per-request state.
        local_req_count = 0
        reqParse.req_count = 0
        reqParse.is_forbidden = False
        reqParse.req_exception = None
        reqParse.proxy = None
        reqParse.content_length = 0
        self.__cpu_time += time.time() * 1000
        while local_req_count < reqParse.retry_count:
            # Count this attempt.
            local_req_count += 1
            logger.debug(current_log_tag() +
                         '[开始抓取][ {0} ]'.format(request_template['req'].get('url', '')))
            # local_req_count is tracked here so that proxy errors raised during
            # parsing can trigger a re-fetch.
            try:
                resp = reqParse.crawl_data(request_template, browser, self.task.source)
            except parser_except.ParserException as e:
                traceback.print_exc()
                if reqParse.user_exc:
                    # Re-raise errors thrown from user code.
                    raise e
                # Codes 21/22/23, or the developer explicitly asked for a retry.
                if e.code in (parser_except.PROXY_FORBIDDEN,
                              parser_except.PROXY_INVALID,
                              parser_except.REQ_ERROR,
                              parser_except.PROXY_SSL) or e.need_retry:
                    reqParse.is_forbidden = True
                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() + traceback.format_exc())
                        logger.debug(current_log_tag() +
                                     '[准备重试][错误由框架抛出][错误码:{0}][count:{1}]'.format(
                                         e.code, reqParse.req_count))
                        continue
                else:
                    raise e
            except Exception as e:
                if reqParse.user_exc:
                    # Re-raise errors thrown from user code.
                    raise e
                if local_req_count >= reqParse.retry_count:
                    raise e
                else:
                    continue

            # Attach the response to the request template.
            request_template['resp'] = resp
            # Log / persist the fetch result.
            self.response_callback(request_template, resp)
            if reqParse.res_text == 'text':
                res = resp.text
            else:
                res = resp.content
            try:
                logger.debug(current_log_tag() +
                             '[抓取结果][ {2} ][ {0} ... ... {1} ]'.format(
                                 res[:100], res[-100:],
                                 request_template['req']['url']).replace('\n', '').replace('\t', ''))
            except Exception:
                pass

            # Skip the upload when running locally.
            # import pdb; pdb.set_trace()
            if not self.debug and self.env != "local":
                md5_key = get_md5(res)
                verify_task_info = {
                    'func_name': reqParse.request_func.__name__,
                    'page_index': page_count,
                    'retry_count': local_req_count - 1,
                    'md5_key': md5_key
                }
                # Queue the fetched page for upload to UCloud.
                self.task_post_process_queue.put((res, self.task, md5_key))
                self.verify_data['data'].append(verify_task_info)

            point_time = time.time() * 1000
            try:
                convert_data = reqParse.convert(request_template, res)
            except Exception:
                if local_req_count >= reqParse.retry_count:
                    logger.debug(current_log_tag() + traceback.format_exc())
                    raise parser_except.ParserException(
                        parser_except.DATA_FORMAT_ERROR,
                        '[traceback: {0}]'.format(traceback.format_exc()))
                else:
                    continue
            finally:
                self.__cpu_time += time.time() * 1000 - point_time

            # Parsing.
            point_time = time.time() * 1000
            try:
                res = reqParse.parse(request_template, self.__targets_parser_func_dict,
                                     convert_data, page_count,
                                     self._crawl_targets_required)
                break
            except parser_except.ParserException as e:
                if e.code in (parser_except.PROXY_FORBIDDEN, parser_except.PROXY_INVALID):
                    reqParse.is_forbidden = True
                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() +
                                     '[准备重试][错误由爬虫抛出][错误码:{0}]'.format(e.code))
                        convert_data = None
                        continue
                else:
                    raise e
            except Exception:
                raise parser_except.ParserException(
                    parser_except.PARSE_ERROR,
                    '[traceback:{0}]'.format(traceback.format_exc()))
            finally:
                self.__cpu_time += time.time() * 1000 - point_time

        self.response_callback(request_template, resp)

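# A minimal subclass sketch (assumption): ExampleHotelSpider, the URL and the row
# layout are all made up, and the request-parser construction is only hinted at,
# because these excerpts never show how the chain objects are built. It illustrates
# the contract implied above: targets maps each target name to a parse_<name> method,
# targets_request returns the request chain, and each parser returns the rows for
# its target.
class ExampleHotelSpider(Spider):
    source_type = 'exampleHotel'
    targets = {'hotel': {'version': 'InsertHotel'}}

    def hotel_request(self):
        # A request template of the shape consumed by __single_crawl / convert above.
        return {
            'req': {'url': 'http://example.com/hotel/123', 'method': 'get'},
            'data': {'content_type': 'html'},
        }

    def targets_request(self):
        # Must return the chain of request-parser objects; their exact class is not
        # part of these excerpts, so build_req_parse is a hypothetical factory.
        return [build_req_parse(self.hotel_request, binding='hotel')]

    def parse_hotel(self, request_template, converted_data):
        # converted_data is an lxml tree because content_type is 'html'.
        names = converted_data.xpath('//h1/text()')
        return [(u''.join(names),)]
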