def insert_rabbitmq(args, queue_list, routing_key):
    logger.debug('[rabbitmq insert start]')
    try:
        credentials = pika.PlainCredentials(username=USER, password=PASSWD)
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=HOST,
                                      virtual_host='TrafficDataPush',
                                      credentials=credentials))
        channel = connection.channel()
        channel.exchange_declare(
            exchange='TrafficDataPush',
            # exchange_type='fanout',
            durable=True,
            auto_delete=False)
        # This block would redeclare the exchange and define the queues.
        # for q in queue_list:
        #     channel.queue_declare(queue=q, durable=True)
        #     channel.queue_bind(queue=q, exchange='TrafficDataPush', routing_key=routing_key)
        msg = json.dumps(args, ensure_ascii=False)
        res = channel.basic_publish(
            exchange='TrafficDataPush',
            routing_key=routing_key,
            body=msg,
            # delivery_mode=2 marks the message persistent
            properties=pika.BasicProperties(delivery_mode=2))
        connection.close()
        if not res:
            raise Exception('RabbitMQ Result False')
        logger.debug('[rabbitmq insert done]')
    except Exception as exc:
        raise exc
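# A minimal usage sketch for insert_rabbitmq, assuming the module-level
# HOST/USER/PASSWD constants point at a reachable broker and that a queue
# is already bound to the 'TrafficDataPush' exchange under the routing key.
# The payload, queue name, and routing key below are hypothetical:
if __name__ == '__main__':
    payload = {'qid': 'q-001', 'source': 'demo', 'data': [1, 2, 3]}
    insert_rabbitmq(payload, queue_list=['traffic_push_q'],
                    routing_key='traffic.push')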
def doPostProcessTask(res, task, key):
    logger.debug(current_log_tag() + '[verify page upload start]')
    handler = multipartuploadufile.MultipartUploadUFile(
        g_config.ucloud_public_key, g_config.ucloud_private_key)
    stream = BytesIO(zlib.compress(res))
    ret, resp = handler.uploadstream(g_config.ucloud_bucket, key, stream)
    if resp.status_code == 200:
        logger.debug(current_log_tag() + '[verify page upload done] md5:{0}'.format(key))
        return True
    # on failure, resume the upload up to 2 more times
    retry_times = 2
    while resp.status_code != 200 and retry_times:
        retry_times -= 1
        ret, resp = handler.resumeuploadstream()
        if resp.status_code == 200:
            logger.debug(current_log_tag() + '[verify page upload done] md5:{0}'.format(key))
            return True
    # all retries exhausted: record the failure
    except_logger = ExceptionLogger()
    except_logger.qid = task.req_qid
    except_logger.type = "PUSH_MD5_ERROR"
    except_logger.debug = json.dumps({"task_id": task.new_task_id, "source": task.source})
    logger.debug("\n" + except_logger.logger_info)
    logger.debug(current_log_tag() + '[verify page upload failed] md5:{0}'.format(key))
    return False
def spider_crawl(spider, task):
    """
    Retry the whole crawl from the beginning.
    :param spider:
    :param task:
    :return:
    """
    retry_count = 0
    need_crawl = True
    while need_crawl:
        retry_count += 1
        try:
            spider.error_code_logger = ErrorCodeLogger()
            spider.crawl()
            return spider
        except ParserException as e:
            need_crawl = retry_count < e.retry_from_first_count
            if not need_crawl:
                raise e
            else:
                # on retry, keep the same task_id and crawl start time
                spider = update_parser_from_older(spider, task)
                spider.error_code_logger.retry_times += 1
                logger.debug('retry from first - {0}/{1}'.format(
                    retry_count, e.retry_from_first_count))
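# For context, a minimal sketch of the exception contract spider_crawl
# relies on. The real class lives in parser_except; the attribute names
# (code, retry_from_first_count) are taken from the calls above, the rest
# of the shape is an assumption for illustration only:
class _SketchParserException(Exception):
    def __init__(self, code, msg, retry_from_first_count=3):
        super(_SketchParserException, self).__init__(msg)
        self.code = code
        self.msg = msg
        # how many times the framework may restart the crawl from scratch
        self.retry_from_first_count = retry_from_first_count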
def set_proxy(self, p, https=False):
    self.proxy = p
    proxy_type = 'NULL'
    if p is not None and p != "REALTIME":
        # socks proxies are forwarded through an internal socks service,
        # so they are recognised by a leading "10."
        if "PROXY_API" in p:
            proxy_type = "API"
            self.br.proxies = p["PROXY_API"]
        elif "PROXY_GOOGLE_MAPS" in p:
            proxy_type = "GOOGLE_MAPS"
            self.br.proxies = p["PROXY_GOOGLE_MAPS"]
        elif p.startswith('10.'):
            # if p.split(':')[0] in SOCKS_PROXY:
            proxy_type = 'socks'
            self.br.proxies = {
                'http': 'socks5://' + p,
                'https': 'socks5://' + p
            }
            try:
                # self.real_ip = get_real_id(self.br.proxies)
                self.real_ip = p
            except Exception:
                pass
        else:
            self.real_ip = p.split(':')[0]
            proxy_type = 'http'
            self.br.proxies = {
                'https': 'http://' + p,
                'http': 'http://' + p,
            }
    logger.debug('[framework set proxy][proxy type: %s][proxy ip: %s ]' % (proxy_type, p))
def add_stop_by_info(versions, result, task):
    # locate the column that holds the stop-by info
    stop_by_index = TICKET_INFO_INDEX.get(versions, None)
    if stop_by_index is None:
        logger.debug('[stop_by_index not found][versions: {0}]'.format(versions))
        # try:
        #     from common import db
        #     sql = 'REPLACE INTO new_frame_not_replace_stop_by (ip, versions) VALUES (%s, %s)'
        #     db.execute_into_spider_db(sql, (get_local_ip(), versions))
        # except Exception as e:
        #     logger.warning('[failed to record missing stop_by][ERROR: {0}]'.format(e))
        return
    logger.debug(current_log_tag() +
                 '[patch stop_by_info][versions: {0}][index {1}]'.format(
                     versions, stop_by_index))
    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])
            # seat classes: E economy, P premium economy, B business, F first
            task_stop_by_info = task.ticket_info.get('v_seat_type', None) or 'E'
            if versions == 'InsertMultiFlight':
                result[__i][stop_by_index] = '{0}&NULL'.format(task_stop_by_info)
            else:
                result[__i][stop_by_index] = task_stop_by_info
            result[__i] = tuple(result[__i])
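# A minimal sketch of what add_stop_by_info does to one result row,
# assuming a hypothetical TICKET_INFO_INDEX entry that maps version
# 'InsertFlight' to column 2 and a fake task asking for business class:
if __name__ == '__main__':
    TICKET_INFO_INDEX = {'InsertFlight': 2}

    class _FakeTask(object):
        ticket_info = {'v_seat_type': 'B'}

    rows = [('CA123', '2019-01-01', 'E')]
    add_stop_by_info('InsertFlight', rows, _FakeTask())
    # the seat column is rewritten in place:
    print rows  # [('CA123', '2019-01-01', 'B')]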
def load(self):
    logger.debug('======= init Spider ======')
    spider_list = {}
    source_module_names = find_module_names('spider')
    for source in source_module_names:
        logger.debug("found source: %s", source)
        spider_package = 'spider.' + source
        spider_module_names = find_module_names(spider_package)
        for spider_module in spider_module_names:
            try:
                logger.info("found module: %s", spider_module)
                if spider_module.endswith('_spider'):
                    desc = init_spider(spider_package + '.' + spider_module)
                    if desc:
                        desc[0]['source_key'] = source
                        spider_list[desc[0]['source_type']] = desc[0]
            except Exception:
                logger.info("exception while locating and loading [ module ]: {0}, [ {1} ]".format(
                    spider_module, traceback.format_exc()))
    self.__spider_list = spider_list
    print('spiders: ', self.__spider_list)
    logger.info('======= spider init complete ======')
def result(self):
    try:
        for k, v in self._result.items():
            logger.debug(
                current_log_tag() +
                '[crawl result][key: {0}][value_len: {1}]'.format(k, len(v)))
    except Exception:
        pass
    return self._result
def write_message(max_try):
    """
    :param max_try:
    :return:
    """
    try:
        max_try -= 1
        msg = json.dumps({
            'qid': task.req_qid,
            'type': task.callback_type,
            'uid': task.req_uid,
            'query': json.dumps(query),
            'status': spider_status
        })
        credentials = pika.PlainCredentials(
            username=task.master_info['spider_mq_user'],
            password=task.master_info['spider_mq_passwd'])
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(
                host=task.master_info['spider_mq_host'],
                virtual_host=task.master_info['spider_mq_vhost'],
                credentials=credentials,
                # heartbeat_interval=0
            ))
        channel = connection.channel()
        res = channel.basic_publish(
            exchange=task.master_info['spider_mq_exchange'],
            routing_key=task.master_info['spider_mq_routerKey'],
            properties=pika.BasicProperties(delivery_mode=2),
            body=msg,
        )
        connection.process_data_events()
        connection.close()
        if not res:
            warn_msg = 'RabbitMQ Result False: {0}'.format(msg)
            info = warn(str(task.req_qid), 'ex_RabbitMQ', get_local_ip(), warn_msg)
            logger.debug("\n" + info)
            raise Exception('RabbitMQ Result False')
        logger.debug(
            '[callback a verifytask done] qid:{}, source: {}, task_info: {}, status: {}'
            .format(str(task.req_qid), str(task.source), task.content, spider_status))
        return max_try
    except Exception as exc:
        if max_try > 0:
            # recurse until the retry budget is exhausted
            return write_message(max_try)
        else:
            warn_msg = 'RabbitMQ Result False qid : {}, e_info: {}, msg: {}'.format(
                task.req_qid, traceback.format_exc(), msg)
            info = warn(task.req_qid, 'ex_SpiderMQ', get_local_ip(), warn_msg)
            logger.exception("\n" + info)
            return max_try
def crawl_data(self, request_template, browser, source_name):
    """
    Page-crawl entry point.
    :param request_template: request dict
    :param browser: crawling browser
    :param source_name: source name
    :return: the crawl result as a response object
    """
    try:
        logger.debug(current_log_tag() + 'crawl %s, retry_count: %s',
                     self.__request_func.__name__, self.req_count)
        # attach the proxy
        self.browser_set_proxy(browser, source_name)
        resp, self.content_length = self.__crawl_data_str(request_template, browser)
        # todo adjust the result returned by user_retry
        if self.user_retry:
            try:
                user_check = self.spider.user_retry_err_or_resp(
                    resp, self.req_count, request_template, False)
            except Exception as e:
                self.user_exc = True
                raise e
            # the user returned True: the response is acceptable
            if user_check:
                return resp
            else:
                raise parser_except.ParserException(
                    parser_except.PROXY_INVALID, 'proxy error')
        else:
            return resp
    except parser_except.ParserException as e:
        self.is_forbidden = e.code in (parser_except.PROXY_FORBIDDEN,
                                       parser_except.REQ_ERROR)
        self.req_exception = e
    except Exception as e:
        self.req_exception = parser_except.ParserException(
            parser_except.REQ_ERROR, 'req exception:{0}'.format(e))
        # if the user raised a ParserException, keep it for user-driven retry
        if self.user_exc:
            if isinstance(e, parser_except.ParserException):
                self.req_exception = e
    finally:
        if self.req_exception:
            raise self.req_exception
def curl_real_ip(p):
    try:
        time_1 = time.time()
        socks_req = '''curl --socks5 {0} http://httpbin.org/ip'''.format(p)
        socks_IP = os.popen(socks_req).readlines()
        logger.debug('[framework set proxy][socks proxy exit ip: %s ]' % (socks_IP))
        time_2 = time.time()
        socks_time = time_2 - time_1
        logger.debug('[fetch socks proxy exit ip took %s seconds]' % (socks_time))
    except Exception:
        logger.error(traceback.format_exc())
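# A sketch of the same exit-ip check without shelling out to curl, assuming
# the requests library already used elsewhere in this codebase, plus PySocks
# (requests[socks]) for socks5 support; real_ip_via_requests is a
# hypothetical helper name:
def real_ip_via_requests(p, timeout=10):
    import requests
    proxies = {'http': 'socks5://' + p, 'https': 'socks5://' + p}
    time_1 = time.time()
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=timeout)
    origin = resp.json().get('origin')
    logger.debug('[socks proxy exit ip: %s][took %.2f seconds]'
                 % (origin, time.time() - time_1))
    return origin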
def run(self):
    while True:
        spider_task = g_task_queue.get(block=True)
        logger.info('coroutine pool size: {0} free: {1}'.format(
            g_co_pool.size, g_co_pool.free_count()))
        if g_co_pool.free_count() < 2:
            msg = "tasks piling up in coroutine pool: {} free: {} queued: {}".format(
                g_co_pool.size, g_co_pool.free_count(), g_task_queue.qsize())
            print msg
            callback.CallbackResult(task=spider_task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=spider_task.req_qid, type="ex1002",
                                     msg="spider queue is full"))
        else:
            g_co_pool.spawn(doTask, spider_task)
def logging(*args, **kw):
    func_count_dict[fun.__name__] += 1
    begin = datetime.now()
    logger.debug(current_log_tag() + 'function {0} call start'.format(fun.__name__))
    result = fun(*args, **kw)
    end = datetime.now()
    logger.debug(current_log_tag() + 'function {0} call end'.format(fun.__name__))
    # logger.debug(current_log_tag() + ', function,%s, took,%s, running,%s, instances, current,%s, coroutines',
    #              fun.__name__, (end - begin), func_count_dict[fun.__name__], mioji.common.pool.pool.size)
    func_count_dict[fun.__name__] -= 1
    return result
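# logging above is the inner wrapper of a timing decorator; a minimal
# self-contained sketch of the assumed outer shape, with func_count_dict
# as the shared concurrent-call counter (call_logger is a hypothetical
# name for the outer function):
from collections import defaultdict
from datetime import datetime

func_count_dict = defaultdict(int)

def call_logger(fun):
    def logging(*args, **kw):
        func_count_dict[fun.__name__] += 1
        begin = datetime.now()
        result = fun(*args, **kw)
        print '%s took %s' % (fun.__name__, datetime.now() - begin)
        func_count_dict[fun.__name__] -= 1
        return result
    return logging

@call_logger
def add(a, b):
    return a + b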
def __target_append_result(result, new_result):
    """
    Merge data into result.
    :param result: the accumulator
    :param new_result: the data to merge in
    :return: None; the dict is shared and mutated in place.
    """
    for k, v in new_result.items():
        if not v:
            continue
        logger.debug(current_log_tag() + "%s, length=%s, all=%s", k, len(v),
                     len(result.get(k, [])))
        result[k] += v
def __crawl_list(self, reqParse, browser, req_list):
    """ Crawl paginated requests serially. """
    result = defaultdict(list)
    all_except = True
    all_ok = True
    one_exception = None
    total_count = 0
    success_count = 0
    error_req = []
    for req in req_list:
        # the serial flip-page limit is disabled
        # if NEED_FLIP_LIMIT:
        #     if total_count >= MAX_FLIP:
        #         break
        total_count += 1
        try:
            res = self.__single_crawl(reqParse, browser, req, page_count=total_count)
            self.__target_append_result(result, res)
            success_count += 1
            all_except = False
        except Exception as e:
            all_ok = False
            one_exception = e
            error_req.append((req, one_exception.message))
            logger.exception(
                current_log_tag() + '[new framework][page parse exception][ {0} ]'.format(
                    traceback.format_exc().replace('\n', '\t')))
            # re-raise exceptions coming from the generator itself
            if isinstance(req, types.GeneratorType):
                raise e
    if reqParse.binding:
        self.success_count = success_count
        self.all_count = total_count
    logger.debug(
        current_log_tag() +
        '[page crawl][serial][ success {0} / {1} ]'.format(success_count, total_count))
    if error_req:
        logger.debug(current_log_tag() +
                     '[page crawl][serial][ failed requests {0} ]'.format(str(error_req)))
    return result, all_except, all_ok, one_exception
def __async_crawl_list(self, reqParse, browser, req_list):
    """ Crawl paginated requests in parallel on the coroutine pool. """
    a_result = defaultdict(list)
    all_except = True
    all_ok = True
    one_exception = None
    params = []
    total_count = 0
    for req in req_list:
        total_count += 1
        params.append((reqParse, browser, req, total_count))
    result = block_async(pool, self.__single_crawl, params)
    success_count = 0
    error_req = []
    for a_res in result:
        err_or_data, is_data = a_res
        if is_data:
            success_count += 1
            all_except = False
            self.__target_append_result(a_result, err_or_data)
        else:
            all_ok = False
            args, kwargs, one_exception = err_or_data
            if hasattr(one_exception, 'retry_from_first') and one_exception.retry_from_first:
                raise one_exception
            error_req.append((args[2], one_exception.message))
    if reqParse.binding:
        self.success_count = success_count
        self.all_count = total_count
    logger.debug(
        current_log_tag() +
        '[page crawl][parallel][ success {0} / {1} ]'.format(success_count, total_count))
    if error_req:
        logger.debug(current_log_tag() +
                     '[page crawl][parallel][ failed requests {0} ]'.format(str(error_req)))
    return a_result, all_except, all_ok, one_exception
def convert(self, request_template, data):
    data_con = request_template.get('data', {})
    c_type = data_con.get('content_type', 'string')
    logger.debug(current_log_tag() + 'Converter got content_type: %s', c_type)
    if c_type == 'html':
        return HTML.fromstring(data)
    elif c_type == 'json':
        return json.loads(data)
    elif isinstance(c_type, types.MethodType):
        # content_type may also be a bound method acting as a custom converter
        try:
            return c_type(request_template, data)
        except Exception:
            raise parser_except.ParserException(
                -1, 'convert func error {0}, func: {1}'.format(
                    traceback.format_exc(), c_type))
    else:
        return data
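# A minimal sketch of how the request template drives convert; the template
# dicts below are hypothetical, and reqParse stands for an instance of the
# class this method belongs to:
tpl_json = {'data': {'content_type': 'json'}}
price = reqParse.convert(tpl_json, '{"price": 420}')  # -> {'price': 420}

tpl_html = {'data': {'content_type': 'html'}}
tree = reqParse.convert(tpl_html, '<div id="p">420</div>')
print tree.xpath('//div[@id="p"]/text()')  # ['420']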
def new_limit(self, limit, task):
    """
    Queueing service.
    :param limit:
    :param task:
    :return:
    """
    try:
        source = limit['source_name']
        url = 'http://10.19.23.81:8901/sort'
        data = {
            'source': source,
            'state': 'a',
            'timeout': 30,
            'task': {
                'task': task
            }
        }
        logger.debug('new limit req: ' + _json.dumps(data))
        res = requests.post(url=url, data=_json.dumps(data), timeout=(10, 40))
        logger.debug('new limit resp: ' + str(res.content))
    except Exception as e:
        logger.debug('new limit error' + str(e))
        return False
    return res.content != 'False'
def error_logger(spider):
    if hasattr(spider.task, 'new_task_id'):
        cur_id = spider.task.new_task_id
    else:
        cur_id = str(uuid.uuid1())
    if hasattr(spider, "succeed_pages"):
        spider.error_code_logger.succeed_pages = spider.succeed_pages
    elif hasattr(spider, "success_count"):
        spider.error_code_logger.succeed_pages = spider.success_count
    if hasattr(spider, "total_crawl_pages"):
        spider.error_code_logger.total_crawl_pages = spider.total_crawl_pages
    elif hasattr(spider, "all_count"):
        spider.error_code_logger.total_crawl_pages = spider.all_count
    if hasattr(spider.task, "verify"):
        verify_type = spider.task.verify
        if isinstance(verify_type, dict):
            _type = verify_type.get('type', "")
        else:
            verify_type = json.loads(spider.task.verify)
            _type = verify_type.get('type', "")
        spider.error_code_logger.verify_type = _type
    spider.error_code_logger.task_id = cur_id
    spider.error_code_logger.source = spider.task.source
    spider.error_code_logger.tid = spider.task.tid
    spider.error_code_logger.ori_type = spider.task.ori_type
    spider.error_code_logger.task_info = json.dumps(spider.task.ticket_info,
                                                    ensure_ascii=False)
    spider.error_code_logger.content = spider.task.content
    spider.error_code_logger.client_ip = spider.task.client_ip
    spider.error_code_logger.error_code = convert_code(spider.code)
    spider.error_code_logger.detail_code = spider.code
    spider.error_code_logger.qid = spider.task.req_qid
    spider.error_code_logger.MD5 = spider.verify_data["data"]
    spider.error_code_logger.last_time = spider.last_time
    if spider.code != 0:
        spider.error_code_logger.exception = spider.exception
    logger.debug('\n' + spider.error_code_logger.logger_info)
def do_worker(task_info_list):
    '''
    1. Receive the web request.
    2. Validate and parse the tasks.
    3. One request may carry several tasks; enqueue them one by one.
    4. Check the queue length while accepting tasks; on overflow, return an
       error in the synchronous callback so the caller re-sends the task
       (in case load balancing is poor).
    '''
    bottle_r_time_0 = time.time()
    task_num = len(task_info_list)
    req_num = g_task_queue.qsize() + task_num
    bottle_r_time_1 = time.time() - bottle_r_time_0
    for task in task_info_list:
        try:
            g_task_queue.put(task)
        except Exception:
            # the task queue is full
            logger.debug(traceback.format_exc())
            callback.CallbackResult(task=task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=task.req_qid, type="ex1002",
                                     msg="spider queue is full"))
    bottle_r_time_2 = time.time() - bottle_r_time_0
    logger.info("bottle_run_time: parse task: {}s, total: {}s".format(
        bottle_r_time_1, bottle_r_time_2))
def add_index_info(versions, result, page_index):
    # locate the column that holds the guest info
    index_info_index = INDEX_INFO_INDEX.get(versions, None)
    if index_info_index is None:
        return
    logger.debug(current_log_tag() +
                 '[patch index_info][versions: {0}][index {1}]'.format(
                     versions, index_info_index))
    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])
            old_index_info = result[__i][index_info_index]
            try:
                old_index_info = json.loads(old_index_info)
                if not isinstance(old_index_info, dict):
                    raise Exception('Type Is Not Dict')
            except Exception:
                old_index_info = {'unparse_info': old_index_info}
            index_info = {k: v for k, v in old_index_info.items()}
            index_info['page_index'] = page_index
            index_info['item_index'] = __i
            result[__i][index_info_index] = json.dumps(index_info)
            result[__i] = tuple(result[__i])
def __spider_append_result(self, new_result):
    """
    Merge a parse result into self.result.
    :param new_result: must be a parse result
    :return: None; honours the target's 'bind' redirection
    """
    for k, v in new_result.items():
        if not v:
            continue
        data_bind = self.targets[k].get('bind', None)
        if data_bind:
            logger.debug(current_log_tag() +
                         "[ crawl target {0} ][ bound to {1} ]".format(k, data_bind))
            self._result[data_bind] += v
            logger.debug(current_log_tag() + "%s, length=%s, all=%s", k, len(v),
                         len(self._result.get(data_bind, [])))
        else:
            self._result[k] += v
            logger.debug(current_log_tag() + "%s, length=%s, all=%s", k, len(v),
                         len(self._result.get(k, [])))
def parse(self, request_template, targets_bind, converted_data, page_index,
          required=None, multi_last=False):
    result = defaultdict(list)
    parsed = set()
    if not multi_last:
        parser_list = request_template.get('user_handler', [])
        for parser in parser_list:
            if parser not in parsed:
                logger.debug(current_log_tag() + 'user parser %s', parser)
                parser(request_template, converted_data)

    # update result via parse
    def parse_result(parser):
        # only parse targets that are both needed and requested
        parser_name = parser.__name__.split('_', 1)[1]
        if parser_name in required:
            logger.debug(current_log_tag() + 'parse target %s', parser_name)
            per_result = parser(request_template, converted_data)
            if per_result is not None:
                if per_result:
                    start = datetime.datetime.now()
                    if isinstance(per_result, list):
                        # patch in guest_info
                        store_utils.add_index_info(
                            self.spider.targets.get(parser_name, {}).get('version', None),
                            per_result, page_index)
                        # patch in stop-by info
                        store_utils.add_stop_by_info(
                            self.spider.targets.get(parser_name, {}).get('version', None),
                            per_result, self.spider.task)
                        result[parser_name].extend(per_result)
                    elif isinstance(per_result, dict):
                        result[parser_name].append(per_result)
                    logger.debug(current_log_tag() +
                                 '[result saved][no compression][took: {0} ]'.format(
                                     datetime.datetime.now() - start))

    # parse the targets: hotel, room, etc.
    # for target, parser in targets_bind.items():
    if isinstance(self.binding, Iterable) and not isinstance(self.binding, (str, bytes)):
        for binding in self.binding:
            # accept the various binding kinds
            if binding is None:
                continue
            elif isinstance(binding, (str, bytes)):
                parser = targets_bind.get(binding, '')
                if parser == '':
                    raise TypeError('cannot get parser {0} from targets'.format(binding))
            elif callable(binding):
                parser = binding
            else:
                raise TypeError('unsupported binding type {0} of {1}'.format(
                    type(binding), repr(binding)))
            # update result
            parse_result(parser)
    elif isinstance(self.binding, (str, bytes)):
        parser = targets_bind.get(self.binding, '')
        if parser == '':
            raise TypeError('cannot get parser {0} from targets'.format(self.binding))
        # update result
        parse_result(parser)
    elif callable(self.binding):
        parser = self.binding
        # update result
        parse_result(parser)
    return result
def __single_crawl(self, reqParse, browser, request_template, page_count):
    """ The basic request routine. """
    # headers in a request chain can be carried over
    headers = request_template['req'].get('headers', None)
    use_headers = request_template['req'].get('use_headers', False)
    if headers:
        browser.add_header(headers, use_headers)
    # default value for res
    res = defaultdict(list)
    # initialise the request state
    local_req_count = 0
    reqParse.req_count = 0
    reqParse.is_forbidden = False
    reqParse.req_exception = None
    reqParse.proxy = None
    reqParse.content_length = 0
    self.__cpu_time += time.time() * 1000
    while local_req_count < reqParse.retry_count:
        # one more retry
        local_req_count += 1
        logger.debug(current_log_tag() +
                     '[crawl start][ {0} ]'.format(request_template['req'].get('url', '')))
        # the retry count is tracked here so proxy exceptions raised during
        # parse can trigger a re-crawl
        try:
            resp = reqParse.crawl_data(request_template, browser, self.task.source)
        except parser_except.ParserException as e:
            traceback.print_exc()
            if reqParse.user_exc:
                # re-raise errors the user raised inside their handler
                raise e
            # error codes 21/22/23, or the developer asked for a retry
            if e.code in (parser_except.PROXY_FORBIDDEN,
                          parser_except.PROXY_INVALID,
                          parser_except.REQ_ERROR,
                          parser_except.PROXY_SSL) or e.need_retry:
                reqParse.is_forbidden = True
                if local_req_count >= reqParse.retry_count or e.retry_from_first:
                    raise e
                else:
                    logger.debug(current_log_tag() + traceback.format_exc())
                    logger.debug(current_log_tag() +
                                 '[retrying][error raised by framework][code:{0}][count:{1}]'.format(
                                     e.code, reqParse.req_count))
                    continue
            else:
                raise e
        except Exception as e:
            if reqParse.user_exc:
                # re-raise errors the user raised inside their handler
                raise e
            if local_req_count >= reqParse.retry_count:
                raise e
            else:
                continue
        # store the resp on the request
        request_template['resp'] = resp
        # log / persist the crawl result
        self.response_callback(request_template, resp)
        if reqParse.res_text == 'text':
            res = resp.text
        else:
            res = resp.content
        try:
            logger.debug(current_log_tag() +
                         '[crawl result][ {2} ][ {0} ... ... {1} ]'.format(
                             res[:100], res[-100:],
                             request_template['req']['url']).replace('\n', '').replace('\t', ''))
        except Exception:
            pass
        # skip the upload when running locally
        if not self.debug and self.env != "local":
            md5_key = get_md5(res)
            verify_task_info = {
                'func_name': reqParse.request_func.__name__,
                'page_index': page_count,
                'retry_count': local_req_count - 1,
                'md5_key': md5_key
            }
            # queue the crawled page for upload to ucloud
            self.task_post_process_queue.put((res, self.task, md5_key))
            self.verify_data['data'].append(verify_task_info)
        point_time = time.time() * 1000
        try:
            convert_data = reqParse.convert(request_template, res)
        except Exception:
            if local_req_count >= reqParse.retry_count:
                logger.debug(current_log_tag() + traceback.format_exc())
                raise parser_except.ParserException(
                    parser_except.DATA_FORMAT_ERROR,
                    '[traceback: {0}]'.format(traceback.format_exc()))
            else:
                continue
        finally:
            self.__cpu_time += time.time() * 1000 - point_time
        # the parse step
        point_time = time.time() * 1000
        try:
            res = reqParse.parse(request_template, self.__targets_parser_func_dict,
                                 convert_data, page_count,
                                 self._crawl_targets_required)
            break
        except parser_except.ParserException as e:
            if e.code in (parser_except.PROXY_FORBIDDEN, parser_except.PROXY_INVALID):
                reqParse.is_forbidden = True
                if local_req_count >= reqParse.retry_count or e.retry_from_first:
                    raise e
                else:
                    logger.debug(current_log_tag() +
                                 '[retrying][error raised by spider][code:{0}]'.format(e.code))
                    convert_data = None
                    continue
            else:
                raise e
        except Exception:
            raise parser_except.ParserException(
                parser_except.PARSE_ERROR,
                '[traceback:{0}]'.format(traceback.format_exc()))
        finally:
            self.__cpu_time += time.time() * 1000 - point_time
    self.response_callback(request_template, resp)
    have_ticket = False
    for k, v in res.items():
        if not v:
            continue
        self._asy_temp_result[k] += v
        have_ticket = True
    # only a spider invoked by a slave, with tickets in hand, does the async callback
    if have_ticket and self.process_callback and not self.debug and self.env != "local":
        self.process_callback(task=self.task, spider=self, result_type="RUNNING")
    return res
class CallbackWorkload(object):
    def __init__(self):
        self.pool_dict = dict()

    def __str__(self):
        return json.dumps(self.__dict__)

    def CallbackResult(self, spider=None, task=None, error_code=0, result_type="END"):
        """
        spider is optional because error code 11 has no spider to pass in.
        @ task: the task info
        @ result_type: callback state; END is the final state and runs
          synchronously, RUNNING is an intermediate state and runs asynchronously
        @ error_code: the error code to write
        """
        from slave import g_asy_callback_pool
        if result_type == "END":
            self.doCallback(task, error_code, spider, result_type)
        else:
            g_asy_callback_pool.spawn(self.doCallback, task, error_code, spider,
                                      result_type)

    def doCallback(self, task, error_code, spider, result_type):
        """ Perform the callback work. """

        def get_ticket_num():
            ticket_num = 0
            for per_data_type in spider.crawl_targets_required:
                ticket_num += len(spider._asy_temp_result[per_data_type])
            return ticket_num

        def get_result(_result):
            _proxy_or_ticket = []
            for per_data_type in spider.crawl_targets_required:
                _proxy_or_ticket.extend(_result[per_data_type])
            return _proxy_or_ticket

        # for RUNNING callbacks, wait a second and check again
        if result_type == "RUNNING":
            num1 = get_ticket_num()
            time.sleep(1)
            # ticket count after the buffer second
            num2 = get_ticket_num()
            if num1 != num2 or spider.spider_frame_status:
                return
        task.other_info['parser_error'] = int(error_code)
        query = {"other_info": task.other_info}
        result = None
        redis_mq_logger = RedisMQCostLogger()
        extra = {}
        if spider:
            result = spider._asy_temp_result if result_type == 'RUNNING' else spider.result
            result = get_result(result)
            extra = spider.extra
            redis_mq_logger.ticket_num = len(spider._asy_temp_result)
        try:
            redis_mq_logger.qid = task.req_qid
            redis_mq_logger.source = task.source
            redis_mq_logger.task_id = task.new_task_id
            redis_mq_logger.task_info = task.content
            redis_mq_logger.error_code = error_code
            if result_type == 'END':
                redis_mq_logger.is_end = 1
            # write to redis
            redis_cost = self.write_redis_ticket(task, result, error_code, extra)
            if isinstance(redis_cost, tuple):
                redis_mq_logger.conn_redis = redis_cost[0]
                redis_mq_logger.write_redis = redis_cost[1]
            else:
                redis_mq_logger.exception = redis_cost
        except Exception as e:
            logger.exception('not redis con' + str(e))
        # write to mq
        operation_info = self.call_back_toservice(task, query, result_type)
        mq_try, mq_cost = operation_info.get('result', 0), operation_info.get('cost_time', 0)
        redis_mq_logger.mq_cost = mq_cost
        redis_mq_logger.mq_try = mq_try
        logger.debug('redis and mq written:\n' + redis_mq_logger.logger_info)
def ctrip_cn_parser(content, url, other_info):
    hotel = CtripCNHotel()
    try:
        root = html.fromstring(content.decode('utf-8'))
    except Exception as e:
        print traceback.format_exc()
        raise PARSE_ERROR
    try:
        phantomjs = execjs.get('PhantomJS')
        js_str = root.xpath("//script[contains(text(),'hotelDomesticConfig')]/text()")[0]
        page_js = phantomjs.compile(js_str[:js_str.index('function loadCallback()')])
    except Exception as e:
        print traceback.format_exc()
        logger.debug(current_log_tag() + '[failed to extract data from JS]')
    try:
        hotel_name = root.xpath('//h2[@class="cn_n"]/text()')[0]
        temp = re.findall(ur'([\u4e00-\u9fa5\s]*)', hotel_name)
        zh_name_tmep = [t for t in temp if t and t != ' ']
        if len(zh_name_tmep) == 1:
            hotel.hotel_name = zh_name_tmep[0].encode('utf8')
        elif len(zh_name_tmep) > 1:
            temp_ii = hotel_name.find(zh_name_tmep[-1]) + len(zh_name_tmep[-1])
            temp_iii = hotel_name.find(')', temp_ii)
            if temp_iii > -1:
                hotel.hotel_name = hotel_name[:temp_iii + 1].encode('utf8')
            else:
                hotel.hotel_name = hotel_name[:temp_ii + 1].encode('utf8')
        else:
            hotel.hotel_name = ''
        if not zh_name_tmep:
            hotel.hotel_name_en = hotel_name.encode('utf8').strip(')').strip('(').strip(')').strip('(').strip()
        else:
            name_en_temp = hotel_name[hotel_name.find(zh_name_tmep[-1]) + len(zh_name_tmep[-1]) + 1:]
            hotel.hotel_name_en = name_en_temp.encode('utf8').strip(')').strip('(').strip(')').strip('(').strip()
    except Exception as e:
        print traceback.format_exc()
        logger.debug(current_log_tag() + '[failed to parse English name]')
    # try:
    #     hotel_name = root.xpath('//h2[@class="cn_n"]/text()')[0].strip()
    #     hotel.hotel_name = re.search(u'[\u4e00-\u9fa5]+', hotel_name).group()
    # except Exception as e:
    #     print traceback.format_exc()
    #     logger.debug(current_log_tag() + '[failed to parse Chinese name]')
    print "zh name:", hotel.hotel_name
    print "en name:", hotel.hotel_name_en
    try:
        position = page_js.eval('hotelDomesticConfig')['hotel']['position'].split('|')
        hotel.map_info = position[1] + ',' + position[0]
        print "hotel.map_info:", hotel.map_info
    except Exception as e:
        try:
            position_temp = root.xpath('//*[@id="hotelCoordinate"]/@value')[0].encode('utf-8').strip().split('|')
            hotel.map_info = position_temp[0] + ',' + position_temp[1]
        except Exception as e:
            print traceback.format_exc()
            logger.debug(current_log_tag() + '[failed to parse hotel location]')
            hotel.map_info = 'NULL'
def __crawl_data_str(self, request_template, browser):
    resp = None
    try:
        # usage changed: the user mutates request_template directly
        self.spider.prepare_request(request_template)
        # pull req out of request_template
        req = request_template['req']
        # used for qps control
        if hasattr(self.spider, 'queue_info'):
            browser.queue_info = self.spider.queue_info
        if hasattr(self.spider.task, 'req_qid'):
            browser.qid = self.spider.task.req_qid
        else:
            browser.qid = ""
        browser.task_id = self.spider.task.task_id
        browser.source = self.spider.task.source
        browser.tid = self.spider.task.tid
        browser.ori_type = self.spider.task.ori_type
        resp = browser.req(**req)
        # raise on network errors
        resp.raise_for_status()
        content_length = len(resp.content)
        if isinstance(self.need_content_length, int):
            logger.debug(current_log_tag() +
                         '[spider content_length={1} check][page length must exceed {0}]'.format(
                             self.need_content_length, content_length))
            if content_length <= self.need_content_length:
                raise parser_except.ParserException(
                    parser_except.PROXY_INVALID, msg='data is empty')
        elif self.need_content_length is None:
            logger.debug(current_log_tag() + '[spider needs no content_length check]')
        else:
            logger.debug(current_log_tag() +
                         '[unknown content_length check type][type: {0}]'.format(
                             str(type(self.need_content_length))))
        return resp, content_length
    # timeouts
    except requests.exceptions.SSLError as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_SSL, msg=str(e), error=e)
    except requests.exceptions.ProxyError as e:
        # the proxy died
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg='Proxy Error', error=e)
    except requests.exceptions.ConnectTimeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request connect Timeout', error=e)
    except requests.exceptions.ReadTimeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request read Timeout', error=e)
    except requests.exceptions.Timeout as e:
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                            msg='Request Timeout', error=e)
    except requests.exceptions.ConnectionError as err:
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID, msg=str(err))
    except requests.exceptions.HTTPError as err:
        # catches 4xx and 5xx status codes
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=str(err), error=err)
    except requests.exceptions.RequestException as err:
        # the umbrella requests error
        self.spider.response_error(request_template, resp, err)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=str(err), error=err)
    except Exception as e:
        # the last-resort error
        self.spider.response_error(request_template, resp, e)
        raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                            msg=traceback.format_exc())
def req(self, url, method='get', params=None, data=None, json=None,
        timeout=60, verify=False, **kw):
    httpLogger = HttpLogger()
    httpLogger.qid = self.qid
    httpLogger.task_id = self.task_id
    httpLogger.req_type = method
    httpLogger.source = self.source
    httpLogger.url = url
    httpLogger.proxy_out = str(self.out_ip)
    httpLogger.proxy = str(self.proxy)
    httpLogger.proxy_inf = str(self.proxy_inf)
    httpLogger.retry_count = self.req_count
    allowed_keys = [
        'method', 'url', 'params', 'data', 'headers', 'cookies', 'files',
        'auth', 'timeout', 'allow_redirects', 'proxies', 'hooks', 'stream',
        'verify', 'cert', 'json'
    ]
    for k in kw.keys():
        if k not in allowed_keys:
            logger.warning(current_log_tag() +
                           '[unrecognised req request parameter][{0}]'.format(k))
    new_kw = {k: v for k, v in kw.items() if k in allowed_keys}
    ts = int(1000 * time.time())
    if data:
        httpLogger.data = data
        if isinstance(data, dict):
            httpLogger.data = _json.dumps(data, ensure_ascii=False)
    if json:
        httpLogger.data = json
        if isinstance(json, dict):
            httpLogger.data = _json.dumps(json, ensure_ascii=False)
    req_func = self.req_bind.get(method.lower())
    httpLogger.cookie = str(req_func.__self__.cookies._cookies)
    httpLogger.headers = str(new_kw.get('headers', ""))
    try:
        logger.debug(current_log_tag() + 'browser req start {1} {0}'.format(url, method))
        logger.debug(current_log_tag() + 'browser req data {0}'.format(data))
        logger.debug(current_log_tag() + 'browser req json {0}'.format(json))
        logger.debug(current_log_tag() + 'browser req params {0}'.format(params))
        logger.debug(current_log_tag() + 'browser req other_data {0}'.format(new_kw))
        logger.debug(current_log_tag() + 'browser req session_cookie {0}'.format(
            req_func.__self__.cookies._cookies))
    except Exception:
        logger.debug(current_log_tag() + 'failed to fetch some parameters before the request')
    try:
        local_resp = None
        # todo API qps limit
        # try:
        #     logger.debug(current_log_tag() + 'queue and qps config:{0}'.format(str(self.queue_info)))
        #     if not self.queue_info.get('source_name'):
        #         pass
        #     elif self.queue_info['source_name'] in limit_config.keys():
        #         try:
        #             cango = self.new_limit(self.queue_info, self.task_id)
        #         except Exception as why:
        #             logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
        #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit queue timeout & reqError')
        #         if not cango:
        #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit queue timeout')
        # except Exception as why:
        #     logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
        self.resp = local_resp = req_func(url, params=params, data=data, json=json,
                                          timeout=timeout, verify=verify, **new_kw)
        logger.debug(current_log_tag() +
                     'browser response headers:{0}'.format(self.resp.headers))
        ts = int(1000 * time.time()) - ts
        httpLogger.last_time = ts
        logger.debug(current_log_tag() +
                     'browser req end {1} {0} proxy[{4}] ms[{2}] status[{3}] length[{5}]'
                     .format(url, method, ts, local_resp.status_code, self.proxy,
                             resp_content_lenght(local_resp)))
        httpLogger.resp_code = local_resp.status_code
        content = str(local_resp.content)
        if len(content) > 1000:
            content = content[:1000]
        httpLogger.resp_content = content
        httpLogger.proxy_out = str(self.out_ip)
        httpLogger.proxy = str(self.proxy)
    except Exception:
        httpLogger.exception = str(traceback.format_exc())
        logger.debug(current_log_tag() +
                     'browser req end {1} {0} proxy[{2}] error:{3}'.format(
                         url, method, self.proxy, traceback.format_exc()))
        try:
            logger.debug('\n' + httpLogger.logger_info)
        except Exception as why:
            logger.debug(str(why))
        raise
    try:
        logger.debug('\n' + httpLogger.logger_info)
    except Exception as why:
        logger.debug(str(why))
    return local_resp
    try:
        star = root.xpath("//span[@id='ctl00_MainContentPlaceHolder_commonHead_imgStar']")[0]
        print "star:", star.attrib['title'].encode('utf-8')
        hotel.star = int(re.search(r'([\d]+)', star.attrib['title'].encode('utf-8')).group(1))
        print "hotel.star:", hotel.star
    except Exception:
        hotel.star = -1
        logger.debug(current_log_tag() + '[failed to parse hotel star rating]')
    try:
        grade = root.xpath("//span[@class='score']/text()")[0]
        hotel.grade = float(grade)
        print "hotel.grade:", hotel.grade
    except Exception:
        try:
            grade = root.xpath("//span[@class='n']/text()")[0]
            hotel.grade = float(grade)
            print "hotel.grade:", hotel.grade
        except Exception:
            hotel.grade = -1
            logger.debug(current_log_tag() + '[failed to parse hotel score]')