def crawl(self):
    """
    Entry point for starting the spider from outside.
    Crawling only begins when this method is called.
    :return: the final error code of the crawl
    """
    # todo
    self.__create_browser()
    cur_id = str(uuid.uuid1())
    if hasattr(self.task, 'new_task_id'):
        cur_id = self.task.new_task_id
    self.spider_taskinfo = {'task_id': cur_id}
    for k, v in self.task.__dict__.items():
        self.spider_taskinfo[k] = v
        try:
            logger.info(current_log_tag() + '[task info][%s][%s]' % (k, json.dumps(v)))
        except Exception:
            continue
    chains = self.targets_request()
    try:
        self.code = self.__crawl_by_chain(chains)
    except parser_except.ParserException as e:
        logger.exception(e)
        self.code = e.code
        self.exception = e.msg
        if e.retry_from_first:
            raise e
    # Derive the final error code from all collected results.
    self.check_all_result()
    return self.code
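# --- Usage sketch (illustrative only) ---
# A minimal sketch of driving a spider through crawl(), assuming a concrete
# subclass `DemoSpider` and a prepared `task` object; both names are
# hypothetical and not part of this module.
#
#   spider = DemoSpider()
#   spider.task = task
#   code = spider.crawl()   # 0 means success, anything else is an error code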
def wrapper(*args, **kwargs):
    parser, task = args
    deadline = task.deadline
    try:
        max_try = parser.retry_info.get('max_try', 1)
        retry_codes = parser.retry_info.get('retry_codes', [])
    except Exception:
        max_try = 1
        retry_codes = []
    # Codes 22/23/24/36/37 are always retryable; the effective limit is the
    # deadline, with 100 attempts as a hard ceiling.
    retry_codes.extend([22, 23, 24, 36, 37])
    max_try = 100
    begin = time.time()
    for i in range(max_try):
        # Retries still available after this attempt
        remaining_times = max_try - i
        logger.info('retry_by_deadline start')
        parser = func(parser, task)
        # Retry only if: the code is retryable AND runtime is still within the
        # deadline AND at least one more attempt remains.
        if parser.code in retry_codes and (time.time() - begin < deadline) and remaining_times > 1:
            # On retry, keep the same task_id and start-crawl time, and carry
            # over the first attempt's results.
            parser = update_parser_from_older(parser, task, True)
            continue
        else:
            break
    # Task-level logging for the spider
    if parser.code == 0:
        parser.error_code_logger.exception = ''
    logger.info('retry_by_deadline end')
    return parser
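# --- Decorator sketch (illustrative only) ---
# `wrapper` reads like the inner function of a retry decorator; a minimal
# sketch of the assumed enclosing definition, where `func` is the wrapped
# crawl call (the decorator and spider_crawl wiring are assumptions based on
# the 'retry_by_deadline' log lines and the spider_crawl call site below):
#
#   def retry_by_deadline(func):
#       def wrapper(*args, **kwargs):
#           ...  # body above
#       return wrapper
#
#   @retry_by_deadline
#   def spider_crawl(parser, task):
#       parser.crawl()
#       return parser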
def use_recode_api(**kwargs):
    try:
        logger.info("[inserting look-to-book record][mongo][{0}]".format(int(time.time() * 1000)))
        recode_api = NewCheckBook(**kwargs)
        if recode_api.init_code == 0:
            recode_api.mongo_insert()
    except Exception as e:
        logger.info("[look-to-book record insert failed][mongo][{0}]".format(str(e)))
def __init__(self, **kwargs):
    """
    source: string; spider request source, "" when absent
    t: string; actual API request time, timestamp formatted as "2018-04-10T17:04:44+08:00"
    radio_check: int; look-to-book record type. Mapping: 1 search, 2 realtime,
        3 create_order, 4 cancel_order, 5 pay, 6 cancel_confirm, 7 order_detail
    type: string; api_name, identifier of the third-party interface
    error_id: int; error code judged by the business layer
    msg: string; the peer's raw business response and status code
    http_code: int; HTTP code from the request layer
    req: string; raw request, JSON-formatted string
    content: string; the spider task's content
    resp: string; API response body, recorded only during the pay phase
    qid: string; spider qid, "" when absent
    ptid: string; enterprise ID that issued the task
    task_info: string; spider task info, "" when absent
    key: string; log collection keyword, "MJ_MD_SP_API_LOG"
    is_success: int; whether the request succeeded (success means the peer
        returned data normally; it is 1 only when the request data is wrong).
        0 means a normal response, 1 means a bad request
    :param kwargs:
    """
    try:
        self.t = datetime.now().strftime('%Y-%m-%dT%H:%M:%S+08:00')
        self.radio_check = kwargs.get('record_tuple', 1)
        self.type = kwargs.get('api_name', '')
        self.error_id = kwargs.get('error_id', '')
        self.task = kwargs.get("task")
        self.unionkey = kwargs.get("unionkey")
        self.source = de_module[self.unionkey]
        self.api_info = kwargs.get('api_info')
        self.msg = kwargs.get('msg', '')
        self.http_code = kwargs.get('httpcode', '')
        self.req = self.api_info
        try:
            self.content = self.task.content
        except Exception:
            self.content = ''
        self.resp = kwargs.get('resp', '')
        if hasattr(self.task, 'req_qid'):
            self.qid = getattr(self.task, 'req_qid')
        else:
            self.qid = self.t
        try:
            self.ptid = json.loads(self.task.ticket_info['auth'])['acc_mj_uid']
        except Exception:
            self.ptid = 'error_001'
        try:
            self.task_info = self.task.ticket_info
        except Exception:
            self.task_info = ''
        self.key = 'MJ_MD_SP_API_LOG'
        self.is_success = kwargs.get('is_success', '')
        self.init_code = 0
    except Exception as e:
        self.init_code = 1
        logger.info("[failed to init look-to-book data][mongo][{0}]".format(str(e)))
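# --- Example call (illustrative only) ---
# A minimal sketch of recording one look-to-book entry via use_recode_api;
# the `task` object and the unionkey value are placeholders (unionkey must be
# a valid key of de_module).
#
#   use_recode_api(
#       task=task,                 # carries content / req_qid / ticket_info
#       unionkey='some_union_key',
#       api_name='search',
#       record_tuple=1,            # 1 = search (see radio_check mapping above)
#       error_id=0,
#       msg='OK',
#       httpcode=200,
#       api_info='{"raw": "request"}',
#       resp='',
#       is_success=0,
#   )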
def run(self):
    while True:
        spider_task = g_task_queue.get(block=True)
        logger.info('coroutine pool size: {0}, idle: {1}'.format(g_co_pool.size, g_co_pool.free_count()))
        if g_co_pool.free_count() < 2:
            # Pool is nearly exhausted: reject the task so the caller can re-dispatch it.
            msg = "tasks piling up in coroutine pool: {} idle: {} queued: {}".format(
                g_co_pool.size, g_co_pool.free_count(), g_task_queue.qsize())
            logger.debug(msg)
            callback.CallbackResult(task=spider_task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=spider_task.req_qid, type="ex1002", msg="spider queue is full"))
        else:
            g_co_pool.spawn(doTask, spider_task)
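# --- Setup sketch (illustrative only) ---
# A minimal sketch of how g_task_queue and g_co_pool are presumably built on
# top of gevent (spawn/free_count/qsize match gevent's Pool and Queue APIs);
# the sizes are placeholders, not values from this codebase.
#
#   import gevent.pool
#   import gevent.queue
#
#   g_task_queue = gevent.queue.Queue(maxsize=1000)
#   g_co_pool = gevent.pool.Pool(size=100)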
def update_parser_from_older(_old_parser, task, need_result=False):
    # `parser` aliases `_old_parser`, so the field copies below are no-ops
    # kept to document intent: task_id, start-crawl time and (optionally) the
    # first attempt's results survive the retry. Only the retry counter
    # actually changes.
    parser = _old_parser
    parser.task_id = _old_parser.task_id
    parser.start_crawl_time = _old_parser.start_crawl_time
    if need_result:
        parser._result = _old_parser._result
        parser._asy_temp_result = _old_parser._asy_temp_result
    parser.error_code_logger = _old_parser.error_code_logger
    parser.error_code_logger.retry_times += 1
    logger.info('update_parser success')
    return parser
def random_useragent():
    """Pick a random user agent from a reasonably modern browser family."""
    us_ag = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
    ua = UserAgent()
    while True:
        sa = ua.random
        user_agent = parse(sa)
        # user_agent.browser.version is a tuple such as (63, 0, 3239);
        # compare on the major version only.
        major = user_agent.browser.version[0] if user_agent.browser.version else 0
        if user_agent.browser.family == 'IE' and major > 8:
            us_ag = sa
            break
        elif user_agent.browser.family == 'Firefox' and major > 13:
            us_ag = sa
            break
        elif user_agent.browser.family == 'Chrome' and major > 20:
            us_ag = sa
            break
    logger.info('user_agent:%s' % us_ag)
    return us_ag
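# --- Usage sketch (illustrative only) ---
# A minimal sketch of plugging random_useragent() into an outgoing request;
# the `requests` usage and URL are assumptions, not taken from this codebase.
#
#   import requests
#
#   headers = {'User-Agent': random_useragent()}
#   resp = requests.get('http://example.com', headers=headers, timeout=10)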
def load(self):
    logger.debug('======= initializing spiders ======')
    spider_list = {}
    source_module_names = find_module_names('spider')
    for source in source_module_names:
        logger.debug("found source: %s", source)
        spider_package = 'spider.' + source
        spider_module_names = find_module_names(spider_package)
        for spider_module in spider_module_names:
            try:
                logger.info("found module: %s", spider_module)
                if spider_module.endswith('_spider'):
                    desc = init_spider(spider_package + '.' + spider_module)
                    if desc:
                        desc[0]['source_key'] = source
                        spider_list[desc[0]['source_type']] = desc[0]
            except Exception:
                logger.info("exception while locating and loading [ module ]: {0}, [ {1} ]".format(
                    spider_module, traceback.format_exc()))
    self.__spider_list = spider_list
    print('spiders: ', self.__spider_list)
    logger.info('======= spider init complete ======')
def do_worker(task_info_list):
    """
    1. Receive the web request.
    2. Check and parse the task.
    3. One request may carry several tasks; add them to the worker one by one.
    4. On arrival, check the task queue length; if exceeded, return an error
       in the synchronous callback so the caller re-dispatches the task
       (in case load balancing falls short).
    """
    bottle_r_time_0 = time.time()
    task_num = len(task_info_list)
    req_num = g_task_queue.qsize() + task_num
    bottle_r_time_1 = time.time() - bottle_r_time_0
    for task in task_info_list:
        try:
            # Non-blocking put raises when the bounded queue is at capacity,
            # which makes the queue-full branch below reachable.
            g_task_queue.put(task, block=False)
        except Exception:
            # Task queue is full
            logger.debug(traceback.format_exc())
            callback.CallbackResult(task=task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=task.req_qid, type="ex1002", msg="spider queue is full"))
    bottle_r_time_2 = time.time() - bottle_r_time_0
    logger.info("bottle_run_time: task parsing: {}s, total: {}s".format(bottle_r_time_1, bottle_r_time_2))
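# --- Route sketch (illustrative only) ---
# The bottle_r_time_* names suggest this sits behind a Bottle route; a minimal
# sketch of the assumed wiring (the route path is a placeholder, and feeding
# ParseTask's output into do_worker is an assumption):
#
#   from bottle import request, route
#
#   @route('/task', method='POST')
#   def accept_tasks():
#       do_worker(ParseTask(request))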
def mongo_insert(self):
    """
    No log output yet; collected by ops. Look-to-book records are only
    aggregated into mongo.
    :return:
    """
    client = None  # so the finally block is safe if the connection fails
    try:
        client = MongoClient("mongodb://*****:*****@10.19.56.168:27017")
        db = client['new_api_spider_check']
        collection_api = db[self.source]
        insert_data = {
            'source': self.source,
            't': self.t,
            'radio_check': self.radio_check,
            'type': self.type,
            'error_id': int(self.error_id),
            'msg': str(self.msg),
            'http_code': self.http_code,
            'req': self.req,
            'content': self.content,
            'resp': str(self.resp),
            'qid': self.qid,
            'ptid': self.ptid,
            'task_info': self.task_info,
            'key': self.key,
            'is_success': self.is_success
        }
        collection_api.insert_one(insert_data)
        logger.info("[look-to-book record inserted][mongo][{0}]".format(self.qid))
        # logger_info is not set in __init__, so guard the lookup.
        logger.info("\n" + getattr(self, 'logger_info', ''))
        # logger.debug('\n' + NewCheckBook.logger_info)
    except Exception as e:
        logger.info("[look-to-book record insert failed][mongo][{0}]".format(str(e)))
    finally:
        try:
            if client:
                client.close()
        except Exception as mme:
            logger.info("[look-to-book mongo connection error][mongo][{0}]".format(str(mme)))
    try:
        spider = spider_crawl(spider, task)  # run the spider; may retry from the very first step
    except ParserException as e:
        error_info = e.msg
        error = e.code
        logger.exception('new-framework spider raised: task:{0}, error:{1}, msg: {2}'.format(task, error, error_info))
    except Exception as e:
        logger.exception("new-framework spider raised: task_data:{0} error:{1}".format(task, e))
        error = SLAVE_ERROR
    spider.last_time = int((time.time() - crawl_time) * 1000)
    check_all_result(spider)  # final check over all returned data
    spider.spider_frame_status = 1
    # Run the callback; an END result triggers the synchronous callback.
    callback.CallbackResult(task=task, error_code=spider.code, spider=spider, result_type="END")
    error_logger(spider)  # write the error log
    code = spider.code
    logger.info("[spider feedback code: {0}][source: {1}] task: {2}".format(code, task.source, task))


def error_logger(spider):
    if hasattr(spider.task, 'new_task_id'):
        cur_id = spider.task.new_task_id
    else:
        cur_id = str(uuid.uuid1())
    task_id = cur_id
    if hasattr(spider, "succeed_pages"):
        spider.error_code_logger.succeed_pages = spider.succeed_pages
    elif hasattr(spider, "success_count"):
        spider.error_code_logger.succeed_pages = spider.success_count
    if hasattr(spider, "total_crawl_pages"):
        spider.error_code_logger.total_crawl_pages = spider.total_crawl_pages
def ParseTask(req):
    """Receive and parse a task request; return a list of Task objects."""
    result = list()
    params = req.params
    client_ip = req.remote_addr
    req_tasks = json.loads(urllib.unquote(params.get('req')))
    req_qid = params.get('qid')
    req_uid = params.get('uid')
    req_tid = params.get('tid', '')
    req_ori_type = params.get('ori_type', '')
    for req_task in req_tasks:
        try:
            task = Task()
            # Whether this is a realtime verification request
            task.req_qid = req_qid
            task.req_uid = req_uid
            task.order_no = req_task.get('order_no', "")
            task.source = req_task.get('source')
            task.content = req_task.get('content')
            task.deadline = req_task.get('deadline', 0)
            task.debug = req_task.get('debug', False)
            task.tid = req_tid
            task.client_ip = client_ip
            task.ori_type = req_ori_type
            # task.proxy_info = proxy_info
            task.ticket_info = req_task.get('ticket_info')
            # todo verification info
            task.verify = req_task.get('verify', {'type': 'pre', 'set_type': 'E'})
            task.req_md5 = task.ticket_info.get('md5', 'default_md5')
            task.master_info = req_task.get('master_info', 'default_host')
            task.host = task.master_info.get('master_addr')
            task.redis_host = task.master_info.get('redis_addr').split(':')[0]
            task.redis_port = task.master_info.get('redis_addr').split(':')[-1]
            task.redis_db = task.master_info.get('redis_db')
            task.redis_passwd = task.master_info.get('redis_passwd')
            task.req_qid_md5 = task.req_qid + '-' + task.req_md5
            task.other_info = req_task.get('other_info', {})
            callback_type = 'scv100'
            if 'callback_type' in task.other_info:
                callback_type = task.other_info['callback_type']
            task.callback_type = callback_type
            # redis_key used to arrive as several keys; now only one is sent,
            # but the list format is kept.
            redis_key_list = task.other_info.get('redis_key', [])
            for each in redis_key_list:
                task.redis_key = each
                task.other_info['redis_key'] = each
            logger.info('s[{0}] id[{1}] new verify task: {2}'.format(
                task.source, task.new_task_id, task))
            result.append(task)
        except Exception:
            continue
    return result
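# --- Payload sketch (illustrative only) ---
# A minimal sketch of the decoded `req` parameter ParseTask expects, built
# from the fields the code reads above; every value is a placeholder.
#
#   req_tasks = [{
#       'order_no': '123456',
#       'source': 'demo_source',
#       'content': '...',
#       'deadline': 60,
#       'debug': False,
#       'ticket_info': {'md5': 'default_md5', 'auth': '{"acc_mj_uid": "uid"}'},
#       'verify': {'type': 'pre', 'set_type': 'E'},
#       'master_info': {
#           'master_addr': '10.0.0.1:8080',
#           'redis_addr': '10.0.0.2:6379',
#           'redis_db': 0,
#           'redis_passwd': '***',
#       },
#       'other_info': {'callback_type': 'scv100', 'redis_key': ['key1']},
#   }]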