Example #1
    def crawl(self):
        """
        外部启动爬虫的入口方法
        当调用这个方法时才能开始爬虫工作~
        :return:
        """
        # todo
        self.__create_browser()
        cur_id = str(uuid.uuid1())
        if hasattr(self.task, 'new_task_id'):
            cur_id = self.task.new_task_id
        self.spider_taskinfo = {'task_id': cur_id}
        for k, v in self.task.__dict__.items():
            self.spider_taskinfo[k] = v
            try:
                logger.info(current_log_tag() + '[task info][%s][%s]' %
                            (k, json.dumps(v)))
            except Exception:
                continue
        chains = self.targets_request()
        try:
            self.code = self.__crawl_by_chain(chains)
        except parser_except.ParserException as e:
            logger.exception(e)
            self.code = e.code
            self.exception = e.msg
            if e.retry_from_first:
                raise e

        # determine the error code from all returned results
        self.check_all_result()
        return self.code
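
A minimal sketch of how this entry method might be driven, assuming a concrete spider subclass; DemoSpider and the task object are illustrative names, not part of the original code:

# Hypothetical driver code; DemoSpider is an assumed subclass of the spider base class
spider = DemoSpider()
spider.task = task              # task parsed from the incoming request (see Example #12)
code = spider.crawl()           # runs the request chain and returns the final error code
logger.info('crawl finished with code %s' % code)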
Example #2
File: slave.py Project: gitPff/sb-
 def wapper(*args, **kwargs):
     parser, task = args
     deadline = task.deadline
     try:
         max_try = parser.retry_info.get('max_try', 1)
         retry_codes = parser.retry_info.get('retry_codes', [])
     except Exception:
         max_try = 1
         retry_codes = []
     retry_codes.extend([22, 23, 24, 36, 37])
     max_try = 100  # hard override; the deadline check below still stops retries early
     begin = time.time()
     for i in range(max_try):
         # remaining retry attempts
         remaining_times = max_try - i
         logger.info('retry_by_deadline start')
         parser = func(parser, task)
         # retry when: error code is retryable && elapsed time < deadline && attempts remain
         if parser.code in retry_codes and (time.time() - begin < deadline) and remaining_times > 1:
             # retry with the same task_id and start time, keeping the results of the first crawl
             parser = update_parser_from_older(parser, task, True)
             continue
         else:
             break
     # task-level spider logging
     if parser.code == 0:
         parser.error_code_logger.exception = ''
     logger.info('retry_by_deadline end')
     return parser
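
The wapper body above closes over a free variable func, so it presumably sits inside a retry decorator; the log messages suggest the name retry_by_deadline. A minimal sketch of that enclosing decorator and a hypothetical decorated crawl function, both assumptions rather than code taken from the project:

# Hypothetical enclosing decorator; only the inner wapper body is original (Example #2)
def retry_by_deadline(func):
    def wapper(*args, **kwargs):
        pass  # retry loop shown above: calls func(parser, task) on each attempt
    return wapper

# Hypothetical decorated function: one crawl attempt, returning the parser
@retry_by_deadline
def run_spider_once(parser, task):
    parser.crawl()
    return parser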
Example #3
def use_recode_api(**kwargs):
    try:
        logger.info("[inserting check-to-book ratio record][mongo][{0}]".format(int(time.time() * 1000)))
        recode_api = NewCheckBook(**kwargs)
        if recode_api.init_code == 0:
            recode_api.mongo_insert()
    except Exception as e:
        logger.info("[failed to insert check-to-book ratio record][mongo][{0}]".format(str(e)))
Example #4
 def __init__(self, **kwargs):
     """
     source: string 爬虫请求源 没有时""
     t: string 实际api请求时间 时间戳 格式为"2018-04-10T17:04:44+08:00"
     radio_check: int 插定比类型 类型映射:1为search, 2为realtime, 3为create_order, 4为cancel_order, 5为pay, 6为cancel_confirm, 7为order_detail
     type: string api_name第三方接口标识
     error_id: int类型 返回业务层判断错误码
     msg: string类型 对方返回业务原文和状态码
     http_code: int类型 返回请求层http code
     req: string 请求原文 json格式字符串
     content: string 爬虫任务content
     resp: string 只在支付阶段记录API返回报文
     qid: string 爬虫qid 没有时""
     ptid: string 任务所发企业ID
     task_info: string 爬虫任务信息 没有时""
     key: string 收集日志关键字 "MJ_MD_SP_API_LOG"
     is_success: 是否请求成功(这里的成功是指对方是否正常返回数据,只有请求数据错误的时候才返回1) int类型 0代表正常返回 1代表请求有误
     :param kwargs:
     """
     try:
         self.t = datetime.now().strftime('%Y-%m-%dT%H:%M:%S+08:00')
         self.radio_check = kwargs.get('record_tuple', 1)
         self.type = kwargs.get('api_name', '')
         self.error_id = kwargs.get('error_id', '')
         self.task = kwargs.get("task")
         self.unionkey = kwargs.get("unionkey")
         self.source = de_module[self.unionkey]
         self.api_info = kwargs.get('api_info')
         self.msg = kwargs.get('msg', '')
         self.http_code = kwargs.get('httpcode', '')
         self.req = self.api_info
         try:
             self.content = self.task.content
         except Exception:
             self.content = ''
         self.resp = kwargs.get('resp', '')
         if hasattr(self.task, 'req_qid'):
             self.qid = getattr(self.task, 'req_qid')
         else:
             self.qid = self.t
         try:
             self.ptid = json.loads(
                 self.task.ticket_info['auth'])['acc_mj_uid']
         except Exception:
             self.ptid = 'error_001'
         try:
             self.task_info = self.task.ticket_info
         except Exception:
             self.task_info = ''
         self.key = 'MJ_MD_SP_API_LOG'
         self.is_success = kwargs.get('is_success', '')
         self.init_code = 0
     except Exception as e:
         self.init_code = 1
         logger.info("[初始化查订比数据失败][mongo][{0}]".format(str(e)))
Example #5
File: slave.py Project: gitPff/sb-
 def run(self):
     while True:
         spider_task = g_task_queue.get(block=True)
         logger.info('coroutine pool size: {0}, free slots: {1}'.format(g_co_pool.size, g_co_pool.free_count()))
         if g_co_pool.free_count() < 2:
             msg = "tasks piling up in coroutine pool: {} free slots: {} queued tasks: {}".format(g_co_pool.size, g_co_pool.free_count(),
                                                        g_task_queue.qsize())
             print(msg)
             callback.CallbackResult(task=spider_task, error_code=98, result_type="END")
             logger.debug("\n" + warn(qid=spider_task.req_qid, type="ex1002", msg="爬虫队列满了"))
         else:
             g_co_pool.spawn(doTask, spider_task)
Example #6
File: slave.py Project: gitPff/sb-
def update_parser_from_older(_old_parser, task, need_result=False):
    # Reuse the previous parser object for the retry: keep its task_id, start time and,
    # optionally, the results already collected; only the retry counter is advanced.
    parser = _old_parser
    parser.task_id = _old_parser.task_id
    parser.start_crawl_time = _old_parser.start_crawl_time
    if need_result:
        parser._result = _old_parser._result
        parser._asy_temp_result = _old_parser._asy_temp_result
        parser.error_code_logger = _old_parser.error_code_logger
        parser.error_code_logger.retry_times += 1

    logger.info('update_parser success')
    return parser
Example #7
def random_useragent():
    pro_ua = True
    us_ag = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    # assumes UserAgent comes from fake_useragent and parse from user_agents
    ua = UserAgent()
    while pro_ua:
        sa = ua.random
        user_agent = parse(sa)
        # browser.version is a tuple such as (63, 0, 3239); compare its major component
        if user_agent.browser.family == 'IE' and user_agent.browser.version and user_agent.browser.version[0] > 8:
            us_ag = sa
            pro_ua = False
        elif user_agent.browser.family == 'Firefox' and user_agent.browser.version and user_agent.browser.version[0] > 13:
            us_ag = sa
            pro_ua = False
        elif user_agent.browser.family == 'Chrome' and user_agent.browser.version and user_agent.browser.version[0] > 20:
            us_ag = sa
            pro_ua = False
    logger.info('user_agent:%s' % us_ag)
    return us_ag
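
A short usage sketch, assuming the generated string is placed in an outgoing request header; the requests call and URL are illustrative only:

# Hypothetical usage of random_useragent()
import requests

headers = {'User-Agent': random_useragent()}
resp = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)  # placeholder URL
print(resp.status_code)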
Example #8
    def load(self):
        logger.debug('======= initializing spiders ======')
        spider_list = {}

        source_module_names = find_module_names('spider')
        for source in source_module_names:

            logger.debug("找到source:%s", source)
            spider_package = 'spider.' + source

            spider_module_names = find_module_names(spider_package)
            for spider_module in spider_module_names:
                try:
                    logger.info("找到module: %s", spider_module)
                    if spider_module.endswith('_spider'):
                        desc = init_spider(spider_package + '.' +
                                           spider_module)
                        if desc:
                            desc[0]['source_key'] = source
                            spider_list[desc[0]['source_type']] = desc[0]
                except Exception:
                    logger.info("寻找并加载 [ module ]: {0} 时出现异常,[ {1} ]".format(
                        spider_module, traceback.format_exc()))

        self.__spider_list = spider_list
        print('spiders: ', self.__spider_list)
        logger.info('=======spider init complete======')
Example #9
File: slave.py Project: gitPff/sb-
def do_worker(task_info_list):
    '''
    1. Receive the web request.
    2. Check and parse the task.
    3. One request may contain several tasks; add them to the worker one by one.
    4. When accepting tasks, check the task queue length; if it is exceeded, return an error in the
       synchronous callback so the caller resends the task (in case load balancing is poor).
    '''
    bottle_r_time_0 = time.time()

    task_num = len(task_info_list)
    req_num = g_task_queue.qsize() + task_num
    bottle_r_time_1 = time.time() - bottle_r_time_0
    for task in task_info_list:
        try:
            g_task_queue.put(task)
        except Exception:
            # task queue is full
            logger.debug(traceback.format_exc())
            callback.CallbackResult(task=task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=task.req_qid, type="ex1002", msg="spider queue is full"))
    bottle_r_time_2 = time.time() - bottle_r_time_0
    logger.info("bottle_run_time: 解析task: {}秒,总耗时:{}秒".format(bottle_r_time_1, bottle_r_time_2))
Example #10
 def mongo_insert(self):
     """
     暂未打印日志 提供给运维收取
     插定比只做 mongo入库统计
     :return:
     """
     try:
         client = MongoClient(
             "mongodb://*****:*****@10.19.56.168:27017")
         db = client['new_api_spider_check']
         collection_api = db[self.source]
         insert_data = {
             'source': self.source,
             't': self.t,
             'radio_check': self.radio_check,
             'type': self.type,
             'error_id': int(self.error_id),
             'msg': str(self.msg),
             'http_code': self.http_code,
             'req': self.req,
             'content': self.content,
             'resp': str(self.resp),
             'qid': self.qid,
             'ptid': self.ptid,
             'task_info': self.task_info,
             'key': self.key,
             'is_success': self.is_success
         }
         collection_api.insert_one(insert_data)
         logger.info("[插入查定比记录成功][mongo][{0}]".format(self.qid))
         logger.info("\n" + self.logger_info)
         # logger.debug('\n' + NewCheckBook.logger_info)
     except Exception as e:
         logger.info("[插入查定比记录失败][mongo][{0}]".format(str(e)))
     finally:
         try:
             client.close()
         except Exception as mme:
             logger.info("[插入查定比mongo连接错误][mongo][{0}]".format(str(mme)))
Example #11
File: slave.py Project: gitPff/sb-
            spider = spider_crawl(spider, task) # run the spider, retrying from the beginning
        except ParserException as e:
            error_info = e.msg
            error = e.code
            logger.exception('new framework: spider raised ParserException: task:{0}, code:{1}, msg:{2}'.format(task, error, error_info))
        except Exception as e:
            logger.exception("new framework: spider raised exception: task_data:{0}  error:{1}".format(task, e))
            error = SLAVE_ERROR
        spider.last_time = int((time.time() - crawl_time) * 1000)
        check_all_result(spider) # finally run checks over all returned results
        spider.spider_frame_status = 1
        callback.CallbackResult(task=task, error_code=spider.code, spider=spider, result_type="END") # run the callback; an END result triggers the synchronous callback
        error_logger(spider) # write the error log

        code = spider.code
    logger.info("[爬虫反馈 code: {0}][source: {1}] task: {2}".format(code, task.source, task))


def error_logger(spider):
    if hasattr(spider.task, 'new_task_id'):
        cur_id = spider.task.new_task_id
    else:
        cur_id = str(uuid.uuid1())
    
    task_id = cur_id
    if hasattr(spider, "succeed_pages"):
        spider.error_code_logger.succeed_pages = spider.succeed_pages
    elif hasattr(spider, "success_count"):
        spider.error_code_logger.succeed_pages = spider.success_count
    if hasattr(spider, "total_crawl_pages"):
        spider.error_code_logger.total_crawl_pages = spider.total_crawl_pages
Example #12
def ParseTask(req):
    """接收并解析task, 返回task对象list
    """
    result = list()
    params = req.params
    client_ip = req.remote_addr
    req_tasks = json.loads(urllib.unquote(params.get('req')))
    req_qid = params.get('qid')
    req_uid = params.get('uid')
    req_tid = params.get('tid', '')
    req_ori_type = params.get('ori_type', '')
    for req_task in req_tasks:
        try:
            task = Task()
            # whether this is a realtime verification request
            task.req_qid = req_qid
            task.req_uid = req_uid
            task.order_no = req_task.get('order_no', "")
            task.source = req_task.get('source')
            task.content = req_task.get('content')
            task.deadline = req_task.get('deadline', 0)
            task.debug = req_task.get('debug', False)
            task.tid = req_tid
            task.client_ip = client_ip
            task.ori_type = req_ori_type
            # task.proxy_info = proxy_info
            task.ticket_info = req_task.get('ticket_info')
            # todo: verification info
            task.verify = req_task.get('verify', {
                'type': 'pre',
                'set_type': 'E'
            })

            task.req_md5 = task.ticket_info.get('md5', 'default_md5')

            task.master_info = req_task.get('master_info', 'default_host')
            task.host = task.master_info.get('master_addr')

            task.redis_host = task.master_info.get('redis_addr').split(':')[0]
            task.redis_port = task.master_info.get('redis_addr').split(':')[-1]

            task.redis_db = task.master_info.get('redis_db')
            task.redis_passwd = task.master_info.get('redis_passwd')

            task.req_qid_md5 = task.req_qid + '-' + task.req_md5
            task.other_info = req_task.get('other_info', {})

            callback_type = 'scv100'
            if 'callback_type' in task.other_info:
                callback_type = task.other_info['callback_type']

            task.callback_type = callback_type

            redis_key_list = task.other_info.get('redis_key', [])
            # redis_key used to carry several values; now only one is sent, but the list format is kept
            for each in redis_key_list:
                task.redis_key = each
                task.other_info['redis_key'] = each
                logger.info('s[{0}] id[{1}]new verify task:{2}'.format(
                    task.source, task.new_task_id, task))
                result.append(task)
        except Exception as e:
            continue
    return result
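
Pieced together from the fields ParseTask reads, a request to this endpoint would look roughly like the following; every concrete value here is an illustrative assumption:

# Hypothetical payload for the handler that calls ParseTask(req):
# query parameters are req=<url-encoded JSON list>, qid, uid, tid, ori_type
req_payload = [{
    "source": "demo_air",                    # assumed source key
    "content": "SHA|PEK|2018-05-01",         # assumed task content
    "deadline": 60,
    "debug": False,
    "ticket_info": {"md5": "abc123", "auth": "{\"acc_mj_uid\": \"u001\"}"},
    "master_info": {"master_addr": "10.0.0.1:8080",
                    "redis_addr": "10.0.0.2:6379",
                    "redis_db": 0,
                    "redis_passwd": ""},
    "other_info": {"callback_type": "scv100", "redis_key": ["demo_key"]},
}]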