Example 1
        def parse_result(parser):
            # check that this target needs parsing and is among the required targets
            parser_name = parser.__name__.split('_', 1)[1]
            if parser_name in required:
                logger.debug(current_log_tag() + 'parse target %s',
                             parser_name)

                per_result = parser(request_template, converted_data)
                if per_result is not None:
                    if per_result:
                        start = datetime.datetime.now()
                        if isinstance(per_result, list):
                            # attach guest_info
                            store_utils.add_index_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result, page_index)
                            # attach stop-by info
                            store_utils.add_stop_by_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result,
                                self.spider.task)
                            result[parser_name].extend(per_result)
                        elif isinstance(per_result, dict):
                            result[parser_name].append(per_result)
                        logger.debug(current_log_tag() +
                                     '[result saved][no compression][elapsed: {0}]'.format(
                                         datetime.datetime.now() - start))
Example 2
def insert_rabbitmq(args, queue_list, routing_key):
    logger.debug('[rabbitmq write start]')
    try:
        credentials = pika.PlainCredentials(username=USER, password=PASSWD)
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=HOST,
                                      virtual_host='TrafficDataPush',
                                      credentials=credentials))
        channel = connection.channel()

        channel.exchange_declare(
            exchange='TrafficDataPush',
            # exchange_type='fanout',
            durable=True,
            auto_delete=False)

        # this block would modify the exchange and declare/bind the queues
        # for q in queue_list:
        #     channel.queue_declare(queue=q, durable=True)
        #     channel.queue_bind(queue=q, exchange='TrafficDataPush', routing_key=routing_key)

        msg = json.dumps(args, ensure_ascii=False)

        res = channel.basic_publish(
            exchange='TrafficDataPush',
            routing_key=routing_key,
            body=msg,
            properties=pika.BasicProperties(delivery_mode=2))
        connection.close()
        if not res:
            raise Exception('RabbitMQ Result False')
        logger.debug('[rabbitmq write done]')
    except Exception:
        # re-raise unchanged so the caller sees the original traceback
        raise
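
A minimal usage sketch for insert_rabbitmq (payload and routing key are illustrative only; it assumes HOST, USER and PASSWD point at a reachable broker and that the durable TrafficDataPush exchange already exists; note that queue_list is only consumed by the commented-out declaration block):

if __name__ == '__main__':
    payload = {'flight_no': 'CA1234', 'price': 1024}   # illustrative data only
    insert_rabbitmq(payload, queue_list=[], routing_key='traffic.push')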
Example 3
File: slave.py Project: gitPff/sb-
def doPostProcessTask(res, task, key):
    
    logger.debug(current_log_tag() + '[verify page upload start]')

    handler = multipartuploadufile.MultipartUploadUFile(g_config.ucloud_public_key, g_config.ucloud_private_key)
    stream = BytesIO(zlib.compress(res))
    ret, resp = handler.uploadstream(g_config.ucloud_bucket, key, stream)
    # retry up to 2 times on failure
    retry_times = 2
    if resp.status_code == 200:
        logger.debug(current_log_tag() + '[verify page upload done] md5:{0}'.format(key))
        return True
    while resp.status_code != 200 and retry_times:
        retry_times -= 1
        ret, resp = handler.resumeuploadstream()
        if resp.status_code == 200:
            logger.debug(current_log_tag() + '[verify page upload done] md5:{0}'.format(key))
            return True
    else:  # while/else: all retries exhausted without a 200 response
        except_logger = ExceptionLogger()
        except_logger.qid = task.req_qid
        except_logger.type = "PUSH_MD5_ERROR"
        except_logger.debug = json.dumps({"task_id": task.new_task_id, "source": task.source})
        logger.debug("\n" + except_logger.logger_info)
        logger.debug(current_log_tag() + '[verify page upload failed] md5:{0}'.format(key))

        return False
Example 4
File: slave.py Project: gitPff/sb-
def spider_crawl(spider, task):
    """
    重头开始重试
    :param parser:
    :param task:
    :return:
    """
    retry_count = 0
    need_crawl = True

    while need_crawl:
        retry_count += 1
        try:
            spider.error_code_logger = ErrorCodeLogger()
            spider.crawl()
            return spider
        except ParserException as e:
            need_crawl = retry_count < e.retry_from_first_count
            if not need_crawl:
                raise e
            else:
                # on retry, keep the same task_id and crawl start time
                spider = update_parser_from_older(spider, task)
                spider.error_code_logger.retry_times += 1
                logger.debug('retry from first - {0}/{1}'.format(retry_count, e.retry_from_first_count))
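
For context, spider_crawl relies on a ParserException that carries retry metadata. A minimal reconstruction of that contract, with attribute names inferred from the call sites in these examples (the real parser_except module may differ):

class ParserException(Exception):
    # error code plus retry hints, as read by the crawl loops in these examples
    def __init__(self, code, msg='', error=None, need_retry=False,
                 retry_from_first=False, retry_from_first_count=0):
        super(ParserException, self).__init__(msg)
        self.code = code                      # e.g. PROXY_INVALID, REQ_ERROR
        self.msg = msg
        self.error = error                    # the wrapped original exception, if any
        self.need_retry = need_retry          # developer explicitly requested a retry
        self.retry_from_first = retry_from_first              # restart the request chain
        self.retry_from_first_count = retry_from_first_count  # max full restarts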
Example 5
    def set_proxy(self, p, https=False):
        self.proxy = p
        proxy_type = 'NULL'
        if p is not None and p != "REALTIME":
            # SOCKS proxies are all forwarded by internal services, so detect them by the 10. prefix
            if "PROXY_API" in p:
                proxy_type = "API"
                self.br.proxies = p["PROXY_API"]
            elif "PROXY_GOOGLE_MAPS" in p:
                proxy_type = "GOOGLE_MAPS"
                self.br.proxies = p["PROXY_GOOGLE_MAPS"]

            elif p.startswith('10.'):
                # if p.split(':')[0] in SOCKS_PROXY:
                proxy_type = 'socks'
                self.br.proxies = {
                    'http': 'socks5://' + p,
                    'https': 'socks5://' + p
                }
                try:
                    # self.real_ip = get_real_id(self.br.proxies)
                    self.real_ip = p
                except Exception:
                    pass
            else:
                self.real_ip = p.split(':')[0]
                proxy_type = 'http'
                self.br.proxies = {
                    'https': 'http://' + p,
                    'http': 'http://' + p,
                }
        logger.debug('[framework proxy set][proxy type: %s][proxy ip: %s ]' % (proxy_type, p))
Example 6
def add_stop_by_info(versions, result, task):
    # locate the column that holds the stop-by info
    stop_by_index = TICKET_INFO_INDEX.get(versions, None)
    if stop_by_index is None:
        logger.debug('[stop_by_index not found][versions: {0}]'.format(versions))
        # try:
        #     from common import db
        #     sql = 'REPLACE INTO new_frame_not_replace_stop_by (ip, versions) VALUES (%s, %s)'
        #     db.execute_into_spider_db(sql, (get_local_ip(), versions))
        # except Exception as e:
        #     logger.warning('[failed to record the missing stop_by][ERROR: {0}]'.format(e))
        return

    logger.debug(current_log_tag() +
                 '[set stop_by_info][versions: {0}][index {1}]'.format(
                     versions, stop_by_index))

    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])

            #  cabin codes: E economy, P premium economy, B business, F first
            task_stop_by_info = task.ticket_info.get('v_seat_type',
                                                     None) or 'E'
            if versions == 'InsertMultiFlight':
                result[__i][stop_by_index] = '{0}&NULL'.format(
                    task_stop_by_info)
            else:
                result[__i][stop_by_index] = task_stop_by_info

            result[__i] = tuple(result[__i])
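
A worked sketch of the row mutation above, assuming it runs in the same module with a hypothetical index table (the real TICKET_INFO_INDEX values live elsewhere in the project):

TICKET_INFO_INDEX = {'InsertFlight': 1}    # hypothetical: stop-by field sits in column 1

class FakeTask(object):
    # stand-in for the real task object; only ticket_info is read here
    ticket_info = {'v_seat_type': 'B'}

rows = [('CA1234', 'placeholder')]
add_stop_by_info('InsertFlight', rows, FakeTask())
assert rows == [('CA1234', 'B')]   # each row tuple is rebuilt with the cabin code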
Example 7
    def load(self):
        logger.debug('======= initializing spiders ======')
        spider_list = {}

        source_module_names = find_module_names('spider')
        for source in source_module_names:

            logger.debug("找到source:%s", source)
            spider_package = 'spider.' + source

            spider_module_names = find_module_names(spider_package)
            for spider_module in spider_module_names:
                try:
                    logger.info("找到module: %s", spider_module)
                    if spider_module.endswith('_spider'):
                        desc = init_spider(spider_package + '.' +
                                           spider_module)
                        if desc:
                            desc[0]['source_key'] = source
                            spider_list[desc[0]['source_type']] = desc[0]
                except Exception:
                    logger.info("寻找并加载 [ module ]: {0} 时出现异常,[ {1} ]".format(
                        spider_module, traceback.format_exc()))

        self.__spider_list = spider_list
        print('spiders: ', self.__spider_list)
        logger.info('=======spider init complete======')
Example 8
 def result(self):
     try:
         for k, v in self._result.items():
             logger.debug(
                 current_log_tag() +
                 '[crawl result][key: {0}][value_len: {1}]'.format(k, len(v)))
     except Exception:
         pass
     return self._result
Example 9
        def write_message(max_try):
            """
            :param max_try:
            :return:
            """
            try:
                max_try -= 1
                msg = json.dumps({
                    'qid': task.req_qid,
                    'type': task.callback_type,
                    'uid': task.req_uid,
                    'query': json.dumps(query),
                    'status': spider_status
                })
                credentials = pika.PlainCredentials(
                    username=task.master_info['spider_mq_user'],
                    password=task.master_info['spider_mq_passwd'])
                connection = pika.BlockingConnection(
                    pika.ConnectionParameters(
                        host=task.master_info['spider_mq_host'],
                        virtual_host=task.master_info['spider_mq_vhost'],
                        credentials=credentials,
                        # heartbeat_interval=0
                    ))
                channel = connection.channel()

                res = channel.basic_publish(
                    exchange=task.master_info['spider_mq_exchange'],
                    routing_key=task.master_info['spider_mq_routerKey'],
                    properties=pika.BasicProperties(delivery_mode=2),
                    body=msg,
                )
                connection.process_data_events()

                connection.close()
                if not res:
                    warn_msg = 'RabbitMQ Result False: {0}'.format(msg)
                    info = warn(str(task.req_qid), 'ex_RabbitMQ',
                                get_local_ip(), warn_msg)
                    logger.debug("\n" + info)
                    raise Exception('RabbitMQ Result False')
                logger.debug(
                    '[callback a verifytask done] qid:{}, source: {}, task_info: {}, status: {}'
                    .format(str(task.req_qid), str(task.source), task.content,
                            spider_status))
                return max_try
            except Exception as exc:
                if max_try > 0:
                    return write_message(max_try)
                else:
                    warn_msg = 'RabbitMQ Result False qid : {}, e_info: {}, msg: {}'.format(
                        task.req_qid, traceback.format_exc(), msg)
                    info = warn(task.req_qid, 'ex_SpiderMQ', get_local_ip(),
                                warn_msg)
                    logger.exception("\n" + info)
                    return max_try
Example 10
    def crawl_data(self, request_template, browser, source_name):
        """
        页面抓取函数
        :param request_template: 请求字典
        :param browser: 抓取浏览器
        :param source_name: 源名称
        :return: 返回抓取结果 response 对象
        """
        try:
            logger.debug(current_log_tag() + 'crawl %s, retry_count: %s',
                         self.__request_func.__name__, self.req_count)
            # attach the proxy
            self.browser_set_proxy(browser, source_name)

            resp, self.content_length = self.__crawl_data_str(
                request_template, browser)

            # todo: rework the result returned by user_retry
            if self.user_retry:
                try:
                    user_check = self.spider.user_retry_err_or_resp(
                        resp, self.req_count, request_template, False)
                except Exception as e:
                    self.user_exc = True
                    raise e

                # when the user check returns True
                if user_check:
                    return resp
                else:
                    raise parser_except.ParserException(
                        parser_except.PROXY_INVALID, 'proxy error')
            else:
                return resp
        except parser_except.ParserException as e:
            self.is_forbidden = e.code in (parser_except.PROXY_FORBIDDEN,
                                           parser_except.REQ_ERROR)
            self.req_exception = e
        except Exception as e:
            self.req_exception = parser_except.ParserException(
                parser_except.REQ_ERROR, 'req exception:{0}'.format(e))

            # if a user exception occurred, keep the user's ParserException
            if self.user_exc:
                if isinstance(e, parser_except.ParserException):
                    self.req_exception = e

        finally:
            if self.req_exception:
                code = self.req_exception.code
            else:
                code = 0

        if self.req_exception:
            raise self.req_exception
Example 11
def curl_real_ip(p):
    try:
        time_1 = time.time()
        socks_req = '''curl --socks5 {0} http://httpbin.org/ip'''.format(p)
        socks_IP = os.popen(socks_req).readlines()
        logger.debug('[framework proxy set][socks proxy exit ip: %s ]' % (socks_IP))
        time_2 = time.time()
        socks_time = time_2 - time_1
        logger.debug('[fetched socks proxy exit ip in %s seconds]' % (socks_time))
    except Exception:
        logger.exception('[failed to fetch socks proxy exit ip]')
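
The same exit-ip check can be done in-process instead of shelling out to curl; a sketch using requests with SOCKS support (assumes PySocks, i.e. the requests[socks] extra, is installed; httpbin.org as above):

import time
import requests

def real_ip_via_requests(p):
    # p is a host:port SOCKS5 proxy, as in curl_real_ip above
    proxies = {'http': 'socks5://' + p, 'https': 'socks5://' + p}
    start = time.time()
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
    logger.debug('[socks exit ip: %s][took %.2f s]'
                 % (resp.json().get('origin'), time.time() - start))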
Example 12
File: slave.py Project: gitPff/sb-
 def run(self):
     while True:
         spider_task = g_task_queue.get(block=True)
         logger.info('coroutine pool size: {0} idle: {1}'.format(g_co_pool.size, g_co_pool.free_count()))
         if g_co_pool.free_count() < 2:
             msg = "协程池中任务堆积:{} 空闲池:{} 任务池:{}".format(g_co_pool.size,g_co_pool.free_count(),
                                                        g_task_queue.qsize())
             print
             callback.CallbackResult(task=spider_task, error_code=98, result_type="END")
             logger.debug("\n" + warn(qid=spider_task.req_qid, type="ex1002", msg="爬虫队列满了"))
         else:
             g_co_pool.spawn(doTask, spider_task)
Example 13
 def logging(*args, **kw):
     func_count_dict[fun.__name__] += 1
     begin = datetime.now()
     logger.debug(current_log_tag() +
                   'function {0} call start'.format(fun.__name__))
     result = fun(*args, **kw)
     end = datetime.now()
     logger.debug(current_log_tag() +
                   'function {0} call end'.format(fun.__name__))
      # logger.debug(current_log_tag() + ', function %s took %s; running %s instances of this function, %s coroutines', fun.__name__, (end - begin),
      #              func_count_dict[fun.__name__], mioji.common.pool.pool.size)
     func_count_dict[fun.__name__] -= 1
     return result
Example 14
 def __target_append_result(result, new_result):
     """
     向 result 中添加数据
     :param result: 被添加量
     :param new_result: 添加量
     :return: None
     : 此处用了字典的单例。
     """
     for k, v in new_result.items():
         if not v:
             continue
         logger.debug(current_log_tag() + "%s, length=%s, all=%s", k,
                      len(v), len(result.get(k, [])))
         result[k] += v
Example 15
    def __crawl_list(self, reqParse, browser, req_list):
        """
        Crawl the pagination serially.
        """
        result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        total_count = 0
        success_count = 0
        error_req = []
        for req in req_list:
            # the serial paging cap was removed:
            # if NEED_FLIP_LIMIT:
            #     if total_count >= MAX_FLIP:
            #         break
            total_count += 1
            try:
                res = self.__single_crawl(reqParse,
                                          browser,
                                          req,
                                          page_count=total_count)
                self.__target_append_result(result, res)
                # count the page as successful only after it parsed and merged
                success_count += 1
                all_except = False
            except Exception as e:
                all_ok = False
                one_exception = e
                error_req.append((req, str(one_exception)))
                logger.exception(
                    current_log_tag() + '[new framework][page parse exception][ {0} ]'.format(
                        traceback.format_exc().replace('\n', '\t')))

                # re-raise exceptions coming from a generator request
                if isinstance(req, types.GeneratorType):
                    raise e
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[paging crawl][serial][ success {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[paging crawl][serial][ failed page requests {0} ]'.format(str(error_req)))
        return result, all_except, all_ok, one_exception
Example 16
    def __async_crawl_list(self, reqParse, browser, req_list):
        """
        Crawl the pagination in parallel,
        dispatching each page to the coroutine pool.
        """

        a_result = defaultdict(list)
        all_except = True
        all_ok = True
        one_exception = None

        params = []
        total_count = 0
        for req in req_list:
            total_count += 1
            params.append((reqParse, browser, req, total_count))

        result = block_async(pool, self.__single_crawl, params)

        success_count = 0
        error_req = []
        for a_res in result:
            err_or_data, is_data = a_res
            if is_data:
                success_count += 1
                all_except = False
                self.__target_append_result(a_result, err_or_data)
            else:
                all_ok = False
                args, kwargs, one_exception = err_or_data
                if hasattr(
                        one_exception,
                        'retry_from_first') and one_exception.retry_from_first:
                    raise one_exception
                error_req.append((args[2], str(one_exception)))
        if reqParse.binding:
            self.success_count = success_count
            self.all_count = total_count
        logger.debug(
            current_log_tag() +
            '[paging crawl][parallel][ success {0} / {1} ]'.format(success_count, total_count))
        if error_req:
            logger.debug(current_log_tag() +
                         '[paging crawl][parallel][ failed page requests {0} ]'.format(str(error_req)))
        return a_result, all_except, all_ok, one_exception
Example 17
 def convert(self, request_template, data):
     data_con = request_template.get('data', {})
     c_type = data_con.get('content_type', 'string')
     logger.debug(current_log_tag() + 'Converter got content_type: %s',
                  c_type)
      if c_type == 'html':
          return HTML.fromstring(data)
      elif c_type == 'json':
          return json.loads(data)
     elif isinstance(c_type, types.MethodType):
         try:
             return c_type(request_template, data)
          except Exception:
              raise parser_except.ParserException(
                  -1, 'convert func error: {0}, func: {1}'.format(
                      traceback.format_exc(), c_type))
     else:
         return data
Example 18
 def new_limit(self, limit, task):
     """
     排队服务
     :param limit:
     :param task:
     :return:
     """
     try:
         source = limit['source_name']
         url = 'http://10.19.23.81:8901/sort'
         data = {
             'source': source,
             'state': 'a',
             'timeout': 30,
             'task': {
                 'task': task
             }
         }
         logger.debug('new limit req: ' + _json.dumps(data))
         res = requests.post(url=url,
                             data=_json.dumps(data),
                             timeout=(10, 40))
         logger.debug('new limit resp: ' + str(res.content))
     except Exception as e:
         logger.debug('new limit error' + str(e))
         return False
      return res.content != 'False'
Example 19
File: slave.py Project: gitPff/sb-
def error_logger(spider):
    if hasattr(spider.task, 'new_task_id'):
        cur_id = spider.task.new_task_id
    else:
        cur_id = str(uuid.uuid1())
    
    if hasattr(spider, "succeed_pages"):
        spider.error_code_logger.succeed_pages = spider.succeed_pages
    elif hasattr(spider, "success_count"):
        spider.error_code_logger.succeed_pages = spider.success_count
    if hasattr(spider, "total_crawl_pages"):
        spider.error_code_logger.total_crawl_pages = spider.total_crawl_pages
    elif hasattr(spider, "all_count"):
        spider.error_code_logger.total_crawl_pages = spider.all_count

    if hasattr(spider.task, "verify"):
        verify_type = spider.task.verify
        if isinstance(verify_type, dict):
            _type = verify_type.get('type', "")
        else:
            verify_type = json.loads(spider.task.verify)
            _type = verify_type.get('type', "")
        spider.error_code_logger.verify_type = _type
    spider.error_code_logger.task_id = cur_id
    spider.error_code_logger.source = spider.task.source
    spider.error_code_logger.tid = spider.task.tid
    spider.error_code_logger.ori_type = spider.task.ori_type
    spider.error_code_logger.task_info = json.dumps(spider.task.ticket_info, ensure_ascii=False)
    spider.error_code_logger.content = spider.task.content
    spider.error_code_logger.client_ip = spider.task.client_ip
    spider.error_code_logger.error_code = convert_code(spider.code)
    spider.error_code_logger.detail_code = spider.code
    spider.error_code_logger.qid = spider.task.req_qid
    spider.error_code_logger.MD5 = spider.verify_data["data"]
    spider.error_code_logger.last_time = spider.last_time
    if spider.code != 0:
        spider.error_code_logger.exception = spider.exception
    logger.debug('\n' + spider.error_code_logger.logger_info)
Example 20
File: slave.py Project: gitPff/sb-
def do_worker(task_info_list):
    '''
    1. Receive the web request.
    2. Validate and parse the task.
    3. One request may carry several tasks; enqueue them for the worker one by one.
    4. On intake, check the task queue length; if it is exceeded, return an error in the
       synchronous callback so the caller can resend the task (in case load balancing is poor).
    '''
    bottle_r_time_0 = time.time()

    task_num = len(task_info_list)
    req_num = g_task_queue.qsize() + task_num
    bottle_r_time_1 = time.time() - bottle_r_time_0
    for task in task_info_list:
        try:
            g_task_queue.put(task)
        except Exception:
            # the task queue is full
            logger.debug(traceback.format_exc())
            callback.CallbackResult(task=task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=task.req_qid, type="ex1002", msg="爬虫队列满了"))
    bottle_r_time_2 = time.time() - bottle_r_time_0
    logger.info("bottle_run_time: 解析task: {}秒,总耗时:{}秒".format(bottle_r_time_1, bottle_r_time_2))
Example 21
def add_index_info(versions, result, page_index):
    # locate the column that holds the guest info
    index_info_index = INDEX_INFO_INDEX.get(versions, None)
    if index_info_index is None:
        return

    logger.debug(current_log_tag() +
                 '[set index_info][versions: {0}][index {1}]'.format(
                     versions, index_info_index))
    for __i in range(len(result)):
        if result[__i]:
            result[__i] = list(result[__i])
            old_index_info = result[__i][index_info_index]
            try:
                old_index_info = json.loads(old_index_info)
                if not isinstance(old_index_info, dict):
                    raise Exception('Type Is Not Dict')
            except Exception:
                old_index_info = {'unparse_info': old_index_info}
            index_info = {k: v for k, v in old_index_info.items()}
            index_info['page_index'] = page_index
            index_info['item_index'] = __i
            result[__i][index_info_index] = json.dumps(index_info)
            result[__i] = tuple(result[__i])
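
A worked sketch of the JSON merge above, again assuming it runs in the same module with a hypothetical index table (the real INDEX_INFO_INDEX lives elsewhere in the project):

INDEX_INFO_INDEX = {'InsertFlight': 1}    # hypothetical: index info sits in column 1

rows = [('CA1234', '{"guests": 2}')]
add_index_info('InsertFlight', rows, page_index=3)
# the old JSON is merged with the position fields, leaving roughly:
# [('CA1234', '{"guests": 2, "page_index": 3, "item_index": 0}')]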
Example 22
    def __spider_append_result(self, new_result):
        """
        向 self.result 中添加解析结果
        :param new_result: 必须为解析结果
        :return: None
        :调用回调方法
        """

        for k, v in new_result.items():
            if not v:
                continue
            data_bind = self.targets[k].get('bind', None)
            if data_bind:
                logger.debug(
                    current_log_tag() +
                    "[ crawl binding {0} ][ data bound to {1} ]".format(
                        k, data_bind))
                self._result[data_bind] += v
                logger.debug(current_log_tag() + "%s, length=%s, all=%s", k,
                             len(v), len(self._result.get(k, [])))
            else:
                self._result[k] += v
                logger.debug(current_log_tag() + "%s, length=%s, all=%s", k,
                             len(v), len(self._result.get(k, [])))
Example 23
    def parse(self,
              request_template,
              targets_bind,
              converted_data,
              page_index,
              required=None,
              multi_last=False):
        result = defaultdict(list)
        parsed = set()
        if not multi_last:
            parser_list = request_template.get('user_handler', [])
            for parser in parser_list:
                if parser not in parsed:
                    logger.debug(current_log_tag() + 'user parser %s', parser)
                    parser(request_template, converted_data)

        # update result via the parsers
        def parse_result(parser):
            # check that this target needs parsing and is among the required targets
            parser_name = parser.__name__.split('_', 1)[1]
            if parser_name in required:
                logger.debug(current_log_tag() + 'parse target %s',
                             parser_name)

                per_result = parser(request_template, converted_data)
                if per_result is not None:
                    if per_result:
                        start = datetime.datetime.now()
                        if isinstance(per_result, list):
                            # attach guest_info
                            store_utils.add_index_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result, page_index)
                            # attach stop-by info
                            store_utils.add_stop_by_info(
                                self.spider.targets.get(parser_name, {}).get(
                                    'version', None), per_result,
                                self.spider.task)
                            result[parser_name].extend(per_result)
                        elif isinstance(per_result, dict):
                            result[parser_name].append(per_result)
                        logger.debug(current_log_tag() +
                                     '[result saved][no compression][elapsed: {0}]'.format(
                                         datetime.datetime.now() - start))

        # parse targets: hotels, rooms, etc.
        # for target, parser in targets_bind.items():
        if isinstance(self.binding,
                      Iterable) and not isinstance(self.binding, (str, bytes)):
            for binding in self.binding:
                # handle the different binding kinds
                if binding is None:
                    continue
                elif isinstance(binding, (str, bytes)):
                    parser = targets_bind.get(binding, '')
                    if parser == '':
                        raise TypeError('cannot get parser {0} from targets'.format(binding))
                elif callable(binding):
                    parser = binding
                else:
                    raise TypeError('unsupported binding type {0} for {1}'.format(
                        type(binding), repr(binding)))
                # update result
                parse_result(parser)

        elif isinstance(self.binding, (str, bytes)):
            parser = targets_bind.get(self.binding, '')
            if parser == '':
                raise TypeError('cannot get parser {0} from targets'.format(self.binding))

            # update result
            parse_result(parser)

        elif callable(self.binding):
            parser = self.binding
            # update result
            parse_result(parser)

        return result
Example 24
    def __single_crawl(self, reqParse, browser, request_template, page_count):
        """ 用于请求的基本方法
        """
        # headers from the request chain can be carried over
        headers = request_template['req'].get('headers', None)
        use_headers = request_template['req'].get('use_headers', False)
        if headers:
            browser.add_header(headers, use_headers)

        # default value for res
        res = defaultdict(list)

        # initialize per-request state

        local_req_count = 0
        reqParse.req_count = 0
        reqParse.is_forbidden = False
        reqParse.req_exception = None
        reqParse.proxy = None
        reqParse.content_length = 0

        self.__cpu_time += time.time() * 1000

        while local_req_count < reqParse.retry_count:
            # count this attempt
            local_req_count += 1
            logger.debug(
                current_log_tag() +
                '[crawl start][ {0} ]'.format(request_template['req'].get('url', '')))
            # the attempt count is passed in from outside so proxy exceptions raised during parse can trigger a re-crawl
            try:
                resp = reqParse.crawl_data(request_template, browser,
                                           self.task.source)
            except parser_except.ParserException as e:
                traceback.print_exc()
                if reqParse.user_exc:
                    # re-raise the error the user raised in their handler
                    raise e
                # error codes 21/22/23, or the developer explicitly requested a retry
                if e.code in (parser_except.PROXY_FORBIDDEN,
                              parser_except.PROXY_INVALID,
                              parser_except.REQ_ERROR,
                              parser_except.PROXY_SSL) or e.need_retry:
                    reqParse.is_forbidden = True

                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() +
                                     traceback.format_exc())
                        logger.debug(current_log_tag() +
                                     '[retrying][error raised by framework][code:{0}][count:{1}]'.
                                     format(e.code, reqParse.req_count))
                        continue
                else:
                    raise e
            except Exception as e:
                if reqParse.user_exc:
                    # re-raise the error the user raised in their handler
                    raise e
                if local_req_count >= reqParse.retry_count:
                    raise e
                else:
                    continue

            # store resp on the request template
            request_template['resp'] = resp
            # log and store the crawl result
            self.response_callback(request_template, resp)
            if reqParse.res_text == 'text':
                res = resp.text
            else:
                res = resp.content
            try:
                logger.debug(current_log_tag() +
                             '[crawl result][ {2} ][ {0} ... ... {1} ]'.format(
                                 res[:100], res[-100:], request_template['req']
                                 ['url']).replace('\n', '').replace('\t', ''))
            except Exception:
                pass
            # skip the upload when running locally
            if not self.debug and self.env != "local":
                md5_key = get_md5(res)
                verify_task_info = {
                    'func_name': reqParse.request_func.__name__,
                    'page_index': page_count,
                    'retry_count': local_req_count - 1,
                    'md5_key': md5_key
                }
                # enqueue the crawled page for upload to UCloud
                self.task_post_process_queue.put((res, self.task, md5_key))
                self.verify_data['data'].append(verify_task_info)

            point_time = time.time() * 1000
            try:
                convert_data = reqParse.convert(request_template, res)
            except Exception:
                if local_req_count >= reqParse.retry_count:
                    logger.debug(current_log_tag() + traceback.format_exc())
                    raise parser_except.ParserException(
                        parser_except.DATA_FORMAT_ERROR,
                        '[traceback: {0}]'.format(traceback.format_exc()))
                else:
                    continue
            finally:
                self.__cpu_time += time.time() * 1000 - point_time

            # parsing stage
            point_time = time.time() * 1000
            try:
                res = reqParse.parse(request_template,
                                     self.__targets_parser_func_dict,
                                     convert_data, page_count,
                                     self._crawl_targets_required)

                break
            except parser_except.ParserException as e:
                if e.code in (parser_except.PROXY_FORBIDDEN,
                              parser_except.PROXY_INVALID):
                    reqParse.is_forbidden = True

                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() +
                                     '[retrying][error raised by spider][code:{0}]'.format(e.code))
                        convert_data = None
                        continue
                else:
                    raise e
            except Exception:
                raise parser_except.ParserException(
                    parser_except.PARSE_ERROR,
                    '[traceback:{0}]'.format(traceback.format_exc()))
            finally:
                self.__cpu_time += time.time() * 1000 - point_time
                self.response_callback(request_template, resp)
        have_ticket = False
        for k, v in res.items():
            if not v:
                continue
            self._asy_temp_result[k] += v
            have_ticket = True
        # async callback only when tickets exist and the spider was invoked by a slave
        if have_ticket and self.process_callback and not self.debug and self.env != "local":
            self.process_callback(task=self.task,
                                  spider=self,
                                  result_type="RUNNING")

        return res
Example 25
class CallbackWorkload(object):
    def __init__(self):
        self.pool_dict = dict()

    def __str__(self):
        return json.dumps(self.__dict__)

    def CallbackResult(self,
                       spider=None,
                       task=None,
                       error_code=0,
                       result_type="END"):
        """
        不只传spider进来是因为11错误码找不到spider
        @ task:任务task信息
        @ result_type:传入回调状态,end为最终状态同步执行,running为中间状态异步执行
        @ proxy:需要写入的回调数据
        @ error_code 写入的错误码
        """
        from slave import g_asy_callback_pool
        if result_type == "END":
            self.doCallback(task, error_code, spider, result_type)
        else:
            g_asy_callback_pool.spawn(self.doCallback, task, error_code,
                                      spider, result_type)

    def doCallback(self, task, error_code, spider, result_type):
        """
        执行回调工作
        """
        def get_ticket_num():
            ticket_num = 0
            for per_data_type in spider.crawl_targets_required:
                ticket_num += len(spider._asy_temp_result[per_data_type])
            return ticket_num

        def get_result(_result):
            _proxy_or_ticket = []
            for per_data_type in spider.crawl_targets_required:
                _proxy_or_ticket.extend(_result[per_data_type])
            return _proxy_or_ticket

        # if RUNNING, wait a second and check again.
        if result_type == "RUNNING":
            num1 = get_ticket_num()
            time.sleep(1)
            # ticket count after the buffer interval
            num2 = get_ticket_num()
            if num1 != num2 or spider.spider_frame_status:
                return

        task.other_info['parser_error'] = int(error_code)
        query = {"other_info": task.other_info}
        result = None
        redis_mq_logger = RedisMQCostLogger()
        extra = {}
        if spider:
            result = spider._asy_temp_result if result_type == 'RUNNING' else spider.result
            result = get_result(result)
            extra = spider.extra
            redis_mq_logger.ticket_num = len(spider._asy_temp_result)

        try:
            redis_mq_logger.qid = task.req_qid
            redis_mq_logger.source = task.source
            redis_mq_logger.task_id = task.new_task_id
            redis_mq_logger.task_info = task.content
            redis_mq_logger.error_code = error_code
            if result_type == 'END':
                redis_mq_logger.is_end = 1
            # write to redis
            redis_cost = self.write_redis_ticket(task, result, error_code,
                                                 extra)
            if isinstance(redis_cost, tuple):
                redis_mq_logger.conn_redis = redis_cost[0]
                redis_mq_logger.write_redis = redis_cost[1]
            else:
                redis_mq_logger.exception = redis_cost

        except Exception as e:
            logger.exception('not redis con' + str(e))
        # write to MQ
        operation_info = self.call_back_toservice(task, query, result_type)
        mq_try, mq_cost = operation_info.get('result', 0), operation_info.get(
            'cost_time', 0)
        redis_mq_logger.mq_cost = mq_cost
        redis_mq_logger.mq_try = mq_try
        logger.debug('wrote redis and mq:\n' + redis_mq_logger.logger_info)
Example 26
def ctrip_cn_parser(content, url, other_info):
    hotel = CtripCNHotel()
    try:
        root = html.fromstring(content.decode('utf-8'))
    except Exception as e:
        print(traceback.format_exc())
        raise PARSE_ERROR

    try:
        phantomjs = execjs.get('PhantomJS')
        js_str = root.xpath("//script[contains(text(),'hotelDomesticConfig')]/text()")[0]
        page_js = phantomjs.compile(js_str[:js_str.index('function loadCallback()')])
    except Exception as e:
        print(traceback.format_exc())
        logger.debug(current_log_tag() + '[failed to pull data out of the page JS]')

    try:
        hotel_name = root.xpath('//h2[@class="cn_n"]/text()')[0]

        temp = re.findall(u'([\u4e00-\u9fa5\\s]*)', hotel_name)
        zh_name_temp = [t for t in temp if t and t != ' ']
        if len(zh_name_temp) == 1:
            hotel.hotel_name = zh_name_temp[0].encode('utf8')
        elif len(zh_name_temp) > 1:
            temp_ii = hotel_name.find(zh_name_temp[-1]) + len(zh_name_temp[-1])
            temp_iii = hotel_name.find('）', temp_ii)
            if temp_iii > -1:
                hotel.hotel_name = hotel_name[:temp_iii + 1].encode('utf8')
            else:
                hotel.hotel_name = hotel_name[:temp_ii + 1].encode('utf8')
        else:
            hotel.hotel_name = ''

        if not zh_name_temp:
            hotel.hotel_name_en = hotel_name.encode('utf8').strip(')').strip('(').strip('）').strip('（').strip()
        else:
            name_en_temp = hotel_name[hotel_name.find(zh_name_temp[-1]) + len(zh_name_temp[-1]) + 1:]
            hotel.hotel_name_en = name_en_temp.encode('utf8').strip(')').strip('(').strip('）').strip('（').strip()
    except Exception as e:
        print(traceback.format_exc())
        logger.debug(current_log_tag() + '[failed to parse English name]')

    # try:
    #     hotel_name = root.xpath('//h2[@class="cn_n"]/text()')[0].strip()
    #     hotel.hotel_name = re.search(u'[\u4e00-\u9fa5]+', hotel_name).group()
    # except Exception as e:
    #     print traceback.format_exc()
    #     logger.debug(current_log_tag() + '[failed to parse Chinese name]')

    print "中文名:", hotel.hotel_name
    print "英文名:", hotel.hotel_name_en

    try:
        position = page_js.eval('hotelDomesticConfig')['hotel']['position'].split('|')
        hotel.map_info = position[1] + ',' + position[0]
        print "hotel.map_info:", hotel.map_info
    except Exception as e:
        try:
            position_temp = root.xpath('//*[@id="hotelCoordinate"]/@value')[0].encode('utf-8').strip().split('|')
            hotel.map_info = position_temp[0] + ',' + position_temp[1]

        except Exception as e:
            print(traceback.format_exc())
            logger.debug(current_log_tag() + '[failed to parse hotel address]')
            hotel.map_info = 'NULL'
Example 27
    def __crawl_data_str(self, request_template, browser):
        resp = None
        try:
            # usage change: the caller mutates request_template values directly
            self.spider.prepare_request(request_template)

            # pull req out of request_template
            req = request_template['req']

            # used for QPS control
            if hasattr(self.spider, 'queue_info'):
                browser.queue_info = self.spider.queue_info

            if hasattr(self.spider.task, 'req_qid'):
                browser.qid = self.spider.task.req_qid
            else:
                browser.qid = ""
            browser.task_id = self.spider.task.task_id
            browser.source = self.spider.task.source
            browser.tid = self.spider.task.tid
            browser.ori_type = self.spider.task.ori_type

            resp = browser.req(**req)
            # raise on HTTP-level errors
            resp.raise_for_status()

            content_length = len(resp.content)
            if isinstance(self.need_content_length, int):
                logger.debug(current_log_tag() +
                             '[spider content_length={1} check][page length must exceed {0}]'.format(
                                 self.need_content_length, content_length))
                if content_length <= self.need_content_length:
                    raise parser_except.ParserException(
                        parser_except.PROXY_INVALID, msg='data is empty')
            elif self.need_content_length is None:
                logger.debug(current_log_tag() + '[spider skips the content_length check]')
            else:
                logger.debug(current_log_tag() +
                             '[unknown content_length check type][type: {0}]'.format(
                                 str(type(self.need_content_length))))
            return resp, content_length
        # timeout
        except requests.exceptions.SSLError as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_SSL,
                                                msg=str(e),
                                                error=e)
        except requests.exceptions.ProxyError as e:  # proxy failed
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg='Proxy Error',
                                                error=e)

        except requests.exceptions.ConnectTimeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request connect Timeout',
                                                error=e)
        except requests.exceptions.ReadTimeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request read Timeout',
                                                error=e)
        except requests.exceptions.Timeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request Timeout',
                                                error=e)

        except requests.exceptions.ConnectionError as err:
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err))

        except requests.exceptions.HTTPError as err:  # catches 4xx/5xx status codes
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err),
                                                error=err)

        except requests.exceptions.RequestException as err:  # the umbrella requests error
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err),
                                                error=err)
        except Exception as e:  # the final catch-all
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=traceback.format_exc())
Example 28
    def req(self,
            url,
            method='get',
            params=None,
            data=None,
            json=None,
            timeout=60,
            verify=False,
            **kw):
        httpLogger = HttpLogger()
        httpLogger.qid = self.qid
        httpLogger.task_id = self.task_id
        httpLogger.req_type = method
        httpLogger.source = self.source
        httpLogger.url = url
        httpLogger.proxy_out = str(self.out_ip)
        httpLogger.proxy = str(self.proxy)
        httpLogger.proxy_inf = str(self.proxy_inf)
        httpLogger.retry_count = self.req_count
        for k in kw.keys():
            if k not in [
                    'method', 'url', 'params', 'data', 'headers', 'cookies',
                    'files', 'auth', 'timeout', 'allow_redirects', 'proxies',
                    'hooks', 'stream', 'verify', 'cert', 'json'
            ]:
                logger.warning(current_log_tag() +
                               '[unrecognized req request parameter][{0}]'.format(k))
        new_kw = {
            k: v
            for k, v in kw.items() if k in [
                'method', 'url', 'params', 'data', 'headers', 'cookies',
                'files', 'auth', 'timeout', 'allow_redirects', 'proxies',
                'hooks', 'stream', 'verify', 'cert', 'json'
            ]
        }
        ts = int(1000 * time.time())
        if data:
            httpLogger.data = data
            if isinstance(data, dict):
                httpLogger.data = _json.dumps(data, ensure_ascii=False)
        if json:
            httpLogger.data = json
            if isinstance(json, dict):
                httpLogger.data = _json.dumps(json, ensure_ascii=False)

        req_func = self.req_bind.get(method.lower())
        httpLogger.cookie = str(req_func.__self__.cookies._cookies)
        httpLogger.source = self.source
        httpLogger.headers = str(new_kw.get('headers', ""))
        try:
            logger.debug(current_log_tag() +
                         'browser req start {1} {0}'.format(url, method))
            logger.debug(current_log_tag() +
                         'browser req data {0}'.format(data))
            logger.debug(current_log_tag() +
                         'browser req json {0}'.format(json))
            logger.debug(current_log_tag() +
                         'browser req params {0}'.format(params))
            logger.debug(current_log_tag() +
                         'browser req other_data {0}'.format(new_kw))
            logger.debug(current_log_tag() +
                         'browser req session_cookie {0}'.format(
                             req_func.__self__.cookies._cookies))
        except Exception:
            logger.debug(current_log_tag() + 'failed to collect some request parameters before sending')
        try:
            local_resp = None
            # todo: API QPS limiting
            # try:
            #     logger.debug(current_log_tag() + 'queue and qps config:{0}'.format(str(self.queue_info)))
            #     if not self.queue_info.get('source_name'):
            #         pass
            #     elif self.queue_info['source_name'] in limit_config.keys():
            #         try:
            #             cango = self.new_limit(self.queue_info, self.task_id)
            #         except Exception as why:
            #             logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
            #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit queue timeout & reqError')
            #         if not cango:
            #             raise parser_except.ParserException(parser_except.NEW_QPS_OVERFLOW, msg='limit queue timeout')
            # except Exception as why:

            # logger.debug(current_log_tag() + 'queue and qps fail reason:{0}'.format(str(why)))
            self.resp = local_resp = req_func(url,
                                              params=params,
                                              data=data,
                                              json=json,
                                              timeout=timeout,
                                              verify=verify,
                                              **new_kw)
            logger.debug(
                current_log_tag() +
                'browser response headers:{0}'.format(self.resp.headers))
            ts = int(1000 * time.time()) - ts
            httpLogger.last_time = ts
            logger.debug(
                current_log_tag() +
                'browser req end {1} {0} proxy[{4}] ms[{2}] status[{3}] length[{5}]'
                .format(url, method, ts, local_resp.status_code, self.proxy,
                        resp_content_lenght(local_resp)))
            httpLogger.resp_code = local_resp.status_code
            if len(str(local_resp.content)) > 1000:
                content = str(local_resp.content)[:1000]
            else:
                content = str(local_resp.content)
            httpLogger.resp_content = content
            httpLogger.proxy_out = str(self.out_ip)
            httpLogger.proxy = str(self.proxy)
        except:
            httpLogger.exception = str(traceback.format_exc())
            logger.debug(current_log_tag() +
                         'browser req end {1} {0} proxy[{2}] error:{3}'.format(
                             url, method, self.proxy, traceback.format_exc()))
            try:
                logger.debug('\n' + httpLogger.logger_info)
            except Exception as why:
                logger.debug(str(why))
            raise
        try:
            logger.debug('\n' + httpLogger.logger_info)
        except Exception as why:
            logger.debug(str(why))
        return local_resp
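
A minimal call sketch for req, assuming browser is an initialized instance whose qid, task_id, source, tid and ori_type are already set (URL and headers are illustrative only):

resp = browser.req('http://httpbin.org/get',
                   method='get',
                   headers={'User-Agent': 'Mozilla/5.0'},
                   timeout=30)
print(resp.status_code)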
Example 29
            position_temp = root.xpath('//*[@id="hotelCoordinate"]/@value')[0].encode('utf-8').strip().split('|')
            hotel.map_info = position_temp[0] + ',' + position_temp[1]

        except Exception as e:
            print(traceback.format_exc())
            logger.debug(current_log_tag() + '[failed to parse hotel address]')
            hotel.map_info = 'NULL'

    try:
        star = root.xpath("//span[@id='ctl00_MainContentPlaceHolder_commonHead_imgStar']")[0]
        print "star:", star.attrib['title'].encode('utf-8')
        hotel.star = int(re.search(r'([\d]+)', star.attrib['title'].encode('utf-8')).group(1))
        print "hotel.star:", hotel.star
    except:
        hotel.star = -1
        logger.debug(current_log_tag() + '[解析酒店星级失败]')

    try:
        grade = root.xpath("//span[@class='score']/text()")[0]
        hotel.grade = float(grade)
        print "hotel.grade:", hotel.grade
    except:
        try:
            grade = root.xpath("//span[@class='n']/text()")[0]
            hotel.grade = float(grade)
            print "hotel.grade:", hotel.grade
        except:
            hotel.grade = -1
            logger.debug(current_log_tag() + '[解析酒店评分失败]')

    try: