Ejemplo n.º 1
0
 def __init__(self, *args, **kwargs):
     """Set up the crawl state, resuming from the page stored in Redis.

     ``self.curPage`` comes from the ``jqw_cur_page`` key in ``rdb`` and
     falls back to 0 when the key is missing or empty.  ``self.maxPage``
     is fixed at 500 and ``self.cityCode`` is a hard-coded list of codes.
     All positional and keyword arguments are forwarded to the parent
     initialiser.
     """
     # Fetch the key once so the truthiness check and the decode operate
     # on the same value (a second GET could return None in between).
     saved_page = rdb.get('jqw_cur_page')
     self.curPage = int(saved_page.decode()) if saved_page else 0
     # NOTE(review): the 'jqw_pages' key in Redis is not consulted; the
     # page ceiling is hard-coded -- confirm that this cap is intended.
     self.maxPage = 500
     self.cityCode = [
         1, 129, 350, 371, 453, 573, 686, 840, 973, 1095, 1138, 1170, 1223,
         1245, 1409, 1551, 1726, 1866, 1887, 1991, 2104, 2242, 2346, 2393,
         2588, 2782, 2941, 3091, 3169, 3302, 3431, 3554, 3973, 4002
     ]
     super(JQSpider, self).__init__(*args, **kwargs)
Ejemplo n.º 2
0
 def __init__(self, *args, **kwargs):
     """Restore the page counter from Redis, then initialise the parent.

     ``self.count`` is the integer stored under the ``yhby_page`` key in
     ``rdb``; when that key is missing or empty it defaults to 1.
     """
     stored_page = rdb.get('yhby_page')
     self.count = int(stored_page.decode()) if stored_page else 1
     super(YhbySpider, self).__init__(*args, **kwargs)
Ejemplo n.º 3
0
 def get_new_proxy(self):
     """Return a proxy string, preferring the one cached in Redis.

     The ``proxy`` key in ``rdb`` is checked first; a non-empty value is
     returned as text exactly as stored.  Otherwise a proxy is requested
     from the HTTP endpoint below and returned prefixed with ``http://``.
     An empty response body yields an empty string.

     Raises:
         requests.exceptions.RequestException: if the HTTP request fails
             or does not complete within the timeout.
     """
     cached = rdb.get('proxy')
     if cached:
         # Redis may hand back bytes or str depending on client settings;
         # decode only when needed so both cases work.
         # NOTE(review): the cached value is returned without an
         # 'http://' prefix -- presumably it is stored with one; verify.
         return cached.decode() if isinstance(cached, bytes) else cached

     # A timeout keeps a stalled endpoint from blocking the caller forever.
     fetched = requests.get("http://182.92.190.100:5010/get/",
                            timeout=10).content
     if isinstance(fetched, bytes):
         fetched = fetched.decode()
     if fetched:
         return 'http://' + fetched
     return fetched
Ejemplo n.º 4
0
 def __init__(self, *args, **kwargs):
     """Initialise the spider and restore its progress from Redis.

     After the parent initialiser runs, this sets:

     * ``self.domain``: the site root URL.
     * ``self.cur_code``: integer from the ``cslm_city`` key, or 0 when
       the key is missing or empty.
     * ``self.cur_url``: text from the ``cslm_url`` key, or ``''`` when
       the key is missing or empty.
     * ``self.city_codes``: the hard-coded code list, trimmed so that it
       starts at ``self.cur_code`` when that code is in the list.
     """
     super(CslmSpider, self).__init__(*args, **kwargs)
     self.domain = 'http://www.ccoo.cn/'

     # Read each key once so the emptiness check and the decode see the
     # same value (a second GET could return None in between).
     saved_code = rdb.get('cslm_city')
     saved_url = rdb.get('cslm_url')
     self.cur_code = int(saved_code.decode()) if saved_code else 0
     self.cur_url = saved_url.decode() if saved_url else ''

     city_codes = [
         150, 153, 156, 182, 184, 196, 206, 212, 227, 228, 250, 276, 291,
         301, 312, 337, 366, 379, 396, 399, 419, 430, 441, 443, 453, 466,
         476, 776, 778, 777, 3251
     ]
     # Resume from the saved code: drop every code that precedes it.
     if self.cur_code in city_codes:
         city_codes = city_codes[city_codes.index(self.cur_code):]
     self.city_codes = city_codes
     self.log('init cur_code:{} cur_url: {}'.format(self.cur_code,
                                                    self.cur_url))
Ejemplo n.º 5
0
    def parseCityChannel(self, response):
        '''
        Get the channels of the current city (food, movies, leisure, etc.).

        The city is read from the ``dzdp_cur_city`` key in ``rdb``.  For
        every matching anchor in the response, its text and href are
        pushed as a pickled dict onto the ``dzdp_<city>_channels`` list
        and stored as a field/value pair in ``dzdp_<city>_channels_hash``.
        Finally ``getChannelClassify`` is called for the city.
        '''
        city = rdb.get('dzdp_cur_city')
        if isinstance(city, bytes):
            city = city.decode()

        # Both Redis key names depend only on the city, so build them once.
        list_key = 'dzdp_{}_channels'.format(city)
        hash_key = '{}_hash'.format(list_key)

        anchors = response.css(
            'div.J_filter_channel div.nc-contain div div a')
        for anchor in anchors:
            name = anchor.xpath('./text()').extract_first()
            href = anchor.xpath('./@href').extract_first()
            rdb.rpush(list_key, pickle.dumps({'name': name, 'href': href}))
            rdb.hset(hash_key, name, href)

        self.getChannelClassify(city, None, '生活服务')