Example #1
 def parse(self, response):
     html = response.text
     db = RedisClient()
     if 'verify' not in response.url and len(
             html) > 400 and r'https://www.abuyun.com' not in html:
         trade_areas = response.xpath(
             '//*[@id="bussi-nav"]/a/@href').extract()
         if trade_areas:
             for trade_area in trade_areas:
                 db.add_value('village:start_urls', trade_area)
             print(trade_areas)
             print('以商圈为单位当前URL:{}'.format(response.url))
             print('一共有{}商圈入库'.format(len(trade_areas)))
         else:
             adms = response.xpath(
                 '//*[@id="region-nav"]/a/@href').extract()
             for adm in adms:
                 db.add_value('gov:start_urls', adm)
             print('没有商圈选项,以行政区为单位')
             print('以行政区为单位当前URL:{}'.format(response.url))
             print(adms)
             print('一共有{}个行政单位入库'.format(len(adms)))
     elif len(html) < 400 and 'verify' not in response.url:
         print('遇到反爬了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         print('需要重新入库')
         db.add_value(self.redis_key, response.url)
     elif 'verify' in response.url:
         url = response.meta.get('redirect_urls')[0]
         print('出现问题,有验证码,url:{}'.format(response.url))
         print('需要重新入库,重定向之前的URL:{}'.format(url))
         db.add_value(self.redis_key, url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
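
Note: every example in this list assumes a small RedisClient helper that pushes URLs into scrapy-redis start_urls queues. The original class is not shown here; a minimal sketch, assuming redis-py and list-backed keys, could look like this:

import redis


class RedisClient:
    """Minimal sketch of the helper assumed by the examples below."""

    def __init__(self, host='localhost', port=6379, db=0):
        # decode_responses=True so stored URLs come back as str rather than bytes
        self.conn = redis.StrictRedis(host=host, port=port, db=db,
                                      decode_responses=True)

    def add_value(self, key, value):
        # scrapy-redis pops its start URLs from a list, so a plain LPUSH is enough
        self.conn.lpush(key, value)
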
Example #2
 def parse(self, response):
     html = response.text
     db = RedisClient()
     if 'verify' not in response.url and len(
             html) > 550 and r'https://www.abuyun.com' not in html:
         trade_areas = response.xpath(
             '//*[@id="bussi-nav-sub"]/a/@href').extract()
         if trade_areas:
             for trade_area in trade_areas[1:]:
                 db.add_value('ShopList:start_urls', trade_area)
             print(trade_areas)
             print('以商圈地点为单位当前URL:{}'.format(response.url))
             print('一共有{}商圈地点入库'.format(len(trade_areas[1:])))
         else:
             print('以商圈为单位当前URL:{}'.format(response.url))
             print('该商圈没有商圈地点,存入本商圈ULR:{}'.format(response.url))
             db.add_value('ShopList:start_urls', response.url)
     elif len(html) < 550 and 'verify' not in response.url:
         print('遇到反爬了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         print('需要重新入库')
         db.add_value(self.redis_key, response.url)
     elif 'verify' in response.url:
         url = response.meta.get('redirect_urls')[0]
         print('出现问题,有验证码,url:{}'.format(response.url))
         print('需要重新入库,重定向之前的URL:{}'.format(url))
         db.add_value(self.redis_key, url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
Example #3
 def parse(self, response):
     db = RedisClient()
     url_lists = response.xpath('//div[@class="letter_city"]/ul/li/div[@class="city_list"]/a/@href').extract()
     end = r'/community/?from=navigation'
     urls = []
     for url_list in url_lists:
         urls.append( url_list + end)
         db.add_value('AreaList:start_urls', url_list + end)
     print(urls)
     print('一共有{}个城市入库'.format(len(set(urls))))
Example #4
 def process_exception(self, request, exception, spider):
     print('错误原因:{}'.format(exception))
     try:
         value_url = request.meta.get('redirect_urls')[0]
     except:
         value_url = request.url
     print('IP代理不可用,本次url:{}   需要重新入库处理'.format(value_url))
     key = getattr(spider, 'redis_key')
     print('本次的类名加属性名字为:{}'.format(key))
     db = RedisClient()
     db.add_value(key, value_url)
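
A process_exception hook like this only runs when its middleware is enabled in the project settings. A hypothetical registration (the dotted path and class name are assumptions, not the original project layout):

# settings.py -- hypothetical entry; adjust the dotted path to the real middleware class
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyRetryMiddleware': 550,
}
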
Example #5
 def parse(self, response):
     city_list = response.xpath('//div[@class="findHeight"]/a/@href').extract()
     base_url = 'http://www.dianping.com'
     end_url = '/ch20/g187'
     citys = []
     db = RedisClient()
     for city in city_list:
         city_url = base_url + city.replace(r'//www.dianping.com', '') + end_url
         citys.append(city_url)
         db.add_value('TradeArea1:start_urls', city_url)
     print(citys)
     print('一共有{}个城市URL入库成功'.format(len(citys)))
Example #6
    def parse(self, response):
        db = RedisClient()
        city_content = response.text
        xpath_css = Selector(text=city_content)
        sp_urls = xpath_css.xpath(
            '//*[@id="list-content"]/div[@class="list-item"]/@link').extract()
        if 'verify' in response.url or 'params' in response.url:
            logger.warning('遇到验证码了,url放入待爬队列里面')
            urls = response.meta.get('redirect_urls')
            for url in urls:
                db.add_value('XzlList:start_urls', url)

        elif len(sp_urls) == 0:
            logger.warning('本url:{}-----没有搜索结果'.format(response.url))
            db.add_value('not_url:xzl', response.url)
        else:
            for sp_url in sp_urls:
                db.add_value('DetailSpider:start_urls', sp_url)
            logger.info(sp_urls)
            logger.info('一共{}个url已经入库完毕'.format(len(sp_urls)))
            next_page = xpath_css.xpath(
                '//a[@class="aNxt"]/@href').extract_first()
            if next_page:
                try:
                    page = re.search(r'p(\d+)', next_page)
                    logger.info('第{}页---网址为:{}'.format(page.group(1),
                                                       next_page))
                except Exception as e:
                    logger.warning('出错原因:{}'.format(e.args))
                yield Request(url=next_page, callback=self.parse)
Example #7
 def process_response(self, request, response, spider):
     # print('到这了:{}'.format('process_response'))
     # print('该URL:{},状态码为:{}'.format(response.url,response.status))
     db = RedisClient()
     if response.status in [404, 403]:
         if response.status == 404:
             print('该URL:{}已经失效,状态码为:{}'.format(response.url,
                                                response.status))
             db.add_value('not_url:Shop', response.url)
         else:
             print('该IP已经被封,该url:{},需要重新入队列,状态码为:{}'.format(
                 response.url, response.status))
             key = getattr(spider, 'redis_key')
             db.add_value(key, response.url)
         raise IgnoreRequest
     elif response.status != 200 and response.status not in [404, 403]:
         print('该URL:{}      ,状态码为:{}'.format(response.url,
                                              response.status))
         try:
             value_url = request.meta.get('redirect_urls')[0]
         except:
             value_url = request.url
         if value_url and 'verify' not in value_url:
             print('可能被重定向了,本次url:{}   需要重新入库处理'.format(value_url))
             key = getattr(spider, 'redis_key')
             print('本次的类名加属性名字为:{}'.format(key))
             db.add_value(key, value_url)
         else:
             print('这是个严重错误,request:{},response:{}'.format(
                 request, response))
     elif 'verify' in request.url:
         print('该URL有验证码需求重新入队:{}含有验证码应该加入到队列中'.format(request.url))
         try:
             value_url = request.meta.get('redirect_urls')[0]
         except:
             value_url = request.url
         key = getattr(spider, 'redis_key')
         db.add_value(key, value_url)
         raise IgnoreRequest
     elif response.status == 200 and 'verify' not in request.url:
         pass
         # print('该URL:{},状态码为:{}'.format(response.url, response.status))
     else:
         print('状态码异常,请查看原因')
         print('该URL:{},状态码为:{}'.format(response.url, response.status))
     return response
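
The IgnoreRequest raised above comes from Scrapy's exceptions module. The minimal imports this middleware relies on (the RedisClient path is an assumed project-local module):

from scrapy.exceptions import IgnoreRequest

# hypothetical import path for the project helper sketched after Example #1
from myproject.db import RedisClient
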
Example #8
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        logger.warning('错误原因:{}'.format(exception))
        try:
            value_url = request.meta.get('redirect_urls')[0]
        except:
            value_url = request.url
        logger.info('IP代理不可用,本次url:{}   需要重新入库处理'.format(value_url))
        key = getattr(spider, 'redis_key')
        logger.info('本次的类名加属性名字为:{}'.format(key))
        db = RedisClient()
        db.add_value(key, value_url)
Example #9
    def parse(self, response):
        res = json.loads(response.text)
        db = RedisClient()
        item = GaodepoiItem()
        if res['status'] == '1':
            count = int(res['count'])
            page = math.ceil(count / 25)
            now_page = re.findall('page=(\d+)', response.url)[0]
            next_page = int(now_page) + 1
            if len(res['pois']) == 25:
                yield item
                # for i in range(2,page+1):
                print('有下一页需要把第{}页存入进去'.format(next_page))
                url = (response.url).replace('page={}'.format(now_page),
                                             'page={}'.format(next_page))
                # print('下一页URL:{}'.format(url))
                # db.add_value(self.redis_key, url)
                yield Request(url=url, callback=self.parse)
            # if len(item['pois']) > 23:
            #     for i in range(2, maxPage):
            #         logger.info('存入下一页的URL到Redis里面')
            #         url = (response.url).replace('page={}'.format(now_page), 'page={}'.format(i))
            #         db.add_value(self.redis_key, url)
            elif len(res['pois']) > 0:
                item['pois'] = res['pois']
                yield item
            elif len(res['pois']) == 0:
                print('当前没有数据,不可用URL:{}'.format(response.url))
                # db.add_value('NotGaoDePoiByCity:start_urls', response.url)
            elif page - int(now_page) < 0 and page != 0:
                print('出现异常内容:{}'.format(response.text))
                db.add_value('ExGaoDePoiByCity:start_urls', response.url)

            else:
                print('当前URL出现严重Bug,内容:{}'.format(response.text))
                db.add_value('BugGaoDePoiByCity:start_urls', response.url)
        elif res['status'] == '0':
            print('请求失败,重新入队列')
            db.add_value(self.redis_key, response.url)
        else:
            print('当前URL出现严重Bug,内容:{}'.format(response.text))
            db.add_value('BugGaoDePoiByCity:start_urls', response.url)
Example #10
 def parse(self, response):
     db = RedisClient()
     detail_urls_content = response.text
     xpath_css = Selector(text=detail_urls_content)
     if 'verify' in response.url:
         urls = response.meta.get('redirect_urls')
         print('遇到验证码了,url:{}重新放入待爬队列里面'.format(urls))
         for url in urls:
             db.add_value('DetailList:start_urls', url)
         return
     sp_urls = [
         str(sp_url).replace(r'loupan/', r'loupan/canshu-')
         for sp_url in xpath_css.xpath(
             '//*[@id="container"]/div[2]/div[1]/div[@class="key-list"]/div/@data-link'
         ).extract()
     ]
     if len(sp_urls) > 0:
         for sp_url in sp_urls:
             db.add_value('XinPan_spider:start_urls', sp_url)
         print(sp_urls)
         print('一共有{}个楼盘url,入库成功'.format(len(sp_urls)))
         next_page = xpath_css.xpath(
             '//a[@class="next-page next-link"]/@href').extract_first()
         if next_page:
             try:
                 page = re.search(r'p(\d+)', next_page)
                 print('第{}页---网址为:{}'.format(page.group(1), next_page))
             except Exception as e:
                 print('出错原因:{}'.format(e.args))
             yield Request(url=next_page, callback=self.parse)
Example #11
 def process_response(self, request, response, spider):
     logger.info('到这了:{}'.format('process_response'))
     logger.info(request.url)
     db = RedisClient()
     if response.status == 404:
         logger.warning('该URL:{}已经失效,放入失效库,可查看'.format(response.url))
         db.add_value('xp:not_url', response.url)
         raise IgnoreRequest
     if response.status != 200 and response.status != 404:
         logger.warning('----')
         logger.warning('出现问题了,这是状态码:{}'.format(response.status))
         try:
             value_url = request.meta.get('redirect_urls')[0]
         except:
             value_url = request.url
         if value_url and 'verify' not in value_url:
             logger.warning('可能被重定向了,本次url:{}   需要重新入库处理'.format(value_url))
             key = getattr(spider, 'redis_key')
             logger.warning('本次的类名加属性名字为:{}'.format(key))
             db.add_value(key, value_url)
         else:
             logger.error('这是个严重错误,request:{},response:{}'.format(
                 request, response))
     if 'captcha' in request.url:
         logger.info('该URL:{}含有验证码,应该重新加入到队列中'.format(response.url))
     return response
Example #12
 def parse(self, response):
     html = response.text
     db = RedisClient()
     if 'verify' not in response.url and len(
             html) > 700 and r'https://www.abuyun.com' not in html:
         adms = response.xpath('//*[@id="region-nav"]/a/@href').extract()
         is_50 = response.xpath(
             '/html/body/div[2]/div[3]/div[1]/div[2]/a[10]/text()'
         ).extract_first()
         try:
             page = int(is_50) if is_50 else int(1)
         except Exception as e:
             print('查找页数失败,失败原因:{},可能原因{}'.format(e.args, is_50))
             page = 1
         if adms and int(page) == 50:
             for adm in adms:
                 db.add_value('GovSpider:start_urls', adm)
             print('以行政区为单位当前URL:{}'.format(response.url))
             print(adms)
             print('一共有{}个行政单位入库'.format(len(adms)))
         elif int(page) < 50:
             print('没有50页,一共才{}页,不需要在细分了,直接把当前URL{}存入ShopList:start_urls'.
                   format(page, response.url))
             db.add_value('ShopList:start_urls', response.url)
         else:
             print('该URL:{}出现异常,该网页内容为:{}'.format(response.url, html))
     elif len(html) < 700 and 'verify' not in response.url:
         print('遇到反爬了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         print('需要重新入库')
         db.add_value(self.redis_key, response.url)
     elif 'verify' in response.url:
         url = response.meta.get('redirect_urls')[0]
         print('出现问题,有验证码,url:{}'.format(response.url))
         print('需要重新入库,重定向之前的URL:{}'.format(url))
         db.add_value(self.redis_key, url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
Example #13
 def parse(self, response):
     db = RedisClient()
     detail_urls_content = response.text
     xpath_css = Selector(text=detail_urls_content)
     sp_urls = xpath_css.xpath('//*[@id="list-content"]/div[@class="list-item"]/@link').extract()
     if 'verify' in response.url:
         logger.warning('遇到验证码了,url放入待爬队列里面')
         urls = response.meta.get('redirect_urls')
         for url in urls:
             db.add_value('CitySpider:start_urls', url)
     elif len(sp_urls) == 0 or '请换个搜索词或试试筛选吧' in detail_urls_content:
         logger.warning('本url:{}-----没有搜索结果'.format(response.url))
         db.add_value('not_url:xzl', response.url)
     else:
         countys = xpath_css.xpath('//*[@id="city_list"]/dl/dd/a/@href').extract()
         for county in countys:
             db.add_value('CountySpider:start_urls', county)
         logger.info(countys)
         logger.info('一共{}个url已经入库完毕'.format(len(countys)))
Example #14
    def parse(self, response):
        db = RedisClient()
        detail_urls_content = response.text
        xpath_css = Selector(text=detail_urls_content)
        sp_urls = xpath_css.xpath(
            '//*[@id="list-content"]/div[@class="list-item"]/@link').extract()
        if 'verify' in response.url:
            logger.warning('遇到验证码了,url放入待爬队列里面')
            urls = response.meta.get('redirect_urls')
            for url in urls:
                db.add_value('TownSpider:start_urls', url)

        elif len(sp_urls) == 0 or '请换个搜索词或试试筛选吧' in detail_urls_content:
            logger.warning('本url:{}-----没有搜索结果'.format(response.url))
            db.add_value('not_url:xzl', response.url)
        else:
            towns = xpath_css.xpath(
                '//div[@class="sub-items"]/a/@href').extract()
            for town in towns[1:]:
                db.add_value('XzlList:start_urls', town)
            logger.info(towns[1:])
            logger.info('一共{}个url已经入库完毕'.format(len(towns[1:])))
Example #15
 def parse(self, response):
     db = RedisClient()
     detail_urls_content = response.text
     xpath_css = Selector(text=detail_urls_content)
     citys = xpath_css.xpath(
         '//div[@class="sel-city"]/div[@class="city-mod"]/dl/dd/a/@href'
     ).extract()
     urls_list = xpath_css.xpath(
         '//*[@id="container"]/div[2]/div[1]/div[@class="key-list"]/div/@data-link'
     ).extract()
     if 'verify' in response.url:
         urls = response.meta.get('redirect_urls')
         print('遇到验证码了,url:{}重新放入待爬队列里面'.format(urls))
         for url in urls:
             db.add_value('XinPanCity:start_urls', url)
     if len(urls_list) > 0:
         for city in citys:
             db.add_value('DetailList:start_urls', city)
     print(citys)
     print('一共有{}个城市url,入库成功'.format(len(citys)))
Example #16
 def parse(self, response):
     db = RedisClient()
     if 'verify' in response.url or r'jump' in response.url:
         logger.info('遇到验证码了,url放入待爬队列里面')
         urls = response.meta.get('redirect_urls')[0]
         db.add_value('DetailSpider:start_urls', urls)
     else:
         detail_urls_content = response.text
         if '您要查看的页面丢失了' not in response.text:
             try:
                 lat_lng = re.findall(r'lat: "(.*?)",.*?lng: "(.*?)"',
                                      detail_urls_content, re.S)
                 real_lat_lng = lat_lng[0]
                 xpath_css = Selector(text=detail_urls_content)
                 item = XzlspiderItem()
                 every_address = [
                     str(ad).replace('写字楼出租',
                                     '').replace('房产网',
                                                 '').replace('写字楼出售', '')
                     for ad in xpath_css.xpath(
                         '/html/body/div[2]/a/text()').extract()[1:3]
                 ]
                 new_address = self.gen_address(every_address)
                 item['province'], item['city'], item[
                     'county'] = new_address[0], new_address[
                         1], new_address[2]
                 item['url'] = response.url
                 pin = Pinyin()
                 item['sheetname'] = pin.get_pinyin(
                     item['province'], "").replace('sheng',
                                                   '').replace('shi', '')
                 item['total'] = xpath_css.xpath(
                     '//*[@id="j-triggerlayer"]/text()').extract_first(
                     ).strip()
                 house_msgs_l = xpath_css.xpath(
                     '//*[@id="fy_info"]/ul[@class="litem"]/li')
                 house_config = settings['NEWHOUSE']
                 for house_msg in house_msgs_l:
                     key1 = house_msg.xpath(
                         './span[1]/text()').extract_first()
                     key = house_config.get(key1)
                     item[key] = remove_tags(
                         str(house_msg.xpath('./span[2]').extract_first()))
                 house_msgs_r = xpath_css.xpath(
                     '//*[@id="fy_info"]/ul[@class="ritem"]/li')
                 for house_msg in house_msgs_r:
                     key1 = house_msg.xpath(
                         './span[1]/text()').extract_first()
                     print(key1)
                     key = house_config.get(key1)
                     if key == '预估月支出' and 'zu' in response.url:
                         continue
                     else:
                         item[key] = remove_tags(
                             str(
                                 house_msg.xpath(
                                     './span[2]').extract_first()))
                 house_resources_l = xpath_css.xpath(
                     '//div[@class="itemCon clearfix"]/ul[@class="litem"]/li'
                 )
                 for house_resource in house_resources_l:
                     key1 = house_resource.xpath(
                         './span[1]/text()').extract_first()
                     key = house_config.get(key1)
                     item[key] = remove_tags(
                         str(
                             house_resource.xpath(
                                 './span[2]').extract_first()))
                 house_resources_r = xpath_css.xpath(
                     '//div[@class="itemCon clearfix"]/ul[@class="ritem"]/li'
                 )
                 for house_resource in house_resources_r:
                     key1 = house_resource.xpath(
                         './span[1]/text()').extract_first()
                     key = house_config.get(key1)
                     if key == '得房率':
                         continue
                     else:
                         item[key] = remove_tags(
                             str(
                                 house_resource.xpath(
                                     './span[2]').extract_first()))
                 describes = xpath_css.xpath(
                     '//*[@id="xzl_desc"]/div').extract_first()
                 real_describe = remove_tags(str(describes))
                 item['describe'] = real_describe.replace('\xa0','').replace('\r','')\
                     .replace('\n','').replace('\t','').replace('\u200b','').replace('\u200c','').strip()
                 item['lat_lng'] = real_lat_lng
                 public_time = xpath_css.xpath(
                     '//*[@id="xzl_desc"]/h3/div/text()')[1].root
                 item['public_time'] = public_time.strip()
                 house_number = xpath_css.xpath(
                     '//*[@id="xzl_desc"]/h3/div/text()')[2].root
                 item['house_number'] = house_number.strip()
                 yield item
             except Exception as e:
                 logger.error('严重错误看日志:{}'.format(e.args))
                 if 'antispam' in response.url or 'jump' in response.url:
                     url = response.meta.get('redirect_urls')[0]
                 else:
                     url = response.url
                 logger.error(
                     '出现异常下载,可能IP有问题---------------:{}'.format(url))
                 logger.error('重新入库')
                 db.add_value('sp_detail:start_urls', url)
         else:
             logger.error('该URL已经失效:{}'.format(response.url))
             db.add_value('not_url:sp_detail', response.url)
Example #17
 def parse(self, response):
     html = response.text
     db = RedisClient()
     if 'verify' in response.url:
         print('出现验证码,该url重新入队列')
         url = response.meta.get('redirect_urls')[0]
         db.add_value(self.redis_key, url)
     elif len(html) < 700 and 'verify' not in response.url:
         print('遇到反爬了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         db.add_value(self.redis_key, response.url)
     elif 'verify' not in response.url and len(
             html) > 700 and r'https://www.abuyun.com' not in html:
         shops_url = response.xpath(
             '//*[@id="shop-all-list"]/ul/li/div[1]/a/@href').extract()
         if shops_url:
             for shop_url in shops_url:
                 db.add_value('Shop:start_urls', shop_url)
             print(shops_url)
             print('当前URL:{}'.format(response.url))
             print('一共有{}商铺详情URL入库'.format(len(shops_url)))
             next_page = response.xpath(
                 '//a[@class="next"]/@href').extract_first()
             if next_page:
                 try:
                     page = re.search(r'p(\d+)', next_page)
                     print('第{}页---网址为:{}'.format(page.group(1), next_page))
                 except Exception as e:
                     print('出错原因:{}'.format(e.args))
                 yield Request(url=next_page, callback=self.parse)
         else:
             if '没有找到符合条件的商户' in html:
                 print('该URL:{}已经无商铺列表可选'.format(response.url))
                 print('shops_url:{}'.format(shops_url))
             else:
                 print('IP可能出现异常,检查原因,内容为{}'.format(html))
                 print('URL:{}需要重新入库'.format(response.url))
                 db.add_value(self.redis_key, response.url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
         db.add_value('Error:start_urls', response.url)
Example #18
    def parse(self, response):
        db = RedisClient()
        item = {}
        html = response.text
        # print(html)
        # print(response.decode('utf-8'))
        if 'verify' in response.url:
            url = response.meta.get('redirect_urls')[0]
            print('出现问题,有验证码,url:{}'.format(response.url))
            print('需要重新入库,重定向之前的URL:{}'.format(url))
            db.add_value(self.redis_key, url)
        if not html:
            print('返回状态:{},返回内容:{}'.format(response.status, html))
            print('需要重新入库')
            db.add_value(self.redis_key, response.url)
        if ('页面无法访问' in html or '页面不存在' in html) and 'verify' not in response.url:
            print('失效URL:{}'.format(response.url))
            db.add_value('Not:start_urls', response.url)

        if html and 'window.shop_config=' in html and 'verify' not in response.url and r'https://www.abuyun.com' not in html and 'window.shop_config.shopId' not in html:
            mes = html.split('window.shop_config=')[-1]
            me = mes.split(r'</script> <script src')[0]
            result = self.str_to_dict(me.strip())
            shop_config = json.loads(result)
            item['shopId'] = shop_config.get('shopId')
            item['shopName'] = shop_config.get('shopName')
            item['address'] = shop_config.get('address')
            item['fullName'] = shop_config.get('fullName')
            item['shopGlat'] = shop_config.get('shopGlat')
            item['shopGlng'] = shop_config.get('shopGlng')
            item['reviewCount'] = response.xpath(
                '//*[@id="reviewCount"]/text()').extract_first()
            item['avgPrice'] = self.str_to_deciphering(
                response, '//*[@id="avgPriceTitle"]',
                '//*[@id="avgPriceTitle"]/span/@class')
            item['productScore'] = self.str_to_deciphering(
                response, '//*[@id="comment_score"]/span[1]',
                '//span[@class="item"]/span/@class')
            item['environmentScore'] = self.str_to_deciphering(
                response, '//*[@id="comment_score"]/span[2]',
                '//span[@class="item"]/span/@class')
            item['serviceScore'] = self.str_to_deciphering(
                response, '//*[@id="comment_score"]/span[3]',
                '//span[@class="item"]/span/@class')
            item['telephone'] = self.str_to_deciphering(
                response, '//*[@id="basic-info"]/p[1]', '//p/span/@class')
            rank = response.xpath(
                '//*[@id="basic-info"]/div[1]/span[1]/@class').extract_first()
            rank_handle = re.findall('\d+', rank)
            rankStars = ''.join(rank_handle) if rank_handle else 0
            item['rankStars'] = rankStars
            shop_hours = response.xpath(
                '//p[@class="info info-indent"]/span[2]/text()').extract()
            item['shopHours'] = shop_hours
            ad = re.findall(
                '<meta name="location" content="province=(.*?);city=(.*?);">',
                html, re.S)
            item['url'] = response.url
            item['province'] = ad[0][0]
            item['city'] = ad[0][1]
            now_time = datetime.datetime.now()
            item['now_time'] = str(now_time)[0:-7]
            from xpinyin import Pinyin
            pin = Pinyin()
            item['sheetName'] = pin.get_pinyin(item['province'], '')

        if html and 'window.shop_config.shopId' in html and 'verify' not in response.url and r'https://www.abuyun.com' not in html:
            item['shopId'] = re.findall('shop/(\d+)', response.url)[0]
            item['shopName'] = response.xpath(
                '//*[@id="basic-info"]/h1/text()').extract_first().strip()
            item['address'] = response.xpath(
                'string(//*[@id="basic-info"]/div[2])').extract_first().strip(
                )
            item['fullName'] = response.xpath(
                '//*[@id="basic-info"]/h1/text()').extract_first().strip()
            ll = re.findall(r'{(lng:.*?)}', html)[0]
            ll = re.split(r'[:,]', ll)
            item['shopGlng'] = float(ll[1])
            item['shopGlat'] = float(ll[-1])
            # item['shopGlat'] = response.xpath('//*[@id="basic-info"]/h1/text()').extract_first().strip()
            # item['shopGlng'] = response.xpath('//*[@id="basic-info"]/h1/text()').extract_first().strip()
            item['reviewCount'] = response.xpath(
                '//*[@id="basic-info"]/div[1]/span[2]/text()').extract_first()
            item['avgPrice'] = response.xpath(
                '//*[@id="basic-info"]/div[1]/span[3]/text()').extract_first()
            item['productScore'] = response.xpath(
                '//*[@id="basic-info"]/div[1]/span[4]/text()').extract_first()
            item['environmentScore'] = response.xpath(
                '//*[@id="basic-info"]/div[1]/span[5]/text()').extract_first()
            item['serviceScore'] = response.xpath(
                '//*[@id="basic-info"]/div[1]/span[6]/text()').extract_first()
            item['telephone'] = response.xpath(
                '//*[@id="basic-info"]/p/span[2]/text()').extract_first()
            rank = response.xpath(
                '//*[@id="basic-info"]/div[1]/span[1]/@class').extract_first()
            rank_handle = re.findall('\d+', rank)
            rankStars = ''.join(rank_handle) if rank_handle else 0
            shop_hours = response.xpath(
                '//p[@class="info info-indent"]/span[2]/text()').extract()
            item['shopHours'] = shop_hours if shop_hours else ''
            item['rankStars'] = rankStars
            item['url'] = response.url
            ad = re.findall(
                '<meta name="location" content="province=(.*?);city=(.*?);">',
                html, re.S)[0]
            item['province'] = ad[0]
            item['city'] = ad[1]
            from xpinyin import Pinyin
            pin = Pinyin()
            item['sheetName'] = pin.get_pinyin(item['province'], '')
            now_time = datetime.datetime.now()
            item['now_time'] = str(now_time)[0:-7]
        if len(item) > 0:
            # print('这是结果:{}'.format(item))
            # print(item)
            yield item
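
This parser depends on a str_to_dict() helper that turns the JavaScript object literal found after 'window.shop_config=' into a string json.loads() can parse. The original helper is not shown; a rough sketch, assuming the only repair needed is quoting bare keys and dropping a trailing semicolon, might be:

import re


def str_to_dict(self, js_obj):
    """Hypothetical sketch (a method on the spider class): make the shop_config JS literal JSON-parseable."""
    js_obj = js_obj.strip().rstrip(';')
    # quote bare object keys: {shopId:123, ...} -> {"shopId":123, ...}
    return re.sub(r'([{,]\s*)([A-Za-z_]\w*)\s*:', r'\1"\2":', js_obj)
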
Example #19
    def parse(self, response):
        if 'verify' in response.url:
            db = RedisClient()
            urls = response.meta.get('redirect_urls')[0]
            print('遇到验证码了,url:{}重新放入待爬队列里面'.format(urls))
            db.add_value('XinPan_spiders:start_urls', urls)
        else:
            item = {}
            start_url_content = response.text
            detail_urls_content = start_url_content
            xpath_css = Selector(text=detail_urls_content)
            every_address = [
                str(ad).replace('楼盘', '') for ad in xpath_css.xpath(
                    '//*[@id="header"]/div[2]/div[1]/a/text()').extract()[1:3]
            ]
            if len(every_address) > 0:
                new_address = self.gen_address(every_address)
                item['province'], item['city'], item['county'] = new_address[
                    0], new_address[1], new_address[2]
                item['url'] = response.url
                pin = Pinyin()
                item['sheetname'] = pin.get_pinyin(item['province'],
                                                   "").replace('sheng',
                                                               '').replace(
                                                                   'shi', '')
                house_msgs_l = xpath_css.xpath(
                    '//*[@id="container"]/div[1]/div[1]/div/div[2]/ul/li')[:-2]
                new_house = settings['NEWHOUSE']
                print(len(house_msgs_l), type(house_msgs_l))
                for house_msg in house_msgs_l:
                    key1 = house_msg.xpath('./div[1]/text()').extract_first()
                    key2 = new_house.get(key1)
                    if key2 is None:
                        print(key1)

                    if 'property_features' == key2:
                        item[key2] = [
                            i for i in str(
                                remove_tags(
                                    str(
                                        house_msg.xpath('./div[2]').
                                        extract_first()).replace(
                                            '\n', ''))).strip().split(' ') if i
                        ]
                    else:
                        item[key2] = remove_tags(
                            str(house_msg.xpath('./div[2]').extract_first()).
                            replace('\n', '').replace(' ', '').replace(
                                r'[价格走势]', '').replace(r'[查看地图]', '').replace(
                                    r'[房贷计算器]',
                                    '').replace(r'[查看详情]',
                                                '').replace(r'[查看详情]', ''))
                print(item)

            else:
                print('这是个严重的错误')
                print(response.url)
                db = RedisClient()
                urls = response.meta.get('redirect_urls')[0:1]
                print(urls)
                print('遇到验证码了,url:{}重新放入待爬队列里面'.format(urls))
                for url in urls:
                    db.add_value('XinPan_spider:start_urls', url)
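
The NEWHOUSE dictionary read from settings maps the Chinese label text scraped from the page to item field names; 'property_features' is the only mapping the code above references directly. A hypothetical fragment (the other labels and field names are assumptions):

# settings.py -- hypothetical fragment of the label-to-field mapping
NEWHOUSE = {
    '楼盘特点': 'property_features',  # the only key referenced explicitly above
    '物业费': 'property_fee',         # assumed example entry
    '开盘时间': 'opening_date',       # assumed example entry
}
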
Example #20
    def parse(self, response):
        # self.CutChina(response.url)
        item = GaodepoiItem()
        db = RedisClient()
        result = json.loads(response.text)
        try:
            sum = int(result['count'])
        except:
            sum = 0
        maxPage = math.ceil(sum / 20)
        now_page = re.findall('page=(\d+)', response.url)[0]
        next_page = int(now_page) + 1
        if result['status'] == '1':
            item['pois'] = result['pois']
            if sum > 840:
                logger.info('数量大于850,需要再次切分')
                self.CutChina(response.url)
            elif len(item['pois']) > 0:
                yield item
                if len(item['pois']) == 20:
                    # for i in range(now_page+1,maxPage+1):
                    url = (response.url).replace('page={}'.format(now_page),
                                                 'page={}'.format(next_page))
                    logger.info('要访问第{}页了一共有{}页'.format(next_page, maxPage))
                    logger.info('一共有{}条数据'.format(sum))
                    logger.info('下一页URL:{}'.format(url))
                    # db.add_value(self.redis_key,url)
                    yield Request(url=url,
                                  callback=self.parse,
                                  dont_filter=True)
                if len(item['pois']) > 0 and len(
                        item['pois']) < 20 and maxPage - int(now_page) != 0:
                    url = (response.url).replace(
                        'page={}'.format(now_page),
                        'page={}'.format(next_page + 1))
                    logger.info('要访问第{}页了一共有{}页'.format(
                        next_page + 1, maxPage))
                    yield Request(url=url,
                                  callback=self.parse,
                                  dont_filter=True)
            elif maxPage - int(now_page) < 0 and len(item['pois']) == 0:
                if maxPage == 0 and int(now_page) == 1:
                    logger.info('第一页就没有结果没有结果的URL:{}'.format(response.url))
                elif int(now_page) - maxPage == 1:
                    logger.info('当前第{}页了一共有{}'.format(now_page, maxPage + 1))
                    logger.info('没有结果的URL:{}'.format(response.url))
                else:
                    logger.warning('当前URL:{},还没取取到最后一页就为0了,重新如队列'.format(
                        response.url))
                    logger.info('把第{}页了URL存入Redis,一共有{}页'.format(
                        now_page, maxPage))
                    db.add_value(self.redis_key, response.url)
            elif sum == 0 and int(now_page) == 1 and len(item['pois']) == 0:
                logger.info('当前第{}页了一共有{}'.format(now_page, maxPage + 1))
                logger.info('没有结果的URL:{}'.format(response.url))
            elif sum != 0 and maxPage - int(now_page) == -1 and len(
                    item['pois']) == 0:
                logger.info('当前第{}页了一共有{}'.format(now_page, maxPage))
                logger.info('没有结果的URL:{}'.format(response.url))

            else:
                logger.warning('1-出现严重的异常的URL:{},内容为:{}'.format(
                    response.url, response.text))
                db.add_value('Exception1:start_urls', response.url)
        elif result['status'] == '0':
            logger.info('请求失败,重新入队列')
            db.add_value(self.redis_key, response.url)
        else:
            logger.warning('2-出现严重的异常的URL:{},内容为:{}'.format(
                response.url, response.text))
            db.add_value('Exception2:start_urls', response.url)
Example #21
 def CutChina(self, url):
     url = parse.unquote(url)
     db = RedisClient()
     x = []
     a = re.findall('polygon=(.*?)&', url)[0]
     c = a.split('|')
     for i in c:
         x.extend(i.split(','))
     logger.info('从URL:里面查找出两个点的坐标')
     rect = Rect(xmin=float(x[0]),
                 ymin=float(x[1]),
                 xmax=float(x[2]),
                 ymax=float(x[3]))
     # 规则:经度和纬度用","分割,经度在前,纬度在后,坐标对用"|"分割。经纬度小数点后不得超过6位。
     polygon = "{:.6f},{:.6f}|{:.6f},{:.6f}".format(rect.xmin, rect.ymin,
                                                    rect.xmax, rect.ymax)
     middleX = (rect.xmin + rect.xmax) / 2
     middleY = (rect.ymin + rect.ymax) / 2
     lng_lat = []
     rect1 = Rect(xmin=rect.xmin,
                  ymin=rect.ymin,
                  xmax=middleX,
                  ymax=middleY)
     polygon1 = "{:.6f},{:.6f}|{:.6f},{:.6f}".format(
         rect1.xmin, rect1.ymin, rect1.xmax, rect1.ymax)
     lng_lat.append(polygon1)
     rect2 = Rect(xmin=middleX,
                  ymin=rect.ymin,
                  xmax=rect.xmax,
                  ymax=middleY)
     polygon2 = "{:.6f},{:.6f}|{:.6f},{:.6f}".format(
         rect2.xmin, rect2.ymin, rect2.xmax, rect2.ymax)
     lng_lat.append(polygon2)
     rect3 = Rect(xmin=rect.xmin,
                  ymin=middleY,
                  xmax=middleX,
                  ymax=rect.ymax)
     polygon3 = "{:.6f},{:.6f}|{:.6f},{:.6f}".format(
         rect3.xmin, rect3.ymin, rect3.xmax, rect3.ymax)
     lng_lat.append(polygon3)
     rect4 = Rect(xmin=middleX,
                  ymin=middleY,
                  xmax=rect.xmax,
                  ymax=rect.ymax)
     polygon4 = "{:.6f},{:.6f}|{:.6f},{:.6f}".format(
         rect4.xmin, rect4.ymin, rect4.xmax, rect4.ymax)
     lng_lat.append(polygon4)
     count = 0
     for i in lng_lat:
         for type in self.df1['NEW_TYPE']:
             params = {
                 'polygon': i,
                 'types': type,
                 'page': '1',
                 'offset': 20,
                 'extensions': 'all'
             }
             real_url = self.base_url + urlencode(params)
             db.add_value(self.redis_key, real_url)
             count += 1
     logger.info('切分的矩形重新放入redis中,一共{}个URL'.format(count))
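
CutChina splits a query polygon into four quadrants when a Gaode request returns too many results. It assumes a Rect container exposing xmin/ymin/xmax/ymax; a minimal sketch of that type is a namedtuple:

from collections import namedtuple

# sketch of the assumed Rect type: lower-left (xmin, ymin) and upper-right (xmax, ymax) corners
Rect = namedtuple('Rect', ['xmin', 'ymin', 'xmax', 'ymax'])
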
Example #22
 def parse(self, response):
     html = response.text
     db = RedisClient()
     if 'verify' in response.url or 'params' in response.url:
         print('遇到验证码了,url放入待爬队列里面')
         url = response.meta.get('redirect_urls')[0]
         db.add_value(self.redis_key, url)
     elif r'abuyun.com' not in html and len(html) > 600 and '没有找到符合' not in html:
         count = response.xpath('//span[@class="tit"]/em[2]/text()').extract_first()
         count = int(count)
         print('一共有{}个'.format(count))
         if count > 1500 and count < 3000:
             print('当前小区个数大于100页,需要存入每一页的URL地址')
             for i in range(1, int(count / 30) + 1):
                 db.add_value('dayu:start_urls', response.url + r'/p' + str(i))
         elif count > 3000:
             print('此URL:{}为特殊URL,需要特殊处理'.format(response.url))
             db.add_value('special:start_urls',response.url)
         elif count > 0 and count < 1500:
             print('采集详情页URL信息')
             houses = response.xpath('//*[@id="list-content"]/div[@class="li-itemmod"]/@link').extract()[1:]
             for house in houses:
                 db.add_value('DetailHouse:start_urls', house+'?from=Filter_1&hfilter=filterlist')
             print(houses)
             if len(houses)==0:
                 print('这是个不正常的URL:{}'.format(response.url))
                 db.add_value('NotHouseList:start_urls',response.url)
             print('该URL:{}一共有{}个房屋'.format(response.url,len(houses)))
             next_page = response.xpath('//a[@class="aNxt"]/@href').extract_first()
             if next_page:
                 try:
                     page = re.search(r'p(\d+)', next_page)
                     print('第{}页---网址为:{}'.format(page.group(1), next_page))
                 except Exception as e:
                     print('出错原因:{}'.format(e.args))
                 yield Request(url=next_page, callback=self.parse)
         else:
             print('当前URL:{}'.format(response.url))
             print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
     elif r'abuyun.com' in html:
         print('IP出问题了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         db.add_value(self.redis_key, response.url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
Example #23
 def parse(self, response):
     html = response.text
     db = RedisClient()
     if 'verify' in response.url or 'params' in response.url:
         print('遇到验证码了,url放入待爬队列里面')
         url = response.meta.get('redirect_urls')[0]
         db.add_value(self.redis_key, url)
     elif r'abuyun.com' not in html and len(html) > 600:
         count = response.xpath(
             '//span[@class="tit"]/em[2]/text()').extract_first()
         print('一共有{}个'.format(count))
         if int(count) < 1500 and int(count) != 0:
             print('直接把URL存到list列表中')
             db.add_value('HouseList:start_urls', response.url)
         elif int(count) == 0:
             print('该URL:{}没有小区信息'.format(response.url))
             db.add_value('NotAreaList:start_urls', response.url)
         else:
             print('继续细分到区')
             areas = response.xpath(
                 '//div[@class="div-border items-list"]/div[1]/span[2]/a/@href'
             ).extract()[1:]
             for area in areas:
                 db.add_value('AddressList:start_urls', area)
             if len(areas) == 0:
                 print('这是个不正常的URL:{}'.format(response.url))
                 db.add_value('NotAreaList:start_urls', response.url)
             print(areas)
             print('该URL:{}一共有{}个地区'.format(response.url, len(areas)))
     elif r'abuyun.com' in html:
         print('IP出问题了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         db.add_value(self.redis_key, response.url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
Example #24
    def process_request(self, request, spider):
        # ua = UserAgent()
        # agent = ua.chrome
        # f = faker.Faker(locale='zh_cn')
        # agent = f.user_agent()
        agent = random.choice(useragent)
        same = get_lxsdk_cuid(agent)
        cook = '_lxsdk_cuid={}; _lxsdk={}; _hc.v={}; _lxsdk_s={}'.format(
            same, same, get_hc(), get_lxsdk_s())
        # cook = '_lxsdk_cuid={}; _lxsdk={}; _hc.v={}; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s={}'.format(same,same,get_hc(),get_lxsdk_s())
        cook1 = 'cy=1236; cityid=1236; cye=huixian; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=165c6b8e911c8-0383cc5ec3114e-37664109-144000-165c6b8e91289; _lxsdk=165c6b8e911c8-0383cc5ec3114e-37664109-144000-165c6b8e91289; _hc.v=0c84e8b5-c945-5c86-bb54-94e4936012e5.1536637332; s_ViewType=10; cye=beijing; _lxsdk_s=165cb7d7e23-268-18-f1%7C%7C87'
        # print(cook)
        headers = {
            'Host':'www.dianping.com',
            'Upgrade-Insecure-Requests':'1',
            'Cookie':cook,
            'User-Agent':agent ,
            # 'Proxy-Connection':'keep-alive'
        }
        proxyHost = "http-dyn.abuyun.com"
        proxyPort = "9020"
        # 代理隧道验证信息
        proxyUser = "******"
        proxyPass = "******"
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        proxies = {
            # "http": proxyMeta,
            "https": proxyMeta,
        }
        proxiess = {
            "https": "http://140.255.6.45:5649",
            # "https": "http://118.79.54.90:6996",
            # "https": "http://117.42.201.221:6214",

        }

        import requests
        #s = requests.Session()
        #base = 'https://www.dianping.com/'
        try:
            # start_url = requests.get(base, headers=headers, proxies=proxies, timeout=15)
            # print(start_url.text)
            res = requests.get(request.url, headers=headers, proxies=proxies, timeout=15)
            if res.status_code != 200 or len(res.text) < 560:
                if res.status_code == 403 or res.status_code == 404:
                    content = '页面无法访问'
                else:
                    content = res.text
                print('该URL:{},状态码:{},内容为:{}'.format(request.url, res.status_code, content))
                key = getattr(spider, 'redis_key')
                db = RedisClient()
                print('该URL:{}需要重新入队列'.format(request.url))
                db.add_value(key, request.url)
                raise IgnoreRequest
            else:
                from scrapy.http.response.html import HtmlResponse
                rs = res.content.decode('utf-8')
                # print(rs)
                response = HtmlResponse(url=request.url, body=res.content.decode('utf-8'), encoding="utf-8", request=request)
                return response
        except Exception as e:
            print('出现错误,原因{}'.format(e.args))
            key = getattr(spider, 'redis_key')
            db = RedisClient()
            print('该URL:{}需要重新入队列'.format(request.url))
            db.add_value(key, request.url)
            raise IgnoreRequest
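
The cookie helpers get_lxsdk_cuid(), get_hc() and get_lxsdk_s() used above are not shown, and the real token formats are site-specific. The sketch below only illustrates the idea of generating per-request token strings; the formats are assumptions, not Dianping's actual cookie scheme:

import random
import string
import uuid


def get_hc():
    # hypothetical: a UUID-style token for the _hc.v cookie
    return str(uuid.uuid4())


def get_lxsdk_cuid(agent):
    # hypothetical: a hex token derived from the chosen user agent
    return uuid.uuid5(uuid.NAMESPACE_DNS, agent).hex


def get_lxsdk_s():
    # hypothetical: a short random session fragment
    return ''.join(random.choices(string.hexdigits.lower(), k=16))
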
Example #25
 def parse(self, response):
     db = RedisClient()
     item = CommunityItem()
     html = response.text
     if 'verify' in response.url or 'params' in response.url:
         print('遇到验证码了,url放入待爬队列里面')
         url = response.meta.get('redirect_urls')[0]
         db.add_value(self.redis_key, url)
     elif r'abuyun.com' not in html and len(
             html) > 600 and '您要查看的页面丢失了' not in html:
         from scrapy.conf import settings
         ho = settings['HOUSE']
         try:
             price = re.findall('"comm_midprice":"(.*?)","area_midprice"',
                                html, re.S)[0]
         except:
             price = re.findall('"comm_midprice":(.*?),"area_midprice"',
                                html, re.S)[0]
         print(price)
         # print(price)
         item['price'] = price
         try:
             l2 = re.findall('lat : "(.*?)",.*?lng : "(.*?)"', html, re.S)
             lat_lng = [float(l2[0][0]), float(l2[0][1])]
         except:
             lat_lng = [0, 0]
         # print(lat_lng)
         item['lat_lng'] = lat_lng
         detali_dt = response.xpath('//*[@id="basic-infos-box"]/dl/dt')
         address = response.xpath(
             '//span[@class="sub-hd"]/text()').extract_first()
         all_add = response.xpath(
             '//div[@class="p_1180 p_crumbs"]/a/text()').extract()
         city = all_add[1].replace('小区', '')
         county = all_add[2]
         community = all_add[3]
         community_name = all_add[4]
         pin = Pinyin()
         province = self.gen_address(city)
         sheet_name = pin.get_pinyin(province,
                                     "").replace('sheng',
                                                 '').replace('shi', '')
         item['sheet_name'] = sheet_name
         print(province, city, county, community, community_name)
         item['province'] = province
         item['city'] = city
         item['county'] = county
         item['community'] = community
         item['community_name'] = community_name
         # print(address)
         item['address'] = address
         dt = []
         for i in detali_dt:
             key1 = i.xpath('./text()').extract_first().replace('\xa0',
                                                                '').replace(
                                                                    ':', '')
             key = ho.get(key1)
             dt.append(key)
         detali_dd = response.xpath('//*[@id="basic-infos-box"]/dl/dd')
         dd = []
         for i in detali_dd:
             dd.append(i.xpath('./text()').extract_first())
         house_mes = dict(zip(dt, dd))
         item.update(house_mes)
         item['url'] = response.url
         print('这是结果:{}'.format(item))
         yield item
     elif r'abuyun.com' in html:
         print('IP出问题了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         db.add_value(self.redis_key, response.url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))
Example #26
 def parse(self, response):
     html = response.text
     db = RedisClient()
     if 'verify' not in response.url and len(
             html) > 700 and r'https://www.abuyun.com' not in html:
         is_50 = response.xpath(
             '/html/body/div[2]/div[3]/div[1]/div[2]/a[10]/text()'
         ).extract_first()
         try:
             page = int(is_50) if is_50 else int(1)
         except Exception as e:
             print('查找页数失败,失败原因:{},可能原因{}'.format(e.args, is_50))
             page = 1
         if int(page) == 50:
             print('该地区已经满50页需要在划分')
             areas = response.xpath(
                 '//*[@id="region-nav-sub"]/a/@href').extract()
             if areas:
                 for area in areas:
                     db.add_value('village:start_urls', area)
                 print(areas)
                 print('一共有{}个详细地点入库'.format(len(areas)))
             else:
                 print('该URL:{}出现异常,该网页内容为:{}'.format(response.url, html))
                 db.add_value('ExGovSpider:start_urls', response.url)
         else:
             print('该地区页数不满50页,一共有{}页,直接将本页存入本URL:{}'.format(
                 page, response.url))
             db.add_value('ShopList:start_urls', response.url)
     elif len(html) < 700 and 'verify' not in response.url:
         print('遇到反爬了,该URL:{}需要重新入队列'.format(response.url))
         print('返回状态:{},返回内容:{}'.format(response.status, html))
         print('需要重新入库')
         db.add_value(self.redis_key, response.url)
     elif 'verify' in response.url:
         url = response.meta.get('redirect_urls')[0]
         print('出现问题,有验证码,url:{}'.format(response.url))
         print('需要重新入库,重定向之前的URL:{}'.format(url))
         db.add_value(self.redis_key, url)
     else:
         print('当前URL:{}'.format(response.url))
         print('这是个严重错误,请查看详情:{}   该网页内容:{}'.format(response.url, html))