def parse(self, response):
    db = RedisClient()
    xpath_css = Selector(text=response.text)
    if 'verify' in response.url:
        # Captcha page: push the pre-redirect URLs back onto the waiting queue and stop.
        urls = response.meta.get('redirect_urls')
        print('Hit a captcha, re-queueing url(s): {}'.format(urls))
        for url in urls:
            db.add_value('DetailList:start_urls', url)
        return
    sp_urls = [
        str(sp_url).replace(r'loupan/', r'loupan/canshu-')
        for sp_url in xpath_css.xpath(
            '//*[@id="container"]/div[2]/div[1]/div[@class="key-list"]/div/@data-link'
        ).extract()
    ]
    if sp_urls:
        for sp_url in sp_urls:
            db.add_value('XinPan_spider:start_urls', sp_url)
        print(sp_urls)
        print('{} property URLs stored successfully'.format(len(sp_urls)))
    next_page = xpath_css.xpath('//a[@class="next-page next-link"]/@href').extract_first()
    if next_page:
        try:
            page = re.search(r'p(\d+)', next_page)
            print('Page {} --- url: {}'.format(page.group(1), next_page))
        except Exception as e:
            print('Error: {}'.format(e.args))
        yield Request(url=next_page, callback=self.parse)
def process_response(self, request, response, spider):
    logger.info('Entered process_response')
    logger.info(request.url)
    db = RedisClient()
    if response.status == 404:
        logger.warning('URL {} is dead; storing it in the dead-link set for review'.format(response.url))
        db.add_value('xp:not_url', response.url)
        raise IgnoreRequest
    if response.status != 200 and response.status != 404:
        logger.warning('----')
        logger.warning('Something went wrong, status code: {}'.format(response.status))
        try:
            value_url = request.meta.get('redirect_urls')[0]
        except (TypeError, IndexError):
            value_url = request.url
        if value_url and 'verify' not in value_url:
            logger.warning('Probably redirected; re-queueing url: {}'.format(value_url))
            key = getattr(spider, 'redis_key')
            logger.warning('Spider redis key: {}'.format(key))
            db.add_value(key, value_url)
        else:
            logger.error('Serious error, request: {}, response: {}'.format(request, response))
    if 'captcha' in request.url:
        logger.info('URL {} contains a captcha and should be re-queued'.format(response.url))
    return response
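# process_response above belongs to a Scrapy downloader middleware. For context, a
# middleware like this would be enabled in settings.py roughly as follows; the module
# path and priority value here are illustrative assumptions, not taken from this project:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CaptchaRetryDownloaderMiddleware': 543,
}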
def parse(self, response):
    db = RedisClient()
    url_lists = response.xpath(
        '//div[@class="letter_city"]/ul/li/div[@class="city_list"]/a/@href').extract()
    end = r'/community/?from=navigation'
    urls = []
    for url_list in url_lists:
        urls.append(url_list + end)
        db.add_value('AreaList:start_urls', url_list + end)
    print(urls)
    print('{} cities stored'.format(len(set(urls))))
def process_exception(self, request, exception, spider):
    print('Exception: {}'.format(exception))
    try:
        value_url = request.meta.get('redirect_urls')[0]
    except (TypeError, IndexError):
        value_url = request.url
    print('Proxy IP unusable; re-queueing url: {}'.format(value_url))
    key = getattr(spider, 'redis_key')
    print('Spider redis key: {}'.format(key))
    db = RedisClient()
    db.add_value(key, value_url)
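# RedisClient is used throughout these callbacks but is not defined in this section.
# A minimal sketch of what such a helper might look like, assuming the queue keys are
# Redis sets consumed by scrapy-redis; the class layout, connection settings and the
# use of sadd are assumptions, not the original implementation:
import redis

class RedisClient:
    """Thin wrapper used to (re-)queue start URLs under a given Redis key."""

    def __init__(self, host='localhost', port=6379, db=0):
        self.conn = redis.StrictRedis(host=host, port=port, db=db, decode_responses=True)

    def add_value(self, key, value):
        # A set avoids pushing the same URL twice; scrapy-redis can read
        # start URLs from either a list or a set.
        self.conn.sadd(key, value)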
def parse(self, response):
    city_list = response.xpath('//div[@class="findHeight"]/a/@href').extract()
    base_url = 'http://www.dianping.com'
    end_url = '/ch20/g187'
    citys = []
    db = RedisClient()
    for city in city_list:
        city_url = base_url + city.replace(r'//www.dianping.com', '') + end_url
        citys.append(city_url)
        db.add_value('TradeArea1:start_urls', city_url)
    print(citys)
    print('{} city URLs stored successfully'.format(len(citys)))
def parse(self, response):
    html = response.text
    db = RedisClient()
    if 'verify' not in response.url and len(html) > 400 and r'https://www.abuyun.com' not in html:
        trade_areas = response.xpath('//*[@id="bussi-nav"]/a/@href').extract()
        if trade_areas:
            for trade_area in trade_areas:
                db.add_value('village:start_urls', trade_area)
            print(trade_areas)
            print('Splitting by trade area, current URL: {}'.format(response.url))
            print('{} trade areas stored'.format(len(trade_areas)))
        else:
            adms = response.xpath('//*[@id="region-nav"]/a/@href').extract()
            for adm in adms:
                db.add_value('gov:start_urls', adm)
            print('No trade-area tabs; splitting by administrative district instead')
            print('Splitting by district, current URL: {}'.format(response.url))
            print(adms)
            print('{} districts stored'.format(len(adms)))
    elif len(html) < 400 and 'verify' not in response.url:
        print('Anti-scraping triggered; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    elif 'verify' in response.url:
        url = response.meta.get('redirect_urls')[0]
        print('Captcha encountered, url: {}'.format(response.url))
        print('Re-queueing the pre-redirect URL: {}'.format(url))
        db.add_value(self.redis_key, url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
def parse(self, response):
    html = response.text
    db = RedisClient()
    if 'verify' not in response.url and len(html) > 550 and r'https://www.abuyun.com' not in html:
        trade_areas = response.xpath('//*[@id="bussi-nav-sub"]/a/@href').extract()
        if trade_areas:
            for trade_area in trade_areas[1:]:
                db.add_value('ShopList:start_urls', trade_area)
            print(trade_areas)
            print('Splitting by trade-area spot, current URL: {}'.format(response.url))
            print('{} trade-area spots stored'.format(len(trade_areas[1:])))
        else:
            print('Splitting by trade area, current URL: {}'.format(response.url))
            print('This trade area has no sub-spots; storing its own URL: {}'.format(response.url))
            db.add_value('ShopList:start_urls', response.url)
    elif len(html) < 550 and 'verify' not in response.url:
        print('Anti-scraping triggered; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    elif 'verify' in response.url:
        url = response.meta.get('redirect_urls')[0]
        print('Captcha encountered, url: {}'.format(response.url))
        print('Re-queueing the pre-redirect URL: {}'.format(url))
        db.add_value(self.redis_key, url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
def process_exception(self, request, exception, spider):
    # Called when a download handler or a process_request()
    # (from another downloader middleware) raises an exception.
    # Must either:
    # - return None: continue processing this exception
    # - return a Response object: stops the process_exception() chain
    # - return a Request object: stops the process_exception() chain
    logger.warning('Exception: {}'.format(exception))
    try:
        value_url = request.meta.get('redirect_urls')[0]
    except (TypeError, IndexError):
        value_url = request.url
    logger.info('Proxy IP unusable; re-queueing url: {}'.format(value_url))
    key = getattr(spider, 'redis_key')
    logger.info('Spider redis key: {}'.format(key))
    db = RedisClient()
    db.add_value(key, value_url)
def parse(self, response):
    html = response.text
    db = RedisClient()
    if 'verify' not in response.url and len(html) > 700 and r'https://www.abuyun.com' not in html:
        is_50 = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div[2]/a[10]/text()').extract_first()
        try:
            page = int(is_50) if is_50 else 1
        except Exception as e:
            print('Failed to read the page count, reason: {}, raw value: {}'.format(e.args, is_50))
            page = 1
        if int(page) == 50:
            print('This area already has the maximum of 50 pages and needs to be subdivided')
            areas = response.xpath('//*[@id="region-nav-sub"]/a/@href').extract()
            if areas:
                for area in areas:
                    db.add_value('village:start_urls', area)
                print(areas)
                print('{} sub-locations stored'.format(len(areas)))
            else:
                print('URL {} is abnormal, page body: {}'.format(response.url, html))
                db.add_value('ExGovSpider:start_urls', response.url)
        else:
            print('Fewer than 50 pages ({} in total); storing this URL directly: {}'.format(
                page, response.url))
            db.add_value('ShopList:start_urls', response.url)
    elif len(html) < 700 and 'verify' not in response.url:
        print('Anti-scraping triggered; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    elif 'verify' in response.url:
        url = response.meta.get('redirect_urls')[0]
        print('Captcha encountered, url: {}'.format(response.url))
        print('Re-queueing the pre-redirect URL: {}'.format(url))
        db.add_value(self.redis_key, url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
def parse(self, response):
    html = response.text
    db = RedisClient()
    if 'verify' in response.url:
        print('Captcha encountered; re-queueing this url')
        url = response.meta.get('redirect_urls')[0]
        db.add_value(self.redis_key, url)
    elif len(html) < 700 and 'verify' not in response.url:
        print('Anti-scraping triggered; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    elif 'verify' not in response.url and len(html) > 700 and r'https://www.abuyun.com' not in html:
        shops_url = response.xpath('//*[@id="shop-all-list"]/ul/li/div[1]/a/@href').extract()
        if shops_url:
            for shop_url in shops_url:
                db.add_value('Shop:start_urls', shop_url)
            print(shops_url)
            print('Current URL: {}'.format(response.url))
            print('{} shop detail URLs stored'.format(len(shops_url)))
            next_page = response.xpath('//a[@class="next"]/@href').extract_first()
            if next_page:
                try:
                    page = re.search(r'p(\d+)', next_page)
                    print('Page {} --- url: {}'.format(page.group(1), next_page))
                except Exception as e:
                    print('Error: {}'.format(e.args))
                yield Request(url=next_page, callback=self.parse)
        else:
            if '没有找到符合条件的商户' in html:
                print('URL {} has no more shops to list'.format(response.url))
                print('shops_url: {}'.format(shops_url))
            else:
                print('The proxy IP may be misbehaving; page body: {}'.format(html))
                print('Re-queueing URL: {}'.format(response.url))
                db.add_value(self.redis_key, response.url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
        db.add_value('Error:start_urls', response.url)
def parse(self, response):
    db = RedisClient()
    xpath_css = Selector(text=response.text)
    citys = xpath_css.xpath(
        '//div[@class="sel-city"]/div[@class="city-mod"]/dl/dd/a/@href').extract()
    urls_list = xpath_css.xpath(
        '//*[@id="container"]/div[2]/div[1]/div[@class="key-list"]/div/@data-link').extract()
    if 'verify' in response.url:
        urls = response.meta.get('redirect_urls')
        print('Hit a captcha, re-queueing url(s): {}'.format(urls))
        for url in urls:
            db.add_value('XinPanCity:start_urls', url)
    # Only store the city links when the page actually lists properties.
    if len(urls_list) > 0:
        for city in citys:
            db.add_value('DetailList:start_urls', city)
        print(citys)
        print('{} city URLs stored successfully'.format(len(citys)))
def parse(self, response):
    db = RedisClient()
    xpath_css = Selector(text=response.text)
    sp_urls = xpath_css.xpath('//*[@id="list-content"]/div[@class="list-item"]/@link').extract()
    if 'verify' in response.url or 'params' in response.url:
        logger.warning('Hit a captcha; re-queueing the urls')
        urls = response.meta.get('redirect_urls')
        for url in urls:
            db.add_value('XzlList:start_urls', url)
    elif len(sp_urls) == 0:
        logger.warning('URL {} ----- no search results'.format(response.url))
        db.add_value('not_url:xzl', response.url)
    else:
        for sp_url in sp_urls:
            db.add_value('DetailSpider:start_urls', sp_url)
        logger.info(sp_urls)
        logger.info('{} URLs stored'.format(len(sp_urls)))
        next_page = xpath_css.xpath('//a[@class="aNxt"]/@href').extract_first()
        if next_page:
            try:
                page = re.search(r'p(\d+)', next_page)
                logger.info('Page {} --- url: {}'.format(page.group(1), next_page))
            except Exception as e:
                logger.warning('Error: {}'.format(e.args))
            yield Request(url=next_page, callback=self.parse)
def process_response(self, request, response, spider):
    db = RedisClient()
    if response.status in [404, 403]:
        if response.status == 404:
            print('URL {} is dead, status code: {}'.format(response.url, response.status))
            db.add_value('not_url:Shop', response.url)
        else:
            print('The proxy IP has been banned; re-queueing url: {}, status code: {}'.format(
                response.url, response.status))
            key = getattr(spider, 'redis_key')
            db.add_value(key, response.url)
        raise IgnoreRequest
    elif response.status != 200 and response.status not in [404, 403]:
        print('URL: {}, status code: {}'.format(response.url, response.status))
        try:
            value_url = request.meta.get('redirect_urls')[0]
        except (TypeError, IndexError):
            value_url = request.url
        if value_url and 'verify' not in value_url:
            print('Probably redirected; re-queueing url: {}'.format(value_url))
            key = getattr(spider, 'redis_key')
            print('Spider redis key: {}'.format(key))
            db.add_value(key, value_url)
        else:
            print('Serious error, request: {}, response: {}'.format(request, response))
    elif 'verify' in request.url:
        print('URL {} contains a captcha and needs to be re-queued'.format(request.url))
        try:
            value_url = request.meta.get('redirect_urls')[0]
        except (TypeError, IndexError):
            value_url = request.url
        key = getattr(spider, 'redis_key')
        db.add_value(key, value_url)
        raise IgnoreRequest
    elif response.status == 200 and 'verify' not in request.url:
        pass
    else:
        print('Unexpected status code, please investigate')
        print('URL: {}, status code: {}'.format(response.url, response.status))
    return response
def parse(self, response):
    res = json.loads(response.text)
    db = RedisClient()
    item = GaodepoiItem()
    if res['status'] == '1':
        count = int(res['count'])
        page = math.ceil(count / 25)
        now_page = re.findall(r'page=(\d+)', response.url)[0]
        next_page = int(now_page) + 1
        if len(res['pois']) == 25:
            # A full page of POIs: yield it and request the next page.
            item['pois'] = res['pois']
            yield item
            print('There is a next page; requesting page {}'.format(next_page))
            url = response.url.replace('page={}'.format(now_page), 'page={}'.format(next_page))
            yield Request(url=url, callback=self.parse)
        elif len(res['pois']) > 0:
            item['pois'] = res['pois']
            yield item
        elif len(res['pois']) == 0:
            print('No data; unusable URL: {}'.format(response.url))
        elif page - int(now_page) < 0 and page != 0:
            print('Abnormal content: {}'.format(response.text))
            db.add_value('ExGaoDePoiByCity:start_urls', response.url)
        else:
            print('Serious bug for this URL, content: {}'.format(response.text))
            db.add_value('BugGaoDePoiByCity:start_urls', response.url)
    elif res['status'] == '0':
        print('Request failed; re-queueing')
        db.add_value(self.redis_key, response.url)
    else:
        print('Serious bug for this URL, content: {}'.format(response.text))
        db.add_value('BugGaoDePoiByCity:start_urls', response.url)
def parse(self, response):
    html = response.text
    db = RedisClient()
    if 'verify' not in response.url and len(html) > 700 and r'https://www.abuyun.com' not in html:
        adms = response.xpath('//*[@id="region-nav"]/a/@href').extract()
        is_50 = response.xpath(
            '/html/body/div[2]/div[3]/div[1]/div[2]/a[10]/text()').extract_first()
        try:
            page = int(is_50) if is_50 else 1
        except Exception as e:
            print('Failed to read the page count, reason: {}, raw value: {}'.format(e.args, is_50))
            page = 1
        if adms and int(page) == 50:
            for adm in adms:
                db.add_value('GovSpider:start_urls', adm)
            print('Splitting by administrative district, current URL: {}'.format(response.url))
            print(adms)
            print('{} districts stored'.format(len(adms)))
        elif int(page) < 50:
            print('Fewer than 50 pages ({} in total); no further subdivision needed, '
                  'storing URL {} into ShopList:start_urls'.format(page, response.url))
            db.add_value('ShopList:start_urls', response.url)
        else:
            print('URL {} is abnormal, page body: {}'.format(response.url, html))
    elif len(html) < 700 and 'verify' not in response.url:
        print('Anti-scraping triggered; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    elif 'verify' in response.url:
        url = response.meta.get('redirect_urls')[0]
        print('Captcha encountered, url: {}'.format(response.url))
        print('Re-queueing the pre-redirect URL: {}'.format(url))
        db.add_value(self.redis_key, url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
def parse(self, response): db = RedisClient() detail_urls_content = response.text xpath_css = Selector(text=detail_urls_content) sp_urls = xpath_css.xpath('//*[@id="list-content"]/div[@class="list-item"]/@link').extract() if 'verify' in response.url: logger.warning('遇到验证码了,url放入待爬队列里面') urls = response.meta.get('redirect_urls') for url in urls: db.add_value('CitySpider:start_urls', url) elif len(sp_urls) < 0 or '请换个搜索词或试试筛选吧' in detail_urls_content: logger.warning('本url:{}-----没有搜索结果'.format(response.url)) db.add_value('not_url:xzl', response.url) else: countys = xpath_css.xpath('//*[@id="city_list"]/dl/dd/a/@href').extract() for county in countys: db.add_value('CountySpider:start_urls', county) logger.info(countys) logger.info('一共{}个url已经入库完毕'.format(len(countys)))
def parse(self, response): db = RedisClient() detail_urls_content = response.text xpath_css = Selector(text=detail_urls_content) sp_urls = xpath_css.xpath( '//*[@id="list-content"]/div[@class="list-item"]/@link').extract() if 'verify' in response.url: logger.warning('遇到验证码了,url放入待爬队列里面') urls = response.meta.get('redirect_urls') for url in urls: db.add_value('TownSpider:start_urls', url) elif len(sp_urls) < 0 or '请换个搜索词或试试筛选吧' in detail_urls_content: logger.warning('本url:{}-----没有搜索结果'.format(response.url)) db.add_value('not_url:xzl', response.url) else: towns = xpath_css.xpath( '//div[@class="sub-items"]/a/@href').extract() for town in towns[1:]: db.add_value('XzlList:start_urls', town) logger.info(towns[1:]) logger.info('一共{}个url已经入库完毕'.format(len(towns[1:])))
def parse(self, response):
    db = RedisClient()
    item = {}
    html = response.text
    if 'verify' in response.url:
        url = response.meta.get('redirect_urls')[0]
        print('Captcha encountered, url: {}'.format(response.url))
        print('Re-queueing the pre-redirect URL: {}'.format(url))
        db.add_value(self.redis_key, url)
    if not html:
        print('Status: {}, body: {}'.format(response.status, html))
        print('Re-queueing')
        db.add_value(self.redis_key, response.url)
    if ('页面无法访问' in html or '页面不存在' in html) and 'verify' not in response.url:
        print('Dead URL: {}'.format(response.url))
        db.add_value('Not:start_urls', response.url)
    if (html and 'window.shop_config=' in html and 'verify' not in response.url
            and r'https://www.abuyun.com' not in html
            and 'window.shop_config.shopId' not in html):
        # Newer page layout: shop data is embedded as a JS object literal in window.shop_config.
        mes = html.split('window.shop_config=')[-1]
        me = mes.split(r'</script> <script src')[0]
        result = self.str_to_dict(me.strip())
        shop_config = json.loads(result)
        item['shopId'] = shop_config.get('shopId')
        item['shopName'] = shop_config.get('shopName')
        item['address'] = shop_config.get('address')
        item['fullName'] = shop_config.get('fullName')
        item['shopGlat'] = shop_config.get('shopGlat')
        item['shopGlng'] = shop_config.get('shopGlng')
        item['reviewCount'] = response.xpath('//*[@id="reviewCount"]/text()').extract_first()
        item['avgPrice'] = self.str_to_deciphering(
            response, '//*[@id="avgPriceTitle"]', '//*[@id="avgPriceTitle"]/span/@class')
        item['productScore'] = self.str_to_deciphering(
            response, '//*[@id="comment_score"]/span[1]', '//span[@class="item"]/span/@class')
        item['environmentScore'] = self.str_to_deciphering(
            response, '//*[@id="comment_score"]/span[2]', '//span[@class="item"]/span/@class')
        item['serviceScore'] = self.str_to_deciphering(
            response, '//*[@id="comment_score"]/span[3]', '//span[@class="item"]/span/@class')
        item['telephone'] = self.str_to_deciphering(
            response, '//*[@id="basic-info"]/p[1]', '//p/span/@class')
        rank = response.xpath('//*[@id="basic-info"]/div[1]/span[1]/@class').extract_first()
        rank_handle = re.findall(r'\d+', rank)
        item['rankStars'] = ''.join(rank_handle) if rank_handle else 0
        item['shopHours'] = response.xpath(
            '//p[@class="info info-indent"]/span[2]/text()').extract()
        ad = re.findall('<meta name="location" content="province=(.*?);city=(.*?);">', html, re.S)
        item['url'] = response.url
        item['province'] = ad[0][0]
        item['city'] = ad[0][1]
        item['now_time'] = str(datetime.datetime.now())[0:-7]
        from xpinyin import Pinyin
        pin = Pinyin()
        item['sheetName'] = pin.get_pinyin(item['province'], '')
    if (html and 'window.shop_config.shopId' in html and 'verify' not in response.url
            and r'https://www.abuyun.com' not in html):
        # Older page layout: read the fields straight from the visible markup.
        item['shopId'] = re.findall(r'shop/(\d+)', response.url)[0]
        item['shopName'] = response.xpath('//*[@id="basic-info"]/h1/text()').extract_first().strip()
        item['address'] = response.xpath('string(//*[@id="basic-info"]/div[2])').extract_first().strip()
        item['fullName'] = response.xpath('//*[@id="basic-info"]/h1/text()').extract_first().strip()
        ll = re.findall(r'{(lng:.*?)}', html)[0]
        ll = re.split(r'[:,]', ll)
        item['shopGlng'] = float(ll[1])
        item['shopGlat'] = float(ll[-1])
        item['reviewCount'] = response.xpath('//*[@id="basic-info"]/div[1]/span[2]/text()').extract_first()
        item['avgPrice'] = response.xpath('//*[@id="basic-info"]/div[1]/span[3]/text()').extract_first()
        item['productScore'] = response.xpath('//*[@id="basic-info"]/div[1]/span[4]/text()').extract_first()
        item['environmentScore'] = response.xpath('//*[@id="basic-info"]/div[1]/span[5]/text()').extract_first()
        item['serviceScore'] = response.xpath('//*[@id="basic-info"]/div[1]/span[6]/text()').extract_first()
        item['telephone'] = response.xpath('//*[@id="basic-info"]/p/span[2]/text()').extract_first()
        rank = response.xpath('//*[@id="basic-info"]/div[1]/span[1]/@class').extract_first()
        rank_handle = re.findall(r'\d+', rank)
        item['rankStars'] = ''.join(rank_handle) if rank_handle else 0
        shop_hours = response.xpath('//p[@class="info info-indent"]/span[2]/text()').extract()
        item['shopHours'] = shop_hours if shop_hours else ''
        item['url'] = response.url
        ad = re.findall('<meta name="location" content="province=(.*?);city=(.*?);">', html, re.S)[0]
        item['province'] = ad[0]
        item['city'] = ad[1]
        from xpinyin import Pinyin
        pin = Pinyin()
        item['sheetName'] = pin.get_pinyin(item['province'], '')
        item['now_time'] = str(datetime.datetime.now())[0:-7]
    if len(item) > 0:
        yield item
def parse(self, response):
    item = GaodepoiItem()
    db = RedisClient()
    result = json.loads(response.text)
    try:
        total = int(result['count'])
    except (KeyError, TypeError, ValueError):
        total = 0
    maxPage = math.ceil(total / 20)
    now_page = re.findall(r'page=(\d+)', response.url)[0]
    next_page = int(now_page) + 1
    if result['status'] == '1':
        item['pois'] = result['pois']
        if total > 840:
            logger.info('More than 840 results; the polygon needs to be split again')
            self.CutChina(response.url)
        elif len(item['pois']) > 0:
            yield item
            if len(item['pois']) == 20:
                url = response.url.replace('page={}'.format(now_page), 'page={}'.format(next_page))
                logger.info('Requesting page {} of {}'.format(next_page, maxPage))
                logger.info('{} results in total'.format(total))
                logger.info('Next page URL: {}'.format(url))
                yield Request(url=url, callback=self.parse, dont_filter=True)
            if 0 < len(item['pois']) < 20 and maxPage - int(now_page) != 0:
                url = response.url.replace('page={}'.format(now_page), 'page={}'.format(next_page + 1))
                logger.info('Requesting page {} of {}'.format(next_page + 1, maxPage))
                yield Request(url=url, callback=self.parse, dont_filter=True)
        elif maxPage - int(now_page) < 0 and len(item['pois']) == 0:
            if maxPage == 0 and int(now_page) == 1:
                logger.info('No results on the first page, URL: {}'.format(response.url))
            elif int(now_page) - maxPage == 1:
                logger.info('On page {} of {}'.format(now_page, maxPage + 1))
                logger.info('No results for URL: {}'.format(response.url))
            else:
                logger.warning('URL {} returned 0 results before the last page; re-queueing'.format(response.url))
                logger.info('Storing page {} back into Redis, {} pages in total'.format(now_page, maxPage))
                db.add_value(self.redis_key, response.url)
        elif total == 0 and int(now_page) == 1 and len(item['pois']) == 0:
            logger.info('On page {} of {}'.format(now_page, maxPage + 1))
            logger.info('No results for URL: {}'.format(response.url))
        elif total != 0 and maxPage - int(now_page) == -1 and len(item['pois']) == 0:
            logger.info('On page {} of {}'.format(now_page, maxPage))
            logger.info('No results for URL: {}'.format(response.url))
        else:
            logger.warning('1 - Seriously abnormal URL: {}, content: {}'.format(response.url, response.text))
            db.add_value('Exception1:start_urls', response.url)
    elif result['status'] == '0':
        logger.info('Request failed; re-queueing')
        db.add_value(self.redis_key, response.url)
    else:
        logger.warning('2 - Seriously abnormal URL: {}, content: {}'.format(response.url, response.text))
        db.add_value('Exception2:start_urls', response.url)
def CutChina(self, url):
    url = parse.unquote(url)
    db = RedisClient()
    x = []
    a = re.findall('polygon=(.*?)&', url)[0]
    c = a.split('|')
    for i in c:
        x.extend(i.split(','))
    logger.info('Extracted the two corner coordinates from the URL')
    rect = Rect(xmin=float(x[0]), ymin=float(x[1]), xmax=float(x[2]), ymax=float(x[3]))
    # Format rule: longitude first, then latitude, separated by ",";
    # coordinate pairs separated by "|"; at most 6 decimal places.
    middleX = (rect.xmin + rect.xmax) / 2
    middleY = (rect.ymin + rect.ymax) / 2
    lng_lat = []
    # Split the bounding box into four quadrants.
    rect1 = Rect(xmin=rect.xmin, ymin=rect.ymin, xmax=middleX, ymax=middleY)
    lng_lat.append("{:.6f},{:.6f}|{:.6f},{:.6f}".format(rect1.xmin, rect1.ymin, rect1.xmax, rect1.ymax))
    rect2 = Rect(xmin=middleX, ymin=rect.ymin, xmax=rect.xmax, ymax=middleY)
    lng_lat.append("{:.6f},{:.6f}|{:.6f},{:.6f}".format(rect2.xmin, rect2.ymin, rect2.xmax, rect2.ymax))
    rect3 = Rect(xmin=rect.xmin, ymin=middleY, xmax=middleX, ymax=rect.ymax)
    lng_lat.append("{:.6f},{:.6f}|{:.6f},{:.6f}".format(rect3.xmin, rect3.ymin, rect3.xmax, rect3.ymax))
    rect4 = Rect(xmin=middleX, ymin=middleY, xmax=rect.xmax, ymax=rect.ymax)
    lng_lat.append("{:.6f},{:.6f}|{:.6f},{:.6f}".format(rect4.xmin, rect4.ymin, rect4.xmax, rect4.ymax))
    count = 0
    for polygon in lng_lat:
        for poi_type in self.df1['NEW_TYPE']:
            params = {
                'polygon': polygon,
                'types': poi_type,
                'page': '1',
                'offset': 20,
                'extensions': 'all',
            }
            real_url = self.base_url + urlencode(params)
            db.add_value(self.redis_key, real_url)
            count += 1
    logger.info('Re-queued the split rectangles into Redis, {} URLs in total'.format(count))
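# Rect is not defined in this section. CutChina only constructs it with
# xmin/ymin/xmax/ymax keywords and reads the same four fields back, so a simple
# namedtuple (an assumption, not the original definition) is enough to follow
# the quadrant-splitting logic above:
from collections import namedtuple

Rect = namedtuple('Rect', ['xmin', 'ymin', 'xmax', 'ymax'])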
def parse(self, response):
    if 'verify' in response.url:
        db = RedisClient()
        url = response.meta.get('redirect_urls')[0]
        print('Hit a captcha; re-queueing url: {}'.format(url))
        db.add_value('XinPan_spiders:start_urls', url)
    else:
        item = {}
        xpath_css = Selector(text=response.text)
        every_address = [
            str(ad).replace('楼盘', '')
            for ad in xpath_css.xpath('//*[@id="header"]/div[2]/div[1]/a/text()').extract()[1:3]
        ]
        if len(every_address) > 0:
            new_address = self.gen_address(every_address)
            item['province'], item['city'], item['county'] = new_address[0], new_address[1], new_address[2]
            item['url'] = response.url
            pin = Pinyin()
            item['sheetname'] = pin.get_pinyin(item['province'], "").replace('sheng', '').replace('shi', '')
            house_msgs_l = xpath_css.xpath('//*[@id="container"]/div[1]/div[1]/div/div[2]/ul/li')[:-2]
            new_house = settings['NEWHOUSE']
            print(len(house_msgs_l), type(house_msgs_l))
            for house_msg in house_msgs_l:
                key1 = house_msg.xpath('./div[1]/text()').extract_first()
                key2 = new_house.get(key1)
                if key2 is None:
                    print(key1)
                if key2 == 'property_features':
                    item[key2] = [
                        i for i in str(remove_tags(
                            str(house_msg.xpath('./div[2]').extract_first()).replace('\n', ''))
                        ).strip().split(' ')
                        if i
                    ]
                else:
                    item[key2] = remove_tags(
                        str(house_msg.xpath('./div[2]').extract_first())
                        .replace('\n', '').replace(' ', '')
                        .replace(r'[价格走势]', '').replace(r'[查看地图]', '')
                        .replace(r'[房贷计算器]', '').replace(r'[查看详情]', ''))
            print(item)
        else:
            print('This is a serious error')
            print(response.url)
            db = RedisClient()
            urls = response.meta.get('redirect_urls')[0:1]
            print(urls)
            print('Hit a captcha; re-queueing url(s): {}'.format(urls))
            for url in urls:
                db.add_value('XinPan_spider:start_urls', url)
def parse(self, response):
    html = response.text
    db = RedisClient()
    if 'verify' in response.url or 'params' in response.url:
        print('Hit a captcha; re-queueing the url')
        url = response.meta.get('redirect_urls')[0]
        db.add_value(self.redis_key, url)
    elif r'abuyun.com' not in html and len(html) > 600 and '没有找到符合' not in html:
        count = response.xpath('//span[@class="tit"]/em[2]/text()').extract_first()
        count = int(count)
        print('{} communities in total'.format(count))
        if 1500 < count < 3000:
            print('More than the page limit for this filter; storing every page URL')
            for i in range(1, int(count / 30) + 1):
                db.add_value('dayu:start_urls', response.url + r'/p' + str(i))
        elif count > 3000:
            print('URL {} is a special case and needs special handling'.format(response.url))
            db.add_value('special:start_urls', response.url)
        elif 0 < count < 1500:
            print('Collecting detail-page URLs')
            houses = response.xpath(
                '//*[@id="list-content"]/div[@class="li-itemmod"]/@link').extract()[1:]
            for house in houses:
                db.add_value('DetailHouse:start_urls', house + '?from=Filter_1&hfilter=filterlist')
            print(houses)
            if len(houses) == 0:
                print('Abnormal URL: {}'.format(response.url))
                db.add_value('NotHouseList:start_urls', response.url)
            print('URL {} lists {} houses'.format(response.url, len(houses)))
            next_page = response.xpath('//a[@class="aNxt"]/@href').extract_first()
            if next_page:
                try:
                    page = re.search(r'p(\d+)', next_page)
                    print('Page {} --- url: {}'.format(page.group(1), next_page))
                except Exception as e:
                    print('Error: {}'.format(e.args))
                yield Request(url=next_page, callback=self.parse)
        else:
            print('Current URL: {}'.format(response.url))
            print('Serious error, see details: {} page body: {}'.format(response.url, html))
    elif r'abuyun.com' in html:
        print('Proxy IP problem; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
def parse(self, response):
    html = response.text
    db = RedisClient()
    if 'verify' in response.url or 'params' in response.url:
        print('Hit a captcha; re-queueing the url')
        url = response.meta.get('redirect_urls')[0]
        db.add_value(self.redis_key, url)
    elif r'abuyun.com' not in html and len(html) > 600:
        count = response.xpath('//span[@class="tit"]/em[2]/text()').extract_first()
        print('{} in total'.format(count))
        if int(count) < 1500 and int(count) != 0:
            print('Storing the URL straight into the list queue')
            db.add_value('HouseList:start_urls', response.url)
        elif int(count) == 0:
            print('URL {} has no community listings'.format(response.url))
            db.add_value('NotAreaList:start_urls', response.url)
        else:
            print('Subdividing further by district')
            areas = response.xpath(
                '//div[@class="div-border items-list"]/div[1]/span[2]/a/@href').extract()[1:]
            for area in areas:
                db.add_value('AddressList:start_urls', area)
            if len(areas) == 0:
                print('Abnormal URL: {}'.format(response.url))
                db.add_value('NotAreaList:start_urls', response.url)
            print(areas)
            print('URL {} lists {} districts'.format(response.url, len(areas)))
    elif r'abuyun.com' in html:
        print('Proxy IP problem; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
def process_request(self, request, spider):
    import requests
    # Build a randomized User-Agent plus matching _lxsdk/_hc.v cookies for dianping.
    agent = random.choice(useragent)
    same = get_lxsdk_cuid(agent)
    cook = '_lxsdk_cuid={}; _lxsdk={}; _hc.v={}; _lxsdk_s={}'.format(
        same, same, get_hc(), get_lxsdk_s())
    headers = {
        'Host': 'www.dianping.com',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': cook,
        'User-Agent': agent,
    }
    # Abuyun dynamic proxy tunnel credentials.
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    proxyUser = "******"
    proxyPass = "******"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "https": proxyMeta,
    }
    try:
        res = requests.get(request.url, headers=headers, proxies=proxies, timeout=15)
        if res.status_code != 200 or len(res.text) < 560:
            if res.status_code in (403, 404):
                content = 'page not accessible'
            else:
                content = res.text
            print('URL: {}, status: {}, body: {}'.format(request.url, res.status_code, content))
            key = getattr(spider, 'redis_key')
            db = RedisClient()
            print('URL {} needs to be re-queued'.format(request.url))
            db.add_value(key, request.url)
            raise IgnoreRequest
        else:
            # Wrap the requests result so Scrapy skips its own download for this request.
            from scrapy.http.response.html import HtmlResponse
            return HtmlResponse(url=request.url,
                                body=res.content.decode('utf-8'),
                                encoding="utf-8",
                                request=request)
    except IgnoreRequest:
        raise
    except Exception as e:
        print('Error, reason: {}'.format(e.args))
        key = getattr(spider, 'redis_key')
        db = RedisClient()
        print('URL {} needs to be re-queued'.format(request.url))
        db.add_value(key, request.url)
        raise IgnoreRequest
def parse(self, response):
    db = RedisClient()
    item = CommunityItem()
    html = response.text
    if 'verify' in response.url or 'params' in response.url:
        print('Hit a captcha; re-queueing the url')
        url = response.meta.get('redirect_urls')[0]
        db.add_value(self.redis_key, url)
    elif r'abuyun.com' not in html and len(html) > 600 and '您要查看的页面丢失了' not in html:
        from scrapy.conf import settings
        ho = settings['HOUSE']
        try:
            price = re.findall('"comm_midprice":"(.*?)","area_midprice"', html, re.S)[0]
        except IndexError:
            price = re.findall('"comm_midprice":(.*?),"area_midprice"', html, re.S)[0]
        print(price)
        item['price'] = price
        try:
            l2 = re.findall('lat : "(.*?)",.*?lng : "(.*?)"', html, re.S)
            lat_lng = [float(l2[0][0]), float(l2[0][1])]
        except (IndexError, ValueError):
            lat_lng = [0, 0]
        item['lat_lng'] = lat_lng
        detali_dt = response.xpath('//*[@id="basic-infos-box"]/dl/dt')
        address = response.xpath('//span[@class="sub-hd"]/text()').extract_first()
        all_add = response.xpath('//div[@class="p_1180 p_crumbs"]/a/text()').extract()
        city = all_add[1].replace('小区', '')
        county = all_add[2]
        community = all_add[3]
        community_name = all_add[4]
        pin = Pinyin()
        province = self.gen_address(city)
        sheet_name = pin.get_pinyin(province, "").replace('sheng', '').replace('shi', '')
        item['sheet_name'] = sheet_name
        print(province, city, county, community, community_name)
        item['province'] = province
        item['city'] = city
        item['county'] = county
        item['community'] = community
        item['community_name'] = community_name
        item['address'] = address
        dt = []
        for i in detali_dt:
            key1 = i.xpath('./text()').extract_first().replace('\xa0', '').replace(':', '')
            dt.append(ho.get(key1))
        detali_dd = response.xpath('//*[@id="basic-infos-box"]/dl/dd')
        dd = []
        for i in detali_dd:
            dd.append(i.xpath('./text()').extract_first())
        house_mes = dict(zip(dt, dd))
        item.update(house_mes)
        item['url'] = response.url
        print('Result: {}'.format(item))
        yield item
    elif r'abuyun.com' in html:
        print('Proxy IP problem; re-queueing URL: {}'.format(response.url))
        print('Status: {}, body: {}'.format(response.status, html))
        db.add_value(self.redis_key, response.url)
    else:
        print('Current URL: {}'.format(response.url))
        print('Serious error, see details: {} page body: {}'.format(response.url, html))
def parse(self, response):
    db = RedisClient()
    if 'verify' in response.url or r'jump' in response.url:
        logger.info('Hit a captcha; re-queueing the url')
        url = response.meta.get('redirect_urls')[0]
        db.add_value('DetailSpider:start_urls', url)
    else:
        detail_urls_content = response.text
        if '您要查看的页面丢失了' not in response.text:
            try:
                lat_lng = re.findall(r'lat: "(.*?)",.*?lng: "(.*?)"', detail_urls_content, re.S)
                real_lat_lng = lat_lng[0]
                xpath_css = Selector(text=detail_urls_content)
                item = XzlspiderItem()
                every_address = [
                    str(ad).replace('写字楼出租', '').replace('房产网', '').replace('写字楼出售', '')
                    for ad in xpath_css.xpath('/html/body/div[2]/a/text()').extract()[1:3]
                ]
                new_address = self.gen_address(every_address)
                item['province'], item['city'], item['county'] = new_address[0], new_address[1], new_address[2]
                item['url'] = response.url
                pin = Pinyin()
                item['sheetname'] = pin.get_pinyin(item['province'], "").replace('sheng', '').replace('shi', '')
                item['total'] = xpath_css.xpath('//*[@id="j-triggerlayer"]/text()').extract_first().strip()
                house_config = settings['NEWHOUSE']
                house_msgs_l = xpath_css.xpath('//*[@id="fy_info"]/ul[@class="litem"]/li')
                for house_msg in house_msgs_l:
                    key1 = house_msg.xpath('./span[1]/text()').extract_first()
                    key = house_config.get(key1)
                    item[key] = remove_tags(str(house_msg.xpath('./span[2]').extract_first()))
                house_msgs_r = xpath_css.xpath('//*[@id="fy_info"]/ul[@class="ritem"]/li')
                for house_msg in house_msgs_r:
                    key1 = house_msg.xpath('./span[1]/text()').extract_first()
                    print(key1)
                    key = house_config.get(key1)
                    if key == '预估月支出' and 'zu' in response.url:
                        continue
                    item[key] = remove_tags(str(house_msg.xpath('./span[2]').extract_first()))
                house_resources_l = xpath_css.xpath('//div[@class="itemCon clearfix"]/ul[@class="litem"]/li')
                for house_resource in house_resources_l:
                    key1 = house_resource.xpath('./span[1]/text()').extract_first()
                    key = house_config.get(key1)
                    item[key] = remove_tags(str(house_resource.xpath('./span[2]').extract_first()))
                house_resources_r = xpath_css.xpath('//div[@class="itemCon clearfix"]/ul[@class="ritem"]/li')
                for house_resource in house_resources_r:
                    key1 = house_resource.xpath('./span[1]/text()').extract_first()
                    key = house_config.get(key1)
                    if key == '得房率':
                        continue
                    item[key] = remove_tags(str(house_resource.xpath('./span[2]').extract_first()))
                describes = xpath_css.xpath('//*[@id="xzl_desc"]/div').extract_first()
                real_describe = remove_tags(str(describes))
                item['describe'] = (real_describe.replace('\xa0', '').replace('\r', '')
                                    .replace('\n', '').replace('\t', '')
                                    .replace('\u200b', '').replace('\u200c', '').strip())
                item['lat_lng'] = real_lat_lng
                public_time = xpath_css.xpath('//*[@id="xzl_desc"]/h3/div/text()')[1].root
                item['public_time'] = public_time.strip()
                house_number = xpath_css.xpath('//*[@id="xzl_desc"]/h3/div/text()')[2].root
                item['house_number'] = house_number.strip()
                yield item
            except Exception as e:
                logger.error('Serious error, see log: {}'.format(e.args))
                if 'antispam' in response.url or 'jump' in response.url:
                    url = response.meta.get('redirect_urls')[0]
                else:
                    url = response.url
                logger.error('Download failed, possibly a bad proxy IP ---------------: {}'.format(url))
                logger.error('Re-queueing')
                db.add_value('sp_detail:start_urls', url)
        else:
            logger.error('URL is dead: {}'.format(response.url))
            db.add_value('not_url:sp_detail', response.url)