# -*- coding: utf-8 -*-
# Standard / third-party imports used by these spiders. Project-local helpers
# (loadUrl, FilterLinkExtractor, the page parsers parse_meituan / parse_dianping /
# parse_nuomi, the shared request headers, and the item classes) come from the
# project's own modules.
import json
import re

import requests
import scrapy
from scrapy.http import Request
from scrapy.spiders import CrawlSpider, Rule


class MeituanSpider(CrawlSpider):
    name = 'meituan'
    allowed_domains = ['meituan.com']
    start_urls = []
    startUrlsFile = "../hlwdata/data/url/meituan_start_url.txt"
    downLoadUrlsFile = "../hlwdata/data/url/meituan_download_url.txt"
    lst = loadUrl(downLoadUrlsFile)  # URLs already fetched in earlier runs

    rules = (
        Rule(FilterLinkExtractor(allow=r'http://cq.meituan.com/shop/[\d]+\.*[\w]*$',
                                 download=lst),
             callback='parse_meituan',
             process_links='link_filtering',
             follow=True),
    )

    def link_filtering(self, links):
        # Normalize shop URLs by dropping a trailing '.html' suffix.
        # (The original used rstrip('.html'), which strips any of those
        # characters from the right, not the suffix as a whole.)
        for link in links:
            if link.url.endswith('.html'):
                link.url = link.url[:-len('.html')]
        return links

    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            # One request parses the page itself; the other goes through the
            # CrawlSpider machinery so the rules above can extract further links.
            yield Request(url, callback=self.parse_meituan)
            yield self.make_requests_from_url(url)

    def parse_meituan(self, response):
        # Record the URL so later runs can skip it via FilterLinkExtractor.
        with open(self.downLoadUrlsFile, 'a') as f:
            f.write(response.url + '\n')
        # Calls the module-level parse_meituan() page parser, not this method.
        item = parse_meituan(response)
        if item:
            return item
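# All spiders below depend on two project helpers that are defined elsewhere in
# the repo: loadUrl, which reads a URL list from disk, and FilterLinkExtractor,
# which skips links that were already downloaded. The sketch below is inferred
# purely from how they are called; names, signatures and behavior are assumptions,
# not the project's actual implementation.
from scrapy.linkextractors import LinkExtractor


def loadUrl(path):
    """Read one URL per line, skipping blanks; return [] if the file is missing."""
    try:
        with open(path) as f:
            return [line.strip() for line in f if line.strip()]
    except IOError:
        return []


class FilterLinkExtractor(LinkExtractor):
    """LinkExtractor that drops URLs already listed in the `download` argument."""

    def __init__(self, download=None, **kwargs):
        super(FilterLinkExtractor, self).__init__(**kwargs)
        self.downloaded = set(download or [])

    def extract_links(self, response):
        links = super(FilterLinkExtractor, self).extract_links(response)
        return [link for link in links if link.url not in self.downloaded]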
class DianpingreviewSpider(scrapy.Spider):
    name = "dianpingreview"
    allowed_domains = ["dianping.com"]
    # Example review pages:
    #   'https://www.dianping.com/shop/20919783/review_more'
    #   'https://www.dianping.com/shop/18506539/review_more'
    # The real start list is loaded from disk and overrides any hard-coded URLs.
    start_urls = loadUrl("../hlwdata/data/url/dianping_start_review_url.txt")

    def parse(self, response):
        shop_id = re.search(r'/shop/([\d]+)/review_more', response.url).group(1)
        for comment in response.xpath('//div[@class="comment-list"]/ul/li'):
            item = DianPingReviewItem()
            item['shop_id'] = shop_id
            item['user_id'] = comment.xpath('./div[@class="pic"]/a/@user-id').extract()
            item['user_name'] = comment.xpath(
                './div[@class="pic"]/p[@class="name"]/a/text()').extract()
            # The star rating is encoded in the class attribute of this span.
            item['review_star'] = comment.xpath(
                './div[@class="content"]/div[@class="user-info"]/span/@class').extract()
            item['review_Content'] = comment.xpath(
                './div[@class="content"]/div[@class="comment-txt"]/div/text()').extract()
            item['review_date'] = comment.xpath(
                './div[@class="content"]/div[@class="misc-info"]/span[@class="time"]/text()').extract()
            yield item

        # Follow the "next page" link, if any.
        nextPages = response.xpath(
            '//div[@class="Pages"]/div[@class="Pages"]/a[@class="NextPage"]/@href'
        ).extract_first()
        if nextPages:
            url = response.url.split('?')[0] + nextPages
            meta = {'rowkey': url}
            yield Request(url, dont_filter=True, callback=self.parse, meta=meta)
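# review_star above keeps the raw class attribute of the rating span. On Dianping
# review pages the rating is typically encoded as a trailing number in that class
# name (e.g. a class ending in "40" meaning 4.0 stars); that convention is an
# assumption here, so this helper is a hypothetical post-processing step, not part
# of the spider itself.
def star_from_class(css_class):
    """'sml-rank-stars sml-str40' -> 4.0; returns None if no trailing number is found."""
    match = re.search(r'(\d+)\s*$', css_class or '')
    return int(match.group(1)) / 10.0 if match else None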
class DianpingSpider(CrawlSpider):
    name = 'dianping'
    allowed_domains = ['dianping.com']
    start_urls = []
    startUrlsFile = "../hlwdata/data/url/dianping_start_url.txt"
    downLoadUrlsFile = "../hlwdata/data/url/dianping_download_url.txt"
    lst = loadUrl(downLoadUrlsFile)

    rules = (
        Rule(FilterLinkExtractor(allow=r'/shop/[\d]+$', deny=r'upload/shop/', download=lst),
             callback='parse_dianping', follow=False),
        Rule(FilterLinkExtractor(allow=r'/search/category/[\d]+/[\d]+/p[\d]+', download=lst),
             callback='parse_dianping_search', follow=True),
        # Rule(FilterLinkExtractor(allow=r'https://www.dianping.com/shop/[\d]+$', download=lst),
        #      callback='parse_dianping', follow=True),
    )

    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            yield Request(url, callback=self.parse_dianping)
            yield self.make_requests_from_url(url)

    def parse_dianping(self, response):
        with open(self.downLoadUrlsFile, 'a') as f:
            f.write(response.url + '\n')
        item = parse_dianping(response)
        if item:
            return item

    def parse_dianping_search(self, response):
        pass
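# The meituan, dianping and nuomi spiders all repeat the same bookkeeping: load
# the already-downloaded URLs at class-definition time (fed to
# FilterLinkExtractor) and append each fetched URL inside the parse callback. A
# hypothetical mixin factoring that out (DownloadLogMixin is not a name in the
# project, just an illustration of the shared pattern):
class DownloadLogMixin(object):

    def record_download(self, response):
        """Append the fetched URL so the next run's FilterLinkExtractor skips it."""
        with open(self.downLoadUrlsFile, 'a') as f:
            f.write(response.url + '\n')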
class NuomiSpider(CrawlSpider):
    name = 'nuomi'
    allowed_domains = ['nuomi.com']
    start_urls = []
    startUrlsFile = "../hlwdata/data/url/nuomi_deal_start_url.txt"
    downLoadUrlsFile = "../hlwdata/data/url/nuomi_deal_download_url.txt"
    downshopUrlsFile = "../hlwdata/data/url/nuomi_shop_download_url.txt"
    # The original assigned jsonDir twice, so the shop directory was silently
    # overwritten by the city directory; split into two attributes (apparent intent).
    shopJsonDir = "../hlwdata/data/json/nuomi/shop/"
    cityJsonDir = "../hlwdata/data/json/nuomi/city/"
    lst = loadUrl(downLoadUrlsFile)
    visitedShop = set(loadUrl(downshopUrlsFile))  # shop pages already fetched

    rules = (
        Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/deal/[\w]+', download=lst),
             callback='parse_nuomi_deal', follow=True),
        # Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/shop/[\d]+$', download=lst),
        #      callback='parse_nuomi_shop', follow=True),
    )

    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            yield Request(url, callback=self.parse_nuomi_deal)
            yield self.make_requests_from_url(url)

    def parse_nuomi_deal(self, response):
        with open(self.downLoadUrlsFile, 'a') as f:
            f.write(response.url + '\n')

        # Breadcrumb categories of the deal page, padded to six slots.
        navs = response.xpath(
            '//div[@class="w-bread-crumb"]/ul[@class="crumb-list clearfix"]/li/a/text()'
        ).extract()
        parmeta = dict()
        parmeta['nav'] = True
        parmeta['deal'] = response.url
        for i in range(6):
            parmeta['nav' + str(i)] = ''
        for i in range(len(navs)):
            parmeta['nav' + str(i)] = navs[i].strip('\n')

        # The deal id is embedded in the @mon attribute; the shopchain endpoint
        # returns the shops and cities attached to that deal as JSON.
        dealId = response.xpath(
            '//div[@class="p-item-info"]/@mon').extract_first().split('=')[1]
        dealUrl = 'http://www.nuomi.com/pcindex/main/shopchain?dealId=' + dealId
        # html = requests.get(dealUrl, headers=self.headers)
        html = requests.get(dealUrl)
        js = json.loads(html.text)

        for shop in js['data']['shop']:
            shopCity = shop['city_id']
            district_id = shop['district_id']
            shopId = shop['merchant_id']
            with open(self.shopJsonDir + shopId, 'w') as f:
                f.write(json.dumps(shop))
            with open(self.cityJsonDir + shopId + '.' + shopCity, 'w') as f:
                f.write(json.dumps(js['data']['city'][shopCity]))

            shoplink = shop['link']
            # Only fetch Chongqing food shops:
            # if shopId in self.visitedShop or shopCity != u'900010000':
            if shoplink in self.visitedShop:
                continue
            else:
                self.visitedShop.add(shoplink)

            city = js['data']['city'][shopCity]
            parmeta['shopCityName'] = city['city_name']
            parmeta['district'] = city['district'][district_id]['dist_name']
            yield scrapy.Request(shop['link'], self.parse_nuomi_shop, meta=parmeta)

    def parse_nuomi_shop(self, response):
        with open(self.downshopUrlsFile, 'a') as f:
            f.write(response.url + '\n')
        # Module-level parse_nuomi() page parser.
        item = parse_nuomi(response, meta=response.meta)
        if item:
            return item
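# Shape of the shopchain JSON, inferred purely from the fields read in
# parse_nuomi_deal above and in ShopindexSpider.parse_nuomi below. This is not an
# official schema; the placeholder values are made up for illustration.
EXAMPLE_SHOPCHAIN_RESPONSE = {
    'data': {
        'shop': [
            {
                'merchant_id': '123456',
                'city_id': '900010000',
                'district_id': '900010001',
                'name': '...',
                'address': '...',
                'phone': '...',
                'baidu_latitude': '29.56',
                'baidu_longitude': '106.55',
                'link': 'http://www.nuomi.com/shop/123456',
            },
        ],
        'city': {
            '900010000': {
                'city_name': 'Chongqing',
                'district': {'900010001': {'dist_name': '...'}},
            },
        },
    },
}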
class ShopindexSpider(CrawlSpider):
    name = 'shopindex'
    allowed_domains = ['nuomi.com', 'dianping.com', 'cq.meituan.com']
    start_urls = [
        # 'https://www.dianping.com/shop/24098260'
        # 'http://cq.meituan.com/shop/82458075'
        # ,'http://www.nuomi.com/deal/d3ccslof.html'
        # ,'https://www.dianping.com/shop/32463358'
    ]
    # settings = get_project_settings()
    # Note: the original assigned these two paths the other way round
    # (start_url.txt as the download log); swapped here to match how they are used.
    startUrlsFile = '../hlwdata/data/start_url.txt'
    downLoadUrlsFile = '../hlwdata/data/downloaded_url.txt'
    lst = loadUrl(downLoadUrlsFile)

    rules = (
        Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/deal/[\w]+', download=lst),
             callback='parse_nuomi', follow=True),
        Rule(FilterLinkExtractor(allow=r'https://www.dianping.com/shop/[\d]+$', download=lst),
             callback='parse_dianping', follow=True),
        Rule(FilterLinkExtractor(allow=r'http://cq.meituan.com/shop/[\d]+\.*[\w]*$', download=lst),
             callback='parse_meituan', process_links='link_filtering', follow=True),
    )

    visitedShop = set()

    def link_filtering(self, links):
        # Drop a trailing '.html' suffix from meituan shop URLs.
        for link in links:
            if link.url.endswith('.html'):
                link.url = link.url[:-len('.html')]
        return links

    def start_requests(self):
        for url in loadUrl(self.startUrlsFile):
            yield self.make_requests_from_url(url)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_nuomi(self, response):
        # Only food ('美食') deals are of interest.
        prdType = response.xpath(
            '//div[@class="w-bread-crumb"]//a[@href="/326"]/text()').extract()
        prdType = "".join(prdType).strip('\n')
        if prdType != u'美食':
            return

        items = []
        sel = response.xpath('//div[@class="p-item-info"]')
        dealId = sel.xpath('@mon').extract_first().split('=')[1]
        shopUrl = 'http://www.nuomi.com/pcindex/main/shopchain?dealId=' + dealId
        html = requests.get(shopUrl, headers=headers)
        js = json.loads(html.text)
        # shopCity = js['data']['city']['900010000']['city_name']
        for shop in js['data']['shop']:
            shopId = shop['merchant_id']
            shopCity = shop['city_id']
            # Only keep Chongqing food shops:
            # if shopId in self.visitedShop or shopCity != u'900010000':
            if shopId in self.visitedShop:
                continue
            else:
                self.visitedShop.add(shopId)

            item = ShopIndexItem()
            item['shopId'] = shopId
            item['shopCity'] = js['data']['city'][shopCity]['city_name']
            item['shopName'] = shop['name']
            item['shopAddr'] = shop['address']
            item['shopPhone'] = shop['phone']
            item['shopGlat'] = shop['baidu_latitude']
            item['shopGlng'] = shop['baidu_longitude']
            item['shopUrl'] = shop['link']
            item['shopPicSave'] = ''
            item['shopScrapWeb'] = 'nuomi'
            items.append(item)
        return items

    def parse_dianping(self, response):
        sel = response.xpath('//div[@id="basic-info"]')
        # The basic-info block only appears on food shop pages; skip everything else.
        if not sel:
            print('not meishi ' + response.url)
            return
        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)
        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)
        shopCity = response.xpath(
            '//*[@id="page-header"]//a[@class="city J-city"]/text()').extract_first()
        shopName = sel.xpath('h1[@class="shop-name"]/text()').extract_first()
        shopAddr = sel.xpath('.//span[@itemprop="street-address"]/text()').extract_first()
        shopPhone = sel.xpath('.//span[@itemprop="tel"]/text()').extract_first()
        # shopDataUrl = 'http://www.dianping.com/ajax/json/shop/wizard/BasicHideInfoAjaxFP?shopId=%s' % shopId
        # htmlshop = requests.get(shopDataUrl, headers=headers)
        # try:
        #     shopJson = json.loads(htmlshop.text)
        #     shopInfo = shopJson['msg']['shopInfo']
        #     shopGlat = str(shopInfo['glat'])
        #     shopGlng = str(shopInfo['glng'])
        # except (ValueError, KeyError, TypeError):
        #     print('JSON format error')
        shopInfo = ''
        # Coordinates are embedded in an inline script on the page.
        lng = re.search(r'lng:([\d]+\.[\d]+)', response.text)
        lat = re.search(r'lat:([\d]+\.[\d]+)', response.text)
        shopGlat = ''
        shopGlng = ''
        if lng and lat:
            shopGlng = lng.group(1)
            shopGlat = lat.group(1)

        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = shopCity
        item['shopName'] = shopName.strip('\n').strip(' ').strip('\n')
        item['shopAddr'] = shopAddr.strip('\n').strip(' ').strip('\n')
        item['shopPhone'] = shopPhone
        item['shopGlat'] = shopGlat
        item['shopGlng'] = shopGlng
        item['shopUrl'] = response.url
        item['shopPicSave'] = ''
        item['shopScrapWeb'] = 'dianping'
        yield item

    def parse_meituan(self, response):
        sel = response.xpath('//div[@class="fs-section__left"]')
        # if not response.xpath('//div[@id="meishi-menu"]/h2[@class="content-title"]'):
        #     print('not meishi ' + response.url)
        #     return
        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)
        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)
        shopName = sel.xpath('.//h2/span[@class="title"]/text()').extract_first()
        shopAddr = sel.xpath('.//p/span[@class="geo"]/text()').extract_first()
        # The map widget carries a JSON blob with phone and coordinates per shop.
        shopJson = json.loads(
            sel.xpath('.//p/span[@id="map-canvas"]/@data-params').extract_first())
        shopInfo = shopJson['shops'][shopId]

        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = ''
        item['shopName'] = shopName.strip('\n').strip(' ').strip('\n')
        item['shopAddr'] = shopAddr.strip('\n').strip(' ').strip('\n')
        item['shopPhone'] = shopInfo['phone']
        item['shopGlat'] = str(shopInfo['position'][0])
        item['shopGlng'] = str(shopInfo['position'][1])
        item['shopUrl'] = response.url
        item['shopPicSave'] = ''
        item['shopScrapWeb'] = 'meituan'
        yield item
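# One way to run these spiders without the scrapy CLI, e.g. from a scheduler
# script. This is a minimal sketch assuming the standard Scrapy project layout
# (settings discoverable via scrapy.cfg); the script itself is not part of the
# original project.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    # Spider names as registered above; any subset can be scheduled.
    process.crawl('shopindex')
    process.crawl('dianpingreview')
    process.start()  # blocks until all scheduled crawls finish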