def get_food_list(category, poiInfos):
    """Convert raw POI dicts into a list of MeituanItem objects.

    Args:
        category: dish-type label copied onto every item.
        poiInfos: list of dicts with keys 'title', 'address', 'avgPrice',
            'avgScore', 'frontImg', 'allCommentNum'.

    Returns:
        list of MeituanItem, one per entry (empty list for empty input).
    """
    item_list = []
    # Iterate the dicts directly instead of indexing via range(len(...)).
    for poi in poiInfos:
        item = MeituanItem()
        item.pk_id = str(uuid.uuid1())
        item.dish_type = category
        item.restaurant_name = poi['title']
        item.location = poi['address']
        # avgPrice can be null in the feed; default it to 0.
        item.price = 0 if poi['avgPrice'] is None else int(poi['avgPrice'])
        item.star = float(poi['avgScore'])
        item.img_url = poi['frontImg']
        item.comment_num = int(poi['allCommentNum'])
        item_list.append(item)
    return item_list
def parse_comment(self, response):
    """Emit the raw comment-page body as an item and follow pagination.

    A full page carries 15 usernames; fewer than 15 means the last page
    has been reached, so no further request is scheduled.
    """
    body = response.body
    usernames = re.findall('<weak class="username">(.*?)</weak>', body)
    if not usernames:
        return
    shop_id = response.meta['shop_id']
    item = MeituanItem()
    item['content'] = body
    item['meituantype'] = shop_id
    item['dt'] = dt
    yield item
    # Only a full page (15 comments) implies there may be a next page.
    if len(usernames) == 15:
        next_page = response.meta['page'] + 1
        base = response.url.split('page_')[0]
        yield Request(base + 'page_' + str(next_page),
                      callback=self.parse_comment,
                      dont_filter=True,
                      headers=header,
                      meta={'shop_id': shop_id, 'page': next_page})
def parse_comment1(self, response):
    # Parse a JSON comment endpoint; emit the raw payload and, on the first
    # page only (page_sign truthy), schedule a request for every remaining
    # 30-item offset page.  NOTE: Python 2 code (print statement, xrange).
    content = response.body
    try:
        content_json = json.loads(content)
        item = MeituanItem()
        item['content'] = content
        shop_id = response.meta['shop_id']
        item['meituantype'] = shop_id
        yield item
        page_sign = response.meta['page_sign']
        if page_sign:
            # NOTE(review): reads the key 'content_json' from the decoded
            # payload -- possibly a typo for a total-count field; confirm
            # against the actual API response.
            total = content_json.get('content_json')
            # divmod(total, 30) -> (full pages, remainder)
            page_ = divmod(int(total), 30)
            if page_[0]:
                pages = page_[0] + 1
                for i in xrange(1, pages):
                    offset = i * 30
                    print offset
                    url = response.url
                    url = url.split('&offset=')
                    comment_url = url[0] + '&offset=%s' % offset
                    print comment_url
                    comment_url = comment_url
                    # Follow-up pages carry page_sign=0 so they do not
                    # re-trigger pagination themselves.
                    yield Request(comment_url,
                                  callback=self.parse_comment,
                                  dont_filter=True,
                                  headers=header,
                                  meta={
                                      'shop_id': shop_id,
                                      'page_sign': 0
                                  })
    except:
        # Best-effort scrape: any failure on this page is silently skipped.
        pass
def parse_shop(self, response):
    """Wrap the raw shop-page body in an item tagged with type 'shop'."""
    shop_item = MeituanItem()
    shop_item['content'] = response.body
    shop_item['meituantype'] = 'shop'
    shop_item['dt'] = dt
    yield shop_item
def parse(self, response):
    """Parse one page of hotel search results.

    Yields one MeituanItem per hotel, a Request for each hotel's photo
    endpoint, and a Request for the next result page (offset += 20).

    Fix: the comment-count regex now uses a raw string; '\\d+' as a plain
    string is an invalid escape sequence (DeprecationWarning since 3.6).
    """
    self.offset += 20
    result = json.loads(response.text)
    # Each entry describes one hotel; poiid is reused for the image endpoint.
    for hotel in result['data']['searchresult']:
        item = MeituanItem()
        item['name'] = hotel['name']
        item['poiid'] = hotel['poiid']
        # Price - "from N yuan"
        item['originalPrice'] = hotel['originalPrice']
        # Address
        item['addr'] = hotel['addr']
        # Comment count is embedded in a text blurb; extract the digits.
        item['commentsCountDesc'] = re.search(
            r'\d+', hotel['commentsCountDesc']).group()
        # Rating
        item['avgScore'] = hotel['avgScore']
        # Business district
        item['areaName'] = hotel['areaName']
        # Tags
        item['poiAttrTagList'] = hotel['poiAttrTagList']
        meta = {'name': hotel['name']}
        yield item
        # Request the hotel's photo gallery by poiid.
        get_img_url = 'https://ihotel.meituan.com/group/v1/poi/' + str(
            hotel['poiid']
        ) + '/imgs?utm_medium=touch&version_name=999.9&classified=true&X-FOR-WITH=s99Eh6sbSKFk2CtgH69UM3vUAY6t0ETpasHKECThU0OOv6duCovPaszE2v8xYNb0y5tFbd0R1Bz7HxqoWbDCnlQzCaz1U%2FWG2Is1UD6ycN%2Fv5sM9vSw%2BtMe6IAQAogdarTlC3CQPD6IfxiaVmV005g%3D%3D'
        yield scrapy.Request(get_img_url, callback=self.parse_img, meta=meta)
    # Schedule the next result page.
    next_page_url = 'https://ihotel.meituan.com/hbsearch/HotelSearch?utm_medium=pc&version_name=999.9&cateId=20&attr_28=129&uuid=4C4844EA1E383289F62BCF8C630947BC4DA3121E283B39A9EF152D2C34203C23%401533035734323&cityId=96&offset=' + str(
        self.offset
    ) + '&limit=20&startDay=20180731&endDay=20180731&q=&sort=defaults&X-FOR-WITH=14%2FqUl622CaRIB0XobpocXLZRvKEUIWbrzBk%2BTGfRG79yI5uZ60MWS%2BwGukD4LfoDQ%2BVr%2BQFEJQOTlTggX031A4wTNWNW1rEcwxOSyFjHbQK6x1dLtpcn9xIOIfd%2F%2BdCWXRVaKJX7Sc9TSdBPzjiJg%3D%3D'
    yield scrapy.Request(next_page_url, callback=self.parse)
def parse_deal(self, response):
    """Emit the raw deal-API body as an item; skip pages that aren't valid JSON.

    Fix: the bare ``except: pass`` is narrowed to ``ValueError`` (the parent
    of json.JSONDecodeError), so only the expected decode failure is
    swallowed and real bugs are no longer hidden.
    """
    content = response.body
    try:
        json.loads(content)  # validation only; the raw body is what we store
    except ValueError:
        return
    item = MeituanItem()
    item['content'] = content
    item['meituantype'] = 'deal'
    item['dt'] = dt
    yield item
def parse(self, response):
    """Walk the alphabetical city index and request each city's food page."""
    area_divs = response.xpath("//div[@class='alphabet-city-area']//div")
    for area in area_divs:
        for anchor in area.xpath("./span[2]//a"):
            item = MeituanItem()
            item["city"] = anchor.xpath("./text()").extract_first()
            href = anchor.xpath("./@href").extract_first()
            # Hrefs are relative; resolve against the current page URL.
            absolute = urljoin(response.url, href)
            yield scrapy.Request(url=absolute + "/meishi/",
                                 callback=self.parse_url,
                                 meta={"item": item})
def parse_shop(self, response):
    """Emit one item per shop found in the search-result JSON.

    Bug fix: the original created a single MeituanItem before the loop and
    mutated it on every iteration, so every yielded item was the same object
    and downstream consumers saw only the last shop's data. A fresh item is
    now created per shop. The bare ``except`` is also narrowed to the
    failures the fallback message is about (bad JSON / missing keys).
    """
    try:
        res_json = json.loads(response.text)
        result_list = res_json["data"]["searchresult"]
        city = response.meta["city"]
        for result in result_list:
            item = MeituanItem()  # one item per shop, not shared
            item["shop_id"] = result["poiid"]
            item["title"] = result["name"]
            item["city"] = city
            item["phone"] = result["phone"]
            yield item
    except (ValueError, KeyError):
        print("返回内容无数据,内容为:", response.text)
def parse(self, response):
    """Scrape recommended movies from the page and emit one item per title.

    Cleanup: removed the unused local ``count`` and dead commented-out
    debug code; merged the two image-URL extractions.
    """
    movies = response.xpath('//a[@class="reco-movieinfo__cover"]')
    titles = movies.xpath('@title').extract()
    rates = movies.xpath("string(strong)").extract()
    # The first five covers use a plain src; lazy-loaded ones use data-src.
    img_urls = (movies.xpath('img/@src').extract()[0:5]
                + movies.xpath('img/@data-src').extract())
    # zip() stops at the shortest sequence, keeping the three lists aligned.
    for title, rate, img_url in zip(titles, rates, img_urls):
        item = MeituanItem()
        item['title'] = title
        item['rate'] = rate
        item['source'] = 'meituan'
        item['img_url'] = img_url
        yield item
def parse(self, response):
    """Parse the JSON blob embedded in a restaurant detail page into an item.

    Bug fixes:
    - Every field assignment ended with a trailing comma
      (``item['area'] = area,``), which silently stored a 1-tuple instead
      of the value itself; the commas are removed.
    - The recommended-dish fields were written through ``global`` variables
      and raised NameError when the dish list was empty; they are now plain
      locals initialised to ''. As before, only the LAST dish in the list
      is kept.
    """
    response_str = response.body_as_unicode()
    detail_info = re.search(
        r'"detailInfo":\{"poiId":(\d+),"name":"(.*?)","avgScore":(.*?),"address":"(.*?)","phone":"(.*?)","openTime":"(.*?)","extraInfos":\[(.*?)\],"hasFoodSafeInfo":(.*?),"longitude":(.*?),"latitude":(.*?),"avgPrice":(\d+),"brandId":(\d+),"brandName":"(.*?)",".*?photos":{"frontImgUrl":"(.*?)","albumImgUrls":(.*?)},"recommended":(.*?),"crumbNav":(.*?),"prefer',
        response_str)
    if not detail_info:
        return
    poiId = detail_info.group(1)
    name = detail_info.group(2)
    avgScore = detail_info.group(3)
    address = detail_info.group(4)
    phone = detail_info.group(5)
    openTime = detail_info.group(6)
    extraInfos = detail_info.group(7)
    hasFoodSafeInfo = detail_info.group(8)
    longitude = detail_info.group(9)
    latitude = detail_info.group(10)
    avgPrice = detail_info.group(11)
    brandId = detail_info.group(12)
    brandName = detail_info.group(13)
    frontImgUrl = detail_info.group(14)
    albumImgUrls = detail_info.group(15)
    # Flatten the extra-info entries into one space-separated text blob.
    if extraInfos:
        extra_entries = json.loads("[" + extraInfos + "]")
        extraInfos = ''
        for entry in extra_entries:
            extraInfos = entry.get('text') + ' ' + extraInfos
    # Recommended dishes: keep the last dish's fields (as the original did),
    # defaulting to '' instead of crashing when the list is empty.
    recommend_name = recommend_price = recommend_img = ''
    for dish in json.loads(detail_info.group(16)):
        recommend_name = dish['name']       # dish name
        recommend_price = dish['price']     # dish price
        recommend_img = dish['frontImgUrl'] # dish image
    # Breadcrumb: strip a 2-char suffix from the first crumb to get the
    # area, then strip the area prefix from the cuisine crumb.
    crumbNav = json.loads(detail_info.group(17))
    area = crumbNav[0].get('title')[:-2]
    food_type = crumbNav[2].get('title')[len(area):]
    item = MeituanItem()
    item['area'] = area
    item['food_type'] = food_type
    item['poiId'] = poiId
    item['name'] = name
    item['avgScore'] = avgScore
    item['address'] = address
    item['phone'] = phone
    item['openTime'] = openTime
    item['extraInfos'] = extraInfos
    item['hasFoodSafeInfo'] = hasFoodSafeInfo
    item['longitude'] = longitude
    item['latitude'] = latitude
    item['avgPrice'] = avgPrice
    item['brandId'] = brandId
    item['brandName'] = brandName
    item['frontImgUrl'] = frontImgUrl
    item['albumImgUrls'] = albumImgUrls
    item['recommend_name'] = recommend_name
    item['recommend_price'] = recommend_price
    item['recommend_img'] = recommend_img
    yield item