def detail_parse(self, response):
    """Parse a Yelp search-results page and yield one YelpItem per listing.

    Each item is tagged with a per-city CSV path built from the crawl date,
    the spider's domain, and the city extracted from the page URL.
    """
    # Fixed: original used the Python 2 print statement (`print response.url`),
    # which is a syntax error under Python 3 (the rest of this file is py3).
    print(response.url)
    city = self._get_city_from_url(response.url)
    filename = "{0}/{1}/{2}/{3}.csv".format(
        DATA_BASE_PATH, self.data_date, self.allowed_domains[0], city)

    def _clean(values):
        # Strip embedded newlines/tabs and surrounding whitespace from
        # every extracted text fragment (shared by name/address/phone/categories).
        return [v.replace('\n', '').replace('\t', '').strip() for v in values]

    contexts = response.xpath(
        '//li[@class="regular-search-result"]/div').extract()
    for context_html in contexts:
        s = Selector(text=context_html)
        item = YelpItem()
        item['filename'] = filename
        item['name'] = _clean(s.xpath(
            '//a[@class="biz-name js-analytics-click"]//text()').extract())
        item['address'] = _clean(s.xpath('//address/text()').extract())
        # Fixed: original XPath started with an invalid triple slash
        # ('///span[...]'), which raises an XPath evaluation error.
        item['phone'] = _clean(s.xpath(
            '//span[@class="biz-phone"]/text()').extract())
        item['categories'] = _clean(s.xpath(
            '//span[@class="category-str-list"]/a/text()').extract())
        # NOTE(review): [0] assumes every result card has a photo — confirm.
        item['img'] = s.xpath(
            '//div[@class="photo-box pb-90s"]/a/img/@src').extract()[0]
        yield item
def parse(self, response, **kwargs):
    """Parse a Yelp business page by mining the inline '{"gaConfig' JSON.

    Populates a YelpItem via ItemLoader, then chains a request to the
    biz_attribute endpoint, carrying the partial item in ``meta``.
    """
    loader = ItemLoader(item=YelpItem(), response=response)
    loader.default_output_processor = TakeFirst()
    for script in response.css('script').getall():
        if '{"gaConfig' in script:
            # The JSON blob is embedded in an HTML comment ending with '-->'.
            detail_json = json.loads(
                re.search(r'({"gaConfig.*?)-->', script).group(1))
            contact = detail_json['bizDetailsPageProps']['bizContactInfoProps']
            loader.add_value('direct_url', detail_json['staticUrl'])
            loader.add_value('business_id', contact['businessId'])
            loader.add_value(
                'categories',
                detail_json['gaConfig']['dimensions']['www']
                ['second_level_categories'][1])
            # Fixed: businessWebsite is null for businesses without a site,
            # so indexing ['linkText'] raised TypeError.  Guard first
            # (matches the sibling parse variant in this file).
            website = contact.get('businessWebsite')
            if website:
                loader.add_value('site', website['linkText'])
            loader.add_value(
                'title', detail_json['bizDetailsPageProps']['businessName'])
            loader.add_value(
                'review_count',
                detail_json['bizDetailsPageProps']['ratingDetailsProps']
                ['numReviews'])
    yield scrapy.Request(
        'https://www.yelp.com/biz_attribute?biz_id={}'.format(
            "".join(loader.get_output_value('business_id'))),
        method='GET',
        callback=self.linkedData,
        meta={'item': loader.load_item()})
def parse(self, response):
    """Crawl a search-results page.

    Follows every business link into ``parse_items`` (carrying a
    partially-filled YelpItem in meta) and follows the 'next' pagination
    link back into this method.
    """
    selector = Selector(response)
    host = urlparse(response.url).hostname
    for anchor in selector.xpath('//a[@class="biz-name"]'):
        biz_name = ''.join(anchor.xpath('text()').extract()).strip()
        # A name filter ("Vietthao") used to live here; every name now passes.
        href = ''.join(anchor.xpath('@href').extract()).strip()
        item = YelpItem()
        item['name'] = biz_name
        yield Request("http://%s%s" % (host, href),
                      meta={'item': item},
                      callback=self.parse_items)
    pagination_hrefs = selector.xpath(
        '//a[@class="page-option prev-next next"]/@href').extract()
    for next_href in pagination_hrefs:
        yield Request("http://%s%s" % (host, next_href), callback=self.parse)
def parse(self, response, **kwargs):
    """Parse a business page via the hypernova data blocks.

    Derives the hypernova key from the error-reporting release hash, pulls
    the ld+json-style block (telephone) and the BizDetailsApp block, fills a
    YelpItem, then chains a POST for the amenities GraphQL endpoint.

    Raises:
        CloseSpider: when the release/page hash cannot be found.
    """
    page_hash_script = response.css(
        '#yelp-js-error-reporting-init-error-reporting::text').get()
    page_hash = json.loads(page_hash_script).get('config', {}).get('release')
    if not page_hash:
        raise CloseSpider('Have no page hash')
    data_key = "yelp_main__{}__yelp_main__BizDetailsApp__dynamic".format(
        page_hash)
    data_block = response.xpath(
        '//div[@data-hypernova-key="{}"]'.format(data_key))
    main_data = data_block.xpath(
        'script[contains(text(), "telephone")]/text()').get()
    main_data = json.loads(main_data)
    item = YelpItem()
    item['name'] = main_data.get('name')
    item['url'] = response.url
    item['email'] = None
    item['address'] = main_data['address']
    item['rating'] = main_data['aggregateRating']['ratingValue']
    item['reviews_count'] = main_data['aggregateRating']['reviewCount']
    cat_block = response.xpath(
        '//script[@data-hypernova-key="{}"]/text()'.format(data_key)).get()
    # Strip the surrounding '<!--' / '-->' comment wrapper characters.
    cat_data = json.loads(cat_block[4:-3])
    business_id = cat_data['bizDetailsPageProps']['claimStatusGQLProps'][
        'businessId']
    item['id'] = business_id
    item['categories'] = cat_data['gaConfig']['dimensions']['www']['second_level_categories'] + \
        cat_data['gaConfig']['dimensions']['www']['top_level_categories']
    # Fixed: 'businessWebsite' may be present but null; dict.get's default is
    # NOT used then, so `.get('businessWebsite', {})` returned None and
    # `.get('linkText')` raised AttributeError.  `or {}` covers both cases.
    item['business_website'] = (cat_data['bizDetailsPageProps']
                                ['bizContactInfoProps']
                                .get('businessWebsite') or {}).get('linkText')
    item['work_schedule'] = cat_data['bizDetailsPageProps'][
        'bizHoursProps']['hoursInfoRows']
    # NOTE(review): indexed directly; a commented-out variant used .get() —
    # confirm whether fromTheBusinessProps can be absent.
    item['about_business'] = cat_data['bizDetailsPageProps'][
        'fromTheBusinessProps']
    item['main_image'] = cat_data['bizDetailsPageProps'][
        'photoHeaderProps']['photoHeaderMedias']
    item['phone'] = cat_data['bizDetailsPageProps'][
        'bizContactInfoProps'].get('phoneNumber')
    self.AMENITIES_DATA[0]['variables']['BizEncId'] = business_id
    yield scrapy.http.JsonRequest(self.AMENITIES_URL,
                                  data=self.AMENITIES_DATA,
                                  callback=self.parse_amenities,
                                  method='POST',
                                  meta={'item': item})
def parse_result_page(self, response):
    """Extract review-rating aria-labels and the query city from a results
    page, yielding one YelpItem per review.

    Fixed: removed the stray debug ``print(k)`` left in the loop.
    """
    reviews = response.xpath(
        '//span[@class="lemon--span__373c0__3997G display--inline__373c0__1DbOG border-color--default__373c0__2oFDT"]/div/@aria-label'
    ).extract()
    city = response.xpath(
        '//span[@class="queryLocation__373c0__15viw"]/text()').extract()
    # NOTE(review): starts at index 1, skipping reviews[0] — presumably the
    # page's own aggregate rating rather than a user review; confirm before
    # changing the range.
    for k in range(1, len(reviews)):
        item = YelpItem()
        item['reviews'] = reviews[k]
        item['city'] = city
        yield item
def parse(self, response):
    """Fill a YelpItem loader from three embedded page payloads:

    * the ld+json script (name, phone, image, address, aggregate rating),
    * the BizDetailsApp hypernova script (categories, "from the business"),
    * the apollo-state script (stored raw in ``_properties``).
    """
    loader = ItemLoader(item=YelpItem(), response=response)
    data = response.css("script[type='application/ld+json']::text").get()
    data = json.loads(data)
    app_data = response.xpath(
        "//script[contains(@data-hypernova-key, 'BizDetailsApp')]/text()"
    ).get()
    # The hypernova payload is wrapped in an HTML comment; strip those chars.
    app_data = json.loads(app_data.strip('--><!--'))
    # Raw apollo-state JSON, kept unparsed.
    _properties = response.css("script[data-apollo-state]::text").get()
    categories = app_data['adSyndicationConfig'].get('categoryAliases')
    from_the_biz = app_data['bizDetailsPageProps'].get(
        'fromTheBusinessProps')
    rating_review = data.get('aggregateRating')
    address = data.get('address')
    schedule_sel = response.css("tbody.lemon--tbody__373c0__2T6Pl tr")
    schedule_d = self.get_schedule(schedule_sel)
    # "From the business" section is optional; only add 'about' when present.
    if from_the_biz is not None:
        specialties = from_the_biz['fromTheBusinessContentProps'][
            'specialtiesText']
        history = from_the_biz['fromTheBusinessContentProps'][
            'historyText']
        year_established = from_the_biz['fromTheBusinessContentProps'][
            'yearEstablished']
        about = self.get_about_text(specialties, history, year_established)
        loader.add_value('about', about)
    # Postal address sub-fields, each optional.
    if address is not None:
        loader.add_value('geo_street', address.get('streetAddress'))
        loader.add_value('geo_city', address.get('addressLocality'))
        loader.add_value('geo_state', address.get('addressRegion'))
        loader.add_value('geo_country', address.get('addressCountry'))
        loader.add_value('geo_post_code', address.get('postalCode'))
    if rating_review is not None:
        loader.add_value('rating', float(rating_review.get('ratingValue')))
        loader.add_value('reviews_count', rating_review.get('reviewCount'))
    loader.add_value('schedule_d', schedule_d)
    loader.add_value('name', data.get('name'))
    loader.add_value('url', response.request.url)
    loader.add_xpath('biz_id', "//a[contains(@href, 'biz_id=')]/@href")
    loader.add_value('image', data.get('image'))
    loader.add_value('phone', data.get('telephone'))
    loader.add_value('categories', categories)
    loader.add_xpath('link', "//a[contains(@href, '/biz_redir?')]/@href")
    loader.add_value('_properties', _properties)
    yield loader.load_item()
def parse_item(self, response):
    """Resolve a business's external website from its Yelp redirect link.

    Yelp links out through a redirect URL carrying the real site in the
    ``url`` query parameter; extract it when the biz-website anchor exists.
    """
    hxs = HtmlXPathSelector(response)
    item = YelpItem()
    website_div = hxs.select("//div[@class='biz-website']")
    yelp_redirect_url = website_div.select("a/@href").extract()
    # Truthiness instead of `!= []`; same branch taken for non-empty lists.
    if yelp_redirect_url:
        site_parse = urlparse(''.join(yelp_redirect_url))
        site_qs = parse_qs(site_parse.query)
        # Fixed: site_qs['url'] raised KeyError when the href carried no
        # 'url' parameter; .get() falls back to an empty join ('').
        site_url = site_qs.get('url', [])
        item['external_website'] = ''.join(site_url)
    yield item
def parse(self, response, **kwargs):
    """Populate a YelpItem from the inline '{"gaConfig' JSON blob, then
    chain a GraphQL batch POST for the business's linked data, passing the
    loaded item along in ``meta``."""
    loader = ItemLoader(item=YelpItem(), response=response)
    for script_text in response.css('script').getall():
        if '{"gaConfig' not in script_text:
            continue
        blob = json.loads(
            re.search(r'({"gaConfig.*?)-->', script_text).group(1))
        page_props = blob['bizDetailsPageProps']
        contact_props = page_props['bizContactInfoProps']
        loader.add_value('direct_url', blob['staticUrl'])
        loader.add_value('business_id', contact_props['businessId'])
        loader.add_value(
            'categories',
            blob['gaConfig']['dimensions']['www']
            ['second_level_categories'][1])
        # businessWebsite is null for businesses without a site.
        if contact_props['businessWebsite']:
            loader.add_value('site',
                             contact_props['businessWebsite']['linkText'])
        loader.add_value('title', page_props['businessName'])
        loader.add_value('review_count',
                         page_props['ratingDetailsProps']['numReviews'])
    #TODO: find way to not use hardcoded documentIds
    biz_enc_id = "".join(loader.get_output_value('business_id'))
    post_data = [{
        "operationName": "getLocalBusinessJsonLinkedData",
        "variables": {"BizEncId": biz_enc_id},
        "extensions": {
            "documentId": "1cf362b8e8f9b3dae26d9f55e7204acd8355c916348a038f913845670139f60a"
        },
    }]
    yield scrapy.Request('https://www.yelp.com/gql/batch',
                         method='POST',
                         body=json.dumps(post_data),
                         headers={'Content-Type': 'application/json'},
                         callback=self.linkedData,
                         meta={'item': loader.load_item()})
def parse_restaurant_reviews_page(self, response):
    """Yield one YelpItem per review on a restaurant page, with the
    restaurant-level fields (name, address, price) repeated on each item."""
    reviews = response.xpath('//div[@class = "review review--with-sidebar"]')
    # NOTE(review): .extract_first() returns None when the node is missing,
    # so .strip() would raise AttributeError — confirm these are mandatory.
    restaurant = response.xpath('//div[@class = "biz-page-header-left claim-status"]/div/h1/text()').extract_first().strip()
    address = response.xpath('//div[@class="mapbox"]//address/text()').extract_first().strip()
    price = response.xpath('//span[@class="business-attribute price-range"]/text()').extract_first()
    for review in reviews:
        # NOTE(review): [0] keeps only the FIRST CHARACTER of the @title
        # (e.g. '5' out of '5.0 star rating') — presumably intentional to
        # get the integer star digit; confirm before changing.
        rating = review.xpath('.//div[@class="biz-rating biz-rating-large clearfix"]/div/div/@title').extract_first()[0]
        text = review.xpath('.//p[@lang="en"]/text()').extract()
        date = review.xpath('.//span[@class="rating-qualifier"]/text()').extract_first().strip()
        item = YelpItem()
        item['restaurant'] = restaurant
        item['rating'] = rating
        item['text'] = text
        item['date'] = date
        item['address'] = address
        item['price'] = price
        yield item
def parse(self, response):
    """Build one YelpItem from a business page's embedded JSON payloads.

    Reads the yelp-biz-id meta tag, the first ld+json script (general
    data) and two application/json scripts (biz details), then loads all
    fields via an ItemLoader and returns the finished item.
    """
    meta_bizid = response.xpath(
        '/html/head/meta[@name="yelp-biz-id"]/@content').get()
    data_content = response.xpath(
        '//script[@type="application/ld+json"]//text()').getall()
    data_json = response.xpath(
        '//script[@type="application/json"]//text()').getall()
    general_data = json.loads(data_content[0])
    # NOTE(review): indices [2]/[3] assume a fixed script order — confirm.
    business_data1 = self._prepare_json(data_json[2])
    business_data2 = self._prepare_json(data_json[3])
    biz_details1 = json.loads(business_data1)["bizDetailsPageProps"]
    biz_details2 = json.loads(business_data2)
    # Renamed from single-letter 'l' (ambiguous, E741).
    loader = ItemLoader(item=YelpItem(), response=response)
    loader.add_value("name", general_data["name"])
    loader.add_value("item_url", response.url)
    loader.add_value("biz_id", meta_bizid)
    loader.add_xpath("image", '/html/head/meta[@property="og:image"]/@content')
    # dict.get() replaces the `"key" in d.keys()` ternaries; same None default.
    loader.add_value("phone", general_data.get("telephone"))
    loader.add_value("email", general_data.get("email"))
    loader.add_value("address", general_data["address"])
    loader.add_value("rating_value",
                     general_data["aggregateRating"]["ratingValue"])
    loader.add_value("review_count",
                     general_data["aggregateRating"]["reviewCount"])
    loader.add_value("categories", self._get_categories(data=data_content))
    loader.add_value("home_url", self._get_homeurl(data=biz_details1))
    # String-encoded GraphQL-style argument keys used by the helpers below.
    encid = "{'encid':'" + meta_bizid + "'}"
    client_platform = "{'clientPlatform':'WWW'}"
    loader.add_value("hours", self._get_hours(encid=encid, data=biz_details2))
    loader.add_value("about", self._get_about(data=biz_details1))
    loader.add_value(
        "amenities",
        self._get_amenities(encid=encid,
                            client_platform=client_platform,
                            data=biz_details2),
    )
    return loader.load_item()
def parse_gym_reviews_page(self, response):
    """Scrape gym-level info plus every review on the page, yielding one
    YelpItem per review (the gym fields are repeated on each item).

    Fixed: the bare ``except:`` around the review-count parse now catches
    only the failures it was written for (missing node → TypeError,
    non-numeric text → ValueError).
    """
    reviews = response.xpath('//li [@class="lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU"]')
    gym = response.xpath('//h1[@class="lemon--h1__373c0__2ZHSL heading--h1__373c0__dvYgw undefined heading--inline__373c0__10ozy"]/text()').extract_first()
    address = response.xpath('//span [@class="lemon--span__373c0__3997G raw__373c0__3rcx7"]/text()').extract_first()
    # NOTE(review): r'9\d{4}' only matches California-style ZIP codes;
    # generalize before reusing for other regions.
    zipcode = re.findall(r'9\d{4}', ",".join(response.xpath('//span [@class="lemon--span__373c0__3997G raw__373c0__3rcx7"]/text()').extract()))[0]
    category = re.findall(r'(\w+\s?\w+)', ",".join(response.xpath('//span [@class="lemon--span__373c0__3997G display--inline__373c0__3JqBP margin-r1__373c0__zyKmV border-color--default__373c0__3-ifU"]//text()').extract()))
    try:
        num_review = int(re.search(r'(\d*\.?\d*)', response.xpath('//p [@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"]/text()').extract_first()).group(0))
    except (TypeError, ValueError):  # was a bare except
        num_review = 0
    about = response.xpath('//div [@class="lemon--div__373c0__1mboc margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]/p/span//text()').extract_first()
    region = response.xpath('//div [@class="lemon--div__373c0__1mboc pseudoIsland__373c0__Fak5q"]//p [@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"]/text()').extract_first()
    # aria-label looks like "<rating> star rating"; keep the leading number.
    avg_rating = float(response.xpath('//span [@class="lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU"]/div/@aria-label').extract_first().split()[0])
    for review in reviews:
        user_name = review.xpath('.//div [@class="lemon--div__373c0__1mboc border-color--default__373c0__3-ifU"]//a [@class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"]/text()').extract_first()
        # The profile href carries the user id after '='; partition keeps
        # everything following the first '='.
        user_id = review.xpath('.//div [@class="lemon--div__373c0__1mboc border-color--default__373c0__3-ifU"]//a [@class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"]/@href').extract_first().partition("=")[2]
        rating = int(review.xpath('.//div [@class="lemon--div__373c0__1mboc arrange-unit__373c0__o3tjT arrange-unit-grid-column--8__373c0__2dUx_ border-color--default__373c0__3-ifU"]//div/@aria-label').extract_first().split()[0])
        text = review.xpath('.//div [@class ="lemon--div__373c0__1mboc arrange-unit__373c0__o3tjT arrange-unit-grid-column--8__373c0__2dUx_ border-color--default__373c0__3-ifU"]//p [@class= "lemon--p__373c0__3Qnnj text__373c0__2Kxyz comment__373c0__3EKjH text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"]/span [@class="lemon--span__373c0__3997G raw__373c0__3rKqk"]/text()').extract_first()
        reviewer_date = review.xpath('.//span [@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-"]/text()').extract_first()
        item = YelpItem()
        item["gym"] = gym
        item["zipcode"] = zipcode
        item["address"] = address
        item["category"] = category
        item["about"] = about
        item["region"] = region
        item["num_review"] = num_review
        item["avg_rating"] = avg_rating
        item["user_name"] = user_name
        item["user_id"] = user_id
        item["rating"] = rating
        item["text"] = text
        item["reviewer_date"] = reviewer_date
        yield item
def parse(self, response):
    """Scrape each listing row on a search-results page and follow its
    detail link into ``parse_detail``, carrying the partially-filled
    YelpItem along in ``meta``."""
    listing_rows = response.xpath(
        '//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/ul/li')
    for listing in listing_rows:
        item = YelpItem()
        # Business name
        item['name'] = listing.xpath('.//p/a/text()').extract_first()
        # Phone number
        item['phone'] = listing.xpath(
            './/div[1]/p[1][@class= "lemon--p__373c0__3Qnnj text__373c0__2pB8f text-color--normal__373c0__K_MKN text-align--right__373c0__3ARv7"]/text()'
        ).extract_first()
        # Neighbourhood / area
        item['area'] = listing.xpath(
            './/p/span[@class = "lemon--span__373c0__3997G"]/text()'
        ).extract_first()
        # Offered services
        item['services'] = listing.xpath(
            './/a[@class="lemon--a__373c0__IEZFH link__373c0__29943 link-color--inherit__373c0__15ymx link-size--default__373c0__1skgq"]/text()'
        ).extract()
        # Absolute URL of the business detail page
        detail_url = response.urljoin(
            listing.xpath('.//p/a/@href').extract_first())
        item['link'] = detail_url
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
def parse_page(self, response):
    """Scrape one business page: name, phone, website, open status, postal
    code, star rating, and the weekly opening hours joined into one string.

    Fixed: the old ``days[0]..days[6]`` / ``hrs[0]..hrs[6]`` indexing
    raised IndexError whenever the hours table had fewer than seven rows;
    ``zip`` pairs whatever rows exist (identical output for full tables).
    """
    Names = response.css(
        "div.lemon--div__373c0__1mboc.margin-b1__373c0__1khoT.border-color--default__373c0__3-ifU>h1.lemon--h1__373c0__2ZHSL.heading--h1__373c0___56D3.undefined.heading--inline__373c0__1jeAh::text"
    ).extract_first()
    Phone = response.xpath(
        "//p[contains(text(), 'Phone number')]/following-sibling::p/text()"
    ).extract_first()
    Website = response.xpath("//a[@rel='noopener']/text()").extract_first()
    Open_Status = response.xpath(
        "//span[contains(@class, 'status')]/text()").extract_first()
    Postal_code = response.xpath("//address/p/span/text()").extract()
    rating = response.css(
        "div.lemon--div__373c0__1mboc.i-stars__373c0__1T6rz.i-stars--large-1__373c0__1kclN.border-color--default__373c0__3-ifU.overflow--hidden__373c0__2y4YK::attr(aria-label)"
    ).extract_first()
    days = response.xpath(
        "//tbody[@class='lemon--tbody__373c0__2T6Pl']/tr/th/p/text()"
    ).getall()
    hrs = response.xpath(
        "//tbody[@class='lemon--tbody__373c0__2T6Pl']//p[contains(@class, 'no-wrap')]/text()"
    ).getall()
    # One "Day--Hours" line per table row.
    total = '\n'.join(day + '--' + hr for day, hr in zip(days, hrs))
    loader = ItemLoader(item=YelpItem())
    loader.add_value("Business_Name", Names)
    loader.add_value("Phone", Phone)
    loader.add_value("Website", Website)
    loader.add_value("Open_Status", Open_Status)
    loader.add_value("Postal_Code", Postal_code)
    loader.add_value("Rating", rating)
    loader.add_value("Open_Hours", total)
    return loader.load_item()
def get_review(self, response):
    """Yield one item with the business name, address, review dates and
    review texts from a business page.

    Fixed: the original assigned ``name = 'none'`` / ``address = 'none'``
    when extraction was empty, then unconditionally overwrote both with
    ``' '.join(...)`` — so empty pages produced ``''``, never ``'none'``.
    The fallback now actually applies.
    """
    item = YelpItem()
    name_list = response.css(
        'h1[class="biz-page-title embossed-text-white shortenough"]::text'
    ).extract()
    name = ' '.join(name_list) if name_list else 'none'
    address_list = response.css('strong[class="street-address"]').css(
        'address::text').extract()
    address = ' '.join(address_list) if address_list else 'none'
    date = response.css(
        'span[class="rating-qualifier"]::text').extract_first()
    review = response.css('li').css('div').css(
        'p[lang = "en"]::text').extract()
    item['name'] = name
    item['address'] = address
    item['date'] = date
    item['review'] = review
    yield item
def parse_business_page(self, response):
    """Scrape a full business page into one YelpItem: header info, hours,
    the most recent review ratings/dates, and COVID-19 service flags.

    Fixed: two bare ``except:`` clauses narrowed to the failures they
    guard (missing node → TypeError, no regex match → IndexError, bad
    number → ValueError); regex patterns made raw strings; ``!= None``
    comparisons replaced with ``is not None``; removed the unused
    ``pairs`` extraction.
    """
    restaurant_name = response.xpath(
        '//h1[@class="lemon--h1__373c0__2ZHSL heading--h1__373c0__dvYgw undefined heading--inline__373c0__10ozy"]/text()'
    ).extract_first()
    try:
        avg_rating = response.xpath(
            '//div[@class="lemon--div__373c0__1mboc arrange__373c0__2C9bH gutter-1-5__373c0__2vL-3 vertical-align-middle__373c0__1SDTo margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]/div/span/div/@aria-label'
        ).extract_first()
        avg_rating = float(
            re.findall(r'(\d?\.?\d) star rating', avg_rating)[0])
    except (TypeError, IndexError, ValueError):
        avg_rating = None
        print('=' * 50)
        print(f'Error with avg_rating at url: {response.url}')
        print('=' * 50)
    try:
        num_reviews = response.xpath(
            '//p[@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"]/text()'
        ).extract_first()
        num_reviews = int(re.findall(r'(\d+) review[s]?', num_reviews)[0])
    except (TypeError, IndexError, ValueError):
        num_reviews = 0
        print('=' * 50)
        print(f'Error with num_reviews at url: {response.url}')
        print('=' * 50)
    phone_num = response.xpath(
        '//div[@class="lemon--div__373c0__1mboc arrange__373c0__2C9bH gutter-2__373c0__1DiLQ vertical-align-middle__373c0__1SDTo border-color--default__373c0__3-ifU"]//p[@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"]/text()'
    ).extract_first()
    address = response.xpath(
        '//address[@class="lemon--address__373c0__2sPac"]//span/text()'
    ).extract()
    address = ', '.join(address)
    days = response.xpath(
        '//table[@class="lemon--table__373c0__2clZZ hours-table__373c0__1S9Q_ table__373c0__3JVzr table--simple__373c0__3lyDA"]//tr'
    )
    # Map weekday label -> first hours entry for that row.
    hours_dict = {
        day.xpath('./th/p/text()').extract_first():
        day.xpath('./td/ul/li/p/text()').extract_first()
        for day in days
    }
    price_range = response.xpath(
        '//span[@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-bullet--after__373c0__3fS1Z text-size--large__373c0__3t60B"]/text()'
    ).extract_first()
    category = response.xpath(
        '//span[@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--black-extra-light__373c0__2OyzO text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"]/a/text()'
    ).extract()
    review_rating = response.xpath(
        '//div[@class="lemon--div__373c0__1mboc margin-t1__373c0__oLmO6 margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]/div/div/span/div/@aria-label'
    ).extract()
    review_rating = review_rating[:-1]  # drop trailing non-chronological review
    review_date = response.xpath(
        '//span[@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-"]/text()'
    ).extract()
    review_date = review_date[:-1]  # drop trailing non-chronological review
    recent_reviews = list(zip(review_rating, review_date))
    covid_updates_text = response.xpath(
        '//div[@class="lemon--div__373c0__1mboc margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]//p/text()'
    ).extract_first()
    covid_update_time = response.xpath(
        '//p[@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--subtle__373c0__3DZpi text-align--left__373c0__2XGa-"]/text()'
    ).extract_first()
    # Service name -> whether its icon is a checkmark (offered or not).
    covid_services = {}
    for x in response.xpath(
            '//div[@class="lemon--div__373c0__1mboc margin-t2__373c0__1CFWK border-color--default__373c0__3-ifU"]//div[@class="lemon--div__373c0__1mboc display--inline-block__373c0__1ZKqC margin-r3__373c0__r37sx margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]'
    ).getall():
        service = re.search(r'<span.*?text.*?([A-Za-z\s\-]*)<\/span', x)
        if service is not None:
            covid_services[service.group(1)] = re.search("checkmark",
                                                         x) is not None
    item = YelpItem()
    item['restaurant_name'] = restaurant_name
    item['avg_rating'] = avg_rating
    item['num_reviews'] = num_reviews
    item['phone_num'] = phone_num
    item['address'] = address
    item['business_hours'] = hours_dict
    item['price_range'] = price_range
    item['category'] = category
    item['recent_reviews'] = recent_reviews
    item['covid_updates_text'] = covid_updates_text
    item['covid_update_time'] = covid_update_time
    item['covid_services'] = covid_services
    item['location'] = response.meta['location']
    item['cuisine'] = response.meta['cuisine']
    item['url'] = response.meta['url']
    yield item
def parse1(self, response):
    """Scrape an Oakland, CA business detail page into one YelpItem.

    Relies on positional XPath fallbacks (li[2]/li[3], li[3]/li[4]) because
    the contact list's row order varies between pages.
    """
    item = YelpItem()
    # Detail page URL (passed along by the listing-page callback).
    item['detail_page_url'] = response.meta['aurl']
    # City (this spider only covers Oakland, CA).
    item['city'] = 'Oakland, CA'
    # Name: the <h1> sits at either of two depths depending on page layout;
    # at most one of the two joins is non-empty.
    name1 = ''.join(response.xpath('//div[@class="biz-page-header-left claim-status"]/div/h1/text()').extract())
    name2 = ''.join(response.xpath('//div[@class="biz-page-header-left claim-status"]/div/div/h1/text()').extract())
    item['name'] = name1 + name2
    # Phone: try row li[2], then li[3]; empty string when neither matches.
    if response.xpath('//div[@class="mapbox-text"]/ul/li[2]/span[3]/text()'):
        item['tel'] = ''.join(response.xpath('//div[@class="mapbox-text"]/ul/li[2]/span[3]/text()').extract()).replace("\n", "").replace(" ", "")
    elif response.xpath('//div[@class="mapbox-text"]/ul/li[3]/span[3]/text()'):
        item['tel'] = ''.join(response.xpath('//div[@class="mapbox-text"]/ul/li[3]/span[3]/text()').extract()).replace("\n", "").replace(" ", "")
    else:
        item['tel'] = ''
    # Street.
    item['street'] = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/strong/text()').extract()).strip().replace("\n", "")
    # Address: <address> may or may not be wrapped in <strong>.
    if response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/address'):
        item['address'] = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/address/text()').extract()).strip().replace("\n", "")
    elif response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/strong'):
        item['address'] = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/strong/address/text()').extract()).strip().replace("\n", "")
    else:
        item['address'] = ''
    # Official website: pulled from the biz_redir link's percent-encoded
    # query, keeping the segment after the last '2F' ("%2F" = '/').
    # NOTE(review): the extracted website is stored in the 'email' field —
    # presumably a legacy field name; confirm against the item definition.
    if response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[3]/span[2]/a/@href'):
        email = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[3]/span[2]/a/@href').extract())
        email = email.split('&website_link')[0].split('2F')[-1]
        item['email'] = email
    elif response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[4]/span[2]/a/@href'):
        email = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[4]/span[2]/a/@href').extract())
        email = email.split('&website_link')[0].split('2F')[-1]
        item['email'] = email
    else:
        item['email'] = ''
    # Category: three possible markup shapes; note 'forms' stays unset when
    # none of the branches match (no else clause).
    if response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span[2]/a/text()'):
        item['forms'] = ''.join(response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span[2]').xpath('string(.)').extract()).replace("\n", "").replace(" ", "")
    elif response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span/a/text()'):
        item['forms'] = ''.join(response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span').xpath('string(.)').extract()).replace("\n", "").replace(" ", "")
    elif response.xpath('//div[@class="biz-main-info embossed-text-white"]/div/span/text()'):
        item['forms'] = ''.join(response.xpath('//div[@class="biz-main-info embossed-text-white"]/div/span').xpath('string(.)').extract()).replace("\n", "").replace(" ", "")
    # Description (owner-provided blurb).
    item['description'] = ''.join(response.xpath('//div[@class="from-biz-owner-content"]/p/text()').extract()).replace("\n", "").replace(" ", "")
    # Logo image URL (passed along from the listing page).
    item['logo_imgurl'] = response.meta['logo_imgurl']
    # Latitude / longitude parsed out of the map widget's data-map-state JSON.
    location = ''.join(response.xpath('//div[@class="mapbox-map"]/div/@data-map-state').extract())
    location = re.findall('"location": {"latitude": (.*?), "longitude": (.*?)},', location)
    if location:
        item['latitude'] = re.findall("'(.*?)'", str(location))[0]
        item['longitude'] = re.findall("'(.*?)'", str(location))[1]
    else:
        item['latitude'] = ''
        item['longitude'] = ''
    # Background (header) image.
    item['back_img'] = ''.join(response.xpath('//div[@class="js-photo photo photo-1"]/div/a/img/@src').extract())
    yield item
    # NOTE(review): blocking sleep inside a Scrapy callback stalls the whole
    # reactor; prefer the DOWNLOAD_DELAY setting — confirm before removing.
    time.sleep(0.5)