コード例 #1
0
ファイル: yelpspider.py プロジェクト: moiiom/yelp
    def detail_parse(self, response):
        """Parse a Yelp search-results page into one YelpItem per listing.

        Each item is tagged with the CSV output path for this city
        (<DATA_BASE_PATH>/<date>/<domain>/<city>.csv) and yielded.
        """
        print(response.url)  # parenthesized: valid in both Python 2 and 3
        city = self._get_city_from_url(response.url)
        filename = "{0}/{1}/{2}/{3}.csv".format(DATA_BASE_PATH, self.data_date,
                                                self.allowed_domains[0], city)

        contexts = response.xpath(
            '//li[@class="regular-search-result"]/div').extract()
        for c in contexts:
            # Re-wrap each listing fragment so the // queries below are
            # scoped to one business.
            s = Selector(text=c)
            item = YelpItem()
            item['filename'] = filename
            # Each field strips embedded newlines/tabs and surrounding
            # whitespace from every extracted text node.
            item['name'] = [
                _.replace('\n', '').replace('\t', '').strip() for _ in s.xpath(
                    '//a[@class="biz-name js-analytics-click"]//text()').
                extract()
            ]
            item['address'] = [
                _.replace('\n', '').replace('\t', '').strip()
                for _ in s.xpath('//address/text()').extract()
            ]
            # BUG FIX: the original used '///span[...]', which is not valid
            # XPath (lxml raises an evaluation error); '//span' is intended.
            item['phone'] = [
                _.replace('\n', '').replace('\t', '').strip() for _ in s.xpath(
                    '//span[@class="biz-phone"]/text()').extract()
            ]
            item['categories'] = [
                _.replace('\n', '').replace('\t', '').strip() for _ in s.xpath(
                    '//span[@class="category-str-list"]/a/text()').extract()
            ]
            item['img'] = s.xpath(
                '//div[@class="photo-box pb-90s"]/a/img/@src').extract()[0]
            yield item
コード例 #2
0
 def parse(self, response, **kwargs):
     """Parse a business detail page into a YelpItem loader, then chain a
     GET to the biz-attribute endpoint with the loaded item in meta.

     The page state is a '{"gaConfig'-prefixed JSON blob embedded inside
     a commented-out <script>; the last matching script wins.
     """
     loader = ItemLoader(item=YelpItem(), response=response)
     loader.default_output_processor = TakeFirst()
     for script in response.css('script').getall():
         if '{"gaConfig' in script:
             detail_json = json.loads(
                 re.search(r'({"gaConfig.*?)-->', script).group(1))
     loader.add_value('direct_url', detail_json['staticUrl'])
     loader.add_value(
         'business_id', detail_json['bizDetailsPageProps']
         ['bizContactInfoProps']['businessId'])
     loader.add_value(
         'categories', detail_json['gaConfig']['dimensions']['www']
         ['second_level_categories'][1])
     # BUG FIX: businessWebsite may be None/falsy for businesses with no
     # site, and subscripting it with ['linkText'] raised TypeError.
     # Guard first (the same check this project uses in its other parse
     # variant).
     website = detail_json['bizDetailsPageProps']['bizContactInfoProps'].get(
         'businessWebsite')
     if website:
         loader.add_value('site', website['linkText'])
     loader.add_value('title',
                      detail_json['bizDetailsPageProps']['businessName'])
     loader.add_value(
         'review_count', detail_json['bizDetailsPageProps']
         ['ratingDetailsProps']['numReviews'])
     yield scrapy.Request(
         'https://www.yelp.com/biz_attribute?biz_id={}'.format("".join(
             loader.get_output_value('business_id'))),
         method='GET',
         callback=self.linkedData,
         meta={'item': loader.load_item()})
コード例 #3
0
    def parse(self, response):
        """Parse a search-results page: schedule a detail-page request per
        business (carrying a partially filled YelpItem in meta) and follow
        pagination links back into this method.
        """
        sel = Selector(response)
        infos = sel.xpath('//a[@class="biz-name"]')

        for info in infos:
            name = ''.join(info.xpath('text()').extract()).strip()

            # A name filter (e.g. `if 'Vietthao' in name`) used to live
            # here as `if True:` leftover debug; all businesses are now
            # followed unconditionally.
            url = ''.join(info.xpath('@href').extract()).strip()
            item = YelpItem()
            item['name'] = name
            url_i = "http://%s%s" % (urlparse(response.url).hostname, url)
            yield Request(url_i,
                          meta={'item': item},
                          callback=self.parse_items)

        # Follow "next page" links; renamed from `next` to avoid shadowing
        # the builtin.
        next_pages = sel.xpath(
            '//a[@class="page-option prev-next next"]/@href').extract()
        for href in next_pages:
            url_i = "http://%s%s" % (urlparse(response.url).hostname, href)
            yield Request(url_i, callback=self.parse)
コード例 #4
0
    def parse(self, response, **kwargs):
        """Parse a Yelp business detail page into a YelpItem.

        Recovers the frontend release hash from the error-reporting
        bootstrap script, uses it to key the hypernova data blocks, decodes
        their embedded JSON, fills the item, then POSTs to the amenities
        GraphQL endpoint with the item riding in request meta.

        Raises CloseSpider when the page hash cannot be recovered.
        """
        # The release hash lives under config.release of this script tag.
        page_hash_script = response.css(
            '#yelp-js-error-reporting-init-error-reporting::text').get()
        page_hash = json.loads(page_hash_script).get('config',
                                                     {}).get('release')

        if not page_hash:
            raise CloseSpider('Have no page hash')

        data_key = "yelp_main__{}__yelp_main__BizDetailsApp__dynamic".format(
            page_hash)

        # Main payload: the child <script> whose text mentions "telephone".
        data_block = response.xpath(
            '//div[@data-hypernova-key="{}"]'.format(data_key))
        main_data = data_block.xpath(
            'script[contains(text(), "telephone")]/text()').get()
        main_data = json.loads(main_data)

        item = YelpItem()

        item['name'] = main_data.get('name')
        item['url'] = response.url
        item['email'] = None  # never scraped by this spider
        item['address'] = main_data['address']
        item['rating'] = main_data['aggregateRating']['ratingValue']
        item['reviews_count'] = main_data['aggregateRating']['reviewCount']

        # Secondary payload: a <script data-hypernova-key=...> whose text is
        # wrapped in HTML comment markers; [4:-3] strips "<!--" and "-->".
        cat_block = response.xpath(
            '//script[@data-hypernova-key="{}"]/text()'.format(
                data_key)).get()
        cat_data = json.loads(cat_block[4:-3])
        business_id = cat_data['bizDetailsPageProps']['claimStatusGQLProps'][
            'businessId']

        item['id'] = business_id
        item['categories'] = cat_data['gaConfig']['dimensions']['www']['second_level_categories'] + \
                             cat_data['gaConfig']['dimensions']['www']['top_level_categories']
        item['business_website'] = cat_data['bizDetailsPageProps']['bizContactInfoProps'] \
            .get('businessWebsite', {}).get('linkText')
        item['work_schedule'] = cat_data['bizDetailsPageProps'][
            'bizHoursProps']['hoursInfoRows']
        item['about_business'] = cat_data['bizDetailsPageProps'][
            'fromTheBusinessProps']
        # item['about_business'] = cat_data['bizDetailsPageProps'].get('fromTheBusinessProps', {}) \
        #     .get('fromTheBusinessContentProps')
        item['main_image'] = cat_data['bizDetailsPageProps'][
            'photoHeaderProps']['photoHeaderMedias']
        item['phone'] = cat_data['bizDetailsPageProps'][
            'bizContactInfoProps'].get('phoneNumber')

        # AMENITIES_DATA is a prebuilt GraphQL batch payload; patch in the
        # current business id before posting. NOTE(review): mutating shared
        # spider state here is unsafe under concurrent parses — confirm.
        self.AMENITIES_DATA[0]['variables']['BizEncId'] = business_id
        yield scrapy.http.JsonRequest(self.AMENITIES_URL,
                                      data=self.AMENITIES_DATA,
                                      callback=self.parse_amenities,
                                      method='POST',
                                      meta={'item': item})
コード例 #5
0
 def parse_result_page(
         self, response):  #actually extracting the info from each page
     """Yield one YelpItem per review-rating aria-label on a results page.

     NOTE(review): the loop starts at index 1, skipping the first match —
     presumably a non-review rating widget; confirm against the markup.
     """
     reviews = response.xpath(
         '//span[@class="lemon--span__373c0__3997G display--inline__373c0__1DbOG border-color--default__373c0__2oFDT"]/div/@aria-label'
     ).extract()
     city = response.xpath(
         '//span[@class="queryLocation__373c0__15viw"]/text()').extract()
     for k in range(1, len(reviews)):
         print(k)  # debug progress trace
         item = YelpItem()
         item['reviews'] = reviews[k]
         # `city` is the full extracted list, not a single string.
         item['city'] = city
         yield item
コード例 #6
0
    def parse(self, response):
        """Load one YelpItem from a business detail page.

        Combines three embedded data sources: the ld+json structured-data
        script (`data`), the hypernova 'BizDetailsApp' app-state script
        (`app_data`), and the raw apollo-state script (kept unparsed in
        `_properties`).
        """
        loader = ItemLoader(item=YelpItem(), response=response)

        data = response.css("script[type='application/ld+json']::text").get()
        data = json.loads(data)
        app_data = response.xpath(
            "//script[contains(@data-hypernova-key, 'BizDetailsApp')]/text()"
        ).get()
        # App state is wrapped in HTML comment markers; strip them first.
        app_data = json.loads(app_data.strip('--><!--'))
        _properties = response.css("script[data-apollo-state]::text").get()
        categories = app_data['adSyndicationConfig'].get('categoryAliases')
        from_the_biz = app_data['bizDetailsPageProps'].get(
            'fromTheBusinessProps')
        rating_review = data.get('aggregateRating')
        address = data.get('address')
        schedule_sel = response.css("tbody.lemon--tbody__373c0__2T6Pl tr")
        schedule_d = self.get_schedule(schedule_sel)

        # The "from the business" blurb is optional on the page.
        if from_the_biz is not None:
            specialties = from_the_biz['fromTheBusinessContentProps'][
                'specialtiesText']
            history = from_the_biz['fromTheBusinessContentProps'][
                'historyText']
            year_established = from_the_biz['fromTheBusinessContentProps'][
                'yearEstablished']
            about = self.get_about_text(specialties, history, year_established)
            loader.add_value('about', about)

        if address is not None:
            loader.add_value('geo_street', address.get('streetAddress'))
            loader.add_value('geo_city', address.get('addressLocality'))
            loader.add_value('geo_state', address.get('addressRegion'))
            loader.add_value('geo_country', address.get('addressCountry'))
            loader.add_value('geo_post_code', address.get('postalCode'))

        if rating_review is not None:
            loader.add_value('rating', float(rating_review.get('ratingValue')))
            loader.add_value('reviews_count', rating_review.get('reviewCount'))

        loader.add_value('schedule_d', schedule_d)
        loader.add_value('name', data.get('name'))
        loader.add_value('url', response.request.url)
        loader.add_xpath('biz_id', "//a[contains(@href, 'biz_id=')]/@href")
        loader.add_value('image', data.get('image'))
        loader.add_value('phone', data.get('telephone'))
        loader.add_value('categories', categories)
        loader.add_xpath('link', "//a[contains(@href, '/biz_redir?')]/@href")
        loader.add_value('_properties', _properties)

        yield loader.load_item()
コード例 #7
0
 def parse_item(self, response):
     """Extract the business's external website from a Yelp detail page.

     Yelp wraps outbound sites in a /biz_redir?url=... link inside the
     biz-website div; an item is yielded only when such a link exists.
     """
     hxs = HtmlXPathSelector(response)
     item = YelpItem()
     redirect_links = hxs.select("//div[@class='biz-website']").select(
         "a/@href").extract()
     if redirect_links:
         parsed = urlparse(''.join(redirect_links))
         query = parse_qs(parsed.query)
         item['external_website'] = ''.join(query['url'])
         yield item
コード例 #8
0
    def parse(self, response, **kwargs):
        """Parse a business detail page and chain a GraphQL batch request.

        Pulls the '{"gaConfig'-prefixed JSON blob out of a commented-out
        script tag, loads the core fields, then POSTs a batch query to
        /gql/batch with the partially loaded item in request meta.
        """
        loader = ItemLoader(item=YelpItem(), response=response)
        # The last script containing the marker wins. NOTE(review):
        # detail_json stays unbound (NameError below) if no script matches.
        for script in response.css('script').getall():
            if '{"gaConfig' in script:
                detail_json = json.loads(re.search(r'({"gaConfig.*?)-->', script).group(1))
        loader.add_value('direct_url', detail_json['staticUrl'])
        loader.add_value('business_id', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessId'])
        loader.add_value('categories', detail_json['gaConfig']['dimensions']['www']['second_level_categories'][1])
        # Skipped when businessWebsite is absent/falsy for this business.
        if detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']:
            loader.add_value('site', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']['linkText'])
        loader.add_value('title', detail_json['bizDetailsPageProps']['businessName'])
        loader.add_value('review_count', detail_json['bizDetailsPageProps']['ratingDetailsProps']['numReviews'])
        #TODO: find way to not use hardcoded documentIds
        post_data = [{"operationName":"getLocalBusinessJsonLinkedData","variables":{"BizEncId": "".join(loader.get_output_value('business_id'))},"extensions":{"documentId":"1cf362b8e8f9b3dae26d9f55e7204acd8355c916348a038f913845670139f60a"}}]

        yield scrapy.Request('https://www.yelp.com/gql/batch', method='POST', body=json.dumps(post_data),
            headers={'Content-Type': 'application/json'}, callback=self.linkedData, meta={'item': loader.load_item()})
コード例 #9
0
	def parse_restaurant_reviews_page(self, response):
		"""Yield one YelpItem per review on a restaurant's reviews page.

		Restaurant-level fields (name, address, price) are extracted once
		and repeated on every review item.
		"""
		reviews = response.xpath('//div[@class = "review review--with-sidebar"]')
		restaurant = response.xpath('//div[@class = "biz-page-header-left claim-status"]/div/h1/text()').extract_first().strip()
		address = response.xpath('//div[@class="mapbox"]//address/text()').extract_first().strip()
		price = response.xpath('//span[@class="business-attribute price-range"]/text()').extract_first()
		for review in reviews:
			# [0] keeps only the FIRST CHARACTER of the rating @title.
			# NOTE(review): a title like "4.5 star rating" becomes '4',
			# losing the fraction — confirm this truncation is intended.
			rating = review.xpath('.//div[@class="biz-rating biz-rating-large clearfix"]/div/div/@title').extract_first()[0]
			text = review.xpath('.//p[@lang="en"]/text()').extract()
			date = review.xpath('.//span[@class="rating-qualifier"]/text()').extract_first().strip()

			item = YelpItem()
			item['restaurant'] = restaurant
			item['rating'] = rating
			item['text'] = text
			item['date'] = date
			item['address'] = address
			item['price'] = price
			yield item
コード例 #10
0
    def parse(self, response):
        """Build a YelpItem from a business page's embedded JSON payloads.

        Reads the biz id from a <meta> tag, the structured data from the
        first ld+json script, and two application/json scripts (positions
        2 and 3) pre-cleaned by self._prepare_json. Returns the loaded item.
        """
        meta_bizid = response.xpath(
            '/html/head/meta[@name="yelp-biz-id"]/@content').get()
        data_content = response.xpath(
            '//script[@type="application/ld+json"]//text()').getall()
        data_json = response.xpath(
            '//script[@type="application/json"]//text()').getall()

        general_data = json.loads(data_content[0])
        # NOTE(review): positional indices 2 and 3 assume a fixed script
        # order on the page — confirm against current markup.
        business_data1 = self._prepare_json(data_json[2])
        business_data2 = self._prepare_json(data_json[3])
        biz_details1 = json.loads(business_data1)["bizDetailsPageProps"]
        biz_details2 = json.loads(business_data2)

        l = ItemLoader(item=YelpItem(), response=response)
        l.add_value("name", general_data["name"])
        l.add_value("item_url", response.url)
        l.add_value("biz_id", meta_bizid)
        l.add_xpath("image", '/html/head/meta[@property="og:image"]/@content')
        # phone/email are optional keys in the structured data.
        l.add_value(
            "phone",
            (general_data["telephone"]
             if "telephone" in general_data.keys() else None),
        )
        l.add_value("email", (general_data["email"]
                              if "email" in general_data.keys() else None))
        l.add_value("address", general_data["address"])
        l.add_value("rating_value",
                    general_data["aggregateRating"]["ratingValue"])
        l.add_value("review_count",
                    general_data["aggregateRating"]["reviewCount"])
        l.add_value("categories", self._get_categories(data=data_content))
        l.add_value("home_url", self._get_homeurl(data=biz_details1))
        # String-encoded key fragments consumed by the hours/amenities
        # helper lookups below.
        encid = "{'encid':'" + meta_bizid + "'}"
        client_platform = "{'clientPlatform':'WWW'}"
        l.add_value("hours", self._get_hours(encid=encid, data=biz_details2))
        l.add_value("about", self._get_about(data=biz_details1))
        l.add_value(
            "amenities",
            self._get_amenities(encid=encid,
                                client_platform=client_platform,
                                data=biz_details2),
        )
        return l.load_item()
コード例 #11
0
    def parse_gym_reviews_page(self, response):
        """Yield one YelpItem per review on a gym's Yelp page.

        Gym-level fields (name, address, zipcode, category, about, region,
        review count, average rating) are scraped once and repeated on each
        review item alongside the reviewer's name, id, rating, text and date.
        """
        reviews = response.xpath('//li [@class="lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU"]')

        gym = response.xpath('//h1[@class="lemon--h1__373c0__2ZHSL heading--h1__373c0__dvYgw undefined heading--inline__373c0__10ozy"]/text()').extract_first()
        address = response.xpath('//span [@class="lemon--span__373c0__3997G raw__373c0__3rcx7"]/text()').extract_first()
        # First 5-digit code starting with '9' — presumably California
        # zipcodes; confirm if the spider targets other regions.
        zipcode = re.findall(r'9\d{4}', ",".join(response.xpath('//span [@class="lemon--span__373c0__3997G raw__373c0__3rcx7"]/text()').extract()))[0]
        category = re.findall(r'(\w+\s?\w+)', ",".join(response.xpath('//span [@class="lemon--span__373c0__3997G display--inline__373c0__3JqBP margin-r1__373c0__zyKmV border-color--default__373c0__3-ifU"]//text()').extract()))

        # BUG FIX: the original bare `except:` swallowed every exception.
        # extract_first() may return None (TypeError inside re.search) and
        # int('') raises ValueError when the pattern matches empty — catch
        # exactly those, defaulting to 0 reviews.
        try:
            num_review = int(re.search(r'(\d*\.?\d*)', response.xpath('//p [@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"]/text()').extract_first()).group(0))
        except (TypeError, ValueError):
            num_review = 0

        about = response.xpath('//div [@class="lemon--div__373c0__1mboc margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]/p/span//text()').extract_first()
        region = response.xpath('//div [@class="lemon--div__373c0__1mboc pseudoIsland__373c0__Fak5q"]//p [@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"]/text()').extract_first()
        # aria-label reads "<N> star rating"; the leading token is the number.
        avg_rating = float(response.xpath('//span [@class="lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU"]/div/@aria-label').extract_first().split()[0])

        for review in reviews:
            user_name = review.xpath('.//div [@class="lemon--div__373c0__1mboc border-color--default__373c0__3-ifU"]//a [@class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"]/text()').extract_first()
            # Profile hrefs contain '...=<id>'; keep everything after '='.
            user_id = review.xpath('.//div [@class="lemon--div__373c0__1mboc border-color--default__373c0__3-ifU"]//a [@class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"]/@href').extract_first().partition("=")[2]
            rating = int(review.xpath('.//div [@class="lemon--div__373c0__1mboc arrange-unit__373c0__o3tjT arrange-unit-grid-column--8__373c0__2dUx_ border-color--default__373c0__3-ifU"]//div/@aria-label').extract_first().split()[0])
            text = review.xpath('.//div [@class ="lemon--div__373c0__1mboc arrange-unit__373c0__o3tjT arrange-unit-grid-column--8__373c0__2dUx_ border-color--default__373c0__3-ifU"]//p [@class= "lemon--p__373c0__3Qnnj text__373c0__2Kxyz comment__373c0__3EKjH text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"]/span [@class="lemon--span__373c0__3997G raw__373c0__3rKqk"]/text()').extract_first()
            reviewer_date = review.xpath('.//span [@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-"]/text()').extract_first()

            item = YelpItem()
            item["gym"] = gym
            item["zipcode"] = zipcode
            item["address"] = address
            item["category"] = category
            item["about"] = about
            item["region"] = region
            item["num_review"] = num_review
            item["avg_rating"] = avg_rating
            item["user_name"] = user_name
            item["user_id"] = user_id
            item["rating"] = rating
            item["text"] = text
            item["reviewer_date"] = reviewer_date
            yield item
コード例 #12
0
    def parse(self, response):
        """Walk the result listings on a search page, build a partially
        filled YelpItem per listing, and hand each off to parse_detail via
        the request meta.
        """
        listings = response.xpath(
            '//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/ul/li')
        for listing in listings:
            item = YelpItem()

            # Business name and its detail-page link share the same anchor.
            item['name'] = listing.xpath('.//p/a/text()').extract_first()
            detail_url = response.urljoin(
                listing.xpath('.//p/a/@href').extract_first())
            item['link'] = detail_url

            # Phone number: right-aligned paragraph in the listing card.
            item['phone'] = listing.xpath(
                './/div[1]/p[1][@class= "lemon--p__373c0__3Qnnj text__373c0__2pB8f text-color--normal__373c0__K_MKN text-align--right__373c0__3ARv7"]/text()'
            ).extract_first()

            # Neighbourhood / area label.
            item['area'] = listing.xpath(
                './/p/span[@class = "lemon--span__373c0__3997G"]/text()'
            ).extract_first()

            # Every service/category link the business advertises.
            item['services'] = listing.xpath(
                './/a[@class="lemon--a__373c0__IEZFH link__373c0__29943 link-color--inherit__373c0__15ymx link-size--default__373c0__1skgq"]/text()'
            ).extract()

            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})
コード例 #13
0
    def parse_page(self, response):
        """Scrape one business detail page into a YelpItem.

        Collects name, phone, website, open status, postal code, rating and
        a newline-joined "day--hours" weekly schedule, then returns the
        loaded item.
        """
        Names = response.css(
            "div.lemon--div__373c0__1mboc.margin-b1__373c0__1khoT.border-color--default__373c0__3-ifU>h1.lemon--h1__373c0__2ZHSL.heading--h1__373c0___56D3.undefined.heading--inline__373c0__1jeAh::text"
        ).extract_first()
        Phone = response.xpath(
            "//p[contains(text(), 'Phone number')]/following-sibling::p/text()"
        ).extract_first()
        Website = response.xpath("//a[@rel='noopener']/text()").extract_first()
        Open_Status = response.xpath(
            "//span[contains(@class, 'status')]/text()").extract_first()
        Postal_code = response.xpath("//address/p/span/text()").extract()
        rating = response.css(
            "div.lemon--div__373c0__1mboc.i-stars__373c0__1T6rz.i-stars--large-1__373c0__1kclN.border-color--default__373c0__3-ifU.overflow--hidden__373c0__2y4YK::attr(aria-label)"
        ).extract_first()
        days = response.xpath(
            "//tbody[@class='lemon--tbody__373c0__2T6Pl']/tr/th/p/text()"
        ).getall()
        hrs = response.xpath(
            "//tbody[@class='lemon--tbody__373c0__2T6Pl']//p[contains(@class, 'no-wrap')]/text()"
        ).getall()
        # BUG FIX: the original indexed days[0]..days[6] / hrs[0]..hrs[6]
        # unconditionally, raising IndexError whenever the hours table was
        # missing or had fewer than 7 rows. Pairing up to 7 day/hour rows
        # with zip produces identical output for a full week and degrades
        # gracefully otherwise.
        total = '\n'.join('{}--{}'.format(day, hour)
                          for day, hour in zip(days[:7], hrs[:7]))
        loader = ItemLoader(item=YelpItem())

        loader.add_value("Business_Name", Names)
        loader.add_value("Phone", Phone)
        loader.add_value("Website", Website)
        loader.add_value("Open_Status", Open_Status)
        loader.add_value("Postal_Code", Postal_code)
        loader.add_value("Rating", rating)
        loader.add_value("Open_Hours", total)
        return loader.load_item()
コード例 #14
0
ファイル: Review.py プロジェクト: Stardust567/yelp
    def get_review(self, response):
        """Yield one YelpItem holding the business name, address, the first
        review's date qualifier, and all English review paragraphs.
        """
        item = YelpItem()
        name_list = response.css(
            'h1[class="biz-page-title embossed-text-white shortenough"]::text'
        ).extract()
        # BUG FIX: the original assigned name = 'none' on an empty list and
        # then unconditionally overwrote it with ' '.join(name_list), so the
        # fallback never took effect (the item got '' instead of 'none').
        name = ' '.join(name_list) if name_list else 'none'
        address_list = response.css('strong[class="street-address"]').css(
            'address::text').extract()
        # Same dead-fallback bug as above for the address.
        address = ' '.join(address_list) if address_list else 'none'
        date = response.css(
            'span[class="rating-qualifier"]::text').extract_first()
        review = response.css('li').css('div').css(
            'p[lang = "en"]::text').extract()

        item['name'] = name
        item['address'] = address
        item['date'] = date
        item['review'] = review

        yield item
コード例 #15
0
    def parse_business_page(self, response):
        """Scrape a full Yelp business page into one YelpItem.

        Extracts name, average rating, review count, phone, address, weekly
        hours, price range, categories, recent (rating, date) review pairs
        and COVID-19 update/service info, then merges in the
        location/cuisine/url values carried through response.meta.
        """
        restaurant_name = response.xpath(
            '//h1[@class="lemon--h1__373c0__2ZHSL heading--h1__373c0__dvYgw undefined heading--inline__373c0__10ozy"]/text()'
        ).extract_first()

        # BUG FIX (here and for num_reviews): the bare `except:` clauses
        # swallowed everything, including KeyboardInterrupt. The guarded
        # code can only raise TypeError (extract_first() returned None) or
        # IndexError (re.findall matched nothing) — catch exactly those.
        try:
            avg_rating = response.xpath(
                '//div[@class="lemon--div__373c0__1mboc arrange__373c0__2C9bH gutter-1-5__373c0__2vL-3 vertical-align-middle__373c0__1SDTo margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]/div/span/div/@aria-label'
            ).extract_first()
            avg_rating = float(
                re.findall(r'(\d?\.?\d) star rating', avg_rating)[0])
        except (TypeError, IndexError):
            avg_rating = None
            print('=' * 50)
            print(f'Error with avg_rating at url: {response.url}')
            print('=' * 50)

        try:
            num_reviews = response.xpath(
                '//p[@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"]/text()'
            ).extract_first()
            num_reviews = int(re.findall(r'(\d+) review[s]?', num_reviews)[0])
        except (TypeError, IndexError):
            num_reviews = 0
            print('=' * 50)
            print(f'Error with num_reviews at url: {response.url}')
            print('=' * 50)

        phone_num = response.xpath(
            '//div[@class="lemon--div__373c0__1mboc arrange__373c0__2C9bH gutter-2__373c0__1DiLQ vertical-align-middle__373c0__1SDTo border-color--default__373c0__3-ifU"]//p[@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"]/text()'
        ).extract_first()

        address = response.xpath(
            '//address[@class="lemon--address__373c0__2sPac"]//span/text()'
        ).extract()
        address = ', '.join(address)

        # Opening hours as {day name: hours string}.
        days = response.xpath(
            '//table[@class="lemon--table__373c0__2clZZ hours-table__373c0__1S9Q_ table__373c0__3JVzr table--simple__373c0__3lyDA"]//tr'
        )
        hours_dict = {
            day.xpath('./th/p/text()').extract_first():
            day.xpath('./td/ul/li/p/text()').extract_first()
            for day in days
        }

        price_range = response.xpath(
            '//span[@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-bullet--after__373c0__3fS1Z text-size--large__373c0__3t60B"]/text()'
        ).extract_first()

        category = response.xpath(
            '//span[@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--black-extra-light__373c0__2OyzO text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"]/a/text()'
        ).extract()

        # Recent reviews: ratings and dates from the first review page; the
        # last entry of each list is a non-chronological review, so it is
        # dropped before pairing.
        review_rating = response.xpath(
            '//div[@class="lemon--div__373c0__1mboc margin-t1__373c0__oLmO6 margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]/div/div/span/div/@aria-label'
        ).extract()
        review_rating = review_rating[:-1]

        review_date = response.xpath(
            '//span[@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-"]/text()'
        ).extract()
        review_date = review_date[:-1]

        recent_reviews = list(zip(review_rating, review_date))

        covid_updates_text = response.xpath(
            '//div[@class="lemon--div__373c0__1mboc margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]//p/text()'
        ).extract_first()

        covid_update_time = response.xpath(
            '//p[@class="lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--subtle__373c0__3DZpi text-align--left__373c0__2XGa-"]/text()'
        ).extract_first()

        # NOTE(review): `pairs` is extracted but never used — kept for parity
        # with the original; consider removing after confirming.
        pairs = response.xpath(
            '//span[@class="lemon--span__373c0__3997G text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-weight--semibold__373c0__2l0fe text-size--large__373c0__3t60B"]/text()'
        ).extract()

        # COVID service flags: {service name: True if a checkmark icon
        # appears in the service block}.
        covid_services = {}
        for x in response.xpath(
                '//div[@class="lemon--div__373c0__1mboc margin-t2__373c0__1CFWK border-color--default__373c0__3-ifU"]//div[@class="lemon--div__373c0__1mboc display--inline-block__373c0__1ZKqC margin-r3__373c0__r37sx margin-b1__373c0__1khoT border-color--default__373c0__3-ifU"]'
        ).getall():
            service = re.search(r'<span.*?text.*?([A-Za-z\s\-]*)<\/span', x)
            if service is not None:
                covid_services[service.group(1)] = re.search(
                    "checkmark", x) is not None

        item = YelpItem()
        item['restaurant_name'] = restaurant_name
        item['avg_rating'] = avg_rating
        item['num_reviews'] = num_reviews
        item['phone_num'] = phone_num
        item['address'] = address
        item['business_hours'] = hours_dict
        item['price_range'] = price_range
        item['category'] = category
        item['recent_reviews'] = recent_reviews

        item['covid_updates_text'] = covid_updates_text
        item['covid_update_time'] = covid_update_time
        item['covid_services'] = covid_services

        # Carried over from the listing request that scheduled this page.
        item['location'] = response.meta['location']
        item['cuisine'] = response.meta['cuisine']
        item['url'] = response.meta['url']

        yield item
コード例 #16
0
    def parse1(self,response):
        """Parse a business detail page (Oakland, CA spider) into a YelpItem.

        NOTE(review): the 'email' field actually stores the business's
        website URL recovered from the redirect link, and 'forms' is left
        unset when none of the category branches match — confirm downstream
        consumers tolerate both.
        """
        item = YelpItem()
        # Detail page URL (carried through the scheduling request's meta).
        item['detail_page_url'] = response.meta['aurl']
        # City: hard-coded for this spider.
        item['city'] = 'Oakland, CA'
        # Name: the heading may sit at either of two depths; concatenate both.
        name1 = ''.join(response.xpath('//div[@class="biz-page-header-left claim-status"]/div/h1/text()').extract())
        name2 = ''.join(response.xpath('//div[@class="biz-page-header-left claim-status"]/div/div/h1/text()').extract())
        item['name'] = name1 + name2

        # Phone: position in the sidebar list varies (2nd or 3rd <li>);
        # newlines and spaces are stripped from the result.
        if response.xpath('//div[@class="mapbox-text"]/ul/li[2]/span[3]/text()'):
            item['tel'] = ''.join(response.xpath('//div[@class="mapbox-text"]/ul/li[2]/span[3]/text()').extract()).replace("\n", "").replace(" ", "")
        elif response.xpath('//div[@class="mapbox-text"]/ul/li[3]/span[3]/text()'):
            item['tel'] = ''.join(response.xpath('//div[@class="mapbox-text"]/ul/li[3]/span[3]/text()').extract()).replace("\n", "").replace(" ", "")
        else:
            item['tel'] = ''


        # Street
        item['street'] =''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/strong/text()').extract()).strip().replace("\n", "")

        # Address: <address> may be a direct child or nested under <strong>.
        if response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/address'):
            item['address'] =  ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/address/text()').extract()).strip().replace("\n", "")
        elif response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/strong'):
            item['address'] = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[1]/div/strong/address/text()').extract()).strip().replace("\n", "")
        else:
            item['address'] = ''


        # Official website: taken from the redirect link (3rd or 4th <li>);
        # the target URL is recovered by splitting the redirect query string.
        if response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[3]/span[2]/a/@href'):
            email = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[3]/span[2]/a/@href').extract())
            email = email.split('&website_link')[0].split('2F')[-1]
            item['email'] = email
        elif response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[4]/span[2]/a/@href'):
            email = ''.join(response.xpath('//*[@id="wrap"]/div[2]/div/div[1]/div/div[4]/div[1]/div/div[2]/ul/li[4]/span[2]/a/@href').extract())
            email = email.split('&website_link')[0].split('2F')[-1]
            item['email'] =  email
        else:
            item['email'] = ''

        # Categories: markup varies; take the whole span's string content.
        if response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span[2]/a/text()'):
            item['forms'] = ''.join(response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span[2]').xpath('string(.)').extract()).replace("\n", "").replace(" ", "")
        elif response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span/a/text()'):
            item['forms'] = ''.join(response.xpath('//div[@class="biz-main-info embossed-text-white"]/div[2]/span').xpath('string(.)').extract()).replace("\n", "").replace(" ", "")
        elif response.xpath('//div[@class="biz-main-info embossed-text-white"]/div/span/text()'):
            item['forms'] = ''.join(response.xpath('//div[@class="biz-main-info embossed-text-white"]/div/span').xpath('string(.)').extract()).replace("\n", "").replace(" ", "")

        # Description: the "from the business owner" blurb.
        item['description'] = ''.join(response.xpath('//div[@class="from-biz-owner-content"]/p/text()').extract()).replace("\n", "").replace(" ", "")

        # Logo image URL (carried through request meta).
        item['logo_imgurl'] = response.meta['logo_imgurl']

        # Latitude/longitude, parsed out of the map widget's data-map-state.
        location = ''.join(response.xpath('//div[@class="mapbox-map"]/div/@data-map-state').extract())
        location = re.findall('"location": {"latitude": (.*?), "longitude": (.*?)},',location)
        if location:
            item['latitude'] = re.findall("'(.*?)'",str(location))[0]
            item['longitude'] = re.findall("'(.*?)'",str(location))[1]
        else:
            item['latitude'] = ''
            item['longitude'] = ''
        # Background/header photo URL.
        item['back_img'] = ''.join(response.xpath('//div[@class="js-photo photo photo-1"]/div/a/img/@src').extract())

        yield item
        # NOTE(review): sleeping after yield throttles from inside the
        # callback; Scrapy's DOWNLOAD_DELAY is the usual mechanism — confirm
        # this is intentional.
        time.sleep(0.5)