def parse(self, response):
    """Yield one ``AirbnbItem`` per listing in an explore-tabs JSON response.

    The response body is decoded as JSON. Listings are read from
    ``explore_tabs[0].sections[0].listings`` and the country (shared by
    every yielded item) from ``metadata.geography.country``.

    Missing or empty intermediate containers are treated as "no listings"
    rather than raising, and ``service_fee`` is ``None`` when either rate
    amount is absent.

    :param response: response whose ``body`` is the JSON payload.
    :returns: generator of populated ``AirbnbItem`` objects.
    """
    data = json.loads(response.body)

    # Each level may be missing or empty; fall back to an empty stand-in so
    # the lookup chain cannot raise IndexError/AttributeError.
    tabs = data.get('explore_tabs') or [{}]
    sections = tabs[0].get('sections') or [{}]
    listing_data = sections[0].get('listings') or []

    geography = (data.get('metadata') or {}).get('geography') or {}
    country = str(geography.get('country'))

    for listing in listing_data:
        details = listing.get('listing') or {}
        price = listing.get('pricing_quote') or {}
        rate = (price.get('rate') or {}).get('amount')
        rate_w_service = (
            price.get('rate_with_service_fee') or {}).get('amount')

        listing_item = AirbnbItem()
        listing_item['listing_id'] = str(details.get('id'))
        listing_item['title'] = str(details.get('name'))
        listing_item['country'] = country
        listing_item['city'] = str(details.get('localized_city'))
        listing_item['lat'] = details.get('lat')
        listing_item['lon'] = details.get('lon')
        listing_item['bedrooms'] = details.get('bedrooms')
        listing_item['bathrooms'] = details.get('bathrooms')
        listing_item['max_occupancy'] = details.get('person_capacity')
        # The fee is the difference between the two quoted amounts; it can
        # only be computed when both are present.
        if rate is None or rate_w_service is None:
            listing_item['service_fee'] = None
        else:
            listing_item['service_fee'] = rate_w_service - rate
        yield listing_item
def parse_room_second(self, response):
    """Handle a room response and queue a similar-listings lookup.

    When ``response.meta['parse']`` is truthy the body is decoded as JSON
    and one ``AirbnbItem`` is yielded from it plus the values carried in
    ``response.meta``. Otherwise the room id is taken from the last URL
    path segment. In both cases every guest link matched by the CSS
    selector is yielded as a ``UserItem``, and a request for the
    ``similar_listings`` API is issued with ``parse_similar`` as callback.

    :param response: the room page / room JSON response.
    :returns: generator of items followed by one follow-up request.
    """
    try:
        flag = response.meta.get('parse')
    except Exception as e:
        self.logger.info(e)
        # Without this the name would be unbound below and raise NameError.
        flag = None

    if flag:
        self.logger.info('From First {}'.format(response.url))
        data = json.loads(response.text)
        price = data['nightly_price']  # not exactly
        room_id = response.meta.get('id')
        yield AirbnbItem({
            'room_id': room_id,
            'name': response.meta.get('name'),
            'owner': response.meta.get('owner'),
            'owner_id': response.meta.get('owner_id'),
            'price': price,
        })
        # Prefer the id embedded in the URL for the follow-up request; keep
        # the id from meta when the URL carries no 3-10 digit run.
        url_ids = re.findall(r'\d{3,10}', response.url)
        if url_ids:
            room_id = url_ids[0]
    else:
        self.logger.info('From similar {}'.format(response.url))
        room_id = response.url.split('/')[-1]

    guest_list = response.css(
        'div.show-inline-block > div:nth-child(2) > div > div.name > a')
    for one in guest_list:
        yield UserItem({
            'name': one.css('span::text').extract_first(),
            'user_id': one.css('::attr(href)').extract_first(),
        })

    self.logger.info('id = {}'.format(room_id))
    f = furl('https://zh.airbnb.com/api/v2/similar_listings?')
    arg = {
        'key': 'd306zoyjsyarp7ifhu67rjxn52tv0t20',
        'currency': 'CNY',
        'locale': 'zh',
        '_format': 'for_listing_card',
        'filter_instant_book': 'false',
        'listing_id': str(room_id),
    }
    f.add(args=arg)
    yield scrapy.FormRequest(url=f.url, callback=self.parse_similar)
def parse_listing_contents(self, response):
    """Build an ``AirbnbItem`` from the room page's bootstrap metadata.

    The ``content`` attribute of the ``_bootstrap-room_options`` meta tag
    is decoded as JSON; most fields come from its ``airEventData`` object
    and two (``hostId``, ``nightly_price``) from the top level. The item
    always carries the response URL.

    :param response: the listing page response.
    :returns: generator yielding a single item.
    """
    item = AirbnbItem()

    matches = response.xpath(
        '//meta[@id="_bootstrap-room_options"]/@content').extract()
    if matches:
        bootstrap = json.loads(matches[0])
        event = bootstrap['airEventData']
        # (item field, source mapping, source key) in assignment order.
        # 'response_rate' is intentionally not collected.
        field_sources = (
            ('rev_count', event, 'visible_review_count'),
            ('amenities', event, 'amenities'),
            ('host_id', bootstrap, 'hostId'),
            ('hosting_id', event, 'hosting_id'),
            ('room_type', event, 'room_type'),
            ('price', event, 'price'),
            ('bed_type', event, 'bed_type'),
            ('person_capacity', event, 'person_capacity'),
            ('cancel_policy', event, 'cancel_policy'),
            ('rating_communication', event, 'communication_rating'),
            ('rating_cleanliness', event, 'cleanliness_rating'),
            ('rating_checkin', event, 'checkin_rating'),
            ('satisfaction_guest', event, 'guest_satisfaction_overall'),
            ('instant_book', event, 'instant_book_possible'),
            ('accuracy_rating', event, 'accuracy_rating'),
            ('response_time', event, 'response_time_shown'),
            ('nightly_price', bootstrap, 'nightly_price'),
        )
        for field, source, key in field_sources:
            item[field] = source[key]

    item['url'] = response.url
    yield item
def parse_meetup(self, response):
    """Yield an ``AirbnbItem`` with the page title, URL and description.

    ``title`` and ``description`` are the lists of text nodes extracted by
    the two XPath queries; ``link`` is the response URL.

    :param response: the page response to scrape.
    :returns: generator yielding a single item.
    """
    selector = Selector(response)
    title_nodes = selector.xpath('//h1[@itemprop="name"]/text()').extract()
    description_nodes = selector.xpath(
        '//div[@id="past-event-description-wrap"]//text()').extract()

    item = AirbnbItem()
    item['title'] = title_nodes
    item['link'] = response.url
    item['description'] = description_nodes
    yield item
def parse_similar(self, response):
    """Yield an item and a room-page request per similar listing.

    The response text is decoded as JSON; every entry of
    ``similar_listings`` produces one ``AirbnbItem`` followed by a request
    for that room's page, handled by ``parse_room_second`` with
    ``meta={'parse': False}``.

    :param response: response of the similar-listings API call.
    :returns: generator alternating items and requests.
    """
    payload = json.loads(response.text)
    for entry in payload['similar_listings']:
        listing = entry['listing']
        record = {
            'room_id': listing['id'],
            'name': listing['name'],
            'owner': listing['primary_host']['first_name'],
            'owner_id': listing['primary_host']['id'],
            'price': entry['pricing_quote']['rate']['amount_formatted'],
        }
        yield AirbnbItem(record)
        yield scrapy.Request(
            url='https://zh.airbnb.com/rooms/' + str(record['room_id']),
            meta={'parse': False},
            callback=self.parse_room_second,
        )
def parse_locations(self, response):
    """Scrape host, listing and fee details from a listing page.

    Each field is filled from one XPath query. ``R_Hostname`` keeps the
    second match and ``R_Roomtype`` the first; every other field keeps the
    full list of extracted strings.

    :param response: the listing page response.
    :returns: the populated ``AirbnbItem``.
    """
    sel = Selector(response)
    property_site = AirbnbItem()

    def texts(query):
        # All strings matched by *query* on this page.
        return sel.xpath(query).extract()

    property_site['R_Hostname'] = texts(
        '//a[contains(@href, "#host-profile")]/text()')[1]

    # (item field, xpath) pairs stored as-is, in assignment order.
    list_fields = (
        ('R_Hostprofile',
         '//div[contains(@data-reactid, ".agcwfvnqbk.2.0.0.0.2.1")]/a/@href'),
        ('R_Listname', '//h1[@id = "listing_name"]/text()'),
        ('R_Reviews', '//span[@itemprop = "reviewCount"]/text()'),
        ('S_Accommodates',
         '//strong[contains(@data-reactid, "Accommodates=2.2")]/text()'),
        ('S_Bedrooms',
         '//strong[contains(@data-reactid, "Bedrooms=2.2")]/text()'),
        ('S_Bathrooms',
         '//strong[contains(@data-reactid, "Bathrooms=2.2")]/text()'),
        ('S_Numbeds', '//strong[contains(@data-reactid, "Beds=2.2")]/text()'),
        ('S_Bedtype',
         '//strong[contains(@data-reactid, "Bed type=2.2")]/text()'),
        ('S_Checkin',
         '//strong[contains(@data-reactid, "Check In=2.2")]/text()'),
        ('S_Checkout',
         '//strong[contains(@data-reactid, "Check Out=2.2")]/text()'),
        ('A_Availability', '//div[@class = "col-md-6"]/strong/text()'),
        # NOTE: matches purely on a layout class; unclear why this is
        # specific enough, so treat it as fragile.
        ('R_Value', '//div[@class = "col-sm-8"]/div/span/text()'),
    )
    for field, query in list_fields:
        property_site[field] = texts(query)

    # TODO: layout-class selector; replace with something more specific.
    property_site['R_Roomtype'] = texts('//div[@class = "col-sm-3"]/text()')[0]
    property_site['A_Cleaningfee'] = texts(
        '//strong[contains(@data-reactid, "Cleaning Fee=2.2")]/text()')
    return property_site
def parse(self, response):
    """Yield one ``AirbnbItem`` per listing card, then follow pagination.

    Listing cards are located with CSS selectors; the description overlay,
    the card body and the image panel of each card are walked in lockstep.
    After all cards, a request for the ``next_page`` link (if any) is
    yielded with this method as callback.

    :param response: a search-results page response.
    :returns: generator of items, optionally followed by one request.
    """

    def greatest_line(text):
        """Return the lexicographically greatest line of *text*, stripped.

        Returns ``''`` for text without any line instead of raising.
        """
        lines = text.splitlines()
        return max(lines).strip() if lines else ''

    # Overlay anchor that holds both the price amount and the currency mark.
    overlay_css = ('a.link-reset.panel-overlay-bottom-left'
                   '.panel-overlay-label.panel-overlay-listing-label')

    responseSelector = Selector(response)
    listing_item = responseSelector.css(
        'div.col-sm-12.row-space-2.col-md-6').css('div.listing')
    items_top = listing_item.css('div.panel-image.listing-img').css(
        'div.listing-description')
    items_bottom = listing_item.css(
        'div.panel-body.panel-card-section').css('div.media')
    items_image = listing_item.css('div.panel-image.listing-img')

    for bottom, top, image in zip(items_bottom, items_top, items_image):
        item = AirbnbItem()
        item['name'] = bottom.css('a').css('h3::text').extract()[0].strip()
        # Exactly two hrefs are expected: the host profile and the room.
        item['user_url'], item['room_url'] = bottom.xpath(
            'a/@href').extract()

        type_and_reviews = bottom.css(
            'div.text-muted.listing-location.text-truncate').css(
                'a::text').extract()
        item['room_type'] = greatest_line(type_and_reviews[0])
        if len(type_and_reviews) == 2:
            # Drop the two leading characters of the review text.
            item['reviews'] = greatest_line(type_and_reviews[1])[2:]

        item['summary'] = top.css('div.summary').css(
            'p::text').extract()[0].strip()
        item['address'] = top.css('p.address').css('p::text').extract()[0]
        item['image'] = image.css('a.media-photo.media-cover').css(
            'img').xpath('@src').extract()[0]

        # TODO: add cookie to get consistent prices
        overlay = image.css(overlay_css).css('div')
        item['price'] = int(overlay.css(
            'span.h3.text-contrast.price-amount::text').extract()[0])
        item['coin'] = ''.join(
            overlay.css('sup.h6.text-contrast::text').extract())
        yield item

    # pagination
    next_page = responseSelector.xpath(
        '//li[contains(@class, "next_page")]').xpath('a/@href')
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield Request(url, self.parse)
def parse(self, response):
    """Yield one loaded ``AirbnbItem`` per entry of a search-results JSON.

    The decoded body is expected to hold the listings under
    ``results_json.search_results``; each entry contributes values from
    its ``listing`` and ``pricing_quote`` objects. Host and room URLs are
    built from the respective ids.

    :param response: the JSON search response.
    :returns: generator of items produced by ``ItemLoader.load_item()``.
    """
    # NOTE(review): newer Scrapy releases expose the decoded body as
    # ``response.text``; confirm the target Scrapy version before switching.
    jsonresponse = json.loads(response.body_as_unicode())

    for result in jsonresponse['results_json']['search_results']:
        listing = result['listing']
        quote = result['pricing_quote']
        host_id = listing['primary_host']['id']
        room_id = listing['id']

        # Read every value first so a missing key aborts before the loader
        # is touched; the order matches the order fields are added in.
        values = (
            ('bedrooms', listing['bedrooms']),
            ('beds', listing['beds']),
            ('name', listing['name']),
            ('person_capacity', listing['person_capacity']),
            ('primary_host', listing['primary_host']['first_name']),
            ('host_id', host_id),
            ('host_url', "https://zh.airbnb.com/users/show/{}".format(host_id)),
            ('property_type', listing['property_type']),
            ('room_id', room_id),
            ('room_url', "https://zh.airbnb.com/rooms/{}".format(room_id)),
            ('is_new_listing', listing['is_new_listing']),
            ('public_address', listing['public_address']),
            ('room_type', listing['room_type']),
            ('star_rating', listing['star_rating']),
            ('reviews_count', listing['reviews_count']),
            ('guests', quote['guests']),
            ('amount', quote['rate']['amount']),
            ('currency', quote['rate']['currency']),
        )

        loader = ItemLoader(item=AirbnbItem(), response=response)
        for field, value in values:
            loader.add_value(field, value)

        # Function-call form prints the same thing on Python 2 and is valid
        # syntax on Python 3 (the statement form is not).
        print(loader)
        yield loader.load_item()
def parse_listing_contents(self, response):
    """Build an ``AirbnbItem`` from the room page's bootstrap metadata.

    The ``content`` attribute of the ``_bootstrap-room_options`` meta tag
    is decoded as JSON. ``hostId`` and ``nightly_price`` are read from the
    top level, the remaining fields from its ``airEventData`` object. The
    item always carries the response URL.

    :param response: the listing page response.
    :returns: generator yielding a single item.
    """
    item = AirbnbItem()

    matches = response.xpath(
        '//meta[@id="_bootstrap-room_options"]/@content').extract()
    if matches:
        bootstrap = json.loads(matches[0])
        event = bootstrap['airEventData']

        item['host_id'] = bootstrap['hostId']
        # These item fields share their name with the airEventData key.
        same_named = (
            'hosting_id',
            'room_type',
            'price',
            'bed_type',
            'person_capacity',
            'listing_lat',
            'listing_lng',
        )
        for key in same_named:
            item[key] = event[key]
        item['nightly_price'] = bootstrap['nightly_price']

    item['url'] = response.url
    yield item
def parse_details(self, response):
    """Extract listing details from a rendered room page via regexes.

    The room id comes from the URL, the rating and review count from the
    ``aria-label`` of the ratings button, the price from
    ``response.meta['price']`` and everything else from pattern matches
    against ``response.text``.

    Any pattern that does not match yields ``''`` for its field (``0`` for
    ``numRooms``) instead of raising ``AttributeError``. The per-category
    ratings are only extracted when a review count was found; otherwise
    they are ``''``.

    ``shortDesc``, ``isSuperhost`` and ``numBaths`` are intentionally not
    extracted.

    :param response: the room page response; ``meta`` must hold ``price``.
    :returns: generator yielding a single ``AirbnbItem``.
    :raises KeyError: if ``response.meta`` has no ``'price'`` entry.
    """

    def first_group(pattern, text, default=''):
        """Return group 1 of the first match of *pattern*, else *default*."""
        match = re.search(pattern, text)
        return match.group(1) if match else default

    print("-" * 50)
    item = AirbnbItem()
    body = response.text

    # ---------------------------- Overview ----------------------------
    room_id = first_group(r'rooms/([0-9]*)\?location', str(response.url))

    # Both the rating and the review count are read from this label.
    aria_label = str(
        response.xpath(
            '//button[@class="_ff6jfq"]/@aria-label').extract_first())
    rating = first_group(r'Rated ([0-5](.[0-9])?) out of 5', aria_label)
    num_reviews = first_group(r'from ([0-9]*) reviews', aria_label)

    price = response.meta['price']

    item['roomID'] = room_id
    item['numReviews'] = num_reviews
    item['price'] = price

    # ------------------------------ Host ------------------------------
    item['numHostReviews'] = response.xpath(
        '//span[@class="_e296pg"]/span[@class="_1uhfauip"]/text()'
    ).extract_first()

    # ----------------- Numbers of rooms/beds/guests -------------------
    item['numBeds'] = first_group(
        r'"bed_label":"(.).*","bedroom_label"', body)
    item['numRooms'] = first_group(
        r'"bedroom_label":"([0-9][0-9]?).*","guest_label"', body, default=0)

    # Two label layouts are tried: number inside the label, then leading.
    num_guests = first_group(
        r'"guest_label":".{1,8}([0-9][0-9]?).{1,8}",', body, default=None)
    if num_guests is None:
        num_guests = first_group(
            r'"guest_label":"([0-9][0-9]?) guest.*', body)
    item['numGuests'] = num_guests

    # ------------------- Types of rooms/baths/beds --------------------
    item['bathType'] = first_group(
        r'"bathroom_label":"[0-9].?[0-9]? (.*)","bed_label"', body)

    # Prefer the text after a leading digit; otherwise accept a short
    # (1-12 character) label as-is.
    bedroom_type = first_group(
        r'"bedroom_label":"[0-9] (.*)","guest_label"', body, default=None)
    if bedroom_type is None:
        bedroom_type = first_group(
            r'"bedroom_label":"(..?.?.?.?.?.?.?.?.?.?.?)","guest_label"',
            body)
    item['bedroomType'] = bedroom_type

    item['bedType'] = first_group(
        r'"bed_label":"[0-9] (.*)","bedroom_label"', body)

    # --------------------------- Coordinates --------------------------
    # Accept signed values and up to three integer digits of longitude so
    # listings outside the two-digit negative-longitude band still match.
    coordinates = re.search(
        r'"listing_lat":(-?[0-9]{1,2}\.[0-9]*),'
        r'"listing_lng":(-?[0-9]{1,3}\.[0-9]*),',
        body)
    item['latitude'] = coordinates.group(1) if coordinates else ''
    item['longitude'] = coordinates.group(2) if coordinates else ''

    # ----------------------------- Ratings ----------------------------
    # Sometimes the ratings are not available; they are only looked up
    # when a review count was found.
    rating_patterns = (
        ('accuracy', r'"accuracy_rating":([0-9][0-9]?),"'),
        ('communication', r'"communication_rating":([0-9][0-9]?),"'),
        ('cleanliness', r'"cleanliness_rating":([0-9][0-9]?),"'),
        ('location', r'"location_rating":([0-9][0-9]?),"'),
        ('checkin', r'"checkin_rating":([0-9][0-9]?),"'),
        # Previously read "cleanliness_rating", duplicating 'cleanliness'.
        ('value', r'"value_rating":([0-9][0-9]?),"'),
        ('guestSatisfaction',
         r'"guest_satisfaction_overall":([0-9][0-9][0-9]?),"'),
    )
    item['rating'] = rating if num_reviews else ''
    for field, pattern in rating_patterns:
        item[field] = first_group(pattern, body) if num_reviews else ''

    yield item