def parse_common(self, response):
    """Start an item loader holding the fields that are common to both.

    The returned object is the loader itself, not a loaded item: it already
    carries the response URL and the page title so that the caller can keep
    adding fields before calling ``load_item()``.
    """
    self.shutdown_on_error()

    loader = ItemLoader(ApartmentItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_css('title', 'h1.title::text')
    return loader
def parse_item(self, response):
    """Extract the apartment details from a flat page.

    @url https://www.immowelt.de/expose/2GT7W4N
    @returns items 1 1
    @scrapes url title address rooms size cold_rent_price warm_rent_price additional_price heating_price description
    """
    self.shutdown_on_error()

    loader = ItemLoader(ApartmentItem(), response=response)
    loader.add_value('url', response.url)

    # Fields that are read from one fixed location on the page.
    fixed_xpaths = (
        ('title', '//h1/text()'),
        ('address', '//div[@class="location"]/span[@class="no_s"]/text()'),
        ('rooms',
         '//div[contains(@class, "quickfacts")]//div[@class="hardfact rooms"]/text()[1]'),
        ('size',
         '//div[contains(@class, "hardfacts")]/div[contains(@class, "hardfact")][2]/text()[1]'),
        ('cold_rent_price',
         '//div[contains(@class, "hardfacts")]/div[contains(@class, "hardfact")][1]/strong/text()'),
        ('description',
         '//div[contains(@class, "section_label")][starts-with(normalize-space(.), "Objekt")]'
         '/following-sibling::div/child::p/text()'),
    )
    for field, xpath in fixed_xpaths:
        loader.add_xpath(field, xpath)

    # Price fields live in a label/content table; the row is found by the
    # text its label cell starts with.
    row_xpath = (
        '//div[contains(@class, "datatable")]/div[contains(@class, "datarow")]'
        '/div[contains(@class, "datalabel")][starts-with(normalize-space(.), "{}")]'
        '/following-sibling::div[contains(@class, "datacontent")]/text()'
    )
    labelled_prices = (
        ('warm_rent_price', 'Warmmiete'),
        ('additional_price', 'Nebenkosten'),
        ('heating_price', 'Heizkosten'),
    )
    for field, label in labelled_prices:
        loader.add_xpath(field, row_xpath.format(label))

    yield loader.load_item()
def parse_item(self, response):
    """Parse a page with an apartment.

    Title, availability, description and address come straight from the
    page; rent, size and rooms are looked up in the meta-data table by the
    text of their header cell. A missing neighborhood node leaves the field
    unset instead of aborting the whole item.

    @url https://www.city-wohnen.de/eng/berlin/32608-furnished-apartment-berlin-friedrichshain-pettenkoferstrasse
    @returns items 1 1
    @scrapes url title availability description neighborhood address warm_rent size rooms
    """
    self.shutdown_on_error()
    item = ItemLoader(ApartmentItem(), response=response)
    item.add_value('url', response.url)
    item.add_css('title', 'div.text_data > h2::text')
    item.add_css('availability', 'div.row > div.text_data > p::text')
    item.add_css('description', 'div.object_details div.col_left p::text')

    # Only the first match is wanted. extract_first() returns None when the
    # node is absent, whereas extract()[0] raised IndexError and lost the item.
    neighborhood = response.css(
        'div.object_meta div.container div.text_data p strong::text'
    ).extract_first()
    if neighborhood is not None:
        item.add_value('neighborhood', neighborhood)

    item.add_xpath('address', "//li[@class='map']/a/@href")

    # Header cells and data cells are paired purely by position.
    keys = response.css(
        'div.object_meta table.object_meta_data th::text').extract()
    values = response.css(
        'div.object_meta table.object_meta_data td::text').extract()
    features = dict(zip(keys, values))
    item.add_value('warm_rent', features.get('Rent'))
    item.add_value('size', features.get('Size'))
    item.add_value('rooms', features.get('Room/s'))

    return item.load_item()
def parse_item(self, response):
    """Extract the apartment details from a flat page.

    Apart from the URL, the title and the address, every field is read from
    the text of the element that carries a known ``id`` attribute.

    @url https://www.immonet.de/angebot/32437621?drop=sel&related=false
    @returns items 1 1
    @scrapes url title address rooms size cold_rent_price warm_rent_price additional_price description
    @scrapes equipment location
    """
    self.shutdown_on_error()

    loader = ItemLoader(ApartmentItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('title', '//h1/text()')
    loader.add_xpath(
        'address',
        '//div[contains(@class, "row")]//span[@id = "infobox-static-address"]/text()')

    # Item field -> id of the element whose text holds the value.
    element_ids = (
        ('rooms', 'equipmentid_1'),
        ('size', 'areaid_1'),
        ('cold_rent_price', 'priceid_2'),
        ('warm_rent_price', 'priceid_4'),
        ('additional_price', 'priceid_20'),
        ('heating_price', 'priceid_5'),
        ('description', 'objectDescription'),
        ('equipment', 'ausstattung'),
        ('location', 'locationDescription'),
        ('other', 'otherDescription'),
    )
    by_id_xpath = '//*[@id="{}"]/text()'
    for field, element_id in element_ids:
        loader.add_xpath(field, by_id_xpath.format(element_id))

    yield loader.load_item()
def parse_item(self, response):
    """Parse an ad page, with an apartment.

    The address is assembled from two text nodes (street and zip code), the
    neighborhood is the part of the page title after ``Berlin-`` and the
    number of rooms is the first run of digits in the rooms description.
    When one of those source nodes is missing the corresponding field is
    left unset instead of raising ``IndexError`` and losing the whole item.

    @url https://www.berlinovo.de/en/apartment/2-room-suite-house-heinrich-heine-stra-e-18-24-berlin-mitte
    @returns items 1 1
    @scrapes url title description location address other neighborhood rooms
    """
    self.shutdown_on_error()
    item = ItemLoader(ApartmentItem(), response=response)
    item.add_value('url', response.url)
    item.add_css('title', 'h1.title::text')

    # NOTE(review): the class names inside contains(...) below are not
    # quoted, so XPath reads them as child-element names rather than
    # strings and the class filter most likely matches every div. Left
    # unchanged because the positional steps may have been tuned against
    # that behavior - confirm against the live page before quoting them.
    item.add_xpath(
        'description',
        '//div[contains(@class, field-name-body)]/div/div[4]/div/div/p/text()'
    )
    item.add_xpath(
        'location',
        '//div[contains(@class, field-name-field-position)]/div/div[5]/div[2]/div/text()'
    )

    # extract_first(default='') keeps .strip() safe when a node is absent.
    zipcode = response.xpath(
        '//*[@id="block-views-aktuelle-wohnung-block-3"]/div/div/div/div/div[3]/div/span/text()[1]'
    ).extract_first(default='').strip()
    street = response.xpath(
        '//*[@id="block-views-aktuelle-wohnung-block-3"]/div/div/div/div/div[3]/div/span/text()[2]'
    ).extract_first(default='').strip()
    # Same "street, zipcode" result as before when both are present; a
    # missing part no longer leaves a dangling separator.
    address_parts = [part for part in (street, zipcode) if part]
    if address_parts:
        item.add_value('address', u', '.join(address_parts))

    item.add_xpath(
        'equipment',
        '//*[@id="block-views-aktuelle-wohnung-block-3"]/div/div/div/div/div[18]/div/div/ul/li/span/text()'
    )
    item.add_xpath(
        'warm_rent',
        '//*[@id="block-views-aktuelle-wohnung-block-3"]/div/div/div/div/div[5]/span[2]/text()'
    )
    item.add_xpath(
        'other',
        '//*[@id="block-views-aktuelle-wohnung-block-3"]/div/div/div/div/div/span/text()'
    )

    page_title = response.css('#page-title::text').extract_first()
    if page_title is not None:
        item.add_value('neighborhood',
                       page_title.strip().split('Berlin-')[-1])

    # NOTE(review): same unquoted class name as above - confirm.
    room_list = response.xpath(
        '//*[@id="block-views-aktuelle-wohnung-block-3"]/div/div/div/div/div'
        '[contains(@class, views-field-field-rooms-description)]/div/text()'
    ).extract()
    # First run of digits, as before, but without failing when none exists.
    room_match = re.search(r'[0-9]+', ' '.join(room_list))
    if room_match:
        item.add_value('rooms', room_match.group())

    yield item.load_item()
def parse(self, response):
    """Parse the items from the main list, then start requests to get more details.

    The number of rooms are only available on the list; maybe on purpose,
    to make scraping harder.

    Every ``<figure>`` of the list page is re-parsed on its own and turned
    into a partial item (url, rooms, size, address). A figure without a
    link is skipped, and a missing rooms or size node leaves that field
    unset, so that a single unexpected figure can no longer raise
    ``IndexError`` and prevent the parent ``parse`` from running.
    """
    parser = etree.HTMLParser()
    optional_fields = (
        ('rooms', '//p/span[@class="rooms"]/text()'),
        ('size', '//p/span[@class="areaSize"]/text()'),
    )
    for html in response.xpath('//figure').extract():
        tree = etree.fromstring(html, parser)

        links = tree.xpath('//a/@href')
        if not links:
            # Without a link there is no URL to attach the data to.
            continue

        item = ApartmentItem()
        item['url'] = response.urljoin(links[0])
        for field, xpath in optional_fields:
            texts = tree.xpath(xpath)
            if texts:
                item[field] = texts[0]
        # normalize-space() evaluates to a string (empty if there is no h3).
        item['address'] = tree.xpath('normalize-space(//h3)')
        yield item

    # Hand over to the parent implementation for its own results.
    yield from super().parse(response)
def parse_item(self, response):
    """Parse a page with an apartment.

    Most fields are read from the page itself. The address is fetched
    separately from the ``/karte`` sub-page with a synchronous HTTP request
    and is only set when that page answers with status 200 and the address
    snippet is found in it.

    @url https://www.akelius.de/en/search/apartments/osten/berlin/2.7037.16
    @returns items 1 1
    @scrapes url title warm_rent_price size availability cold_rent_price description address
    """
    self.shutdown_on_error()
    item = ItemLoader(ApartmentItem(), response=response)
    item.add_value('url', response.url)
    item.add_xpath('title', '//h2/text()')
    item.add_xpath(
        'warm_rent_price',
        '//h2/following-sibling::p[starts-with(normalize-space(.), "Total rent")]/text()'
    )
    item.add_xpath('size', '//h2//following-sibling::p[2]/text()')
    item.add_xpath(
        'location',
        '//h3[starts-with(normalize-space(.), "Location")]/following-sibling::div//span/text()'
    )
    item.add_xpath('availability', '//h2//following-sibling::p[4]/text()')
    item.add_xpath(
        'cold_rent_price',
        '//h3[starts-with(normalize-space(.), "Apartment")]/following-sibling::div[1]/p[2]/span/text()'
    )
    item.add_xpath(
        'description',
        '//h3[starts-with(normalize-space(.), "Building")]/following-sibling::div//span/text()'
    )

    # The map is shown with JavaScript; get the HTML
    # and use a regex to extract the part of the script with the address.
    # A timeout is set because requests otherwise waits indefinitely for an
    # unresponsive server, which would stall this callback.
    map_request_timeout = 10  # seconds
    map_response = requests.get(response.url + '/karte',
                                timeout=map_request_timeout)
    if map_response.status_code == 200:
        html_string = ''.join(self.ADDRESS_REGEX.findall(
            map_response.text))
        # When the regex finds nothing there is no document to parse;
        # leave the address unset rather than feeding lxml an empty string.
        if html_string.strip():
            # Extract the address from the HTML.
            root = etree.fromstring(html_string, etree.HTMLParser())
            if root is not None:
                item.add_value('address',
                               ', '.join(root.xpath('//p/text()')))

    return item.load_item()
def parse_item(self, response):
    """Extract the apartment details from an ad page.

    The full address text is cut at the city name: everything up to and
    including the city becomes the address, whatever follows becomes the
    neighborhood. When the city name does not occur, the whole text is used
    as the address.

    @url https://www.immobilienscout24.de/expose/93354819
    @returns items 1 1
    @scrapes url title address neighborhood cold_rent_price warm_rent_price rooms
    """
    self.shutdown_on_error()

    loader = ItemLoader(ApartmentItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_css('title', 'h1#expose-title::text')

    pre_xpath = "//div/pre[contains(@class, '{}')]/text()"
    for field, css_class in self.DIV_PRE_MAPPING.items():
        loader.add_xpath(field, pre_xpath.format(css_class))

    address_texts = response.xpath(
        "//span[@data-qa='is24-expose-address']/div//text()").extract()
    full_address = ''.join(address_texts).strip()

    before_city, *after_city = full_address.split(self.CITY)
    if after_city:
        street_zip = before_city + self.CITY
        street_zip = street_zip.strip(' ,').replace(' (zur Karte) ', '')
        loader.add_value('address', street_zip)
        loader.add_value('neighborhood', ''.join(after_city).strip(' ,'))
    else:
        loader.add_value('address', full_address)

    css_fields = (
        ('cold_rent_price', 'div.is24qa-kaltmiete::text'),
        ('warm_rent_price', 'dd.is24qa-gesamtmiete::text'),
        ('rooms', 'div.is24qa-zi::text'),
    )
    for field, selector in css_fields:
        loader.add_css(field, selector)

    loader.add_xpath('size',
                     '//div[contains(@class, "is24qa-flaeche ")]/text()')
    loader.add_xpath(
        'active',
        '//div[contains(@class, "status-message")]'
        '/h3[starts-with(normalize-space(.), "Angebot")]/text()')

    yield loader.load_item()
def parse_item(self, response):
    """Extract the apartment details from an apartment page.

    Numeric facts are read from the info table rows (matched by row id) and
    the free-text sections from the paragraphs that follow a heading
    starting with a known word.

    @url http://www.merkur-berlin.de/?page_id=39&showExpose=1&exposeID=926C081BECA043C9BE7756469D94722F
    @returns items 1 1
    @scrapes url title address rooms size warm_rent description location
    """
    self.shutdown_on_error()

    loader = ItemLoader(ApartmentItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('title', '//h4[@class="entry-title"]/text()')
    loader.add_xpath('address', '//address/text()')

    # Item field -> suffix of the table row id.
    table_rows = (
        ('rooms', 'Rooms'),
        ('size', 'AreaLiving'),
        ('warm_rent', 'PriceWarmmiete'),
        ('cold_rent', 'Price'),
    )
    row_xpath = ('//div[@class="infotables"]//tr[@id="infotable_{info}"]'
                 '/td[@class="infotable_value"]/text()')
    for field, row_id in table_rows:
        loader.add_xpath(field, row_xpath.format(info=row_id))

    # Item field -> text the section heading starts with.
    text_sections = (
        ('description', 'Objekt'),
        ('equipment', 'Ausstattung'),
        ('location', 'Lage'),
        ('other', 'Mehr Angebote'),
    )
    section_xpath = ('//div[@class="infoblock"]'
                     '/h2[starts-with(normalize-space(.), "{h2}")]'
                     '/following-sibling::p/text()')
    for field, heading in text_sections:
        loader.add_xpath(field, section_xpath.format(h2=heading))

    return loader.load_item()