def parse_object(self, response):
    """Scrape one 123Wonen.nl listing detail page and yield it as an item dict.

    Args:
        response: response for the listing detail page; it is handed to the
            ``Extractor`` / ``Structure`` helpers together with CSS selectors.

    Yields:
        A single dict with the keys ``street``, ``city``, ``region``,
        ``volume``, ``rooms``, ``availability``, ``type``, ``pricePerMonth``,
        ``reference``, ``estateAgent`` and ``images``.

    Specification values that the lookup does not return as a string are
    passed through unchanged.
    """
    spec_selector = '.pand-specs.panddetail-desc li > span'

    # The active breadcrumb is split on ' - ': the first piece is used as the
    # city, everything after it is re-joined and used as the street.
    breadcrumb_parts = Extractor.string(response, 'a.active span').split(' - ')
    city = Extractor.string(breadcrumb_parts[0])
    street = ' - '.join(breadcrumb_parts[1:])

    volume = Structure.find_in_definition(response, spec_selector, 'Woonoppervlakte')
    if isinstance(volume, str):
        volume = Extractor.volume(volume)

    # The cleaned value is now assigned back; previously the result of
    # Extractor.string() was computed and thrown away.
    rooms = Structure.find_in_definition(response, spec_selector, 'Kamers')
    if isinstance(rooms, str):
        rooms = Extractor.string(rooms)

    # Same fix as for rooms; the local is also renamed so it no longer
    # shadows the builtin ``type``.
    property_type = Structure.find_in_definition(response, spec_selector, 'Type')
    if isinstance(property_type, str):
        property_type = Extractor.string(property_type)

    # Only the text before the first '-' of the price element is parsed.
    price_text = Extractor.string(response, '.panddetail-price')
    price = Extractor.euro(price_text.split('-')[0])

    # Guard on the availability value itself; the old check looked at the
    # property type, so availability was skipped or processed based on an
    # unrelated field.
    availability = Structure.find_in_definition(response, spec_selector, 'Beschikbaarheid')
    if isinstance(availability, str):
        availability = Extractor.string(availability)

    yield {
        'street': street,
        'city': city,
        'region': self.region,
        'volume': volume,
        'rooms': rooms,
        'availability': availability,
        'type': property_type,
        'pricePerMonth': price,
        'reference': Extractor.urlWithoutQueryString(response),
        'estateAgent': '123Wonen.nl',
        'images': Extractor.images(response, 'a[data-fancybox="group1"]::attr(href)', True),
    }
def parse_object(self, response):
    """Scrape one Domica listing detail page and yield it as an item dict.

    Args:
        response: response for the listing detail page. ``response.meta``
            must contain an ``availability`` entry, which is copied into the
            item as-is.

    Yields:
        A single dict with the keys ``street``, ``city``, ``region``,
        ``volume``, ``rooms``, ``availability``, ``type``, ``pricePerMonth``,
        ``reference``, ``estateAgent`` and ``images``.

    Raises:
        IndexError: if the ``.address`` text contains no comma, because the
            part after the first comma is required to derive the city.
    """
    # Spelling "feautures" is kept exactly as in the original selector;
    # presumably it mirrors the class name used by the site.
    feature_selector = 'table.table-striped.feautures tr td'

    # Heading is split on ',': first piece is the street, the last
    # space-separated word of the second piece is used as the city.
    address_parts = Extractor.string(response, '.address').split(',')
    street = address_parts[0]
    city = address_parts[1].split(' ')[-1]

    volume = Structure.find_in_definition(
        response, feature_selector, 'Gebruiksoppervlakte wonen')
    if isinstance(volume, str):
        volume = Extractor.volume(volume)

    rooms = Structure.find_in_definition(response, feature_selector, 'Aantal kamers')
    if isinstance(rooms, str):
        # Drop everything from ' (w' onwards.
        rooms = rooms.split(' (w')[0]

    property_type = Structure.find_in_definition(response, feature_selector, 'Type object')
    if isinstance(property_type, str):
        # Keep only the last entry of the ', '-separated list.
        property_type = property_type.split(', ')[-1]

    # Guard on the price value itself; the old check tested the property
    # type, so a listing without a string type kept its raw, unparsed price.
    price = Structure.find_in_definition(response, feature_selector, 'Prijs')
    if isinstance(price, str):
        price = Extractor.euro(price.split('-')[0])

    yield {
        'street': street,
        'city': city,
        'region': self.region,
        'volume': volume,
        'rooms': rooms,
        'availability': response.meta['availability'],
        'type': property_type,
        'pricePerMonth': price,
        'reference': Extractor.urlWithoutQueryString(response),
        'estateAgent': 'Domica',
        # NOTE(review): `estate_crawler` is not a standard <img> attribute;
        # this looks like a search-and-replace casualty of `src` - confirm.
        'images': Extractor.images(
            response,
            '#cycle-slideshow2 > a.gallery-link img::attr(estate_crawler)',
            True),
    }
def parse_object(self, response):
    """Scrape one NederWoon listing detail page and yield it as an item dict.

    Args:
        response: response for the listing detail page; it is handed to the
            ``Extractor`` / ``Structure`` helpers together with CSS selectors.

    Yields:
        A single dict with the keys ``street``, ``city``, ``region``,
        ``volume``, ``rooms``, ``availability``, ``type``, ``pricePerMonth``,
        ``reference``, ``estateAgent`` and ``images``.
    """
    spec_selector = '.table-striped.table-specs td'

    street = Extractor.string(response, 'h1.text-regular')
    city = Extractor.string(response, '.col-md-8 .fixed-lh p.color-medium')
    availability = Extractor.string(
        response, '.col-md-8 .horizontal-items ul li:last-child')

    # Sometimes Nederwoon mistakenly adds the zip code in the city field, filter it out.
    # The pattern is now a raw string: '\d' and '\s' in a normal literal are
    # invalid escape sequences and trigger a warning on current Python.
    # Note that the trailing replace() also removes every space from the city.
    city = re.sub(r'\d{4}?\s*[a-zA-Z]{2}', '', city).replace(' ', '')

    rooms = Extractor.string(
        Structure.find_in_definition(response, spec_selector, 'Aantal kamers'))
    # The extra positional argument 2 is forwarded unchanged; its meaning is
    # defined by Structure.find_in_definition (not visible here).
    price = Extractor.euro(
        Structure.find_in_definition(response, spec_selector, 'Totale huur per maand', 2))
    volume = Extractor.volume(
        Structure.find_in_definition(response, spec_selector, 'Woonoppervlakte'))
    property_type = Extractor.string(
        Structure.find_in_definition(response, spec_selector, 'Soort woonruimte'))

    yield {
        'street': street,
        'city': city,
        'region': self.region,
        'volume': volume,
        'rooms': rooms,
        'availability': availability,
        'type': property_type,
        'pricePerMonth': price,
        'reference': Extractor.urlWithoutQueryString(response),
        'estateAgent': 'NederWoon',
        # NOTE(review): `estate_crawler` is not a standard <img> attribute;
        # this looks like a search-and-replace casualty of `src` - confirm.
        'images': Extractor.images(
            response, '.slider.slider-media > div img::attr(estate_crawler)'),
    }
def parse_object(self, response):
    """Scrape one Rotsvast listing detail page and yield it as an item dict.

    Args:
        response: response for the listing detail page. ``response.meta``
            must contain ``street`` and ``city`` entries, which are copied
            into the item as-is.

    Yields:
        A single dict with the keys ``street``, ``city``, ``region``,
        ``volume``, ``rooms``, ``availability``, ``type``, ``pricePerMonth``,
        ``reference``, ``estateAgent`` and ``images``.
    """
    property_selector = '#properties .row > .col-xs-6'

    availability = Extractor.string(
        Structure.find_in_definition(response, property_selector, 'Ingangsdatum'))
    rooms = Extractor.string(
        Structure.find_in_definition(response, property_selector, 'Aantal kamers'))

    # Only split when the lookup actually produced a string. Calling
    # .split() unconditionally raised AttributeError for listings without a
    # "Totale huur" row; a non-string result is now handed to
    # Extractor.euro() untouched, like the other fields in this method.
    price_text = Structure.find_in_definition(response, property_selector, 'Totale huur')
    if isinstance(price_text, str):
        # Keep only the text before the first '-'.
        price_text = price_text.split('-')[0]
    price = Extractor.euro(price_text)

    volume = Extractor.volume(
        Structure.find_in_definition(response, property_selector, 'Oppervlakte (ca.)'))
    property_type = Extractor.string(
        Structure.find_in_definition(response, property_selector, 'Soort'))

    yield {
        'street': response.meta['street'],
        'city': response.meta['city'],
        'region': self.region,
        'volume': volume,
        'rooms': rooms,
        'availability': availability,
        'type': property_type,
        'pricePerMonth': price,
        'reference': Extractor.urlWithoutQueryString(response),
        'estateAgent': 'Rotsvast',
        # NOTE(review): `estate_crawler` is not a standard <img> attribute;
        # this looks like a search-and-replace casualty of `src` - confirm.
        'images': Extractor.images(response, '.slider img::attr(estate_crawler)', True),
    }
def parse_object(self, response):
    """Turn an Eervast listing detail page into a single item dict.

    Args:
        response: response for the listing detail page. ``response.meta``
            must contain ``street`` and ``city`` entries, which are copied
            into the item as-is.

    Yields:
        One dict with the keys ``street``, ``city``, ``region``, ``volume``,
        ``rooms``, ``availability``, ``type``, ``pricePerMonth``,
        ``reference``, ``estateAgent`` and ``images``.
    """
    info_selector = '.house-info div'

    # Type and room count are used exactly as the lookup returns them.
    dwelling_type = Structure.find_in_definition(response, info_selector, 'Type woning')

    living_area = Structure.find_in_definition(response, info_selector, 'Woonoppervlak')
    volume = Extractor.volume(living_area)

    room_count = Structure.find_in_definition(response, info_selector, 'Aantal kamers')

    # The price is read from the second child of the info block.
    monthly_price = Extractor.euro(response, '.house-info div:nth-child(2)')

    available_from = Structure.find_in_definition(response, '#tab-1 tr td', 'Aanvaarding')

    item = {
        'street': response.meta['street'],
        'city': response.meta['city'],
        'region': self.region,
        'volume': volume,
        'rooms': room_count,
        'availability': available_from,
        'type': dwelling_type,
        'pricePerMonth': monthly_price,
        'reference': Extractor.urlWithoutQueryString(response),
        'estateAgent': 'Eervast',
        'images': Extractor.images(response, '.tab-content > a.gallery::attr(href)', True),
    }
    yield item
def parse(self, response):
    """Walk a listing overview page and schedule the detail and next pages.

    Args:
        response: response for the overview page.

    Yields:
        ``scrapy.Request`` objects: one per listing that passes the status
        and type filters (callback ``self.parse_object``), followed by at
        most one request for the next overview page (callback ``self.parse``).
    """
    page = Selector(response)

    unavailable_statuses = {'verhuurd', 'in optie'}
    skipped_types = {
        'garagebox', 'berging/opslag', 'kantoorruimte', 'loods',
        'parkeerplaats', 'winkelpand',
    }

    # The previous version also called .extract() on the selection and
    # discarded the result, and enumerated with an unused index; both were
    # dead code and are gone.
    for listing in page.css('.pandlist-container'):
        # Determine if the object is still available for rent
        status = str(Extractor.string(listing, '.pand-status')).lower()
        if status in unavailable_statuses:
            continue

        # Skip crawling storage spaces and garages. A listing without a
        # string "Type" value used to raise AttributeError on .lower() and
        # abort the whole page; it is now treated as "not excluded".
        raw_type = Structure.find_in_definition(listing, '.pand-specs li > span', 'Type')
        listing_type = raw_type.lower() if isinstance(raw_type, str) else ''
        if listing_type in skipped_types:
            continue

        yield scrapy.Request(
            Extractor.string(
                listing, 'a.textlink-design:contains("Details")::attr(href)'),
            self.parse_object)

    # Crawl the next pages
    next_page_selector = '.productBrowser a:contains("volgende")'
    next_page_link = page.css(next_page_selector).extract_first()
    if isinstance(next_page_link, str):
        yield scrapy.Request(
            Extractor.url(response, page, next_page_selector + '::attr(href)'),
            self.parse,
        )
def parse_object(self, response):
    """Turn a Van der Hulst listing detail page into a single item dict.

    Args:
        response: response for the listing detail page; it is handed to the
            ``Extractor`` / ``Structure`` helpers together with CSS selectors.

    Yields:
        One dict with the keys ``street``, ``city``, ``region``, ``volume``,
        ``rooms``, ``availability``, ``type``, ``pricePerMonth``,
        ``reference``, ``estateAgent`` and ``images``.
    """
    overview_selector = '.property-overview dl > *'

    def overview_value(label):
        # Look up one labelled entry of the property overview list.
        return Structure.find_in_definition(response, overview_selector, label)

    # Values are gathered in the same order the item lists them.
    street = Extractor.string(response, 'h1.entry-title')
    # The place name is title-cased after extraction.
    city = Extractor.string(overview_value('Plaats')).title()
    region = self.region
    volume = Extractor.volume(overview_value('Woonoppervlakte'))
    rooms = Extractor.string(overview_value('Kamers'))
    availability = Extractor.string(overview_value('Status'))
    property_type = Extractor.string(overview_value('Type'))
    monthly_price = Extractor.euro(overview_value('Prijs'))
    reference = Extractor.urlWithoutQueryString(response)
    images = Extractor.images(response, '.property-detail-gallery a::attr(href)', True)

    yield {
        'street': street,
        'city': city,
        'region': region,
        'volume': volume,
        'rooms': rooms,
        'availability': availability,
        'type': property_type,
        'pricePerMonth': monthly_price,
        'reference': reference,
        'estateAgent': 'Van der Hulst',
        'images': images,
    }