def parseLocations(self, response):
    """Yield one loaded item per listing block found on a locations page.

    The build type is resolved from ``response.url`` through
    ``self._getBuildType``.  For ``'Display Locations'`` every child ``div``
    of the ``view-content`` container is scraped; for any other build type
    every promoted ``panel-home`` node is scraped instead.

    Args:
        response: The downloaded listing page.

    Yields:
        The result of ``RealtyLoader.load_item()`` for each listing block.
    """
    # The Referer header is deliberately not read here: this callback has no
    # use for it, and decoding a missing header raises AttributeError.
    BuildType = self._getBuildType(response.url)
    hxs = HtmlXPathSelector(response)

    if BuildType == 'Display Locations':
        blocks_query = '//div[@class="view-content"]/div'
        # (item field, XPath relative to one listing block)
        field_queries = (
            ('HomeDesignMainImage',
             './/div[@class="views-field views-field-field-hero"]/div/img/@src'),
            ('DisplayLocation',
             './/div[@class="views-field views-field-field-location-description"]/div/text()'),
            ('BuilderEmailAddress',
             './/div[@class="views-label views-label-field-contact-email"]/div/a/text()'),
            ('OtherInclusions',
             './/div[@class="views-field views-field-field-contact-mobile"]/div/text()'),
            ('OtherInclusions1',
             './/div[@class="views-field views-field-field-opening-hours"]/div/text()'),
            ('OtherInclusions2',
             './/div[@class="views-field views-field-field-location-map"]/div/a/@href'),
            ('BuilderName',
             './/div[@class="views-field views-field-title"]/span/text()'),
        )
    else:
        blocks_query = '//div[@class="panel panel-home node node-dhfs node-promoted"]'
        field_queries = (
            ('HomeDesignMainImage', './/div[@class="panel-image"]/img/@src'),
            ('BasePrice', './/div[@class="panel-footer"]/ul/li/text()'),
            ('BuilderName', './/div[@class="panel-footer"]/text()'),
        )

    for data in hxs.xpath(blocks_query):
        l = RealtyLoader(RealtyspidersItem(), hxs)
        l.add_value('url', response.url)
        l.add_value('BuildType', BuildType)
        l.add_value('BuilderLogo', self.logo)
        # Only the first match of each query is kept; a missing node yields
        # None, which is still handed to the loader.
        for field, query in field_queries:
            l.add_value(field, data.xpath(query).extract_first())
        yield l.load_item()
def parseItem(self, response):
    """Build a single item from a home-design detail page.

    The build type is resolved from the request's Referer header through
    ``self._getBuildType``.  ``'Display Homes'`` pages take their address
    from ``response.meta['address']``; every other build type gets a
    ``Storey`` of ``'1'`` when ``response.meta['storey']`` is truthy and
    ``'2'`` otherwise.

    Args:
        response: The detail page, with ``meta`` populated as described.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.
    """
    referer = response.request.headers.get('Referer', None).decode("utf-8")
    selector = HtmlXPathSelector(response)
    BuildType = self._getBuildType(referer)

    loader = RealtyLoader(RealtyspidersItem(), selector)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', BuildType)
    loader.add_value('BuilderLogo', self.logo)

    # NOTE(review): the extent of this branch is inferred; everything after
    # it is assumed to apply to every build type.
    if BuildType == 'Display Homes':
        loader.add_value('Lot_BlockAddress', response.meta['address'])
    elif response.meta['storey']:
        loader.add_value('Storey', '1')
    else:
        loader.add_value('Storey', '2')

    loader.add_xpath('DesignName',
                     '//div[@class="banner-content stick-bar"]//h2/text()')

    # Room counts sit in the <li> that wraps each icon span.
    for field, icon_class in (('Bedrooms', 'ico-beds'),
                              ('Bathrooms', 'ico-baths'),
                              ('Garage', 'ico-garage')):
        loader.add_xpath(
            field,
            '//span[@class="{}"]/ancestor::li/text()'.format(icon_class))

    # Specification table: the value is the cell following a <th> label.
    spec_cell = '//th[text()="{}"]/following-sibling::td/text()'
    # NOTE(review): HouseWidth is kept disabled, as found; confirm.
    # loader.add_xpath('HouseWidth', spec_cell.format('House Width'))
    for field, heading in (('HouseLength', 'House Length'),
                           ('GarageDimension', 'Garage'),
                           ('AlfrescoDimension', 'Alfresco'),
                           ('Alfresco_Yes_No', 'Alfresco')):
        loader.add_xpath(field, spec_cell.format(heading))

    if BuildType == 'Portfolio':
        loader.add_xpath('Squares',
                         'string(//div[@class="col-3 floor-plan-legend"]/p)',
                         re='(?<=AREA|Area).*')
    else:
        loader.add_xpath('Squares', spec_cell.format('Total Area'))

    site_url = self.start_urls[0]
    # First start URL without its final character.
    site_url_trimmed = site_url[0:-1]

    # NOTE(review): BrochureImage_pdf is kept disabled, as found; confirm.
    # loader.add_xpath('BrochureImage_pdf',
    #                  '//a[text()="Download the Floor Plan"]/@href',
    #                  myRefer=site_url_trimmed)
    loader.add_xpath('InclusionsImage_pdf',
                     '//a[text()="Download the Inclusions Brochure "]/@href',
                     myRefer=site_url_trimmed)
    loader.add_xpath('BasePrice',
                     '//a[text()="Download the Price List"]/@href',
                     myRefer=site_url_trimmed)
    loader.add_xpath('FloorPlanImage1',
                     '//div[@class="col-wrap floor-plan-box"]//img/@src',
                     myRefer=site_url)
    loader.add_xpath('HomeDesignMainImage',
                     '//section[@id="overview-anhor"]//img/@src',
                     myRefer=site_url)

    # Image1 .. Image15 come from the matching slide of the carousel.
    slide_image = '//ul[@class="slides"]/li[{}]/img/@src'
    for position in range(1, 16):
        loader.add_xpath('Image{}'.format(position),
                         slide_image.format(position),
                         myRefer=site_url_trimmed)

    return loader.load_item()
def parseItem(self, response):
    """Yield one item per floor plan listed on a design page.

    The build type is resolved from the request's Referer header through
    ``self._getBuildType``; when that returns a falsy value nothing is
    yielded.  Each entry of the "Floorplans" menu becomes one item whose
    measurements are read from the floor-plan pane at the same position.

    Args:
        response: The design page; ``response.meta['BasePrice']`` is copied
            into every item.

    Yields:
        The result of ``RealtyLoader.load_item()`` for each floor plan.
    """
    referer = response.request.headers.get('Referer', None).decode("utf-8")
    BuildType = self._getBuildType(referer)
    if not BuildType:
        return
    hxs = HtmlXPathSelector(response)

    inclusionsXpath = '//div[@data-tab="inclusions"]/div/p/text()'
    imgXpath = '//div[@data-tab="gallery/images"]/div/img[{}]/@src'
    # Placeholders: 1-based floor-plan pane index, measurement label.
    descriptionXPath = (
        '//div[@class="tab-pane floorplans"][{}]/div[@class="clearfix"]/div '
        '/ul[@class="measurements-list"]/li/span[text()="{}"]'
        '/following-sibling::span/text()')

    # (item field, measurement label or list of alternative labels).  A
    # plain string is passed to the loader as one XPath, a list as a list.
    measurement_fields = (
        ('MasterBedroomDimension',
         ['Master Suite', 'Master Bedroom', 'Master Bed']),
        ('Bedroom2Dimension', 'Bedroom 2'),
        ('Bedroom3Dimension', 'Bedroom 3'),
        ('Bedroom4Dimension', 'Bedroom 4'),
        ('StudyDimension', ['Study', 'Study/TV Area']),
        ('Meals_DiningDimension', ['Dining/Living', 'Family/Meals']),
        ('FamilyDimension', 'Family/Meals'),
        ('TheatreDimension', ['Study/TV Area', 'TV Area', 'Home Theatre']),
        ('AlfrescoDimension', 'Alfresco'),
        ('GarageDimension', ['Garage', 'Double Garage']),
        ('KitchenDimension', ['Kitchen/Meals', 'Kitchen']),
        ('LoungeDimension', 'Lounge'),
        ('Squares', 'Total Size'),
        ('LivingArea', 'Living'),
        ('TheatreRoom_Yes_No', ['Study/TV Area', 'TV Area', 'Home Theatre']),
        ('Alfresco_Yes_No', ['Alfresco', 'Second Alfresco']),
        ('Study_Yes_No', ['Study', 'Study/TV Area']),
    )

    # (item field, regex applied to every inclusions paragraph).  Raw
    # strings keep ``\w``/``\s`` from being read as string escapes.
    inclusion_patterns = (
        ('SturturalWarranty', r'.*guarantee.*|.*[Ww]arranty.*'),
        ('Windows', r'.*[Ww]indows?.*'),
        ('KitchenBenchtop',
         r'.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'),
        # Was "...}..." - a literal brace where the alternation was meant.
        ('SecuritySystem',
         r'.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'),
        ('EnergyRating',
         r'.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'),
        ('KitchenAppliance',
         r'.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'),
        # NOTE(review): this pattern looks for "Security System", which
        # does not obviously identify an appliance brand - confirm.
        ('ApplianceBrand', r'.*[\w\s]+[Ss]ecurity System.*'),
        ('Splashback', r'.*[Ss]plashback.*'),
        ('FloorCovering',
         r'.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'),
        ('Cooling', r'.*[Cc]ooling.*'),
        # Bath used to carry the security-system pattern and CeilingHeight
        # the bath pattern (shifted by one entry).
        ('Bath', r'.*[Bb]ath.*'),
        ('CeilingHeight',
         r'.*[Cc]eiling.*[Hh]eight.*|.*[Hh]eight.*[Cc]eiling.*'),
        ('EnsuiteWallTiling', r'.*[Tt]ile.*'),
        ('EnsuiteBenchtop',
         r'.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'),
        ('EnsuiteShowerbase', r'.*[Ss]howerbase.*'),
        ('WallPaint', r'.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'),
        ('WIRFitouts', r'.*walk in robe.*|.*WIR.*'),
        ('Downlights', r'.*[Dd]ownlights.*'),
        ('Landscaping', r'.*[Ll]andscaping.*'),
        ('Driveway', r'.*[Dd]riveway.*'),
        ('Promotion', r'.*[Pp]romotion.*'),
    )

    designs = hxs.xpath(
        '//a[text()="Floorplans"]/following-sibling::ul/li/a/text()'
    ).extract()

    for i, design in enumerate(designs):
        pane = i + 1  # XPath positions are 1-based

        # Every label in self.oth that has a size on this plan is folded
        # into one "name:size" list for OtherInclusions.
        other = []
        for name in self.oth:
            size = hxs.xpath(descriptionXPath.format(pane, name)).extract_first()
            if size:
                other.append('{}:{}'.format(name, size))

        l = RealtyLoader(RealtyspidersItem(), hxs)
        l.add_value('url', response.url)
        l.add_value('BuildType', BuildType)
        l.add_xpath('HomeDesignMainImage', '//div[@data-tab="overview"]/img/@src')
        l.add_value('BuilderLogo', self.logo)
        l.add_value('DesignName', design)
        l.add_xpath('Region', descriptionXPath.format(pane, 'Region'))
        # NOTE(review): Bedrooms is kept disabled, as found; confirm.
        # l.add_xpath('Bedrooms', '//span[@class="bedroom"]/ancestor::li/text()')
        l.add_xpath('Bathrooms', '//span[@class="bathroom"]/ancestor::li/text()')
        l.add_xpath('Garage', '//span[@class="garage"]/ancestor::li/text()')
        l.add_xpath('BrochureImage_pdf',
                    '//a[@class="gt-after download-price-list"]/@href')
        l.add_xpath('InclusionsImage_pdf',
                    '//div[@data-tab="inclusions"]/div/a/@href')
        # NOTE(review): "@dref" is kept as found; confirm it is not a typo
        # for "@href".
        l.add_xpath(
            'OtherInclusions1',
            '//div[@class="tab-pane floorplans"][{}]/div/div/a/@dref'.format(pane))
        l.add_value('BasePrice', response.meta['BasePrice'])
        l.add_xpath(
            'FloorPlanImage1',
            '//div[@class="tab-pane floorplans"][{}]/div/div/img/@src'.format(pane))

        for position in range(1, 16):
            l.add_xpath('Image{}'.format(position), imgXpath.format(position))

        for field, labels in measurement_fields:
            if isinstance(labels, str):
                query = descriptionXPath.format(pane, labels)
            else:
                query = [descriptionXPath.format(pane, label) for label in labels]
            l.add_xpath(field, query)

        l.add_value('OtherInclusions', ', '.join(other))

        for field, pattern in inclusion_patterns:
            l.add_xpath(field, inclusionsXpath, re=pattern)

        yield l.load_item()
def parseItem(self, response):
    """Build a single item from a listing detail page.

    The build type and state are resolved from ``response.url`` through
    ``self._getBuildType`` and ``self._getState``.  ``response.meta['data']``
    is stored as ``Storey`` when it is an ``int`` and as ``Region`` when it
    is any other truthy value.

    Args:
        response: The detail page, with ``meta['data']`` populated.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.
    """
    # The Referer header is deliberately not read here: this callback has no
    # use for it, and decoding a missing header raises AttributeError.
    hxs = HtmlXPathSelector(response)

    inclusionsXpath = '//div[@class="clearfix inclusions-block-inner"]/ul/li/text()'
    imgXpath = '//input[@class="mfp-images"][{}]/@value'
    descriptionXPath = '//div[@class="admin-content"]/p/text()'
    # Most text fields are searched in the description and the inclusions.
    text_sources = [descriptionXPath, inclusionsXpath]

    l = RealtyLoader(RealtyspidersItem(), hxs)
    l.add_value('url', response.url)
    l.add_value('BuildType', self._getBuildType(response.url))
    l.add_xpath('HomeDesignMainImage', '//div[@class="imagefill h550"]//img/@src')
    l.add_value('BuilderLogo', self.logo)
    l.add_xpath('DesignName', '/html/body/div[2]/div/section[2]/h1/text()')

    data = response.meta['data']
    if isinstance(data, int):
        # The storey count is a literal value, not an XPath expression, so
        # it is added with add_value (it used to go through add_xpath).
        l.add_value('Storey', str(data))
    elif data:
        l.add_value('Region', data)
    # NOTE(review): State is assumed to apply to every item, not only to
    # the Region branch above - confirm.
    l.add_value('State', self._getState(response.url))

    l.add_xpath('Squares', '//span[@class="img-caption"]//i/text()',
                re='(?<=Home [Ss]ize - ).+')
    facility = '//span[@class="facility-list clearfix"]/em[{}]//strong/text()'
    l.add_xpath('Bedrooms', facility.format(1))
    l.add_xpath('Bathrooms', facility.format(2))
    l.add_xpath('Garage', facility.format(3))
    l.add_xpath('LandSize', '//span[@class="img-caption"]//i/text()',
                re='(?<=Land size - ).+')
    l.add_xpath('BasePrice', '//span[@class="product-price"]/text()')
    l.add_xpath('Lot_BlockAddress', '//div[@class="admin-content"]/h3/text()')
    l.add_xpath('BrochureImage_pdf', '//a[text()="Download Flyer"]/@href')
    l.add_xpath('InclusionsImage_pdf', '//a[text()="Download Inclusions"]/@href')
    l.add_xpath('OtherInclusions',
                '//a[text()="Download Floorplan & Options"]/@href')
    l.add_xpath('FloorPlanImage1', '//a[@class="image-lightbox"]/img/@src')

    for position in range(1, 16):
        l.add_xpath('Image{}'.format(position), imgXpath.format(position))

    l.add_xpath('HouseWidth', '//em[@class="width-length"]/i/text()',
                re='(?<=Lot [Ww]idth - ).+')
    l.add_xpath('HouseLength', '//em[@class="width-length"]/i/text()',
                re='(?<=Lot [Ll]ength - ).+')

    # Yes/No block: the raw text is handed over without a regex.
    for field in ('TheatreRoom_Yes_No', 'SeparateMeals_Yes_No',
                  'Alfresco_Yes_No', 'Study_Yes_No', 'WalkinPantry_Yes_No',
                  'BultersPantry_Yes_No', 'SteelStructure_Yes_No',
                  'Balcony_Yes_No'):
        l.add_xpath(field, text_sources)

    # (item field, XPath source(s), regex).  Raw strings keep ``\w``/``\s``
    # from being read as string escapes.
    text_patterns = (
        # Warranty
        ('SturturalWarranty', text_sources, r'.*guarantee.*|.*[Ww]arranty.*'),
        # Windows
        ('Windows', text_sources, r'.*[Ww]indows?.*'),
        # Kitchen benchtop
        ('KitchenBenchtop', text_sources,
         r'.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'),
        # Alarm / security system.  Was "...}..." - a literal brace where
        # the alternation was meant.
        ('SecuritySystem', text_sources,
         r'.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'),
        # Energy-efficiency rating
        ('EnergyRating', text_sources,
         r'.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'),
        # Kitchen appliances
        ('KitchenAppliance', text_sources,
         r'.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'),
        # Appliance brand.  NOTE(review): this pattern looks for "Security
        # System", which does not obviously identify a brand - confirm.
        ('ApplianceBrand', text_sources, r'.*[\w\s]+[Ss]ecurity System.*'),
        # Tiling above the sink
        ('Splashback', text_sources, r'.*[Ss]plashback.*'),
        # Floor covering
        ('FloorCovering', text_sources,
         r'.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'),
        # Cooling
        ('Cooling', text_sources, r'.*[Cc]ooling.*'),
        # Bath.  Bath used to carry the security-system pattern and
        # CeilingHeight the bath pattern (shifted by one entry).
        ('Bath', text_sources, r'.*[Bb]ath.*'),
        # Ceiling height
        ('CeilingHeight', text_sources,
         r'.*[Cc]eiling.*[Hh]eight.*|.*[Hh]eight.*[Cc]eiling.*'),
        # Bathroom tiling (description only, as before)
        ('EnsuiteWallTiling', descriptionXPath, r'.*[Tt]ile.*'),
        # Bathroom benchtop
        ('EnsuiteBenchtop', text_sources,
         r'.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'),
        # Shower
        ('EnsuiteShowerbase', text_sources, r'.*[Ss]howerbase.*'),
        # Wall paint
        ('WallPaint', text_sources,
         r'.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'),
        # Wardrobe
        ('WIRFitouts', text_sources, r'.*walk in robe.*|.*WIR.*'),
        # Light fittings
        ('Downlights', text_sources, r'.*[Dd]ownlights.*'),
        # Landscaping
        ('Landscaping', text_sources, r'.*[Ll]andscaping.*'),
        # Driveway
        ('Driveway', text_sources, r'.*[Dd]riveway.*'),
        # Promotion
        ('Promotion', text_sources, r'.*[Pp]romotion.*'),
    )
    for field, sources, pattern in text_patterns:
        l.add_xpath(field, sources, re=pattern)

    return l.load_item()
def parseItem(self, response):
    """Build a single item from a property detail page.

    The build type is resolved from the request's Referer header through
    ``self.getBuildType``.  State and region are fixed to ``'VIC'`` and
    ``'MELBOURNE'``.

    Args:
        response: The detail page.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.
    """
    referer = response.request.headers.get('Referer', None).decode("utf-8")
    selector = HtmlXPathSelector(response)
    BuildType = self.getBuildType(referer)

    loader = RealtyLoader(RealtyspidersItem(), selector)
    loader.add_value('BuildType', BuildType)
    loader.add_value('State', 'VIC')
    loader.add_value('Region', 'MELBOURNE')
    loader.add_value('url', response.url)

    # NOTE(review): only DesignName is assumed to depend on the build type;
    # the extent of this branch is inferred - confirm.
    if BuildType == 'Home Designs':
        loader.add_xpath('DesignName', '//h2[@class="darkblue lowercase"]/text()')

    # Details table: the value is the cell following a <th> label.
    details_cell = '//*[@id="details"]/tr/th[text()="{}"]/following-sibling::td/text()'
    for field, heading in (('Squares', 'Floor Area'),
                           ('Bedrooms', 'Bedrooms'),
                           ('Bathrooms', 'Bathrooms'),
                           ('Garage', 'Garages'),
                           ('LandSize', 'Land Area')):
        loader.add_xpath(field, details_cell.format(heading))
    loader.add_xpath(
        'Lot_BlockWidth',
        '//*[@id="details"][2]/tbody/tr/th[text()="Frontage"]'
        '/following-sibling::td[2]/text()')

    # Gallery images share the same referer and keep only values matching
    # the ".*jpg" pattern.
    gallery_image = '//li[@id="img{}"]/a/img/@src'
    image_options = {'myRefer': 'http://www.frenkenhomes.com.au', 're': '.*jpg'}

    loader.add_xpath('HomeDesignMainImage', gallery_image.format(1),
                     **image_options)
    loader.add_xpath(
        'FloorPlanImage1',
        '//div[@class="property-details-buttons"]/a/span[text()="Floor Plan"]'
        '/ancestor::a/@href')
    for position in range(1, 16):
        loader.add_xpath('Image{}'.format(position),
                         gallery_image.format(position), **image_options)

    loader.add_value('BuilderLogo', self.logo)
    return loader.load_item()
def parseItem(self, response):
    """Build a single item from a house-and-land property page.

    The storey count is resolved from the request's Referer header through
    ``self.getStorey``; the build type comes from
    ``response.meta['BuildType']``.  ``Squares``, ``HouseWidth`` and
    ``HouseLength`` are copied from ``response.meta`` when present.

    Args:
        response: The property page, with ``meta`` populated as described.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.
    """
    referer = response.request.headers.get('Referer', None).decode("utf-8")
    Storey = self.getStorey(referer)
    hxs = HtmlXPathSelector(response)

    l = RealtyLoader(RealtyspidersItem(), hxs)
    l.add_value('BuildType', response.meta['BuildType'])
    l.add_value('url', response.url)
    l.add_value('BuilderLogo', self.logo)
    l.add_value('State', 'VIC')
    l.add_value('Region', 'MELBOURNE')
    l.add_value('Storey', Storey)

    # Optional values handed over by the listing callback.
    for field in ('Squares', 'HouseWidth', 'HouseLength'):
        if field in response.meta:
            l.add_value(field, response.meta[field])

    title = '//h2[@class="page-title "]/span/text()'
    l.add_xpath('DesignName', title,
                reSub=r'^Lot\s*\d+,\s*(St)?(Mt\.)?\s*[\w\s]+,')
    # Non-capturing groups: with capturing groups Scrapy's ``re`` handling
    # returns the groups instead of the whole matched address.
    l.add_xpath('Lot_BlockAddress', title,
                re=r'^Lot\s*\d+,\s*(?:St)?(?:Mt\.)?\s*[\w\s]+')

    # The "area " class really does carry a trailing space.
    for field, css_class in (('Bedrooms', 'bed'), ('Bathrooms', 'bath'),
                             ('Garage', 'car'), ('Squares', 'area ')):
        l.add_xpath(
            field,
            '//div[@class="single-property"]/div/span[@class="{}"]'
            '/strong/text()'.format(css_class))

    description = '//div[@id="description"]/p/text()'
    # (item field, regex applied to every description paragraph).  Raw
    # strings keep ``\w``/``\s`` from being read as string escapes.
    description_patterns = (
        ('SturturalWarranty', r'"?.*[\w\s]+guarantee.*"?'),
        ('TheatreRoom_Yes_No', r'[Tt]heatre [Rr]ooms?'),
        ('SeparateMeals_Yes_No', r'[Ss]eparate [Mm]eals'),
        ('Alfresco_Yes_No', r'[Aa]lfresco'),
        # Was "[Ss}chool": the unclosed class also matched a literal "}".
        ('Study_Yes_No', r'([Ss]tudy)|([Ss]chool)|([Uu]niversity)'),
        ('WalkinPantry_Yes_No', r'([Ww]alkin|[Pp]antry)'),
        # Accepts the correct spelling "Butler" as well as the original
        # "Bulter"; the field used to be added twice with the same pattern.
        ('BultersPantry_Yes_No', r'[Bb]u(?:tl|lt)er[`]?s?'),
        ('SteelStructure_Yes_No', r'[Ss]teel [Ss]tructure'),
        ('Balcony_Yes_No', r'[Bb]alcony'),
        ('Windows', r'"?.*[\w\s]+[Ww]indows?.*"?'),
        ('KitchenBenchtop', r'"?.*[\w\s]+[Bb]enchtop.*"?'),
        ('SecuritySystem', r'"?.*[\w\s]+[Ss]ecurity System.*"?'),
    )
    for field, pattern in description_patterns:
        l.add_xpath(field, description, re=pattern)

    l.add_xpath('FloorPlanImage1', '//div[@id="floor-plan"]/a/img/@src')
    l.add_xpath('BrochureImage_pdf',
                '//a[text()="View property brochure PDF"]/@href')

    # The first gallery image doubles as the main image.
    gallery_image = '//div[@id="property-images"]/ul/li[{}]/img/@src'
    l.add_xpath('HomeDesignMainImage', gallery_image.format(1))
    for position in range(1, 16):
        l.add_xpath('Image{}'.format(position), gallery_image.format(position))

    return l.load_item()
def parseItem(self, response):
    """Build a single item from a portfolio detail page.

    The build type is resolved from the request's Referer header through
    ``self._getBuildType``.

    Args:
        response: The portfolio page.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.
    """
    referer = response.request.headers.get('Referer', None).decode("utf-8")
    hxs = HtmlXPathSelector(response)
    BuildType = self._getBuildType(referer)

    imgXpath = '//div[@class="portfolio-single__main-content"]/p/img[{}]/@src'
    descriptionXPath = '//div[@class="portfolio-single__main-content"]/p[2]/text()'

    l = RealtyLoader(RealtyspidersItem(), hxs)
    l.add_value('url', response.url)
    l.add_value('BuildType', BuildType)
    l.add_value('BuilderLogo', self.logo)

    # NOTE(review): only State is assumed to depend on the build type; the
    # extent of this branch is inferred - confirm.
    if BuildType == 'PRESTIGE HOMES':
        l.add_value('State', 'MELBOURNE')

    l.add_xpath(
        'BuilderEmailAddress',
        '//div[@class="entry-content span5"]/p/strong[text()="Email:"]'
        '/following-sibling::a/text()')
    l.add_xpath('DesignName', '//h1[@class="title-header"]/text()')
    l.add_xpath('FloorPlanImage1',
                '//div[@class="entry-content span5"]/p[1]/a/@href')
    l.add_xpath('HomeDesignMainImage',
                '//div[@class="portfolio-single__main-content"]/p[1]/img/@src')

    for position in range(1, 16):
        l.add_xpath('Image{}'.format(position), imgXpath.format(position))

    # NOTE(review): this feeds the description text into the main-image
    # field; kept as found - confirm the intended target field.
    l.add_xpath('HomeDesignMainImage', descriptionXPath)

    # Yes/No block: (item field, regex applied to the description text).
    yes_no_patterns = (
        ('TheatreRoom_Yes_No',
         r'([Tt]heatre.*[Rr]ooms?)|([Rr]ooms?.*[Tt]heatre)'),
        ('SeparateMeals_Yes_No',
         r'([Ss]eparate.*[Mm]eals)|([Mm]eals.*[Ss]eparate)'),
        ('Alfresco_Yes_No', r'[Aa]lfresco'),
        # Was "[Ss}chool": the unclosed class also matched a literal "}".
        ('Study_Yes_No', r'([Ss]tudy)|([Ss]chool)|([Uu]niversity)'),
        ('WalkinPantry_Yes_No', r'([Ww]alkin|[Pp]antry)'),
        # Accepts the correct spelling "Butler" as well as the original
        # "Bulter"; the field used to be added twice with the same pattern.
        ('BultersPantry_Yes_No', r'[Bb]u(?:tl|lt)er[`]?s?'),
        ('SteelStructure_Yes_No',
         r'([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)'),
        ('Balcony_Yes_No', r'[Bb]alcony'),
    )
    for field, pattern in yes_no_patterns:
        l.add_xpath(field, descriptionXPath, re=pattern)

    return l.load_item()
def parseItem(self, response):
    """Build one item from a home-design page with a dimensions table.

    The region comes from ``self.getParams(response.url)``; the build type
    and storey count are derived from the request's ``Referer`` header.
    The remaining fields are collected with XPath queries.

    :param response: response for the detail page.
    :return: the item produced by ``RealtyLoader.load_item()``.
    """
    # The Referer header is optional; calling .decode() on a missing header
    # raised AttributeError, so fall back to an empty string instead.
    raw_referer = response.request.headers.get('Referer')
    referer = raw_referer.decode('utf-8') if raw_referer else ''
    Region = self.getParams(response.url)

    hxs = HtmlXPathSelector(response)
    loader = RealtyLoader(RealtyspidersItem(), hxs)
    # The original passed str(<bytes header>), which on Python 3 is the
    # repr "b'...'" rather than the URL; pass the decoded text instead.
    loader.add_value('BuildType', self.getBuildType(referer))
    loader.add_value('url', response.url)
    loader.add_value('BuilderLogo', self.logo)
    loader.add_value('Region', Region)
    loader.add_value('Storey', self._getSrorey(referer))

    # Label/value pairs from the dimensions table.
    dimension_xpath = (
        '//div[@class="dimensions-wrapper clearfix border-top"]/div/'
        'p[text()="{}"]/following-sibling::p/text()')
    # Counters rendered next to an icon, selected by the icon's alt text.
    icon_xpath = ('//div[@class="icon-wrapper clearfix"]/div/'
                  'img[@alt="{}"]/following-sibling::span/text()')

    loader.add_xpath('State', dimension_xpath.format('Region'))
    loader.add_xpath(
        'DesignName',
        '//ul[@class="normalize-ul design-list"]/li/'
        'a[@style="background-color: #e36420;"]/text()')
    loader.add_xpath('BasePrice', dimension_xpath.format('Price'))
    loader.add_xpath('Squares', dimension_xpath.format('House Size'))
    loader.add_xpath('HouseWidth', dimension_xpath.format('House Width'))
    loader.add_xpath('HouseLength', dimension_xpath.format('House Length'))
    loader.add_xpath('Bedrooms', icon_xpath.format('bed-gray'))
    loader.add_xpath('Bathrooms', icon_xpath.format('bathtub-gray'))
    loader.add_xpath('Garage', icon_xpath.format('car-gray'))

    # Features are detected by keyword in the inclusion list items.
    features_xpath = '//div[@class="one-halve"]/ul/li'
    feature_patterns = (
        ('SturturalWarranty', r'[\w\s]+guarantee'),
        ('TheatreRoom_Yes_No', r'[Tt]heatre [Rr]ooms?'),
        ('SeparateMeals_Yes_No', r'[Ss]eparate [Mm]eals'),
        ('Alfresco_Yes_No', r'[Aa]lfresco'),
        # Was '[Ss}chool': the stray '}' merged the rest of the pattern into
        # one character class, so "school" could never match.
        ('Study_Yes_No', r'([Ss]tudy)|([Ss]chool)|([Uu]niversity)'),
        ('WalkinPantry_Yes_No', r'([Ww]alkin|[Pp]antry)'),
        # Accept the correct spelling "Butler" as well as the original
        # "Bulter"; the identical duplicate call was dropped.
        ('BultersPantry_Yes_No', r"[Bb]u(?:lt|tl)er[`']?s?"),
        ('SteelStructure_Yes_No', r'[Ss]teel [Ss]tructure'),
        ('Balcony_Yes_No', r'[Bb]alcony'),
        ('Windows', r'[\w\s]+[Ww]indows?'),
        ('KitchenBenchtop', r'[\w\s]+[Bb]enchtop'),
        ('SecuritySystem', r'[\w\s]+[Ss]ecurity System'),
    )
    for field, pattern in feature_patterns:
        loader.add_xpath(field, features_xpath, re=pattern)

    loader.add_xpath(
        'BuilderEmailAddress',
        '//div[@class="tablet desktop editable"]/table/tbody/tr/'
        'th[text()="Email"]/following-sibling::td/text()')
    loader.add_xpath('FloorPlanImage1',
                     '//div[@class="floor-plans-wrapper"]/div/a/img/@src',
                     myRefer=referer)
    loader.add_xpath('BrochureImage_pdf',
                     '//a[text()="Download Brochure"]/@href')
    loader.add_xpath('InclusionsImage_pdf',
                     '//a[text()="View Our Standard Inclusions List"]/@href',
                     myRefer='http://nostrahomes.com.au/')

    # Image1 is fed from the top image first and then from the first slide;
    # Image2 .. Image15 come from the matching slide position.
    loader.add_xpath('Image1', '//*[@id="top-image"]/img/@src',
                     myRefer=referer)
    slide_xpath = '//ul[@class="slides normalize-ul"]/li[{}]/img/@src'
    for index in range(1, 16):
        loader.add_xpath('Image{}'.format(index), slide_xpath.format(index),
                         myRefer=referer)

    return loader.load_item()
def parseList(self, response):
    """Yield one item per package container found on a listing page.

    Every ``div.itemContainer.itemContainerLast`` under ``#itemListPrimary``
    is loaded into its own ``RealtyspidersItem`` using XPath queries that
    are relative to that container.

    :param response: response for the listing page.
    :return: generator of items produced by ``RealtyLoader.load_item()``.
    """
    # The Referer header is optional; calling .decode() on a missing header
    # raised AttributeError, so fall back to an empty string instead.
    raw_referer = response.request.headers.get('Referer')
    referer = raw_referer.decode('utf-8') if raw_referer else ''

    hxs = HtmlXPathSelector(response)
    # .xpath() replaces the deprecated .select() alias.
    containers = hxs.xpath(
        '//div[@id="itemListPrimary"]'
        '/div[@class="itemContainer itemContainerLast"]')

    # Identical for every container on the page, so compute it once.
    build_type = self._getBuildType(response.url)
    title_xpath = './/div[@class="packages-cat-title"]/text()'
    middle_xpath = './/div[@class="packages-cat-middle"]/div[@class="{}"]/text()'

    for container in containers:
        loader = RealtyLoader(RealtyspidersItem(), container)
        loader.add_value('BuildType', build_type)
        loader.add_value('BuilderEmailAddress', '*****@*****.**')
        loader.add_value('BuilderLogo', self.logo)
        loader.add_value('url', response.url)

        # The title holds both the design name and its size in squares.
        loader.add_xpath('DesignName', title_xpath, re=r'.*-')
        loader.add_xpath('Squares', title_xpath, re=r'\d+\ssq')

        # The estate text is split on its comma into location and region.
        loader.add_xpath('Region', middle_xpath.format('estate'), re=r',.*')
        loader.add_xpath('DisplayLocation', middle_xpath.format('estate'),
                         re=r'.*,')
        loader.add_xpath('LandSize', middle_xpath.format('sq'))
        loader.add_xpath('Bedrooms', middle_xpath.format('bed'))
        loader.add_xpath('Bathrooms', middle_xpath.format('bath'))
        loader.add_xpath('Garage', middle_xpath.format('car'))

        loader.add_xpath('BrochureImage_pdf',
                         './/div[@class="brochure"]/a/@href',
                         myRefer=referer)
        loader.add_xpath('BasePrice', './/div[@class="price"]/strong/text()')
        loader.add_xpath('HomeDesignMainImage',
                         './/div[@class="packages-cat-left"]/img/@src',
                         myRefer=self.start_urls[0])
        yield loader.load_item()
def parseItem(self, response):
    """Build one item from a house-and-land package detail page.

    Room sizes come from the house-dimensions table, feature texts from the
    "Package Inclusions" list, and the build type from the request's
    ``Referer`` header via ``self._getBuildType``.

    :param response: response for the detail page.
    :return: the item produced by ``RealtyLoader.load_item()``.
    """
    # The Referer header is optional; calling .decode() on a missing header
    # raised AttributeError, so fall back to an empty string instead.
    raw_referer = response.request.headers.get('Referer')
    referer = raw_referer.decode('utf-8') if raw_referer else ''

    hxs = HtmlXPathSelector(response)
    inclusionsXpath = ('//h2[text()="Package Inclusions"]'
                       '/following-sibling::div//li/text()')
    imgXpath = '//div[@class="cycle-slideshow"]/img[{}]/@src'
    # Template: format() it with a row label of the dimensions table.
    descriptionXPath = (
        '//div[@class="col-sm-4 col-hd-house-dimensions hd-house-dimensions"]'
        '//tr/td[text()="{}"]/following-sibling::td/text()')

    def dimension(*labels):
        """Return the XPath (or list of XPaths) for the given row labels."""
        paths = [descriptionXPath.format(label) for label in labels]
        return paths[0] if len(paths) == 1 else paths

    idPage = hxs.xpath(
        '//a[text()="Download info pack"]/@data-home-id').extract_first()
    BuildType = self._getBuildType(referer)

    # "label:size" entries for the extra rooms listed in self.oth.
    other = []
    for name in self.oth:
        size = hxs.xpath(descriptionXPath.format(name)).extract_first()
        if size:
            other.append('{}:{}'.format(name, size))

    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', BuildType)
    loader.add_xpath('HomeDesignMainImage', imgXpath.format('1'))
    loader.add_value('BuilderLogo', self.logo)
    loader.add_xpath('DesignName', [
        '/html/body/div[3]/div/div[1]/div/div[1]/h1/text()',
        '/html/body/div[3]/div/div[1]/h1/text()',
    ])

    # str.find() returns -1 (truthy) when the text is absent and 0 (falsy)
    # when it is at the start, so the old `if BuildType.find('Double')`
    # picked the wrong branch; test for membership instead.
    is_double = bool(BuildType) and 'Double' in BuildType
    loader.add_value('Storey', '2' if is_double else '1')

    loader.add_xpath('Region', dimension('Region'))
    loader.add_xpath('Bathrooms',
                     '//span[@class="hh-icon-baths"]/ancestor::li/text()')
    loader.add_xpath('Garage',
                     '//span[@class="hh-icon-car"]/ancestor::li/text()')
    loader.add_xpath('LivingArea',
                     '//span[@class="hh-icon-living"]/ancestor::li/text()')
    loader.add_xpath('BasePrice', [
        '/html/body/div[3]/div/div[1]/div/div[1]/h2/text()',
        '/html/body/div[3]/div/div[1]/h2/text()',
    ])
    loader.add_xpath(
        'FloorPlanImage1',
        '//div[@class="js-fp-panzoom js-fp-panzoom-reset"]/img/@src')
    if idPage:
        loader.add_value('BrochureImage_pdf',
                         '{}{}'.format(self.pdfUrl, idPage))

    # Image1 .. Image15 take the n-th <img> of the slideshow.
    for index in range(1, 16):
        loader.add_xpath('Image{}'.format(index), imgXpath.format(index))

    # Room dimensions; several fields accept alternative row labels.
    loader.add_xpath('MasterBedroomDimension', dimension('Master Bedroom'))
    loader.add_xpath('Bedroom2Dimension', dimension('Bedroom 2'))
    loader.add_xpath('Bedroom3Dimension', dimension('Bedroom 3'))
    loader.add_xpath('Bedroom4Dimension',
                     dimension('Bedroom 4', 'Study/Bedroom 4'))
    loader.add_xpath('StudyDimension', dimension('Study/Bedroom 4', 'Study'))
    loader.add_xpath('Meals_DiningDimension',
                     dimension('Meals', 'Family/Meals'))
    # NOTE(review): uses the same labels as Meals_DiningDimension; possibly
    # 'Family' was intended for the first label - confirm against the site.
    loader.add_xpath('FamilyDimension', dimension('Meals', 'Family/Meals'))
    loader.add_xpath('TheatreDimension', dimension('Media Room', 'Media'))
    loader.add_xpath('AlfrescoDimension', dimension('Alfresco'))
    loader.add_xpath('HouseWidth', dimension('Min block width'))
    loader.add_xpath('GarageDimension', dimension('Garage'))
    loader.add_xpath('KitchenDimension', dimension('Kitchen'))
    loader.add_xpath('Squares', dimension('Floor Area sqm'))
    loader.add_xpath('LandSize', dimension('Land Size sqm'))

    # Yes/No flags: fed from the same table rows as the dimensions.
    loader.add_xpath('TheatreRoom_Yes_No', dimension('Media Room', 'Media'))
    loader.add_xpath('Alfresco_Yes_No',
                     dimension('Alfresco', 'Second Alfresco'))
    loader.add_xpath('Study_Yes_No', dimension('Study/Bedroom 4', 'Study'))
    loader.add_value('OtherInclusions', ', '.join(other))

    # Free-text features, matched by keyword in the package inclusions.
    inclusion_patterns = (
        # Warranty
        ('SturturalWarranty', r'.*guarantee.*|.*[Ww]arranty.*'),
        # Windows
        ('Windows', r'.*[Ww]indows?.*'),
        # Kitchen benchtop
        ('KitchenBenchtop',
         r'.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'),
        # Security system ('}' in the original pattern was a typo for '|')
        ('SecuritySystem',
         r'.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'),
        # Energy rating
        ('EnergyRating',
         r'.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'),
        # Kitchen appliances
        ('KitchenAppliance',
         r'.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'),
        # Appliance brand
        # NOTE(review): still matches "Security System" text, which looks
        # like a leftover placeholder; the intended pattern is unknown.
        ('ApplianceBrand', r'.*[\w\s]+[Ss]ecurity System.*'),
        # Tiling above the sink
        ('Splashback', r'.*[Ss]plashback.*'),
        # Floor covering
        ('FloorCovering',
         r'.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'),
        # Cooling
        ('Cooling', r'.*[Cc]ooling.*'),
        # Bath (previously matched the security-system pattern)
        ('Bath', r'.*[Bb]ath.*'),
        # Ceiling height (previously matched the bath pattern)
        ('CeilingHeight', r'.*[Cc]eiling.*'),
        # Bathroom tiling (previously queried the unformatted "{}" template
        # XPath, which can never match a table cell)
        ('EnsuiteWallTiling', r'.*[Tt]ile.*'),
        # Ensuite benchtop
        ('EnsuiteBenchtop',
         r'.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'),
        # Shower base
        ('EnsuiteShowerbase', r'.*[Ss]howerbase.*'),
        # Wall paint
        ('WallPaint', r'.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'),
        # Walk-in robe
        ('WIRFitouts', r'.*walk in robe.*|.*WIR.*'),
        # Light fittings
        ('Downlights', r'.*[Dd]ownlights.*'),
        # Landscaping
        ('Landscaping', r'.*[Ll]andscaping.*'),
        # Driveway
        ('Driveway', r'.*[Dd]riveway.*'),
        # Promotion
        ('Promotion', r'.*[Pp]romotion.*'),
    )
    for field, pattern in inclusion_patterns:
        loader.add_xpath(field, inclusionsXpath, re=pattern)

    return loader.load_item()
def parseItem(self, response):
    """Build one item from a listing detail page.

    Counts and areas are cut out of the ``#listing_options`` text with
    regular expressions; the Yes/No flags come from
    ``self.getFeatures(response.url, <code>)``.

    :param response: response for the detail page.
    :return: the item produced by ``RealtyLoader.load_item()``.
    """
    # The Referer header is optional; calling .decode() on a missing header
    # raised AttributeError, so fall back to an empty string instead.
    raw_referer = response.request.headers.get('Referer')
    referer = raw_referer.decode('utf-8') if raw_referer else ''

    hxs = HtmlXPathSelector(response)
    BuildType = self._getBuildType(referer)
    imgXpath = '//a[@class="proPhotoThumbLink"]/img[{}]/@src'
    descriptionXPath = '//div[@id="listing_options"]/ul/li/text()'
    options_text_xpath = '//div[@id="listing_options"]/text()'
    # Passed as ``myRefer`` for every URL-valued field.
    base_url = self.start_urls[0]

    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', BuildType)
    loader.add_value('BuilderLogo', self.logo)
    # NOTE(review): this call sits directly after a commented-out region in
    # the original, so it is unclear whether it was active; it is kept
    # active so the design name is not lost - confirm.
    loader.add_xpath('DesignName', '//div[@id="listing_options"]/h4/text()')

    loader.add_xpath('Bedrooms', options_text_xpath, re=r'\d(?=\sBeds)')
    loader.add_xpath('Bathrooms', options_text_xpath, re=r'\d(?=\sBaths)')
    loader.add_xpath('Lot_BlockWidth', options_text_xpath,
                     re=r'(?<=Ideal block width = )[\w\.\s]+')

    # Area figures are list items of the form "<label> Area - <value>".
    area_fields = (
        ('LivingArea', r'(?<=Living Area - )[\w\.\s]+'),
        ('Squares', r'(?<=Total Area - )[\w\.\s]+'),
        ('GarageDimension', r'(?<=Garage Area - )[\w\.\s]+'),
        ('AlfrescoDimension', r'(?<=Alfresco Area - )[\w\.\s]+'),
    )
    for field, pattern in area_fields:
        loader.add_xpath(field, descriptionXPath, re=pattern)

    loader.add_xpath(
        'FloorPlanImage1',
        '//div[@id="listing_text"]/h4/a[text()="Download floor plan"]/@href',
        myRefer=base_url)
    loader.add_xpath(
        'BrochureImage_pdf',
        '//div[@id="listing_text"]/h4/a[text()="View the Specification"]/@href',
        myRefer=base_url)
    loader.add_xpath('HomeDesignMainImage',
                     '//div[@class="mainImageTarget"]/img/@src',
                     myRefer=base_url)

    # Image1 .. Image15 take the n-th <img> inside the thumbnail links.
    for index in range(1, 16):
        loader.add_xpath('Image{}'.format(index), imgXpath.format(index),
                         myRefer=base_url)

    # Yes/No flags, looked up by the codes the original passed to
    # getFeatures.
    loader.add_value('TheatreRoom_Yes_No',
                     self.getFeatures(response.url, '35'))
    loader.add_value('Alfresco_Yes_No', self.getFeatures(response.url, '25'))
    loader.add_value('Study_Yes_No', self.getFeatures(response.url, '36'))

    return loader.load_item()
def parseItem(self, response):
    """Build one item from a home-design page with a property-info panel.

    Room counts and sizes are read from the labelled ``<span>`` entries of
    the property-info panel and normalised with ``self._stripJoin``.
    Feature texts are matched by keyword in the list items of
    ``div#0``.

    :param response: response for the detail page.
    :return: the item produced by ``RealtyLoader.load_item()``.
    """
    # The original also decoded the Referer header into a local that was
    # never used; a request without that header crashed the callback, so
    # the dead lookup was removed.
    hxs = HtmlXPathSelector(response)
    roomsXpath = ('//div[@class="property-info-agent clear"]/span/'
                  'strong[text()="{}"]/ancestor::span/text()')
    descriptionXPath = '//div[@id="0"]//li/text()'

    def panel_value(label):
        """Return all text nodes of the panel entry with this label."""
        return hxs.xpath(roomsXpath.format(label)).extract()

    Bedrooms = panel_value('Bedrooms:')
    Bathrooms = panel_value('Bathrooms:')
    Garage = panel_value('Car Spaces:')
    HouseWidth = panel_value('Overall Width:')
    GarageDimension = panel_value('Garage:')
    # The Alfresco entry feeds both the dimension and the Yes/No field.
    Alfresco = panel_value('Alfresco:')
    Squares = panel_value('Total:')
    # A "First Floor Living" entry is taken to mean a two-storey design.
    Storey = panel_value('First Floor Living:')

    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', 'HOME DESIGNS')
    loader.add_value('BuilderLogo', self.logo)
    loader.add_xpath('DesignName', [
        '//section[@class="page-title-block header-bg"]/div/h2/text()',
        '//section[@class="page-title-block-default header-bg"]/div/h2/text()',
    ])
    loader.add_value('Bedrooms', self._stripJoin(Bedrooms))
    loader.add_value('Bathrooms', self._stripJoin(Bathrooms))
    loader.add_value('Garage', self._stripJoin(Garage))
    loader.add_value('Storey', '2' if Storey else '1')
    loader.add_value('HouseWidth', self._stripJoin(HouseWidth))
    loader.add_value('GarageDimension', self._stripJoin(GarageDimension))
    loader.add_value('AlfrescoDimension', self._stripJoin(Alfresco))
    loader.add_value('Alfresco_Yes_No', self._stripJoin(Alfresco))
    loader.add_value('Squares', self._stripJoin(Squares))

    loader.add_xpath('BrochureImage_pdf', '//div[@id="0"]//a/@href')
    loader.add_xpath('FloorPlanImage1', '//div[@id="1"]/img/@src')
    loader.add_xpath('HomeDesignMainImage', '//ul[@class="slides"]//a/@href')

    # Yes/No feature flags.
    loader.add_xpath('WalkinPantry_Yes_No', descriptionXPath,
                     re=r'([Ww]alkin|[Pp]antry)')
    # Accept the correct spelling "Butler" as well as the original "Bulter".
    loader.add_xpath('BultersPantry_Yes_No', descriptionXPath,
                     re=r"[Bb]u(?:lt|tl)er[`']?s?")
    loader.add_xpath('SteelStructure_Yes_No', descriptionXPath,
                     re=r'([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)')
    # NOTE(review): every other panel label ends with ':'; 'Balcony' does
    # not, so this may never match - confirm against the site markup.
    loader.add_xpath('Balcony_Yes_No', roomsXpath.format('Balcony'))

    # Free-text features, matched by keyword in the description list.
    inclusion_patterns = (
        # Warranty
        ('SturturalWarranty', r'.*guarantee.*|.*[Ww]arranty.*'),
        # Windows
        ('Windows', r'.*[Ww]indows?.*'),
        # Kitchen benchtop
        ('KitchenBenchtop',
         r'.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'),
        # Security system ('}' in the original pattern was a typo for '|')
        ('SecuritySystem',
         r'.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'),
        # Energy rating
        ('EnergyRating',
         r'.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'),
        # Kitchen appliances
        ('KitchenAppliance',
         r'.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'),
        # Appliance brand
        # NOTE(review): still matches "Security System" text, which looks
        # like a leftover placeholder; the intended pattern is unknown.
        ('ApplianceBrand', r'.*[\w\s]+[Ss]ecurity System.*'),
        # Tiling above the sink
        ('Splashback', r'.*[Ss]plashback.*'),
        # Floor covering
        ('FloorCovering',
         r'.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'),
        # Cooling
        ('Cooling', r'.*[Cc]ooling.*'),
        # Bath (previously matched the security-system pattern)
        ('Bath', r'.*[Bb]ath.*'),
        # Ceiling height (previously matched the bath pattern)
        ('CeilingHeight', r'.*[Cc]eiling.*'),
        # Bathroom tiling
        ('EnsuiteWallTiling', r'.*[Tt]ile.*'),
        # Ensuite benchtop
        ('EnsuiteBenchtop',
         r'.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'),
        # Shower base
        ('EnsuiteShowerbase', r'.*[Ss]howerbase.*'),
        # Wall paint
        ('WallPaint', r'.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'),
        # Walk-in robe
        ('WIRFitouts', r'.*walk in robe.*|.*WIR.*'),
        # Light fittings
        ('Downlights', r'.*[Dd]ownlights.*'),
        # Landscaping
        ('Landscaping', r'.*[Ll]andscaping.*'),
        # Driveway
        ('Driveway', r'.*[Dd]riveway.*'),
        # Promotion
        ('Promotion', r'.*[Pp]romotion.*'),
    )
    for field, pattern in inclusion_patterns:
        loader.add_xpath(field, descriptionXPath, re=pattern)

    return loader.load_item()
def parseOurhomes(self, response):
    """Build one item from an "our homes" design page.

    Room sizes come from the specs table of ``#floorplan-1``, gallery images
    from the gallery links, and feature texts from the description
    paragraphs. Sizes for the labels in ``self.oth`` are gathered into
    ``OtherInclusions``.

    :param response: response for the design page.
    :return: the item produced by ``RealtyLoader.load_item()``.
    """
    # The original also decoded the Referer header into a local that was
    # never used; a request without that header crashed the callback, so
    # the dead lookup was removed.
    areaXpath = ('//*[@id="floorplan-1"]/div[@class="specs-table"]/div/'
                 'div[text()="{}"]/following-sibling::'
                 'div[@class="size"]/text()')
    imgXpath = ('//div[@class="home--single__gallery-images hidden-sm hidden-xs"]'
                '/a[{}]/@href')
    descrXpath = '//*[@id="sb-site"]/div[2]/div[3]/div/div/div[1]/p/text()'
    hxs = HtmlXPathSelector(response)

    def area(*labels):
        """Return the XPath (or list of XPaths) for the given area labels."""
        paths = [areaXpath.format(label) for label in labels]
        return paths[0] if len(paths) == 1 else paths

    # "label:size" entries for the extra areas listed in self.oth.
    other = []
    for name in self.oth:
        size = hxs.xpath(areaXpath.format(name)).extract_first()
        if size:
            other.append('{}:{}'.format(name, size))

    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', self._getBuildType(response.url))
    loader.add_value('BuilderLogo', self.logo)
    loader.add_xpath('DesignName',
                     '//*[@id="sb-site"]/div[2]/div[1]/div/div/h1/text()')
    loader.add_xpath(
        'BrochureImage_pdf',
        '//*[@id="sb-site"]/div[2]/div[3]/div/div/div[2]/div[2]/a/@href')
    loader.add_xpath(
        'InclusionsImage_pdf',
        '//*[@id="sb-site"]/div[2]/div[3]/div/div/div[2]/div[3]/a/@href')
    loader.add_xpath('Squares',
                     '//*[@id="floorplan-1"]/div[@class="squares"]/text()')
    loader.add_xpath('Bedrooms',
                     '//*[@id="floorplan-1"]/div[@class="bedrooms"]/text()')
    loader.add_xpath('Bathrooms',
                     '//*[@id="floorplan-1"]/div[@class="bathrooms"]/text()')
    loader.add_xpath('Garage',
                     '//*[@id="floorplan-1"]/div[@class="cars"]/text()')

    # Areas from the specs table; some fields accept alternative labels.
    # A present area also feeds the matching Yes/No field.
    loader.add_xpath('FamilyDimension',
                     area('Family', 'Family/Lounge', 'family'))
    loader.add_xpath('Meals_DiningDimension', area('Meals', 'Family/Meals'))
    loader.add_xpath('LoungeDimension', area('Lounge'))
    loader.add_xpath('AlfrescoDimension', area('Alfresco'))
    loader.add_xpath('Alfresco_Yes_No', area('Alfresco'))
    loader.add_xpath('TheatreRoom_Yes_No', area('Theatre'))
    loader.add_xpath('TheatreDimension', area('Theatre'))
    loader.add_xpath('GarageDimension', area('Garage'))
    loader.add_xpath('MasterBedroomDimension',
                     area('Master Bedroom', 'Bedroom 1', 'Bed 1', 'Master'))
    loader.add_xpath('Bedroom2Dimension',
                     area('Bedroom 2', 'Bed 2', 'Bedroom 2/Lounge'))
    loader.add_xpath('Bedroom3Dimension', area('Bedroom 3', 'Bed 3'))
    loader.add_xpath('Bedroom4Dimension', area('Bedroom 4', 'Bed 4'))
    loader.add_xpath('KitchenDimension', area('Kitchen'))
    loader.add_xpath('Study_Yes_No', area('Study'))
    loader.add_xpath('StudyDimension', area('Study'))

    loader.add_xpath('FloorPlanImage1', '//*[@id="floorplan-1"]/@src')
    loader.add_xpath('HomeDesignMainImage',
                     '//*[@class="home--single__full-image"]/a/@href')
    # Image1 .. Image15 take the n-th gallery link.
    for index in range(1, 16):
        loader.add_xpath('Image{}'.format(index), imgXpath.format(index))

    # Everything below is matched by keyword in the description paragraphs.
    description_patterns = (
        ('BuilderEmailAddress', r'[a-zA-Z]+@[a-z]+\.com\.au'),
        # Yes/No feature flags
        ('TheatreRoom_Yes_No', r'[tT]heatre'),
        ('SeparateMeals_Yes_No', r'[Ss]eparate|[Mm]eals'),
        ('WalkinPantry_Yes_No', r'([Ww]alkin|[Pp]antry)'),
        # Accept the correct spelling "Butler" as well as "Bulter".
        ('BultersPantry_Yes_No', r"[Bb]u(?:lt|tl)er[`']?s?"),
        ('SteelStructure_Yes_No',
         r'([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)'),
        ('Balcony_Yes_No', r'[Bb]alcony'),
        # Warranty
        ('SturturalWarranty', r'.*guarantee.*|.*[Ww]arranty.*'),
        # Windows
        ('Windows', r'.*[Ww]indows?.*'),
        # Kitchen benchtop
        ('KitchenBenchtop',
         r'.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'),
        # Security system ('}' in the original pattern was a typo for '|')
        ('SecuritySystem',
         r'.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'),
        # Energy rating
        ('EnergyRating',
         r'.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'),
        # Kitchen appliances
        ('KitchenAppliance',
         r'.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'),
        # Appliance brand
        # NOTE(review): still matches "Security System" text, which looks
        # like a leftover placeholder; the intended pattern is unknown.
        ('ApplianceBrand', r'.*[\w\s]+[Ss]ecurity System.*'),
        # Tiling above the sink
        ('Splashback', r'.*[Ss]plashback.*'),
        # Floor covering
        ('FloorCovering',
         r'.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'),
        # Cooling
        ('Cooling', r'.*[Cc]ooling.*'),
        # Bath (previously matched the security-system pattern)
        ('Bath', r'.*[Bb]ath.*'),
        # Ceiling height (previously matched the bath pattern)
        ('CeilingHeight', r'.*[Cc]eiling.*'),
        # Bathroom tiling
        ('EnsuiteWallTiling', r'.*[Tt]ile.*'),
        # Ensuite benchtop
        ('EnsuiteBenchtop',
         r'.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'),
        # Shower base
        ('EnsuiteShowerbase', r'.*[Ss]howerbase.*'),
        # Wall paint
        ('WallPaint', r'.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'),
        # Walk-in robe
        ('WIRFitouts', r'.*walk in robe.*|.*WIR.*'),
        # Light fittings
        ('Downlights', r'.*[Dd]ownlights.*'),
        # Landscaping
        ('Landscaping', r'.*[Ll]andscaping.*'),
        # Driveway
        ('Driveway', r'.*[Dd]riveway.*'),
        # Promotion
        ('Promotion', r'.*[Pp]romotion.*'),
    )
    for field, pattern in description_patterns:
        loader.add_xpath(field, descrXpath, re=pattern)

    loader.add_value('OtherInclusions', ', '.join(other))
    return loader.load_item()
def parseHL(self, response):
    """Build one item from a house-and-land package detail page.

    Collects the headline fields (design name, location, price, room
    counts), the main image, up to fifteen gallery images, and a set of
    yes/no feature flags found by regex in the description text.

    Args:
        response: the downloaded detail page.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.
    """
    hxs = HtmlXPathSelector(response)
    descrXpath = '//div[@class="houseland__description"]/pre/text()'
    # The placeholder is filled with 1..15 below.  The original template
    # had a literal "1" here, so every ImageN field pointed at the same
    # element.  Presumably gallery ids are numbered consecutively --
    # TODO confirm against the live markup.
    imgXpath = '//div[@id="houseland_gallery-image-{}"]/a/@href'
    headingXpath = '//*[@id="sb-site"]/div[2]/div/div/div[2]/div/h1'

    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', self._getBuildType(response.url))
    loader.add_value('BuilderLogo', self.logo)
    loader.add_xpath('DesignName', headingXpath + '/span/text()')
    loader.add_xpath('DisplayLocation', headingXpath + '/text()')
    loader.add_xpath('BasePrice', '//div[@class="houseland__price"]/text()')
    loader.add_xpath('Bedrooms', '//div[@class="houseland__bedrooms"]/text()')
    loader.add_xpath('Bathrooms',
                     '//div[@class="houseland__bathrooms"]/text()')
    loader.add_xpath('Garage', '//div[@class="houseland__cars"]/text()')
    loader.add_xpath('HomeDesignMainImage',
                     '//div[@class="houseland__main-image"]/a/@href')

    for number in range(1, 16):
        loader.add_xpath('Image{}'.format(number), imgXpath.format(number))

    # Feature flags: each field receives whatever its regex matches in the
    # description text.
    featureFlags = (
        ('TheatreRoom_Yes_No', '[tT]heatre'),
        ('SeparateMeals_Yes_No', '[Ss]eparate|[Mm]eals'),
        ('WalkinPantry_Yes_No', '([Ww]alkin|[Pp]antry)'),
        # Accept the correct spelling "Butler" as well as the historical
        # "Bulter"; the group is non-capturing so the full match is kept.
        ('BultersPantry_Yes_No', '[Bb]u(?:tl|lt)er[`]?s?'),
        ('SteelStructure_Yes_No',
         '([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)'),
        ('Balcony_Yes_No', '[Bb]alcony'),
    )
    for field, pattern in featureFlags:
        loader.add_xpath(field, descrXpath, re=pattern)

    return loader.load_item()
def parseList(self, response):
    """Yield one item per property card on a house-and-land listing page.

    Every ``div.property-item`` element becomes its own item; all field
    XPaths are evaluated relative to that card.

    Args:
        response: the downloaded listing page.

    Yields:
        The item produced by ``RealtyLoader.load_item()`` for each card.
    """
    page = HtmlXPathSelector(response)
    for card in page.xpath('//div[@class="property-item"]'):
        loader = RealtyLoader(RealtyspidersItem(), card)
        loader.add_value('url', response.url)
        loader.add_value('BuildType', 'Browse our H&L packages')
        loader.add_value('BuilderLogo', self.logo)
        loader.add_xpath('Lot_BlockAddress', './/span[@class="street"]/text()')
        loader.add_xpath('Squares', './/span[@class="area"]/text()')
        # These must be relative (".//"): an XPath starting with "//" is
        # evaluated against the whole document even when called on a
        # sub-selector, which made every item collect the values of all
        # cards on the page.
        loader.add_xpath('Bedrooms', './/li[@class="beds"]/text()')
        loader.add_xpath('Bathrooms', './/li[@class="baths"]/text()')
        loader.add_xpath('Garage', './/li[@class="garages"]/text()')
        # NOTE(review): LivingArea is filled from the "storeys" element --
        # confirm this mapping is intended.
        loader.add_xpath('LivingArea', './/li[@class="storeys"]/text()')
        loader.add_xpath(
            'BasePrice',
            './/div[@class="field-prefix" and text()="$"]'
            '/following-sibling::div[@class="field-value"]/text()')
        loader.add_xpath('HomeDesignMainImage', './/img/@src')
        yield loader.load_item()
def parseItem(self, response):
    """Parse either a category page or a single home-design page.

    * When ``self._chakURL(response.url)`` is falsy the page is treated as
      a category listing: the detail-URL -> thumbnail mapping is stored in
      ``self.itemsList`` and nothing is returned.
    * URLs matching ``\\d+-special-offers`` are ignored.
    * Otherwise the page is a design detail page and an item is built.

    Args:
        response: the downloaded page.

    Returns:
        The loaded item for a design page, otherwise ``None``.
    """
    if not self._chakURL(response.url):
        # Category page: remember which thumbnail belongs to which design
        # so the detail parse can use it as the main image.
        listing = HtmlXPathSelector(response)
        hrefs = listing.xpath(
            '//div[@class="homes-cat-left"]/a/@href').extract()
        thumbs = listing.xpath(
            '//div[@class="homes-cat-left"]/a/img/@src').extract()
        detailUrls = [self.start_urls[0] + href for href in hrefs]
        self.itemsList = dict(zip(detailUrls, thumbs))
        return None

    if re.search(r'\d+-special-offers', response.url):
        return None

    base = self.start_urls[0]
    hxs = HtmlXPathSelector(response)
    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', self._getBuildType(response.url))
    loader.add_value('BuilderEmailAddress', '*****@*****.**')
    loader.add_value('BuilderLogo', self.logo)

    # The thumbnail is only known if the category page was seen first.
    try:
        thumbnail = self.itemsList[response.url]
    except KeyError:
        pass
    else:
        loader.add_value('HomeDesignMainImage', base + thumbnail)

    loader.add_xpath('DesignName',
                     '//div[@class="content-columns"]/h2[1]/text()')
    detailXpath = '//div[@id="house-details"]/div[@class="{}"]/text()'
    loader.add_xpath('Squares', detailXpath.format('sq'))
    loader.add_xpath('Bedrooms', detailXpath.format('bed'))
    loader.add_xpath('Bathrooms', detailXpath.format('bath'))
    loader.add_xpath('Garage', detailXpath.format('car'))
    loader.add_xpath(
        'BrochureImage_pdf',
        '//div[@class="house-attachment"]/a[text()="Download Brochure"]/@href',
        myRefer=base)

    # Thumbnail 1 is the floor plan; thumbnails 2..16 are Image1..Image15.
    # (Image12 previously reused thumbnail 12, duplicating Image11.)
    thumbXpath = '//li[@class="sigProThumb"][{}]/span/span/a/@href'
    loader.add_xpath('FloorPlanImage1', thumbXpath.format(1), myRefer=base)
    for number in range(1, 16):
        loader.add_xpath('Image{}'.format(number),
                         thumbXpath.format(number + 1), myRefer=base)

    descriptionXPath = '//div[@id="content-body"]/div/ul/li/span/text()'

    # Yes/No block: each field receives whatever its regex matches.
    yesNoPatterns = (
        ('TheatreRoom_Yes_No',
         '([Tt]heatre.*[Rr]ooms?)|([Rr]ooms?.*[Tt]heatre)'),
        ('SeparateMeals_Yes_No',
         '([Ss]eparate.*[Mm]eals)|([Mm]eals.*[Ss]eparate)'),
        ('Alfresco_Yes_No', '[Aa]lfresco'),
        # Fixed the unbalanced "[Ss}chool" character class.
        ('Study_Yes_No', '([Ss]tudy)|([Ss]chool)|([Uu]niversity)'),
        ('WalkinPantry_Yes_No', '([Ww]alkin|[Pp]antry)'),
        # Accept "Butler" as well as the historical "Bulter" spelling;
        # this field used to be added twice.
        ('BultersPantry_Yes_No', '[Bb]u(?:tl|lt)er[`]?s?'),
        ('SteelStructure_Yes_No',
         '([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)'),
        ('Balcony_Yes_No', '[Bb]alcony'),
    )

    # Inclusions: the whole description line mentioning the feature.
    # KitchenAppliance / ApplianceBrand extraction was commented out in
    # this parser and stays disabled.
    inclusionPatterns = (
        # Warranty
        ('SturturalWarranty', '.*guarantee.*|.*[Ww]arranty.*'),
        # Windows
        ('Windows', '.*[Ww]indows?.*'),
        # Kitchen benchtop
        ('KitchenBenchtop',
         '.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'),
        # Security system ("}" in the original was a typo for "|")
        ('SecuritySystem',
         '.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'),
        # Energy rating
        ('EnergyRating',
         '.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'),
        # Splashback (tiling above the sink)
        ('Splashback', '.*[Ss]plashback.*'),
        # Floor covering
        ('FloorCovering',
         '.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'),
        # Cooling
        ('Cooling', '.*[Cc]ooling.*'),
        # Bath (previously searched for "security system" by mistake)
        ('Bath', '.*[Bb]ath.*'),
        # Ceiling height (previously searched for "bath" by mistake)
        ('CeilingHeight', '.*[Cc]eiling.*'),
        # Bathroom tiling
        ('EnsuiteWallTiling', '.*[Tt]ile.*'),
        # Ensuite benchtop
        ('EnsuiteBenchtop',
         '.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'),
        # Shower base
        ('EnsuiteShowerbase', '.*[Ss]howerbase.*'),
        # Wall paint
        ('WallPaint', '.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'),
        # Walk-in robe
        ('WIRFitouts', '.*walk in robe.*|.*WIR.*'),
        # Downlights
        ('Downlights', '.*[Dd]ownlights.*'),
        # Landscaping
        ('Landscaping', '.*[Ll]andscaping.*'),
        # Driveway
        ('Driveway', '.*[Dd]riveway.*'),
        # Promotion
        ('Promotion', '.*[Pp]romotion.*'),
    )

    for field, pattern in yesNoPatterns + inclusionPatterns:
        loader.add_xpath(field, descriptionXPath, re=pattern)

    return loader.load_item()
def parseItem(self, response):
    """Build one item from a home-design detail page.

    Reads headline fields, brochure/floor-plan links, the gallery images,
    per-room dimensions from the room-dimensions table, and feature /
    inclusion text matched by regex in the description paragraphs.

    Args:
        response: the downloaded detail page.  ``response.meta['Storey']``
            must be set by the request that scheduled this callback.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.

    Raises:
        AttributeError: if the request carries no ``Referer`` header (the
            build type is derived from it).
        KeyError: if ``response.meta`` has no ``'Storey'`` entry.
    """
    referer = response.request.headers.get('Referer', None).decode("utf-8")
    hxs = HtmlXPathSelector(response)

    # The room name is substituted into the placeholder.  The original
    # template hard-coded "Master Bedroom", so every room lookup returned
    # the master bedroom's dimensions.
    roomsXpath = ('//div[@class="room_dimensions overview_table"]'
                  '//tr/td[text()="{}"]/following-sibling::td/text()')
    imgXpath = ('//div[@class=" flexslider_gallery image hf-property-gallery"]'
                '/div/ul/li[{}]/img/@src')
    # NOTE(review): "col-md-8" looks like a CSS class rather than an id --
    # confirm against the page markup.
    descriptionXPath = '//div[@id="col-md-8"]/p/text()'

    # Extra rooms listed in self.oth are folded into one "name:size" string.
    other = []
    for name in self.oth:
        size = hxs.xpath(roomsXpath.format(name)).extract_first()
        if size:
            other.append('{}:{}'.format(name, size))

    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', self._getBuildType(referer))
    loader.add_value('BuilderLogo', self.logo)
    loader.add_xpath('DesignName', '//h3[@class="title-post"]/text()')
    loader.add_value('State', 'MELBOURNE')
    loader.add_xpath('Squares', '//div[@class="info-box1 "]/p[1]/text()')
    loader.add_xpath('Bedrooms', '//li[@class="beds"]/text()')
    loader.add_xpath('Bathrooms', '//li[@class="baths"]/text()')
    loader.add_xpath('Garage', '//li[@class="garages"]/text()')
    loader.add_xpath(
        'BasePrice',
        '//div[@class="field-prefix" and text()="$"]'
        '/following-sibling::div[@class="field-value"]/text()')
    loader.add_value('Storey', self._getStorey(response.meta['Storey']))
    loader.add_xpath('HouseWidth',
                     '//div[text()="MIN. BLOCK WIDTH"]/text()[2]')
    # The label is preceded by a newline and indentation in the markup;
    # normalize-space() matches it regardless of the exact whitespace.
    loader.add_xpath(
        'HouseLength',
        '//div[normalize-space(text())="MIN. BLOCK LENGTH"]/text()[2]')
    loader.add_xpath('BrochureImage_pdf', '//a[text()="Brochure"]/@href')
    loader.add_xpath('InclusionsImage_pdf', '//a[text()="Inclusions"]/@href')
    loader.add_xpath('FloorPlanImage1',
                     '//a[@class="floor-plan fancybox"]/img/@src')

    # Slide 1 is the main image; slides 2..16 are Image1..Image15.
    loader.add_xpath('HomeDesignMainImage', imgXpath.format(1))
    for number in range(1, 16):
        loader.add_xpath('Image{}'.format(number),
                         imgXpath.format(number + 1))

    # Room dimensions: field -> room label(s) in the dimensions table.
    roomFields = (
        ('MasterBedroomDimension', ['Master Bedroom']),
        ('Bedroom2Dimension', ['Bedroom 2']),
        ('Bedroom3Dimension', ['Bedroom 3']),
        ('Bedroom4Dimension', ['Bedroom 4']),
        ('StudyDimension', ['Study', 'Study nook']),
        ('Meals_DiningDimension', ['Meals']),
        ('FamilyDimension', ['Family']),
        ('AlfrescoDimension', ['Alfresco']),
        ('LoungeDimension', ['Lounge']),
        ('TheatreDimension', ['Theatre']),
        # Yes/No block: a room counts as present when it has a table row.
        ('TheatreRoom_Yes_No', ['Theatre']),
        ('SeparateMeals_Yes_No', ['Meals']),
        ('Alfresco_Yes_No', ['Alfresco']),
        ('Study_Yes_No', ['Study Nook', 'Study']),
        ('Balcony_Yes_No', ['Balcony']),
    )
    for field, labels in roomFields:
        paths = [roomsXpath.format(label) for label in labels]
        loader.add_xpath(field, paths if len(paths) > 1 else paths[0])

    loader.add_value('OtherInclusions', ', '.join(other))

    descriptionPatterns = (
        ('WalkinPantry_Yes_No', '([Ww]alkin|[Pp]antry)'),
        # Accept "Butler" as well as the historical "Bulter" spelling.
        ('BultersPantry_Yes_No', '[Bb]u(?:tl|lt)er[`]?s?'),
        ('SteelStructure_Yes_No',
         '([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)'),
        # Warranty
        ('SturturalWarranty', '.*guarantee.*|.*[Ww]arranty.*'),
        # Windows
        ('Windows', '.*[Ww]indows?.*'),
        # Kitchen benchtop
        ('KitchenBenchtop',
         '.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'),
        # Security system ("}" in the original was a typo for "|")
        ('SecuritySystem',
         '.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'),
        # Energy rating
        ('EnergyRating',
         '.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'),
        # Kitchen appliances
        ('KitchenAppliance',
         '.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'),
        # Appliance brand
        # NOTE(review): this pattern looks for "Security System" text --
        # confirm what should identify the appliance brand.
        ('ApplianceBrand', r'.*[\w\s]+[Ss]ecurity System.*'),
        # Splashback (tiling above the sink)
        ('Splashback', '.*[Ss]plashback.*'),
        # Floor covering
        ('FloorCovering',
         '.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'),
        # Cooling
        ('Cooling', '.*[Cc]ooling.*'),
        # Bath (previously searched for "security system" by mistake)
        ('Bath', '.*[Bb]ath.*'),
        # Ceiling height (previously searched for "bath" by mistake)
        ('CeilingHeight', '.*[Cc]eiling.*'),
        # Bathroom tiling
        ('EnsuiteWallTiling', '.*[Tt]ile.*'),
        # Ensuite benchtop
        ('EnsuiteBenchtop',
         '.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'),
        # Shower base
        ('EnsuiteShowerbase', '.*[Ss]howerbase.*'),
        # Wall paint
        ('WallPaint', '.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'),
        # Walk-in robe
        ('WIRFitouts', '.*walk in robe.*|.*WIR.*'),
        # Downlights
        ('Downlights', '.*[Dd]ownlights.*'),
        # Landscaping
        ('Landscaping', '.*[Ll]andscaping.*'),
        # Driveway
        ('Driveway', '.*[Dd]riveway.*'),
        # Promotion
        ('Promotion', '.*[Pp]romotion.*'),
    )
    for field, pattern in descriptionPatterns:
        loader.add_xpath(field, descriptionXPath, re=pattern)

    return loader.load_item()
def parseItem(self, response):
    """Build one item from a home-design or display-home detail page.

    The build type is derived from the Referer header.  Display homes
    take their address from ``response.meta['address']``; every other
    build type takes its storey count from ``response.meta['storey']``.

    Args:
        response: the downloaded detail page.

    Returns:
        The item produced by ``RealtyLoader.load_item()``.

    Raises:
        AttributeError: if the request carries no ``Referer`` header.
        KeyError: if the required ``response.meta`` entry is missing.
    """
    referer = response.request.headers.get('Referer', None).decode("utf-8")
    hxs = HtmlXPathSelector(response)
    BuildType = self._getBuildType(referer)

    imgXpath = ('//main[@id="content"]/div[@class="carousel -arrows flickity "]'
                '/div[{}]/figure/div/img/@data-flickity-lazyload')
    areaXpath = ('//div[@class="table-light"]/table/tbody/tr'
                 '/td[text()="{}"]/following-sibling::td[1]/text()')
    roomsXpath = ('//h1[text()="Room dimensions"]/following-sibling::dl'
                  '/dt[text()="{}"]/following-sibling::dd[1]/text()')
    sizeXpath = '//div[@class="h5 +color-dark"]/text()'

    def rooms(*labels):
        """Return the dimension XPaths for the given room labels."""
        return [roomsXpath.format(label) for label in labels]

    # Extra rooms listed in self.oth are folded into one "name:size" string.
    other = []
    for name in self.oth:
        size = hxs.xpath(roomsXpath.format(name)).extract_first()
        if size:
            other.append('{}:{}'.format(name, size))

    loader = RealtyLoader(RealtyspidersItem(), hxs)
    loader.add_value('url', response.url)
    loader.add_value('BuildType', BuildType)
    loader.add_value('BuilderLogo', self.logo)
    if BuildType == 'Displays Homes':
        loader.add_value('Lot_BlockAddress', response.meta['address'])
    else:
        loader.add_value('Storey', response.meta['storey'])

    loader.add_xpath(
        'DesignName',
        '//h1[@class="h1 +margin-none +color-dark"]/strong/text()')
    loader.add_xpath('Bedrooms', '//dl[@class="rooms-count"]/dd[1]/text()')
    loader.add_xpath('Bathrooms', '//dl[@class="rooms-count"]/dd[2]/text()')
    loader.add_xpath('Garage', '//dl[@class="rooms-count"]/dd[3]/text()')
    loader.add_xpath(
        'BasePrice',
        '//div/small[text()="Priced From"]/ancestor::div'
        '/following-sibling::div[@class="h1 +color-dark"]/text()')
    # Raw strings keep the regex backslashes literal without relying on
    # Python passing unknown escape sequences through.
    loader.add_xpath('HouseWidth', sizeXpath,
                     re=r'((?<=Exterior Width )\d+\.\d+m)')
    loader.add_xpath('HouseLength', sizeXpath,
                     re=r'((?<=Exterior Length )\d+\.\d+m)')
    loader.add_xpath('GarageDimension', areaXpath.format('Garage'))
    # The "Porch" row of the area table feeds both alfresco fields.
    loader.add_xpath('AlfrescoDimension', areaXpath.format('Porch'))
    loader.add_xpath('Alfresco_Yes_No', areaXpath.format('Porch'))
    loader.add_xpath(
        'Squares',
        '//div[@class="table-light"]/table/tfoot/tr'
        '/td[text()="Total Area"]/following-sibling::td[1]/text()')

    # Room labels vary between pages, so several spellings are tried.
    studyPaths = rooms('Study', 'Study (ground floor)',
                       'Study (first floor)', 'Study (First floor)')
    loader.add_xpath('MasterBedroomDimension',
                     rooms('Master Bed', 'Bedroom 1'))
    loader.add_xpath('Bedroom2Dimension', rooms('Bed 2', 'Bedroom 2'))
    loader.add_xpath('Bedroom3Dimension', rooms('Bed 3', 'Bedroom 3'))
    loader.add_xpath('Bedroom4Dimension', rooms('Bed 4', 'Bedroom 4'))
    loader.add_xpath('Study_Yes_No', studyPaths)
    loader.add_xpath('StudyDimension', studyPaths)
    loader.add_xpath('FamilyDimension', rooms('Family'))
    loader.add_xpath('Meals_DiningDimension',
                     rooms('Family / Meals', 'Meals/Family',
                           'Living / Meals', 'Meals'))
    loader.add_xpath('TheatreRoom_Yes_No', rooms('Theatre'))
    loader.add_xpath('TheatreDimension', rooms('Theatre'))
    loader.add_xpath('LivingArea', rooms('Living'))

    loader.add_xpath(
        'BrochureImage_pdf',
        '//div[@class="+v-spacer-xs +t-margin-sm"]/div'
        '/a[text()="\t\tDownload Floorplan\n\t"]/@href')
    loader.add_xpath('FloorPlanImage1',
                     '//div[@class="section +t-padding-md"]//img/@src')

    # NOTE(review): the main image and Image1 both use carousel slide 1 --
    # confirm whether Image1 should start at slide 2.
    loader.add_xpath('HomeDesignMainImage', imgXpath.format(1))
    for number in range(1, 16):
        loader.add_xpath('Image{}'.format(number), imgXpath.format(number))

    loader.add_value('OtherInclusions', ', '.join(other))
    return loader.load_item()