def __init__(self, *args, **kwargs):
    """Build the HTML page, then replace ``self.doc`` with embedded JSON.

    ``HTMLPage.__init__`` runs first.  The ``initialData`` string found in
    the page scripts is then unescaped and parsed with ``json.loads``; the
    parsed value becomes the new ``self.doc``.
    """
    HTMLPage.__init__(self, *args, **kwargs)

    extract_payload = Regexp(
        CleanText('//script'),
        r"window\[\"initialData\"\] = JSON.parse\(\"({.*})\"\);window\[\"tags\"\]"
    )
    escaped = extract_payload(self.doc)

    # The captured text is still backslash-escaped: decode the escapes.
    unescaped, _ = codecs.unicode_escape_decode(escaped)
    # Round-trip through UTF-8, encoding with 'surrogatepass'.
    normalized = unescaped.encode('utf-8', 'surrogatepass').decode('utf-8')

    self.doc = json.loads(normalized)
def obj_details(self):
    """Gather the extra details displayed on the advert page.

    The returned dict holds the availability date under ``"dispo"``, the
    optional ``"priceMentions"`` and ``"agency"`` texts, one nested dict per
    titled details column and, when a DPE image URL can be matched, an
    ``"electric_consumption"`` string.
    """
    found = {}

    # NOTE(review): the captured date looks day-first (dd/mm/yyyy) but no
    # ``dayfirst`` flag is given to Date -- confirm this is intended.
    availability = Date(
        Regexp(
            CleanText('//p[has-class("OfferTop-dispo")]'),
            r'.* (\d\d\/\d\d\/\d\d\d\d)',
            default=datetime.date.today().isoformat()
        )
    )(self)
    if availability is not None:
        found["dispo"] = availability

    # Optional free-text fields: only stored when the node exists.
    optional_fields = (
        ("priceMentions", '//p[has-class("OfferTop-mentions")]'),
        ("agency", '//p[has-class("OfferContact-address")]'),
    )
    for key, xpath in optional_fields:
        text = CleanText(xpath, default=None)(self)
        if text is not None:
            found[key] = text

    # One nested dict per column that has a heading.
    for column in self.xpath('//div[has-class("OfferDetails-columnize")]/div'):
        heading = CleanText(
            './h3[has-class("OfferDetails-title--2")]',
            default=None
        )(column)
        if not heading:
            continue

        section = {}
        found[heading] = section

        # "label: value" rows.
        for row in column.xpath('.//ul[has-class("List--data")]/li'):
            label = CleanText('.//span[has-class("List-data")]')(row)
            section[label] = CleanText('.//*[has-class("List-value")]')(row)

        # Bullet rows are plain flags.
        for bullet in column.xpath('.//ul[has-class("List--bullet")]/li'):
            section[CleanText('.')(bullet)] = True

    try:
        consumption = CleanDecimal(
            Regexp(
                Attr('//div[has-class("OfferDetails-content")]//img', 'src'),
                r'https://dpe.foncia.net\/(\d+)\/.*'
            )
        )(self)
    except (RegexpError, XPathNotFound):
        # No matching DPE image: the entry is simply left out.
        pass
    else:
        found["electric_consumption"] = (
            '{} kWhEP/m².an'.format(consumption))

    return found
def __init__(self, *args, **kwargs):
    """Build the HTML page, then replace ``self.doc`` with embedded JSON.

    ``HTMLPage.__init__`` runs first.  The ``initialData`` string found in
    the page scripts is then unescaped and parsed, and ``self.doc`` becomes
    a dict with two keys:

    * ``"advert"``: ``data["advert"]["mainAdvert"]`` (``{}`` when missing);
    * ``"agency"``: ``data["agency"]`` (``{}`` when missing).
    """
    HTMLPage.__init__(self, *args, **kwargs)
    json_content = Regexp(
        CleanText('//script'),
        r"window\[\"initialData\"\] = JSON.parse\(\"({.*})\"\);"
    )(self.doc)
    # The captured text is still backslash-escaped: decode the escapes.
    json_content = codecs.unicode_escape_decode(json_content)[0]
    json_content = json_content.encode('utf-8', 'surrogatepass').decode('utf-8')

    # Parse the payload once and reuse it for both entries.
    data = json.loads(json_content)
    self.doc = {
        "advert": data.get('advert', {}).get('mainAdvert', {}),
        "agency": data.get('agency', {})
    }
def obj_details(self):
    """Collect the creation date, agency fees and criteria of the advert.

    Returns a dict with a ``"creationDate"`` entry, an ``"Honoraires"``
    entry when an agency-fees label is present, and one entry per
    label/value pair found in the description list.

    The ``Regexp`` used for the creation date has no default, so it is
    expected to raise when the "Mis en ligne" text is absent.
    """
    details = {}

    # Backslashes are doubled so the literal keeps its ``u`` prefix without
    # relying on invalid escape sequences; the pattern itself is unchanged.
    details["creationDate"] = Date(
        Regexp(
            CleanText(
                '//div[@class="offer-description-notes"]'
            ),
            u'.*Mis en ligne: (\\d{2}/\\d{2}/\\d{4}).*'
        ),
        dayfirst=True
    )(self)

    honoraires = CleanText(
        (
            '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]'
        ),
        default=None
    )(self)
    if honoraires:
        # Keep the text after the first colon; fall back to the whole text
        # when there is no colon instead of raising IndexError.
        parts = honoraires.split(":")
        fees = parts[1] if len(parts) > 1 else parts[0]
        details["Honoraires"] = (
            "{} (TTC, en sus)".format(fees.strip())
        )

    for li in XPath('//ul[@itemprop="description"]/li')(self):
        label = CleanText('./span[has-class("criteria-label")]')(li)
        value = CleanText('./span[has-class("criteria-value")]')(li)
        details[label] = value

    return details
def obj_DPE(self):
    """Map the electric consumption read from the DPE image URL to a class.

    The number is extracted from the ``src`` of the image in the details
    section.  Returns the matching ``ENERGY_CLASS`` attribute ("A" to "G"),
    or ``NotAvailable`` when the image or the number cannot be found.
    """
    try:
        consumption = CleanDecimal(
            Regexp(
                Attr('//div[has-class("OfferDetails-content")]//img', 'src'),
                r'https://dpe.foncia.net\/(\d+)\/.*'
            )
        )(self)
    except (RegexpError, XPathNotFound):
        return NotAvailable

    if consumption is None:
        return NotAvailable

    # Inclusive upper bound of each class; above the last bound it is "G".
    ceilings = (
        (50, "A"),
        (90, "B"),
        (150, "C"),
        (230, "D"),
        (330, "E"),
        (450, "F"),
    )
    grade = "G"
    for ceiling, letter in ceilings:
        if consumption <= ceiling:
            grade = letter
            break

    return getattr(ENERGY_CLASS, grade, NotAvailable)
class item(ItemElement):
    """Describe how a shared-housing listing entry fills a ``Housing``.

    Every ``obj_*`` attribute below is either a constant or a filter
    evaluated relative to the listing node.
    """
    klass = Housing

    # The id is the header id without its "header-offer-" prefix.
    obj_id = Format('colocation-%s', CleanText('./div/header/@id', replace=[('header-offer-', '')]))
    obj_type = POSTS_TYPES.SHARING
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL
    obj_title = CleanText(CleanHTML('./div/header/section/p[@class="property-type"]/span/@title'))
    obj_area = CleanDecimal(
        './div/header/section/p[@class="offer-attributes"]/a/span[@class="offer-area-number"]',
        default=0
    )
    obj_cost = CleanDecimal('./div/header/section/p[@class="price"]', default=0)
    obj_currency = Currency(
        './div/header/section/p[@class="price"]'
    )
    obj_utilities = UTILITIES.UNKNOWN
    obj_text = CleanText(
        './div/div[@class="content-offer"]/section[has-class("content-desc")]/p/span[has-class("offer-text")]/@title',
        default=NotLoaded
    )
    # The pattern only captures dd/mm/yyyy text, so the date must be parsed
    # day-first; otherwise day and month are swapped for days <= 12.
    obj_date = Date(
        Regexp(
            CleanText('./div/header/section/p[has-class("update-date")]'),
            r".*(\d{2}/\d{2}/\d{4}).*"
        ),
        dayfirst=True
    )
    obj_location = CleanText(
        '(./div/div[@class="content-offer"]/section[has-class("content-desc")]/p)[1]/span/@title',
        default=NotLoaded
    )
class get_housing(ItemElement):
    """Describe how an advert page fills a ``Housing`` object."""
    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText('//h1[@itemprop="name"]')
    obj_location = CleanText('//span[@class="informations-localisation"]')
    obj_cost = CleanDecimal('//span[@itemprop="price"]')
    obj_currency = Currency('//span[@itemprop="price"]')
    obj_text = CleanHTML('//div[@itemprop="description"]')
    obj_url = BrowserURL('housing', _id=Env('_id'))
    # The area is the number placed just before " m2" in the title.
    obj_area = CleanDecimal(
        Regexp(
            CleanText('//h1[@itemprop="name"]'),
            r'(.*?)(\d*) m2(.*?)',
            '\\2'
        ),
        default=NotAvailable
    )
    obj_price_per_meter = PricePerMeterFilter()

    def obj_photos(self):
        """Return one ``HousingPhoto`` per thumbnail image.

        The photo URL is the second ``http://`` URL embedded in the
        thumbnail ``src``.
        """
        thumbnails = XPath('//a[@class="thumbnail-link"]/img[@itemprop="image"]')(self)
        return [
            HousingPhoto(
                Regexp(CleanText('./@src'), r'http://thbr\.figarocms\.net.*(http://.*)')(node)
            )
            for node in thumbnails
        ]

    def obj_details(self):
        """Return the feature list, plus the energy entry, as a dict.

        Pairs with an empty name or an empty value are left out.
        """
        features = {}

        for row in XPath('//div[@class="features clearfix"]/ul/li')(self):
            label = CleanText('./span[@class="name"]')(row)
            content = CleanText('./span[@class="value"]')(row)
            if not (content and label):
                continue
            features[label] = content

        energy_label = CleanText('//div[@class="title-dpe clearfix"]')(self)
        energy_value = CleanText('//div[@class="energy-consumption"]')(self)
        if energy_value and energy_label:
            features[energy_label] = energy_value

        return features
def obj_cost(self):
    """Return the cost read from ``self.price_selector``.

    When the price text has the form "de X à Y", X is returned.  Otherwise
    the whole price text is parsed, with ``NotAvailable`` as a default.
    """
    range_start = CleanDecimal(
        Regexp(
            CleanText(self.price_selector, default=''),
            r'de (.*) à .*',
            default=0
        )
    )(self)

    if range_start != 0:
        return range_start

    # Not a price range: parse the whole price text.
    return CleanDecimal(self.price_selector, default=NotAvailable)(self)
def condition(self):
    """Tell whether this listing entry should be kept.

    Returns ``False`` when the entry has no id.  Otherwise the result is
    truthy only when the entry link matches ``/annonces/`` and, for rent
    queries, when the title does not mention "meublé".
    """
    title = self.obj_title(self)

    is_rent_query = self.env['query_type'] == POSTS_TYPES.RENT
    # Only rent queries reject furnished housings.
    unfurnished_ok = not (is_rent_query and 'meublé' in title.lower())

    if self.obj_id(self) is None:
        return False

    link_id = Regexp(
        Link('./div/a[has-class("item-title")]'),
        '/annonces/(.*)',
        default=None
    )(self)
    return link_id and unfurnished_ok
def parse(self, el):
    """Fill ``self.env`` with the rooms, bedrooms and area of a listing.

    The three entries start as ``NotAvailable`` and are overwritten from
    the tags found under ``el``.  A tag that is not recognised (for example
    a price "le m²") is skipped: it used to reuse the value of the previous
    tag under the raw tag text as key, or to fail on an unbound ``value``
    when it came first.

    :param el: element of the listing entry being parsed
    """
    rooms_bedrooms_area = el.xpath('.//ul[has-class("item-tags")]/li')

    self.env['rooms'] = NotAvailable
    self.env['bedrooms'] = NotAvailable
    self.env['area'] = NotAvailable

    for item in rooms_bedrooms_area:
        label = CleanText('.')(item)
        lowered = label.lower()

        if 'chambre' in lowered:
            key = 'bedrooms'
            value = CleanDecimal('./strong')(item)
        elif 'pièce' in lowered:
            key = 'rooms'
            value = CleanDecimal('./strong')(item)
        elif ' m²' in label and 'le m²' not in label:
            key = 'area'
            # Keep only the leading number of the tag.
            value = CleanDecimal(
                Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(item)
        else:
            # Unknown tag: nothing to store.
            continue

        self.env[key] = value
def parse(self, el):
    """Fill ``self.env`` with the rooms, bedrooms and area of a listing.

    The three entries start as ``NotLoaded``.  Each tag of the entry title
    is then stored as bedrooms ("chambre"), rooms ("pièce") or, for any
    other tag, area.

    :param el: element of the listing entry being parsed
    """
    tags = el.xpath(
        './div/a[has-class("item-title")]/ul[has-class("item-tags")]/li'
    )

    for key in ('rooms', 'bedrooms', 'area'):
        self.env[key] = NotLoaded

    for tag in tags:
        lowered = CleanText('.')(tag).lower()

        if 'chambre' in lowered:
            key, amount = 'bedrooms', CleanDecimal('.')(tag)
        elif 'pièce' in lowered:
            key, amount = 'rooms', CleanDecimal('.')(tag)
        else:
            # Anything else is read as the area: keep its leading number.
            key = 'area'
            amount = CleanDecimal(
                Regexp(CleanText('.'), r'(\d*\.*\d*) .*')
            )(tag)

        self.env[key] = amount
class item(ItemElement):
    """Describe how a listing entry fills a ``Housing`` object."""
    klass = Housing

    def condition(self):
        """Tell whether this listing entry should be kept.

        Returns ``False`` when the entry has no id.  Otherwise the result
        is truthy only when the entry link matches ``/annonces/`` and, for
        rent queries, when the title does not mention "meublé".
        """
        title = self.obj_title(self)

        is_rent_query = self.env['query_type'] == POSTS_TYPES.RENT
        # Only rent queries reject furnished housings.
        unfurnished_ok = not (is_rent_query and 'meublé' in title.lower())

        if self.obj_id(self) is None:
            return False

        link_id = Regexp(
            Link('./div/a[has-class("item-title")]'),
            '/annonces/(.*)',
            default=None
        )(self)
        return link_id and unfurnished_ok

    def parse(self, el):
        """Fill ``self.env`` with the rooms, bedrooms and area of the entry.

        The three entries start as ``NotLoaded``.  Each tag of the entry
        title is then stored as bedrooms ("chambre"), rooms ("pièce") or,
        for any other tag, area.
        """
        tags = el.xpath(
            './div/a[has-class("item-title")]/ul[has-class("item-tags")]/li'
        )

        for key in ('rooms', 'bedrooms', 'area'):
            self.env[key] = NotLoaded

        for tag in tags:
            lowered = CleanText('.')(tag).lower()

            if 'chambre' in lowered:
                key, amount = 'bedrooms', CleanDecimal('.')(tag)
            elif 'pièce' in lowered:
                key, amount = 'rooms', CleanDecimal('.')(tag)
            else:
                # Anything else is read as the area: keep its leading number.
                key = 'area'
                amount = CleanDecimal(
                    Regexp(CleanText('.'), r'(\d*\.*\d*) .*')
                )(tag)

            self.env[key] = amount

    obj_id = Regexp(
        Link('./div/a[has-class("item-title")]'),
        '/annonces/(.*)',
        default=None
    )
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PERSONAL

    def obj_house_type(self):
        """Guess the house type from the first word of the link slug."""
        link = Link('./div/a[@class="item-title"]')(self)
        slug_head = link.split('/')[-1].split('-')[0]

        # Checked in this order; the first keyword found wins.
        keywords = (
            ('parking', HOUSE_TYPES.PARKING),
            ('appartement', HOUSE_TYPES.APART),
            ('terrain', HOUSE_TYPES.LAND),
            ('maison', HOUSE_TYPES.HOUSE),
        )
        for keyword, kind in keywords:
            if keyword in slug_head:
                return kind
        return HOUSE_TYPES.OTHER

    obj_title = CleanText('./div/a[has-class("item-title")]')
    obj_area = Env('area')
    obj_cost = CleanDecimal(
        CleanText('./div/a[has-class("item-title")]/span[@class="item-price"]'),
        replace_dots=True,
        default=Decimal(0)
    )
    obj_currency = Currency(
        './div/a[@class="item-title"]/span[@class="item-price"]'
    )
    obj_utilities = UTILITIES.UNKNOWN
    obj_station = CleanText('./div/p[@class="item-transports"]', default=NotLoaded)

    def obj_location(self):
        """Return the description text up to its first full stop."""
        description = CleanText('./div/p[@class="item-description"]')(self)
        return description.partition(".")[0]

    obj_text = CleanText(
        './div/p[@class="item-description"]',
        replace=[(' Lire la suite', '')]
    )
    obj_rooms = Env('rooms')
    obj_bedrooms = Env('bedrooms')
    obj_price_per_meter = PricePerMeterFilter()
    obj_url = Format(
        u'http://www.pap.fr%s',
        Link('./div/a[@class="item-title"]')
    )

    def obj_photos(self):
        """Return the entry photos, leaving out the placeholder images."""
        placeholders = ("visuel-nophoto.png", 'miniature-video.png')
        return [
            HousingPhoto(u'%s' % src)
            for src in XPath('./a/img/@src')(self)
            if not src.endswith(placeholders)
        ]
def obj_photos(self):
    """Return one ``HousingPhoto`` per thumbnail image.

    The photo URL is the second ``http://`` URL embedded in the thumbnail
    ``src``.
    """
    thumbnails = XPath('//a[@class="thumbnail-link"]/img[@itemprop="image"]')(self)
    return [
        HousingPhoto(
            Regexp(CleanText('./@src'), r'http://thbr\.figarocms\.net.*(http://.*)')(node)
        )
        for node in thumbnails
    ]
class get_housing(ItemElement):
    """Describe how an advert page fills a ``Housing`` object."""
    klass = Housing

    obj_id = Env('_id')

    def obj_type(self):
        """Derive the post type from the housing URL.

        "colocation" gives SHARING, "location" gives RENT or FURNISHED_RENT
        (depending on the "meublé" criterion) and "vente" gives SALE.
        Anything else yields ``NotAvailable``.
        """
        url = BrowserURL('housing', _id=Env('_id'))(self)
        if 'colocation' in url:
            return POSTS_TYPES.SHARING
        elif 'location' in url:
            isFurnished = False
            for li in XPath('//ul[@itemprop="description"]/li')(self):
                label = CleanText('./span[has-class("criteria-label")]')(li)
                if label.lower() == "meublé":
                    isFurnished = (
                        CleanText('./span[has-class("criteria-value")]')(li).lower() == 'oui'
                    )
            if isFurnished:
                return POSTS_TYPES.FURNISHED_RENT
            else:
                return POSTS_TYPES.RENT
        elif 'vente' in url:
            return POSTS_TYPES.SALE
        return NotAvailable

    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Map the main feature text of the advert to a ``HOUSE_TYPES``."""
        house_type = CleanText('.//h2[@class="offerMainFeatures"]/div')(self).lower()
        if house_type == "appartement":
            return HOUSE_TYPES.APART
        elif house_type == "maison":
            return HOUSE_TYPES.HOUSE
        elif house_type == "terrain":
            return HOUSE_TYPES.LAND
        elif house_type == "parking":
            return HOUSE_TYPES.PARKING
        else:
            return HOUSE_TYPES.OTHER

    obj_title = Attr('//meta[@property="og:title"]', 'content')
    obj_area = CleanDecimal(
        CleanText(
            '//p[@class="offerArea"]/span',
        ),
        default=NotAvailable
    )
    # Raw strings: ``\d`` is not a valid escape in a plain literal.
    obj_rooms = CleanDecimal(
        Regexp(
            CleanText('//p[@class="offerRooms"]/span'),
            r'(\d) p.',
            default=NotAvailable
        ),
        default=NotAvailable
    )
    obj_bedrooms = CleanDecimal(
        Regexp(
            CleanText('//p[@class="offerRooms"]/span'),
            r'(\d) ch.',
            default=NotAvailable
        ),
        default=NotAvailable
    )
    obj_cost = CleanDecimal('//*[@itemprop="price"]', default=0)
    obj_currency = Currency(
        '//*[@itemprop="price"]'
    )

    def obj_utilities(self):
        """Tell whether the notes state that charges are included."""
        notes = CleanText('//p[@class="offer-description-notes"]')(self)
        if "Loyer mensuel charges comprises" in notes:
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.UNKNOWN

    obj_price_per_meter = PricePerMeterFilter()
    # Backslashes are doubled so the literal keeps its ``u`` prefix without
    # relying on invalid escape sequences; the pattern itself is unchanged.
    obj_date = Date(
        Regexp(
            CleanText('//div[@class="offer-description-notes"]'),
            u'.* Mis à jour: (\\d{2}/\\d{2}/\\d{4}).*'
        ),
        dayfirst=True
    )
    obj_text = CleanHTML('//p[@class="descrProperty"]')
    obj_location = CleanText('//em[@class="infoAdresse"]')
    obj_station = CleanText(
        '//div[has-class("offer-description-metro")]',
        default=NotAvailable
    )

    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_photos(self):
        """Return the thumbnails as larger photos, skipping SVG images."""
        photos = []
        for img in XPath('//ul[@class="thumbsContainer"]//img/@src')(self):
            if img.endswith('.svg'):
                continue
            url = u'%s' % img.replace('182x136', '800x600')
            url = urljoin(self.page.url, url)  # Ensure URL is absolute
            photos.append(HousingPhoto(url))
        return photos

    def obj_DPE(self):
        """Return the energy class read from the DPE ``data-class`` value.

        The first character left after removing "DPE" names the class.
        ``NotAvailable`` is returned when nothing is left or when
        ``ENERGY_CLASS`` has no such attribute.
        """
        energy_value = CleanText(
            '//ul[@class="energyInfosDPE"]//li[@class="energyInfos"]/span/@data-class',
            default=""
        )(self)
        # Slicing instead of indexing: an empty remainder no longer raises.
        energy_value = energy_value.replace("DPE", "").strip()[:1]
        return getattr(ENERGY_CLASS, energy_value, NotAvailable)

    def obj_GES(self):
        """Return the energy class read from the GES ``data-class`` value.

        The first character left after removing "GES" names the class.
        ``NotAvailable`` is returned when nothing is left or when
        ``ENERGY_CLASS`` has no such attribute.
        """
        greenhouse_value = CleanText(
            '//ul[@class="energyInfosGES"]//li[@class="energyInfos"]/span/@data-class',
            default=""
        )(self)
        # Slicing instead of indexing: an empty remainder no longer raises.
        greenhouse_value = greenhouse_value.replace("GES", "").strip()[:1]
        return getattr(ENERGY_CLASS, greenhouse_value, NotAvailable)

    def obj_details(self):
        """Collect the creation date, agency fees and criteria of the advert.

        Returns a dict with a ``"creationDate"`` entry, an ``"Honoraires"``
        entry when an agency-fees label is present, and one entry per
        label/value pair found in the description list.
        """
        details = {}

        details["creationDate"] = Date(
            Regexp(
                CleanText(
                    '//div[@class="offer-description-notes"]'
                ),
                u'.*Mis en ligne: (\\d{2}/\\d{2}/\\d{4}).*'
            ),
            dayfirst=True
        )(self)

        honoraires = CleanText(
            (
                '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]'
            ),
            default=None
        )(self)
        if honoraires:
            # Keep the text after the first colon; fall back to the whole
            # text when there is no colon instead of raising IndexError.
            parts = honoraires.split(":")
            fees = parts[1] if len(parts) > 1 else parts[0]
            details["Honoraires"] = (
                "{} (TTC, en sus)".format(fees.strip())
            )

        for li in XPath('//ul[@itemprop="description"]/li')(self):
            label = CleanText('./span[has-class("criteria-label")]')(li)
            value = CleanText('./span[has-class("criteria-value")]')(li)
            details[label] = value

        return details
class item(ItemElement):
    """Describe how a listing entry fills a ``Housing`` object."""
    # Common XPath prefix of most fields below.
    offer_details_wrapper = (
        './/div[has-class("offer-details-wrapper")]'
    )
    klass = Housing

    # "<type prefix>-<id without its header-offer- prefix>".
    obj_id = Format(
        '%s-%s',
        Regexp(Env('type'), '(.*)-.*'),
        CleanText('./@id', replace=[('header-offer-', '')])
    )
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Map the characteristic name of the entry to a ``HOUSE_TYPES``."""
        house_type = CleanText(
            './/div[has-class("offer-details-caracteristik")]/meta[@itemprop="name"]/@content'
        )(self).lower()
        if house_type == "appartement":
            return HOUSE_TYPES.APART
        elif house_type == "maison":
            return HOUSE_TYPES.HOUSE
        elif house_type == "terrain":
            return HOUSE_TYPES.LAND
        elif house_type == "parking":
            return HOUSE_TYPES.PARKING
        else:
            return HOUSE_TYPES.OTHER

    obj_title = CleanText('.//div[has-class("offer-details-type")]/a/@title')
    # The second XPath held a stray backslash left by a line continuation
    # inside the literal; it is written here as one clean expression.
    obj_url = Format(
        u'%s%s',
        CleanText('.//div/a[@class="offer-link"]/@href'),
        CleanText('.//div/a[@class="offer-link"]/@data-orpi', default="")
    )

    obj_area = CleanDecimal(
        (
            offer_details_wrapper +
            '/div/div/div[has-class("offer-details-second")]' +
            '/div/h3[has-class("offer-attributes")]/span' +
            '/span[has-class("offer-area-number")]'
        ),
        default=NotLoaded
    )
    obj_rooms = CleanDecimal(
        (
            offer_details_wrapper +
            '/div/div/div[has-class("offer-details-second")]' +
            '/div/h3[has-class("offer-attributes")]' +
            '/span[has-class("offer-rooms")]' +
            '/span[has-class("offer-rooms-number")]'
        ),
        default=NotAvailable
    )
    # Keep what precedes the currency symbol in the price text.
    obj_cost = CleanDecimal(
        Regexp(
            CleanText(
                (
                    offer_details_wrapper +
                    '/div/p[@class="offer-price"]/span'
                ),
                default=NotLoaded
            ),
            '(.*) [%s%s%s]' % (u'€', u'$', u'£'),
            default=NotLoaded
        ),
        default=NotLoaded
    )
    obj_currency = Currency(
        offer_details_wrapper + '/div/p[has-class("offer-price")]/span'
    )
    obj_price_per_meter = PricePerMeterFilter()
    obj_utilities = UTILITIES.UNKNOWN
    obj_text = CleanText(
        offer_details_wrapper + '/div/div/div/p[has-class("offer-description")]/span'
    )
    obj_location = CleanText(
        offer_details_wrapper + '/div[@class="offer-details-location"]',
        replace=[('Voir sur la carte','')]
    )

    def obj_photos(self):
        """Return the entry picture, enlarged, as a one-item list.

        Looking the picture up is best effort: an empty list is returned
        when it cannot be read.
        """
        photos = []
        url = None
        try:
            url = Attr(
                './/div[has-class("offer-picture")]//img',
                'src'
            )(self)
        except Exception:
            # Still best effort, but no longer swallows KeyboardInterrupt
            # or SystemExit as the bare ``except`` did.
            pass

        if url:
            url = url.replace('335x253', '800x600')
            url = urljoin(self.page.url, url)  # Ensure URL is absolute
            photos.append(HousingPhoto(url))
        return photos

    def obj_details(self):
        """Return the agency fees, when present, under ``"Honoraires"``."""
        details = {}
        honoraires = CleanText(
            (
                self.offer_details_wrapper +
                '/div/div/p[@class="offer-agency-fees"]'
            ),
            default=None
        )(self)
        if honoraires:
            # Keep the text after the first colon; fall back to the whole
            # text when there is no colon instead of raising IndexError.
            parts = honoraires.split(":")
            fees = parts[1] if len(parts) > 1 else parts[0]
            details["Honoraires"] = (
                "{} (TTC, en sus)".format(fees.strip())
            )
        return details
class item(ItemElement):
    """Describe how a listing entry fills a ``Housing`` object."""
    klass = Housing

    # XPath shared by the cost and currency fields.
    price_selector = './/span[@class="price-label"]|./div/div[@class="item-price-pdf"]'

    def is_agency(self):
        """Return True unless the agency name says "annonce de particulier"."""
        agency = CleanText('.//span[has-class("item-agency-name")]')(self.el)
        return 'annonce de particulier' not in agency.lower()

    def condition(self):
        """Tell whether this listing entry should be kept.

        When exactly one advert type is requested, the entry is kept only
        if it matches that type.  Otherwise the entry is kept when it has a
        ``data-classified-id`` attribute.
        """
        if len(self.env['advert_types']) == 1:
            is_agency = self.is_agency()
            if self.env['advert_types'][0] == ADVERT_TYPES.PERSONAL:
                return not is_agency
            elif self.env['advert_types'][0] == ADVERT_TYPES.PROFESSIONAL:
                return is_agency
        return Attr('.', 'data-classified-id', default=False)(self)

    obj_id = Attr('.', 'data-classified-id')
    obj_type = Env('query_type')
    obj_title = CleanText('./div/h2[@class="item-type"]')

    def obj_advert_type(self):
        """Return PROFESSIONAL for agency adverts, PERSONAL otherwise."""
        if self.is_agency():
            return ADVERT_TYPES.PROFESSIONAL
        else:
            return ADVERT_TYPES.PERSONAL

    def obj_house_type(self):
        """Map the first word of the title to a ``HOUSE_TYPES`` value.

        An empty title yields ``HOUSE_TYPES.OTHER`` instead of raising
        IndexError.
        """
        words = self.obj_title(self).split()
        if not words:
            return HOUSE_TYPES.OTHER

        kind = words[0].lower()
        if kind in ("appartement", "studio", "chambre"):
            return HOUSE_TYPES.APART
        elif kind in ("maison", "villa"):
            return HOUSE_TYPES.HOUSE
        elif kind == "parking":
            return HOUSE_TYPES.PARKING
        elif kind == "terrain":
            return HOUSE_TYPES.LAND
        else:
            return HOUSE_TYPES.OTHER

    def obj_location(self):
        """Return "<locality> (<postal code>)" read from the entry script.

        ``NotLoaded`` is returned when the script cannot be parsed or does
        not hold the expected address entries.
        """
        script = CleanText('./script')(self)
        try:
            # Should be standard JSON+LD data
            script = json.loads(script)
        except ValueError:
            # But explorimmo can't write JSON correctly and there
            # is a trailing "}".  Drop that single brace only: stripping
            # every trailing brace would also remove the legitimate ones.
            candidate = script.strip()
            if candidate.endswith('}'):
                candidate = candidate[:-1]
            try:
                script = json.loads(candidate)
            except ValueError:
                script = None
        if not script:
            return NotLoaded

        try:
            return '%s (%s)' % (
                script['address']['addressLocality'],
                script['address']['postalCode']
            )
        except (KeyError, TypeError):
            # TypeError: the parsed data is not a dict of dicts.
            return NotLoaded

    def obj_cost(self):
        """Return the cost read from ``self.price_selector``.

        When the price text has the form "de X à Y", X is returned.
        Otherwise the whole price text is parsed.
        """
        cost = CleanDecimal(
            Regexp(
                CleanText(self.price_selector, default=''),
                r'de (.*) à .*',
                default=0
            )
        )(self)
        if cost == 0:
            return CleanDecimal(self.price_selector, default=NotAvailable)(self)
        else:
            return cost

    obj_currency = Currency(price_selector)

    def obj_utilities(self):
        """Return INCLUDED when the price text contains "CC"."""
        utilities = CleanText(
            './div/div/span[@class="price-label"]|'
            './div/div[@class="item-price-pdf"]|'
            './div/div/span[@class="item-price"]'
        )(self)
        if "CC" in utilities:
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.UNKNOWN

    obj_text = CleanText('./div/p[@itemprop="description"]')
    # The area is the number placed just before " m2" in the title.
    obj_area = CleanDecimal(
        Regexp(
            obj_title,
            r'(.*?)([\d,\.]*) m2(.*?)',
            '\\2',
            default=None
        ),
        replace_dots=True,
        default=NotLoaded
    )
    obj_url = Format(
        "https://immobilier.lefigaro.fr/annonces/annonce-%s.html",
        CleanText('./@data-classified-id')
    )
    obj_price_per_meter = PricePerMeterFilter()

    def obj_phone(self):
        """Return the phone number, or ``NotLoaded`` when it is truncated."""
        phone = CleanText(
            './div/div/ul/li[has-class("js-clickphone")]',
            replace=[('Téléphoner : ', '')],
            default=NotLoaded
        )(self)

        if '...' in phone:
            return NotLoaded

        return phone

    def obj_details(self):
        """Return the fees under ``"fees"``, or ``NotLoaded`` without fees."""
        charges = CleanText('.//span[@class="price-fees"]', default=None)(self)
        if not charges:
            return NotLoaded

        # Keep the text after the first colon; fall back to the whole text
        # when there is no colon instead of raising IndexError.
        parts = charges.split(":")
        fees = parts[1] if len(parts) > 1 else parts[0]
        return {
            "fees": fees.strip()
        }

    def obj_photos(self):
        """Return the default image as a one-item list, else ``NotLoaded``.

        When the unquoted URL embeds another ``http://`` URL, that inner
        URL, without its query string, is the one used.
        """
        url = CleanText('./div[has-class("default-img")]/img/@data-src')(self)
        if url:
            url = unquote(url)
            if "http://" in url[3:]:
                rindex = url.rfind("?")
                if rindex == -1:
                    rindex = None
                url = url[url.find("http://", 3):rindex]
            return [HousingPhoto(url)]
        else:
            return NotLoaded
class item(ItemElement):
    """Describe how a listing teaser fills a ``Housing`` object."""
    klass = Housing

    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('.//span[boolean(@data-reference)]', 'data-reference')
    )
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Find the house type whose URL segment appears in the teaser URL.

        Returns ``NotLoaded`` when no entry of ``QUERY_HOUSE_TYPES``
        matches.
        """
        url = self.obj_url(self)
        for house_type, url_segments in QUERY_HOUSE_TYPES.items():
            for segment in url_segments:
                if ('/%s/' % segment) in url:
                    return house_type
        return NotLoaded

    # Defined once; a second identical assignment used to shadow the first.
    obj_url = AbsoluteLink('.//h3[has-class("TeaserOffer-title")]/a')
    obj_title = CleanText('.//h3[has-class("TeaserOffer-title")]')
    # Keep only the leading number of the surface text.
    obj_area = CleanDecimal(
        Regexp(
            CleanText(
                './/div[has-class("MiniData")]//p[@data-behat="surfaceDesBiens"]'
            ),
            r'(\d*\.*\d*) .*',
            default=NotAvailable
        ),
        default=NotAvailable
    )
    obj_cost = CleanDecimal(
        './/strong[has-class("TeaserOffer-price-num")]',
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = Currency(
        './/strong[has-class("TeaserOffer-price-num")]')
    obj_location = CleanText('.//p[has-class("TeaserOffer-loc")]')
    obj_text = CleanText('.//p[has-class("TeaserOffer-description")]')

    def obj_photos(self):
        """Return the teaser picture as a one-item list.

        An empty list is returned when the ``src`` is empty or is the
        default "no photo" picture.
        """
        url = CleanText(
            Attr('.//a[has-class("TeaserOffer-ill")]/img', 'src'))(self)
        # If the used photo is a default no photo, the src is on the same domain.
        # An empty src used to raise IndexError on ``url[0]``.
        if not url or url.startswith('/'):
            return []
        else:
            return [HousingPhoto(url)]

    def obj_date(self):
        """Return today's date.

        Computed on each call: a class-level ``datetime.date.today()`` is
        evaluated once, when the class is defined, and goes stale in a
        long-running process.
        """
        return datetime.date.today()

    def obj_utilities(self):
        """Return INCLUDED when the price mentions "charges comprises"."""
        price = CleanText(
            './/strong[has-class("TeaserOffer-price-num")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        './/div[has-class("MiniData")]//p[@data-behat="nbPiecesDesBiens"]',
        default=NotLoaded)
    obj_bedrooms = CleanDecimal(
        './/div[has-class("MiniData")]//p[@data-behat="nbChambresDesBiens"]',
        default=NotLoaded)

    def obj_details(self):
        """Return the availability date and the price mentions.

        The date falls back to today when the teaser has no ``data-dispo``
        attribute.
        """
        return {
            "dispo": Date(
                Attr('.//span[boolean(@data-dispo)]', 'data-dispo',
                     default=datetime.date.today().isoformat()))(self),
            "priceMentions": CleanText(
                './/span[has-class("TeaserOffer-price-mentions")]')(
                self)
        }
class get_housing(ItemElement):
    """Describe how an advert page fills a ``Housing`` object."""
    klass = Housing

    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('//div[boolean(@data-property-reference)]', 'data-property-reference')
    )
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_type(self):
        """Derive the post type from the ``type`` env value and the URL."""
        query_type = Env('type')(self)
        if query_type == 'location':
            if 'appartement-meuble' in self.page.url:
                return POSTS_TYPES.FURNISHED_RENT
            else:
                return POSTS_TYPES.RENT
        elif query_type == 'achat':
            return POSTS_TYPES.SALE
        else:
            return NotAvailable

    def obj_url(self):
        """Return the URL of the current page."""
        return self.page.url

    def obj_house_type(self):
        """Find the house type whose URL segment appears in the page URL.

        Returns ``NotAvailable`` when no entry of ``QUERY_HOUSE_TYPES``
        matches.
        """
        url = self.obj_url()
        for house_type, url_segments in QUERY_HOUSE_TYPES.items():
            for segment in url_segments:
                if ('/%s/' % segment) in url:
                    return house_type
        return NotAvailable

    obj_title = CleanText('//h1[has-class("OfferTop-title")]')
    # Keep only the leading number of the first data item.
    obj_area = CleanDecimal(
        Regexp(
            CleanText(
                '//div[has-class("MiniData")]//p[has-class("MiniData-item")][1]'
            ),
            r'(\d*\.*\d*) .*',
            default=NotAvailable
        ),
        default=NotAvailable
    )
    obj_cost = CleanDecimal('//span[has-class("OfferTop-price")]',
                            default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = Currency('//span[has-class("OfferTop-price")]')
    obj_location = Format('%s - %s',
                          CleanText('//p[@data-behat="adresseBien"]'),
                          CleanText('//p[has-class("OfferTop-loc")]'))
    obj_text = CleanText('//div[has-class("OfferDetails-content")]/p[1]')
    obj_phone = Regexp(Link('//a[has-class("OfferContact-btn--tel")]'),
                       r'tel:(.*)')

    def obj_photos(self):
        """Return the slider pictures, requested in a larger size."""
        photos = []
        for photo in self.xpath('//div[has-class("OfferSlider")]//img'):
            photo_url = Attr('.', 'src')(photo)
            photo_url = photo_url.replace('640/480', '800/600')
            photos.append(HousingPhoto(photo_url))
        return photos

    def obj_date(self):
        """Return today's date.

        Computed on each call: a class-level ``datetime.date.today()`` is
        evaluated once, when the class is defined, and goes stale in a
        long-running process.
        """
        return datetime.date.today()

    def obj_utilities(self):
        """Return INCLUDED when the price mentions "charges comprises"."""
        price = CleanText('//p[has-class("OfferTop-price")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][2]',
        default=NotAvailable)
    obj_bedrooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][3]',
        default=NotAvailable)

    def obj_DPE(self):
        """Map the electric consumption read from the DPE image to a class.

        Returns the matching ``ENERGY_CLASS`` attribute ("A" to "G"), or
        ``NotAvailable`` when the image or the number cannot be found.
        """
        try:
            electric_consumption = CleanDecimal(
                Regexp(
                    Attr('//div[has-class("OfferDetails-content")]//img', 'src'),
                    r'https://dpe.foncia.net\/(\d+)\/.*'
                )
            )(self)
        except (RegexpError, XPathNotFound):
            return NotAvailable

        if electric_consumption is None:
            return NotAvailable

        # Inclusive upper bound of each class; above the last it is "G".
        DPE = "G"
        for ceiling, letter in (
            (50, "A"), (90, "B"), (150, "C"),
            (230, "D"), (330, "E"), (450, "F"),
        ):
            if electric_consumption <= ceiling:
                DPE = letter
                break
        return getattr(ENERGY_CLASS, DPE, NotAvailable)

    def obj_details(self):
        """Gather the extra details displayed on the advert page.

        The returned dict holds the availability date under ``"dispo"``,
        the optional ``"priceMentions"`` and ``"agency"`` texts, one nested
        dict per titled details column and, when a DPE image URL can be
        matched, an ``"electric_consumption"`` string.
        """
        details = {}

        # NOTE(review): the captured date looks day-first (dd/mm/yyyy) but
        # no ``dayfirst`` flag is given to Date -- confirm this is intended.
        dispo = Date(
            Regexp(
                CleanText('//p[has-class("OfferTop-dispo")]'),
                r'.* (\d\d\/\d\d\/\d\d\d\d)',
                default=datetime.date.today().isoformat()
            )
        )(self)
        if dispo is not None:
            details["dispo"] = dispo

        priceMentions = CleanText('//p[has-class("OfferTop-mentions")]',
                                  default=None)(self)
        if priceMentions is not None:
            details["priceMentions"] = priceMentions

        agency = CleanText('//p[has-class("OfferContact-address")]',
                           default=None)(self)
        if agency is not None:
            details["agency"] = agency

        # One nested dict per column that has a heading.
        for item in self.xpath(
                '//div[has-class("OfferDetails-columnize")]/div'):
            category = CleanText(
                './h3[has-class("OfferDetails-title--2")]',
                default=None)(item)
            if not category:
                continue

            details[category] = {}

            # "label: value" rows.
            for detail_item in item.xpath(
                    './/ul[has-class("List--data")]/li'):
                detail_title = CleanText(
                    './/span[has-class("List-data")]')(detail_item)
                detail_value = CleanText('.//*[has-class("List-value")]')(
                    detail_item)
                details[category][detail_title] = detail_value

            # Bullet rows are plain flags.
            for detail_item in item.xpath(
                    './/ul[has-class("List--bullet")]/li'):
                detail_title = CleanText('.')(detail_item)
                details[category][detail_title] = True

        try:
            electric_consumption = CleanDecimal(
                Regexp(
                    Attr('//div[has-class("OfferDetails-content")]//img',
                         'src'),
                    r'https://dpe.foncia.net\/(\d+)\/.*'
                )
            )(self)
            details["electric_consumption"] = (
                '{} kWhEP/m².an'.format(electric_consumption))
        except (RegexpError, XPathNotFound):
            # No matching DPE image: the entry is simply left out.
            pass

        return details