class item(ItemElement):
    """Build a ``Housing`` from one classified entry of a results list.

    Every XPath below is relative to the entry's root element.
    """

    klass = Housing

    obj_id = CleanText('./@data-classified-id')
    obj_title = CleanText('./div/h2[@itemprop="name"]/a')
    # Attribute tests need the ``@`` prefix; a bare ``class`` would test a
    # child *element* named "class" and never match the span.
    obj_location = CleanText(
        './div/h2[@itemprop="name"]/span[@class="item-localisation"]')
    obj_cost = CleanDecimal('./div/div/span[@class="price-label"]')
    # Currency symbol is taken from the price label, falling back to euro.
    obj_currency = Regexp(
        CleanText('./div/div/span[@class="price-label"]'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
    obj_text = CleanText('./div/div/div[@itemprop="description"]')
    # The area is the number that precedes " m2" in the title.
    obj_area = CleanDecimal(
        Regexp(CleanText('./div/h2[@itemprop="name"]/a'),
               r'(.*?)(\d*) m2(.*?)', '\\2', default=None),
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()

    def obj_phone(self):
        """Return the phone text, or ``NotLoaded`` when it contains '...'."""
        phone = CleanText(
            './div/div/ul/li/span[@class="js-clickphone"]',
            replace=[(u'Téléphoner : ', u'')],
            default=NotAvailable)(self)
        if '...' in phone:
            return NotLoaded
        return phone

    def obj_photos(self):
        """Return a one-element list holding the entry's thumbnail photo."""
        url = CleanText('./div/div/a/img[@itemprop="image"]/@src')(self)
        return [HousingPhoto(url)]
class SeLogerItem(ItemElement):
    """Map one listing node onto a ``Housing`` object."""

    klass = Housing

    obj_id = CleanText('idAnnonce')
    # Title layout: "<titre> <surface><surfaceUnite> - <ville>".
    obj_title = Format(
        "%s %s%s - %s",
        CleanText('titre'),
        CleanText('surface'),
        CleanText('surfaceUnite'),
        CleanText('ville'),
    )
    obj_date = DateTime(CleanText('dtFraicheur'))
    obj_cost = CleanDecimal('prix')
    obj_currency = Regexp(
        CleanText('prixUnite'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€',
    )
    obj_area = CleanDecimal('surface', default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_text = CleanText('descriptif')
    obj_rooms = CleanDecimal('nbPiece|nbPieces', default=NotAvailable)
    obj_bedrooms = CleanDecimal('nbChambre|nbChambres', default=NotAvailable)

    def obj_location(self):
        """Return "<address or district> <city> (<postcode>)"."""
        address = CleanText('adresse', default="")(self)
        district = CleanText('quartier', default=None)(self)
        # Fall back on the district only when no address was given.
        if district is not None and not address:
            address = district
        city = CleanText('ville')(self)
        postcode = CleanText('cp')(self)
        return u'%s %s (%s)' % (address, city, postcode)

    obj_station = CleanText('proximite', default=NotAvailable)
    obj_url = CleanText('permaLien')
class get_housing(ItemElement):
    """Fill a ``Housing`` from an HTML detail page."""

    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText('h1')
    obj_rooms = CleanDecimal(
        '//div[@class="stats"]/section/div[@id="divpieces"]/span[@class="stat"]'
    )
    obj_cost = CleanDecimal('(//div[@class="stats"]/div/h2)[2]')
    obj_currency = u'€'
    obj_utilities = UTILITIES.UNKNOWN
    obj_text = CleanHTML('//div[@class="textes"]')
    obj_location = CleanText('//input[@id="adressegeo"]/@value')
    obj_url = CleanText('//input[@id="hfurldetail"]/@value')
    # Raw string: "\s" and "\d" are regex escapes, not string escapes.
    obj_area = CleanDecimal(
        Regexp(
            CleanText(
                '//div[@class="stats"]/section/div[@id="divsurface"]/span[@class="stat"]'
            ),
            r'\s?(\d+)\sm\s2',
            default=NotAvailable),
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_phone = CleanText('//input[@id="hftelA"]/@value')
    obj_date = datetime.now

    def obj_photos(self):
        """Return the photos listed in the ``plistimage`` block.

        Each ``urlbig`` attribute is appended to the site root. Text
        formatting replaces the ``unicode()`` builtin, which only exists on
        Python 2.
        """
        return [
            HousingPhoto(u'http://www.entreparticuliers.com/%s' % photo)
            for photo in self.xpath('//div[@id="plistimage"]/a/@urlbig')
        ]
class get_housing(ItemElement):
    """Fill a ``Housing`` from an HTML detail page using itemprop markup."""

    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText('//h1[@itemprop="name"]')
    obj_location = CleanText('//span[@class="informations-localisation"]')
    obj_cost = CleanDecimal('//span[@itemprop="price"]')
    obj_currency = Regexp(CleanText('//span[@itemprop="price"]'),
                          '.*([%s%s%s])' % (u'€', u'$', u'£'),
                          default=u'€')
    obj_text = CleanHTML('//div[@itemprop="description"]')
    obj_url = BrowserURL('housing', _id=Env('_id'))
    # The inner Regexp gets its own default so a title without " m2" ends up
    # as NotAvailable instead of raising before CleanDecimal can apply its
    # default.
    obj_area = CleanDecimal(
        Regexp(CleanText('//h1[@itemprop="name"]'),
               r'(.*?)(\d*) m2(.*?)', '\\2', default=None),
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()

    def obj_photos(self):
        """Return the page thumbnails as ``HousingPhoto`` objects.

        When a thumbnail src wraps the real picture URL behind the
        thbr.figarocms.net proxy, the wrapped URL is used; otherwise the
        raw src is kept rather than failing the whole item.
        """
        photos = []
        for img in XPath('//a[@class="thumbnail-link"]/img[@itemprop="image"]')(self):
            src = CleanText('./@src')(img)
            wrapped = Regexp(CleanText('./@src'),
                             r'http://thbr\.figarocms\.net.*(http://.*)',
                             default=None)(img)
            url = wrapped or src
            if url:
                photos.append(HousingPhoto(url))
        return photos

    def obj_details(self):
        """Return the feature list plus the energy line as a dict."""
        details = dict()
        for item in XPath('//div[@class="features clearfix"]/ul/li')(self):
            key = CleanText('./span[@class="name"]')(item)
            value = CleanText('./span[@class="value"]')(item)
            if value and key:
                details[key] = value

        # Energy consumption lives in its own block outside the feature list.
        key = CleanText('//div[@class="title-dpe clearfix"]')(self)
        value = CleanText('//div[@class="energy-consumption"]')(self)
        if value and key:
            details[key] = value
        return details
class get_housing(ItemElement):
    """Fill a ``Housing`` from an HTML property detail page."""

    klass = Housing

    # The reference is whatever follows the colon in the reference paragraph.
    obj_id = Regexp(CleanText('//p[has-class("property-reference")]'), r'\:(.*)$')

    def obj_url(self):
        """Return the URL of the page being parsed."""
        return self.page.url

    obj_area = CleanDecimal(
        Regexp(
            CleanText('//table[@id="table"]//span[contains(text(), "Surface")]//following-sibling::span[has-class("r")]'),
            r'([\d\ ]+)m'
        ),
        default=NotAvailable
    )
    obj_title = CleanText('//span[has-class("mainh1")]')
    obj_cost = CleanDecimal('//span[has-class("price-info")]')
    obj_currency = Currency.get_currency(u'€')
    obj_rooms = CleanDecimal('//table[@id="table"]//span[contains(text(), "Pièce")]//following-sibling::span[has-class("r")]')
    obj_bedrooms = CleanDecimal('//table[@id="table"]//span[contains(text(), "Chambre")]//following-sibling::span[has-class("r")]')
    # Location is the part of the main heading after the comma.
    obj_location = CleanText(Regexp(CleanText('//span[has-class("mainh1")]'), r',(.+)$'))
    obj_text = CleanText('//div[has-class("property-description-main")]')
    # The character class now accepts "/" (the previous "\\" only allowed
    # backslashes, so at most the leading digits were captured), and the
    # date is parsed day-first.
    # NOTE(review): the fallback below is computed once, at class-definition
    # time, and is a datetime rather than text -- confirm Date accepts it.
    obj_date = Date(
        Regexp(
            CleanText('//div[has-class("property-description-main")]'),
            r'Mise à jour le ([\d/]+)',
            default=datetime.today()
        ),
        dayfirst=True
    )
    obj_phone = Attr('//button[@id="display-phonenumber-1"]', 'data-phone-number')

    def obj_photos(self):
        """Return slider photos whose src does not start with '/'."""
        photos = []
        for photo in self.xpath('//div[@id="bxSliderContainer"]//ul//li//img'):
            url = Attr('.', 'src')(photo)
            # An empty src is skipped instead of raising IndexError.
            if url and not url.startswith('/'):
                photos.append(HousingPhoto(url))
        return photos

    def obj_details(self):
        """Return the GES and DPE symbols found on the page."""
        return {
            'GES': CleanText('//span[@id="gassymbol"]', '')(self),
            'DPE': CleanText('//span[@id="energysymbol"]', '')(self),
        }

    def obj_utilities(self):
        """Derive utilities from the 'CC' / 'HC' marker in the price text."""
        price = CleanText('//span[has-class("price-info")]')(self)
        if 'CC' in price:
            return UTILITIES.INCLUDED
        elif 'HC' in price:
            return UTILITIES.EXCLUDED
        else:
            return UTILITIES.UNKNOWN

    obj_station = NotAvailable
    obj_price_per_meter = PricePerMeterFilter()
class get_housing(ItemElement):
    """Fill a ``Housing`` from an XML detail document."""

    klass = Housing

    obj_title = CleanText('//Titre')

    def obj_cost(self):
        """Return the amount that precedes the euro sign in ``//Prix``.

        A lenient parse is tried first; when it yields nothing (or zero) the
        strict parse runs and raises if the price cannot be read.
        """
        cost = CleanDecimal(Regexp(CleanText('//Prix'), u'(.*)€.*',
                                   default=None),
                            default=None)(self)
        if cost:
            return cost
        return CleanDecimal(Regexp(CleanText('//Prix'), u'(.*)€'))(self)

    obj_currency = u'€'
    obj_text = CleanText('//Description')
    obj_location = CleanHTML(CleanText('//Localisation'))
    obj_area = CleanDecimal('//SurfaceBien', replace_dots=True, default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_phone = CleanText('//Telephone')
    obj_date = datetime.now

    def obj_details(self):
        """Return type, reference and room count, plus optional extras."""
        details = {}
        details[u'Type de bien'] = CleanText('//Tbien')(self)
        details[u'Reference'] = CleanText('(//Reference)[1]')(self)
        details[u'Nb pièces'] = CleanText('//Nbpieces')(self)

        # Energy and coordinates are only recorded when present.
        _ener = CleanText('//Energie')(self)
        if _ener:
            details[u'Energie'] = _ener

        _lat = CleanText('//Latitude')(self)
        if _lat:
            details[u'Latitude'] = _lat

        _long = CleanText('//Longitude')(self)
        if _long:
            details[u'Longitude'] = _long

        return details

    def obj_photos(self):
        """Return the photos named ``LienImage1`` .. ``LienImage<NbPhotos>``.

        The photo count is converted to ``int`` because ``range()`` rejects
        ``Decimal`` values; a missing count yields an empty list.
        """
        count = int(CleanDecimal('//NbPhotos', default=0)(self))
        photos = []
        for i in range(1, count + 1):
            # Swap the thumbnail size parameters for the large variant.
            img = CleanText('//LienImage%s' % i,
                            replace=[(u'w=69&h=52', u'w=786&h=481')])(self)
            if img.startswith('http'):
                url = img
            else:
                url = u'http://www.entreparticuliers.com%s' % img
            photos.append(HousingPhoto(url))
        return photos
class get_housing(ItemElement):
    """Fill a ``Housing`` from an HTML advert page."""

    klass = Housing

    def parse(self, el):
        """Collect the property lines into ``env['area']`` / ``env['details']``.

        The line whose label contains 'Surface' feeds the area; every other
        line becomes a details entry keyed by its label.
        """
        details = dict()
        self.env['area'] = NotAvailable
        for item in el.xpath('//div[@class="line"]/h2'):
            # Read the label once; it drives every branch below.
            label = CleanText('./span[@class="property"]')(item)
            if 'Surface' in label:
                self.env['area'] = CleanDecimal(
                    Regexp(CleanText('./span[@class="value"]'), '(.*)m.*'),
                    replace_dots=(',', '.'))(item)
                continue

            key = u'%s' % label
            if 'GES' in key or 'Classe' in key:
                # Energy labels keep their value inside a <noscript> link.
                details[key] = CleanText('./span[@class="value"]/noscript/a')(item)
            else:
                details[key] = CleanText('./span[@class="value"]')(item)

        self.env['details'] = details

    obj_id = Env('_id')
    obj_title = CleanText('//title')
    obj_cost = CleanDecimal('//h2[@itemprop="price"]/@content', default=Decimal(0))
    obj_currency = Regexp(CleanText('//h2[@itemprop="price"]/span[@class="value"]'),
                          '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
    obj_text = CleanText('//p[@itemprop="description"]')
    obj_location = CleanText('//span[@itemprop="address"]')
    obj_details = Env('details')
    obj_area = Env('area')
    obj_price_per_meter = PricePerMeterFilter()
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_date(self):
        """Parse the 'Mise en ligne le ...' line into a datetime."""
        _date = Regexp(CleanText('//p[has-class("line")]', replace=[(u'à', '')]),
                       '.*Mise en ligne le (.*)')(self)
        # Translate French date words before handing the text to DateTime.
        for fr, en in DATE_TRANSLATE_FR:
            _date = fr.sub(en, _date)

        self.env['tmp'] = _date
        return DateTime(Env('tmp'), LinearDateGuesser())(self)

    def obj_photos(self):
        """Return photos listed in the page script, else the meta image.

        The index pattern accepts several digits so that entries from
        ``images[10]`` onwards are no longer dropped.
        """
        items = re.findall(r'images\[\d+\]\s*=\s*"([\w/\.]*\.jpg)";',
                           CleanText('//script')(self))
        photos = [HousingPhoto(u'http:%s' % item) for item in items]
        if not photos:
            img = CleanText('//meta[@itemprop="image"]/@content', default=None)(self)
            if img:
                photos.append(HousingPhoto(img))
        return photos
class get_housing(ItemElement):
    """Fill a ``Housing`` from an XML advert document."""

    klass = Housing

    obj_id = Env('_id')
    obj_type = EPAdvertType(CleanText('//rubrique'))
    obj_advert_type = ADVERT_TYPES.PERSONAL
    obj_house_type = EPHouseType(CleanText('//tbien'))
    obj_title = CleanText('//titre')
    obj_rooms = CleanDecimal('//pieces')
    obj_cost = CleanDecimal('//prix')
    obj_currency = Currency.get_currency(u'€')
    obj_utilities = UTILITIES.UNKNOWN
    obj_text = CleanText('//titre')
    obj_location = CleanText('//ville')
    obj_url = CleanText('//urlDetailAnnonce')
    obj_area = CleanDecimal('//surface')
    obj_price_per_meter = PricePerMeterFilter()
    obj_phone = CleanText('//telephone1')
    obj_date = DateTime(CleanText('//DateCheck'))

    def obj_GES(self):
        """Look up the ``//GSE`` text, upper-cased, on ``ENERGY_CLASS``."""
        label = CleanText('//GSE')(self).upper()
        return getattr(ENERGY_CLASS, label, NotAvailable)

    def obj_photos(self):
        """Return a photo for each non-empty ``UrlImage1..3`` node."""
        urls = [
            CleanText(xpath)(self)
            for xpath in ('//UrlImage1', '//UrlImage2', '//UrlImage3')
        ]
        return [HousingPhoto(url) for url in urls if url]

    def obj_DPE(self):
        """Look up the ``//DPE`` text, upper-cased, on ``ENERGY_CLASS``."""
        label = CleanText('//DPE')(self).upper()
        return getattr(ENERGY_CLASS, label, NotAvailable)

    def obj_details(self):
        """Return the optional characteristics that have a non-empty value."""
        # (XPath of the node, label used as the details key)
        fields = (
            ('//Nb_Etage', "Nombre d'etages"),
            ('//Neuf', 'Neuf'),
            ('//Ancien_avec_du_Charme', 'Ancien avec charme'),
            ('//Avec_terasse', 'Avec Terrasse'),
            ('//latitude', 'Latitude'),
            ('//longitude', 'Longitude'),
            ('//loyer', 'Loyer'),
            ('//piscine', 'Piscine'),
            ('//surface_balcon', 'Surface du balcon'),
            ('//surface_exp', 'Surface exploitable'),
            ('//surface_terrain', 'Surface du Terrain'),
            ('//Meuble', 'furnished'),
        )
        details = dict()
        for xpath, label in fields:
            text = CleanText(xpath)(self)
            if text:
                details[label] = text
        return details
class item(ItemElement):
    """Build a ``Housing`` from one result box of a search page.

    Every XPath is relative to the box element.
    """

    klass = Housing

    def condition(self):
        """Accept the box only when its title links to an '/annonces/' URL."""
        advert_id = Regexp(
            Link('./div[has-class("box-header")]/a[@class="title-item"]'),
            '/annonces/(.*)',
            default=None,
        )
        return advert_id(self)

    obj_id = Regexp(Link('./div[has-class("box-header")]/a[@class="title-item"]'),
                    '/annonces/(.*)')
    obj_title = CleanText('./div[has-class("box-header")]/a[@class="title-item"]')
    obj_area = CleanDecimal(
        Regexp(CleanText('./div[has-class("box-header")]/a/span[@class="h1"]'),
               '(.*?)(\d*) m\xb2(.*?)', '\\2'),
        default=NotAvailable)
    obj_cost = CleanDecimal(
        CleanText('./div[has-class("box-header")]/a/span[@class="price"]'),
        replace_dots=True, default=Decimal(0))
    obj_currency = Regexp(
        CleanText('./div[has-class("box-header")]/a/span[@class="price"]'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
    obj_utilities = UTILITIES.UNKNOWN

    def obj_date(self):
        """Parse the text after ' / ' in the header date paragraph."""
        raw_date = Regexp(
            CleanText('./div[has-class("box-header")]/p[@class="date"]'),
            '.* / (.*)',
        )(self)
        return parse_french_date(raw_date)

    obj_station = CleanText('./div[@class="box-body"]/div/div/p[@class="item-transports"]',
                            default=NotAvailable)
    obj_location = CleanText('./div[@class="box-body"]/div/div/p[@class="item-description"]/strong')
    obj_text = CleanText('./div[@class="box-body"]/div/div/p[@class="item-description"]')
    obj_rooms = CleanDecimal(
        './div[@class="box-body"]/div/div/div[@class="clearfix"]/ul[has-class("item-summary")]/li[1]/strong',
        default=NotAvailable
    )
    obj_price_per_meter = PricePerMeterFilter()

    def obj_bedrooms(self):
        """Return the second summary figure when more than two are listed."""
        summary_items = './div[@class="box-body"]/div/div/div[@class="clearfix"]/ul[has-class("item-summary")]/li'
        # With two entries or fewer there is no separate bedrooms figure.
        if len(XPath(summary_items)(self)) <= 2:
            return NotAvailable
        return CleanDecimal(summary_items + '[2]/strong',
                            default=NotAvailable)(self)

    obj_url = Format(
        u'http://www.pap.fr%s',
        Link(
            './div[@class="box-body"]/div/div/div[@class="clearfix"]/div[@class="float-right"]/a'
        )
    )

    def obj_photos(self):
        """Return one ``HousingPhoto`` per image src found in the box body."""
        sources = XPath('./div[@class="box-body"]/div/div/a/img/@src')(self)
        return [HousingPhoto(u'%s' % src) for src in sources]
class SeLogerItem(ItemElement):
    """Map one listing node onto a ``Housing`` object."""

    klass = Housing

    obj_id = CleanText('idAnnonce')

    def obj_type(self):
        """Translate ``idTypeTransaction`` into a ``POSTS_TYPES`` value.

        Returns ``NotAvailable`` for an unknown id, mirroring
        ``obj_house_type``, instead of letting ``StopIteration`` escape.
        """
        type_id = int(CleanText('idTypeTransaction')(self))
        try:
            post_type = next(k for k, v in TYPES.items() if v == type_id)
        except StopIteration:
            return NotAvailable
        if post_type == POSTS_TYPES.FURNISHED_RENT:
            # SeLoger does not let us discriminate between furnished and not
            # furnished.
            return POSTS_TYPES.RENT
        return post_type

    def obj_house_type(self):
        """Translate ``idTypeBien`` via ``RET``; ``NotAvailable`` if unknown."""
        house_id = CleanText('idTypeBien')(self)
        try:
            return next(k for k, v in RET.items() if v == house_id)
        except StopIteration:
            return NotAvailable

    # Title layout: "<titre> <surface><surfaceUnite> - <ville>".
    obj_title = Format(
        "%s %s%s - %s",
        CleanText('titre'),
        CleanText('surface'),
        CleanText('surfaceUnite'),
        CleanText('ville'),
    )
    obj_date = DateTime(CleanText('dtFraicheur'))
    obj_cost = CleanDecimal('prix')
    obj_currency = Currency('prixUnite')
    obj_area = CleanDecimal('surface', default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_text = CleanText('descriptif')
    obj_rooms = CleanDecimal('nbPiece|nbPieces', default=NotAvailable)
    obj_bedrooms = CleanDecimal('nbChambre|nbChambres', default=NotAvailable)

    def obj_location(self):
        """Return "<address or district> <city> (<postcode>)"."""
        location = CleanText('adresse', default="")(self)
        quartier = CleanText('quartier', default=None)(self)
        # Fall back on the district only when no address was given.
        if not location and quartier is not None:
            location = quartier
        ville = CleanText('ville')(self)
        cp = CleanText('cp')(self)
        return u'%s %s (%s)' % (location, ville, cp)

    obj_station = CleanText('proximite', default=NotAvailable)
    obj_url = CleanText('permaLien')
class SeLogerItem(ItemElement):
    """Map one listing node onto a ``Housing`` object.

    Each attribute reads the child node named in its selector.
    """

    klass = Housing

    # Identification and headline.
    obj_id = CleanText('idAnnonce')
    obj_title = CleanText('titre')
    obj_date = DateTime(CleanText('dtFraicheur'))

    # Pricing; the currency is the raw text of the price-unit node.
    obj_cost = CleanDecimal('prix')
    obj_currency = CleanText('prixUnite')
    obj_area = CleanDecimal('surface', default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()

    # Description and situation.
    obj_text = CleanText('descriptif')
    obj_location = CleanText('ville')
    obj_station = CleanText('proximite', default=NotAvailable)
    obj_url = CleanText('permaLien')
class get_housing(ItemElement):
    """Fill a ``Housing`` from an HTML advert detail page."""

    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText('//h1[@class="clearfix"]/span[@class="title"]')
    obj_cost = CleanDecimal('//h1[@class="clearfix"]/span[@class="price"]',
                            replace_dots=True)
    obj_currency = Regexp(
        CleanText('//h1[@class="clearfix"]/span[@class="price"]'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€')
    obj_area = CleanDecimal(
        Regexp(CleanText('//h1[@class="clearfix"]/span[@class="title"]'),
               '(.*?)(\d*) m\xb2(.*?)', '\\2'),
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_location = CleanText('//div[@class="item-geoloc"]/h2')
    obj_text = CleanText(CleanHTML('//p[@class="item-description"]'))
    obj_station = CleanText('//div[@class="metro"]')
    obj_phone = CleanHTML('(//div[has-class("tel-wrapper")])[1]')
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_details(self):
        """Return the summary list plus the energy box as a dict."""
        details = dict()

        # Summary entries: the <li> own text is the key, <strong> the value.
        for entry in XPath('//ul[@class="item-summary"]/li')(self):
            label = CleanText('.', children=False)(entry)
            figure = CleanText('./strong')(entry)
            if label and figure:
                details[label] = figure

        # Energy box: heading as key, "<text>(<rank>)" as value.
        energy = '//div[@class="box energy-box"]/div/div'
        heading = CleanText(energy + '/p[@class="h3"]')(self)
        rating = Format(
            '%s(%s)',
            CleanText('(' + energy + '/p)[2]'),
            CleanText(energy + '/@class',
                      replace=[('-', ''), ('rank', '')]),
        )(self)
        if heading and rating:
            details[heading] = rating

        return details

    def obj_photos(self):
        """Return one ``HousingPhoto`` per showcase thumbnail src."""
        sources = XPath('//div[has-class("showcase-thumbnail")]/img/@src')(self)
        return [HousingPhoto(u'%s' % src) for src in sources]
class get_housing(ItemElement):
    """Fill a ``Housing`` from a JSON advert document."""

    klass = Housing

    obj_id = Env('_id')
    obj_title = Dict('characteristics/titleWithTransaction')
    obj_location = Format('%s %s %s',
                          Dict('location/address'),
                          Dict('location/postalCode'),
                          Dict('location/cityLabel'))
    obj_cost = TypeDecimal(Dict('characteristics/price'))
    obj_currency = u'€'
    obj_text = CleanHTML(Dict('characteristics/description'))
    obj_url = BrowserURL('housing_html', _id=Env('_id'))
    obj_area = TypeDecimal(Dict('characteristics/area'))
    obj_date = FromTimestamp(Dict('characteristics/date'))
    obj_price_per_meter = PricePerMeterFilter()

    def obj_photos(self):
        """Return the 'xl' image of every entry in ``characteristics/images``.

        A URL wrapped behind the thbr.figarocms.net proxy is unwrapped;
        entries without an 'xl' value are skipped rather than crashing the
        regex search.
        """
        photos = []
        for img in Dict('characteristics/images')(self):
            url = img.get('xl')
            if not url:
                continue
            m = re.search(r'http://thbr\.figarocms\.net.*(http://.*)', url)
            photos.append(HousingPhoto(m.group(1) if m else url))
        return photos

    def obj_details(self):
        """Return fees, bedrooms, energy, rooms and availability."""
        details = {}
        details['fees'] = Dict('characteristics/fees')(self)
        details['bedrooms'] = Dict('characteristics/bedroomCount')(self)
        details['energy'] = Dict(
            'characteristics/energyConsumptionCategory')(self)
        rooms = Dict('characteristics/roomCount')(self)
        # roomCount is indexed like a sequence; only its first entry is kept.
        # Truthiness also covers a null value, where len() would raise.
        if rooms:
            details['rooms'] = rooms[0]
        details['available'] = Dict('characteristics/available',
                                    default=NotAvailable)(self)
        return details
class item(ItemElement):
    """Build a ``Housing`` from one JSON search-result entry."""

    klass = Housing

    # The id combines the category and the advert number: "<rubrique>#<id>".
    obj_id = Format("%s#%s", Dict('rubrique'), Dict('idannonce'))
    obj_type = EPAdvertType(Dict('rubrique'))
    obj_advert_type = ADVERT_TYPES.PERSONAL
    obj_house_type = EPHouseType(Dict('tbien'))
    obj_title = Dict('titre')
    obj_cost = CleanDecimal(Dict('prix'))
    obj_currency = Currency.get_currency(u'€')
    obj_text = Dict('titre')
    obj_location = Dict('ville')
    obj_area = CleanDecimal(Dict('surface'))
    obj_rooms = CleanDecimal(Dict('pieces'))
    obj_date = DateTime(Dict('creationdate'))
    obj_utilities = UTILITIES.UNKNOWN
    obj_price_per_meter = PricePerMeterFilter()

    def obj_photos(self):
        """Return the ``UrlImage`` photo as a list, empty when there is none."""
        url = Dict('UrlImage', default=NotAvailable)(self)
        if empty(url):
            return []
        return [HousingPhoto(url)]
class get_housing(ItemElement):
    """Fill a ``Housing`` from an HTML advert detail page."""

    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText(
        '//div[has-class("box-header")]/h1[@class="clearfix"]'
    )
    obj_cost = CleanDecimal('//h1[@class="clearfix"]/span[@class="price"]',
                            replace_dots=True)
    obj_currency = Regexp(CleanText('//h1[@class="clearfix"]/span[@class="price"]'),
                          '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
    obj_utilities = UTILITIES.UNKNOWN
    obj_area = CleanDecimal(
        Regexp(CleanText('//h1[@class="clearfix"]/span[@class="title"]'),
               '(.*?)(\\d*) m\xb2(.*?)', '\\2'),
        default=NotAvailable)

    def obj_date(self):
        """Parse the text after the last '/' of the header date paragraph."""
        date = CleanText(
            '//div[has-class("box-header")]//p[has-class("date")]'
        )(self).split("/")[-1].strip()
        return parse_french_date(date)

    def obj_bedrooms(self):
        """Return the second summary figure when more than two are listed."""
        rooms_bedrooms_area = XPath(
            '//div[has-class("box-body")]//ul[has-class("item-summary")]/li'
        )(self)
        if len(rooms_bedrooms_area) > 2:
            return CleanDecimal(
                '//div[has-class("box-body")]//ul[has-class("item-summary")]/li[2]/strong',
                default=NotAvailable
            )(self)
        return NotAvailable

    obj_rooms = CleanText('//ul[has-class("item-summary")]/li[1]/strong',
                          default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_location = CleanText('//div[@class="item-geoloc"]/h2')
    obj_text = CleanText(CleanHTML('//p[@class="item-description"]'))

    def obj_station(self):
        """Return the station labels joined with ', '.

        Labels without direct text are skipped, since ``str.join`` cannot
        take ``None``.
        """
        labels = XPath(
            '//ul[has-class("item-metro")]//span[has-class("label")]'
        )(self)
        return ", ".join([label.text for label in labels if label.text])

    def obj_phone(self):
        """Return the first phone wrapper text, spaces turned into ', '."""
        phone = CleanText('(//div[has-class("tel-wrapper")])[1]')(self)
        phone = phone.replace(' ', ', ')
        return phone.strip()

    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_details(self):
        """Return ``{"GES": <letter>}`` read from the ``rank-<x>`` CSS class.

        ``NotAvailable`` is used when the energy box is missing or carries
        no ``rank-*`` class (previously an IndexError).
        """
        classes = Attr(
            '//div[has-class("energy-box")]//div[has-class("rank")]',
            'class',
            default=None
        )(self)
        GES = NotAvailable
        if classes:
            ranks = [x.replace("rank-", "").upper()
                     for x in classes.split() if x.startswith("rank-")]
            if ranks:
                GES = ranks[0]
        return {
            "GES": GES
        }

    def obj_photos(self):
        """Return one ``HousingPhoto`` per carousel thumbnail src."""
        return [
            HousingPhoto(u'%s' % img)
            for img in XPath('//div[has-class("owl-carousel-thumbs")]//img/@src')(self)
        ]
class item(ItemElement):
    """Build a ``Housing`` from one offer block of a results page."""

    # Common prefix of most selectors below.
    offer_details_wrapper = (
        './div/div/div[has-class("offer-details-wrapper")]'
    )
    klass = Housing

    # "<first part of env type>-<offer number>".
    obj_id = Format(
        '%s-%s',
        Regexp(Env('type'), '(.*)-.*'),
        CleanText('./@id', replace=[('header-offer-', '')])
    )
    obj_title = Attr(
        offer_details_wrapper + '/div/div/p[@class="offer-type"]/a',
        'title'
    )
    obj_url = Format(
        "http://www.logic-immo.com/%s.htm",
        CleanText(
            './@id',
            replace=[('header-offer-', 'detail-location-')]
        )
    )
    obj_area = CleanDecimal(
        (
            offer_details_wrapper +
            '/div/div/div[has-class("offer-details-second")]' +
            '/div/h3[has-class("offer-attributes")]/span' +
            '/span[has-class("offer-area-number")]'
        ),
        default=NotAvailable
    )
    obj_rooms = CleanDecimal(
        (
            offer_details_wrapper +
            '/div/div/div[has-class("offer-details-second")]' +
            '/div/h3[has-class("offer-attributes")]' +
            '/span[has-class("offer-rooms")]' +
            '/span[has-class("offer-rooms-number")]'
        ),
        default=NotAvailable
    )
    obj_cost = CleanDecimal(
        Regexp(
            CleanText(
                (
                    offer_details_wrapper +
                    '/div/div/p[@class="offer-price"]/span'
                ),
                default=NotAvailable
            ),
            '(.*) [%s%s%s]' % (u'€', u'$', u'£'),
            default=NotAvailable
        ),
        default=NotAvailable
    )
    # Declared after obj_area and obj_cost, as the sibling scrapers do.
    # NOTE(review): assumes filters run in declaration order, so the price
    # per meter needs both values set first -- confirm against ItemElement.
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = Regexp(
        CleanText(
            offer_details_wrapper +
            '/div/div/p[has-class("offer-price")]/span',
            default=NotAvailable
        ),
        '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€'
    )
    obj_utilities = UTILITIES.UNKNOWN
    # Raw string: "\d" is a regex escape, not a string escape.
    obj_date = Date(
        Regexp(
            CleanText(
                './div/div/div[has-class("offer-picture-more")]/div/p[has-class("offer-update")]'
            ),
            r".*(\d{2}/\d{2}/\d{4}).*")
    )
    obj_text = CleanText(
        offer_details_wrapper +
        '/div/div/div/p[has-class("offer-description")]/span'
    )
    obj_location = CleanText(
        offer_details_wrapper + '//div[has-class("offer-places-block")]'
    )

    def obj_photos(self):
        """Return the offer picture as a list, empty when src is empty."""
        photos = []
        url = Attr(
            './div/div/div/div[has-class("picture-wrapper")]/div/img',
            'src'
        )(self)
        if url:
            photos.append(HousingPhoto(url))
        return photos

    def obj_details(self):
        """Return the agency fees under 'Honoraires' when they are listed.

        The amount is the text between the first and second colon; a fees
        line without a colon is ignored instead of raising IndexError.
        """
        details = {}
        honoraires = CleanText(
            (
                self.offer_details_wrapper +
                '/div/div/p[@class="offer-agency-fees"]'
            ),
            default=None
        )(self)
        if honoraires and ":" in honoraires:
            details["Honoraires"] = (
                "{} (TTC, en sus)".format(
                    honoraires.split(":")[1].strip()
                )
            )
        return details
class get_housing(ItemElement):
    """Fill a ``Housing`` from a JSON advert view document."""

    klass = Housing

    def parse(self, el):
        """Index the advert attributes by key into ``env['details']``."""
        self.env['details'] = {
            obj['key']: obj['value_label']
            for obj in self.el['adview']['attributes']
        }

    obj_id = Env('_id')
    obj_area = CleanDecimal(PopDetail('square', default=0), default=NotAvailable)
    obj_rooms = CleanDecimal(PopDetail('rooms', default=0), default=NotAvailable)

    def obj_GES(self):
        """Map the first character of the 'ges' detail onto ``ENERGY_CLASS``."""
        ges = CleanText(PopDetail('ges', default='|'))(self)
        # Slicing keeps an empty label from raising IndexError.
        return getattr(ENERGY_CLASS, ges[:1], NotAvailable)

    def obj_DPE(self):
        """Map the first character of 'energy_rate' onto ``ENERGY_CLASS``."""
        dpe = CleanText(PopDetail('energy_rate', default='|'))(self)
        return getattr(ENERGY_CLASS, dpe[:1], NotAvailable)

    def obj_house_type(self):
        """Translate the 'real_estate_type' detail into a ``HOUSE_TYPES`` value."""
        # The default is also given to PopDetail, like the other PopDetail
        # calls in this class, so a missing attribute falls back to OTHER.
        value = CleanText(PopDetail('real_estate_type', default=' '),
                          default=' ')(self).lower()
        known_types = {
            'parking': HOUSE_TYPES.PARKING,
            'appartement': HOUSE_TYPES.APART,
            'maison': HOUSE_TYPES.HOUSE,
            'terrain': HOUSE_TYPES.LAND,
        }
        return known_types.get(value, HOUSE_TYPES.OTHER)

    def obj_utilities(self):
        """Return INCLUDED when 'charges_included' is "Oui", else EXCLUDED."""
        value = CleanText(PopDetail('charges_included', default='Non'),
                          default=NotAvailable)(self)
        if value == "Oui":
            return UTILITIES.INCLUDED
        return UTILITIES.EXCLUDED

    obj_title = Dict('adview/subject')
    obj_cost = CleanDecimal(Dict('adview/price/0', default=NotAvailable),
                            default=Decimal(0))
    obj_currency = BaseCurrency.get_currency(u'€')
    obj_text = Dict('adview/body')
    obj_location = Dict('adview/location/city_label')

    def obj_advert_type(self):
        """Return PROFESSIONAL when the owner type is 'pro', else PERSONAL."""
        line_pro = Dict('adview/owner/type')(self)
        if line_pro == u'pro':
            return ADVERT_TYPES.PROFESSIONAL
        return ADVERT_TYPES.PERSONAL

    obj_date = DateTime(Dict('adview/first_publication_date'))

    def obj_photos(self):
        """Return one ``HousingPhoto`` per large image URL."""
        return [
            HousingPhoto(img)
            for img in Dict('adview/images/urls_large', default=[])(self)
        ]

    def obj_type(self):
        """Derive the post type from the category id.

        Category 11 is a sharing, category 10 a rental (furnished when the
        'furnished' detail reads 'meublé'); anything else is a sale.
        """
        try:
            breadcrumb = int(Dict('adview/category_id')(self))
        except (TypeError, ValueError):
            # A null or non-numeric category falls through to SALE.
            breadcrumb = None

        if breadcrumb == 11:
            return POSTS_TYPES.SHARING
        if breadcrumb == 10:
            isFurnished = CleanText(PopDetail('furnished', default=' '))(self)
            if isFurnished.lower() == u'meublé':
                return POSTS_TYPES.FURNISHED_RENT
            return POSTS_TYPES.RENT
        return POSTS_TYPES.SALE

    obj_price_per_meter = PricePerMeterFilter()
    obj_url = Dict('adview/url')
    obj_details = Env('details')
class item(ItemElement):
    """Build a ``Housing`` from one offer block of a results page."""

    # Common prefix of most selectors below.
    offer_details_wrapper = (
        './/div[has-class("offer-details-wrapper")]')
    klass = Housing

    # "<first part of env type>-<offer number>".
    obj_id = Format(
        '%s-%s',
        Regexp(Env('type'), '(.*)-.*'),
        CleanText('./@id', replace=[('header-offer-', '')]))
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Map the first word of the offer type link onto ``HOUSE_TYPES``."""
        house_type = CleanText(
            './/div[has-class("offer-details-type")]/a')(self).split(
                ' ')[0].lower()
        known_types = {
            "appartement": HOUSE_TYPES.APART,
            "maison": HOUSE_TYPES.HOUSE,
            "terrain": HOUSE_TYPES.LAND,
            "parking": HOUSE_TYPES.PARKING,
        }
        return known_types.get(house_type, HOUSE_TYPES.OTHER)

    obj_title = CleanText(
        './/div[has-class("offer-details-type")]/a/@title')
    # The link href followed by its optional data-orpi attribute. The XPath
    # is written on one line; the former backslash continuation inside the
    # literal put stray characters into the expression.
    obj_url = Format(
        u'%s%s',
        CleanText('.//div/a[@class="offer-link"]/@href'),
        CleanText('.//div/a[@class="offer-link"]/@data-orpi', default=""))
    obj_area = CleanDecimal(
        (offer_details_wrapper +
         '/div/div/div[has-class("offer-details-second")]' +
         '/div/h3[has-class("offer-attributes")]/span' +
         '/span[has-class("offer-area-number")]'),
        default=NotLoaded)
    obj_rooms = CleanDecimal(
        (offer_details_wrapper +
         '/div/div/div[has-class("offer-details-second")]' +
         '/div/h3[has-class("offer-attributes")]' +
         '/span[has-class("offer-rooms")]' +
         '/span[has-class("offer-rooms-number")]'),
        default=NotAvailable)
    obj_cost = CleanDecimal(
        Regexp(
            CleanText(
                (offer_details_wrapper + '/div/p[@class="offer-price"]/span'),
                default=NotLoaded),
            '(.*) [%s%s%s]' % (u'€', u'$', u'£'),
            default=NotLoaded),
        default=NotLoaded)
    obj_currency = Currency(offer_details_wrapper +
                            '/div/p[has-class("offer-price")]/span')
    obj_price_per_meter = PricePerMeterFilter()
    obj_utilities = UTILITIES.UNKNOWN
    obj_text = CleanText(
        offer_details_wrapper +
        '/div/div/div/p[has-class("offer-description")]/span')
    obj_location = CleanText(
        offer_details_wrapper + '/div[@class="offer-details-location"]',
        replace=[('Voir sur la carte', '')])

    def obj_photos(self):
        """Return the offer picture, upsized and made absolute."""
        photos = []
        url = Attr('.//div[has-class("offer-picture")]//img', 'src')(self)
        if url:
            url = url.replace('400x267', '800x600')
            url = urljoin(self.page.url, url)  # Ensure URL is absolute
            photos.append(HousingPhoto(url))
        return photos

    def obj_details(self):
        """Return the agency fees under 'Honoraires' when they are listed.

        The amount is the text between the first and second colon; a fees
        line without a colon is ignored instead of raising IndexError.
        """
        details = {}
        honoraires = CleanText(
            (self.offer_details_wrapper +
             '/div/div/p[@class="offer-agency-fees"]'),
            default=None)(self)
        if honoraires and ":" in honoraires:
            details["Honoraires"] = ("{} (TTC, en sus)".format(
                honoraires.split(":")[1].strip()))
        return details
class get_housing(ItemElement):
    """Fill a ``Housing`` from an HTML detail page.

    Most values come from hidden inputs of the ``central`` form; room counts
    come from the ``ava_data`` JavaScript object embedded in the page.
    """

    klass = Housing

    def parse(self, el):
        """Extract ``ava_data`` and keep its first product on the instance."""
        json_content = Regexp(CleanText('//script'),
                              "var ava_data = ({.+?});")(self)
        # The object literal uses bare keys; quote them so it parses as JSON.
        json_content = json_content.replace("logged", "\"logged\"")
        json_content = json_content.replace("lengthcarrousel",
                                            "\"lengthcarrousel\"")
        json_content = json_content.replace("products", "\"products\"")
        json_content = json_content.replace(
            "// // ANNONCES_SIMILAIRE / RECO", "")
        self.house_json_datas = json.loads(json_content)['products'][0]

    obj_id = CleanText(
        '//form[@name="central"]/input[@name="idannonce"]/@value')

    def obj_house_type(self):
        """Translate ``naturebien`` via ``RET``; ``NotLoaded`` if unknown."""
        naturebien = CleanText(
            '//form[@name="central"]/input[@name="naturebien"]/@value')(self)
        try:
            return next(k for k, v in RET.items() if v == naturebien)
        except StopIteration:
            return NotLoaded

    def obj_type(self):
        """Translate ``idtt`` into a ``POSTS_TYPES`` value.

        Returns ``NotLoaded`` for an unknown id, mirroring
        ``obj_house_type``, instead of letting ``StopIteration`` escape.
        """
        type_id = int(
            CleanText(
                '//form[@name="central"]/input[@name="idtt"]/@value')(self))
        try:
            post_type = next(k for k, v in TYPES.items() if v == type_id)
        except StopIteration:
            return NotLoaded
        if post_type == POSTS_TYPES.FURNISHED_RENT:
            # SeLoger does not let us discriminate between furnished and not
            # furnished.
            return POSTS_TYPES.RENT
        return post_type

    def obj_advert_type(self):
        """Return PROFESSIONAL when any agency field is filled in."""
        # NOTE(review): "nomagance" looks like a typo for "nomagence" --
        # verify against the page before changing it.
        is_agency = (
            CleanText(
                '//form[@name="central"]/input[@name="nomagance"]/@value'
            )(self) or
            CleanText(
                '//form[@name="central"]/input[@name="urlagence"]/@value'
            )(self) or
            CleanText(
                '//form[@name="central"]/input[@name="adresseagence"]/@value'
            )(self))
        if is_agency:
            return ADVERT_TYPES.PROFESSIONAL
        return ADVERT_TYPES.PERSONAL

    def obj_photos(self):
        """Return carousel photos: loaded images first, then lazy ones."""
        photos = []
        for photo in XPath('//div[@class="carrousel_slide"]/img/@src')(self):
            photos.append(HousingPhoto("https:{}".format(photo)))

        # Lazy slides carry a JSON blob whose 'url' key holds the picture.
        for photo in XPath('//div[@class="carrousel_slide"]/@data-lazy')(self):
            p = json.loads(photo)
            photos.append(HousingPhoto("https:{}".format(p['url'])))
        return photos

    obj_title = CleanText('//title[1]')

    def obj_location(self):
        """Return "<district> <city> (<postcode>)"."""
        quartier = Regexp(CleanText('//script'),
                          r"'nomQuartier', { value: \"([\w -]+)\", ")(self)
        ville = CleanText(
            '//form[@name="central"]/input[@name="ville"]/@value')(self)
        ville = ville if ville else ''
        cp = CleanText(
            '//form[@name="central"]/input[@name="codepostal"]/@value')(self)
        cp = cp if cp else ''
        return u'%s %s (%s)' % (quartier, ville, cp)

    def obj_address(self):
        """Return a ``PostalAddress`` built from district, postcode and city."""
        p = PostalAddress()
        p.street = Regexp(CleanText('//script'),
                          r"'nomQuartier', { value: \"([\w -]+)\", ")(self)
        p.postal_code = CleanText(
            '//form[@name="central"]/input[@name="codepostal"]/@value')(self)
        p.city = CleanText(
            '//form[@name="central"]/input[@name="ville"]/@value')(self)
        p.full_address = Field('location')(self)
        return p

    obj_text = CleanText(
        '//form[@name="central"]/input[@name="description"]/@value')
    obj_cost = CleanDecimal(CleanText('//a[@id="price"]'), default=NotLoaded)
    obj_currency = Currency(CleanText('//a[@id="price"]'), default=NotLoaded)
    obj_price_per_meter = PricePerMeterFilter()
    obj_area = CleanDecimal(
        '//form[@name="central"]/input[@name="surface"]/@value',
        replace_dots=True)
    obj_url = CleanText(
        '//form[@name="central"]/input[@name="urlannonce"]/@value')
    obj_phone = CleanText(
        '//div[@class="data-action"]/a[@data-phone]/@data-phone')

    def obj_utilities(self):
        """Return INCLUDED when the price note says charges are included."""
        mention = CleanText('//span[@class="detail_indice_prix"]',
                            default="")(self)
        if "(CC) Loyer mensuel charges comprises" in mention:
            return UTILITIES.INCLUDED
        return UTILITIES.UNKNOWN

    def obj_bedrooms(self):
        """Return ``nb_chambres`` from the embedded product data."""
        return CleanDecimal(Dict('nb_chambres',
                                 default=NotLoaded))(self.house_json_datas)

    def obj_rooms(self):
        """Return ``nb_pieces`` from the embedded product data."""
        return CleanDecimal(Dict('nb_pieces',
                                 default=NotLoaded))(self.house_json_datas)
class item(ItemElement):
    """Build a ``Housing`` from one entry of a search-result listing.

    An entry is kept only when its title link points to ``/annonces/...``
    and, for plain rent queries, when its title does not contain "meublé".
    """

    klass = Housing

    # Single selector for the title anchor. The original mixed
    # has-class("item-title") and the stricter @class="item-title" for the
    # same node, so an entry accepted by condition() could then fail in
    # obj_house_type / obj_currency / obj_url when the anchor carried an
    # additional CSS class.
    _title_link = './div/a[has-class("item-title")]'

    def condition(self):
        """Return a truthy value when the entry must be parsed."""
        title = self.obj_title(self)
        furnished_ok = True
        if self.env['query_type'] == POSTS_TYPES.RENT:
            furnished_ok = 'meublé' not in title.lower()
        return (Regexp(Link(self._title_link),
                       '/annonces/(.*)',
                       default=None)(self)
                and furnished_ok)

    def parse(self, el):
        """Fill ``env`` with rooms / bedrooms / area read from the tags."""
        tags = el.xpath(
            './div/a[has-class("item-title")]/ul[has-class("item-tags")]/li'
        )
        self.env['rooms'] = NotLoaded
        self.env['bedrooms'] = NotLoaded
        self.env['area'] = NotLoaded

        # NOTE: the loop variable is deliberately not called ``item`` so it
        # does not shadow the class name.
        for tag in tags:
            label = CleanText('.')(tag).lower()
            if 'chambre' in label:
                key = 'bedrooms'
                value = CleanDecimal('.')(tag)
            elif 'pièce' in label:
                key = 'rooms'
                value = CleanDecimal('.')(tag)
            else:
                # Any other tag is treated as the surface.
                key = 'area'
                value = CleanDecimal(
                    Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(tag)
            self.env[key] = value

    obj_id = Regexp(Link(_title_link), '/annonces/(.*)')
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PERSONAL

    def obj_house_type(self):
        """Guess the house type from the first word of the link slug."""
        link = Link(self._title_link)(self)
        slug_head = link.split('/')[-1].split('-')[0]
        if 'parking' in slug_head:
            return HOUSE_TYPES.PARKING
        if 'appartement' in slug_head:
            return HOUSE_TYPES.APART
        if 'terrain' in slug_head:
            return HOUSE_TYPES.LAND
        if 'maison' in slug_head:
            return HOUSE_TYPES.HOUSE
        return HOUSE_TYPES.OTHER

    obj_title = CleanText(_title_link)
    obj_area = Env('area')
    obj_cost = CleanDecimal(
        CleanText(_title_link + '/span[@class="item-price"]'),
        replace_dots=True,
        default=Decimal(0))
    obj_currency = Currency(_title_link + '/span[@class="item-price"]')
    obj_utilities = UTILITIES.UNKNOWN
    obj_station = CleanText('./div/p[@class="item-transports"]',
                            default=NotLoaded)

    def obj_location(self):
        """Return the first sentence of the description."""
        description = CleanText('./div/p[@class="item-description"]')(self)
        return description.split(".")[0]

    obj_text = CleanText('./div/p[@class="item-description"]',
                         replace=[(' Lire la suite', '')])
    obj_rooms = Env('rooms')
    obj_bedrooms = Env('bedrooms')
    obj_price_per_meter = PricePerMeterFilter()
    obj_url = Format(u'http://www.pap.fr%s', Link(_title_link))

    def obj_photos(self):
        """Return the entry photos, skipping placeholder thumbnails."""
        placeholders = ("visuel-nophoto.png", 'miniature-video.png')
        return [
            HousingPhoto(u'%s' % src)
            for src in XPath('./a/img/@src')(self)
            if not src.endswith(placeholders)
        ]
class get_housing(ItemElement):
    """Build a detailed ``Housing`` from a single advert page."""

    klass = Housing

    def parse(self, el):
        """Fill ``env`` with rooms / bedrooms / area read from the tags."""
        self.env['rooms'] = NotAvailable
        self.env['bedrooms'] = NotAvailable
        self.env['area'] = NotAvailable

        for tag in el.xpath('.//ul[has-class("item-tags")]/li'):
            text = CleanText('.')(tag)
            lowered = text.lower()
            if 'chambre' in lowered:
                key = 'bedrooms'
                value = CleanDecimal('./strong')(tag)
            elif 'pièce' in lowered:
                key = 'rooms'
                value = CleanDecimal('./strong')(tag)
            elif ' m²' in text and 'le m²' not in text:
                key = 'area'
                value = CleanDecimal(
                    Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(tag)
            else:
                # Unrecognised tag: previously the raw tag text was used as
                # the env key with a stale (or undefined) value.
                continue
            self.env[key] = value

    obj_id = Env('_id')

    def obj_type(self):
        """Derive the post type from the first breadcrumb link."""
        prev_link = Link('//ol[has-class("breadcrumb")]/li[1]/a')(self)
        if 'location' in prev_link:
            title = self.obj_title(self)
            if 'meublé' in title.lower():
                return POSTS_TYPES.FURNISHED_RENT
            return POSTS_TYPES.RENT
        if 'vente' in prev_link:
            return POSTS_TYPES.SALE
        if 'viager' in prev_link:
            return POSTS_TYPES.VIAGER
        return NotAvailable

    obj_advert_type = ADVERT_TYPES.PERSONAL

    def obj_house_type(self):
        """Derive the house type from the tail of the first breadcrumb link."""
        prev_link = Link('//ol[has-class("breadcrumb")]/li[1]/a')(self)
        house_type = prev_link.split('-')[-1]
        if 'parking' in house_type:
            return HOUSE_TYPES.PARKING
        if 'appartement' in house_type:
            return HOUSE_TYPES.APART
        if 'terrain' in house_type:
            return HOUSE_TYPES.LAND
        if 'maison' in house_type:
            return HOUSE_TYPES.HOUSE
        return HOUSE_TYPES.OTHER

    obj_title = CleanText('//h1[@class="item-title"]')
    obj_cost = CleanDecimal(
        '//h1[@class="item-title"]/span[@class="item-price"]',
        replace_dots=True)
    obj_currency = Currency(
        '//h1[@class="item-title"]/span[@class="item-price"]')
    obj_utilities = UTILITIES.UNKNOWN
    obj_area = Env('area')

    def obj_date(self):
        """Parse the French date found after the last "/" of the date line."""
        raw = CleanText('//p[@class="item-date"]')(self)
        return parse_french_date(raw.split("/")[-1].strip())

    obj_rooms = Env('rooms')
    obj_bedrooms = Env('bedrooms')
    obj_price_per_meter = PricePerMeterFilter()
    obj_location = CleanText('//div[has-class("item-description")]/h2')
    obj_text = CleanText(
        CleanHTML('//div[has-class("item-description")]/div/p'))

    def obj_station(self):
        """Return the transport labels joined by ", "."""
        labels = XPath(
            '//ul[has-class("item-transports")]//span[has-class("label")]'
        )(self)
        # An element without direct text has ``.text is None``, which would
        # make str.join raise; such labels are skipped.
        return ", ".join([label.text for label in labels if label.text])

    def obj_phone(self):
        """Return the first phone block, with spaces turned into ", "."""
        phone = CleanText(
            '(//div[has-class("contact-proprietaire-box")]//strong[@class="tel-wrapper"])[1]'
        )(self)
        return phone.replace(' ', ', ')

    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_DPE(self):
        """Map the ``energy-rank-X`` CSS class to an ``ENERGY_CLASS`` member."""
        css_classes = Attr(
            '//div[has-class("energy-box")]//div[has-class("energy-rank")]',
            'class',
            default="")(self)
        # next() with a default avoids the IndexError the original raised
        # when the node had no "energy-rank-<letter>" class.
        rank = next(
            (token.replace("energy-rank-", "").upper()
             for token in css_classes.split()
             if token.startswith("energy-rank-")),
            "")
        return getattr(ENERGY_CLASS, rank, NotAvailable)

    def obj_photos(self):
        """Return the thumbnails, skipping the video placeholder."""
        return [
            HousingPhoto(u'%s' % src)
            for src in XPath('//div[@class="owl-thumbs"]/a/img/@src')(self)
            if not src.endswith('miniature-video.png')
        ]
class item(ItemElement):
    """Build a ``Housing`` from one teaser of a search-result listing."""

    klass = Housing

    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('.//span[boolean(@data-reference)]', 'data-reference'))
    # Defined once; the original assigned the very same filter twice.
    obj_url = AbsoluteLink('.//h3[has-class("TeaserOffer-title")]/a')
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Find the house type whose slug appears as a path segment in the URL."""
        url = self.obj_url(self)
        for house_type, slugs in QUERY_HOUSE_TYPES.items():
            for slug in slugs:
                if ('/%s/' % slug) in url:
                    return house_type
        return NotLoaded

    obj_title = CleanText('.//h3[has-class("TeaserOffer-title")]')
    obj_area = CleanDecimal(
        Regexp(
            CleanText(
                './/div[has-class("MiniData")]//p[@data-behat="surfaceDesBiens"]'
            ),
            r'(\d*\.*\d*) .*',
            default=NotAvailable),
        default=NotAvailable)
    obj_cost = CleanDecimal(
        './/strong[has-class("TeaserOffer-price-num")]')
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = Currency(
        './/strong[has-class("TeaserOffer-price-num")]')
    obj_location = CleanText('.//p[has-class("TeaserOffer-loc")]')
    obj_text = CleanText('.//p[has-class("TeaserOffer-description")]')

    def obj_photos(self):
        """Return the teaser photo, or an empty list for the placeholder."""
        url = CleanText(
            Attr('.//a[has-class("TeaserOffer-ill")]/img', 'src'))(self)
        # If the used photo is a default no photo, the src is on the same
        # domain. An empty src is treated the same way instead of raising
        # IndexError on url[0].
        if not url or url.startswith('/'):
            return []
        return [HousingPhoto(url)]

    def obj_date(self):
        """Return today's date, computed when the item is parsed.

        As a class attribute the value was computed once, when the class
        body was executed, and then reused for every parsed item.
        """
        return datetime.date.today()

    def obj_utilities(self):
        """Tell whether the price text mentions included charges."""
        price = CleanText(
            './/strong[has-class("TeaserOffer-price-num")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        './/div[has-class("MiniData")]//p[@data-behat="nbPiecesDesBiens"]',
        default=NotLoaded)
    obj_bedrooms = CleanDecimal(
        './/div[has-class("MiniData")]//p[@data-behat="nbChambresDesBiens"]',
        default=NotLoaded)

    def obj_details(self):
        """Return the availability date and the price mentions."""
        return {
            "dispo": Date(
                Attr('.//span[boolean(@data-dispo)]',
                     'data-dispo',
                     default=datetime.date.today().isoformat()))(self),
            "priceMentions": CleanText(
                './/span[has-class("TeaserOffer-price-mentions")]')(self),
        }
class get_housing(ItemElement):
    """Build a detailed ``Housing`` from a single offer page."""

    klass = Housing

    # Upper bound (inclusive) of each energy class, in the unit of the
    # number embedded in the DPE image URL; anything above the last bound
    # is class "G".
    _DPE_BOUNDS = ((50, "A"), (90, "B"), (150, "C"),
                   (230, "D"), (330, "E"), (450, "F"))

    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('//div[boolean(@data-property-reference)]',
             'data-property-reference'))
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_type(self):
        """Map the ``type`` env value (and the URL) to a post type."""
        offer_type = Env('type')(self)
        if offer_type == 'location':
            if 'appartement-meuble' in self.page.url:
                return POSTS_TYPES.FURNISHED_RENT
            return POSTS_TYPES.RENT
        if offer_type == 'achat':
            return POSTS_TYPES.SALE
        return NotAvailable

    def obj_url(self):
        """Return the URL of the page being parsed."""
        return self.page.url

    def obj_house_type(self):
        """Find the house type whose slug appears as a path segment in the URL."""
        url = self.obj_url()
        for house_type, slugs in QUERY_HOUSE_TYPES.items():
            for slug in slugs:
                if ('/%s/' % slug) in url:
                    return house_type
        return NotAvailable

    obj_title = CleanText('//h1[has-class("OfferTop-title")]')
    obj_area = CleanDecimal(
        Regexp(
            CleanText(
                '//div[has-class("MiniData")]//p[has-class("MiniData-item")][1]'),
            r'(\d*\.*\d*) .*',
            default=NotAvailable),
        default=NotAvailable)
    obj_cost = CleanDecimal('//span[has-class("OfferTop-price")]',
                            default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = Currency('//span[has-class("OfferTop-price")]')
    obj_location = Format('%s - %s',
                          CleanText('//p[@data-behat="adresseBien"]'),
                          CleanText('//p[has-class("OfferTop-loc")]'))
    obj_text = CleanText('//div[has-class("OfferDetails-content")]/p[1]')
    obj_phone = Regexp(Link('//a[has-class("OfferContact-btn--tel")]'),
                       r'tel:(.*)')

    def obj_photos(self):
        """Return the slider photos, requesting the 800/600 rendition."""
        photos = []
        for photo in self.xpath('//div[has-class("OfferSlider")]//img'):
            photo_url = Attr('.', 'src')(photo)
            photos.append(
                HousingPhoto(photo_url.replace('640/480', '800/600')))
        return photos

    def obj_date(self):
        """Return today's date, computed when the page is parsed.

        As a class attribute the value was computed once, when the class
        body was executed, and then reused for every parsed page.
        """
        return datetime.date.today()

    def obj_utilities(self):
        """Tell whether the price text mentions included charges."""
        price = CleanText('//p[has-class("OfferTop-price")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][2]',
        default=NotAvailable)
    obj_bedrooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][3]',
        default=NotAvailable)

    def _electric_consumption(self):
        """Return the number embedded in the DPE image URL, or None."""
        try:
            return CleanDecimal(
                Regexp(
                    Attr('//div[has-class("OfferDetails-content")]//img',
                         'src'),
                    r'https://dpe.foncia.net\/(\d+)\/.*'))(self)
        except (RegexpError, XPathNotFound):
            return None

    def obj_DPE(self):
        """Convert the electric consumption into an ``ENERGY_CLASS`` member."""
        consumption = self._electric_consumption()
        if consumption is None:
            return NotAvailable
        for upper_bound, letter in self._DPE_BOUNDS:
            if consumption <= upper_bound:
                break
        else:
            letter = "G"
        return getattr(ENERGY_CLASS, letter, NotAvailable)

    def obj_details(self):
        """Collect availability, price mentions, agency and detail lists."""
        details = {}

        dispo_text = Regexp(CleanText('//p[has-class("OfferTop-dispo")]'),
                            r'.* (\d\d\/\d\d\/\d\d\d\d)',
                            default=None)
        if dispo_text(self) is None:
            details["dispo"] = datetime.date.today()
        else:
            # The captured text is dd/mm/yyyy. Without dayfirst, an
            # ambiguous date such as 03/04/2020 is read month-first.
            details["dispo"] = Date(dispo_text, dayfirst=True)(self)

        price_mentions = CleanText('//p[has-class("OfferTop-mentions")]',
                                   default=None)(self)
        if price_mentions is not None:
            details["priceMentions"] = price_mentions

        agency = CleanText('//p[has-class("OfferContact-address")]',
                           default=None)(self)
        if agency is not None:
            details["agency"] = agency

        for column in self.xpath(
                '//div[has-class("OfferDetails-columnize")]/div'):
            category = CleanText(
                './h3[has-class("OfferDetails-title--2")]',
                default=None)(column)
            if not category:
                continue
            entries = details[category] = {}
            # Key/value rows.
            for row in column.xpath('.//ul[has-class("List--data")]/li'):
                row_title = CleanText('.//span[has-class("List-data")]')(row)
                entries[row_title] = CleanText(
                    './/*[has-class("List-value")]')(row)
            # Plain bullet rows are recorded as boolean flags.
            for row in column.xpath('.//ul[has-class("List--bullet")]/li'):
                entries[CleanText('.')(row)] = True

        consumption = self._electric_consumption()
        if consumption is not None:
            details["electric_consumption"] = (
                '{} kWhEP/m².an'.format(consumption))

        return details
class item(ItemElement):
    """Build a ``Housing`` from one classified of a search-result listing."""

    klass = Housing

    # Price node: either the regular label or the PDF-style price block.
    price_selector = ('./div/div/span[@class="price-label"]'
                      '|./div/div[@class="item-price-pdf"]')

    obj_id = CleanText('./@data-classified-id')
    obj_title = CleanText('./div/h2[@itemprop="name"]/a')
    obj_location = CleanText(
        './div/h2[@itemprop="name"]/span[@class="item-localisation"]'
        '/span[@class="localisation-label"]/strong')
    obj_cost = CleanDecimal(price_selector, default=NotAvailable)
    obj_currency = Regexp(CleanText(price_selector),
                          '.*([%s%s%s])' % (u'€', u'$', u'£'),
                          default=u'€')

    def obj_utilities(self):
        """Tell whether the text after the currency sign contains "CC"."""
        after_currency = Regexp(CleanText(self.price_selector),
                                '.*[%s%s%s](.*)' % (u'€', u'$', u'£'),
                                default=u'')(self)
        if "CC" in after_currency:
            return UTILITIES.INCLUDED
        return UTILITIES.UNKNOWN

    obj_text = CleanText('./div/div/div[@itemprop="description"]')
    obj_area = CleanDecimal(
        Regexp(CleanText('./div/h2[@itemprop="name"]/a'),
               r'(.*?)([\d,\.]*) m2(.*?)',
               '\\2',
               default=None),
        replace_dots=True,
        default=NotAvailable)
    obj_url = Format(
        "http://www.explorimmo.com%s",
        Link('./div/div/ul/li/a[has-class("js-goto-classified")]'))
    obj_price_per_meter = PricePerMeterFilter()

    def obj_phone(self):
        """Return the phone number, or NotLoaded when it is truncated."""
        phone = CleanText('./div/div/ul/li[has-class("js-clickphone")]',
                          replace=[(u'Téléphoner : ', u'')],
                          default=NotAvailable)(self)
        if '...' in phone:
            return NotLoaded
        return phone

    def obj_details(self):
        """Return the fees shown next to the price, when present."""
        charges = CleanText('./div/div/span[@class="price-fees"]',
                            default=None)(self)
        if not charges:
            return NotLoaded
        parts = charges.split(":")
        # Fall back to the whole text when there is no "label: value"
        # separator instead of raising IndexError.
        fees = parts[1] if len(parts) > 1 else parts[0]
        return {"fees": fees.strip()}

    def obj_photos(self):
        """Return the classified photo, unwrapping a nested ``http://`` URL."""
        url = Attr('./div/div/a/div/img[@itemprop="image"]',
                   'src',
                   default=None)(self)
        if not url:
            return NotAvailable
        url = unquote(url)
        if "http://" in url[3:]:
            # The real URL is embedded in the src; drop the trailing query
            # string only when there is one (rfind returns -1 otherwise,
            # which used to chop the last character of the URL).
            end = url.rfind("?")
            if end == -1:
                end = None
            url = url[url.find("http://", 3):end]
        return [HousingPhoto(url)]
class get_housing(ItemElement):
    """Build a detailed ``Housing`` from a single offer page."""

    klass = Housing

    obj_id = Env('_id')

    def obj_type(self):
        """Derive the post type from the housing URL and page criteria."""
        url = BrowserURL('housing', _id=Env('_id'))(self)
        if 'colocation' in url:
            return POSTS_TYPES.SHARING
        if 'location' in url:
            furnished = False
            for li in XPath('//ul[@itemprop="description"]/li')(self):
                label = CleanText(
                    './span[has-class("criteria-label")]')(li)
                if label.lower() == "meublé":
                    value = CleanText(
                        './span[has-class("criteria-value")]')(li)
                    furnished = (value.lower() == 'oui')
            if furnished:
                return POSTS_TYPES.FURNISHED_RENT
            return POSTS_TYPES.RENT
        if 'vente' in url:
            offertype = Attr(
                '//button[has-class("offer-contact-vertical-phone")][1]',
                'data-offertransactiontype')(self)
            if offertype == '4':
                return POSTS_TYPES.VIAGER
            return POSTS_TYPES.SALE
        return NotAvailable

    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Map the offer-type label to a ``HOUSE_TYPES`` member."""
        house_type = CleanText(
            './/div[has-class("offer-type")]')(self).lower()
        if house_type == "appartement":
            return HOUSE_TYPES.APART
        if house_type == "maison":
            return HOUSE_TYPES.HOUSE
        if house_type == "terrain":
            return HOUSE_TYPES.LAND
        if house_type == "parking":
            return HOUSE_TYPES.PARKING
        return HOUSE_TYPES.OTHER

    obj_title = CleanText(CleanHTML('//meta[@itemprop="name"]/@content'))
    obj_area = CleanDecimal(
        Regexp(
            CleanText(CleanHTML('//meta[@itemprop="name"]/@content')),
            '(.*?)(\d*) m\xb2(.*?)',
            '\\2',
            default=NotAvailable),
        default=NotAvailable)
    obj_rooms = CleanDecimal(
        '//div[has-class("offer-info")]//span[has-class("offer-rooms-number")]',
        default=NotAvailable)
    obj_cost = CleanDecimal('//*[@itemprop="price"]', default=0)
    obj_currency = Currency('//*[@itemprop="price"]')

    def obj_utilities(self):
        """Tell whether the notes state that charges are included."""
        notes = CleanText('//p[@class="offer-description-notes"]')(self)
        if "Loyer mensuel charges comprises" in notes:
            return UTILITIES.INCLUDED
        return UTILITIES.UNKNOWN

    obj_price_per_meter = PricePerMeterFilter()
    obj_date = Date(
        Regexp(
            CleanText(
                '//p[@class="offer-description-notes"]|//p[has-class("darkergrey")]'
            ),
            u'.* Mis à jour : (\d{2}/\d{2}/\d{4}).*'),
        dayfirst=True)
    obj_text = CleanHTML(
        '//div[has-class("offer-description-text")]/meta[@itemprop="description"]/@content'
    )
    obj_location = CleanText('//*[@itemprop="address"]')
    obj_station = CleanText('//div[has-class("offer-description-metro")]',
                            default=NotAvailable)
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_photos(self):
        """Return the carousel photos as absolute 800x600 URLs."""
        photos = []
        for src in XPath(
                '//div[has-class("carousel-content")]//img/@src')(self):
            url = u'%s' % src.replace('75x75', '800x600')
            url = urljoin(self.page.url, url)  # Ensure URL is absolute
            photos.append(HousingPhoto(url))
        return photos

    def _energy_rank(self, summary_class, prefix):
        """Read one letter rank from the energy / greenhouse summary box."""
        text = CleanText(
            '//div[has-class("offer-energy-greenhouseeffect-summary")]'
            '//div[has-class("%s")]' % summary_class,
            default="")(self)
        # [:1] yields "" rather than raising IndexError when nothing is
        # left once the prefix has been removed.
        rank = text.replace(prefix, "").strip()[:1]
        return getattr(ENERGY_CLASS, rank, NotAvailable)

    def obj_DPE(self):
        """Return the energy class, or NotAvailable."""
        return self._energy_rank("energy-summary", "DPE")

    def obj_GES(self):
        """Return the greenhouse-gas class, or NotAvailable."""
        return self._energy_rank("greenhouse-summary", "GES")

    def obj_details(self):
        """Collect creation date, agency fees and every listed criterion."""
        details = {}
        details["creationDate"] = Date(
            Regexp(
                CleanText(
                    '//p[@class="offer-description-notes"]|//p[has-class("darkergrey")]'
                ),
                u'.*Mis en ligne : (\d{2}/\d{2}/\d{4}).*'),
            dayfirst=True)(self)

        honoraires = CleanText(
            '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]',
            default=None)(self)
        if honoraires:
            parts = honoraires.split(":")
            # Use the whole text when there is no "label: value" separator
            # instead of raising IndexError.
            amount = parts[1] if len(parts) > 1 else parts[0]
            details["Honoraires"] = (
                "{} (TTC, en sus)".format(amount.strip()))

        for li in XPath('//ul[@itemprop="description"]/li')(self):
            label = CleanText('./span[has-class("criteria-label")]')(li)
            value = CleanText('./span[has-class("criteria-value")]')(li)
            details[label] = value

        return details
class AvendreAlouerItem(ItemElement):
    """Build a ``Housing`` from a listing entry plus its details page.

    The details page is fetched asynchronously through ``load_details``;
    fields wrapped in ``Async('details')`` are evaluated against it.
    """

    klass = Housing

    _url = AbsoluteLink('.//a[has-class("linkCtnr")]')
    load_details = _url & AsyncLoad

    obj_url = _url
    obj_id = Async('details') & CleanText(
        Regexp(CleanText('//p[has-class("property-reference")]'),
               r'\:(.*)$',
               default=''))
    obj_title = CleanText('.//a//ul')
    obj_area = CleanDecimal(
        CleanText(
            './/a//ul//li[has-class("first")]//following-sibling::li[2]'),
        default=NotAvailable
    )
    obj_cost = CleanDecimal(
        CleanText('.//span[has-class("price")]')
    )
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = CleanText(
        Regexp(
            CleanText('.//span[has-class("price")]'),
            r'[\d\ ]+(.*)'
        )
    )
    obj_location = CleanText('.//span[has-class("loca")]')
    obj_text = CleanText('.//p[has-class("propShortDesc")]')
    # NOTE(review): the character class below matches digits and
    # backslashes only, so a "dd/mm/yyyy" date would be cut at the first
    # "/"; the default is also computed once when the class body runs.
    # Left untouched because the page format cannot be confirmed from here.
    obj_date = Async('details') & Date(
        Regexp(
            CleanText('//div[has-class("property-description-main")]'),
            r'Mise à jour le ([\d\\]+)',
            default=datetime.today()
        )
    )

    def obj_details(self):
        """Return the GES and DPE symbols read from the details page."""
        page_doc = Async('details').loaded_page(self).doc
        return {
            'GES': CleanText('//span[@id="gassymbol"]', '')(page_doc),
            'DPE': CleanText('//span[@id="energysymbol"]', '')(page_doc),
        }

    def obj_utilities(self):
        """Classify charges from the "CC" / "HC" marker of this entry."""
        # Relative XPath: an absolute "//" query evaluated on a listing
        # entry searches the whole document, giving every entry the same
        # answer.
        price = CleanText('.//span[has-class("price-info")]')(self)
        if 'CC' in price:
            return UTILITIES.INCLUDED
        if 'HC' in price:
            return UTILITIES.EXCLUDED
        return UTILITIES.UNKNOWN

    # No station information is extracted; the previous value was the
    # placeholder string 'Test'.
    obj_station = NotAvailable
    obj_bedrooms = Async('details') & CleanDecimal(
        CleanText(
            './/td//span[contains(text(), "Chambre")]//following-sibling::span[has-class("r")]'),
        default=NotAvailable
    )
    obj_rooms = Async('details') & CleanDecimal(
        CleanText(
            './/td//span[contains(text(), "Pièce")]//following-sibling::span[has-class("r")]'),
        default=NotAvailable
    )

    def obj_photos(self):
        """Return slider photos whose ``src`` does not start with "/"."""
        page_doc = Async('details').loaded_page(self).doc
        photos = []
        for photo in page_doc.xpath(
                '//div[@id="bxSliderContainer"]//ul//li//img'):
            url = Attr('.', 'src')(photo)
            # An empty src is skipped instead of raising IndexError.
            if url and not url.startswith('/'):
                photos.append(HousingPhoto(url))
        return photos

    def validate(self, obj):
        """Keep only objects for which an id could be extracted."""
        return obj.id != ''
class get_housing(ItemElement):
    """Build a detailed ``Housing`` from a single advert page."""

    klass = Housing

    def parse(self, el):
        """Read the property/value lines into ``env`` and a details dict."""
        details = dict()
        self.env['area'] = NotAvailable
        self.env['GES'] = NotAvailable
        self.env['DPE'] = NotAvailable
        self.env['typeBien'] = NotAvailable
        # obj_type reads this key; without a default a page lacking the
        # "Meublé" line made it raise KeyError.
        self.env['isFurnished'] = False

        house_types = {
            'parking': HOUSE_TYPES.PARKING,
            'appartement': HOUSE_TYPES.APART,
            'maison': HOUSE_TYPES.HOUSE,
            'terrain': HOUSE_TYPES.LAND,
        }

        for line in el.xpath('//div[@class="line"]/h2'):
            label = CleanText('./span[@class="property"]')(line)
            if 'Surface' in label:
                self.env['area'] = CleanDecimal(
                    Regexp(CleanText('./span[@class="value"]'), '(.*)m.*'),
                    replace_dots=(',', '.'))(line)
            elif 'Type de bien' in label:
                value = CleanText('./span[@class="value"]')(line).lower()
                self.env['typeBien'] = house_types.get(value,
                                                       HOUSE_TYPES.OTHER)
            elif 'Meublé' in label:
                value = CleanText('./span[@class="value"]')(line).lower()
                self.env['isFurnished'] = (value == 'meublé')
            elif 'GES' in label or 'Classe' in label:
                # Always store under the fixed keys read by obj_DPE /
                # obj_GES; the GES entry used to be stored under the raw
                # label text.
                key = 'DPE' if 'Classe' in label else 'GES'
                value = CleanText('./span[@class="value"]')(line).strip()
                if value:
                    self.env[key] = getattr(ENERGY_CLASS, value[0],
                                            NotAvailable)
            else:
                details[u'%s' % label] = CleanText(
                    './span[@class="value"]')(line)

        self.env['details'] = details

    obj_id = Env('_id')

    def obj_type(self):
        """Derive the post type from the last breadcrumb link."""
        breadcrumb = Link(
            '(//nav[has-class("breadcrumbsNav")]//a)[last()]')(self)
        if 'colocations' in breadcrumb:
            return POSTS_TYPES.SHARING
        if 'locations' in breadcrumb:
            if self.env['isFurnished']:
                return POSTS_TYPES.FURNISHED_RENT
            return POSTS_TYPES.RENT
        return POSTS_TYPES.SALE

    def obj_advert_type(self):
        """Return PROFESSIONAL when an ``ispro`` marker is present."""
        line_pro = XPath('.//span[has-class("ispro")]', default=None)(self)
        if line_pro:
            return ADVERT_TYPES.PROFESSIONAL
        return ADVERT_TYPES.PERSONAL

    obj_house_type = Env('typeBien')
    obj_title = CleanText('//h1[@itemprop="name"]')
    obj_cost = CleanDecimal('//h2[@itemprop="price"]/@content',
                            default=Decimal(0))
    obj_currency = Currency('//h2[@itemprop="price"]/span[@class="value"]')

    def obj_utilities(self):
        """Classify charges from the "C.C." / "H.C." marker after the price."""
        after_currency = Regexp(
            CleanText('//h2[@itemprop="price"]/span[@class="value"]'),
            '.*[%s%s%s](.*)' % (u'€', u'$', u'£'),
            default=u'')(self)
        if "C.C." in after_currency:
            return UTILITIES.INCLUDED
        if "H.C." in after_currency:
            return UTILITIES.EXCLUDED
        return UTILITIES.UNKNOWN

    obj_DPE = Env('DPE')
    obj_GES = Env('GES')
    obj_text = CleanText('//p[@itemprop="description"]')
    obj_location = CleanText('//span[@itemprop="address"]')
    obj_details = Env('details')

    def obj_rooms(self):
        """Return the "Pièces" detail as a Decimal, or NotAvailable."""
        rooms = self.env["details"].get(u"Pièces", None)
        return Decimal(rooms) if rooms else NotAvailable

    obj_area = Env('area')
    obj_price_per_meter = PricePerMeterFilter()
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_date(self):
        """Parse the publication date after translating French tokens."""
        _date = Regexp(
            CleanText('//p[has-class("line")]', replace=[(u'à', '')]),
            '.*Mise en ligne le (.*)')(self)
        for fr, en in DATE_TRANSLATE_FR:
            _date = fr.sub(en, _date)
        # DateTime takes a filter, so the text goes through env.
        self.env['tmp'] = _date
        return DateTime(Env('tmp'), LinearDateGuesser())(self)

    def obj_photos(self):
        """Return the photos listed in the page scripts, else the meta image."""
        # \d+ (instead of a single \d) also matches indexes of 10 and above.
        urls = re.findall(r'images\[\d+\]\s*=\s*"([\w:\/\.-]*\.jpg)";',
                          CleanText('//script')(self))
        # u'%s' works on both Python 2 and 3, unlike unicode().
        photos = [HousingPhoto(u'%s' % url) for url in urls]
        if not photos:
            img = CleanText('//meta[@itemprop="image"]/@content',
                            default=None)(self)
            if img:
                photos.append(HousingPhoto(img))
        return photos
class get_housing(ItemElement):
    """Build a detailed ``Housing`` from a JSON classified document."""

    klass = Housing

    def is_agency(self):
        """Return True when ``agency/isParticulier`` is the string 'false'."""
        return Dict('agency/isParticulier')(self) == 'false'

    obj_id = Env('_id')

    def obj_type(self):
        """Map transaction / furnished / estate type to a post type."""
        transaction = Dict('characteristics/transaction')(self)
        if transaction == 'location':
            if Dict('characteristics/isFurnished')(self):
                return POSTS_TYPES.FURNISHED_RENT
            return POSTS_TYPES.RENT
        if transaction == 'vente':
            estate_type = Dict('characteristics/estateType')(self).lower()
            if 'viager' in estate_type:
                return POSTS_TYPES.VIAGER
            return POSTS_TYPES.SALE
        return NotAvailable

    def obj_advert_type(self):
        """Return PROFESSIONAL for agencies, PERSONAL otherwise."""
        # is_agency must be *called*: the bound method itself is always
        # truthy, which made every advert PROFESSIONAL.
        if self.is_agency():
            return ADVERT_TYPES.PROFESSIONAL
        return ADVERT_TYPES.PERSONAL

    def obj_house_type(self):
        """Map the estate type label to a ``HOUSE_TYPES`` member."""
        estate_type = Dict('characteristics/estateType')(self).lower()
        if 'appartement' in estate_type:
            return HOUSE_TYPES.APART
        if 'maison' in estate_type:
            return HOUSE_TYPES.HOUSE
        if 'parking' in estate_type:
            return HOUSE_TYPES.PARKING
        if 'terrain' in estate_type:
            return HOUSE_TYPES.LAND
        return HOUSE_TYPES.OTHER

    obj_title = Dict('characteristics/titleWithTransaction')
    obj_location = Format('%s %s %s',
                          Dict('location/address'),
                          Dict('location/cityLabel'),
                          Dict('location/postalCode'))

    def obj_cost(self):
        """Return the price, falling back to ``priceMin`` when it is 0."""
        cost = TypeDecimal(Dict('characteristics/price'))(self)
        if cost == 0:
            cost = TypeDecimal(Dict('characteristics/priceMin'))(self)
        return cost

    obj_currency = BaseCurrency.get_currency('€')

    def obj_utilities(self):
        """Tell whether fees are flagged as included."""
        are_fees_included = Dict('characteristics/areFeesIncluded',
                                 default=None)(self)
        if are_fees_included:
            return UTILITIES.INCLUDED
        return UTILITIES.EXCLUDED

    obj_text = CleanHTML(Dict('characteristics/description'))
    obj_url = BrowserURL('housing_html', _id=Env('_id'))

    def obj_area(self):
        """Return the area, falling back to ``areaMin`` when it is 0."""
        area = TypeDecimal(Dict('characteristics/area'))(self)
        if area == 0:
            area = TypeDecimal(Dict('characteristics/areaMin'))(self)
        return area

    obj_date = FromTimestamp(Dict('characteristics/date'))
    obj_bedrooms = TypeDecimal(Dict('characteristics/bedroomCount'))

    def obj_rooms(self):
        """Return the first entry of ``roomCount``, or NotAvailable."""
        # TODO: Why is roomCount a list?
        rooms = Dict('characteristics/roomCount', default=[])(self)
        if rooms:
            return TypeDecimal(rooms[0])(self)
        return NotAvailable

    obj_price_per_meter = PricePerMeterFilter()

    def obj_photos(self):
        """Return the 'xl' photos, unwrapping URLs nested in a proxy URL."""
        photos = []
        for img in Dict('characteristics/images')(self):
            url = img.get('xl')
            if not url:
                # re.search would raise TypeError on a missing 'xl' entry.
                continue
            nested = re.search(r'http://thbr\.figarocms\.net.*(http://.*)',
                               url)
            photos.append(HousingPhoto(nested.group(1) if nested else url))
        return photos

    def obj_DPE(self):
        """Return the energy consumption class, or NotAvailable."""
        DPE = Dict('characteristics/energyConsumptionCategory',
                   default="")(self)
        return getattr(ENERGY_CLASS, DPE, NotAvailable)

    def obj_GES(self):
        """Return the greenhouse-gas emission class, or NotAvailable."""
        GES = Dict('characteristics/greenhouseGasEmissionCategory',
                   default="")(self)
        return getattr(ENERGY_CLASS, GES, NotAvailable)

    def obj_details(self):
        """Collect the optional characteristics and the agency address."""
        details = {}
        details['fees'] = Dict('characteristics/fees',
                               default=NotAvailable)(self)
        details['agencyFees'] = Dict('characteristics/agencyFees',
                                     default=NotAvailable)(self)
        details['guarantee'] = Dict('characteristics/guarantee',
                                    default=NotAvailable)(self)
        details['bathrooms'] = Dict('characteristics/bathroomCount',
                                    default=NotAvailable)(self)
        details['creationDate'] = FromTimestamp(
            Dict('characteristics/creationDate', default=NotAvailable),
            default=NotAvailable)(self)
        details['availabilityDate'] = Dict(
            'characteristics/estateAvailabilityDate',
            default=NotAvailable)(self)
        details['exposure'] = Dict('characteristics/exposure',
                                   default=NotAvailable)(self)
        details['heatingType'] = Dict('characteristics/heatingType',
                                      default=NotAvailable)(self)
        details['floor'] = Dict('characteristics/floor',
                                default=NotAvailable)(self)
        details['bedrooms'] = Dict('characteristics/bedroomCount',
                                   default=NotAvailable)(self)
        details['isFurnished'] = Dict('characteristics/isFurnished',
                                      default=NotAvailable)(self)

        rooms = Dict('characteristics/roomCount', default=[])(self)
        if rooms:
            details['rooms'] = rooms[0]

        details['available'] = Dict('characteristics/isAvailable',
                                    default=NotAvailable)(self)

        # Fall back to an empty dict: .get() was previously called on the
        # NotAvailable default when the document had no 'agency' entry.
        agency = Dict('agency', default=None)(self) or {}
        agency_parts = [
            agency.get('corporateName', ''),
            agency.get('corporateAddress', ''),
            agency.get('corporatePostalCode', ''),
            agency.get('corporateCity', ''),
        ]
        details['agency'] = ', '.join([part for part in agency_parts
                                       if part])
        return details
class get_housing(ItemElement):
    """Build a detailed ``Housing`` from a single offer page."""

    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText(CleanHTML('//meta[@itemprop="name"]/@content'))
    obj_area = CleanDecimal(
        Regexp(
            CleanText(CleanHTML('//meta[@itemprop="name"]/@content')),
            '(.*?)(\d*) m\xb2(.*?)',
            '\\2',
            default=NotAvailable),
        default=NotAvailable)
    obj_rooms = CleanDecimal(
        '//div[has-class("offer-info")]//span[has-class("offer-rooms-number")]',
        default=NotAvailable)
    obj_cost = CleanDecimal('//*[@itemprop="price"]', default=0)
    obj_currency = Regexp(
        CleanText('//*[@itemprop="price"]'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€'
    )

    def obj_utilities(self):
        """Tell whether the notes state that charges are included."""
        notes = CleanText('//p[@class="offer-description-notes"]')(self)
        if "Loyer mensuel charges comprises" in notes:
            return UTILITIES.INCLUDED
        return UTILITIES.UNKNOWN

    obj_price_per_meter = PricePerMeterFilter()
    obj_date = Date(
        Regexp(
            CleanText(
                '//p[@class="offer-description-notes"]|//p[has-class("darkergrey")]'
            ),
            u'.* Mis à jour : (\d{2}/\d{2}/\d{4}).*'),
        dayfirst=True)
    obj_text = CleanHTML(
        '//div[has-class("offer-description-text")]/meta[@itemprop="description"]/@content'
    )
    obj_location = CleanText('//*[@itemprop="address"]')
    obj_station = CleanText(
        '//div[has-class("offer-description-metro")]',
        default=NotAvailable
    )
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_photos(self):
        """Return the carousel photos in their 800x600 rendition."""
        return [
            HousingPhoto(u'%s' % src.replace('75x75', '800x600'))
            for src in XPath(
                '//div[@class="carousel-content"]/ul/li/a/img/@src'
                '|//div[@class="carousel"]/ul/li/a/img/@src')(self)
        ]

    def _energy_letter(self, summary_class, prefix):
        """Return the A-G letter of a summary box, or NotAvailable.

        The original inline code raised IndexError when only the prefix was
        present and let one-character or empty values through unvalidated.
        """
        text = CleanText(
            '//div[has-class("offer-energy-greenhouseeffect-summary")]'
            '//div[has-class("%s")]' % summary_class,
            default=None
        )(self)
        if not text:
            return NotAvailable
        letter = text.replace(prefix, "").strip()[:1]
        if letter in ("A", "B", "C", "D", "E", "F", "G"):
            return letter
        return NotAvailable

    def obj_details(self):
        """Collect DPE/GES letters, creation date, fees and criteria."""
        details = {}
        details["DPE"] = self._energy_letter("energy-summary", "DPE")
        details["GES"] = self._energy_letter("greenhouse-summary", "GES")

        details["creationDate"] = Date(
            Regexp(
                CleanText(
                    '//p[@class="offer-description-notes"]|//p[has-class("darkergrey")]'
                ),
                u'.*Mis en ligne : (\d{2}/\d{2}/\d{4}).*'
            ),
            dayfirst=True
        )(self)

        honoraires = CleanText(
            '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]',
            default=None
        )(self)
        if honoraires:
            parts = honoraires.split(":")
            # Use the whole text when there is no "label: value" separator
            # instead of raising IndexError.
            amount = parts[1] if len(parts) > 1 else parts[0]
            details["Honoraires"] = (
                "{} (TTC, en sus)".format(amount.strip())
            )

        for li in XPath('//ul[@itemprop="description"]/li')(self):
            label = CleanText('./div[has-class("criteria-label")]')(li)
            value = CleanText('./div[has-class("criteria-value")]')(li)
            details[label] = value

        return details
class item(ItemElement):
    """Build a ``Housing`` from one classified of a search-result listing."""

    klass = Housing

    price_selector = './/span[@class="price-label"]|./div/div[@class="item-price-pdf"]'

    def is_agency(self):
        """Return False only for entries labelled "annonce de particulier"."""
        agency = CleanText('.//span[has-class("item-agency-name")]')(
            self.el)
        return 'annonce de particulier' not in agency.lower()

    def condition(self):
        """Tell whether the entry has an id and matches the advert filter."""
        classified_id = Attr('.', 'data-classified-id', default=False)(self)
        if not classified_id:
            # Checked first: previously an entry without id passed as soon
            # as a single advert type was requested, although obj_id cannot
            # be extracted for it.
            return classified_id

        advert_types = self.env['advert_types']
        if len(advert_types) == 1:
            is_agency = self.is_agency()
            if advert_types[0] == ADVERT_TYPES.PERSONAL:
                return not is_agency
            elif advert_types[0] == ADVERT_TYPES.PROFESSIONAL:
                return is_agency
        return classified_id

    obj_id = Attr('.', 'data-classified-id')
    obj_type = Env('query_type')
    obj_title = CleanText('./div/h2[@class="item-type"]')

    def obj_advert_type(self):
        """Return PROFESSIONAL for agencies, PERSONAL otherwise."""
        if self.is_agency():
            return ADVERT_TYPES.PROFESSIONAL
        return ADVERT_TYPES.PERSONAL

    def obj_house_type(self):
        """Map the first word of the title to a ``HOUSE_TYPES`` member."""
        words = self.obj_title(self).split()
        # An empty title falls through to OTHER instead of raising
        # IndexError.
        first_word = words[0].lower() if words else ''
        if first_word in ("appartement", "studio", "chambre"):
            return HOUSE_TYPES.APART
        if first_word in ("maison", "villa"):
            return HOUSE_TYPES.HOUSE
        if first_word == "parking":
            return HOUSE_TYPES.PARKING
        if first_word == "terrain":
            return HOUSE_TYPES.LAND
        return HOUSE_TYPES.OTHER

    def obj_location(self):
        """Return "locality (postal code)" from the embedded JSON-LD."""
        raw = CleanText('./script')(self)
        try:
            # Should be standard JSON+LD data
            data = json.loads(raw)
        except ValueError:
            try:
                # But explorimmo can't write JSON correctly and there
                # is a trailing "}". raw_decode parses the first complete
                # JSON value and ignores what follows; rstrip('}') removed
                # *every* closing brace, so that fallback never parsed.
                data, _ = json.JSONDecoder().raw_decode(raw.strip())
            except ValueError:
                data = None
        if not data:
            return NotLoaded
        try:
            return '%s (%s)' % (data['address']['addressLocality'],
                                data['address']['postalCode'])
        except (KeyError, TypeError):
            # TypeError: the payload or its 'address' is not a mapping.
            return NotLoaded

    def obj_cost(self):
        """Return the lower bound of a "de X à Y" price, else the price."""
        cost = CleanDecimal(
            Regexp(CleanText(self.price_selector, default=''),
                   r'de (.*) à .*',
                   default=0))(self)
        if cost == 0:
            return CleanDecimal(self.price_selector,
                                default=NotAvailable)(self)
        return cost

    obj_currency = Currency(price_selector)

    def obj_utilities(self):
        """Tell whether the price text contains "CC"."""
        utilities = CleanText(
            './div/div/span[@class="price-label"]|'
            './div/div[@class="item-price-pdf"]|'
            './div/div/span[@class="item-price"]')(self)
        if "CC" in utilities:
            return UTILITIES.INCLUDED
        return UTILITIES.UNKNOWN

    obj_text = CleanText('./div/p[@itemprop="description"]')
    obj_area = CleanDecimal(
        Regexp(obj_title,
               r'(.*?)([\d,\.]*) m2(.*?)',
               '\\2',
               default=None),
        replace_dots=True,
        default=NotLoaded)
    obj_url = Format(
        "https://immobilier.lefigaro.fr/annonces/annonce-%s.html",
        CleanText('./@data-classified-id'))
    obj_price_per_meter = PricePerMeterFilter()

    def obj_phone(self):
        """Return the phone number, or NotLoaded when it is truncated."""
        phone = CleanText(
            './div/div/ul/li[has-class("js-clickphone")]',
            replace=[('Téléphoner : ', '')],
            default=NotLoaded)(self)
        if '...' in phone:
            return NotLoaded
        return phone

    def obj_details(self):
        """Return the fees shown next to the price, when present."""
        charges = CleanText('.//span[@class="price-fees"]',
                            default=None)(self)
        if not charges:
            return NotLoaded
        parts = charges.split(":")
        # Fall back to the whole text when there is no "label: value"
        # separator instead of raising IndexError.
        fees = parts[1] if len(parts) > 1 else parts[0]
        return {"fees": fees.strip()}

    def obj_photos(self):
        """Return the lazy-loaded photo, unwrapping a nested ``http://`` URL."""
        url = CleanText(
            './div[has-class("default-img")]/img/@data-src')(self)
        if not url:
            return NotLoaded
        url = unquote(url)
        if "http://" in url[3:]:
            # Drop the trailing query string only when there is one.
            rindex = url.rfind("?")
            if rindex == -1:
                rindex = None
            url = url[url.find("http://", 3):rindex]
        return [HousingPhoto(url)]