コード例 #1
0
 def __init__(self, *args, **kwargs):
     """Build the page, then replace ``self.doc`` with the embedded JSON.

     The regular HTML page is built first; the ``initialData`` JSON string
     assigned in a ``<script>`` tag is then extracted, unescaped and parsed,
     and the parsed object becomes ``self.doc``.
     """
     HTMLPage.__init__(self, *args, **kwargs)

     # Capture the string literal handed to JSON.parse() in the script.
     extract_payload = Regexp(
         CleanText('//script'),
         r"window\[\"initialData\"\] = JSON.parse\(\"({.*})\"\);window\[\"tags\"\]"
     )
     escaped_payload = extract_payload(self.doc)

     # The payload is a JS string literal: resolve its backslash escapes.
     unescaped_payload, _consumed = codecs.unicode_escape_decode(
         escaped_payload)

     # Round-trip through UTF-8 with the 'surrogatepass' handler
     # (presumably to normalise surrogates produced by the unescaping --
     # TODO confirm).
     payload_bytes = unescaped_payload.encode('utf-8', 'surrogatepass')
     self.doc = json.loads(payload_bytes.decode('utf-8'))
コード例 #2
0
        def obj_details(self):
            """Gather the miscellaneous details of the offer into a dict.

            The returned dict may contain:

            * ``dispo``: the availability date (today when none is found);
            * ``priceMentions`` and ``agency``: free text, when present;
            * one sub-dict per titled section of the details columns, mapping
              each data label to its value and each bullet item to ``True``;
            * ``electric_consumption``: the value read from the DPE image
              URL, when such an image is present.
            """
            details = {}

            # The pattern captures a dd/mm/yyyy date, so it must be parsed
            # day-first: otherwise 05/03/2020 is read as 3 May instead of
            # 5 March. The fallback (today) is rendered in the same layout so
            # that it goes through the very same day-first parsing.
            dispo = Date(
                Regexp(CleanText('//p[has-class("OfferTop-dispo")]'),
                       r'.* (\d\d\/\d\d\/\d\d\d\d)',
                       default=datetime.date.today().strftime('%d/%m/%Y')),
                dayfirst=True)(self)
            if dispo is not None:
                details["dispo"] = dispo

            price_mentions = CleanText('//p[has-class("OfferTop-mentions")]',
                                       default=None)(self)
            if price_mentions is not None:
                details["priceMentions"] = price_mentions

            agency = CleanText('//p[has-class("OfferContact-address")]',
                               default=None)(self)
            if agency is not None:
                details["agency"] = agency

            for item in self.xpath(
                    '//div[has-class("OfferDetails-columnize")]/div'):
                category = CleanText(
                    './h3[has-class("OfferDetails-title--2")]',
                    default=None)(item)
                if not category:
                    # Untitled column: nothing to file its content under.
                    continue

                section = details[category] = {}

                # "label: value" rows.
                for detail_item in item.xpath(
                        './/ul[has-class("List--data")]/li'):
                    detail_title = CleanText(
                        './/span[has-class("List-data")]')(detail_item)
                    detail_value = CleanText('.//*[has-class("List-value")]')(
                        detail_item)
                    section[detail_title] = detail_value

                # Plain bullet rows are recorded as boolean flags.
                for detail_item in item.xpath(
                        './/ul[has-class("List--bullet")]/li'):
                    section[CleanText('.')(detail_item)] = True

            try:
                electric_consumption = CleanDecimal(
                    Regexp(
                        Attr('//div[has-class("OfferDetails-content")]//img',
                             'src'),
                        r'https://dpe.foncia.net\/(\d+)\/.*'))(self)
            except (RegexpError, XPathNotFound):
                # No DPE image, or an image whose URL carries no value.
                pass
            else:
                details["electric_consumption"] = (
                    '{} kWhEP/m².an'.format(electric_consumption))

            return details
コード例 #3
0
 def __init__(self, *args, **kwargs):
     """Build the page, then replace ``self.doc`` with the embedded JSON.

     After the regular HTML page is built, the ``initialData`` JSON string
     found in a ``<script>`` tag is extracted, unescaped and parsed.
     ``self.doc`` becomes a dict with two keys:

     * ``advert``: the ``advert.mainAdvert`` object (``{}`` when missing);
     * ``agency``: the ``agency`` object (``{}`` when missing).
     """
     HTMLPage.__init__(self, *args, **kwargs)
     json_content = Regexp(
         CleanText('//script'),
         r"window\[\"initialData\"\] = JSON.parse\(\"({.*})\"\);")(self.doc)
     # The payload is a JS string literal: resolve its backslash escapes.
     json_content = codecs.unicode_escape_decode(json_content)[0]
     json_content = json_content.encode('utf-8',
                                        'surrogatepass').decode('utf-8')
     # Parse the payload once and pick both sub-objects from the result
     # instead of decoding the same string twice.
     data = json.loads(json_content)
     self.doc = {
         "advert": data.get('advert', {}).get('mainAdvert', {}),
         "agency": data.get('agency', {}),
     }
コード例 #4
0
        def obj_details(self):
            """Return a dict of extra details read from the offer page.

            Keys:

            * ``creationDate``: the "Mis en ligne" date from the notes block;
            * ``Honoraires``: the agency fees, when the fees label exists;
            * one entry per criterion of the description list
              (criterion label -> criterion value).
            """
            details = {}

            # Raw string: "\d" in a plain literal is an invalid escape
            # sequence that recent Python versions warn about.
            details["creationDate"] = Date(
                Regexp(
                    CleanText(
                        '//div[@class="offer-description-notes"]'
                    ),
                    r'.*Mis en ligne: (\d{2}/\d{2}/\d{4}).*'
                ),
                dayfirst=True
            )(self)

            honoraires = CleanText(
                (
                    '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]'
                ),
                default=None
            )(self)
            if honoraires:
                # The label normally reads "<caption>: <amount>". When there
                # is no colon, keep the whole text rather than failing with
                # an IndexError.
                parts = honoraires.split(":")
                fee = parts[1] if len(parts) > 1 else parts[0]
                details["Honoraires"] = (
                    "{} (TTC, en sus)".format(fee.strip())
                )

            for li in XPath('//ul[@itemprop="description"]/li')(self):
                label = CleanText('./span[has-class("criteria-label")]')(li)
                value = CleanText('./span[has-class("criteria-value")]')(li)
                details[label] = value

            return details
コード例 #5
0
        def obj_DPE(self):
            """Map the consumption read from the DPE image URL to a class.

            The numeric value is taken from the ``src`` of the image found in
            the details block. It is converted to a letter from "A" to "G"
            using the inclusive upper bounds below, and the matching
            ``ENERGY_CLASS`` member is returned. ``NotAvailable`` is returned
            when no value can be extracted.
            """
            try:
                consumption = CleanDecimal(
                    Regexp(
                        Attr('//div[has-class("OfferDetails-content")]//img',
                             'src'),
                        r'https://dpe.foncia.net\/(\d+)\/.*'))(self)
            except (RegexpError, XPathNotFound):
                return NotAvailable

            if consumption is None:
                return NotAvailable

            # (inclusive upper bound, letter); anything above the last bound
            # falls into "G".
            upper_bounds = (
                (50, "A"),
                (90, "B"),
                (150, "C"),
                (230, "D"),
                (330, "E"),
                (450, "F"),
            )
            letter = "G"
            for bound, candidate in upper_bounds:
                if consumption <= bound:
                    letter = candidate
                    break
            return getattr(ENERGY_CLASS, letter, NotAvailable)
コード例 #6
0
        class item(ItemElement):
            # One shared-flat ("colocation") listing entry, mapped onto a
            # Housing object. Every XPath is relative to the entry element.
            klass = Housing

            # Id: "colocation-" + the header id without its "header-offer-"
            # prefix.
            obj_id = Format('colocation-%s', CleanText('./div/header/@id', replace=[('header-offer-', '')]))
            obj_type = POSTS_TYPES.SHARING
            obj_advert_type = ADVERT_TYPES.PROFESSIONAL
            obj_title = CleanText(CleanHTML('./div/header/section/p[@class="property-type"]/span/@title'))

            obj_area = CleanDecimal('./div/header/section/p[@class="offer-attributes"]/a/span[@class="offer-area-number"]',
                                    default=0)

            obj_cost = CleanDecimal('./div/header/section/p[@class="price"]', default=0)
            obj_currency = Currency(
                './div/header/section/p[@class="price"]'
            )
            obj_utilities = UTILITIES.UNKNOWN

            obj_text = CleanText(
                './div/div[@class="content-offer"]/section[has-class("content-desc")]/p/span[has-class("offer-text")]/@title',
                default=NotLoaded
            )

            # The update date is written dd/mm/yyyy: parse it day-first,
            # otherwise day and month are swapped whenever the day is <= 12.
            # The pattern is a raw string so that "\d" is not treated as an
            # (invalid) string escape.
            obj_date = Date(Regexp(CleanText('./div/header/section/p[has-class("update-date")]'),
                                   r".*(\d{2}/\d{2}/\d{4}).*"),
                            dayfirst=True)

            obj_location = CleanText(
                '(./div/div[@class="content-offer"]/section[has-class("content-desc")]/p)[1]/span/@title',
                default=NotLoaded
            )
コード例 #7
0
ファイル: pages.py プロジェクト: Phyks/Flatisfy
    class get_housing(ItemElement):
        # Detail page of one housing, mapped onto a Housing object.
        klass = Housing

        obj_id = Env('_id')
        obj_title = CleanText('//h1[@itemprop="name"]')
        obj_location = CleanText('//span[@class="informations-localisation"]')
        obj_cost = CleanDecimal('//span[@itemprop="price"]')
        obj_currency = Currency('//span[@itemprop="price"]')
        obj_text = CleanHTML('//div[@itemprop="description"]')
        obj_url = BrowserURL('housing', _id=Env('_id'))
        # The area is the number written right before " m2" in the title.
        obj_area = CleanDecimal(
            Regexp(
                CleanText('//h1[@itemprop="name"]'),
                r'(.*?)(\d*) m2(.*?)',
                '\\2'
            ),
            default=NotAvailable
        )
        obj_price_per_meter = PricePerMeterFilter()

        def obj_photos(self):
            """Return one HousingPhoto per thumbnail of the page.

            Each thumbnail ``src`` is a thumbnailer URL wrapping the real
            picture URL; only the wrapped URL is kept.
            """
            thumbnails = XPath(
                '//a[@class="thumbnail-link"]/img[@itemprop="image"]')(self)
            return [
                HousingPhoto(
                    Regexp(CleanText('./@src'),
                           r'http://thbr\.figarocms\.net.*(http://.*)')(thumb)
                )
                for thumb in thumbnails
            ]

        def obj_details(self):
            """Return the feature list (plus the DPE line) as a dict.

            Pairs whose label or value is empty are left out.
            """
            def labelled_values():
                # Regular "name / value" features first ...
                for feature in XPath(
                        '//div[@class="features clearfix"]/ul/li')(self):
                    label = CleanText('./span[@class="name"]')(feature)
                    content = CleanText('./span[@class="value"]')(feature)
                    yield label, content
                # ... then the energy consumption block.
                label = CleanText('//div[@class="title-dpe clearfix"]')(self)
                content = CleanText('//div[@class="energy-consumption"]')(self)
                yield label, content

            return {
                label: content
                for label, content in labelled_values()
                if content and label
            }
コード例 #8
0
ファイル: pages.py プロジェクト: Phyks/Flatisfy
 def obj_cost(self):
     """Return the price of the item.

     When the price text is a range ("de X à Y"), the lower bound X is
     returned. Otherwise the whole price text is parsed, and
     ``NotAvailable`` is returned when it cannot be.
     """
     range_lower_bound = Regexp(
         CleanText(self.price_selector, default=''),
         r'de (.*) à .*',
         default=0
     )
     cost = CleanDecimal(range_lower_bound)(self)
     if cost == 0:
         # Not a range (or a zero lower bound): parse the plain price.
         return CleanDecimal(self.price_selector, default=NotAvailable)(self)
     return cost
コード例 #9
0
 def condition(self):
     """Tell whether this entry should be kept.

     An entry without id is rejected. Otherwise the result is truthy only
     when the entry links to an "/annonces/..." page and, for rent queries,
     its title does not mention "meublé".
     """
     title = self.obj_title(self)
     if self.env['query_type'] == POSTS_TYPES.RENT:
         unfurnished_ok = 'meublé' not in title.lower()
     else:
         unfurnished_ok = True

     housing_id = self.obj_id(self)
     if housing_id is None:
         return False

     advert_path = Regexp(
         Link('./div/a[has-class("item-title")]'),
         '/annonces/(.*)',
         default=None
     )(self)
     return (advert_path and unfurnished_ok)
コード例 #10
0
        def parse(self, el):
            """Pre-compute rooms, bedrooms and area from the item tags.

            Each ``<li>`` of the ``item-tags`` list is classified from its
            text and the number it carries is stored in ``self.env`` under
            ``'rooms'``, ``'bedrooms'`` or ``'area'``. Each of these keys
            defaults to ``NotAvailable``.

            :param el: the element being parsed.
            """
            rooms_bedrooms_area = el.xpath('.//ul[has-class("item-tags")]/li')
            self.env['rooms'] = NotAvailable
            self.env['bedrooms'] = NotAvailable
            self.env['area'] = NotAvailable

            for item in rooms_bedrooms_area:
                label = CleanText('.')(item)
                lowered = label.lower()
                if 'chambre' in lowered:
                    name = 'bedrooms'
                    value = CleanDecimal('./strong')(item)
                elif 'pièce' in lowered:
                    name = 'rooms'
                    value = CleanDecimal('./strong')(item)
                elif ' m²' in label and 'le m²' not in label:
                    name = 'area'
                    value = CleanDecimal(
                        Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(item)
                else:
                    # Unrecognised tag (e.g. a "le m²" one): skip it. Falling
                    # through would store a stale (or not yet bound) value in
                    # the env under the raw tag text.
                    continue
                self.env[name] = value
コード例 #11
0
            def parse(self, el):
                """Pre-compute rooms, bedrooms and area from the item tags.

                The three ``self.env`` entries start as ``NotLoaded``. Each
                tag of the entry then overwrites one of them: tags mentioning
                "chambre" feed ``bedrooms``, tags mentioning "pièce" feed
                ``rooms``, and any other tag feeds ``area``.

                :param el: the element being parsed.
                """
                tags = el.xpath(
                    './div/a[has-class("item-title")]/ul[has-class("item-tags")]/li'
                )
                for key in ('rooms', 'bedrooms', 'area'):
                    self.env[key] = NotLoaded

                for tag in tags:
                    label = CleanText('.')(tag).lower()
                    if 'chambre' in label:
                        self.env['bedrooms'] = CleanDecimal('.')(tag)
                    elif 'pièce' in label:
                        self.env['rooms'] = CleanDecimal('.')(tag)
                    else:
                        # Keep only the leading number of the tag text.
                        self.env['area'] = CleanDecimal(
                            Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(tag)
コード例 #12
0
        class item(ItemElement):
            # One search-result entry, mapped onto a Housing object. Every
            # XPath is relative to the entry element.
            klass = Housing

            def condition(self):
                """Tell whether this entry should be kept.

                An entry without id is rejected. Otherwise the result is
                truthy only when the entry links to an "/annonces/..." page
                and, for rent queries, its title does not mention "meublé".
                """
                title = self.obj_title(self)
                if self.env['query_type'] == POSTS_TYPES.RENT:
                    unfurnished_ok = 'meublé' not in title.lower()
                else:
                    unfurnished_ok = True

                housing_id = self.obj_id(self)
                if housing_id is None:
                    return False

                advert_path = Regexp(
                    Link('./div/a[has-class("item-title")]'),
                    '/annonces/(.*)',
                    default=None
                )(self)
                return (advert_path and unfurnished_ok)

            def parse(self, el):
                """Pre-compute rooms, bedrooms and area from the item tags.

                The three ``self.env`` entries start as ``NotLoaded``. Tags
                mentioning "chambre" feed ``bedrooms``, tags mentioning
                "pièce" feed ``rooms``, and any other tag feeds ``area``.
                """
                tags = el.xpath(
                    './div/a[has-class("item-title")]/ul[has-class("item-tags")]/li'
                )
                for key in ('rooms', 'bedrooms', 'area'):
                    self.env[key] = NotLoaded

                for tag in tags:
                    label = CleanText('.')(tag).lower()
                    if 'chambre' in label:
                        self.env['bedrooms'] = CleanDecimal('.')(tag)
                    elif 'pièce' in label:
                        self.env['rooms'] = CleanDecimal('.')(tag)
                    else:
                        # Keep only the leading number of the tag text.
                        self.env['area'] = CleanDecimal(
                            Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(tag)

            # The id is whatever follows "/annonces/" in the entry link.
            obj_id = Regexp(
                Link('./div/a[has-class("item-title")]'),
                '/annonces/(.*)',
                default=None
            )

            obj_type = Env('query_type')
            obj_advert_type = ADVERT_TYPES.PERSONAL

            def obj_house_type(self):
                """Guess the house type from the last segment of the link.

                The text before the first "-" of that segment is searched for
                a known keyword; ``HOUSE_TYPES.OTHER`` is the fallback.
                """
                link = Link('./div/a[@class="item-title"]')(self)
                slug = link.split('/')[-1].split('-')[0]
                keyword_to_type = (
                    ('parking', HOUSE_TYPES.PARKING),
                    ('appartement', HOUSE_TYPES.APART),
                    ('terrain', HOUSE_TYPES.LAND),
                    ('maison', HOUSE_TYPES.HOUSE),
                )
                for keyword, house_type in keyword_to_type:
                    if keyword in slug:
                        return house_type
                return HOUSE_TYPES.OTHER

            obj_title = CleanText('./div/a[has-class("item-title")]')
            obj_area = Env('area')
            obj_cost = CleanDecimal(
                CleanText(
                    './div/a[has-class("item-title")]/span[@class="item-price"]'
                ),
                replace_dots=True,
                default=Decimal(0)
            )
            obj_currency = Currency(
                './div/a[@class="item-title"]/span[@class="item-price"]'
            )
            obj_utilities = UTILITIES.UNKNOWN

            obj_station = CleanText(
                './div/p[@class="item-transports"]',
                default=NotLoaded
            )

            def obj_location(self):
                """Return the description text up to its first "."."""
                description = CleanText(
                    './div/p[@class="item-description"]')(self)
                return description.partition(".")[0]

            obj_text = CleanText(
                './div/p[@class="item-description"]',
                replace=[(' Lire la suite', '')]
            )
            obj_rooms = Env('rooms')
            obj_bedrooms = Env('bedrooms')
            obj_price_per_meter = PricePerMeterFilter()

            obj_url = Format(
                u'http://www.pap.fr%s',
                Link('./div/a[@class="item-title"]')
            )

            def obj_photos(self):
                """Return the entry pictures, skipping placeholder images."""
                placeholders = ("visuel-nophoto.png", 'miniature-video.png')
                return [
                    HousingPhoto(u'%s' % src)
                    for src in XPath('./a/img/@src')(self)
                    if not src.endswith(placeholders)
                ]
コード例 #13
0
ファイル: pages.py プロジェクト: Phyks/Flatisfy
 def obj_photos(self):
     """Return one HousingPhoto per thumbnail of the page.

     Each thumbnail ``src`` is a thumbnailer URL wrapping the real picture
     URL; only the wrapped URL is kept.
     """
     thumbnails = XPath(
         '//a[@class="thumbnail-link"]/img[@itemprop="image"]')(self)
     return [
         HousingPhoto(
             Regexp(CleanText('./@src'),
                    r'http://thbr\.figarocms\.net.*(http://.*)')(thumb)
         )
         for thumb in thumbnails
     ]
コード例 #14
0
    class get_housing(ItemElement):
        # Detail page of one housing, mapped onto a Housing object.
        klass = Housing

        obj_id = Env('_id')

        def obj_type(self):
            """Deduce the post type from the housing URL.

            "colocation" -> sharing; "location" -> rent, or furnished rent
            when the "meublé" criterion of the description says "oui";
            "vente" -> sale; anything else -> ``NotAvailable``.
            """
            url = BrowserURL('housing', _id=Env('_id'))(self)
            if 'colocation' in url:
                return POSTS_TYPES.SHARING
            elif 'location' in url:
                isFurnished = False
                for li in XPath('//ul[@itemprop="description"]/li')(self):
                    label = CleanText('./span[has-class("criteria-label")]')(li)
                    if label.lower() == "meublé":
                        isFurnished = (
                            CleanText('./span[has-class("criteria-value")]')(li).lower() == 'oui'
                        )
                if isFurnished:
                    return POSTS_TYPES.FURNISHED_RENT
                else:
                    return POSTS_TYPES.RENT
            elif 'vente' in url:
                return POSTS_TYPES.SALE
            return NotAvailable
        obj_advert_type = ADVERT_TYPES.PROFESSIONAL

        def obj_house_type(self):
            """Map the main feature label of the page to a HOUSE_TYPES member."""
            house_type = CleanText('.//h2[@class="offerMainFeatures"]/div')(self).lower()
            if house_type == "appartement":
                return HOUSE_TYPES.APART
            elif house_type == "maison":
                return HOUSE_TYPES.HOUSE
            elif house_type == "terrain":
                return HOUSE_TYPES.LAND
            elif house_type == "parking":
                return HOUSE_TYPES.PARKING
            else:
                return HOUSE_TYPES.OTHER

        obj_title = Attr('//meta[@property="og:title"]', 'content')
        obj_area = CleanDecimal(
            CleanText(
                '//p[@class="offerArea"]/span',
            ),
            default=NotAvailable
        )
        # Raw strings below: "\d" in a plain literal is an invalid escape
        # sequence that recent Python versions warn about.
        obj_rooms = CleanDecimal(
            Regexp(
                CleanText('//p[@class="offerRooms"]/span'),
                r'(\d) p.',
                default=NotAvailable
            ),
            default=NotAvailable
        )
        obj_bedrooms = CleanDecimal(
            Regexp(
                CleanText('//p[@class="offerRooms"]/span'),
                r'(\d) ch.',
                default=NotAvailable
            ),
            default=NotAvailable
        )
        obj_cost = CleanDecimal('//*[@itemprop="price"]', default=0)
        obj_currency = Currency(
            '//*[@itemprop="price"]'
        )

        def obj_utilities(self):
            """Tell whether the rent is advertised as charges included."""
            # NOTE(review): this reads a <p class="offer-description-notes">
            # while obj_date and obj_details read a <div> with that class --
            # confirm against the page which one is right.
            notes = CleanText('//p[@class="offer-description-notes"]')(self)
            if "Loyer mensuel charges comprises" in notes:
                return UTILITIES.INCLUDED
            else:
                return UTILITIES.UNKNOWN

        obj_price_per_meter = PricePerMeterFilter()
        # Backslashes are doubled (rather than switching to a raw string) so
        # that the u'' prefix needed by the non-ASCII text can stay.
        obj_date = Date(Regexp(CleanText('//div[@class="offer-description-notes"]'),
                               u'.* Mis à jour: (\\d{2}/\\d{2}/\\d{4}).*'),
                        dayfirst=True)
        obj_text = CleanHTML('//p[@class="descrProperty"]')
        obj_location = CleanText('//em[@class="infoAdresse"]')
        obj_station = CleanText(
            '//div[has-class("offer-description-metro")]',
            default=NotAvailable
        )

        obj_url = BrowserURL('housing', _id=Env('_id'))

        def obj_photos(self):
            """Return the gallery pictures in their 800x600 variant."""
            photos = []
            for img in XPath('//ul[@class="thumbsContainer"]//img/@src')(self):
                if img.endswith('.svg'):
                    continue
                url = u'%s' % img.replace('182x136', '800x600')
                url = urljoin(self.page.url, url)  # Ensure URL is absolute
                photos.append(HousingPhoto(url))
            return photos

        def obj_DPE(self):
            """Return the energy class read from the DPE ``data-class``.

            The class letter is the first character left once the "DPE"
            prefix is removed. ``NotAvailable`` is returned when the
            attribute is missing, holds nothing but the prefix, or does not
            name an ``ENERGY_CLASS`` member.
            """
            energy_value = CleanText(
                '//ul[@class="energyInfosDPE"]//li[@class="energyInfos"]/span/@data-class',
                default=""
            )(self)
            # Slicing (instead of indexing) keeps an empty remainder empty
            # rather than raising an IndexError.
            energy_value = energy_value.replace("DPE", "").strip()[:1]
            return getattr(ENERGY_CLASS, energy_value, NotAvailable)

        def obj_GES(self):
            """Return the greenhouse gas class read from the GES ``data-class``.

            The class letter is the first character left once the "GES"
            prefix is removed. ``NotAvailable`` is returned when the
            attribute is missing, holds nothing but the prefix, or does not
            name an ``ENERGY_CLASS`` member.
            """
            greenhouse_value = CleanText(
                '//ul[@class="energyInfosGES"]//li[@class="energyInfos"]/span/@data-class',
                default=""
            )(self)
            # Slicing (instead of indexing) keeps an empty remainder empty
            # rather than raising an IndexError.
            greenhouse_value = greenhouse_value.replace("GES", "").strip()[:1]
            return getattr(ENERGY_CLASS, greenhouse_value, NotAvailable)

        def obj_details(self):
            """Return a dict of extra details read from the offer page.

            Keys:

            * ``creationDate``: the "Mis en ligne" date from the notes block;
            * ``Honoraires``: the agency fees, when the fees label exists;
            * one entry per criterion of the description list
              (criterion label -> criterion value).
            """
            details = {}

            details["creationDate"] = Date(
                Regexp(
                    CleanText(
                        '//div[@class="offer-description-notes"]'
                    ),
                    r'.*Mis en ligne: (\d{2}/\d{2}/\d{4}).*'
                ),
                dayfirst=True
            )(self)

            honoraires = CleanText(
                (
                    '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]'
                ),
                default=None
            )(self)
            if honoraires:
                # The label normally reads "<caption>: <amount>". When there
                # is no colon, keep the whole text rather than failing with
                # an IndexError.
                parts = honoraires.split(":")
                fee = parts[1] if len(parts) > 1 else parts[0]
                details["Honoraires"] = (
                    "{} (TTC, en sus)".format(fee.strip())
                )

            for li in XPath('//ul[@itemprop="description"]/li')(self):
                label = CleanText('./span[has-class("criteria-label")]')(li)
                value = CleanText('./span[has-class("criteria-value")]')(li)
                details[label] = value

            return details
コード例 #15
0
        class item(ItemElement):
            # One search-result entry, mapped onto a Housing object. Every
            # XPath is relative to the entry element.

            # Common prefix of most of the XPaths below.
            offer_details_wrapper = (
                './/div[has-class("offer-details-wrapper")]'
            )
            klass = Housing

            # Id: "<start of the 'type' env value>-<entry id without its
            # 'header-offer-' prefix>".
            obj_id = Format(
                '%s-%s',
                Regexp(Env('type'), '(.*)-.*'),
                CleanText('./@id', replace=[('header-offer-', '')])
            )
            obj_type = Env('query_type')
            obj_advert_type = ADVERT_TYPES.PROFESSIONAL

            def obj_house_type(self):
                """Map the entry's ``name`` meta content to a HOUSE_TYPES member."""
                house_type = CleanText('.//div[has-class("offer-details-caracteristik")]/meta[@itemprop="name"]/@content')(self).lower()
                if house_type == "appartement":
                    return HOUSE_TYPES.APART
                elif house_type == "maison":
                    return HOUSE_TYPES.HOUSE
                elif house_type == "terrain":
                    return HOUSE_TYPES.LAND
                elif house_type == "parking":
                    return HOUSE_TYPES.PARKING
                else:
                    return HOUSE_TYPES.OTHER

            obj_title = CleanText('.//div[has-class("offer-details-type")]/a/@title')

            # URL: the link href followed by its optional data-orpi suffix.
            obj_url = Format(u'%s%s',
                             CleanText('.//div/a[@class="offer-link"]/@href'),
                             CleanText('.//div/a[@class="offer-link"]/@data-orpi',
                                       default=""))

            obj_area = CleanDecimal(
                (
                    offer_details_wrapper +
                    '/div/div/div[has-class("offer-details-second")]' +
                    '/div/h3[has-class("offer-attributes")]/span' +
                    '/span[has-class("offer-area-number")]'
                ),
                default=NotLoaded
            )
            obj_rooms = CleanDecimal(
                (
                    offer_details_wrapper +
                    '/div/div/div[has-class("offer-details-second")]' +
                    '/div/h3[has-class("offer-attributes")]' +
                    '/span[has-class("offer-rooms")]' +
                    '/span[has-class("offer-rooms-number")]'
                ),
                default=NotAvailable
            )
            # The cost is whatever precedes " €", " $" or " £" in the price.
            obj_cost = CleanDecimal(
                Regexp(
                    CleanText(
                        (
                            offer_details_wrapper +
                            '/div/p[@class="offer-price"]/span'
                        ),
                        default=NotLoaded
                    ),
                    '(.*) [%s%s%s]' % (u'€', u'$', u'£'),
                    default=NotLoaded
                ),
                default=NotLoaded
            )
            obj_currency = Currency(
                offer_details_wrapper + '/div/p[has-class("offer-price")]/span'
            )
            obj_price_per_meter = PricePerMeterFilter()
            obj_utilities = UTILITIES.UNKNOWN
            obj_text = CleanText(
                offer_details_wrapper + '/div/div/div/p[has-class("offer-description")]/span'
            )
            obj_location = CleanText(
                offer_details_wrapper + '/div[@class="offer-details-location"]',
                replace=[('Voir sur la carte','')]
            )

            def obj_photos(self):
                """Return the entry picture (800x600 variant) as a list.

                The list is empty when the entry has no picture.
                """
                photos = []
                # A missing picture is expected: let the filter fall back to
                # None instead of hiding every possible error (including
                # KeyboardInterrupt) behind a bare "except".
                url = Attr(
                    './/div[has-class("offer-picture")]//img',
                    'src',
                    default=None
                )(self)

                if url:
                    url = url.replace('335x253', '800x600')
                    url = urljoin(self.page.url, url)  # Ensure URL is absolute
                    photos.append(HousingPhoto(url))
                return photos

            def obj_details(self):
                """Return the agency fees, when displayed, under "Honoraires"."""
                details = {}
                honoraires = CleanText(
                    (
                        self.offer_details_wrapper +
                        '/div/div/p[@class="offer-agency-fees"]'
                    ),
                    default=None
                )(self)
                if honoraires:
                    # The text normally reads "<caption>: <amount>". When
                    # there is no colon, keep the whole text rather than
                    # failing with an IndexError.
                    parts = honoraires.split(":")
                    fee = parts[1] if len(parts) > 1 else parts[0]
                    details["Honoraires"] = (
                        "{} (TTC, en sus)".format(fee.strip())
                    )
                return details
コード例 #16
0
ファイル: pages.py プロジェクト: Phyks/Flatisfy
        class item(ItemElement):
            # One search-result entry, mapped onto a Housing object. Every
            # XPath is relative to the entry element.
            klass = Housing
            # The price is found in either of these two nodes.
            price_selector = './/span[@class="price-label"]|./div/div[@class="item-price-pdf"]'

            def is_agency(self):
                """Tell whether the entry is NOT flagged "annonce de particulier"."""
                agency = CleanText('.//span[has-class("item-agency-name")]')(self.el)
                return 'annonce de particulier' not in agency.lower()

            def condition(self):
                """Tell whether this entry should be kept.

                When exactly one advert type is requested, personal and
                professional entries are filtered on ``is_agency()``.
                Otherwise the entry is kept when it has a
                ``data-classified-id`` attribute.
                """
                if len(self.env['advert_types']) == 1:
                    is_agency = self.is_agency()
                    if self.env['advert_types'][0] == ADVERT_TYPES.PERSONAL:
                        return not is_agency
                    elif self.env['advert_types'][0] == ADVERT_TYPES.PROFESSIONAL:
                        return is_agency
                return Attr('.', 'data-classified-id', default=False)(self)

            obj_id = Attr('.', 'data-classified-id')
            obj_type = Env('query_type')
            obj_title = CleanText('./div/h2[@class="item-type"]')

            def obj_advert_type(self):
                """Return PROFESSIONAL for agency entries, PERSONAL otherwise."""
                if self.is_agency():
                    return ADVERT_TYPES.PROFESSIONAL
                else:
                    return ADVERT_TYPES.PERSONAL

            def obj_house_type(self):
                """Map the first word of the title to a HOUSE_TYPES member."""
                words = self.obj_title(self).split()
                # An empty title has no first word: treat it as "other"
                # instead of failing with an IndexError.
                kind = words[0].lower() if words else ''
                if kind in ("appartement", "studio", "chambre"):
                    return HOUSE_TYPES.APART
                elif kind in ("maison", "villa"):
                    return HOUSE_TYPES.HOUSE
                elif kind == "parking":
                    return HOUSE_TYPES.PARKING
                elif kind == "terrain":
                    return HOUSE_TYPES.LAND
                else:
                    return HOUSE_TYPES.OTHER

            def obj_location(self):
                """Return "<locality> (<postal code>)" from the entry's JSON.

                ``NotLoaded`` is returned when the embedded script cannot be
                parsed or carries no address.
                """
                script = CleanText('./script')(self)
                try:
                    # Should be standard JSON+LD data
                    script = json.loads(script)
                except ValueError:
                    try:
                        # But explorimmo can't write JSON correctly and there
                        # is a trailing "}". Drop exactly one brace:
                        # rstrip('}') would remove them all, including the
                        # one closing the object, so the retry could never
                        # parse.
                        candidate = script.strip()
                        if candidate.endswith('}'):
                            candidate = candidate[:-1]
                        script = json.loads(candidate)
                    except ValueError:
                        script = None
                if not script:
                    return NotLoaded

                try:
                    return '%s (%s)' % (
                        script['address']['addressLocality'],
                        script['address']['postalCode']
                    )
                except (KeyError, TypeError):
                    # Missing keys, or JSON that is not shaped as expected
                    # (e.g. a list, or a non-dict address).
                    return NotLoaded

            def obj_cost(self):
                """Return the price of the entry.

                When the price text is a range ("de X à Y"), the lower bound
                X is returned. Otherwise the whole price text is parsed, and
                ``NotAvailable`` is returned when it cannot be.
                """
                cost = CleanDecimal(Regexp(CleanText(self.price_selector, default=''),
                                           r'de (.*) à .*',
                                           default=0))(self)
                if cost == 0:
                    return CleanDecimal(self.price_selector, default=NotAvailable)(self)
                else:
                    return cost

            obj_currency = Currency(price_selector)

            def obj_utilities(self):
                """Return INCLUDED when the price text contains "CC"."""
                utilities = CleanText(
                    './div/div/span[@class="price-label"]|'
                    './div/div[@class="item-price-pdf"]|'
                    './div/div/span[@class="item-price"]'
                )(self)
                if "CC" in utilities:
                    return UTILITIES.INCLUDED
                else:
                    return UTILITIES.UNKNOWN

            obj_text = CleanText('./div/p[@itemprop="description"]')
            # The area is the number written right before " m2" in the title.
            obj_area = CleanDecimal(
                Regexp(
                    obj_title,
                    r'(.*?)([\d,\.]*) m2(.*?)',
                    '\\2',
                    default=None
                ),
                replace_dots=True,
                default=NotLoaded
            )

            obj_url = Format(
                "https://immobilier.lefigaro.fr/annonces/annonce-%s.html",
                CleanText('./@data-classified-id')
            )

            obj_price_per_meter = PricePerMeterFilter()

            def obj_phone(self):
                """Return the phone number, or NotLoaded when it is truncated."""
                phone = CleanText('./div/div/ul/li[has-class("js-clickphone")]',
                                  replace=[('Téléphoner : ', '')],
                                  default=NotLoaded)(self)

                # A number containing "..." is only partially displayed.
                if '...' in phone:
                    return NotLoaded

                return phone

            def obj_details(self):
                """Return ``{"fees": ...}`` when fees are shown, else NotLoaded."""
                charges = CleanText('.//span[@class="price-fees"]',
                                    default=None)(self)
                if charges:
                    # The text normally reads "<caption>: <amount>". When
                    # there is no colon, keep the whole text rather than
                    # failing with an IndexError.
                    parts = charges.split(":")
                    fees = parts[1] if len(parts) > 1 else parts[0]
                    return {
                        "fees": fees.strip()
                    }
                else:
                    return NotLoaded

            def obj_photos(self):
                """Return the entry picture as a one-item list, else NotLoaded.

                When the (unquoted) URL embeds another "http://" URL, only
                that embedded URL, without its query string, is kept.
                """
                url = CleanText('./div[has-class("default-img")]/img/@data-src')(self)
                if url:
                    url = unquote(url)
                    # Searching from index 3 skips the scheme of the outer
                    # URL itself.
                    if "http://" in url[3:]:
                        rindex = url.rfind("?")
                        if rindex == -1:
                            rindex = None
                        url = url[url.find("http://", 3):rindex]
                    return [HousingPhoto(url)]
                else:
                    return NotLoaded
コード例 #17
0
        class item(ItemElement):
            klass = Housing

            obj_id = Format(
                '%s:%s', Env('type'),
                Attr('.//span[boolean(@data-reference)]', 'data-reference'))
            obj_url = AbsoluteLink('.//h3[has-class("TeaserOffer-title")]/a')
            obj_type = Env('query_type')
            obj_advert_type = ADVERT_TYPES.PROFESSIONAL

            def obj_house_type(self):
                """Deduce the house type from the offer URL.

                The first entry of ``QUERY_HOUSE_TYPES`` having one of its
                values present in the URL as a "/value/" path segment wins;
                ``NotLoaded`` is returned when none matches.
                """
                offer_url = self.obj_url(self)
                for house_type, slugs in QUERY_HOUSE_TYPES.items():
                    if any(('/%s/' % slug) in offer_url for slug in slugs):
                        return house_type
                return NotLoaded

            obj_url = AbsoluteLink('.//h3[has-class("TeaserOffer-title")]/a')
            obj_title = CleanText('.//h3[has-class("TeaserOffer-title")]')
            obj_area = CleanDecimal(Regexp(CleanText(
                './/div[has-class("MiniData")]//p[@data-behat="surfaceDesBiens"]'
            ),
                                           r'(\d*\.*\d*) .*',
                                           default=NotAvailable),
                                    default=NotAvailable)
            obj_cost = CleanDecimal(
                './/strong[has-class("TeaserOffer-price-num")]',
                default=NotAvailable)
            obj_price_per_meter = PricePerMeterFilter()
            obj_currency = Currency(
                './/strong[has-class("TeaserOffer-price-num")]')
            obj_location = CleanText('.//p[has-class("TeaserOffer-loc")]')
            obj_text = CleanText('.//p[has-class("TeaserOffer-description")]')

            def obj_photos(self):
                url = CleanText(
                    Attr('.//a[has-class("TeaserOffer-ill")]/img',
                         'src'))(self)
                # If the used photo is a default no photo, the src is on the same domain.
                if url[0] == '/':
                    return []
                else:
                    return [HousingPhoto(url)]

            obj_date = datetime.date.today()

            def obj_utilities(self):
                price = CleanText(
                    './/strong[has-class("TeaserOffer-price-num")]')(self)
                if "charges comprises" in price.lower():
                    return UTILITIES.INCLUDED
                else:
                    return UTILITIES.EXCLUDED

            obj_rooms = CleanDecimal(
                './/div[has-class("MiniData")]//p[@data-behat="nbPiecesDesBiens"]',
                default=NotLoaded)
            obj_bedrooms = CleanDecimal(
                './/div[has-class("MiniData")]//p[@data-behat="nbChambresDesBiens"]',
                default=NotLoaded)

            def obj_details(self):
                return {
                    "dispo":
                    Date(
                        Attr('.//span[boolean(@data-dispo)]',
                             'data-dispo',
                             default=datetime.date.today().isoformat()))(self),
                    "priceMentions":
                    CleanText(
                        './/span[has-class("TeaserOffer-price-mentions")]')(
                            self)
                }
コード例 #18
0
    class get_housing(ItemElement):
        """Build a ``Housing`` object from the detail page of one advert.

        Every ``obj_*`` attribute below fills the field of the same name.
        XPath expressions are absolute (``//``), i.e. evaluated against the
        whole document.
        """
        klass = Housing

        obj_id = Format(
            '%s:%s', Env('type'),
            Attr('//div[boolean(@data-property-reference)]',
                 'data-property-reference'))
        obj_advert_type = ADVERT_TYPES.PROFESSIONAL

        def obj_type(self):
            """Map the ``type`` environment value to a ``POSTS_TYPES`` member.

            :return: ``FURNISHED_RENT`` or ``RENT`` for ``'location'``
                     (furnished when the page URL contains
                     ``'appartement-meuble'``), ``SALE`` for ``'achat'``,
                     ``NotAvailable`` otherwise.
            """
            query_type = Env('type')(self)
            if query_type == 'location':
                if 'appartement-meuble' in self.page.url:
                    return POSTS_TYPES.FURNISHED_RENT
                return POSTS_TYPES.RENT
            if query_type == 'achat':
                return POSTS_TYPES.SALE
            return NotAvailable

        def obj_url(self):
            """Return the URL of the page being parsed."""
            return self.page.url

        def obj_house_type(self):
            """Deduce the house type from the page URL.

            :return: the first key of ``QUERY_HOUSE_TYPES`` having a value
                     ``v`` such that ``'/v/'`` appears in the URL, or
                     ``NotAvailable`` when none matches.
            """
            url = self.obj_url()
            for house_type, slugs in QUERY_HOUSE_TYPES.items():
                if any(('/%s/' % slug) in url for slug in slugs):
                    return house_type
            return NotAvailable

        obj_title = CleanText('//h1[has-class("OfferTop-title")]')
        obj_area = CleanDecimal(Regexp(CleanText(
            '//div[has-class("MiniData")]//p[has-class("MiniData-item")][1]'),
                                       r'(\d*\.*\d*) .*',
                                       default=NotAvailable),
                                default=NotAvailable)
        obj_cost = CleanDecimal('//span[has-class("OfferTop-price")]',
                                default=NotAvailable)
        obj_price_per_meter = PricePerMeterFilter()
        obj_currency = Currency('//span[has-class("OfferTop-price")]')
        obj_location = Format('%s - %s',
                              CleanText('//p[@data-behat="adresseBien"]'),
                              CleanText('//p[has-class("OfferTop-loc")]'))
        obj_text = CleanText('//div[has-class("OfferDetails-content")]/p[1]')
        obj_phone = Regexp(Link('//a[has-class("OfferContact-btn--tel")]'),
                           r'tel:(.*)')

        def obj_photos(self):
            """Return one ``HousingPhoto`` per image of the offer slider.

            The ``640/480`` fragment of each ``src`` is replaced by
            ``800/600``.
            """
            return [
                HousingPhoto(
                    Attr('.', 'src')(img).replace('640/480', '800/600'))
                for img in self.xpath('//div[has-class("OfferSlider")]//img')
            ]

        def obj_date(self):
            """Return today's date.

            Computed on each call rather than once in the class body, so the
            value does not freeze at import time in a long-running process.
            """
            return datetime.date.today()

        def obj_utilities(self):
            """Tell whether utilities are included in the displayed price.

            :return: ``UTILITIES.INCLUDED`` when the price text contains
                     "charges comprises" (case-insensitive), otherwise
                     ``UTILITIES.EXCLUDED``.
            """
            price = CleanText('//p[has-class("OfferTop-price")]')(self)
            if "charges comprises" in price.lower():
                return UTILITIES.INCLUDED
            return UTILITIES.EXCLUDED

        obj_rooms = CleanDecimal(
            '//div[has-class("MiniData")]//p[has-class("MiniData-item")][2]',
            default=NotAvailable)
        obj_bedrooms = CleanDecimal(
            '//div[has-class("MiniData")]//p[has-class("MiniData-item")][3]',
            default=NotAvailable)

        def _electric_consumption(self):
            """Return the number embedded in the DPE image URL, or ``None``.

            The value is the first path segment (digits) of an image ``src``
            under the offer details matching ``https://dpe.foncia.net/<n>/``.
            ``None`` is returned when the image or the pattern is not found.
            """
            try:
                return CleanDecimal(
                    Regexp(
                        Attr('//div[has-class("OfferDetails-content")]//img',
                             'src'),
                        r'https://dpe.foncia.net\/(\d+)\/.*'))(self)
            except (RegexpError, XPathNotFound):
                return None

        def obj_DPE(self):
            """Convert the electric consumption into an ``ENERGY_CLASS`` member.

            :return: the member named after the letter whose inclusive upper
                     bound first covers the consumption (50 -> A, 90 -> B,
                     150 -> C, 230 -> D, 330 -> E, 450 -> F, above -> G), or
                     ``NotAvailable`` when the consumption is unknown or the
                     member does not exist.
            """
            consumption = self._electric_consumption()
            if consumption is None:
                return NotAvailable

            # (inclusive upper bound, letter), in increasing order.
            thresholds = (
                (50, "A"),
                (90, "B"),
                (150, "C"),
                (230, "D"),
                (330, "E"),
                (450, "F"),
            )
            letter = next(
                (name for bound, name in thresholds if consumption <= bound),
                "G")
            return getattr(ENERGY_CLASS, letter, NotAvailable)

        def obj_details(self):
            """Collect the free-form details shown on the page.

            :return: a dict holding ``"dispo"`` (availability date, today when
                     no date is found in the text), optionally
                     ``"priceMentions"`` and ``"agency"``, one sub-dict per
                     titled details column (``List--data`` rows as
                     title -> value, ``List--bullet`` rows as title -> True),
                     and ``"electric_consumption"`` when it can be read from
                     the DPE image URL.
            """
            # NOTE(review): the regexp captures a dd/mm/yyyy-looking string
            # while the fallback is an ISO date; confirm Date() parses the
            # captured form day-first.
            dispo = Date(
                Regexp(CleanText('//p[has-class("OfferTop-dispo")]'),
                       r'.* (\d\d\/\d\d\/\d\d\d\d)',
                       default=datetime.date.today().isoformat()))(self)
            # The expression above falls back on today's date and is never
            # given None as a default, so no None check is needed here.
            details = {"dispo": dispo}

            priceMentions = CleanText('//p[has-class("OfferTop-mentions")]',
                                      default=None)(self)
            if priceMentions is not None:
                details["priceMentions"] = priceMentions

            agency = CleanText('//p[has-class("OfferContact-address")]',
                               default=None)(self)
            if agency is not None:
                details["agency"] = agency

            for column in self.xpath(
                    '//div[has-class("OfferDetails-columnize")]/div'):
                category = CleanText(
                    './h3[has-class("OfferDetails-title--2")]',
                    default=None)(column)
                if not category:
                    # Columns without a title cannot be keyed; skip them.
                    continue

                entries = details[category] = {}

                # "title: value" rows.
                for row in column.xpath('.//ul[has-class("List--data")]/li'):
                    title = CleanText('.//span[has-class("List-data")]')(row)
                    value = CleanText('.//*[has-class("List-value")]')(row)
                    entries[title] = value

                # Plain bullet rows are stored as boolean flags.
                for row in column.xpath('.//ul[has-class("List--bullet")]/li'):
                    entries[CleanText('.')(row)] = True

            consumption = self._electric_consumption()
            if consumption is not None:
                details["electric_consumption"] = (
                    '{} kWhEP/m².an'.format(consumption))

            return details