Beispiel #1
0
    class get_job_advert(ItemElement):
        """Populate a ``BaseJobAdvert`` from a single advert document.

        Most fields are declared as filters; ``contract_type`` and ``pay``
        are set by ``parse`` from the ``icon-group`` definition list.
        """
        klass = BaseJobAdvert

        obj_id = Env('id')
        obj_url = BrowserURL('advert', id=Env('id'))

        # Title and job name are both read from the same heading.
        obj_title = CleanText('//div[@class="modal-body"]/h2')
        obj_job_name = CleanText('//div[@class="modal-body"]/h2')
        obj_description = CleanText('//div[has-class("description")]/p')
        obj_society_name = CleanText('//div[@class="media-body"]/h4')

        # Experience / formation: <li> items of the first "skill-list"
        # that follows the matching <h4> heading.
        obj_experience = Join(
            u'- ',
            '//h4[contains(text(), "Exp")]/following-sibling::ul[has-class("skill-list")][1]/li',
            newline=True, addBefore='\n- ')
        obj_formation = Join(
            u'- ',
            '//h4[contains(text(), "For")]/following-sibling::ul[has-class("skill-list")][1]/li',
            newline=True, addBefore='\n- ')

        # First and second paragraphs after the heading.
        obj_place = CleanText('//div[@class="modal-body"]/h2/following-sibling::p[1]')
        obj_publication_date = PoleEmploiDate(
            CleanText('//div[@class="modal-body"]/h2/following-sibling::p[2]'))

        def parse(self, el):
            """Set ``contract_type`` and ``pay`` from the ``<dt>``/``<dd>`` pairs.

            Each ``<dt>`` label is compared against the known captions; the
            value is read from the first following ``<dd>`` sibling.
            """
            for term in XPath('//dl[@class="icon-group"]/dt')(el):
                label = CleanText('.')(term)

                if label == u'Type de contrat':
                    contract = CleanText('./following-sibling::dd[1]')
                    self.obj.contract_type = contract(term)
                    continue

                if label == u'Salaire':
                    salary = Regexp(CleanText('./following-sibling::dd[1]'),
                                    u'Salaire : (.*)')
                    self.obj.pay = salary(term)
Beispiel #2
0
 def filter(self, el):
     """Format the product heading, detail items and worker links as text.

     The filters are applied to the first item of ``el``.
     """
     hero_xpath = "//div[@class='pvi-hero-product']"
     details_xpath = "//section[@class='pvi-productDetails']"

     # First <h1> of the hero rubric.
     heading = CleanText("(%s/div[@class='d-rubric-inner']/h1)[1]" % hero_xpath)
     # One "- " prefixed line per detail list item.
     detail_lines = Join(u'- ', "%s/ul/li" % details_xpath,
                         newline=True, addBefore='- ')
     # One "- Avec " prefixed line per worker link.
     worker_lines = Join(u'- Avec ',
                         "%s/div[@class='pvi-productDetails-workers']/a" % details_xpath,
                         newline=True, addBefore='- Avec ')

     layout = Format(u'\n%s\n\n%s%s\n', heading, detail_lines, worker_lines)
     return layout(el[0])
Beispiel #3
0
 def filter(self, el):
     """Format the product heading, sub-heading, specs and details as text.

     The filters are applied to the first item of ``el``.
     """
     hero_xpath = "//div[@class='pvi-hero-product']"
     details_xpath = "//section[@class='pvi-productDetails']"

     heading = CleanText("%s/div[@class='d-rubric-inner']/h1" % hero_xpath)
     sub_heading = CleanText("%s/div[@class='d-rubric-inner']/small" % hero_xpath)
     # Spec items come from the hero block, detail items from the section.
     spec_lines = Join(u'- %s\n', "%s/ul[@class='pvi-product-specs']/li" % hero_xpath)
     detail_lines = Join(u'- %s\n', "%s/ul/li" % details_xpath)

     layout = Format(u'%s %s\n\n%s%s\n\n',
                     heading, sub_heading, spec_lines, detail_lines)
     return layout(el[0])
Beispiel #4
0
    class get_job_advert(ItemElement):
        """Populate a ``BaseJobAdvert`` from the ``annonce-detail`` markup."""
        klass = BaseJobAdvert

        # Text paragraphs of the advert, HTML-cleaned and joined by newlines.
        obj_description = CleanText(
            Join('\n',
                 '//div[@id="annonce-detail"]/p[@class="text"]',
                 textCleaner=CleanHTML))
        obj_id = Env('_id')
        obj_url = BrowserURL('advert_page', _id=Env('_id'))

        # The regex is a raw string so ``\d`` reaches the regex engine
        # unchanged instead of being an invalid string escape.
        # NOTE(review): the pattern matches NN/NN/NNNN; if the site writes
        # day-first dates, confirm ``Date`` parses them in that order.
        obj_publication_date = Date(
            Regexp(
                CleanText('//div[@id="annonce-detail"]/p[@class="infos"]'),
                r'(\d{2}/\d{2}/\d{4})',
                default=NotAvailable),
            default=NotAvailable)
        obj_title = CleanText('//div[@id="annonce"]/div/div/h1')
        obj_society_name = CleanText('//section[@class="entp-resume"]/h1/a')

        # Contract type and place are the <dd> values that follow the <dt>
        # carrying the matching pictogram.
        obj_contract_type = CleanText(
            '//dl[@class="infos-annonce"]/dt[span[@class="picto picto-contrat-grey"]]/following-sibling::dd[1]'
        )
        obj_place = CleanText(
            '//dl[@class="infos-annonce"]/dt[span[@class="picto picto-geolocalisation-grey"]]/following-sibling::dd[1]'
        )
        # Paragraph just before the "infos" paragraph, without its
        # "Salaire : " prefix.
        obj_pay = CleanText(
            '//div[@id="annonce-detail"]/p[@class="infos"]/preceding-sibling::p[1]',
            replace=[('Salaire : ', '')])
Beispiel #5
0
    def get_roadmap(self):
        """Yield ``RoadStep`` objects built from the first transport roadmap.

        ``<li>`` items whose class is ``odd`` close the step in progress
        (end time and arrival are set, then it is yielded) and open a new
        one. Items without any attribute fill in the line and duration of
        the step in progress. The step still open after the last item is
        discarded without being yielded.
        """
        current = None
        items = self.doc.xpath(
            '(//ol[@class="trajet_feuilleDeRoute transport"])[1]/li')

        for item in items:
            attributes = item.attrib

            if attributes.get('class') == 'odd':
                if current:
                    # This item is the end point of the step in progress.
                    current.end_time = Time(
                        CleanText('./div/div[has-class("temps")]'))(item)
                    current.arrival = CleanText(
                        './div/div/div/div[@class="step_infos clearfix"]',
                        default=None)(item)
                    yield current

                # ...and the starting point of the next one.
                current = RoadStep()
                current.start_time = Time(
                    CleanText('./div/div[has-class("temps")]'))(item)
                current.departure = CleanText(
                    './div/div/div/div[@class="step_infos clearfix"]',
                    default=None)(item)

            elif not attributes:
                # Try the transport block first, then the step infos, and
                # finally the joined list items.
                line = CleanText(
                    './div/div/div/div/div/div[@class="transport"]',
                    default=None)(item)
                if not line:
                    line = CleanText(
                        './div/div/div/div[@class="step_infos clearfix"]',
                        default=None)(item)
                if not line:
                    line = Join('\n', './div/div/div/div/div/ul/li/text()')(item)
                current.line = line

                current.duration = RoadMapDuration(
                    CleanText('./div/div[has-class("temps")]'))(item)

        del current
Beispiel #6
0
    class get_recipe(ItemElement):
        """Populate a ``Recipe`` from a recipe document.

        The regex patterns are raw strings so that ``\\d`` reaches the regex
        engine unchanged instead of being an invalid string escape.
        """
        klass = Recipe

        obj_id = Env('_id')
        obj_title = CleanText('//h1')

        # Both durations are the integer that precedes " min".
        obj_preparation_time = Type(
            Regexp(CleanText('//li[@class="time"]/span'), r".* (\d*) min"),
            type=int)

        obj_cooking_time = Type(
            Regexp(CleanText('//li[@class="time-cooking"]/span'), r".* (\d*) min"),
            type=int)

        def obj_nb_person(self):
            """Return ``[count]`` from "pour N personnes", else ``NotAvailable``."""
            nb_pers = Regexp(CleanText('//div[@class="row ingredients"]/div/p'),
                             r'.*pour (\d+) personnes', default=0)(self)
            return [nb_pers] if nb_pers else NotAvailable

        def obj_ingredients(self):
            """Return the cleaned text of every ingredient list item."""
            items = XPath('//ul[@class="ingredientsList"]/li',
                          default=[])(self)
            return [CleanText('.')(item) for item in items]

        obj_instructions = Join(u'\n- ', '//div[@class="recipe-prepa"]/ol/li',
                                newline=True, addBefore='- ')

        # Thumbnail and picture both point at the same image source.
        obj_thumbnail_url = CleanText('//div[has-class("toprecipeImage")]/img/@src',
                                      default=NotAvailable)
        obj_picture_url = CleanText('//div[has-class("toprecipeImage")]/img/@src',
                                    default=NotAvailable)
Beispiel #7
0
    class get_job_advert(ItemElement):
        """Populate a ``BaseJobAdvert`` from the tables inside ``td.Contenu``.

        All fields are declared as filters; no custom parsing is done here.
        """
        klass = BaseJobAdvert

        obj_id = Env('id')
        obj_url = BrowserURL('advert_page', id=Env('id'))
        obj_society_name = CleanText(
            '//td[@class="Contenu"]/table[4]/tr[1]/td[1]/a')
        obj_title = CleanText('//td[@class="Titre15"]')

        # Description: the rows following the "Détails :" row of the third
        # table, then the HTML-cleaned content of the second table.
        obj_description = Format(
            '%s\n%s',
            Join(
                '\n',
                u'//td[@class="Contenu"]/table[3]/tr[td/text()="Détails :"]/following-sibling::tr',
                textCleaner=CleanHTML),
            CleanHTML('//td[@class="Contenu"]/table[2]'))

        # The "-- Indifférent --" placeholder is replaced by an empty string
        # in the job name, contract type and place.
        obj_job_name = CleanText(
            u'//td[@class="Contenu"]/table[3]/tr/td[text()="Poste :"]/following-sibling::td',
            replace=[(u'-- Indifférent --', u'')])

        obj_contract_type = CleanText(CleanHTML(
            u'//td[@class="Contenu"]/table[3]/tr/td[text()="Contrat :"]/following-sibling::td',
            default=u''),
                                      replace=[(u'-- Indifférent --', u'')])

        obj_pay = CleanText(
            u'//td[@class="Contenu"]/table[3]/tr/td[contains(text(), "Rémunération")]/following-sibling::td',
            default=u'')

        # The "Lieu de travail : " prefix is also removed from the place.
        obj_place = CleanText(
            u'//td[@class="Contenu"]/table[3]/tr/td[contains(text(), "Région")]/following-sibling::td',
            default=u'',
            replace=[(u'-- Indifférent --', u''),
                     (u'Lieu de travail : ', u'')])
Beispiel #8
0
        class item(ItemElement):
            """Build a ``BaseAudio`` from one dictionary entry."""
            klass = BaseAudio

            def condition(self):
                """Accept the entry only when its ``path_mp3`` value is truthy."""
                return Dict('path_mp3')(self)

            obj_id = BaseAudioIdFilter(
                Format(u'%s.%s', Env('radio_id'), Dict('nid')))
            obj_format = u'mp3'
            obj_ext = u'mp3'

            obj_title = Format(u'%s : %s',
                               Dict('title_emission'), Dict('title_diff'))
            obj_description = Dict('desc_emission', default=u'')

            obj_author = Join(u', ', Dict('personnes', default=u''))
            obj_url = Dict('path_mp3')

            def obj_thumbnail(self):
                """Return a ``Thumbnail`` for ``path_img_emission``, if present.

                Returns ``None`` when the entry has no such key.
                """
                if 'path_img_emission' not in self.el:
                    return None

                image = Thumbnail(Dict('path_img_emission')(self))
                image.url = image.id
                return image

            def obj_duration(self):
                """Return ``fin - debut`` as a ``timedelta`` in seconds.

                Returns ``None`` when either bound is missing or falsy.
                """
                fin = Dict('fin')(self)
                debut = Dict('debut')(self)
                if not debut or not fin:
                    return None

                elapsed = int(fin) - int(debut)
                return timedelta(seconds=elapsed)
Beispiel #9
0
 def obj_iban(self):
     """Return the IBAN found on the asynchronously loaded ``iban`` page.

     When the page URL contains ``RibPdf`` the page's own ``get_iban`` is
     used. Otherwise the IBAN is rebuilt by joining every upper-case
     alphanumeric word (other than ``IBAN``) of the matching table cell;
     ``NotAvailable`` is returned when nothing is found.
     """
     page = Async('iban').loaded_page(self)
     if 'RibPdf' in page.url:
         return page.get_iban()

     cell_text = CleanText(
         '//td[has-class("ColonneCode")][contains(text(), "IBAN")]'
     )
     chunks = Regexp(cell_text, r'\b((?!IBAN)[A-Z0-9]+)\b', nth='*')
     iban = Join('', chunks)(page.doc)
     return iban or NotAvailable
Beispiel #10
0
    class get_job_advert(ItemElement):
        """Populate a ``BaseJobAdvert`` from an advert document.

        All fields are declared as filters; several of them select elements
        by their ``itemprop`` attribute.
        """
        klass = BaseJobAdvert

        obj_id = Env('_id')
        obj_url = BrowserURL('advert', _id=Env('_id'))
        obj_title = CleanText('//div[@id="jobcopy"]/h1[@itemprop="title"]')
        obj_description = CleanHTML('//div[@id="jobBodyContent"]')
        # Every <dd> whose class starts with "multipledd" contributes to the
        # contract type.
        obj_contract_type = Join('%s ',
                                 '//dd[starts-with(@class, "multipledd")]')
        obj_society_name = CleanText('//dd[@itemprop="hiringOrganization"]')
        obj_place = CleanText('//span[@itemprop="jobLocation"]')
        obj_pay = CleanText('//span[@itemprop="baseSalary"]')
        obj_formation = CleanText('//span[@itemprop="educationRequirements"]')
        obj_experience = CleanText('//span[@itemprop="qualifications"]')
Beispiel #11
0
    class get_job_advert(ItemElement):
        """Populate a ``BaseJobAdvert`` from the ``<div>`` matching its id.

        ``parse`` re-targets ``self.el`` to that ``<div>`` and fills the
        ``url`` / ``id`` environment entries used by ``obj_url``.
        """
        klass = BaseJobAdvert

        obj_url = Format('%s#%s', Env('url'), Env('id'))
        obj_description = Join(
            '\r\n',
            'div/fieldset/*[(@class="titreParagraphe" or @class="normal")]',
            textCleaner=CleanHTML)

        # Title and job name are both read from the same span.
        obj_title = CleanText('div/span[@class="intituleposte"]')
        obj_job_name = CleanText('div/span[@class="intituleposte"]')
        obj_society_name = Format(
            'CCI %s', CleanText('div/span[@class="crci crcititle"]'))
        obj_publication_date = DateTime(
            CleanText('div/fieldset/p[@class="dateOffre"]'), dayfirst=True)

        def parse(self, el):
            """Select the advert's own ``<div>`` and set the environment."""
            advert_id = self.obj.id
            matches = el.xpath('//div[@id=$id]/div', id=advert_id)
            self.el = matches[0]
            self.env['url'] = self.page.url
            self.env['id'] = advert_id
Beispiel #12
0
        class item(ItemElement):
            """Build a ``Housing`` from the current listing element."""
            klass = Housing

            def validate(self, obj):
                """Keep only objects whose id could be extracted."""
                return obj.id is not None

            # Second regex group of the link, i.e. the path part before ".htm".
            obj_id = Regexp(
                Link('.'),
                '//www.leboncoin.fr/(ventes_immobilieres|locations|colocations)/(.*).htm.*',
                '\\2',
                default=None)

            obj_title = CleanText('./@title|./section/p[@class="item_title"]')
            obj_cost = CleanDecimal(
                './section[@class="item_infos"]/*[@class="item_price"]',
                replace_dots=(',', '.'),
                default=Decimal(0))
            # Currency symbol found in the price text; defaults to the euro.
            obj_currency = Regexp(
                CleanText('./section[@class="item_infos"]/*[@class="item_price"]'),
                '.*([%s%s%s])' % (u'€', u'$', u'£'),
                default=u'€')
            obj_text = Join(' - ', './/p[@class="item_supp"]')

            def obj_date(self):
                """Return the listing date, or ``NotAvailable`` when absent.

                "Aujourd'hui" and "Hier" are replaced by today's and
                yesterday's dates, then the ``DATE_TRANSLATE_FR``
                substitutions are applied before the text is parsed.
                """
                relative_days = [
                    ('Aujourd\'hui', str(date.today())),
                    ('Hier', str(date.today() - timedelta(1))),
                ]
                raw = CleanText(
                    './section[@class="item_infos"]/aside/p[@class="item_supp"]/text()',
                    replace=relative_days)(self)

                if not raw:
                    return NotAvailable

                for french, english in DATE_TRANSLATE_FR:
                    raw = french.sub(english, raw)

                # The parsing filter reads its input from the environment.
                self.env['tmp'] = raw
                return DateTime(Env('tmp'), LinearDateGuesser())(self)

            def obj_photos(self):
                """Return a list with the listing image, or an empty list."""
                src = Attr('./div[@class="item_image"]/span/span/img',
                           'src',
                           default=None)(self)
                return [HousingPhoto(src)] if src else []
Beispiel #13
0
    class get_recipe(ItemElement):
        """Populate a ``Recipe`` from a recipe document."""
        klass = Recipe

        obj_id = Env('_id')
        obj_title = CleanText('//h1')

        class obj_picture(ItemElement):
            """Build the recipe ``BaseImage`` from the ``shareimg`` image."""
            klass = BaseImage

            # The source is prefixed with "http:"; a missing source leaves
            # the bare prefix, which ``validate`` rejects.
            obj_url = Format(
                'http:%s',
                CleanText('//img[@id="shareimg" and @src!=""]/@src',
                          default=None))
            obj_thumbnail = Eval(Thumbnail, obj_url)

            def validate(self, obj):
                """Reject images whose URL is only the ``http:`` prefix."""
                return obj.url != 'http:'

        def obj_preparation_time(self):
            """Return the preparation time in whole minutes."""
            duration = CuisineazDuration(CleanText(
                '//span[@id="ContentPlaceHolder_LblRecetteTempsPrepa"]'))(self)
            minutes = duration.total_seconds() / 60
            return int(minutes)

        def obj_cooking_time(self):
            """Return the cooking time in whole minutes."""
            duration = CuisineazDuration(CleanText(
                '//span[@id="ContentPlaceHolder_LblRecetteTempsCuisson"]'))(self)
            minutes = duration.total_seconds() / 60
            return int(minutes)

        def obj_nb_person(self):
            """Return ``[text]`` of the servings span, else ``NotAvailable``."""
            servings = CleanText(
                '//span[@id="ContentPlaceHolder_LblRecetteNombre"]')(self)
            if not servings:
                return NotAvailable
            return [servings]

        def obj_ingredients(self):
            """Return the cleaned text of every ingredient list item."""
            items = XPath(
                '//section[has-class("recipe_ingredients")]/ul/li')(self)
            return [CleanText('.')(item) for item in items]

        obj_instructions = Join('\n\n - ',
                                '//div[@id="preparation"]/span/p/text()',
                                addBefore=' - ')
Beispiel #14
0
        class item(ItemElement):
            """Build an ``ArteSiteVideo`` from one article element."""
            klass = ArteSiteVideo

            def condition(self):
                """Accept elements with exactly one ``article-secondary`` div."""
                return len(XPath('.//div[@class="article-secondary "]')(self)) == 1

            obj__site = SITE.CINEMA.get('id')
            # Id is "<site>.<path>", where the path is the link with any
            # leading "http://...arte.tv" host removed. The pattern is a raw
            # string so that ``\.`` reaches the regex engine unchanged
            # instead of being an invalid string escape.
            obj_id = Format('%s.%s', Field('_site'),
                            Regexp(CleanText('./div/div/a/@href|./div/a/@href'),
                                   r'(http://.*\.arte\.tv)?/(.*)',
                                   '\\2'))
            obj_title = Join(u' - ',
                             './/div[@class="article-secondary "]/div/div')

            def obj_thumbnail(self):
                """Return a ``Thumbnail`` built from the ``<noscript>`` image."""
                url = CleanText('.//div[@class="article-primary "]/div[has-class("field-thumbnail")]/span/noscript/img/@src')(self)
                thumbnail = Thumbnail(url)
                thumbnail.url = thumbnail.id
                return thumbnail
Beispiel #15
0
    class get_recipe(ItemElement):
        """Populate a ``Recipe`` from the ``ficheRecette`` document."""
        klass = Recipe

        obj_id = Env('_id')
        obj_title = CleanText('//div[@id="ficheRecette"]/h1')

        # Picture and thumbnail both use the non-empty "shareimg" source.
        obj_picture_url = CleanText(
            '//img[@id="shareimg" and @src!=""]/@src', default=None)

        obj_thumbnail_url = CleanText(
            '//img[@id="shareimg" and @src!=""]/@src', default=None)

        def obj_preparation_time(self):
            """Return the preparation time in whole minutes."""
            duration = CuisineazDuration(CleanText(
                '//span[@id="ctl00_ContentPlaceHolder_LblRecetteTempsPrepa"]'
            ))(self)
            minutes = duration.total_seconds() / 60
            return int(minutes)

        def obj_cooking_time(self):
            """Return the cooking time in whole minutes."""
            duration = CuisineazDuration(CleanText(
                '//span[@id="ctl00_ContentPlaceHolder_LblRecetteTempsCuisson"]'
            ))(self)
            minutes = duration.total_seconds() / 60
            return int(minutes)

        def obj_nb_person(self):
            """Return ``[text]`` of the servings span, else ``NotAvailable``."""
            servings = CleanText(
                '//span[@id="ctl00_ContentPlaceHolder_LblRecetteNombre"]')(self)
            if not servings:
                return NotAvailable
            return [servings]

        def obj_ingredients(self):
            """Return the cleaned text of every ingredient list item."""
            items = XPath('//div[@id="ingredients"]/ul/li')(self)
            return [CleanText('.')(item) for item in items]

        obj_instructions = Join('\n\n - ',
                                '//div[@id="preparation"]/span/p/text()',
                                addBefore=' - ')
Beispiel #16
0
 def obj_instructions(self):
     """Join the ``displayValue`` of every direction into a starred list."""
     steps = []
     for direction in Dict('directions')(self):
         steps.append(Dict('displayValue')(direction))

     starred = Join('\n * ', steps, addBefore=' * ', addAfter='\n')
     return starred(self)
Beispiel #17
0
        class item(ItemElement):
            """Build a ``Housing`` from the current listing element."""
            klass = Housing

            def validate(self, obj):
                """Keep only objects whose id could be extracted."""
                return obj.id is not None

            obj_url = Format(u'http:%s', Link('.'))
            # Second regex group of the link, i.e. the path part before ".htm".
            obj_id = Regexp(
                Link('.'),
                '//www.leboncoin.fr/(ventes_immobilieres|locations|colocations)/(.*).htm.*',
                '\\2',
                default=None)
            obj_type = Env('query_type')

            def obj_advert_type(self):
                """Return PROFESSIONAL when an ``ispro`` span exists, else PERSONAL."""
                ispro = XPath('.//span[has-class("ispro")]',
                              default=None)(self)
                if ispro:
                    return ADVERT_TYPES.PROFESSIONAL
                return ADVERT_TYPES.PERSONAL

            obj_house_type = NotAvailable

            obj_title = CleanText('./@title|./section/p[@class="item_title"]')
            obj_cost = CleanDecimal(
                './section[@class="item_infos"]/*[@class="item_price"]/text()',
                replace_dots=(',', '.'),
                default=Decimal(0))
            obj_location = CleanText(
                './section[@class="item_infos"]/*[@itemtype="http://schema.org/Place"]/text()'
            )
            obj_currency = Currency(
                './section[@class="item_infos"]/*[@class="item_price"]')

            def obj_utilities(self):
                """Classify utilities from the text following the price.

                "C.C." maps to ``INCLUDED``, "H.C." to ``EXCLUDED`` and
                anything else to ``UNKNOWN``.
                """
                # Raw string so that ``\d`` reaches the regex engine
                # unchanged instead of being an invalid string escape.
                utilities = Regexp(
                    CleanText(
                        './section[@class="item_infos"]/*[@class="item_price"]'),
                    r'\d+ [%s%s%s](.*)' % (u'€', u'$', u'£'),
                    default=u'')(self)
                if "C.C." in utilities:
                    return UTILITIES.INCLUDED
                elif "H.C." in utilities:
                    return UTILITIES.EXCLUDED
                else:
                    return UTILITIES.UNKNOWN

            obj_text = Join(' - ', './/p[@class="item_supp"]')

            def obj_date(self):
                """Return the listing date, or ``NotAvailable`` when absent.

                "Aujourd'hui" and "Hier" are replaced by today's and
                yesterday's dates, then the ``DATE_TRANSLATE_FR``
                substitutions are applied before the text is parsed.
                """
                _date = CleanText(
                    './section[@class="item_infos"]/aside/p[@class="item_supp"]/text()',
                    replace=[('Aujourd\'hui', str(date.today())),
                             ('Hier', str(
                                 (date.today() - timedelta(1))))])(self)

                if not _date:
                    return NotAvailable

                for fr, en in DATE_TRANSLATE_FR:
                    _date = fr.sub(en, _date)

                # The parsing filter reads its input from the environment.
                self.env['tmp'] = _date
                return DateTime(Env('tmp'), LinearDateGuesser())(self)

            def obj_photos(self):
                """Return a list with the listing photo, or an empty list.

                The lazy-load source has "ad-thumb" replaced by "ad-image".
                """
                photos = []
                url = Attr(
                    './div[@class="item_image"]/span/span[@class="lazyload"]',
                    'data-imgsrc',
                    default=None)(self)
                if url:
                    photos.append(
                        HousingPhoto(url.replace("ad-thumb", "ad-image")))
                return photos