class get_job_advert(ItemElement):
    """Build a ``BaseJobAdvert`` from the advert markup.

    Most fields are declared as filters over the ``modal-body`` block; the
    contract type and the pay are filled in by :meth:`parse`, which walks
    the ``icon-group`` definition list.
    """

    klass = BaseJobAdvert

    obj_id = Env('id')
    obj_url = BrowserURL('advert', id=Env('id'))

    # The title and the job name are both read from the same <h2> heading.
    obj_title = CleanText('//div[@class="modal-body"]/h2')
    obj_job_name = CleanText('//div[@class="modal-body"]/h2')
    obj_description = CleanText('//div[has-class("description")]/p')
    obj_society_name = CleanText('//div[@class="media-body"]/h4')

    # Experience / formation: <li> items of the first "skill-list" <ul> that
    # follows the <h4> whose text contains "Exp" / "For".
    obj_experience = Join(
        u'- ',
        '//h4[contains(text(), "Exp")]/following-sibling::ul[has-class("skill-list")][1]/li',
        newline=True,
        addBefore='\n- ',
    )
    obj_formation = Join(
        u'- ',
        '//h4[contains(text(), "For")]/following-sibling::ul[has-class("skill-list")][1]/li',
        newline=True,
        addBefore='\n- ',
    )

    # The first paragraph after the heading is used as the place, the second
    # one is handed to PoleEmploiDate to get the publication date.
    obj_place = CleanText('//div[@class="modal-body"]/h2/following-sibling::p[1]')
    obj_publication_date = PoleEmploiDate(
        CleanText('//div[@class="modal-body"]/h2/following-sibling::p[2]')
    )

    def parse(self, el):
        """Set ``contract_type`` and ``pay`` from the ``icon-group`` list.

        Each ``<dt>`` is matched on its cleaned text; the value is read from
        the first ``<dd>`` that follows it.
        """
        for term in XPath('//dl[@class="icon-group"]/dt')(el):
            label = CleanText('.')(term)
            if label == u'Type de contrat':
                contract = CleanText('./following-sibling::dd[1]')
                self.obj.contract_type = contract(term)
            elif label == u'Salaire':
                # Keep only what follows the "Salaire : " prefix.
                salary = Regexp(CleanText('./following-sibling::dd[1]'),
                                u'Salaire : (.*)')
                self.obj.pay = salary(term)
def filter(self, el):
    """Return a text summary built from the first element of ``el``.

    The result is formatted as ``'\\n%s\\n\\n%s%s\\n'`` from: the first
    ``<h1>`` of the hero header, the ``<li>`` items of the product details
    section joined as a ``- `` list, and the links of the
    ``pvi-productDetails-workers`` block joined as a ``- Avec `` list.
    """
    header = "//div[@class='pvi-hero-product']"
    section = "//section[@class='pvi-productDetails']"

    title_xpath = "(%s/div[@class='d-rubric-inner']/h1)[1]" % header
    details_xpath = "%s/ul/li" % section
    workers_xpath = "%s/div[@class='pvi-productDetails-workers']/a" % section

    summary = Format(
        u'\n%s\n\n%s%s\n',
        CleanText(title_xpath),
        Join(u'- ', details_xpath, newline=True, addBefore='- '),
        Join(u'- Avec ', workers_xpath, newline=True, addBefore='- Avec '),
    )
    return summary(el[0])
def filter(self, el):
    """Return a text summary built from the first element of ``el``.

    The result is formatted as ``'%s %s\\n\\n%s%s\\n\\n'`` from: the
    ``<h1>`` and ``<small>`` of the hero header, the ``pvi-product-specs``
    items of the header, and the ``<li>`` items of the product details
    section.
    """
    header = "//div[@class='pvi-hero-product']"
    section = "//section[@class='pvi-productDetails']"

    heading_xpath = "%s/div[@class='d-rubric-inner']/h1" % header
    subheading_xpath = "%s/div[@class='d-rubric-inner']/small" % header
    specs_xpath = "%s/ul[@class='pvi-product-specs']/li" % header
    details_xpath = "%s/ul/li" % section

    summary = Format(
        u'%s %s\n\n%s%s\n\n',
        CleanText(heading_xpath),
        CleanText(subheading_xpath),
        Join(u'- %s\n', specs_xpath),
        Join(u'- %s\n', details_xpath),
    )
    return summary(el[0])
class get_job_advert(ItemElement):
    """Build a ``BaseJobAdvert`` from the ``annonce`` / ``annonce-detail`` markup."""

    klass = BaseJobAdvert

    # Description: the "text" paragraphs of the detail block, each cleaned
    # with CleanHTML, joined with newlines and then passed through CleanText.
    obj_description = CleanText(
        Join('\n',
             '//div[@id="annonce-detail"]/p[@class="text"]',
             textCleaner=CleanHTML)
    )

    obj_id = Env('_id')
    obj_url = BrowserURL('advert_page', _id=Env('_id'))

    # Publication date: first dd/mm/yyyy-shaped token found in the "infos"
    # paragraph; NotAvailable when nothing matches or the date filter falls
    # back to its default.
    # The pattern is a raw string so that ``\d`` is not an invalid string
    # escape (a SyntaxWarning on recent Python versions).
    obj_publication_date = Date(
        Regexp(CleanText('//div[@id="annonce-detail"]/p[@class="infos"]'),
               r'(\d{2}/\d{2}/\d{4})',
               default=NotAvailable),
        default=NotAvailable,
    )

    obj_title = CleanText('//div[@id="annonce"]/div/div/h1')
    obj_society_name = CleanText('//section[@class="entp-resume"]/h1/a')

    # Contract type and place: the <dd> following the <dt> that carries the
    # matching pictogram <span>.
    obj_contract_type = CleanText(
        '//dl[@class="infos-annonce"]/dt[span[@class="picto picto-contrat-grey"]]/following-sibling::dd[1]'
    )
    obj_place = CleanText(
        '//dl[@class="infos-annonce"]/dt[span[@class="picto picto-geolocalisation-grey"]]/following-sibling::dd[1]'
    )

    # Pay: the paragraph right before the "infos" one, with the
    # "Salaire : " prefix removed.
    obj_pay = CleanText(
        '//div[@id="annonce-detail"]/p[@class="infos"]/preceding-sibling::p[1]',
        replace=[('Salaire : ', '')],
    )
def get_roadmap(self):
    """Yield the ``RoadStep`` objects described by the first route sheet.

    ``<li>`` items whose ``class`` attribute is exactly ``odd`` delimit the
    steps: each one closes the step in progress (end time and arrival),
    which is then yielded, and opens a new one (start time and departure).
    Items without any attribute provide the line and the duration of the
    step in progress.  The step opened by the last ``odd`` item is never
    yielded.
    """
    time_xpath = './div/div[has-class("temps")]'
    infos_xpath = './div/div/div/div[@class="step_infos clearfix"]'

    current = None
    for item in self.doc.xpath(
            '(//ol[@class="trajet_feuilleDeRoute transport"])[1]/li'):

        if item.attrib.get('class') == 'odd':
            if current:
                # This boundary closes the step in progress.
                current.end_time = Time(CleanText(time_xpath))(item)
                current.arrival = CleanText(infos_xpath, default=None)(item)
                yield current

            # ...and opens the next one.
            current = RoadStep()
            current.start_time = Time(CleanText(time_xpath))(item)
            current.departure = CleanText(infos_xpath, default=None)(item)

        if not item.attrib:
            # Line: first non-empty value among the "transport" block, the
            # step infos block, and the joined list item texts.
            current.line = (
                CleanText('./div/div/div/div/div/div[@class="transport"]',
                          default=None)(item)
                or CleanText(infos_xpath, default=None)(item)
                or Join('\n', './div/div/div/div/div/ul/li/text()')(item)
            )
            current.duration = RoadMapDuration(CleanText(time_xpath))(item)

    del current
class get_recipe(ItemElement):
    """Build a ``Recipe`` from a recipe page.

    Regular expressions are written as raw strings so that ``\\d`` is not an
    invalid string escape (a SyntaxWarning on recent Python versions).
    """

    klass = Recipe

    obj_id = Env('_id')
    obj_title = CleanText('//h1')

    # Both durations are the integer captured before " min" in the matching
    # <li> of the page.
    obj_preparation_time = Type(
        Regexp(CleanText('//li[@class="time"]/span'), r".* (\d*) min"),
        type=int,
    )
    obj_cooking_time = Type(
        Regexp(CleanText('//li[@class="time-cooking"]/span'), r".* (\d*) min"),
        type=int,
    )

    def obj_nb_person(self):
        """Return ``[count]`` from "pour N personnes", else ``NotAvailable``.

        The count is returned as captured by the regular expression (it is
        not converted); when the pattern does not match, the filter default
        ``0`` is falsy and ``NotAvailable`` is returned.
        """
        nb_pers = Regexp(
            CleanText('//div[@class="row ingredients"]/div/p'),
            r'.*pour (\d+) personnes',
            default=0,
        )(self)
        return [nb_pers] if nb_pers else NotAvailable

    def obj_ingredients(self):
        """Return the cleaned text of every ``ingredientsList`` item."""
        items = XPath('//ul[@class="ingredientsList"]/li', default=[])(self)
        return [CleanText('.')(item) for item in items]

    obj_instructions = Join(
        u'\n- ',
        '//div[@class="recipe-prepa"]/ol/li',
        newline=True,
        addBefore='- ',
    )

    # Thumbnail and picture share the same image source.
    obj_thumbnail_url = CleanText(
        '//div[has-class("toprecipeImage")]/img/@src', default=NotAvailable)
    obj_picture_url = CleanText(
        '//div[has-class("toprecipeImage")]/img/@src', default=NotAvailable)
class get_job_advert(ItemElement):
    """Build a ``BaseJobAdvert`` from the tables nested in ``td.Contenu``."""

    klass = BaseJobAdvert

    obj_id = Env('id')
    obj_url = BrowserURL('advert_page', id=Env('id'))

    obj_society_name = CleanText(
        '//td[@class="Contenu"]/table[4]/tr[1]/td[1]/a')
    obj_title = CleanText('//td[@class="Titre15"]')

    # Description: the rows following the "Détails :" row of the third
    # table, then the whole second table, separated by a newline.
    obj_description = Format(
        '%s\n%s',
        Join('\n',
             u'//td[@class="Contenu"]/table[3]/tr[td/text()="Détails :"]/following-sibling::tr',
             textCleaner=CleanHTML),
        CleanHTML('//td[@class="Contenu"]/table[2]'),
    )

    # In the fields below, the "-- Indifférent --" placeholder is replaced
    # by an empty string.
    obj_job_name = CleanText(
        u'//td[@class="Contenu"]/table[3]/tr/td[text()="Poste :"]/following-sibling::td',
        replace=[(u'-- Indifférent --', u'')],
    )
    obj_contract_type = CleanText(
        CleanHTML(
            u'//td[@class="Contenu"]/table[3]/tr/td[text()="Contrat :"]/following-sibling::td',
            default=u''),
        replace=[(u'-- Indifférent --', u'')],
    )
    obj_pay = CleanText(
        u'//td[@class="Contenu"]/table[3]/tr/td[contains(text(), "Rémunération")]/following-sibling::td',
        default=u'',
    )
    # The "Lieu de travail : " prefix is removed from the place as well.
    obj_place = CleanText(
        u'//td[@class="Contenu"]/table[3]/tr/td[contains(text(), "Région")]/following-sibling::td',
        default=u'',
        replace=[(u'-- Indifférent --', u''),
                 (u'Lieu de travail : ', u'')],
    )
class item(ItemElement):
    """Build a ``BaseAudio`` from one dictionary entry."""

    klass = BaseAudio

    def condition(self):
        """Return the entry's ``path_mp3`` value, used as the condition."""
        return Dict('path_mp3')(self)

    # Identifier: "<radio_id>.<nid>" passed through BaseAudioIdFilter.
    obj_id = BaseAudioIdFilter(
        Format(u'%s.%s', Env('radio_id'), Dict('nid')))
    obj_format = u'mp3'
    obj_ext = u'mp3'
    obj_title = Format(
        u'%s : %s', Dict('title_emission'), Dict('title_diff'))
    obj_description = Dict('desc_emission', default=u'')
    obj_author = Join(u', ', Dict('personnes', default=u''))
    obj_url = Dict('path_mp3')

    def obj_thumbnail(self):
        """Return a ``Thumbnail`` for ``path_img_emission``, if present.

        Returns ``None`` when the entry has no ``path_img_emission`` key.
        """
        if 'path_img_emission' not in self.el:
            return None

        image = Thumbnail(Dict('path_img_emission')(self))
        image.url = image.id
        return image

    def obj_duration(self):
        """Return ``fin - debut`` as a ``timedelta`` in seconds.

        Returns ``None`` when either bound is missing or falsy.
        """
        end = Dict('fin')(self)
        start = Dict('debut')(self)
        if not (start and end):
            return None
        return timedelta(seconds=int(end) - int(start))
def obj_iban(self):
    """Return the IBAN read from the asynchronously loaded ``iban`` page.

    When the page URL contains ``RibPdf``, the page's own ``get_iban()`` is
    used.  Otherwise the IBAN is rebuilt from the ``ColonneCode`` cell whose
    text contains "IBAN"; ``NotAvailable`` is returned when that yields an
    empty value.
    """
    rib_page = Async('iban').loaded_page(self)
    if 'RibPdf' in rib_page.url:
        return rib_page.get_iban()

    iban_cell = CleanText(
        '//td[has-class("ColonneCode")][contains(text(), "IBAN")]')
    # Every upper-case alphanumeric word that does not start with "IBAN".
    iban_parts = Regexp(iban_cell, r'\b((?!IBAN)[A-Z0-9]+)\b', nth='*')
    iban = Join('', iban_parts)(rib_page.doc)
    return iban or NotAvailable
class get_job_advert(ItemElement):
    """Build a ``BaseJobAdvert``, mostly from ``itemprop``-annotated nodes."""

    klass = BaseJobAdvert

    obj_id = Env('_id')
    obj_url = BrowserURL('advert', _id=Env('_id'))

    obj_title = CleanText(
        '//div[@id="jobcopy"]/h1[@itemprop="title"]')
    obj_description = CleanHTML(
        '//div[@id="jobBodyContent"]')

    # Contract type: every <dd> whose class starts with "multipledd",
    # combined by Join with the '%s ' pattern.
    obj_contract_type = Join(
        '%s ', '//dd[starts-with(@class, "multipledd")]')

    obj_society_name = CleanText(
        '//dd[@itemprop="hiringOrganization"]')
    obj_place = CleanText(
        '//span[@itemprop="jobLocation"]')
    obj_pay = CleanText(
        '//span[@itemprop="baseSalary"]')
    obj_formation = CleanText(
        '//span[@itemprop="educationRequirements"]')
    obj_experience = CleanText(
        '//span[@itemprop="qualifications"]')
class get_job_advert(ItemElement):
    """Build a ``BaseJobAdvert`` from the ``<div>`` matching the advert id.

    The field XPaths are relative: :meth:`parse` first re-targets ``self.el``
    to the advert's own block.
    """

    klass = BaseJobAdvert

    # URL: the page URL with the advert id as fragment (both set in parse()).
    obj_url = Format('%s#%s', Env('url'), Env('id'))
    obj_description = Join(
        '\r\n',
        'div/fieldset/*[(@class="titreParagraphe" or @class="normal")]',
        textCleaner=CleanHTML,
    )
    # Title and job name are read from the same "intituleposte" span.
    obj_title = CleanText('div/span[@class="intituleposte"]')
    obj_job_name = CleanText('div/span[@class="intituleposte"]')
    obj_society_name = Format(
        'CCI %s', CleanText('div/span[@class="crci crcititle"]'))
    obj_publication_date = DateTime(
        CleanText('div/fieldset/p[@class="dateOffre"]'), dayfirst=True)

    def parse(self, el):
        """Point ``self.el`` at the advert block and fill the environment.

        The block is the first ``<div>`` child of the element whose ``id``
        equals ``self.obj.id``; an ``IndexError`` is raised when there is
        none.
        """
        advert_id = self.obj.id
        blocks = el.xpath('//div[@id=$id]/div', id=advert_id)
        self.el = blocks[0]
        self.env['url'] = self.page.url
        self.env['id'] = advert_id
class item(ItemElement):
    """Build a ``Housing`` from one listing entry."""

    klass = Housing

    def validate(self, obj):
        """Accept the object only when an id could be extracted."""
        return obj.id is not None

    # Identifier: the path segment between the category and ".htm" in the
    # entry link; None when the link does not match.
    obj_id = Regexp(
        Link('.'),
        '//www.leboncoin.fr/(ventes_immobilieres|locations|colocations)/(.*).htm.*',
        '\\2',
        default=None,
    )
    obj_title = CleanText('./@title|./section/p[@class="item_title"]')
    obj_cost = CleanDecimal(
        './section[@class="item_infos"]/*[@class="item_price"]',
        replace_dots=(',', '.'),
        default=Decimal(0),
    )
    # Currency: last of the symbols below found in the price text, with the
    # euro sign as default.
    obj_currency = Regexp(
        CleanText('./section[@class="item_infos"]/*[@class="item_price"]'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€',
    )
    obj_text = Join(' - ', './/p[@class="item_supp"]')

    def obj_date(self):
        """Return the entry date, or ``NotAvailable`` when it has no text.

        "Aujourd'hui" and "Hier" are replaced by today's and yesterday's
        date, the ``DATE_TRANSLATE_FR`` substitutions are applied, and the
        result is parsed with ``DateTime`` and a ``LinearDateGuesser``.
        """
        today = str(date.today())
        yesterday = str((date.today() - timedelta(1)))
        raw_date = CleanText(
            './section[@class="item_infos"]/aside/p[@class="item_supp"]/text()',
            replace=[('Aujourd\'hui', today), ('Hier', yesterday)],
        )(self)

        if not raw_date:
            return NotAvailable

        for pattern, replacement in DATE_TRANSLATE_FR:
            raw_date = pattern.sub(replacement, raw_date)

        # The translated text goes through the environment so that it can be
        # fed to the DateTime filter.
        self.env['tmp'] = raw_date
        return DateTime(Env('tmp'), LinearDateGuesser())(self)

    def obj_photos(self):
        """Return a list holding the entry's image, or an empty list."""
        src = Attr('./div[@class="item_image"]/span/span/img',
                   'src',
                   default=None)(self)
        return [HousingPhoto(src)] if src else []
class get_recipe(ItemElement):
    """Build a ``Recipe`` from a recipe page."""

    klass = Recipe

    obj_id = Env('_id')
    obj_title = CleanText('//h1')

    class obj_picture(ItemElement):
        """Build the recipe's ``BaseImage`` from the ``shareimg`` image."""

        klass = BaseImage

        # "http:" is prepended to the image's non-empty src attribute.
        obj_url = Format(
            'http:%s',
            CleanText('//img[@id="shareimg" and @src!=""]/@src',
                      default=None),
        )
        obj_thumbnail = Eval(Thumbnail, obj_url)

        def validate(self, obj):
            """Reject the image when its URL is only the ``http:`` prefix."""
            return obj.url != 'http:'

    def obj_preparation_time(self):
        """Return the preparation time in whole minutes."""
        label = CleanText(
            '//span[@id="ContentPlaceHolder_LblRecetteTempsPrepa"]')
        duration = CuisineazDuration(label)(self)
        return int(duration.total_seconds() / 60)

    def obj_cooking_time(self):
        """Return the cooking time in whole minutes."""
        label = CleanText(
            '//span[@id="ContentPlaceHolder_LblRecetteTempsCuisson"]')
        duration = CuisineazDuration(label)(self)
        return int(duration.total_seconds() / 60)

    def obj_nb_person(self):
        """Return ``[text]`` of the servings label, else ``NotAvailable``."""
        servings = CleanText(
            '//span[@id="ContentPlaceHolder_LblRecetteNombre"]')(self)
        if not servings:
            return NotAvailable
        return [servings]

    def obj_ingredients(self):
        """Return the cleaned text of every ``recipe_ingredients`` item."""
        nodes = XPath('//section[has-class("recipe_ingredients")]/ul/li')(self)
        return [CleanText('.')(node) for node in nodes]

    obj_instructions = Join(
        '\n\n - ',
        '//div[@id="preparation"]/span/p/text()',
        addBefore=' - ',
    )
class item(ItemElement):
    """Build an ``ArteSiteVideo`` from one article entry."""

    klass = ArteSiteVideo

    def condition(self):
        """Keep entries with exactly one ``article-secondary `` block."""
        return len(XPath('.//div[@class="article-secondary "]')(self)) == 1

    obj__site = SITE.CINEMA.get('id')

    # Identifier: "<site>.<path>", where <path> is the entry link without
    # its optional "http://….arte.tv" prefix and leading slash.
    # The pattern is a raw string so that ``\.`` is not an invalid string
    # escape (a SyntaxWarning on recent Python versions).
    obj_id = Format(
        '%s.%s',
        Field('_site'),
        Regexp(CleanText('./div/div/a/@href|./div/a/@href'),
               r'(http://.*\.arte\.tv)?/(.*)',
               '\\2'),
    )
    obj_title = Join(u' - ', './/div[@class="article-secondary "]/div/div')

    def obj_thumbnail(self):
        """Return a ``Thumbnail`` whose id and url are the image source."""
        src = CleanText(
            './/div[@class="article-primary "]/div[has-class("field-thumbnail")]/span/noscript/img/@src'
        )(self)
        image = Thumbnail(src)
        image.url = image.id
        return image
class get_recipe(ItemElement):
    """Build a ``Recipe`` from a recipe page (``ficheRecette`` layout)."""

    klass = Recipe

    obj_id = Env('_id')
    obj_title = CleanText('//div[@id="ficheRecette"]/h1')

    # Picture and thumbnail share the same non-empty "shareimg" source.
    obj_picture_url = CleanText(
        '//img[@id="shareimg" and @src!=""]/@src', default=None)
    obj_thumbnail_url = CleanText(
        '//img[@id="shareimg" and @src!=""]/@src', default=None)

    def obj_preparation_time(self):
        """Return the preparation time in whole minutes."""
        label = CleanText(
            '//span[@id="ctl00_ContentPlaceHolder_LblRecetteTempsPrepa"]')
        duration = CuisineazDuration(label)(self)
        return int(duration.total_seconds() / 60)

    def obj_cooking_time(self):
        """Return the cooking time in whole minutes."""
        label = CleanText(
            '//span[@id="ctl00_ContentPlaceHolder_LblRecetteTempsCuisson"]')
        duration = CuisineazDuration(label)(self)
        return int(duration.total_seconds() / 60)

    def obj_nb_person(self):
        """Return ``[text]`` of the servings label, else ``NotAvailable``."""
        servings = CleanText(
            '//span[@id="ctl00_ContentPlaceHolder_LblRecetteNombre"]')(self)
        if not servings:
            return NotAvailable
        return [servings]

    def obj_ingredients(self):
        """Return the cleaned text of every item of the ingredients list."""
        nodes = XPath('//div[@id="ingredients"]/ul/li')(self)
        return [CleanText('.')(node) for node in nodes]

    obj_instructions = Join(
        '\n\n - ',
        '//div[@id="preparation"]/span/p/text()',
        addBefore=' - ',
    )
def obj_instructions(self):
    """Return the recipe directions as a `` * `` bullet list.

    The ``displayValue`` of every entry under ``directions`` is collected,
    then the values are joined with ``'\\n * '``, prefixed with ``' * '``
    and followed by a newline.
    """
    steps = []
    for direction in Dict('directions')(self):
        steps.append(Dict('displayValue')(direction))

    bullet_list = Join('\n * ', steps, addBefore=' * ', addAfter='\n')
    return bullet_list(self)
class item(ItemElement):
    """Build a ``Housing`` from one listing entry."""

    klass = Housing

    def validate(self, obj):
        """Accept the object only when an id could be extracted."""
        return obj.id is not None

    obj_url = Format(u'http:%s', Link('.'))
    # Identifier: the path segment between the category and ".htm" in the
    # entry link; None when the link does not match.
    obj_id = Regexp(
        Link('.'),
        '//www.leboncoin.fr/(ventes_immobilieres|locations|colocations)/(.*).htm.*',
        '\\2',
        default=None,
    )
    obj_type = Env('query_type')

    def obj_advert_type(self):
        """Return PROFESSIONAL when an ``ispro`` span is found, else PERSONAL."""
        ispro = XPath('.//span[has-class("ispro")]', default=None)(self)
        return ADVERT_TYPES.PROFESSIONAL if ispro else ADVERT_TYPES.PERSONAL

    obj_house_type = NotAvailable
    obj_title = CleanText('./@title|./section/p[@class="item_title"]')
    obj_cost = CleanDecimal(
        './section[@class="item_infos"]/*[@class="item_price"]/text()',
        replace_dots=(',', '.'),
        default=Decimal(0),
    )
    obj_location = CleanText(
        './section[@class="item_infos"]/*[@itemtype="http://schema.org/Place"]/text()'
    )
    obj_currency = Currency(
        './section[@class="item_infos"]/*[@class="item_price"]')

    def obj_utilities(self):
        """Classify the utilities from the text following the price.

        The text after "<digits> <currency symbol>" is inspected: "C.C."
        maps to ``UTILITIES.INCLUDED``, "H.C." to ``UTILITIES.EXCLUDED`` and
        anything else (including no match) to ``UTILITIES.UNKNOWN``.
        """
        # Raw string: ``\d`` must reach the regex engine untouched instead of
        # being an invalid string escape.
        suffix = Regexp(
            CleanText('./section[@class="item_infos"]/*[@class="item_price"]'),
            r'\d+ [%s%s%s](.*)' % (u'€', u'$', u'£'),
            default=u'',
        )(self)

        if "C.C." in suffix:
            return UTILITIES.INCLUDED
        if "H.C." in suffix:
            return UTILITIES.EXCLUDED
        return UTILITIES.UNKNOWN

    obj_text = Join(' - ', './/p[@class="item_supp"]')

    def obj_date(self):
        """Return the entry date, or ``NotAvailable`` when it has no text.

        "Aujourd'hui" and "Hier" are replaced by today's and yesterday's
        date, the ``DATE_TRANSLATE_FR`` substitutions are applied, and the
        result is parsed with ``DateTime`` and a ``LinearDateGuesser``.
        """
        today = str(date.today())
        yesterday = str((date.today() - timedelta(1)))
        raw_date = CleanText(
            './section[@class="item_infos"]/aside/p[@class="item_supp"]/text()',
            replace=[('Aujourd\'hui', today), ('Hier', yesterday)],
        )(self)

        if not raw_date:
            return NotAvailable

        for pattern, replacement in DATE_TRANSLATE_FR:
            raw_date = pattern.sub(replacement, raw_date)

        # The translated text goes through the environment so that it can be
        # fed to the DateTime filter.
        self.env['tmp'] = raw_date
        return DateTime(Env('tmp'), LinearDateGuesser())(self)

    def obj_photos(self):
        """Return a list holding the entry's photo, or an empty list.

        The URL comes from the ``data-imgsrc`` attribute of the ``lazyload``
        span, with "ad-thumb" replaced by "ad-image".
        """
        src = Attr(
            './div[@class="item_image"]/span/span[@class="lazyload"]',
            'data-imgsrc',
            default=None,
        )(self)
        if not src:
            return []
        return [HousingPhoto(src.replace("ad-thumb", "ad-image"))]