class item(ItemElement):
    """Item element that fills one ``Bill`` object.

    Most fields are read with ``Dict`` lookups on the current element; the
    price and the account number are prepared in :meth:`parse` and exposed
    through the environment.
    """

    klass = Bill

    obj_id = Format('%s_%s', Env('subid'), Dict('documentNumber'))
    # 'creationDate' is divided by 1000 before being given to
    # datetime.fromtimestamp (presumably a millisecond epoch -- TODO confirm),
    # then rendered as YYYY-MM-DD so that Date can parse it.
    obj_date = Date(
        Eval(
            lambda stamp: datetime.fromtimestamp(
                int(stamp) / 1000).strftime('%Y-%m-%d'),
            Dict('creationDate'),
        )
    )
    obj_format = u"pdf"
    obj_label = Format('Facture %s', Dict('documentNumber'))
    obj_type = u"bill"
    obj_price = Env('price')
    obj_currency = u"€"
    obj_vat = NotAvailable

    # Private attributes kept on the object.
    obj__doc_number = Dict('documentNumber')
    obj__par_number = Dict('parNumber')
    obj__num_acc = Env('numAcc')
    obj__bp = Env('bpNumber')

    def parse(self, el):
        """Populate the ``price`` and ``numAcc`` environment entries."""
        amount = Dict('billAmount')(self)
        self.env['price'] = Decimal(amount)

        # Round-trip through int(), then back to a string.
        subscription_id = Env('subid')(self)
        self.env['numAcc'] = str(int(subscription_id))
class Item(ItemElement):
    """Item element that fills one ``BaseJobAdvert`` from a result node."""

    klass = BaseJobAdvert

    # The id joins three parts with '#': the node's ``id`` attribute minus
    # its first two characters, the link title and the company name.
    # Spaces and slashes are then replaced by dashes.
    obj_id = CleanText(
        Format(
            '%s#%s#%s',
            Regexp(Attr('.', 'id'), '^..(.*)'),
            Attr('h2/a', 'title'),
            CleanText('span[@class="company"]'),
        ),
        replace=[(" ", "-"), ("/", "-")],
    )
    obj_title = Attr('h2/a', 'title')
    obj_society_name = CleanText('span[@class="company"]')
    obj_place = CleanText('span/span[@class="location"]')
    obj_publication_date = IndeedDate(
        CleanText('table/tr/td/span[@class="date"]')
    )
class item(ItemElement):
    """Item element that fills one ``Album`` from a listing node."""

    klass = Album

    obj_url = AbsoluteLink('./a')
    obj__thumbnail_url = Attr('./a/div[@class="art"]/img', 'src')
    obj_title = CleanText('./a/p[@class="title"]', children=False)
    # The id combines the band taken from the environment with the slug
    # captured after "/album/" in the album URL.
    obj_id = Format(
        'album.%s.%s',
        Env('band'),
        Regexp(Field('url'), r'/album/([-\w]+)'),
    )

    def obj_author(self):
        """Return the artist override if it is non-empty, else the page artist."""
        override = CleanText(
            './a/p[@class="title"]/span[@class="artist-override"]')(self)
        if override:
            return override
        return self.page.get_artist()
class get_album(ItemElement):
    """Item element that fills an ``Album`` from an album page."""

    klass = Album

    obj_id = Format('album.%s.%s', Env('band'), Env('album'))
    obj_title = CleanText('//h2[@class="trackTitle"]')
    obj_author = CleanText('//span[@itemprop="byArtist"]')

    # Helper filter (not an ``obj_`` attribute): parses the publication date
    # from the ``datePublished`` meta tag. Only used by obj_year.
    _date = Date(Attr('//meta[@itemprop="datePublished"]', 'content'))

    def obj_year(self):
        """Return the year of the parsed publication date."""
        published = self._date(self)
        return published.year

    def obj_url(self):
        """Return the URL of the page being parsed."""
        return self.page.url
class account(ItemElement):
    """Item element that fills one ``Account`` from a clickable node.

    Only elements whose ``onclick`` attribute points to the
    ``/outil/UWLM/ListeMouvement`` page are kept. The agency, account number
    and nature code are all extracted from that link.
    """

    klass = Account

    def condition(self):
        """Keep only elements whose ``onclick`` targets the movements list.

        An element without an ``onclick`` attribute is skipped instead of
        raising ``KeyError``.
        """
        onclick = self.el.attrib.get('onclick', '')
        return '/outil/UWLM/ListeMouvement' in onclick

    # Maps the ``nature`` code found in the link to an account type.
    NATURE2TYPE = {
        '001': Account.TYPE_SAVINGS,
        '005': Account.TYPE_CHECKING,
        '006': Account.TYPE_CHECKING,
        '007': Account.TYPE_SAVINGS,
        '012': Account.TYPE_SAVINGS,
        '023': Account.TYPE_CHECKING,
        '046': Account.TYPE_SAVINGS,
        '047': Account.TYPE_SAVINGS,
        '049': Account.TYPE_SAVINGS,
        '068': Account.TYPE_PEA,
        '069': Account.TYPE_SAVINGS,
    }

    # Text found between single quotes in ``onclick``, with "&mode=190"
    # appended.
    obj__link_id = Format('%s&mode=190',
                          Regexp(CleanText('./@onclick'), "'(.*)'"))
    obj__agence = Regexp(Field('_link_id'), r'.*agence=(\w+)')
    obj__compte = Regexp(Field('_link_id'), r'compte=(\w+)')
    obj_id = Format('%s%s', Field('_agence'), Field('_compte'))
    obj__transfer_id = Format('%s0000%s', Field('_agence'), Field('_compte'))
    obj__coming_links = []
    obj_label = CleanText('.//div[@class="libelleCompte"]')
    obj_balance = MyDecimal('.//td[has-class("right")]', replace_dots=True)
    obj_currency = FrenchTransaction.Currency(
        './/td[has-class("right")]')
    # Codes missing from NATURE2TYPE fall back to TYPE_UNKNOWN.
    obj_type = Map(Regexp(Field('_link_id'), r'.*nature=(\w+)'),
                   NATURE2TYPE, default=Account.TYPE_UNKNOWN)
    obj__market_link = None
class item(ItemElement):
    """Item element that fills one ``Document`` (PDF statement) from a row."""

    klass = Document

    # Subscription id, then the two digits captured from the link title,
    # then the date span text with '/' removed.
    obj_id = Format(
        '%s_%s%s',
        Env('sub_id'),
        Regexp(CleanText('.//a/@title'), r' (\d{2}) '),
        CleanText('.//span[contains(@class, "date")]', symbols='/'),
    )
    obj_label = Format(
        '%s - %s',
        CleanText('.//span[contains(@class, "lib")]'),
        CleanText('.//span[contains(@class, "date")]'),
    )
    obj_url = Format(
        '/voscomptes/canalXHTML/relevePdf/relevePdf_historique/%s',
        Link('./a'),
    )
    obj_format = 'pdf'
    obj_type = DocumentTypes.OTHER

    def obj_date(self):
        """Return the document date parsed day-first.

        When the date span already holds a full ``dd/mm/yyyy`` value it is
        parsed directly. Otherwise the two digits captured from the link
        title are prefixed to the span text before parsing.
        """
        span_xpath = './/span[contains(@class, "date")]'
        span_text = CleanText(span_xpath)(self)

        if re.search(r'(\d{2}/\d{2}/\d{4})', span_text):
            return Date(CleanText(span_xpath), dayfirst=True)(self)

        prefixed = Format(
            '%s/%s',
            Regexp(CleanText('.//a/@title'), r' (\d{2}) '),
            CleanText(span_xpath),
        )
        return Date(prefixed, dayfirst=True)(self)
class item(ItemElement):
    """Item element that fills one ``Transaction`` from a history row."""

    klass = Transaction

    # Label is "<label type> du <date>", built from two other fields.
    obj_label = Format('%s du %s', Field('_labeltype'), Field('date'))
    obj_type = Transaction.TYPE_BANK
    obj_date = Date(
        CleanText(u'./div[@data-label="Date d\'effet"]', children=False),
        dayfirst=True)
    obj_amount = CleanDecimal(u'./div[@data-label="Montant en €"]',
                              replace_dots=True)
    # Word following "Historique Des" in the closest preceding
    # <h2 class="feature">. The pattern is a raw string so that \s and \w
    # reach the regex engine without relying on Python's handling of
    # invalid string escapes.
    obj__labeltype = Regexp(
        Capitalize('./preceding::h2[@class="feature"][1]'),
        r'Historique Des\s+(\w+)')
def get_profile(self):
    """Build and return a ``Person`` from the profile blocks of ``self.doc``.

    Name, address, email and job are each read with an XPath under the
    ``persoIdentiteDetail``, ``persoAdresseDetail`` and ``persoEmailDetail``
    containers.
    """
    doc = self.doc
    identity = '//div[@id="persoIdentiteDetail"]'

    profile = Person()
    # The name is the third <dd> followed by the second one.
    full_name = Format(
        '%s %s',
        CleanText(identity + '//dd[3]'),
        CleanText(identity + '//dd[2]'),
    )
    profile.name = full_name(doc)
    profile.address = CleanText('//div[@id="persoAdresseDetail"]//dd')(doc)
    profile.email = CleanText('//div[@id="persoEmailDetail"]//td[2]')(doc)
    profile.job = CleanText(identity + '//dd[4]')(doc)
    return profile
class item(ItemElement):
    """Item element that fills one card ``Account`` from a table row."""

    klass = Account

    # TableCell('service_number') alone is not enough because a person with
    # the same service_number might have multiple cards.
    # And a card number can be associated to multiple persons.
    obj_id = obj_number = Format(
        '%s_%s',
        CleanText(TableCell('service_number')),
        CleanText(TableCell('card_number')),
    )

    obj_label = CleanText(TableCell('label'))
    obj_currency = 'EUR'
    obj_type = Account.TYPE_CARD
class get_profile(ItemElement):
    """Item element that fills a ``Person`` from a profile form.

    Values are read with the ``MyInput`` / ``MySelect`` helpers, keyed by
    field name.
    """

    klass = Person

    # Identity
    obj_name = Format(
        '%s %s %s',
        MySelect('genderTitle'),
        MyInput('firstName'),
        MyInput('lastName'),
    )
    obj_nationality = CleanText(
        u'//span[contains(text(), "Nationalité")]/span')

    # Family
    obj_spouse_name = MyInput('spouseFirstName')
    obj_children = CleanDecimal(MyInput('dependentChildren'),
                                default=NotAvailable)
    obj_family_situation = MySelect('maritalStatus')
    obj_matrimonial = MySelect('matrimonial')
    obj_housing_status = MySelect('housingSituation')

    # Employment
    obj_job = MyInput('occupation')
    obj_job_start_date = Date(MyInput('employeeSince'), default=NotAvailable)
    obj_company_name = MyInput('employer')
    obj_socioprofessional_category = MySelect('socioProfessionalCategory')
class item(ItemElement):
    """Item element that fills one ``Document`` from a clickable node.

    Both private references are captured from assignments found in the
    node's ``onclick`` attribute. Each one is ``None`` when the pattern does
    not match.
    """

    klass = Document

    # Value assigned to the 'refdoc' field in the onclick script.
    obj__refdoc = Regexp(
        Attr(".", "onclick", default=""),
        r"\('refdoc'\)\.value='([^\']+)'",
        default=None,
    )
    # Value assigned to the 'norng' field in the onclick script.
    obj__norng = Regexp(
        Attr(".", "onclick", default=""),
        r"\('norng'\)\.value='([^\']+)'",
        default=None,
    )

    obj_id = Format(
        "%s_%s",
        Env("subscription_id"),
        Field("_refdoc"),
    )
class SeLogerItem(ItemElement):
    """Item element that fills one ``Housing`` from an advert node."""

    klass = Housing

    obj_id = CleanText('idAnnonce')

    def obj_type(self):
        """Return the post type whose ``TYPES`` value equals the transaction id.

        ``StopIteration`` propagates when no entry of ``TYPES`` matches.
        """
        transaction_id = int(CleanText('idTypeTransaction')(self))
        post_type = next(
            key for key, value in TYPES.items() if value == transaction_id
        )
        if post_type == POSTS_TYPES.FURNISHED_RENT:
            # SeLoger does not let us discriminate between furnished and not
            # furnished.
            return POSTS_TYPES.RENT
        return post_type

    def obj_house_type(self):
        """Return the ``RET`` key matching ``idTypeBien``, else ``NotAvailable``."""
        kind_id = CleanText('idTypeBien')(self)
        matches = (key for key, value in RET.items() if value == kind_id)
        return next(matches, NotAvailable)

    obj_title = Format(
        "%s %s%s - %s",
        CleanText('titre'),
        CleanText('surface'),
        CleanText('surfaceUnite'),
        CleanText('ville'),
    )
    obj_date = DateTime(CleanText('dtFraicheur'))
    obj_cost = CleanDecimal('prix')
    obj_currency = Currency('prixUnite')
    obj_area = CleanDecimal('surface', default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_text = CleanText('descriptif')
    obj_rooms = CleanDecimal('nbPiece|nbPieces', default=NotAvailable)
    obj_bedrooms = CleanDecimal('nbChambre|nbChambres', default=NotAvailable)

    def obj_location(self):
        """Return ``"<address> <city> (<postcode>)"``.

        The ``quartier`` value replaces the address when the address is
        empty and a ``quartier`` value was found.
        """
        address = CleanText('adresse', default="")(self)
        district = CleanText('quartier', default=None)(self)
        if district is not None and not address:
            address = district

        city = CleanText('ville')(self)
        postcode = CleanText('cp')(self)
        return u'%s %s (%s)' % (address, city, postcode)

    obj_station = CleanText('proximite', default=NotAvailable)
    obj_url = CleanText('permaLien')
def obj_date(self):
    """Return the document date parsed day-first.

    When the date span already holds a full ``dd/mm/yyyy`` value it is
    parsed directly. Otherwise the two digits captured from the link title
    are prefixed to the span text before parsing.
    """
    span_xpath = './/span[contains(@class, "date")]'
    span_text = CleanText(span_xpath)(self)

    if re.search(r'(\d{2}/\d{2}/\d{4})', span_text):
        return Date(CleanText(span_xpath), dayfirst=True)(self)

    prefixed = Format(
        '%s/%s',
        Regexp(CleanText('.//a/@title'), r' (\d{2}) '),
        CleanText(span_xpath),
    )
    return Date(prefixed, dayfirst=True)(self)
class item(ItemElement):
    """Item element that fills one card ``Account`` from a detail form.

    Each value is the ``value`` attribute of the ``<input>`` following the
    block whose ``<span>`` contains the matching caption.
    """

    klass = Account

    # Same "<service id>_<card number>" value for both id and number.
    obj_id = obj_number = Format(
        '%s_%s',
        Attr(
            '//div[span[contains(text(), "Identifiant prestation")]]/following-sibling::input',
            'value',
        ),
        Attr(
            '//div[span[contains(text(), "Numéro de la carte")]]/following-sibling::input',
            'value',
        ),
    )
    obj_label = CleanText(
        '//div[@class="v-slot"]/div[contains(@class, "v-label-undef-w")]')
    # IBAN with its spaces removed.
    obj_iban = CleanText(
        Attr('//div[span[contains(text(), "IBAN")]]/following-sibling::input',
             'value'),
        replace=[(' ', '')],
    )
    obj_balance = 0
    obj_type = Account.TYPE_CARD
class item(ItemElement):
    """Item element that fills one ``SensCritiquenCalendarEvent``.

    ``condition`` filters elements by id when ``_id`` is set in the
    environment; ``validate`` filters built objects by date range.
    """

    klass = SensCritiquenCalendarEvent

    def condition(self):
        """Return whether this element should be parsed.

        When the environment holds a truthy ``_id``, the element is kept
        only if the id computed from it equals that value. Otherwise every
        element is kept.
        """
        if '_id' in self.env and self.env['_id']:
            return Format(u'%s#%s#%s', Regexp(Link('.'), '/film/(.*)'), FormatDate("%Y%m%d%H%M", Date('div[@class="elgr-guide-details"]/div[@class="elgr-data-diffusion"]')), CleanText('./div/span[@class="d-offset"]', replace=[(' ', '-')]))(self) == self.env['_id']
        return True

    def validate(self, obj):
        """Return whether ``obj`` passes the date filter of the environment.

        NOTE(review): the nesting below is reconstructed; the ``elif`` is
        assumed to pair with ``if empty(obj.end_date)`` -- confirm against
        the upstream file.
        """
        # Only objects starting strictly after ``date_from`` are considered.
        if 'date_from' in self.env and self.env['date_from'] and obj.start_date > self.env['date_from']:
            # No upper bound: accept.
            if not self.env['date_to']:
                return True
            else:
                # Without an end date, compare the start date to the bound.
                if empty(obj.end_date):
                    if obj.start_date < self.env['date_to']:
                        return True
                elif obj.end_date <= self.env['date_to']:
                    return True
        # Whenever the ``_id`` key is present, the object is accepted.
        if '_id' in self.env:
            return True
        return False

    # Id: film slug, formatted broadcast date and the "d-offset" span text
    # with spaces replaced by dashes, joined by '#'.
    obj_id = Format(u'%s#%s#%s', Regexp(Link('.'), '/film/(.*)'), FormatDate("%Y%m%d%H%M", Date('div/div[@class="elgr-data-diffusion"]')), CleanText('./div/span[@class="d-offset"]', replace=[(' ', '-')]))
    obj_start_date = Date('div/div[@class="elgr-data-diffusion"]')
    # Summary: image alt text after "Affiche", then the "d-offset" span text.
    obj_summary = Format('%s - %s', Regexp(CleanText('./div/img/@alt'), '^Affiche(.*)'), CleanText('./div/span[@class="d-offset"]'))
class get_video(ItemElement):
    """Item element that fills a ``RmllVideo`` from a video page.

    Most fields are read from ``<meta>`` tags in the document head.
    """

    klass = RmllVideo

    # Part of the og:url value located between "/permalink/" and the final
    # slash.
    obj_id = (
        CleanHTML('/html/head/meta[@property="og:url"]/@content')
        & CleanText()
        & Regexp(pattern=r'.*/permalink/(.+)/$')
    )
    obj_title = Format(
        u'%s',
        CleanHTML('/html/head/meta[@name="DC.title"]/@content') & CleanText(),
    )
    obj_description = Format(
        u'%s',
        CleanHTML('/html/head/meta[@property="og:description"]/@content')
        & CleanText(),
    )

    def obj_thumbnail(self):
        """Return a ``Thumbnail`` for the og:image URL, or ``None`` if empty."""
        image_url = NormalizeThumbnail(
            CleanText('/html/head/meta[@property="og:image"]/@content'))(self)
        if not image_url:
            return None
        thumb = Thumbnail(image_url)
        thumb.url = thumb.id
        return thumb

    # The duration is captured from the "media_duration:" entry of an inline
    # script and falls back to NotAvailable.
    obj_duration = (
        CleanText('/html/head/script[not(@src)]')
        & Regexp(pattern=r'media_duration: ([^,.]+),?.*,', default='')
        & Duration(default=NotAvailable)
    )

    def obj_url(self):
        """Return the first sharing link ending in ``mp4`` or ``webm``.

        Returns ``None`` when no sharing link has one of these extensions.
        """
        share_links = XPath(
            '//div[@id="tab_sharing_content"]/div/div/div[@class="paragraph"]/div[@class="share"]/a[@target="_blank"]/@href'
        )(self)
        for href in share_links:
            extension = str(href).split('.')[-1]
            self.logger.debug("Link:%s Ext:%s", href, extension)
            if extension in ('mp4', 'webm'):
                return unicode(href)
        return None
class item(ItemElement):
    """Item element that fills one ``Bill`` for the current subscription."""

    klass = Bill

    def condition(self):
        """Keep only nodes whose ``data-fact_ligne`` equals the ``subid`` env value."""
        num = Attr('.', 'data-fact_ligne', default='')(self)
        return self.env['subid'] == num

    obj_url = Attr('.//div[@class="pdf"]/a', 'href')
    # Both values below are captured from the query string of the PDF link.
    obj__localid = Regexp(Field('url'), '&id=(.*)&date', u'\\1')
    # Raw string so that \d reaches the regex engine without relying on
    # Python's handling of invalid string escapes.
    obj_label = Regexp(Field('url'), r'&date=(\d*)', u'\\1')
    obj_id = Format('%s.%s', Env('subid'), Field('_localid'))
    obj_date = FormatDate(Field('label'))
    obj_format = u"pdf"
    obj_type = u"bill"
    obj_price = CleanDecimal('div[@class="montant"]', default=Decimal(0),
                             replace_dots=False)
class get_unique_card(ItemElement):
    """Item element that fills a single card ``Account`` from a summary table."""

    item_xpath = '//table[@class="ca-table"][@summary]'

    klass = Account

    # Transform 'n° 4999 78xx xxxx xx72' into '499978xxxxxxxx72'
    obj_number = CleanText(
        '//table[@class="ca-table"][@summary]//tr[@class="ligne-impaire"]/td[@class="cel-texte"][1]',
        replace=[(' ', ''), ('n°', '')],
    )

    # Card ID is formatted as '499978xxxxxxxx72MrFirstnameLastname-'
    obj_id = Format(
        '%s%s',
        Field('number'),
        CleanText(
            '//table[@class="ca-table"][@summary]//caption[@class="caption"]//b',
            replace=[(' ', '')],
        ),
    )

    # Card label is formatted as 'Carte VISA Premier - Mr M Lastname'
    obj_label = Format(
        '%s - %s',
        CleanText(
            '//table[@class="ca-table"][@summary]//tr[@class="ligne-impaire ligne-bleu"]/th[@id="compte-1"]'
        ),
        CleanText(
            '//table[@class="ca-table"][@summary]//caption[@class="caption"]//b'
        ),
    )

    obj_balance = CleanDecimal(0)
    obj_coming = CleanDecimal.French(
        '//table[@class="ca-table"][@summary]//tr[@class="ligne-paire"]//td[@class="cel-num"]',
        default=0,
    )
    # Currency is the text following "Montant en" in the column header.
    obj_currency = Currency(
        Regexp(
            CleanText('//th[contains(text(), "Montant en")]'),
            r'^Montant en (.*)',
        )
    )
    obj_type = Account.TYPE_CARD
    obj__form = None
class item(ItemElement):
    """Item element that fills one ``Detail`` from a table row."""

    klass = Detail

    def condition(self):
        """Skip rows whose first cell has no text or reads ``"Date"``."""
        first_cell_text = self.el.xpath('td[1]')[0].text
        if first_cell_text is None:
            return False
        return first_cell_text != "Date"

    obj_id = None
    # The 'à' symbol is stripped from the first cell before parsing.
    obj_datetime = DateTime(CleanText('td[1]', symbols=u'à'), dayfirst=True)
    obj_label = Format(
        u'%s %s %s',
        CleanText('td[2]'),
        CleanText('td[3]'),
        CleanText('td[4]'),
    )
    obj_price = CleanDecimal('td[5]', default=Decimal(0), replace_dots=True)
class item(ItemElement):
    """Item element that fills one ``Message`` from a dict entry."""

    klass = Message

    obj_id = Format(
        u'%s#%s',
        CleanText(Dict('origin/streamId')),
        CleanText(Dict('id')),
    )
    obj_sender = CleanText(Dict('author', default=u''))
    obj_title = Format(
        u'%s - %s',
        CleanText(Dict('origin/title', default=u'')),
        CleanText(Dict('title')),
    )

    def obj_date(self):
        """Return the ``published`` value divided by 1e3 as a datetime."""
        published = Dict('published')(self.el)
        return datetime.fromtimestamp(published / 1e3)

    def obj_content(self):
        """Return the cleaned body followed by the origin URL.

        The ``content`` entry is preferred over ``summary``. An empty string
        is returned when the element has neither key.
        """
        candidates = (
            ('content', 'content/content'),
            ('summary', 'summary/content'),
        )
        for key, body_path in candidates:
            if key in self.el:
                body = Format(
                    u'%s%s\r\n',
                    CleanHTML(Dict(body_path)),
                    CleanText(Dict('origin/htmlUrl')),
                )
                return body(self.el)
        return ''
class item(ItemElement):
    """Item element that fills one ``Bill`` from a table row."""

    klass = Bill

    obj_id = Format('%s_%s', Env('subid'), CleanText('./td[3]'))
    obj_url = Attr('./td[@class="center" or @class="center pdf"]/a', 'href')
    obj_date = Env('date')
    obj_format = u"pdf"
    obj_type = u"bill"
    obj_price = CleanDecimal('./td[@class="center montant"]/span',
                             replace_dots=True)

    def parse(self, el):
        """Store the date parsed from the second cell in the environment."""
        raw_date = el.xpath('./td[2]')[0].text
        self.env['date'] = parse_french_date(raw_date).date()

    def condition(self):
        """Keep rows with a non-empty last cell and exactly one PDF link."""
        last_cell = self.el.xpath('.//td')[-1]
        if CleanText().filter(last_cell) == "":
            return False
        hrefs = self.el.xpath(
            './td[@class="center" or @class="center pdf"]/a/@href')
        return len(hrefs) == 1
class item(ItemElement):
    """Item element that fills one ``Subscription`` from a dict entry.

    The label and the contract id are computed in :meth:`parse` and exposed
    through the environment.
    """

    klass = Subscription

    obj_id = CleanText(Dict('num_ligne'))
    obj__type = CleanText(Dict('type'))
    obj_label = Env('label')
    obj_subscriber = Format("%s %s %s",
                            CleanText(Dict('civilite')),
                            CleanText(Dict('prenom')),
                            CleanText(Dict('nom')))
    obj__contract = Env('contract')

    def parse(self, el):
        """Populate the ``label`` and ``contract`` environment entries.

        ``label`` is the line number with a space inserted every two
        characters. ``contract`` is the ``user_id`` captured from the page's
        ``data.tag`` value.
        """
        # add spaces
        # Slicing in steps of two keeps a trailing odd character, which the
        # former zip(it, it) pairing silently dropped.
        number = self.obj_id(el)
        self.env['label'] = ' '.join(
            number[i:i + 2] for i in range(0, len(number), 2))
        self.env['contract'] = re.search(
            '\\"user_id\\":\\"([0-9]+)\\"',
            self.page.get('data.tag')).group(1)
class item(ItemElement):
    """Item element that fills one market ``Account`` from a dict entry."""

    klass = Account

    obj__prestation_number = Dict('numeroPrestation')

    # Id and number both use the prestation number with its spaces removed;
    # the id adds a "_TITRE" suffix.
    obj_id = Format(
        '%s_TITRE',
        CleanText(Field('_prestation_number'), replace=[(' ', '')]),
    )
    obj_number = CleanText(
        Field('_prestation_number'),
        replace=[(' ', '')],
    )

    obj_label = Dict('intitule')
    # Amount and currency are both taken from the 'evaluation' value.
    obj_balance = CleanDecimal.French(Dict('evaluation'))
    obj_currency = CurrencyFilter(Dict('evaluation'))
    obj_type = Account.TYPE_MARKET
class get_current(ItemElement):
    """Item element that fills a ``Current`` weather object from a dict."""

    klass = Current

    obj_date = DateTime(Dict('vt1currentdatetime/dateTime'))
    obj_id = Env('city_id')
    # One-line summary built from the 'vt1observation' entries.
    obj_text = Format(
        '%shPa (%s) - humidity %s%% - feels like %s°C - %s',
        Dict('vt1observation/altimeter'),
        Dict('vt1observation/barometerTrend'),
        Dict('vt1observation/humidity'),
        Dict('vt1observation/feelsLike'),
        Dict('vt1observation/phrase'),
    )

    def obj_temp(self):
        """Return the observed temperature as a ``Temperature`` with unit 'C'."""
        observed = Dict('vt1observation/temperature')(self)
        return Temperature(float(observed), 'C')
class item(ItemElement):
    """Item element that fills one internal ``Recipient`` from a dict entry."""

    def condition(self):
        """Keep only entries that provide an ``accountNumber``."""
        return Dict('accountNumber', default=None)(self)

    klass = Recipient

    obj_id = Dict('accountNumber')
    obj_label = CleanText(
        Format('%s %s',
               Dict('accountHolderLongDesignation'),
               Dict('accountNatureShortLabel', default='')))
    obj_iban = Dict('ibanCode')
    obj_category = 'Interne'

    def obj_enabled_at(self):
        """Return today's date, computed when the item is parsed.

        A class-level ``date.today()`` would be evaluated once, when the
        class is defined, and go stale in a long-running process.
        """
        return date.today()

    obj__is_recipient = Dict('recipientOfTransfert', default=False)
    obj__owner_name = CleanText(Dict('accountHolderLongDesignation'))
class item(ItemElement):
    """Item element that fills one ``Bill`` from an invoice node."""

    klass = Bill

    obj__ref = CleanText('//input[@id="noref"]/@value')
    obj_id = Format('%s_%s', Env('subid'), CleanText('./@facture-id'))
    # The download URL takes the invoice id and the line reference, both
    # read from attributes of the node.
    obj_url = Format(
        'http://www.bouyguestelecom.fr/parcours/facture/download/index?id=%s&no_reference=%s',
        CleanText('./@facture-id'),
        CleanText('./@facture-ligne'),
    )
    obj_date = Env('date')
    obj_format = u"pdf"
    obj_label = CleanText('./text()')
    obj_type = u"bill"
    # ' € ' is replaced by '.' in the span text before decimal parsing.
    obj_price = CleanDecimal(
        CleanText('./span', replace=[(u' € ', '.')]))
    obj_currency = u"€"

    def parse(self, el):
        """Store the bill date in the environment.

        The node text is prefixed with ``'01 '`` and parsed as a French date.
        """
        period = CleanText('./text()')(self)
        self.env['date'] = parse_french_date('01 %s' % period).date()

    def condition(self):
        """Skip a node whose next sibling ``div`` has the same invoice id."""
        # XXX ugly fix to avoid duplicate bills
        own_id = CleanText('./@facture-id')(self.el)
        next_id = CleanText(
            './following-sibling::div[1]/@facture-id')(self.el)
        return own_id != next_id
class item(ItemElement):
    """Item element that fills one ``Bill`` from a dict entry."""

    klass = Bill

    obj_date = Date(Dict('date'), default=NotAvailable)
    # The 'amount' value is divided by 100 after decimal parsing.
    obj_price = Eval(
        lambda amount: amount / 100,
        CleanDecimal(Dict('amount')),
    )
    obj_format = 'pdf'

    def obj_label(self):
        """Return ``"Facture du <date>"`` using the bill date field."""
        bill_date = Field('date')(self)
        return 'Facture du %s' % bill_date

    def obj_id(self):
        """Return ``"<subid>_<ddmmYYYY>"`` built from the env and the date."""
        subscription_id = Env('subid')(self)
        stamp = Field('date')(self).strftime('%d%m%Y')
        return '%s_%s' % (subscription_id, stamp)

    obj_url = Format('%s%s', BrowserURL('doc_api_par'), Dict('hrefPdf'))
    obj__is_v2 = True
def obj_details(self):
    """Return a dict of extra details scraped from the page.

    One entry is added per footer list item that has both a label and a
    value. A last entry comes from the "classe-energie-content" block when
    both its label and value are non-empty.
    """
    details = {}

    for entry in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
        label = CleanText('./span[@class="label"]')(entry)
        # The value is the whole item text with the label removed.
        text = CleanText('.', replace=[(label, '')])(entry)
        if text and label:
            details[label] = text

    energy_label = CleanText(
        '//div[@class="classe-energie-content"]/div/div/span')(self)
    energy_value = Format(
        '%s(%s)',
        CleanText('//div[@class="classe-energie-content"]/div/div/p'),
        CleanText('//div[@class="classe-energie-content"]/div/@class',
                  replace=[('-', ' ')]),
    )(self)
    if energy_value and energy_label:
        details[energy_label] = energy_value

    return details
class get_job_advert(ItemElement):
    """Item element that fills a ``BaseJobAdvert`` from an advert page."""

    klass = BaseJobAdvert

    obj_id = Format('d#%s', Env('_id'))
    obj_url = BrowserURL('advert2', _id=Env('_id'))
    obj_title = CleanText('//h3')
    obj_description = CleanHTML(
        '//div[@id="jobBodyContent"]|//div[@itemprop="description"]'
    )
    obj_contract_type = CleanHTML('//div[@class="jobview-section"]')

    # Company name and place are both captured from the same heading, on
    # either side of its " - " separator.
    obj_society_name = Regexp(
        CleanText('//h4[@class="company"]'),
        '.* : (.*) - .*',
    )
    obj_place = Regexp(
        CleanText('//h4[@class="company"]'),
        '.* - (.*)',
    )

    # The date is the text following " : " in the "postedDate" span.
    obj_publication_date = Date(
        Regexp(CleanText('//span[@class="postedDate"]'), '.* : (.*)'),
        dayfirst=True,
    )
class item(ItemElement):
    """Item element that fills one ``Subscription`` from a dict entry.

    The label is computed in :meth:`parse` and exposed through the
    environment.
    """

    klass = Subscription

    obj_id = CleanText(Dict('num_ligne'))
    obj__type = CleanText(Dict('type'))
    obj_label = Env('label')
    obj_subscriber = Format("%s %s %s",
                            CleanText(Dict('civilite')),
                            CleanText(Dict('prenom')),
                            CleanText(Dict('nom')))

    def parse(self, el):
        """Populate the ``label`` environment entry.

        ``label`` is the line number with a space inserted every two
        characters.
        """
        # add spaces
        # Slicing in steps of two keeps a trailing odd character, which the
        # former zip(it, it) pairing silently dropped.
        number = self.obj_id(el)
        self.env['label'] = ' '.join(
            number[i:i + 2] for i in range(0, len(number), 2))
def obj_label(self):
    """Return the second cell's text with every " o " replaced by a space."""
    raw_label = Format('%s', CleanText('./td[2]'))(self)
    return raw_label.replace(" o ", " ")