def obj_url(self):
    """Build the consultation URL for the document held in ``self.el``.

    The endpoint, the fixed parameters and the set of values copied from
    the JSON element depend on whether the element has an ``ibanCrypte``
    key.  The result is ``baseurl`` (from the env) + endpoint + the
    url-encoded parameters.
    """
    # Request parameter name -> key to read in the JSON element.
    # These four are sent whichever endpoint is used.
    common_fields = {
        'idDocument': 'idDoc',
        'dateDocument': 'dateDoc',
        'idLocalisation': 'idLocalisation',
        'viDocDocument': 'viDocDocument',
    }

    if 'ibanCrypte' in self.el:
        # JSON carrying an ibanCrypte entry: the most common case.
        endpoint = 'demat-wspl/rest/consultationDocumentDemat?'
        request_params = {'typeFamille': 'R001', 'ikpiPersonne': ''}
        specific_fields = {
            'typeCpt': 'typeCompte',
            'familleDoc': 'famDoc',
            'ibanCrypte': 'ibanCrypte',
            'typeDoc': 'typeDoc',
            'consulted': 'consulted',
        }
    else:
        # JSON carrying an idcontrat entry instead: seen on
        # privee.mabanque, where the document URL is sometimes different.
        endpoint = 'demat-wspl/rest/consultationDocumentSpecialBpfDemat?'
        request_params = {'ibanCrypte': ''}
        specific_fields = {
            'heureDocument': 'heureDoc',
            'numClient': 'numClient',
            'typeReport': 'typeReport',
        }

    # Common fields first, then the endpoint-specific ones, so that the
    # parameters keep the same order in the query string.
    for field_map in (common_fields, specific_fields):
        for param_name, json_key in field_map.items():
            request_params[param_name] = Dict(json_key)(self)

    return Env('baseurl')(self) + endpoint + urlencode(request_params)
class item(ItemElement):
    """Card account built from one row of the cards table."""
    klass = Account

    obj_type = Account.TYPE_CARD
    obj_currency = 'EUR'
    obj_number = CleanText(TableCell('number'))
    obj_label = Format('%s %s', CleanText(TableCell('label')), obj_number)
    obj_id = Format('%s.%s', Env('parent_id'), obj_number)

    def obj_coming(self):
        """Return the first non-empty amount among balance, credit and debit.

        Raises:
            AssertionError: if the three columns are all empty.
        """
        comings = tuple(
            CleanDecimal(TableCell(column, default=None), replace_dots=True, default=None)(self)
            for column in ('balance', '_credit', '_debit')
        )
        for coming in comings:
            if not empty(coming):
                return coming
        # Raised explicitly rather than with ``assert False``: an assert is
        # removed under ``python -O`` and the method would then silently
        # return None.
        raise AssertionError('There should be at least 0.00 in the debit column')

    def obj_url(self):
        """Return the absolute URL of the first link found in the label cell."""
        link = TableCell('label')(self)[0].xpath('.//a')[0]
        return urljoin(self.page.url, link.attrib['href'])
class get_recipe(ItemElement):
    """Fill a Recipe from the HTML of a recipe page."""
    klass = Recipe

    obj_id = Env('id')
    obj_title = CleanText('//h1[@class="fn"]')

    def obj_ingredients(self):
        """Return the cleaned text of every ingredient list entry."""
        entries = self.page.doc.xpath(
            '//section[has-class("recette_ingredients")]/ul/li')
        return [CleanText('.')(entry) for entry in entries]

    obj_cooking_time = Time(CleanText('//span[@class="cooktime"]'))
    obj_preparation_time = Time(CleanText('//span[@class="preptime"]'))

    def obj_nb_person(self):
        """Return a one-element list with the yield as an int (0 by default)."""
        servings = Type(CleanText('//span[@class="yield"]'), type=int, default=0)(self)
        return [servings]

    obj_instructions = CleanHTML(
        '//article[@class="recette_etape"]/h3|//article[@class="recette_etape"]/div[@class="recette_etape_texte"]/*[not(self::article)]'
    )
    obj_picture_url = CleanText(
        '//section[has-class("recette_infos")]/div/img[@class="photo"]/@src'
    )
    obj_author = CleanText('//span[@class="author"]', default=NotAvailable)
class item(ItemElement):
    """Gauge sensor built from the element's ``key``/``value`` entries and the env."""
    klass = GaugeSensor

    obj_name = Map(Dict('key'), SENSOR_NAMES)
    obj_gaugeid = Env('nom_court_sit')
    # Sensor id is "<gauge id>.<key>".
    obj_id = Format('%s.%s', obj_gaugeid, Dict('key'))
    obj_unit = 'µg/m³'

    class obj_lastvalue(ItemElement):
        """Measure built from the element's ``value`` entry."""
        klass = GaugeMeasure

        # "date" contains the time..., hence the day comes from "min_donnees".
        obj_date = DateTime(Format('%s %s', Env('min_donnees'), Env('date')))
        obj_level = CleanDecimal(Dict('value'))

    class obj_geo(ItemElement):
        """Coordinates read from the env."""
        klass = GeoCoordinates

        obj_latitude = CleanDecimal(Env('latitude'))
        obj_longitude = CleanDecimal(Env('longitude'))

    class obj_location(ItemElement):
        """Postal address read from the env, with fixed region and country."""
        klass = PostalAddress

        obj_street = Env('adresse')
        obj_postal_code = Env('ninsee')
        obj_city = Env('city')
        obj_region = 'Ile-de-France'
        obj_country = 'France'
class item(ItemElement):
    """Subscription built from the page; id and contract are computed in ``parse``."""
    klass = Subscription

    obj_label = CleanText('//span[@class="ecconumteleule"]')
    obj_subscriber = CleanText(
        '//span[@class="economligneaseule eccobold"]')
    obj_id = Env('id')
    obj__contract = Env('contract')

    def parse(self, el):
        """Store the subscription id and the contract number in the env.

        The id is the text of the first ``ecconumteleule`` span, stripped of
        everything but digits, dashes and dots.  The contract number is read
        from the ``tc_vars["ID_contrat"]`` assignment found in
        ``self.page.data``.
        """
        number_text = el.xpath('//span[@class="ecconumteleule"]')[0].text
        self.env['id'] = re.sub(r'[^\d\-\.]', '', number_text)
        # Raw literal: ``\[`` and ``\]`` are invalid escape sequences in a
        # normal string.  The pattern value itself is unchanged.
        self.env['contract'] = re.search(
            r"""tc_vars\["ID_contrat"\] = '([0-9]+)'""",
            self.page.data).group(1)
class item(ItemElement):
    """Audio entry; only elements with a truthy ``path_mp3`` are kept."""
    klass = BaseAudio

    def condition(self):
        """Return the element's ``path_mp3`` value, used as the keep/skip test."""
        return Dict('path_mp3')(self)

    obj_id = BaseAudioIdFilter(Format(u'%s.%s', Env('radio_id'), Dict('nid')))
    obj_format = u'mp3'
    obj_ext = u'mp3'
    obj_title = Format(u'%s : %s', Dict('title_emission'), Dict('title_diff'))
    obj_description = Dict('desc_emission', default=u'')
    obj_author = Join(u', ', Dict('personnes', default=u''))
    obj_url = Dict('path_mp3')

    def obj_thumbnail(self):
        """Return a Thumbnail for ``path_img_emission``, or None when absent."""
        if 'path_img_emission' not in self.el:
            return None
        image = Thumbnail(Dict('path_img_emission')(self))
        # The thumbnail id is the image path, so it doubles as the URL.
        image.url = image.id
        return image

    def obj_duration(self):
        """Return ``fin - debut`` as a timedelta, or None if either is falsy."""
        end = Dict('fin')(self)
        start = Dict('debut')(self)
        if not (start and end):
            return None
        return timedelta(seconds=int(end) - int(start))
class get_recipe(ItemElement):
    """Fill a Recipe from the JSON-LD script embedded in the page."""
    klass = Recipe

    def parse(self, el):
        """Replace ``self.el`` with the decoded JSON of the first ld+json script."""
        scripts = XPath(u'//script[@type="application/ld+json"]')(self)
        # Drop the CDATA wrapper before decoding.
        strip_cdata = [('//<![CDATA[ ', ''), (' //]]>', '')]
        raw_json = CleanText(u'.', replace=strip_cdata)(scripts[0])
        self.el = json.loads(raw_json)

    obj_id = Env('id')
    obj_title = Dict('name')
    obj_ingredients = Dict('recipeIngredient')

    class obj_picture(ItemElement):
        """Image whose URL and thumbnail both come from the ``image`` entry."""
        klass = BaseImage

        obj_url = Dict('image')
        obj_thumbnail = Eval(Thumbnail, obj_url)

    def obj_instructions(self):
        """Return every step's ``text`` as `` - <text>`` followed by a blank line."""
        steps = Dict('recipeInstructions')(self)
        return ''.join(u" - {0}\n\n".format(step['text']) for step in steps)

    obj_preparation_time = Eval(int, CleanDecimal(Dict('prepTime')))
    obj_cooking_time = Eval(int, CleanDecimal(Dict('cookTime')))

    def obj_nb_person(self):
        """Return a one-element list holding the ``recipeYield`` value."""
        servings = Dict('recipeYield')(self)
        return [servings]
class item(ItemElement):
    """Bill built from one row of the bills table."""
    klass = Bill

    # "facture-<date>-<amount>-<type>#<subscription id>"
    obj_id = Format(
        'facture-%s-%s-%s#%s',
        Slugify(CleanText(TableCell('date'))),
        Slugify(CleanText(TableCell('amount'))),
        Slugify(CleanText(TableCell('type'))),
        Env('sub_id'),
    )
    obj_url = AbsoluteLink('./td[5]//a', default=NotAvailable)
    obj_date = Date(CleanText(TableCell('date')), dayfirst=True)
    obj_label = Format(
        '%s %s %s',
        CleanText(TableCell('type')),
        CleanText(TableCell('amount')),
        CleanText(TableCell('date')),
    )
    obj_type = DocumentTypes.BILL
    obj_price = CleanDecimal(TableCell('amount'), replace_dots=True)
    obj_currency = Currency(TableCell('amount'))
    # The due date is taken from the "le dd/mm/yyyy" part of the status cell.
    obj_duedate = Date(
        Regexp(CleanText(TableCell('status')), r'le (\d+)/(\d+)/(\d+)', r'\1/\2/\3'),
        dayfirst=True,
    )

    def obj_format(self):
        """Return 'pdf' when the row has a document link, else NotAvailable."""
        return 'pdf' if self.obj_url(self) else NotAvailable

    def obj_income(self):
        """Return True when the price is negative, False otherwise."""
        return bool(self.obj_price(self) < 0)
class get_job_advert(ItemElement):
    """Fill a job advert from the block that follows its named anchor."""
    klass = BaseJobAdvert

    obj_url = Format('%s#%s', Env('url'), Env('id'))
    obj_description = Join(
        '%s\r\n',
        'div/fieldset/*[(@class="titreParagraphe" or @class="normal")]',
        textCleaner=CleanHTML,
    )
    obj_title = CleanText('div/span[@class="intituleposte"]')
    obj_job_name = CleanText('div/span[@class="intituleposte"]')
    obj_society_name = Format('CCI %s', CleanText('div/span[@class="crci crcititle"]'))
    obj_publication_date = DateTime(
        CleanText('div/fieldset/p[@class="dateOffre"]'),
        dayfirst=True,
    )

    def parse(self, el):
        """Point ``self.el`` at the advert's div and store url/id in the env.

        The div is the first one following the ``<a>`` whose ``name`` equals
        the id of the object being filled.
        """
        advert_id = self.obj.id
        anchor_xpath = "//a[@name='%s']/following-sibling::div[1]" % advert_id
        self.el = el.xpath(anchor_xpath)[0]
        self.env['url'] = self.page.url
        self.env['id'] = advert_id
def parse(self, el):
    """Locate the bills list of the current subscription inside ``self.el``.

    Every account of every group is scanned; for an account whose
    ``accDTO/numAcc`` is contained in the ``subid`` env value,
    ``self.item_xpath`` is set to the path of its bills list and the
    group's ``bpDto/bpNumber`` is stored in the env as ``bpNumber``.
    """
    for group_index, group in enumerate(self.el):
        accounts = Dict('listOfBillsByAccDTO')(group)
        for account_index, account in enumerate(accounts):
            if Dict('accDTO/numAcc')(account) not in Env('subid')(self):
                continue
            self.item_xpath = "%d/listOfBillsByAccDTO/%d/listOfbills" % (group_index, account_index)
            self.env['bpNumber'] = Dict('%d/bpDto/bpNumber' % group_index)(self)
            # Only the inner loop stops here: the remaining groups are still
            # scanned, so a match in a later group overrides this one.
            break
def obj_shop(self):
    """Build the Shop from the "contacter_le_vendeur" block of the page.

    The name is what follows "Nom : " in the first list entry, the
    location is the ``tooltip`` JS variable found in the page scripts and
    the info is the text of the ``printPhone`` list entry.
    """
    seller_items = '//div[@xtcz="contacter_le_vendeur"]/div/ul/li'

    seller = Shop(Env('_id')(self))
    seller.name = Regexp(CleanText('(%s)[1]' % seller_items), 'Nom : (.*)')(self)
    seller.location = JSVar(CleanText('//script'), var='tooltip')(self)
    seller.info = CleanText('%s[has-class("printPhone")]' % seller_items)(self)
    return seller
def next_page(self):
    """Return the URL of the next result page, or None when there is none.

    The page number is read from the ``pg=`` parameter of the
    ``<link rel="next">`` href.  When that link is missing or carries no
    page number, None is returned instead of letting ``int()`` raise on
    None or on an empty string.
    """
    page = Regexp(
        CleanText('//link[@rel="next"]/@href', default=''),
        r'.*pg=(\d*)',
        default=None,
    )(self)
    if not page:
        return None
    return BrowserURL('adv_search', search=Env('search'), page=int(page))(self)
class get_recipe(ItemElement):
    """Fill a Recipe from the HTML of a recipe page."""
    klass = Recipe

    obj_id = Env('id')
    obj_title = CleanText('//h1[has-class("m_title")]')
    obj_preparation_time = Type(CleanText('//span[@class="preptime"]'), type=int)
    obj_cooking_time = Type(CleanText('//span[@class="cooktime"]'), type=int)

    def obj_nb_person(self):
        """Return ``[n]`` from "(pour n personnes)", or NotAvailable if not found."""
        # Raw string: ``\(`` and ``\d`` are invalid escapes in a normal literal.
        nb_pers = Regexp(
            CleanText('//div[@class="m_content_recette_ingredients m_avec_substitution"]/span[1]'),
            r'.*\(pour (\d+) personnes\)',
            default=0,
        )(self)
        return [nb_pers] if nb_pers else NotAvailable

    def obj_ingredients(self):
        """Return the ingredients block split on '-', without the leading chunk.

        The text before the first dash is dropped; an empty list is returned
        when the block is missing or contains no dash.
        """
        chunks = CleanText(
            '//div[@class="m_content_recette_ingredients m_avec_substitution"]',
            default='',
        )(self).split('-')
        # Slicing already yields [] when there is a single chunk.
        return chunks[1:]

    obj_instructions = CleanHTML('//div[@class="m_content_recette_todo"]')
    obj_thumbnail_url = CleanText(
        '//a[@class="m_content_recette_illu"]/img/@src',
        default=NotAvailable)
    obj_picture_url = CleanText(
        '//a[@class="m_content_recette_illu"]/img/@src',
        default=NotAvailable)
class item(ItemElement):
    """Housing summary built from one offer block of a results page."""
    klass = Housing

    # "<part of the env 'type' before its last dash>-<offer id>"
    obj_id = Format(
        '%s-%s',
        Regexp(Env('type'), '(.*)-.*'),
        CleanText('./@id', replace=[('header-offer-', '')]))
    obj_title = CleanText(
        './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-type"]/span/@title'
    )
    obj_area = CleanDecimal(
        './div/div/div[@class="offer-details-wrapper"]/div/div/div/div/h3/a/span[@class="offer-area-number"]',
        default=0)
    # Amount: what precedes " <currency symbol>" in the price span.
    obj_cost = CleanDecimal(
        Regexp(
            CleanText(
                './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-price"]/span',
                default=NotAvailable),
            '(.*) [%s%s%s]' % (u'€', u'$', u'£'),
            default=NotAvailable),
        default=0)
    # Currency: the symbol found in the same price span, '€' by default.
    obj_currency = Regexp(
        CleanText(
            './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-price"]/span',
            default=NotAvailable),
        '.* ([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€')
    # Raw string: ``\d`` is an invalid escape sequence in a normal literal.
    # NOTE(review): the date is parsed without dayfirst=True; confirm this
    # is intended if the site displays dd/mm/yyyy dates.
    obj_date = Date(
        Regexp(
            CleanText(
                './div/div/div[has-class("offer-picture-more")]/div/p[@class="offer-update"]'
            ),
            r".*(\d{2}/\d{2}/\d{4}).*"))
    obj_text = CleanText(
        './div/div/div[@class="offer-details-wrapper"]/div/div/div/p[has-class("offer-description")]/span'
    )
    obj_location = CleanText(
        './div/div/div[@class="offer-details-wrapper"]/div/div/div/div/h2'
    )
class get_story(ItemElement):
    """Fill a Story (and its author) from a story page."""
    klass = Story

    obj_id = Env('id')
    obj_title = CleanText('//h1')
    # "le dd-mm-yyyy" is reordered to "yyyy-mm-dd" before being parsed.
    obj_date = (
        CleanText('//span[has-class("t4")]')
        & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1')
        & Date
    )
    obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]')

    def obj_body(self):
        """Return the story text rebuilt from the ``<br>`` children of the body div.

        For each ``<br>``: its stripped text (if any), a newline, then its
        stripped tail (if any).  ``\\x92`` characters are replaced by
        apostrophes and the result is stripped.
        """
        container = self.el.xpath('//div[@align="justify"]')[0]
        pieces = []
        for line_break in container.findall('br'):
            if line_break.text is not None:
                pieces.append(line_break.text.strip())
            pieces.append('\n')
            if line_break.tail is not None:
                pieces.append(line_break.tail.strip())
        return ''.join(pieces).replace(u'\x92', "'").strip()

    class obj_author(ItemElement):
        """Author name and sex read from the same page."""
        klass = Author

        obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]')
        obj_sex = (
            CleanText('//td[has-class("t0")]')
            & Regexp(pattern=r"Auteur (\w+)")
            & Author.Sex2Enum
        )
def obj_investment(self):
    """Return the cached investment whose label appears in the previous tbody.

    Candidates are the investments cached by the browser for the ``accid``
    env value; when several labels match, the last one wins.  An
    AssertionError is raised when none matches.
    """
    account_investments = self.page.browser.cache['invs'][Env('accid')(self)]
    matched = None
    for candidate in account_investments:
        heading = CleanText('./parent::tbody/preceding-sibling::tbody[1]')(self)
        if candidate.label in heading:
            matched = candidate
    assert matched is not None
    return matched
class item(ItemElement):
    """Event of a listing page, kept only if it passes date and category filters."""
    klass = AgendaculturelEvent

    def validate(self, obj):
        """Keep ``obj`` only when both the date and the category checks pass."""
        if not self.check_date(obj):
            return False
        return self.check_category(obj)

    def check_date(self, obj):
        """Tell whether ``obj`` fits the ``date_from``/``date_to`` env window.

        False unless ``date_from`` is set and the event starts on or after
        it.  Then True when ``date_to`` is unset, when the event has an end
        date not later than ``date_to``, or when the event starts no later
        than ``date_to``.
        """
        env = self.env
        if not (env['date_from'] and obj.start_date >= env['date_from']):
            return False
        if not env['date_to']:
            return True
        if obj.end_date and obj.end_date <= env['date_to']:
            return True
        if env['date_to'] >= obj.start_date:
            return True
        return False

    def check_category(self, obj):
        """Accept any category when no filter is set, else require membership."""
        return (not self.env['categories']
                or obj.category in self.env['categories'])

    # "<region>.<part of the event link between the first '/' and '.html'>"
    obj_id = Format(
        '%s.%s',
        Env('region'),
        Regexp(CleanText('./div/a[@itemprop="url"]/@href'), '/(.*).html'))
    obj_summary = CleanText('./div/a[@itemprop="url"]')

    def obj_start_date(self):
        """Return the ``startDate`` meta content as a datetime at midnight."""
        start_day = Date(
            CleanText('./meta[@itemprop="startDate"]/@content'))(self)
        return datetime.combine(start_day, time.min)

    obj_category = AgendaculturelCategory(
        Regexp(CleanText('./@itemtype'), 'http://schema.org/(.*)'))
class item(ItemElement):
    """PDF bill built from one JSON bill entry."""
    klass = Bill

    obj_date = Date(Dict('dueDate'), parse_func=parse_french_date, default=NotAvailable)
    obj_price = CleanDecimal(Dict('amountIncludingTax'))
    obj_format = 'pdf'

    def obj_label(self):
        """Return "Facture du <date>" using the bill's date field."""
        return 'Facture du %s' % Field('date')(self)

    def obj_id(self):
        """Return "<subid>_<ddmmyyyy>" built from the env and the date field."""
        subscription_id = Env('subid')(self)
        bill_date = Field('date')(self)
        return '%s_%s' % (subscription_id, bill_date.strftime('%d%m%Y'))

    def get_params(self):
        """Return the url-encoded ``billid``/``billDate`` query parameters."""
        return urlencode({
            'billid': Dict('id')(self),
            'billDate': Dict('dueDate')(self),
        })

    obj_url = BrowserURL(
        'doc_api_pro',
        subid=Env('subid'),
        dir=Dict('documents/0/mainDir'),
        fact_type=Dict('documents/0/subDir'),
        billparams=get_params,
    )
    obj__is_v2 = False
class item(ItemElement):
    """PDF bill built from one teaser row."""
    klass = Bill

    # "<subscription>#<id attribute of the row>"
    obj_id = Format('%s#%s', Env('subscription'), Attr('.', 'id'))
    obj_price = CleanDecimal('.//span[has-class("nbPrice")]', replace_dots=(',', '€'))
    obj_currency = "€"

    def obj_income(self):
        """Return False when the displayed price starts with '−', True otherwise."""
        displayed_price = CleanText('.//span[has-class("nbPrice")]')(self)
        starts_with_minus = displayed_price.startswith('−')
        return not starts_with_minus

    obj_label = CleanText('.//p[has-class("TeaserRow-desc")]')
    obj_date = Date(CleanText('.//p[has-class("TeaserRow-date")]'), dayfirst=True)
    obj_duedate = obj_date
    obj_format = "pdf"

    def obj_url(self):
        """Return the absolute download link, or NotAvailable if the row has none."""
        try:
            href = Link('.//a[has-class("Download")]')(self)
        except XPathNotFound:
            return NotAvailable
        return urljoin(self.page.browser.BASEURL, href)
class account(ItemElement):
    """Loan account built from one table row; its id is computed in ``parse``."""
    klass = Account

    # The sign callback always returns -1.
    obj_balance = CleanDecimal(TableCell('balance'), replace_dots=True, sign=lambda x: -1)
    obj_currency = FrenchTransaction.Currency(TableCell('balance'))
    obj_type = Account.TYPE_LOAN
    obj_id = Env('id')

    def obj_label(self):
        """Return the loan label.

        When the enclosing table has a header containing "Type", the label
        is the second cell of the row; otherwise it is the part before
        ' - ' of the div preceding the table.
        """
        has_type = CleanText(
            './ancestor::table[.//th[contains(text(), "Type")]]',
            default=None)(self)
        if has_type:
            return CleanText('./td[2]')(self)
        heading = CleanText('./ancestor::table/preceding-sibling::div[1]')(self)
        return heading.split(' - ')[0]

    def parse(self, el):
        """Compute the account id and store it in the env.

        The id concatenates the two words of the id cell ("xxx - yyy"), the
        label without spaces, and the position of this row among the rows
        matching the label or belonging to the same table (0 if not found).
        """
        label = Field('label')(self)
        rows = self.xpath(
            '//td[contains(text(), "%s")]/ancestor::tr[1] | ./ancestor::table[1]/tbody/tr'
            % label)
        position = next(
            (index for index, row in enumerate(rows) if el == row), 0)
        compact_label = label.replace(' ', '')
        id_prefix = Regexp(
            CleanText(TableCell('id')), r'(\w+)\s-\s(\w+)', r'\1\2')(self)
        self.env['id'] = "%s%s%s" % (id_prefix, compact_label, position)
def obj_date(self):
    """Parse the date of the first cell according to the length of its text.

    A 10-character text is parsed as a full day-first date; a 5-character
    text has no year and goes through the ``date_guesser`` from the env.
    Any other length yields None.
    """
    raw_date = CleanText('./td[1]/font//text()')(self)
    length = len(raw_date)

    if length == 10:
        return Date(CleanText('./td[1]/font//text()'), dayfirst=True)(self)

    if length == 5:
        # Date has no indicated year.  This branch reads every text node of
        # the cell, not only those under <font>.
        return DateGuesser(CleanText('./td[1]//text()'), Env('date_guesser'))(self)

    return None
class item(ItemElement):
    """HTML bill built from one order block."""
    klass = Bill

    obj__simple_id = CleanText(
        './/div[has-class("actions")]//span[has-class("value")]')
    obj_id = Format('%s_%s', Env('subid'), Field('_simple_id'))
    obj_url = Format(
        '/gp/css/summary/print.html/ref=oh_aui_ajax_pi?ie=UTF8&orderID=%s',
        Field('_simple_id'))
    obj_format = 'html'
    obj_label = Format('Facture %s', Field('_simple_id'))
    obj_type = 'bill'

    def obj_date(self):
        """Parse the left-column value that does NOT contain the currency."""
        currency = Env('currency')(self)
        date_xpath = (
            './/div[has-class("a-col-left")]//span[has-class("value") and not(contains(., "%s"))]'
            % currency)
        return parse_french_date(CleanText(date_xpath)(self))

    def obj_price(self):
        """Parse the left-column value that contains the currency as a decimal."""
        currency = Env('currency')(self)
        price_xpath = (
            './/div[has-class("a-col-left")]//span[has-class("value") and contains(., "%s")]'
            % currency)
        # Dots are replaced only when the currency is EUR.
        return CleanDecimal(price_xpath, replace_dots=currency == u'EUR')(self)

    def obj_currency(self):
        """Extract the currency from the left-column value that contains it."""
        currency = Env('currency')(self)
        price_xpath = (
            './/div[has-class("a-col-left")]//span[has-class("value") and contains(., "%s")]'
            % currency)
        return Currency(price_xpath)(self)
class get_housing(ItemElement):
    """Fill a Housing from the HTML of an advert page."""
    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText(
        '//h1[@class="desc clearfix"]/span[@class="title"]')
    obj_cost = CleanDecimal(
        '//h1[@class="desc clearfix"]/span[@class="prix"]')
    obj_currency = Regexp(
        CleanText('//h1[@class="desc clearfix"]/span[@class="prix"]'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€')
    # Area: the digits right before " m²" in the title.  The backslash of
    # ``\d`` is escaped because it is not a valid string escape; ``\xb2``
    # stays a string escape, so the pattern value is unchanged.
    obj_area = CleanDecimal(
        Regexp(
            CleanText('//h1[@class="desc clearfix"]/span[@class="title"]'),
            '(.*?)(\\d*) m\xb2(.*?)',
            '\\2'),
        default=NotAvailable)
    obj_location = CleanText('//div[@class="text-annonce"]/h2')
    obj_text = CleanHTML('//div[@class="text-annonce"]/p')
    obj_station = CleanText('//div[@class="metro"]')
    obj_phone = CleanText('//span[@class="telephone hide-tel"]')
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_details(self):
        """Return a dict of the footer label/value pairs plus the energy class.

        Pairs with an empty label or an empty value are skipped.
        """
        details = {}
        for entry in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
            key = CleanText('./span[@class="label"]')(entry)
            # The value is the entry text with its label removed.
            value = CleanText('.', replace=[(key, '')])(entry)
            if value and key:
                details[key] = value

        key = CleanText(
            '//div[@class="classe-energie-content"]/div/div/span')(self)
        value = Format(
            '%s(%s)',
            CleanText('//div[@class="classe-energie-content"]/div/div/p'),
            CleanText('//div[@class="classe-energie-content"]/div/@class',
                      replace=[('-', ' ')]))(self)
        if value and key:
            details[key] = value
        return details

    def obj_photos(self):
        """Return one HousingPhoto per showcase thumbnail ``src``."""
        sources = XPath('//div[@class="showcase-thumbnail"]/img/@src')(self)
        return [HousingPhoto(u'%s' % src) for src in sources]
class get_housing(ItemElement):
    """Fill a Housing from the HTML of an advert page."""
    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText('//h1[@class="clearfix"]/span[@class="title"]')
    obj_cost = CleanDecimal(
        '//h1[@class="clearfix"]/span[@class="price"]',
        replace_dots=True)
    obj_currency = Regexp(
        CleanText('//h1[@class="clearfix"]/span[@class="price"]'),
        '.*([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€')
    # Area: the digits right before " m²" in the title.  The backslash of
    # ``\d`` is escaped because it is not a valid string escape; ``\xb2``
    # stays a string escape, so the pattern value is unchanged.
    obj_area = CleanDecimal(
        Regexp(
            CleanText('//h1[@class="clearfix"]/span[@class="title"]'),
            '(.*?)(\\d*) m\xb2(.*?)',
            '\\2'),
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_location = CleanText('//div[@class="item-geoloc"]/h2')
    obj_text = CleanText(CleanHTML('//p[@class="item-description"]'))
    obj_station = CleanText('//div[@class="metro"]')
    obj_phone = CleanHTML('(//div[has-class("tel-wrapper")])[1]')
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_details(self):
        """Return a dict of the summary label/value pairs plus the energy rank.

        Pairs with an empty label or an empty value are skipped.
        """
        details = {}
        for entry in XPath('//ul[@class="item-summary"]/li')(self):
            # Label: the entry's own text; value: its <strong> child.
            key = CleanText('.', children=False)(entry)
            value = CleanText('./strong')(entry)
            if value and key:
                details[key] = value

        key = CleanText(
            '//div[@class="box energy-box"]/div/div/p[@class="h3"]')(self)
        value = Format(
            '%s(%s)',
            CleanText('(//div[@class="box energy-box"]/div/div/p)[2]'),
            CleanText('//div[@class="box energy-box"]/div/div/@class',
                      replace=[('-', ''), ('rank', '')]))(self)
        if value and key:
            details[key] = value
        return details

    def obj_photos(self):
        """Return one HousingPhoto per showcase thumbnail ``src``."""
        sources = XPath(
            '//div[has-class("showcase-thumbnail")]/img/@src')(self)
        return [HousingPhoto(u'%s' % src) for src in sources]
class get_video(ItemElement):
    """Fill a video from its page; the mp4 URL is dug out of the scripts."""
    klass = BaseVideo

    obj_id = Env('id')
    obj_title = CleanText('//title')
    obj_nsfw = True
    obj_ext = u'mp4'
    # Take the text of the first <script>, capture the JS-escaped https URL
    # containing ".mp4" (up to the closing double quote), then remove the
    # backslashes.
    obj_url = (
        CleanText('//script')
        & Regexp(pattern=r'(https:\\/\\/[^"]+\.mp4[^"]+)"')
        & CleanText(replace=[('\\', '')])
    )
class get_job_advert(ItemElement):
    """Fill a job advert from the HTML of an advert page."""
    klass = BaseJobAdvert

    obj_id = Format('#%s', Env('_id'))
    obj_url = BrowserURL('advert', _id=Env('_id'))
    # Title and description each accept two alternative page layouts.
    obj_title = CleanText(
        '//div[@id="jobcopy"]/h1[@itemprop="title"]|//div[@itemprop="title"]/h1'
    )
    obj_description = CleanHTML(
        '//div[@id="jobBodyContent"]|//div[@itemprop="description"]'
    )
    obj_contract_type = Join(u' ', '//dd[starts-with(@class, "multipledd")]')
    obj_society_name = CleanText('//dd[@itemprop="hiringOrganization"]')
    obj_place = CleanText('//span[@itemprop="jobLocation"]')
    obj_pay = CleanText('//span[@itemprop="baseSalary"]')
    obj_formation = CleanText('//span[@itemprop="educationRequirements"]')
    obj_experience = CleanText('//span[@itemprop="qualifications"]')
class get_thread(ItemElement):
    """Fill a Thread from the permalink container of a tweet page."""
    klass = Thread

    obj_id = Format('%s#%s', Env('user'), Env('_id'))
    # Title: link text, then paragraph text; in both, the space following
    # "@", "#" and "http://" is removed.
    obj_title = Format(
        '%s \n\t %s',
        CleanText(
            '//div[@class="permalink-inner permalink-tweet-container"]/div/div/div/a',
            replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
        CleanText(
            '//div[@class="permalink-inner permalink-tweet-container"]/div/div/p',
            replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
    # "<hh:mm> ... - <date ending with a 4-digit year>" is reordered to
    # "<date> <hh:mm>".  Raw string: ``\d`` is an invalid escape sequence
    # in a normal literal.
    obj_date = DateTime(
        Regexp(
            CleanText(
                '//div[@class="permalink-inner permalink-tweet-container"]/div/div/div[@class="client-and-actions"]/span/span'
            ),
            r'(\d+:\d+).+- (.+\d{4})',
            '\\2 \\1'),
        translations=DATE_TRANSLATE_FR)
class item(ItemElement):
    """Collection built from one cluster entry."""
    klass = Collection

    # The env 'title' value names the key holding the title in the element.
    obj_title = Dict(CleanText(Env('title')))
    obj_id = Dict('clusterId')

    def obj_split_path(self):
        """Return ``[<SITE.PROGRAM id>, <clusterId>]``."""
        program_id = SITE.PROGRAM.get('id')
        cluster_id = Dict('clusterId')(self)
        return [program_id, cluster_id]
class item(ItemElement):
    """PDF document built from one direct-debit JSON entry."""
    klass = Document

    obj_id = Format("%s_%s", Env("subscription"), Dict("reference"))
    obj_format = "pdf"
    obj_date = Date(Dict("datePrelevement"))
    # Year and month digits of "yyyy-mm-dd", concatenated ("yyyymm").
    obj__period = Regexp(
        Dict("datePrelevement"),
        r"(\d{4})-(\d{2})-(\d{2})",
        "\\1\\2",
    )
    obj_label = Format("Prélèvement du %s", Field("date"))
    obj_type = DocumentTypes.OTHER
    obj_url = BrowserURL(
        "direct_debit_download",
        employer=Env("employer"),
        reference=Dict("reference"),
        period=Field("_period"),
        type=Dict("typeOrigine"),
    )
def obj_price(self):
    """Return the order price as a decimal, or NotAvailable when it has none.

    Some orders, audiobooks for example, are paid using "audio credits":
    they have no price or currency.
    """
    currency = Env('currency')(self)
    price_xpath = (
        './/div[has-class("a-col-left")]//span[has-class("value") and contains(., "%s")]'
        % currency)
    # Dots are replaced only when the currency is EUR.
    price_filter = CleanDecimal(
        price_xpath,
        replace_dots=currency == 'EUR',
        default=NotAvailable)
    return price_filter(self)