def get_session_storage(self):
    """Extract the key/value pair given to ``sessionStorage.setItem`` in the page.

    The arguments of the ``sessionStorage.setItem(...)`` call found in the
    ``text/javascript`` script content are split on the first comma; both
    parts are stripped of surrounding single quotes and whitespace.

    :return: tuple ``(key, value)`` where ``value`` is the JSON-decoded
        second argument
    :raises ValueError: if the arguments contain no comma, or if the second
        argument is not valid JSON
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    set_item_args = Regexp(
        CleanText('//script[@type="text/javascript"]'),
        r'sessionStorage.setItem\((.*)\)'
    )(self.doc)

    # Split once only: the JSON payload may itself contain commas.
    raw_key, raw_value = set_item_args.split(",", 1)
    key = raw_key.strip("'").strip()
    value = raw_value.strip("'").strip()
    return key, json.loads(value)
def filter(self, el):
    """Convert the duration text of ``el[0]`` into a ``timedelta``.

    The duration is whatever follows the last ``|`` in the cleaned text of
    ``el[0]``. Two spellings are handled: ``NN'`` (minutes only) and
    ``HH:MM[...]`` (colon-separated, first two fields used).

    :param el: sequence whose first item is the element holding the text
    :return: ``timedelta`` built from the parsed hours and minutes
    :raises ValueError: if a field is not an integer
    :raises IndexError: if the colon form has fewer than two fields
    """
    duration = Regexp(CleanText('.'), r'.+\|(.+)')(el[0])

    if duration.endswith("'"):
        # "NN'" form: a bare number of minutes.
        hours, minutes = 0, int(duration[:-1])
    else:
        # Build a real list: on Python 3 ``map`` returns an iterator that
        # cannot be indexed.
        fields = [int(field) for field in duration.split(':')]
        hours, minutes = fields[0], fields[1]

    return timedelta(hours=hours, minutes=minutes)
def next_page(self):
    """Return the URL of the next result page, or ``None``.

    Reads the ``"<a> / <b>"`` counter from the
    ``m_resultats_recherche_titre`` block and, when ``a - b < 10``, builds
    the ``search`` URL with ``start`` set to ``a``.

    :return: the value produced by ``BrowserURL(...)``, or ``None`` when the
        condition does not hold
    :raises ValueError: if either side of the counter is not an integer
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    str_results = Regexp(
        CleanText('//div[@class="m_resultats_recherche_titre"]'),
        r'.* - (\d* / \d*) .*'
    )(self)
    counts = str_results.split('/')
    first = int(counts[0])
    second = int(counts[1])

    # NOTE(review): presumably the counter reads "shown / total"; if so,
    # ``first - second`` is never positive and this test is always true,
    # including on the last page. Confirm the intended comparison.
    if first - second < 10:
        return BrowserURL('search', pattern=Env('pattern'), start=first)(self)
    return None
def iter_payment_details(self, sub):
    """Yield a ``Detail`` for every table row of the payment page that has cells.

    The payment date is read from the page header (the text between "le "
    and " pour un montant de"); nothing is yielded when the header does not
    match. Each ``nomBeneficiaire`` span is paired, by position, with the
    ``tableauPrestation`` table at the same index.

    :param sub: object whose ``_id`` prefixes every generated detail id
    :return: generator of ``Detail`` objects; rows with 4 or 5 cells get
        ``id``, ``label``, ``infos``, ``datetime`` and ``price`` filled in,
        rows with any other non-zero cell count are yielded as a bare
        ``Detail``
    :raises IndexError: if the header is missing or there are fewer tables
        than beneficiary spans
    :raises ValueError: if a date does not match ``%d/%m/%Y``
    """
    header = self.doc.xpath('//div[@class="entete container"]/h2')[0].text.strip()
    m = re.match('.*le (.*) pour un montant de.*', header)
    if not m:
        return

    blocs_benes = self.doc.xpath('//span[contains(@id,"nomBeneficiaire")]')
    blocs_prestas = self.doc.xpath('//table[@id="tableauPrestation"]')
    if not blocs_benes:
        # Nothing to iterate over: leave before parsing the payment date so
        # that a malformed date is only an error when there are rows to emit.
        return

    # The payment id is the same for every beneficiary block: compute it once.
    payment_date = datetime.strptime(m.group(1), '%d/%m/%Y').date()
    payment_id = sub._id + "." + payment_date.strftime("%Y%m%d")

    not_amount = re.compile(r'[^\d,-]+')

    def amount(text):
        # Keep only digits, commas and minus signs; a missing cell counts as '0'.
        return not_amount.sub('', '0' if text is None else text)

    for index, bloc_bene in enumerate(blocs_benes):
        bene = bloc_bene.text
        line = 1  # row counter, restarted for every beneficiary block
        last_date = None  # date inherited by rows that carry none
        for tr in blocs_prestas[index].xpath('.//tr'):
            tds = tr.xpath('.//td')
            if not tds:
                continue

            det = Detail()

            # TO TEST: daily allowances ("indemnités journalières") - no
            # similar case could be tested on the new version of the site.
            if len(tds) == 4:
                date_str = Regexp(pattern=r'.*<br/>(\d+/\d+/\d+)\).*').filter(tds[0].text)
                det.id = payment_id + "." + str(line)
                det.label = tds[0].xpath('.//span')[0].text.strip()

                jours = amount(tds[1].text)
                montant = amount(tds[2].text)
                price = amount(tds[3].text)

                if not date_str:
                    det.infos = u''
                    det.datetime = last_date
                else:
                    det.infos = '%s (%sj) * %s€' % (date_str, jours, montant)
                    # NOTE(review): the pattern above appears to capture a bare
                    # dd/mm/yyyy with no spaces, in which case ``split(' ')[3]``
                    # raises IndexError. Left as is because this branch is
                    # untested - confirm against a real page.
                    det.datetime = datetime.strptime(date_str.split(' ')[3], '%d/%m/%Y').date()
                    last_date = det.datetime
                det.price = Decimal(price.replace(',', '.'))

            elif len(tds) == 5:
                date_str = Regexp(
                    pattern=r'\w*(\d{2})/(\d{2})/(\d{4}).*',
                    template='\\1/\\2/\\3',
                    default=""
                ).filter("".join(tds[0].itertext()))
                det.id = payment_id + "." + str(line)
                det.label = '%s - %s' % (bene, tds[0].xpath('.//span')[0].text.strip())

                paye = amount(tds[1].text)
                base = amount(tds[2].text)
                taux = amount(tds[3].xpath('.//span')[0].text)
                price = amount(tds[4].xpath('.//span')[0].text)

                if not date_str:
                    det.infos = u''
                    det.datetime = last_date
                else:
                    det.infos = u' Payé %s€ / Base %s€ / Taux %s%%' % (paye, base, taux)
                    det.datetime = datetime.strptime(date_str, '%d/%m/%Y').date()
                    last_date = det.datetime
                det.price = Decimal(price.replace(',', '.'))

            line += 1
            yield det
def get_session_storage(self):
    """Extract the key/value pair given to ``sessionStorage.setItem`` in the page.

    The arguments of the ``sessionStorage.setItem(...)`` call found in the
    ``text/javascript`` script content are split on the first comma; both
    parts are stripped of surrounding single quotes and whitespace.

    :return: tuple ``(key, value)`` where ``value`` is the JSON-decoded
        second argument
    :raises ValueError: if the arguments contain no comma, or if the second
        argument is not valid JSON
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    set_item_args = Regexp(
        CleanText('//script[@type="text/javascript"]'),
        r'sessionStorage.setItem\((.*)\)'
    )(self.doc)

    # Split once only: the JSON payload may itself contain commas.
    raw_key, raw_value = set_item_args.split(",", 1)
    key = raw_key.strip("'").strip()
    value = raw_value.strip("'").strip()
    return key, json.loads(value)
def obj_split_path(self):
    """Build the split path of the item from its link.

    Takes the ``href`` of the child ``<a>``, drops the leading
    two-character segment (``/xx/``) and splits the remainder on ``/``.

    :return: list starting with ``SITE.CINEMA.get('id')`` followed by the
        path segments
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    relative_path = Regexp(CleanText('./a/@href'), r'/\w{2}/(.*)')(self)
    segments = [SITE.CINEMA.get('id')]
    segments.extend(relative_path.split('/'))
    return segments
def iter_payment_details(self, sub):
    """Yield a ``Detail`` for every table row of the payment page that has cells.

    The payment date is read from the page header (the text between "le "
    and " pour un montant de"); nothing is yielded when the header does not
    match. Each ``nomBeneficiaire`` span is paired, by position, with the
    ``tableauPrestation`` table at the same index.

    Text fields are built with ``u''`` format strings so the block does not
    depend on the Python-2-only ``unicode`` builtin.

    :param sub: object whose ``_id`` prefixes every generated detail id
    :return: generator of ``Detail`` objects; rows with 4 or 5 cells get
        ``id``, ``label``, ``infos``, ``datetime`` and ``price`` filled in,
        rows with any other non-zero cell count are yielded as a bare
        ``Detail``
    :raises IndexError: if the header is missing or there are fewer tables
        than beneficiary spans
    :raises ValueError: if a date does not match ``%d/%m/%Y``
    """
    header = self.doc.xpath(
        '//div[@class="entete container"]/h2')[0].text.strip()
    m = re.match('.*le (.*) pour un montant de.*', header)
    if not m:
        return

    blocs_benes = self.doc.xpath(
        '//span[contains(@id,"nomBeneficiaire")]')
    blocs_prestas = self.doc.xpath('//table[@id="tableauPrestation"]')
    if not blocs_benes:
        # Nothing to iterate over: leave before parsing the payment date so
        # that a malformed date is only an error when there are rows to emit.
        return

    # The payment id is the same for every beneficiary block: compute it once.
    payment_date = datetime.strptime(m.group(1), '%d/%m/%Y').date()
    payment_id = sub._id + "." + payment_date.strftime("%Y%m%d")

    not_amount = re.compile(r'[^\d,-]+')

    def amount(text):
        # Keep only digits, commas and minus signs; a missing cell counts as '0'.
        return not_amount.sub('', '0' if text is None else text)

    for index, bloc_bene in enumerate(blocs_benes):
        bene = bloc_bene.text
        line = 1  # row counter, restarted for every beneficiary block
        last_date = None  # date inherited by rows that carry none
        for tr in blocs_prestas[index].xpath('.//tr'):
            tds = tr.xpath('.//td')
            if not tds:
                continue

            det = Detail()

            # TO TEST: daily allowances ("indemnités journalières") - no
            # similar case could be tested on the new version of the site.
            if len(tds) == 4:
                # Apply the regexp to the raw text through ``.filter`` with
                # explicit keywords: passed positionally, the pattern would
                # land in the selector slot and the template in the pattern
                # slot.
                date_str = Regexp(
                    pattern=r'.*<br/>(\d+/\d+/\d+)\).*',
                    template='\\1'
                ).filter(tds[0].text)
                det.id = payment_id + "." + str(line)
                det.label = u'%s' % tds[0].xpath('.//span')[0].text.strip()

                jours = amount(tds[1].text)
                montant = amount(tds[2].text)
                price = amount(tds[3].text)

                if not date_str:
                    det.infos = u''
                    det.datetime = last_date
                else:
                    det.infos = u'%s (%sj) * %s€' % (date_str, jours, montant)
                    # NOTE(review): the pattern above appears to capture a bare
                    # dd/mm/yyyy with no spaces, in which case ``split(' ')[3]``
                    # raises IndexError. Left as is because this branch is
                    # untested - confirm against a real page.
                    det.datetime = datetime.strptime(
                        date_str.split(' ')[3], '%d/%m/%Y').date()
                    last_date = det.datetime
                det.price = Decimal(price.replace(',', '.'))

            elif len(tds) == 5:
                date_str = Regexp(
                    pattern=r'\w*(\d{2})/(\d{2})/(\d{4}).*',
                    template='\\1/\\2/\\3',
                    default=""
                ).filter("".join(tds[0].itertext()))
                det.id = payment_id + "." + str(line)
                det.label = u'%s - %s' % (
                    bene, tds[0].xpath('.//span')[0].text.strip())

                paye = amount(tds[1].text)
                base = amount(tds[2].text)
                taux = amount(tds[3].xpath('.//span')[0].text)
                price = amount(tds[4].xpath('.//span')[0].text)

                if not date_str:
                    det.infos = u''
                    det.datetime = last_date
                else:
                    det.infos = u' Payé %s€ / Base %s€ / Taux %s%%' % (
                        paye, base, taux)
                    det.datetime = datetime.strptime(
                        date_str, '%d/%m/%Y').date()
                    last_date = det.datetime
                det.price = Decimal(price.replace(',', '.'))

            line += 1
            yield det
def get_params(self):
    """Return the Algolia parameters embedded in the page scripts.

    The ``algolia_app_id`` and ``algolia_api_key`` values are captured from
    the cleaned ``<script>`` text, joined with ``|`` by the regexp template
    and split back into a list.

    :return: list obtained by splitting ``"<app_id>|<api_key>"`` on ``|``
    """
    extract_credentials = Regexp(
        CleanText('//script'),
        '"algolia_app_id":"(.*)","algolia_api_key":"(.*)","algolia_api_index_taxonomy".*',
        '\\1|\\2'
    )
    joined = extract_credentials(self.doc)
    return joined.split('|')
def get_params(self):
    """Return the Algolia parameters embedded in the page scripts.

    The ``algolia_app_id`` and ``algolia_api_key`` values are captured from
    the cleaned ``<script>`` text, joined with ``|`` by the regexp template
    and split back into a list.

    :return: list obtained by splitting ``"<app_id>|<api_key>"`` on ``|``
    """
    extract_credentials = Regexp(
        CleanText('//script'),
        '"algolia_app_id":"(.*)","algolia_api_key":"(.*)","algolia_api_index_taxonomy".*',
        '\\1|\\2'
    )
    joined = extract_credentials(self.doc)
    return joined.split('|')