class item(ItemElement):
    """Item whose fields are read from ``td`` cells of the current element."""

    klass = Detail

    # No identifier is extracted for these entries.
    obj_id = None
    # Parsed from the cleaned text selected by the union of cells 1 and 2.
    obj_datetime = DateTime(CleanText('td[1] | td[2]'))
    # Seventh cell, with 0 declared as the default value.
    obj_price = CleanDecimal('td[7]', replace_dots=False, default=0)
    # The currency is a constant for every item.
    obj_currency = u'EUR'
    # Label assembled from the cleaned text of cells 3 to 6.
    obj_label = Format(u"%s from %s to %s - %s",
                       CleanText('td[3]'),
                       CleanText('td[4]'),
                       CleanText('td[5]'),
                       CleanText('td[6]'))
def obj_date(self):
    """Return the date found after "- Mise en ligne le" in the upload block.

    The captured text is rewritten with every (pattern, replacement) pair of
    DATE_TRANSLATE_FR, stored in ``self.env['tmp']`` and then parsed by
    ``DateTime`` with a ``LinearDateGuesser``.
    """
    upload_text = CleanText('//div[@class="upload_by"]', replace=[(u'à', '')])
    raw_date = Regexp(upload_text, '.*- Mise en ligne le (.*).')(self)

    for french, english in DATE_TRANSLATE_FR:
        raw_date = french.sub(english, raw_date)

    # The DateTime filter reads its input through Env('tmp').
    self.env['tmp'] = raw_date
    date_filter = DateTime(Env('tmp'), LinearDateGuesser())
    return date_filter(self)
def obj_end_date(self):
    """Return the end date parsed from the element's ``title`` attribute.

    Returns None when the title contains no date/time occurrence.  With a
    single occurrence, the date is combined with the second "HHhMM" time that
    follows it; with several occurrences, the last one (``nth=-1``) is used.
    """
    occurrences = re.findall(r'\w* \w* \d?\d \w* \d{4} \w* \d{2}h\d{2}',
                             CleanText('./@title')(self),
                             re.UNICODE)
    if not occurrences:
        return None

    if len(occurrences) == 1:
        # One date followed by two times: keep the date and the second time.
        extractor = Regexp(
            CleanText('./@title'),
            r'\w* \w* (\d?\d \w* \d{4}) \w* \d{2}h\d{2} \w* (\d{2}h\d{2})',
            '\\1 \\2',
            flags=re.UNICODE)
    else:
        # Several dated occurrences: take the date and time of the last one.
        extractor = Regexp(
            CleanText('./@title'),
            r'\w* \w* (\d?\d \w* \d{4}) \w* (\d{2}h\d{2})',
            '\\1 \\2',
            nth=-1,
            flags=re.UNICODE)

    return DateTime(extractor, parse_func=parse_french_date)(self)
def obj_start_date(self):
    """Return the start date parsed from the element's ``title`` attribute.

    Returns None when the title contains no date/time occurrence; otherwise
    the first date and its "HHhMM" time are extracted and parsed with
    ``parse_french_date``.
    """
    occurrences = re.findall(r'\w* \w* \d?\d \w* \d{4} \w* \d{2}h\d{2}',
                             CleanText('./@title')(self),
                             re.UNICODE)
    if not occurrences:
        return None

    # Raw string: "\w" and "\d" are regex escapes, not string escapes.
    extractor = Regexp(
        CleanText('./@title'),
        r'\w* \w* (\d?\d \w* \d{4}) \w* (\d{2}h\d{2}).*',
        '\\1 \\2',
        flags=re.UNICODE)
    return DateTime(extractor, parse_func=parse_french_date)(self)
def obj_date(self):
    """Return the date found after "Mise en ligne le" in a "line" paragraph.

    The captured text is rewritten with every (pattern, replacement) pair of
    DATE_TRANSLATE_FR, stored in ``self.env['tmp']`` and then parsed by
    ``DateTime`` with a ``LinearDateGuesser``.
    """
    line_text = CleanText('//p[has-class("line")]', replace=[(u'à', '')])
    raw_date = Regexp(line_text, '.*Mise en ligne le (.*)')(self)

    for french, english in DATE_TRANSLATE_FR:
        raw_date = french.sub(english, raw_date)

    # The DateTime filter reads its input through Env('tmp').
    self.env['tmp'] = raw_date
    date_filter = DateTime(Env('tmp'), LinearDateGuesser())
    return date_filter(self)
def obj_date(self):
    """Return the date read from the item's "date" div.

    "Aujourd'hui" and "Hier" are first replaced by today's and yesterday's
    dates, then the DATE_TRANSLATE_FR substitutions are applied and the
    result is parsed by ``DateTime`` with a ``LinearDateGuesser``.
    """
    relative_days = [
        ('Aujourd\'hui', str(date.today())),
        ('Hier', str((date.today() - timedelta(1)))),
    ]
    raw_date = CleanText('./div[@class="lbc"]/div[@class="date"]',
                         replace=relative_days)(self)

    for french, english in DATE_TRANSLATE_FR:
        raw_date = french.sub(english, raw_date)

    # The DateTime filter reads its input through Env('tmp').
    self.env['tmp'] = raw_date
    date_filter = DateTime(Env('tmp'), LinearDateGuesser())
    return date_filter(self)
class item(ItemElement):
    """Item mapping the current element to a ``BaseCalendarEvent``."""

    klass = BaseCalendarEvent

    # URL and id both come from the <h1> link inside the "bbox" div.
    obj_url = Link('./div[@class="bbox"]/h1/a')
    # The id is whatever follows "aspx?" in that link.
    obj_id = Regexp(Link('./div[@class="bbox"]/h1/a'), r'aspx\?(.+)')
    obj_location = CleanText('./div[@class="bbox"]/span/a')
    # Parsed from the "datetime" attribute of a <time> descendant.
    obj_start_date = DateTime(Attr('.//time', 'datetime'))
    # The summary is the part of the link title that follows "details of ".
    obj_summary = Regexp(Attr('./div[@class="bbox"]/h1/a', 'title'), r'details of (.+)')
    # Category and status are constants for every item.
    obj_category = CATEGORIES.CONCERT
    obj_status = STATUS.CONFIRMED
def obj_sensors(self):
    """Return the list filled by ``add_sensor`` for the level and flow values.

    The measure date (reordered from "D.M.Y H:M" to "Y-M-D H:M"), the forecast
    and the alarm are computed once from the environment and passed to both
    ``add_sensor`` calls.
    """
    timestamp = Regexp(Env('datetime'),
                       r'(\d+)\.(\d+)\.(\d+) (\d+):(\d+)',
                       r'\3-\2-\1 \4:\5',
                       default=NotAvailable)
    lastdate = DateTime(timestamp, default=NotAvailable)(self)
    forecast = Map(Env('forecast'), self.forecasts, default=NotAvailable)(self)
    alarm = Map(Env('alarm'), self.alarmlevel, default=u'')(self)

    sensors = []
    sensor_specs = (
        (u"Level", u"cm", 'levelvalue'),
        (u"Flow", u"m3/s", 'flowvalue'),
    )
    for name, unit, env_key in sensor_specs:
        self.add_sensor(sensors, name, unit, self.env[env_key],
                        forecast, alarm, lastdate)
    return sensors
class obj_lastvalue(ItemElement):
    """Nested item producing a ``GaugeMeasure``."""

    klass = GaugeMeasure

    # The date is parsed from the two environment values joined by a space.
    obj_date = DateTime(
        Format(
            '%s %s',
            Env('min_donnees'),
            Env('date'),  # "date" contains the time...
        )
    )
    # Level read from the "value" key of the current element.
    obj_level = CleanDecimal(Dict('value'))
class get_video(ItemElement):
    """Fill a ``BaseVideo`` from the page's <meta> tags and player config."""

    klass = BaseVideo

    obj_id = Env('_id')
    obj_title = CleanText('//title')
    obj_author = CleanText('//meta[@name="author"]/@content')
    obj_description = CleanText('//meta[@name="description"]/@content')

    def obj_duration(self):
        """Return the "video:duration" meta value (default 0) as a timedelta."""
        seconds = int(
            CleanText('//meta[@property="video:duration"]/@content',
                      default=0)(self))
        return timedelta(seconds=seconds)

    def obj_thumbnail(self):
        """Return a ``Thumbnail`` built from the "og:image" meta URL."""
        url = CleanText('//meta[@property="og:image"]/@content')(self)
        thumbnail = Thumbnail(url)
        thumbnail.url = url
        return thumbnail

    obj_date = DateTime(
        CleanText('//meta[@property="video:release_date"]/@content'))

    def obj__formats(self):
        """Return ``{extension: {quality: url}}`` from the player config.

        Returns None when no ``var config = {...}`` script is found.

        Raises:
            ParseError: when the config carries an ``error`` entry.
        """
        # Raw string: "\s" and "\(" are regex escapes, not string escapes.
        player = Regexp(
            CleanText('//script'),
            r'.*var config = ({"context".*}}});\s*buildPlayer\(config\);.*',
            default=None)(self)
        if not player:
            return None

        info = json.loads(player)
        if info.get('error') is not None:
            raise ParseError(info['error']['title'])

        metadata = info.get('metadata')
        formats = {}
        for quality, media_list in metadata['qualities'].items():
            for media in media_list:
                media_url = media.get('url')
                if not media_url:
                    continue
                if media.get('type') == 'application/vnd.lumberjack.manifest':
                    continue
                ext = determine_ext(media_url)
                # Add to the extension's map instead of replacing it, so the
                # qualities already recorded for this extension are kept.
                formats.setdefault(ext, {})[quality] = media_url
        return formats
def _parse_transaction(self, payment):
    """Build a ``Transaction`` from a payment mapping.

    Returns None when the payment has no ``transaction_number``, which
    indicates a failed transaction.
    """
    transaction_id = Dict('transaction_number', default=None)(payment)
    if transaction_id is None:
        # No transaction number: failed transaction, nothing to build.
        return None

    type_by_kind = {
        'ORDER': Transaction.TYPE_CARD,  # order on lunchr website
        'LUNCHR_CARD_PAYMENT': Transaction.TYPE_CARD,  # pay in shop
        'MEAL_VOUCHER_CREDIT': Transaction.TYPE_DEPOSIT,
        # type can be null for refunds
    }

    transaction = Transaction()
    transaction.id = transaction_id
    transaction.date = DateTime(Dict('executed_at'))(payment)
    transaction.rdate = DateTime(Dict('created_at'))(payment)
    kind = Dict('type')(payment)
    transaction.type = type_by_kind.get(kind, Transaction.TYPE_UNKNOWN)
    transaction.label = Dict('name')(payment)
    transaction.amount = CleanDecimal(Dict('amount/value'))(payment)
    return transaction
class get_housing(ItemElement):
    """Fill a ``Housing`` from the elements of the advert document."""

    klass = Housing

    obj_id = Env('_id')
    obj_type = EPAdvertType(CleanText('//rubrique'))
    obj_advert_type = ADVERT_TYPES.PERSONAL
    obj_house_type = EPHouseType(CleanText('//tbien'))
    obj_title = CleanText('//titre')
    obj_rooms = CleanDecimal('//pieces')
    obj_cost = CleanDecimal('//prix')
    obj_currency = Currency.get_currency(u'€')
    obj_utilities = UTILITIES.UNKNOWN
    obj_text = CleanText('//titre')
    obj_location = CleanText('//ville')
    obj_url = CleanText('//urlDetailAnnonce')
    obj_area = CleanDecimal('//surface')
    obj_price_per_meter = PricePerMeterFilter()
    obj_phone = CleanText('//telephone1')
    obj_date = DateTime(CleanText('//DateCheck'))

    def obj_GES(self):
        """Return the ENERGY_CLASS member named by //GSE, else NotAvailable."""
        letter = CleanText('//GSE')(self).upper()
        return getattr(ENERGY_CLASS, letter, NotAvailable)

    def obj_photos(self):
        """Return a ``HousingPhoto`` for each non-empty image URL element."""
        image_paths = ['//UrlImage1', '//UrlImage2', '//UrlImage3']
        urls = (CleanText(path)(self) for path in image_paths)
        return [HousingPhoto(url) for url in urls if url]

    def obj_DPE(self):
        """Return the ENERGY_CLASS member named by //DPE, else NotAvailable."""
        letter = CleanText('//DPE')(self).upper()
        return getattr(ENERGY_CLASS, letter, NotAvailable)

    def obj_details(self):
        """Return a dict of the non-empty optional fields, keyed by label."""
        labelled_paths = [('//Nb_Etage', 'Nombre d\'etages'),
                          ('//Neuf', 'Neuf'),
                          ('//Ancien_avec_du_Charme', 'Ancien avec charme'),
                          ('//Avec_terasse', 'Avec Terrasse'),
                          ('//latitude', 'Latitude'),
                          ('//longitude', 'Longitude'),
                          ('//loyer', 'Loyer'),
                          ('//piscine', 'Piscine'),
                          ('//surface_balcon', 'Surface du balcon'),
                          ('//surface_exp', 'Surface exploitable'),
                          ('//surface_terrain', 'Surface du Terrain'),
                          ('//Meuble', 'furnished')]
        details = dict()
        for path, label in labelled_paths:
            text = CleanText(path)(self)
            if text:
                details[label] = text
        return details
def get_roadmap(self):
    """Yield one ``RoadStep`` per "etape" row of the "trajet_etapes" table."""
    rows = self.doc.xpath(
        '//table[@class="trajet_etapes"]/tr[@class="etape"]')
    for row in rows:
        transport_type = DepartureTypeFilter(
            row.xpath('./td[@class="moyen"]'))(self)
        transport_text = CleanText('./td[@class="moyen"]')(row)

        roadstep = RoadStep()
        roadstep.line = '%s %s' % (transport_type, transport_text)
        # Start time is the "depart" span; end time is its following sibling.
        roadstep.start_time = DateTime(
            CleanText('./th/span[@class="depart"]'),
            LinearDateGuesser())(row)
        roadstep.end_time = DateTime(
            CleanText('./th/span[@class="depart"]/following-sibling::span'),
            LinearDateGuesser())(row)
        roadstep.departure = CleanText('./td[@class="arret"]/p/strong')(row)
        roadstep.arrival = CleanText(
            './td[@class="arret"]/p/following-sibling::p/strong')(row)
        roadstep.duration = RoadMapDuration(
            CleanText('./td[@class="time"]'))(row)
        yield roadstep
class get_last_video(ItemElement):
    """Fill a ``BaseVideo`` from the "diffusion-info" block of the page."""

    klass = BaseVideo

    obj_id = CleanText('//div[@id="diffusion-info"]/@data-diffusion')
    obj_title = CleanText(
        '//div[@id="diffusion-info"]/h1/div[@id="diffusion-titre"]')
    # The date is the "DD-MM-YY ... HHhMM" part of the header text, parsed
    # day-first.  The pattern is a raw string because "\d" is a regex escape.
    # NOTE(review): as written, the second replace pair swaps a space for a
    # space (a no-op); the first element was presumably a double or
    # non-breaking space - verify against upstream.
    obj_date = DateTime(
        Regexp(
            CleanText(
                '//div[@id="diffusion-info"]/h1|//div[@id="diffusion-info"]/div/div/*[1]',
                replace=[(u'à', u''), (u' ', u' ')]),
            r'.+(\d{2}-\d{2}-\d{2}.+\d{1,2}h\d{1,2}).+'),
        dayfirst=True)
class item(ItemElement):
    """Fill a ``BaseVideo`` from one listing entry marked "En replay"."""

    klass = BaseVideo

    def condition(self):
        """Keep only entries whose "autre-emission-c3" div reads "En replay"."""
        return CleanText('div[@class="autre-emission-c3"]')(self) == "En replay"

    # The id is the part of the link between the last comma and ".html".
    obj_id = Regexp(Link('.'), '^/videos/.+,(.+).html$')
    obj_title = CleanText('//meta[@name="programme_titre"]/@content')
    # "DD-MM ... HH:MM" taken from the c2/c4 divs, parsed day-first.  The
    # pattern is a raw string because "\d" is a regex escape.
    # NOTE(review): as written, the second replace pair swaps a space for a
    # space (a no-op); the first element was presumably a double or
    # non-breaking space - verify against upstream.
    obj_date = DateTime(
        Regexp(
            CleanText(
                './div[@class="autre-emission-c2"]|./div[@class="autre-emission-c4"]',
                replace=[(u'à', u''), (u' ', u' ')]),
            r'(\d{2}-\d{2}.+\d{1,2}:\d{1,2})'),
        dayfirst=True)
class item(ItemElement):
    """Build a ``Detail`` from the cells of a row, skipping the header row."""

    klass = Detail

    def condition(self):
        """Accept rows whose first cell has text other than "Date"."""
        first_cell_text = self.el.xpath('td[1]')[0].text
        if first_cell_text is None:
            return False
        return first_cell_text != "Date"

    obj_id = None
    obj_datetime = DateTime(CleanText('td[1]', symbols=u'à'), dayfirst=True)
    obj_label = Format(u'%s %s %s',
                       CleanText('td[2]'),
                       CleanText('td[3]'),
                       CleanText('td[4]'))
    obj_price = CleanDecimal('td[5]', default=Decimal(0), replace_dots=True)
class get_video(ItemElement):
    """Fill video fields from the "description" article and the player div."""

    obj_title = CleanText('//article[@id="description"]//h1')
    obj_description = CleanText('//article[@id="description"]//section/following-sibling::div')
    # "DD.MM.YY à HHhMM" is rewritten to "20YY/MM/DD HH:MM" before parsing.
    obj_date = DateTime(Regexp(
        CleanText('//article[@id="description"]//span[contains(text(),"diffusé le")]'),
        r'(\d{2})\.(\d{2})\.(\d{2}) à (\d{2})h(\d{2})',
        r'20\3/\2/\1 \4:\5'))
    # The pipe is escaped so it matches the literal "|" separator; unescaped,
    # it is a regex alternation whose empty left branch matches first, and the
    # minutes group is then never captured.
    obj_duration = Eval(parse_duration,
                        Regexp(CleanText('//div[span[text()="|"]]'),
                               r'\| (\d+)min'))
    obj_thumbnail = Eval(Thumbnail,
                         Format('https:%s',
                                Attr('//div[@id="playerPlaceholder"]//img', 'data-src')))
    obj__number = Attr('//div[@id="player"]', 'data-main-video')
    obj_license = LICENSES.COPYRIGHT
class get_thread(ItemElement):
    """Fill a ``Thread`` from the permalink tweet container of the page."""

    klass = Thread

    # The id is "<user>#<_id>", both taken from the environment.
    obj_id = Format('%s#%s', Env('user'), Env('_id'))
    # Title: the author link text, then the tweet paragraph text.
    obj_title = Format(
        '%s \n\t %s',
        CleanText('//div[@class="permalink-inner permalink-tweet-container"]/div/div/div/a',
                  replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
        CleanText('//div[@class="permalink-inner permalink-tweet-container"]/div/div/p',
                  replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
    # The "H:M ... - <date ending in a year>" text is reordered to
    # "<date> <time>".  The pattern is a raw string because "\d" is a regex
    # escape, not a string escape.
    obj_date = DateTime(
        Regexp(
            CleanText('//div[@class="permalink-inner permalink-tweet-container"]/div/div/div[@class="client-and-actions"]/span/span'),
            r'(\d+:\d+).+- (.+\d{4})',
            '\\2 \\1'),
        translations=DATE_TRANSLATE_FR)
class SeLogerItem(ItemElement):
    """Item producing a ``Housing``; each field is read from a relative path."""

    klass = Housing

    obj_id = CleanText('idAnnonce')
    obj_title = CleanText('titre')
    # Parsed by DateTime from the cleaned text at 'dtFraicheur'.
    obj_date = DateTime(CleanText('dtFraicheur'))
    obj_cost = CleanDecimal('prix')
    # The currency is kept as the cleaned text at 'prixUnite'.
    obj_currency = CleanText('prixUnite')
    obj_area = CleanDecimal('surface')
    obj_text = CleanText('descriptif')
    obj_location = CleanText('ville')
    # NotAvailable is the declared default for the station.
    obj_station = CleanText('proximite', default=NotAvailable)
    obj_url = CleanText('permaLien')
def obj_date(self):
    """Return the date read from the item's "item_supp" paragraph.

    "Aujourd'hui" and "Hier" are first replaced by today's and yesterday's
    dates.  Returns NotAvailable when the resulting text is empty; otherwise
    the DATE_TRANSLATE_FR substitutions are applied and the result is parsed
    by ``DateTime`` with a ``LinearDateGuesser``.
    """
    relative_days = [
        ('Aujourd\'hui', str(date.today())),
        ('Hier', str((date.today() - timedelta(1)))),
    ]
    raw_date = CleanText(
        './section[@class="item_infos"]/aside/p[@class="item_supp"]/text()',
        replace=relative_days)(self)
    if not raw_date:
        return NotAvailable

    for french, english in DATE_TRANSLATE_FR:
        raw_date = french.sub(english, raw_date)

    # The DateTime filter reads its input through Env('tmp').
    self.env['tmp'] = raw_date
    date_filter = DateTime(Env('tmp'), LinearDateGuesser())
    return date_filter(self)
def create_video(metadata):
    """Build a ``RmllVideo`` from a metadata mapping.

    Reads the ``oid``, ``title``, ``creation``, ``duration`` and ``thumb``
    keys; the video URL is left as NotLoaded.
    """
    video = RmllVideo(metadata['oid'])
    video.title = unicode(metadata['title'])

    date_filter = DateTime(Dict('creation'), default=NotLoaded)
    video.date = date_filter(metadata)

    duration_filter = RmllDuration(Dict('duration', default=''), default=NotLoaded)
    video.duration = duration_filter(metadata)

    # The thumbnail's URL is set from the id it was constructed with.
    video.thumbnail = Thumbnail(NormalizeThumbnail(Dict('thumb'))(metadata))
    video.thumbnail.url = video.thumbnail.id

    video.url = NotLoaded
    return video
class item(ItemElement):
    """Item mapping the current element to a ``BaseJobAdvert``."""

    klass = BaseJobAdvert

    # The id is the element's "data-jobid" attribute.
    obj_id = CleanText('./@data-jobid')
    # Every other field declares NotAvailable as its default.
    obj_society_name = CleanText('./div/div/div[@class="company"]', default=NotAvailable)
    obj_title = CleanText('./div/div/header/h2[@class="title"]/a', default=NotAvailable)
    # Parsed from the "datetime" attribute of the <time> tag in the "meta" div.
    obj_publication_date = DateTime(
        CleanText('./div/div[has-class("meta")]/time/@datetime'),
        default=NotAvailable)
    obj_place = CleanText('./div/div/div[@class="location"]', default=NotAvailable)
class item(ItemElement):
    """Build a ``RazibusCalendarEvent`` and filter it by city, category, date."""

    klass = RazibusCalendarEvent

    def validate(self, obj):
        """Accept *obj* only if it passes both the event and period checks."""
        if not self.is_valid_event(obj, self.env['city'], self.env['categories']):
            return False
        return self.is_event_in_valid_period(
            obj.start_date, self.env['date_from'], self.env['date_to'])

    def is_valid_event(self, event, city, categories):
        """Return False on a city or category mismatch, True otherwise.

        An empty *city* or empty *categories* disables the matching check;
        cities are compared case-insensitively.
        """
        if city and city.upper() != event.city.upper():
            return False
        if categories and event.category not in categories:
            return False
        return True

    def is_event_in_valid_period(self, event_date, date_from, date_to):
        """Return True when *event_date* is within [date_from, date_to].

        A falsy *date_to* means there is no upper bound.
        """
        if not event_date >= date_from:
            return False
        if not date_to:
            return True
        return event_date <= date_to

    obj_id = Regexp(Link('./p/strong/a[@itemprop="url"]'),
                    'http://razibus.net/(.*).html')
    obj_summary = CleanText('./p/strong/a[@itemprop="url"]')
    obj_start_date = DateTime(
        CleanText('./p/span[@itemprop="startDate"]/@content'))
    # The end date combines the start date with the value given by EndTime.
    obj_end_date = CombineDate(
        DateTime(CleanText('./p/span[@itemprop="startDate"]/@content')),
        EndTime('.'))
    obj_location = CleanText('./p/span[@itemprop="location"]/@content')
    obj_city = CleanText('./p/span[@itemprop="location"]')
def _get_coef_value(self, AM=True, jour=0):
    """Return a ``GaugeMeasure`` for the coefficient of day *jour*.

    With ``AM`` true, the first <b> of cells 1 and 3 of the day's row is
    read; otherwise the second <b> is read, and only when cell 1 holds more
    than one.  Returns None when either the time or the value is falsy.
    """
    tide_time, coefficient = None, None
    if AM:
        tide_time = DateTime(
            CleanText('//tr[@id="MareeJours_%s"]/td[1]/b[1]' % jour))(self)
        coefficient = CleanText(
            '//tr[@id="MareeJours_%s"]/td[3]/b[1]' % jour)(self)
    elif len(XPath('//tr[@id="MareeJours_%s"]/td[1]/b' % jour)(self)) > 1:
        tide_time = DateTime(
            CleanText('//tr[@id="MareeJours_%s"]/td[1]/b[2]' % jour))(self)
        coefficient = CleanText(
            '//tr[@id="MareeJours_%s"]/td[3]/b[2]' % jour)(self)

    if not (tide_time and coefficient):
        return None

    measure = GaugeMeasure()
    measure.level = float(coefficient)
    # The parsed time is shifted forward by the day offset.
    measure.date = tide_time + timedelta(days=jour)
    return measure
class fill_paste(ItemElement):
    """Fill a ``PastealaconPaste`` from the paste page."""

    klass = PastealaconPaste

    obj_id = Env('id')
    # The <h3> header reads "Posted by <title> on <date> (".
    obj_title = Regexp(CleanText('id("content")/h3'),
                       r'Posted by (.+) on .+ \(')
    obj__date = DateTime(
        Regexp(CleanText('id("content")/h3'), r'Posted by .+ on (.+) \('))
    obj_contents = RawText('//textarea[@id="code"]')

    def parse(self, el):
        """Raise PasteNotFound unless exactly one syntax listing is present."""
        # there is no 404, try to detect if there really is a content
        listings = el.xpath('id("content")/div[@class="syntax"]//ol')
        if len(listings) == 1:
            return
        raise PasteNotFound()
class SeLogerItem(ItemElement):
    """Item producing a ``Housing``; each field is read from a relative path."""

    klass = Housing

    obj_id = CleanText('idAnnonce')

    def obj_type(self):
        """Return the TYPES key whose value equals 'idTypeTransaction'.

        FURNISHED_RENT is reported as RENT.
        """
        transaction_code = int(CleanText('idTypeTransaction')(self))
        post_type = next(key for key, code in TYPES.items()
                         if code == transaction_code)
        # SeLoger does not let us discriminate between furnished and not
        # furnished.
        if post_type == POSTS_TYPES.FURNISHED_RENT:
            return POSTS_TYPES.RENT
        return post_type

    def obj_house_type(self):
        """Return the RET key matching 'idTypeBien', else NotAvailable."""
        bien_code = CleanText('idTypeBien')(self)
        matching_keys = (key for key, code in RET.items() if code == bien_code)
        return next(matching_keys, NotAvailable)

    obj_title = Format(
        "%s %s%s - %s",
        CleanText('titre'),
        CleanText('surface'),
        CleanText('surfaceUnite'),
        CleanText('ville'),
    )
    obj_date = DateTime(CleanText('dtFraicheur'))
    obj_cost = CleanDecimal('prix')
    obj_currency = Currency('prixUnite')
    obj_area = CleanDecimal('surface', default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_text = CleanText('descriptif')
    obj_rooms = CleanDecimal('nbPiece|nbPieces', default=NotAvailable)
    obj_bedrooms = CleanDecimal('nbChambre|nbChambres', default=NotAvailable)

    def obj_location(self):
        """Return "<address or district> <city> (<postcode>)".

        The district ('quartier') is used only when the address is empty and
        a district is present.
        """
        address = CleanText('adresse', default="")(self)
        district = CleanText('quartier', default=None)(self)
        city = CleanText('ville')(self)
        postcode = CleanText('cp')(self)

        where = address
        if not address and district is not None:
            where = district
        return u'%s %s (%s)' % (where, city, postcode)

    obj_station = CleanText('proximite', default=NotAvailable)
    obj_url = CleanText('permaLien')
def _get_high_tide_value(self, AM=True, jour=0):
    """Return a ``GaugeMeasure`` for the high tide of day *jour*.

    With ``AM`` true, the first <b> of cells 1 (time) and 2 (value) of the
    day's row is read; otherwise the second <b> is read, and only when cell 1
    holds more than one.  Returns None when either the time or the value is
    falsy.

    The value is read from the row of day *jour*, the same row as the time;
    it was previously always read from the ``MareeJours_0`` row.
    """
    if AM:
        time = DateTime(
            CleanText('//tr[@id="MareeJours_%s"]/td[1]/b[1]' % jour))(self)
        value = CleanDecimal('//tr[@id="MareeJours_%s"]/td[2]/b[1]' % jour,
                             replace_dots=True)(self)
    else:
        time, value = None, None
        if len(XPath('//tr[@id="MareeJours_%s"]/td[1]/b' % jour)(self)) > 1:
            time = DateTime(
                CleanText('//tr[@id="MareeJours_%s"]/td[1]/b[2]' % jour),
                default=None)(self)
            value = CleanDecimal('//tr[@id="MareeJours_%s"]/td[2]/b[2]' % jour,
                                 replace_dots=True,
                                 default=None)(self)

    if time and value:
        measure = GaugeMeasure()
        measure.level = float(value)
        # The parsed time is shifted forward by the day offset.
        measure.date = time + timedelta(days=jour)
        return measure
    return None
class get_current(ItemElement):
    """Fill a ``Current`` from the "vt1" entries of the document."""

    klass = Current

    obj_date = DateTime(Dict('vt1currentdatetime/dateTime'))
    obj_id = Env('city_id')
    # One-line summary built from five "vt1observation" entries.
    obj_text = Format(
        '%shPa (%s) - humidity %s%% - feels like %s°C - %s',
        Dict('vt1observation/altimeter'),
        Dict('vt1observation/barometerTrend'),
        Dict('vt1observation/humidity'),
        Dict('vt1observation/feelsLike'),
        Dict('vt1observation/phrase'))

    def obj_temp(self):
        """Return the observed temperature as a ``Temperature`` in 'C'."""
        degrees = float(Dict('vt1observation/temperature')(self))
        return Temperature(degrees, 'C')
class item(ItemElement):
    """Build a ``GaugeMeasure`` from the current element."""

    klass = GaugeMeasure

    # Raw string: "\d" is a regex escape, not a string escape.
    verif = re.compile(r"\d\d.\d\d.\d+ \d\d:\d\d")

    # "D.M.Y H:M" in the element text is reordered to "Y-M-D H:M" for parsing.
    obj_date = DateTime(
        Regexp(CleanText('.'),
               r'(\d+)\.(\d+)\.(\d+) (\d+):(\d+)',
               r'\3-\2-\1 \4:\5'))

    sensor_types = [u'Level', u'Flow']

    def obj_level(self):
        """Return the float value of the child matching the current sensor.

        The child index is the position of the sensor name in
        ``sensor_types`` plus one.  Returns NotAvailable when the child's
        text is not a valid float.
        """
        index = self.sensor_types.index(self.env['sensor'].name) + 1
        try:
            return float(self.el[index].text_content())
        except ValueError:
            return NotAvailable
class fill_paste(ItemElement):
    """Fill a ``PastebinPaste`` from the paste page."""

    klass = PastebinPaste

    def parse(self, el):
        """Store the "paste_box_info" element in the env as ``header``."""
        header_path = '//div[@id="content_left"]//div[@class="paste_box_info"]'
        self.env['header'] = el.find(header_path)

    obj_id = Env('id')
    # Title, visibility and date are all evaluated against the stored header.
    obj_title = Base(Env('header'),
                     CleanText('.//div[@class="paste_box_line1"]//h1'))
    obj_contents = RawText('//textarea[@id="paste_code"]')
    obj_public = Base(
        Env('header'),
        CleanVisibility(
            Attr('.//div[@class="paste_box_line1"]//img', 'title')))
    obj__date = Base(
        Env('header'),
        DateTime(Attr('.//div[@class="paste_box_line2"]/span[1]', 'title')))