class LyricsdotcomBrowser(PagesBrowser):
    """Browser for http://www.lyrics.com (search by song or by artist)."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://www.lyrics.com'
    # Raw strings: the patterns are regular expressions and contain
    # backslash escapes (\?, \d) that are not valid Python string escapes.
    search = URL(r'/serp.php\?st=(?P<pattern>.*)&qtype=(?P<criteria>1|2)',
                 SearchPage)
    songLyrics = URL(r'/lyric/(?P<id>\d*)', LyricsPage)
    artistsong = URL(r'/artist/(?P<id>.*)', ArtistPages)

    def iter_lyrics(self, criteria, pattern):
        """Yield the songs found for ``pattern``.

        :param criteria: ``'song'`` searches songs directly (qtype=1);
            ``'artist'`` searches artists (qtype=2) and then yields the
            songs listed on each artist page. Any other value yields
            nothing.
        :param pattern: search text substituted into the search URL.
        """
        if criteria == 'song':
            self.search.go(pattern=pattern, criteria=1)
            assert self.search.is_here()
            for song in self.page.iter_lyrics():
                yield song

        elif criteria == 'artist':
            self.search.go(pattern=pattern, criteria=2)
            assert self.search.is_here()
            for artist in self.page.iter_artists():
                artist_page = self.artistsong.go(id=artist.id)
                for song in artist_page.iter_lyrics():
                    yield song

    def get_lyrics(self, id):
        """Open the lyrics page for ``id`` and return its ``get_lyrics()``."""
        return self.songLyrics.go(id=id).get_lyrics()
class ParolesmaniaBrowser(PagesBrowser):
    """Browser for http://www.parolesmania.com/."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://www.parolesmania.com/'
    # Raw strings: the patterns are regular expressions and contain
    # backslash escapes (\?, \.) that are not valid Python string escapes.
    searchSong = URL(r'recherche.php\?c=title&k=(?P<pattern>[^/]*).*',
                     SearchSongPage)
    searchArtist = URL(r'recherche.php\?c=artist&k=(?P<pattern>[^/]*).*',
                       SearchArtistPage)
    songLyrics = URL(
        r'paroles_(?P<artistid>[^/]*)/paroles_(?P<songid>[^/]*)\.html',
        LyricsPage)
    artistSongs = URL(r'paroles_(?P<artistid>[^/]*)\.html', ArtistSongsPage)

    def iter_lyrics(self, criteria, pattern):
        """Return an iterable of songs matching ``pattern``.

        :param criteria: ``'artist'`` or ``'song'``; any other value
            returns ``None``.
        :param pattern: search text substituted into the search URL.
        """
        if criteria == 'artist':
            artist_ids = self.searchArtist.go(pattern=pattern).get_artist_ids()
            songs = []
            # Only the first 3 artists are visited, to limit page loads.
            for artist_id in artist_ids[:3]:
                artist_page = self.artistSongs.go(artistid=artist_id)
                songs = itertools.chain(songs, artist_page.iter_lyrics())
            return songs
        elif criteria == 'song':
            return self.searchSong.go(pattern=pattern).iter_lyrics()

    def get_lyrics(self, id):
        """Return the lyrics for ``id``, or ``None`` on an HTTP not-found.

        :param id: ``'<artistid>|<songid>'``; the two parts are split on
            ``'|'`` and substituted into the lyrics URL.
        """
        ids = id.split('|')
        try:
            self.songLyrics.go(artistid=ids[0], songid=ids[1])
            return self.page.get_lyrics()
        except BrowserHTTPNotFound:
            return None
class LyricsplanetBrowser(PagesBrowser):
    """Browser for http://www.lyricsplanet.com/."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://www.lyricsplanet.com/'
    # Raw strings: the patterns are regular expressions and contain
    # backslash escapes (\., \?) that are not valid Python string escapes.
    home = URL(r'$', HomePage)
    search = URL(r'search\.php$', SearchPage)
    artist = URL(
        r'search\.php\?field=artisttitle&value=(?P<artistid>[^/]*)$',
        ArtistPage)
    lyrics = URL(r'lyrics\.php\?id=(?P<songid>[^/]*)$', LyricsPage)

    def iter_lyrics(self, criteria, pattern):
        """Run a search from the home page and return an iterable of songs.

        :param criteria: ``'song'`` or ``'artist'``; any other value
            returns ``None`` after the search has been submitted.
        :param pattern: search text passed to the home page's
            ``search_lyrics``.
        """
        self.home.stay_or_go()
        assert self.home.is_here()
        self.page.search_lyrics(criteria, pattern)
        assert self.search.is_here()

        if criteria == 'song':
            return self.page.iter_song_lyrics()
        elif criteria == 'artist':
            artist_ids = self.page.get_artist_ids()
            songs = []
            # Only the first 3 artists are visited, to limit page loads.
            for artist_id in artist_ids[:3]:
                artist_page = self.artist.go(artistid=artist_id)
                songs = itertools.chain(songs, artist_page.iter_lyrics())
            return songs

    def get_lyrics(self, id):
        """Return the lyrics for song ``id``, or ``None`` on HTTP not-found."""
        try:
            self.lyrics.go(songid=id)
            return self.page.get_lyrics()
        except BrowserHTTPNotFound:
            return None
class IpinfodbBrowser(PagesBrowser):
    """Browser for https://ipinfodb.com/ used to locate an IP address."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'https://ipinfodb.com/'
    home = URL('$', LocationPage)

    def get_location(self, ipaddr):
        """Send ``ipaddr`` as the ``ip`` data field and return the location.

        The value returned is whatever the resulting page's
        ``get_location()`` produces.
        """
        payload = {'ip': ipaddr}
        self.home.go(data=payload)
        location = self.page.get_location()
        return location
class EnsapBrowser(LoginBrowser):
    """Login browser for https://ensap.gouv.fr (documents per year)."""

    BASEURL = 'https://ensap.gouv.fr'
    PROFILE = Firefox()

    loginp = URL(r'/web/views/contenus/accueilnonconnecte.html', LoginPage)
    loginvalidity = URL(r'/authentification', LoginValidityPage)
    authp = URL(r'/prive/initialiserhabilitation/v1', LoginControlPage)
    homep = URL(r'/prive/accueilconnecte/v1', HomePage)
    # Raw string: \d is a regex escape, not a valid Python string escape.
    documents = URL(r'/prive/remuneration/v1/(?P<year>\d+)', DocumentsPage)
    listyears = URL(r'/prive/listeanneeremuneration/v1', ListYear)

    # Set to True once do_login() has completed successfully.
    logged = False
    # Last known XSRF token, sent back in the X-XSRF-TOKEN header.
    token = None

    def do_login(self):
        """Authenticate with ``self.username`` / ``self.password``.

        Returns ``True`` immediately when already logged in; otherwise
        performs the login sequence and returns ``None``.

        :raises BrowserIncorrectPassword: when the validity page reports
            that the login failed.
        """
        self.logger.debug('call Browser.do_login')
        if self.logged:
            return True

        self.loginp.stay_or_go()
        self.loginvalidity.go(data={
            "identifiant": self.username,
            "secret": self.password
        })
        if not self.page.check_logged():
            raise BrowserIncorrectPassword()

        self.authp.go(data="{}",
                      headers={'Content-Type': 'application/json'})
        self.token = self.page.get_xsrf()
        self.logged = True

    @need_login
    def iter_documents(self, subscription):
        """Yield the documents of every available year, newest list first.

        ``subscription`` is accepted but not used by this method.
        """
        self.listyears.go()
        years = self.page.get_years()
        # use reverse order of list to get recent documents first
        for year in years[::-1]:
            self.documents.stay_or_go(
                year=year, headers={"X-XSRF-TOKEN": self.token})
            # The token is re-read from the cookies after each request.
            self.token = self.session.cookies.get("XSRF-TOKEN")
            for doc in self.page.iter_documents():
                yield doc

    @need_login
    def iter_subscription(self):
        """Return the subscriptions listed on the connected home page."""
        self.homep.stay_or_go(headers={"X-XSRF-TOKEN": self.token})
        self.token = self.session.cookies.get("XSRF-TOKEN")
        return self.page.iter_subscription()

    @need_login
    def get_document(self, id):
        """Return the document whose id is ``id``.

        The lookup goes through ``find_object`` with a
        ``DocumentNotFound`` error object.
        """
        return find_object(self.iter_documents(None), id=id,
                           error=DocumentNotFound())
class ColissimoBrowser(PagesBrowser):
    """Browser for parcel tracking on http://www.colissimo.fr."""

    BASEURL = 'http://www.colissimo.fr'
    PROFILE = Firefox()

    # Raw string: \? is a regex escape, not a valid Python string escape.
    tracking_url = URL(
        r'/portail_colissimo/suivre.do\?colispart=(?P<_id>.*)',
        TrackingPage)

    def get_tracking_info(self, _id):
        """Return the list of tracking events for parcel ``_id``.

        :raises ParcelNotFound: when the tracking page yields no event;
            the message includes the error text reported by the page.
        """
        self.tracking_url.stay_or_go(_id=_id)
        events = list(self.page.iter_infos())
        if not events:
            error = self.page.get_error()
            raise ParcelNotFound(u"Parcel not found: {}".format(error))
        return events
class ColissimoBrowser(PagesBrowser):
    """Browser for parcel tracking on https://www.laposte.fr."""

    BASEURL = 'https://www.laposte.fr'
    PROFILE = Firefox()

    # Raw string: \? is a regex escape, not a valid Python string escape.
    tracking_url = URL(
        r'/particulier/outils/suivre-vos-envois\?code=(?P<_id>.*)',
        TrackingPage)

    def get_tracking_info(self, _id):
        """Return the list of tracking events for parcel ``_id``.

        :raises ParcelNotFound: when the tracking page yields no event;
            the message includes the error text reported by the page.
        """
        self.tracking_url.stay_or_go(_id=_id)
        events = list(self.page.iter_infos())
        if not events:
            error = self.page.get_error()
            raise ParcelNotFound(u"Parcel not found: {}".format(error))
        return events
class HybrideBrowser(PagesBrowser):
    """Browser for the programme pages of https://www.lhybride.org/."""

    PROFILE = Firefox()
    BASEURL = 'https://www.lhybride.org/'

    program_page = URL('programmation/a-venir.html', ProgramPage)
    event_page = URL('programmation/item/(?P<_id>.*)', EventPage)

    def list_events(self, date_from, date_to=None, city=None, categories=None):
        """Open the programme page and return its ``list_events`` result.

        All four arguments are forwarded unchanged, as keyword arguments,
        to the page.
        """
        programme = self.program_page.go()
        return programme.list_events(
            date_from=date_from,
            date_to=date_to,
            city=city,
            categories=categories,
        )

    def get_event(self, _id, event=None):
        """Open the page of event ``_id`` and return its ``get_event`` result.

        ``event`` is handed to the page as ``obj``.
        """
        page = self.event_page.go(_id=_id)
        return page.get_event(obj=event)
class IpinfodbBrowser(PagesBrowser):
    """Browser for https://ipinfodb.com/ that searches from the home page."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'https://ipinfodb.com/'
    home = URL('$', HomePage)
    search = URL('ip_locator.php', LocationPage)

    def get_location(self, ipaddr):
        """Search for ``ipaddr`` and return the resulting location.

        Returns ``None`` when ``BrowserHTTPNotFound`` is raised at any
        point of the lookup.
        """
        try:
            self.home.go()
            # Submitting the search is expected to change self.page.
            self.page.search(ipaddr)
            return self.page.get_location()
        except BrowserHTTPNotFound:
            return None
class EnsapBrowser(LoginBrowser):
    """Login browser for https://ensap.gouv.fr (single documents URL)."""

    BASEURL = 'https://ensap.gouv.fr'
    PROFILE = Firefox()

    loginp = URL('/web/views/contenus/accueilnonconnecte.html', LoginPage)
    loginvalidity = URL('/authentification', LoginValidityPage)
    authp = URL('/prive/initialiserhabilitation/v1', LoginControlPage)
    homep = URL('/prive/accueilconnecte/v1', HomePage)
    documents = URL('/prive/remuneration/v1', DocumentsPage)

    # Becomes True after a successful do_login().
    logged = False
    # Last known XSRF token, sent back in the X-XSRF-TOKEN header.
    token = None

    def do_login(self):
        """Log in with ``self.username`` / ``self.password``.

        Returns ``True`` straight away if a login already succeeded,
        ``None`` after performing one.

        :raises BrowserIncorrectPassword: when the page reached after
            posting the credentials reports a failed login.
        """
        self.logger.debug('call Browser.do_login')
        if self.logged:
            return True

        self.loginp.stay_or_go()
        credentials = {
            "identifiant": self.username,
            "secret": self.password
        }
        self.loginvalidity.go(data=credentials)
        if not self.page.check_logged():
            raise BrowserIncorrectPassword()

        self.authp.go(data={"": ""})
        self.token = self.page.get_xsrf()
        self.logged = True

    @need_login
    def iter_documents(self, subscription):
        """Return the documents listed on the documents page.

        ``subscription`` is accepted but not used by this method.
        """
        xsrf_headers = {"X-XSRF-TOKEN": self.token}
        self.documents.stay_or_go(headers=xsrf_headers)
        # Pick up the token from the cookies for the next request.
        self.token = self.session.cookies.get("XSRF-TOKEN")
        return self.page.iter_documents()

    @need_login
    def iter_subscription(self):
        """Return the subscriptions listed on the connected home page."""
        xsrf_headers = {"X-XSRF-TOKEN": self.token}
        self.homep.stay_or_go(headers=xsrf_headers)
        self.token = self.session.cookies.get("XSRF-TOKEN")
        return self.page.iter_subscription()

    @need_login
    def get_document(self, id):
        """Return the document whose id is ``id``.

        The lookup goes through ``find_object`` with a
        ``DocumentNotFound`` error object.
        """
        all_documents = self.iter_documents(None)
        return find_object(all_documents, id=id, error=DocumentNotFound())
class CpasbienBrowser(PagesBrowser):
    """Browser for torrent search on http://www.cpasbien.cm/."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://www.cpasbien.cm/'
    # Raw strings: \. is a regex escape, not a valid Python string escape.
    search = URL(r'recherche/(?P<pattern>.*).html,trie-seeds-d', SearchPage)
    torrent = URL(r'dl-torrent/(?P<id>.*)\.html', TorrentPage)

    def iter_torrents(self, pattern):
        """Search for ``pattern`` and return the page's torrent iterable."""
        self.search.go(pattern=pattern)
        return self.page.iter_torrents()

    def get_torrent(self, fullid):
        """Return the torrent ``fullid``, or ``None`` on HTTP not-found."""
        try:
            self.torrent.go(id=fullid)
            return self.page.get_torrent()
        except BrowserHTTPNotFound:
            return None
class ParolesmusiqueBrowser(PagesBrowser):
    """Browser for http://www.paroles-musique.com/."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://www.paroles-musique.com/'
    home = URL('$', HomePage)
    songResults = URL('lyrics-paroles-0-.*,0.php', SongResultsPage)
    artistResults = URL('lyrics-paroles-.*-0,0.php', ArtistResultsPage)
    songLyrics = URL('paroles-(?P<songid>.*,p[0-9]*)', SonglyricsPage)
    artistSongs = URL('paroles-(?P<artistid>.*,a[0-9]*)', ArtistSongsPage)

    def iter_lyrics(self, criteria, pattern):
        """Search from the home page and return an iterable of songs.

        ``criteria`` is ``'song'`` or ``'artist'``; any other value gives
        ``None`` once the search has been submitted.
        """
        self.home.stay_or_go()
        assert self.home.is_here()
        self.page.search_lyrics(criteria, pattern)

        if criteria == 'song':
            assert self.songResults.is_here()
            return self.page.iter_lyrics()

        if criteria != 'artist':
            return None

        assert self.artistResults.is_here()
        found_artists = self.page.get_artist_ids()
        combined = []
        # Stop after the third artist so that few pages are loaded.
        for artist_id in found_artists[:3]:
            artist_page = self.artistSongs.go(artistid=artist_id)
            combined = itertools.chain(combined, artist_page.iter_lyrics())
        return combined

    def get_lyrics(self, id):
        """Return the lyrics of song ``id``; ``None`` on HTTP not-found."""
        try:
            self.songLyrics.go(songid=id)
            return self.page.get_lyrics()
        except BrowserHTTPNotFound:
            return None
class KickassBrowser(PagesBrowser):
    """Browser for torrent search on https://kat.cr/."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'https://kat.cr/'
    # Raw strings: \? and \. are regex escapes, not valid Python string
    # escapes.
    search = URL(r'usearch/(?P<pattern>.*)/\?field=seeders&sorder=desc',
                 SearchPage)
    torrent = URL(r'torrent-t(?P<id>.*).html',
                  r'.*-t[0-9]*\.html',
                  TorrentPage)

    def iter_torrents(self, pattern):
        """Search for ``pattern`` and return the page's torrent iterable."""
        self.search.go(pattern=pattern)
        return self.page.iter_torrents()

    def get_torrent(self, fullid):
        """Return the torrent ``fullid``, or ``None`` on HTTP not-found."""
        try:
            self.torrent.go(id=fullid)
            return self.page.get_torrent()
        except BrowserHTTPNotFound:
            return None
class LyricsmodeBrowser(PagesBrowser):
    """Browser for http://www.lyricsmode.com/."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://www.lyricsmode.com/'
    # Raw strings: \. and \? are regex escapes, not valid Python string
    # escapes.
    search = URL(r'search\.php\?search=(?P<pattern>[^&/]*)$', SearchPage)
    songLyrics = URL(
        r'lyrics/(?P<letterid>[^/]*)/(?P<artistid>[^/]*)/(?P<songid>[^/]*)\.html$',
        LyricsPage)

    def iter_lyrics(self, criteria, pattern):
        """Search for ``pattern`` and return the page's song iterable.

        ``criteria`` is accepted but not used: the same search URL is
        requested whatever its value.
        """
        return self.search.go(pattern=pattern).iter_lyrics()

    def get_lyrics(self, id):
        """Return the lyrics for ``id``, or ``None`` on HTTP not-found.

        :param id: ``'<letterid>|<artistid>|<songid>'``; the three parts
            are split on ``'|'`` and substituted into the lyrics URL.
        """
        subid = id.split('|')
        try:
            self.songLyrics.go(letterid=subid[0],
                               artistid=subid[1],
                               songid=subid[2])
            return self.page.get_lyrics()
        except BrowserHTTPNotFound:
            return None
class LyricsdotcomBrowser(PagesBrowser):
    """Browser for http://www.lyrics.com/ (single keyword search)."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://www.lyrics.com/'
    # Raw strings: \. and \? are regex escapes, not valid Python string
    # escapes.
    search = URL(
        r'search\.php\?keyword=(?P<pattern>[^&]*)&what=all&search_btn=Search',
        SearchPage)
    songLyrics = URL(r'(?P<id>[^/]*-lyrics-[^/]*)\.html$', LyricsPage)

    def iter_lyrics(self, criteria, pattern):
        """Search for ``pattern`` and return the page's song iterable.

        ``criteria`` is accepted but not used by this method.
        """
        self.search.go(pattern=pattern)
        assert self.search.is_here()
        return self.page.iter_lyrics()

    def get_lyrics(self, id):
        """Return the lyrics for ``id``, or ``None`` on HTTP not-found.

        Only the part of ``id`` before the first ``'|'`` is used to build
        the lyrics URL.
        """
        real_id = id.split('|')[0]
        try:
            self.songLyrics.go(id=real_id)
            return self.page.get_lyrics()
        except BrowserHTTPNotFound:
            return None
class Paroles2chansonsBrowser(PagesBrowser):
    """Browser for http://paroles2chansons.lemonde.fr/."""

    PROFILE = Firefox()
    TIMEOUT = 30

    BASEURL = 'http://paroles2chansons.lemonde.fr/'
    # Raw strings: \. is a regex escape, not a valid Python string escape.
    home = URL(r'$', HomePage)
    search = URL(r'search', SearchPage)
    artist = URL(r'paroles-(?P<artistid>[^/]*)$', ArtistPage)
    lyrics = URL(
        r'paroles-(?P<artistid>[^/]*)/paroles-(?P<songid>[^/]*)\.html',
        LyricsPage)

    def iter_lyrics(self, criteria, pattern):
        """Run a search from the home page and return an iterable of songs.

        :param criteria: ``'song'`` or ``'artist'``; any other value
            returns ``None`` after the search has been submitted.
        :param pattern: search text passed to the home page's
            ``search_lyrics``.
        """
        self.home.stay_or_go()
        assert self.home.is_here()
        self.page.search_lyrics(pattern)
        assert self.search.is_here()

        if criteria == 'song':
            return self.page.iter_song_lyrics()
        elif criteria == 'artist':
            artist_ids = self.page.get_artist_ids()
            songs = []
            # Only the first 3 artists are visited, to limit page loads.
            for artist_id in artist_ids[:3]:
                artist_page = self.artist.go(artistid=artist_id)
                songs = itertools.chain(songs, artist_page.iter_lyrics())
            return songs

    def get_lyrics(self, id):
        """Return the lyrics for ``id``, or ``None`` on HTTP not-found.

        :param id: ``'<artistid>|<songid>'``; the two parts are split on
            ``'|'`` and substituted into the lyrics URL.
        """
        ids = id.split('|')
        try:
            self.lyrics.go(artistid=ids[0], songid=ids[1])
            return self.page.get_lyrics()
        except BrowserHTTPNotFound:
            return None
def get_resume(self, film_id):
    """Return the resume of ``film_id`` as read from the JSON page.

    Only the last ``'/'``-separated segment of ``film_id`` is used as the
    ``_id`` of the JSON page URL. ``set_json_header()`` is called before
    the request and the Firefox profile is set again afterwards.
    """
    self.set_json_header()
    try:
        _id = film_id.split('/')[-1]
        return self.json_page.go(_id=_id).get_resume()
    finally:
        # Restore the Firefox profile even when the request or the
        # parsing raises, so later requests do not keep the state set
        # by set_json_header().
        self.set_profile(Firefox())
class LogicimmoBrowser(PagesBrowser):
    """Browser for housing search on https://www.logic-immo.com/."""

    BASEURL = 'https://www.logic-immo.com/'
    PROFILE = Firefox()

    # Raw strings: \? is a regex escape, not a valid Python string escape.
    city = URL(
        r'asset/t9/getLocalityT9.php\?site=fr&lang=fr&json=%22(?P<pattern>.*)%22',
        CitiesPage)
    search = URL(
        r'(?P<type>location-immobilier|vente-immobilier|recherche-colocation)-(?P<cities>.*)/options/(?P<options>.*)',
        SearchPage)
    housing = URL(r'detail-(?P<_id>.*).htm', HousingPage)
    phone = URL(r'(?P<urlcontact>.*)', PhonePage)

    # Post type -> value of the "type" group of the search URL.
    TYPES = {
        POSTS_TYPES.RENT: 'location-immobilier',
        POSTS_TYPES.SALE: 'vente-immobilier',
        POSTS_TYPES.SHARING: 'recherche-colocation',
        POSTS_TYPES.FURNISHED_RENT: 'location-immobilier',
        POSTS_TYPES.VIAGER: 'vente-immobilier'
    }

    # House type -> id used in the "groupprptypesids" search option.
    RET = {
        HOUSE_TYPES.HOUSE: '2',
        HOUSE_TYPES.APART: '1',
        HOUSE_TYPES.LAND: '3',
        HOUSE_TYPES.PARKING: '10',
        HOUSE_TYPES.OTHER: '14'
    }

    def __init__(self, *args, **kwargs):
        """Initialise the browser and add the X-Requested-With header."""
        super(LogicimmoBrowser, self).__init__(*args, **kwargs)
        self.session.headers['X-Requested-With'] = 'XMLHttpRequest'

    def get_cities(self, pattern):
        """Return the cities matching ``pattern``.

        Returns ``None`` without any request when ``pattern`` is empty.
        """
        if pattern:
            return self.city.go(pattern=pattern).get_cities()

    def search_housings(self, type, cities, nb_rooms, area_min, area_max,
                        cost_min, cost_max, house_types):
        """Build the search options, run the search and return the results.

        :param type: a key of ``TYPES``.
        :param cities: value substituted into the ``cities`` URL group.
        :param nb_rooms: minimum number of rooms; every count from this
            value up to 6 is listed in the option. Skipped when falsy.
        :param area_min: minimum area, ``'0'`` when falsy.
        :param area_max: maximum area, omitted when falsy.
        :param cost_min: minimum price, ``'0'`` when falsy.
        :param cost_max: maximum price, omitted when falsy.
        :param house_types: iterable of house types; those missing from
            ``RET`` are ignored.
        :raises TypeNotSupported: when ``type`` is not a key of ``TYPES``.
        """
        if type not in self.TYPES:
            raise TypeNotSupported()

        options = []

        if type == POSTS_TYPES.VIAGER:
            # VIAGER searches use the fixed id '15' and ignore house_types.
            ret = ['15']
        else:
            ret = [self.RET[house_type]
                   for house_type in house_types
                   if house_type in self.RET]

        if ret:
            options.append('groupprptypesids=%s' % ','.join(ret))

        if type == POSTS_TYPES.FURNISHED_RENT:
            options.append('searchoptions=4')

        options.append('pricemin=%s' % (cost_min or '0'))
        if cost_max:
            options.append('pricemax=%s' % cost_max)

        options.append('areamin=%s' % (area_min or '0'))
        if area_max:
            options.append('areamax=%s' % area_max)

        if nb_rooms:
            rooms = ','.join(str(i) for i in range(nb_rooms, 7))
            # Shared housings use a different option name for the counts.
            if type == POSTS_TYPES.SHARING:
                options.append('nbbedrooms=%s' % rooms)
            else:
                options.append('nbrooms=%s' % rooms)

        # Membership in TYPES was checked above, so plain indexing is safe.
        self.search.go(type=self.TYPES[type],
                       cities=cities,
                       options='/'.join(options))

        if type == POSTS_TYPES.SHARING:
            return self.page.iter_sharing()

        return self.page.iter_housings(query_type=type)

    def get_housing(self, _id, housing=None):
        """Open the detail page of ``_id`` and return its ``get_housing``.

        ``housing`` is passed to the page as ``obj``.
        """
        return self.housing.go(_id=_id).get_housing(obj=housing)

    def get_phone(self, _id):
        """Return the phone number for housing ``_id``.

        Only ids starting with ``'location'`` or ``'vente'`` are handled;
        for any other id nothing is requested and ``None`` is returned.
        """
        if _id.startswith(('location', 'vente')):
            housing_page = self.housing.stay_or_go(_id=_id)
            urlcontact, params = housing_page.get_phone_url_datas()
            return self.phone.go(urlcontact=urlcontact,
                                 params=params).get_phone()