class LibraryManager():
    def __init__(self, dest_path, platform):
        self.dest_path = dest_path
        self.platform = platform

    def check_update(self):
        need_update = False
        if __settings__.getSetting('plugin_name') != __plugin__:
            __settings__.setSetting('plugin_name', __plugin__)
        for libname in get_libname(self.platform):
            self.libpath = os.path.join(self.dest_path, libname)
            self.sizepath = os.path.join(self.dest_path, libname + '.size.txt')
            size = str(os.path.getsize(self.libpath))
            size_old = open(self.sizepath, "r").read()
            if size_old != size:
                need_update = True
        return need_update

    def update(self):
        if self.check_update():
            for libname in get_libname(self.platform):
                self.libpath = os.path.join(self.dest_path, libname)
                xbmcvfs.delete(self.libpath)
            self.download()

    def download(self):
        xbmcvfs.mkdirs(self.dest_path)
        for libname in get_libname(self.platform):
            dest = os.path.join(self.dest_path, libname)
            log("try to fetch %s" % libname)
            url = "%s/%s/%s.zip" % (__libbaseurl__, self.platform, libname)
            try:
                self.http = HTTP()
                self.http.fetch(url, download=dest + ".zip", progress=True)
                log("%s -> %s" % (url, dest))
                xbmc.executebuiltin('XBMC.Extract("%s.zip","%s")' % (dest, self.dest_path), True)
                xbmcvfs.delete(dest + ".zip")
            except Exception:
                text = 'Failed download %s!' % libname
                xbmc.executebuiltin("XBMC.Notification(%s,%s,%s)" % (__plugin__, text, 750))
        return True
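# Usage sketch, not part of the original module: how the size-marker update
# cycle above is meant to be driven. check_update() compares each lib on disk
# against the "<libname>.size.txt" marker stored next to it in dest_path.
# The path and platform string below are hypothetical examples.
def _example_update_libs():
    libs = LibraryManager('/storage/.kodi/userdata/torrenter/', 'linux_x86')
    if libs.check_update():   # os.path.getsize() vs. the *.size.txt marker
        libs.update()         # delete stale libs, then re-download the zips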
class DownloaderClass():
    def __init__(self, dest_path):
        self.dest_path = dest_path
        self.platform = get_platform()
        tempdir(self.dest_path)

    def tools_download(self):
        for libname in get_libname(self.platform):
            dest = os.path.join(self.dest_path, libname)
            log("try to fetch %s" % libname)
            url = "%s/%s/%s.zip" % (__libbaseurl__, self.platform['system'], libname)
            if libname != 'liblibtorrent.so':
                try:
                    self.http = HTTP()
                    self.http.fetch(url, download=dest + ".zip", progress=True)
                    log("%s -> %s" % (url, dest))
                    xbmc.executebuiltin('XBMC.Extract("%s.zip","%s")' % (dest, self.dest_path), True)
                    xbmcvfs.delete(dest + ".zip")
                except Exception:
                    text = 'Failed download %s!' % libname
                    xbmc.executebuiltin("XBMC.Notification(%s,%s,%s,%s)" % (__plugin__, text, 750, __icon__))
            else:
                # liblibtorrent.so is simply a copy of the libtorrent.so fetched above
                xbmcvfs.copy(os.path.join(self.dest_path, 'libtorrent.so'), dest)
        return True
class LibraryManager():
    def __init__(self, dest_path, platform):
        self.dest_path = dest_path
        self.platform = platform
        self.root = os.path.dirname(os.path.dirname(__file__))

    def check_exist(self):
        for libname in get_libname(self.platform):
            if not xbmcvfs.exists(os.path.join(self.dest_path, libname)):
                return False
        return True

    def check_update(self):
        need_update = False
        for libname in get_libname(self.platform):
            if libname != 'liblibtorrent.so':
                self.libpath = os.path.join(self.dest_path, libname)
                self.sizepath = os.path.join(self.root, self.platform['system'], self.platform['version'],
                                             libname + '.size.txt')
                size = str(os.path.getsize(self.libpath))
                size_old = open(self.sizepath, "r").read()
                if size_old != size:
                    need_update = True
        return need_update

    def update(self):
        if self.check_update():
            for libname in get_libname(self.platform):
                self.libpath = os.path.join(self.dest_path, libname)
                xbmcvfs.delete(self.libpath)
            self.download()

    def download(self):
        xbmcvfs.mkdirs(self.dest_path)
        for libname in get_libname(self.platform):
            dest = os.path.join(self.dest_path, libname)
            log("try to fetch %s" % libname)
            url = "%s/%s/%s/%s.zip" % (__libbaseurl__, self.platform['system'], self.platform['version'], libname)
            if libname != 'liblibtorrent.so':
                try:
                    self.http = HTTP()
                    self.http.fetch(url, download=dest + ".zip", progress=True)
                    log("%s -> %s" % (url, dest))
                    xbmc.executebuiltin('XBMC.Extract("%s.zip","%s")' % (dest, self.dest_path), True)
                    xbmcvfs.delete(dest + ".zip")
                except Exception:
                    text = 'Failed download %s!' % libname
                    xbmc.executebuiltin("XBMC.Notification(%s,%s,%s,%s)" % (__plugin__, text, 750, __icon__))
            else:
                xbmcvfs.copy(os.path.join(self.dest_path, 'libtorrent.so'), dest)
        return True

    def android_workaround(self, new_dest_path):
        for libname in get_libname(self.platform):
            libpath = os.path.join(self.dest_path, libname)
            size = str(os.path.getsize(libpath))
            new_libpath = os.path.join(new_dest_path, libname)
            if not xbmcvfs.exists(new_libpath):
                xbmcvfs.copy(libpath, new_libpath)
                log('Copied %s -> %s' % (libpath, new_libpath))
            else:
                new_size = str(os.path.getsize(new_libpath))
                if size != new_size:
                    xbmcvfs.delete(new_libpath)
                    xbmcvfs.copy(libpath, new_libpath)
                    log('Deleted and copied (%s) %s -> (%s) %s' % (size, libpath, new_size, new_libpath))
        return new_dest_path
class LibraryManager():
    def __init__(self, dest_path, platform):
        self.dest_path = dest_path
        self.platform = platform
        self.root = os.path.dirname(os.path.dirname(__file__))

    def check_exist(self):
        for libname in get_libname(self.platform):
            if not xbmcvfs.exists(os.path.join(self.dest_path, libname)):
                return False
        return True

    def check_update(self):
        need_update = False
        for libname in get_libname(self.platform):
            if libname != 'liblibtorrent.so':
                self.libpath = os.path.join(self.dest_path, libname)
                self.sizepath = os.path.join(self.root, self.platform['system'], self.platform['version'],
                                             libname + '.size.txt')
                size = str(os.path.getsize(self.libpath))
                size_old = open(self.sizepath, "r").read()
                if size_old != size:
                    need_update = True
        return need_update

    def update(self):
        if self.check_update():
            for libname in get_libname(self.platform):
                self.libpath = os.path.join(self.dest_path, libname)
                xbmcvfs.delete(self.libpath)
            self.download()

    def download(self):
        __settings__ = xbmcaddon.Addon(id='plugin.video.alfa')  ### Alfa
        xbmcvfs.mkdirs(self.dest_path)
        for libname in get_libname(self.platform):
            dest = os.path.join(self.dest_path, libname)
            log("try to fetch %s" % libname)
            url = "%s/%s/%s/%s.zip" % (__libbaseurl__, self.platform['system'], self.platform['version'], libname)
            if libname != 'liblibtorrent.so':
                try:
                    self.http = HTTP()
                    self.http.fetch(url, download=dest + ".zip", progress=False)  ### Alfa
                    log("%s -> %s" % (url, dest))
                    xbmc.executebuiltin('XBMC.Extract("%s.zip","%s")' % (dest, self.dest_path), True)
                    xbmcvfs.delete(dest + ".zip")
                except Exception:
                    text = 'Failed download %s!' % libname
                    xbmc.executebuiltin("XBMC.Notification(%s,%s,%s,%s)" % (__plugin__, text, 750, __icon__))
            else:
                xbmcvfs.copy(os.path.join(self.dest_path, 'libtorrent.so'), dest, silent=True)  ### Alfa
            # mirror each lib into the Alfa addon tree as well
            dest_alfa = os.path.join(xbmc.translatePath(__settings__.getAddonInfo('Path')),
                                     'lib', libname)  ### Alfa
            xbmcvfs.copy(dest, dest_alfa, silent=True)  ### Alfa
            dest_alfa = os.path.join(xbmc.translatePath(__settings__.getAddonInfo('Profile')),
                                     'custom_code', 'lib', libname)  ### Alfa
            xbmcvfs.copy(dest, dest_alfa, silent=True)  ### Alfa
        return True

    def android_workaround(self, new_dest_path):  ### Alfa (entire method)
        import subprocess
        for libname in get_libname(self.platform):
            libpath = os.path.join(self.dest_path, libname)
            size = str(os.path.getsize(libpath))
            new_libpath = os.path.join(new_dest_path, libname)

            if xbmcvfs.exists(new_libpath):
                new_size = str(os.path.getsize(new_libpath))
                if size != new_size:
                    xbmcvfs.delete(new_libpath)
                    if xbmcvfs.exists(new_libpath):
                        # plain delete failed: retry as root
                        try:
                            command = ['su', '-c', 'rm', '%s' % new_libpath]
                            p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                            output_cmd, error_cmd = p.communicate()
                            log('ROOT command: %s' % str(command))
                        except Exception:
                            log('No ROOT permissions: %s' % str(command))
                    if not xbmcvfs.exists(new_libpath):
                        log('Deleted: (%s) %s -> (%s) %s' % (size, libpath, new_size, new_libpath))

            if not xbmcvfs.exists(new_libpath):
                xbmcvfs.copy(libpath, new_libpath, silent=True)  ### ALFA
                log('Copying... %s -> %s' % (libpath, new_libpath))
                if not xbmcvfs.exists(new_libpath):
                    # plain copy failed: retry as root, then fix permissions
                    try:
                        command = ['su', '-c', 'cp', '%s' % libpath, '%s' % new_libpath]
                        p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        output_cmd, error_cmd = p.communicate()
                        log('ROOT command: %s' % str(command))
                        command = ['su', '-c', 'chmod', '775', '%s' % new_libpath]
                        p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        output_cmd, error_cmd = p.communicate()
                        log('ROOT command: %s' % str(command))
                    except Exception:
                        log('No ROOT permissions: %s' % str(command))
                    if not xbmcvfs.exists(new_libpath):
                        log('ROOT Copy Failed!')
                else:
                    command = ['chmod', '775', '%s' % new_libpath]
                    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    output_cmd, error_cmd = p.communicate()
                    log('Command: %s' % str(command))
            else:
                log('Module exists. Not copied... %s' % new_libpath)  ### ALFA
        return new_dest_path
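# Usage sketch, not from the original source: on Android the libs have to be
# copied from the addon data dir into a location the loader may exec from,
# falling back to `su` when plain copies fail (as the method above does).
# The target path below is a hypothetical example.
def _example_android_copy(libs):
    exec_dir = xbmc.translatePath('special://xbmcbin/')  # hypothetical target
    return libs.android_workaround(exec_dir)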
class LibraryManager():
    def __init__(self, dest_path, platform):
        self.dest_path = dest_path
        self.platform = platform
        self.root = os.path.dirname(__file__)

    def check_exist(self):
        for libname in get_libname(self.platform):
            if not xbmcvfs.exists(os.path.join(self.dest_path, libname)):
                return False
        return True

    def check_update(self):
        need_update = False
        for libname in get_libname(self.platform):
            if libname != 'liblibtorrent.so':
                self.libpath = os.path.join(self.dest_path, libname)
                self.sizepath = os.path.join(self.root, self.platform['system'], self.platform['version'],
                                             libname + '.size.txt')
                size = str(os.path.getsize(self.libpath))
                size_old = open(self.sizepath, "r").read()
                if size_old != size:
                    need_update = True
        return need_update

    def update(self):
        if self.check_update():
            for libname in get_libname(self.platform):
                self.libpath = os.path.join(self.dest_path, libname)
                xbmcvfs.delete(self.libpath)
            self.download()

    def download(self):
        xbmcvfs.mkdirs(self.dest_path)
        for libname in get_libname(self.platform):
            dest = os.path.join(self.dest_path, libname)
            log("try to fetch %s" % libname)
            url = "%s/%s/%s/%s.zip" % (__libbaseurl__, self.platform['system'], self.platform['version'], libname)
            if libname != 'liblibtorrent.so':
                try:
                    self.http = HTTP()
                    self.http.fetch(url, download=dest + ".zip", progress=True)
                    log("%s -> %s" % (url, dest))
                    xbmc.executebuiltin('XBMC.Extract("%s.zip","%s")' % (dest, self.dest_path), True)
                    xbmcvfs.delete(dest + ".zip")
                except Exception:
                    text = 'Failed download %s!' % libname
                    xbmc.executebuiltin("XBMC.Notification(%s,%s,%s,%s)" % (__plugin__, text, 750, __icon__))
            else:
                xbmcvfs.copy(os.path.join(self.dest_path, 'libtorrent.so'), dest)
        return True

    def android_workaround(self, new_dest_path):
        for libname in get_libname(self.platform):
            libpath = os.path.join(self.dest_path, libname)
            size = str(os.path.getsize(libpath))
            new_libpath = os.path.join(new_dest_path, libname)
            if not xbmcvfs.exists(new_libpath):
                xbmcvfs.copy(libpath, new_libpath)
                log('Copied %s -> %s' % (libpath, new_libpath))
            else:
                new_size = str(os.path.getsize(new_libpath))
                if size != new_size:
                    xbmcvfs.delete(new_libpath)
                    xbmcvfs.copy(libpath, new_libpath)
                    log('Deleted and copied (%s) %s -> (%s) %s' % (size, libpath, new_size, new_libpath))
        return new_dest_path
class TvDb:
    """
    API:
        scraper - scraper entry point
        search  - series search
        movie   - movie profile
    """

    def __init__(self):
        self.api_key = '33DBB309BB2B0ADB'
        self.cache = Cache('tvdb.db', 1.0)
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.thetvdb.com/'
        }

    # API
    def scraper(self, search, year=None, season=None):
        try:
            if not isinstance(search, list):
                search = [search]
            tag = 'scraper:' + urllib.quote_plus(":".join(search).encode('utf8'))
        except Exception:
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, search, year)
            if not id:
                return None
            if season:
                return self.get_banners(id)
            return self.movie(id)

    def get_banners(self, id):
        import xml.etree.ElementTree as ET
        dirname = tempfile.mkdtemp()
        response = self.http.fetch('http://www.thetvdb.com/api/' + self.api_key + '/series/' + str(id) + '/all/ru.zip',
                                   headers=self.headers, download=os.path.join(dirname, 'movie.zip'))
        if response.error:
            self._movie_clear(dirname)
            return False, None
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, 'movie.zip'), 'r')
            filezip.extractall(dirname)
            filezip.close()
            movie = file(os.path.join(dirname, 'banners.xml'), 'rb').read().decode('utf8')
        except Exception:
            self._movie_clear(dirname)
            return False, None
        self._movie_clear(dirname)
        dom = ET.fromstring(movie)
        if not len(dom):
            return

        def dom2dict(node):
            ret = {}
            for child in node:
                if len(child):
                    ret.setdefault(child.tag.lower(), []).append(dom2dict(child))
                else:
                    ret[child.tag.lower()] = child.text
            return ret

        def update_image_urls(meta):
            if isinstance(meta, dict):
                for k, v in meta.items():
                    if isinstance(v, list):
                        map(update_image_urls, v)
                    elif isinstance(v, dict):
                        update_image_urls(v)
                    elif k in ["banner", "fanart", "poster", "filename", "bannerpath",
                               "vignettepath", "thumbnailpath"] and isinstance(v, basestring):
                        meta[k] = image_url(v)
            return meta

        def image_url(fragment):
            return "%s/banners/%s" % ("http://www.thetvdb.com", fragment)

        return update_image_urls(dom2dict(dom))["banner"]

    def search(self, name):
        return self._search(name)

    def movie(self, id):
        id = str(id)
        return self.cache.get('movie:' + id, self._movie, id)

    def _movie(self, id):
        dirname = tempfile.mkdtemp()
        response = self.http.fetch('http://www.thetvdb.com/api/' + self.api_key + '/series/' + id + '/all/ru.zip',
                                   headers=self.headers, download=os.path.join(dirname, 'movie.zip'))
        if response.error:
            self._movie_clear(dirname)
            return False, None
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, 'movie.zip'), 'r')
            filezip.extractall(dirname)
            filezip.close()
            movie = file(os.path.join(dirname, 'ru.xml'), 'rb').read().decode('utf8')
        except Exception:
            self._movie_clear(dirname)
            return False, None
        self._movie_clear(dirname)
        body = re.compile(r'<Series>(.+?)</Series>', re.U | re.S).search(movie)
        if not body:
            return False, None
        body = body.group(1)
        res = {
            'icon': None,
            'thumbnail': None,
            'properties': {
                'fanart_image': None,
            },
            'info': {
                'count': int(id)
            }
        }
        # directors and writers
        for tag in ('Director', 'Writer'):
            people = {}
            people_list = []
            [people_list.extend(x.split('|')) for x in
             re.compile(r'<' + tag + r'>([^<]+)</' + tag + r'>', re.U | re.S).findall(movie)]
            [people.update({x: 1}) for x in [x.strip() for x in people_list] if x]
            if people:
                res['info'][tag.lower()] = u', '.join([x for x in people.keys() if x])
        for tag, retag, typeof, targettype in (
                ('plot', 'Overview', None, None),
                ('mpaa', 'ContentRating', None, None),
                ('premiered', 'FirstAired', None, None),
                ('studio', 'Network', None, None),
                ('title', 'SeriesName', None, None),
                ('runtime', 'Runtime', None, None),
                ('votes', 'RatingCount', None, None),
                ('rating', 'Rating', float, None),
                ('genre', 'Genre', list, unicode),
                ('cast', 'Actors', list, None)
        ):
            r = re.compile(r'<' + retag + r'>([^<]+)</' + retag + r'>', re.U | re.S).search(body)
            if r:
                r = r.group(1).strip()
                if typeof == float:
                    res['info'][tag] = float(r)
                elif typeof == list:
                    if targettype == unicode:
                        res['info'][tag] = u', '.join([x for x in [x.strip() for x in r.split(u'|')] if x])
                    else:
                        res['info'][tag] = [x for x in [x.strip() for x in r.split(u'|')] if x]
                else:
                    res['info'][tag] = r
        # year
        if 'premiered' in res['info']:
            res['info']['year'] = int(res['info']['premiered'].split('-')[0])
        # poster
        r = re.compile(r'<poster>([^<]+)</poster>', re.U | re.S).search(body)
        if r:
            res['icon'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
            res['thumbnail'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        # fanart
        r = re.compile(r'<fanart>([^<]+)</fanart>', re.U | re.S).search(body)
        if r:
            res['properties']['fanart_image'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        timeout = True
        # if the show is recent, cache it only briefly (the site may still be updated)
        if 'year' not in res['info'] or int(res['info']['year']) >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        return timeout, res

    def _movie_clear(self, dirname):
        for filename in os.listdir(dirname):
            try:
                os.unlink(os.path.join(dirname, filename))
            except Exception:
                raise
        try:
            os.rmdir(dirname)
        except Exception:
            raise

    def _search(self, search):
        i = -1
        for name in search:
            i += 1
            response = self.http.fetch('http://www.thetvdb.com/api/GetSeries.php?language=ru&seriesname=' +
                                       urllib.quote_plus(name.encode('utf-8', 'ignore')), headers=self.headers)
            if response.error:
                return None
            res = []
            rows = re.compile('<Series>(.+?)</Series>', re.U | re.S).findall(response.body.decode('utf8'))
            if rows:
                recmd = re.compile('<seriesid>([0-9]+)</seriesid>', re.U | re.S)
                for row in [x for x in rows if x.find(u'<language>ru</language>') != -1]:
                    r = recmd.search(row)
                    if r:
                        res.append(int(r.group(1)))
                # in some cases a show can only be found by its original title,
                # even though a Russian description exists
                if not res:
                    for row in [x for x in rows if x.find(u'<language>en</language>') != -1]:
                        r = recmd.search(row)
                        if r:
                            res.append(int(r.group(1)))
            if res:
                break
        return {'pages': (1, 0, 1, 0), 'data': res}

    def _scraper(self, name, year):
        timeout = True
        # if the show is recent, cache it only briefly (the site may still be updated)
        if year and year >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        ids = self._search(name)
        if ids is None:
            return False, None
        elif not ids['data']:
            # cache an empty result for three days
            return 259200, None
        else:
            return timeout, ids['data'][0]
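# Usage sketch, not from the original source: scraper() takes a title (or a
# list of candidate titles) plus an optional year, resolves a TheTVDB series
# id through the cache, and returns the parsed profile; with season set it
# returns the banner list instead. The title below is only an example.
def _example_tvdb_lookup():
    tvdb = TvDb()
    profile = tvdb.scraper([u'Sherlock'], year=2010)            # info dict or None
    banners = tvdb.scraper([u'Sherlock'], year=2010, season=1)  # banner URLs
    return profile, banners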
class Cache:
    def __init__(self, name, version, expire=0, size=0, step=100):
        self.name = name
        self.version = version
        self._connect()
        if expire:
            self.expire(expire)
        if size:
            self.size(size, step)

    def get(self, token, callback, *param):
        cur = self.db.cursor()
        cur.execute('select expire,data from cache where id=? limit 1', (token,))
        row = cur.fetchone()
        cur.close()
        if row:
            if row[0] and row[0] < int(time.time()):
                pass  # entry expired, fall through to the callback
            else:
                try:
                    obj = pickle.loads(row[1])
                except Exception:
                    pass
                else:
                    return obj
        response = callback(*param)
        if response[0]:
            obj = sqlite.Binary(pickle.dumps(response[1]))
            curtime = int(time.time())
            cur = self.db.cursor()
            if isinstance(response[0], bool):
                # True means: cache forever (no expire timestamp)
                cur.execute('replace into cache(id,addtime,expire,data) values(?,?,?,?)',
                            (token, curtime, None, obj))
            else:
                # an integer means: cache for that many seconds
                cur.execute('replace into cache(id,addtime,expire,data) values(?,?,?,?)',
                            (token, curtime, curtime + response[0], obj))
            self.db.commit()
            cur.close()
        return response[1]

    def expire(self, expire):
        # with rtrCache_lock:
        cur = self.db.cursor()
        cur.execute('delete from cache where addtime<?', (int(time.time()) - expire,))
        self.db.commit()
        cur.close()

    def size(self, size, step=100):
        # with rtrCache_lock:
        while True:
            if os.path.getsize(self.filename) < size:
                break
            cur = self.db.cursor()
            cur.execute('select id from cache order by addtime asc limit ?', (step,))
            rows = cur.fetchall()
            if not rows:
                cur.close()
                break
            cur.execute('delete from cache where id in (' + ','.join(len(rows) * '?') + ')',
                        [x[0] for x in rows])
            self.db.commit()
            cur.close()

    def flush(self):
        # with rtrCache_lock:
        cur = self.db.cursor()
        cur.execute('delete from cache')
        self.db.commit()
        cur.close()

    def _connect(self):
        with rtrCache_lock:
            dirname = xbmc.translatePath('special://temp')
            for subdir in ('xbmcup', 'plugin.video.torrenter'):
                dirname = os.path.join(dirname, subdir)
                if not xbmcvfs.exists(dirname):
                    xbmcvfs.mkdir(dirname)
            self.filename = os.path.join(dirname, self.name)
            first = False
            if not xbmcvfs.exists(self.filename):
                first = True
            self.db = sqlite.connect(self.filename, check_same_thread=False)
            if not first:
                cur = self.db.cursor()
                try:
                    cur.execute('select version from db_ver')
                    row = cur.fetchone()
                    if not row or float(row[0]) != self.version:
                        cur.execute('drop table cache')
                        cur.execute('drop table if exists db_ver')
                        first = True
                except Exception:
                    cur.execute('drop table cache')
                    first = True
                self.db.commit()
                cur.close()
            if first and not self.first_time():
                cur = self.db.cursor()
                cur.execute('pragma auto_vacuum=1')
                cur.execute('create table cache(id varchar(255) unique, addtime integer, expire integer, data blob)')
                cur.execute('create index time on cache(addtime asc)')
                cur.execute('create table db_ver(version real)')
                cur.execute('insert into db_ver(version) values(?)', (self.version,))
                self.db.commit()
                cur.close()

    def first_time(self):
        scrapers = {'tvdb': 'TheTVDB.com', 'tmdb': 'TheMovieDB.org', 'kinopoisk': 'KinoPoisk.ru'}
        ok = xbmcgui.Dialog().yesno(Localization.localize('Content Lists'),
                                    Localization.localize('Do you want to preload full metadata?') + ' (%s)' % (
                                        scrapers[os.path.basename(self.filename).split('.')[0]]),
                                    Localization.localize('It is highly recommended!'))
        if ok:
            return self.download()
        else:
            return False

    def download(self):
        dirname = os.path.dirname(self.filename)
        zipname = os.path.basename(self.filename).replace('.db', '') + '.zip'
        url = 'http://www.tat-store.ru/torrenter/' + zipname
        self.http = HTTP()
        response = self.http.fetch(url, download=os.path.join(dirname, zipname), progress=True)
        if response.error:
            return False
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, zipname), 'r')
            filezip.extractall(dirname)
            filezip.close()
        except Exception:
            return False
        return True
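# Contract sketch, inferred from Cache.get() above: the callback must return
# a (timeout, data) pair. True caches data permanently, an integer caches it
# for that many seconds, and a falsy first element (False/None/0) skips
# caching entirely. The token and payload below are made-up examples; like
# the class itself, this needs the Kodi runtime (xbmc/xbmcvfs) to run.
def _example_cache_callback():
    def fetch_payload(url):
        return 3600, {'url': url}  # cache the dict for one hour

    cache = Cache('example.db', 1.0)
    return cache.get('payload:demo', fetch_payload, 'http://example.org/')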
class KinoPoisk:
    """
    API:
        scraper - scraper entry point
        movie   - movie profile
        search  - movie search
        best    - search for top-rated movies
        person  - person search
        work    - information about a person's works
    """

    def __init__(self, language='ru'):
        dbname = 'kinopoisk.%s.db' % language
        self.cache = Cache(dbname, 1.0)
        self.html = Clear()
        self.timeout = 60.0
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.kinopoisk.ru/level/7/'
        }

    # API
    def scraper(self, search, year=None):
        try:
            if not isinstance(search, list):
                search = [search]
            tag = 'scraper:' + urllib.quote_plus(":".join(search).encode('utf8'))
        except Exception:
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, search, year)
            if not id:
                return None
            return self.movie(id)

    def movie(self, id):
        id = str(id)
        return self.cache.get('movie:' + id, self._movie, id)

    def search(self, search, year):
        return self._search_movie(search, year)

    def countries(self):
        return COUNTRIES

    def country(self, id, default=None):
        country = [x[1] for x in COUNTRIES if x[0] == id]
        return country[0] if country else default

    def _search_movie(self, search, year=None):
        parser = kinopoisk.pageparser.PageParser(kinopoisk.LOGGER, isDebug=True)
        orginalname = search[0]
        if len(search) > 1:
            name = search[1]
        else:
            name = None
        results = parser.fetchAndParseSearchResults(orginalname, year, name)
        if results and results[0][3] > 70:
            return results[0][0]

    def _scraper(self, search, year):
        timeout = True
        # if the movie is recent, cache it only briefly (the site may still be updated)
        if year and year > time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60 * 4  # 4 weeks
        movie_id = self._search_movie(search, year)
        if movie_id is None:
            # cache an empty result for 4 weeks
            return 7 * 24 * 60 * 60 * 4, None
        else:
            return timeout, movie_id

    def _movie(self, id):
        response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/', headers=self.headers,
                                   timeout=self.timeout)
        if response.error:
            return False, None
        html = response.body.decode('windows-1251')
        res = {
            'icon': None,
            'thumbnail': None,
            'properties': {
                'fanart_image': None,
            },
            'info': {
                'count': int(id)
            }
        }
        # title, original title, tagline, certification, year, top250
        # runtime - movie duration (kept separate, otherwise the file size is not visible)
        for tag, reg, cb in (
                ('title', '<title>(.+?)</title>', self.html.string),
                ('originaltitle', 'itemprop="alternativeHeadline">([^<]*)</span>', self.html.string),
                ('tagline', '<td style="color\: #555">«(.+?)»</td></tr>', self.html.string),
                ('mpaa', 'images/mpaa/([^\.]+).gif', self.html.string),
                ('runtime', '<td class="time" id="runtime">[^<]+<span style="color\: #999">/</span>([^<]+)</td>',
                 self.html.string),
                ('year', '<a href="/lists/m_act%5Byear%5D/([0-9]+)/"', int),
                ('top250', 'Топ250\: <a\shref="/level/20/#([0-9]+)', int)):
            r = re.compile(reg, re.U).search(html)
            if r:
                value = r.group(1).strip()
                if value:
                    res['info'][tag] = cb(value)
        # directors, writers, genres
        for tag, reg in (('director', u'<td itemprop="director">(.+?)</td>'),
                         ('writer', u'<td class="type">сценарий</td><td[^>]*>(.+?)</td>'),
                         ('genre', u'<span itemprop="genre">(.+?)</span>')):
            r = re.compile(reg, re.U | re.S).search(html)
            if r:
                r2 = []
                for r in re.compile('<a href="[^"]+">([^<]+)</a>', re.U).findall(r.group(1)):
                    r = self.html.string(r)
                    if r and r != '...':
                        r2.append(r)
                if r2:
                    res['info'][tag] = u', '.join(r2)
        # cast
        r = re.compile(u'<h4>В главных ролях:</h4>(.+?)</ul>', re.U | re.S).search(html)
        if r:
            actors = []
            for r in re.compile('<li itemprop="actors"><a [^>]+>([^<]+)</a></li>', re.U).findall(r.group(1)):
                r = self.html.string(r)
                if r and r != '...':
                    actors.append(r)
            if actors:
                res['info']['cast'] = actors[:]
                # res['info']['castandrole'] = actors[:]
        # plot
        r = re.compile('<span class="_reachbanner_"><div class="brand_words" itemprop="description">(.+?)</div></span>',
                       re.U).search(html)
        if r:
            plot = self.html.text(r.group(1).replace('<=end=>', '\n'))
            if plot:
                res['info']['plot'] = plot
        # IMDB
        r = re.compile('IMDb: ([0-9.]+) \(([0-9\s]+)\)</div>', re.U).search(html)
        if r:
            res['info']['rating'] = float(r.group(1).strip())
            res['info']['votes'] = r.group(2).strip()
        # premiere
        r = re.compile(u'премьера \(мир\)</td>(.+?)</tr>', re.U | re.S).search(html)
        if r:
            r = re.compile(u'data\-ical\-date="([^"]+)"', re.U | re.S).search(r.group(1))
            if r:
                data = r.group(1).split(' ')
                if len(data) == 3:
                    i = 0
                    for mon in (u'января', u'февраля', u'марта', u'апреля', u'мая', u'июня', u'июля',
                                u'августа', u'сентября', u'октября', u'ноября', u'декабря'):
                        i += 1
                        if mon == data[1]:
                            mon = str(i)
                            if len(mon) == 1:
                                mon = '0' + mon
                            day = data[0]
                            if len(day) == 1:
                                day = '0' + day
                            res['info']['premiered'] = '-'.join([data[2], mon, day])
                            break
        # poster
        r = re.compile(u'onclick="openImgPopup\(([^\)]+)\)', re.U | re.S).search(html)
        if r:
            poster = r.group(1).replace("'", '').strip()
            if poster:
                res['thumbnail'] = res['icon'] = 'http://kinopoisk.ru' + poster
        menu = re.compile('<ul id="newMenuSub" class="clearfix(.+?)<!\-\- /menu \-\->', re.U | re.S).search(html)
        if menu:
            menu = menu.group(1)
            # fanart
            if menu.find('/film/' + id + '/wall/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/wall/', headers=self.headers,
                                           timeout=self.timeout)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    fanart = re.compile('<a href="/picture/([0-9]+)/w_size/([0-9]+)/">', re.U).findall(html)
                    if fanart:
                        fanart.sort(cmp=lambda (id1, size1), (id2, size2): cmp(int(size1), int(size2)))
                        # try to pick the best-fitting size
                        fanart_best = [x for x in fanart if int(x[1]) <= 1280]
                        if fanart_best:
                            fanart = fanart_best
                        response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] +
                                                   '/w_size/' + fanart[-1][1] + '/', headers=self.headers,
                                                   timeout=self.timeout)
                        if not response.error:
                            html = response.body.decode('windows-1251')
                            r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html)
                            if r:
                                res['properties']['fanart_image'] = r.group(1).strip()
            # if there is no fanart (wallpapers), try the stills instead
            if not res['properties']['fanart_image'] and menu.find('/film/' + id + '/stills/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/stills/', headers=self.headers,
                                           timeout=self.timeout)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    fanart = re.compile('<a href="/picture/([0-9]+)/"><img src="[^<]+</a>[^<]+<b><i>([0-9]+)×([0-9]+)</i>',
                                        re.U).findall(html)
                    if fanart:
                        fanart.sort(cmp=lambda (id1, size1, t1), (id2, size2, t2): cmp(int(size1), int(size2)))
                        # try to pick the best-fitting size
                        fanart_best = [x for x in fanart if int(x[1]) <= 1280 and int(x[1]) > int(x[2])]
                        if fanart_best:
                            fanart = fanart_best
                        response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/',
                                                   headers=self.headers, timeout=self.timeout)
                        if not response.error:
                            html = response.body.decode('windows-1251')
                            r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html)
                            if r:
                                res['properties']['fanart_image'] = r.group(1).strip()
            # studios
            if menu.find('/film/' + id + '/studio/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/studio/', headers=self.headers,
                                           timeout=self.timeout)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    r = re.compile(u'<b>Производство:</b>(.+?)</table>', re.U | re.S).search(html)
                    if r:
                        studio = []
                        for r in re.compile('<a href="/lists/m_act%5Bstudio%5D/[0-9]+/" class="all">(.+?)</a>',
                                            re.U).findall(r.group(1)):
                            r = self.html.string(r)
                            if r:
                                studio.append(r)
                        if studio:
                            res['info']['studio'] = u', '.join(studio)
        timeout = True
        # if the movie is recent, cache it only briefly (the site may still be updated)
        if 'year' not in res['info'] or not res['properties']['fanart_image'] \
                or int(res['info']['year']) > time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60 * 4  # 4 weeks
        return timeout, res
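# Usage sketch, not from the original source: this variant expects the
# original title first and an optional localized title second, and accepts a
# match only when the page parser's score exceeds 70. Titles are examples.
def _example_kinopoisk_lookup():
    kp = KinoPoisk(language='ru')
    movie = kp.scraper([u'The Matrix', u'Матрица'], year=1999)
    return movie  # dict with 'info', 'icon', 'thumbnail', 'properties', or None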
class TvDb:
    """
    API:
        scraper - scraper entry point
        search  - series search
        movie   - movie profile
    """

    def __init__(self, language='en'):
        self.api_key = '33DBB309BB2B0ADB'
        dbname = 'tvdb.%s.db' % language
        self.cache = Cache(dbname, 1.0)
        self.language = language
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.thetvdb.com/'
        }

    # API
    def scraper(self, search, year=None):
        try:
            if not isinstance(search, list):
                search = [search]
            tag = 'scraper:' + urllib.quote_plus(":".join(search).encode('utf8'))
        except Exception:
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, search, year)
            if not id:
                return None
            return self.movie(id)

    def search(self, search, year=None):
        return self._search(search, year)

    def movie(self, id):
        id = str(id)
        return self.cache.get('movie:' + id, self._movie, id)

    def _movie(self, id):
        try:
            dirname = tempfile.mkdtemp()
        except Exception:
            dirname = xbmc.translatePath('special://temp')
            for subdir in ('xbmcup', 'plugin.video.torrenter'):
                dirname = os.path.join(dirname, subdir)
                if not os.path.exists(dirname):
                    os.mkdir(dirname)
        url = 'http://www.thetvdb.com/api/' + self.api_key + '/series/' + id + '/all/' + self.language + '.zip'
        # print url
        response = self.http.fetch(url, headers=self.headers, download=os.path.join(dirname, 'movie.zip'),
                                   timeout=20)
        if response.error:
            print "ERRRRRROR! " + str(response.error)
            self._movie_clear(dirname)
            return False, None
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, 'movie.zip'), 'r')
            filezip.extractall(dirname)
            filezip.close()
            movie = file(os.path.join(dirname, self.language + '.xml'), 'rb').read().decode('utf8')
        except Exception:
            self._movie_clear(dirname)
            return False, None
        self._movie_clear(dirname)
        body = re.compile(r'<Series>(.+?)</Series>', re.U | re.S).search(movie)
        if not body:
            return False, None
        body = body.group(1)
        res = {
            'icon': None,
            'thumbnail': None,
            'properties': {
                'fanart_image': None,
            },
            'info': {
                'count': int(id)
            }
        }
        # directors and writers
        for tag in ('Director', 'Writer'):
            people = {}
            people_list = []
            [people_list.extend(x.split('|')) for x in
             re.compile(r'<' + tag + r'>([^<]+)</' + tag + r'>', re.U | re.S).findall(movie)]
            [people.update({x: 1}) for x in [x.strip() for x in people_list] if x]
            if people:
                res['info'][tag.lower()] = u', '.join([x for x in people.keys() if x])
        for tag, retag, typeof, targettype in (
                ('plot', 'Overview', None, None),
                ('mpaa', 'ContentRating', None, None),
                ('premiered', 'FirstAired', None, None),
                ('studio', 'Network', None, None),
                ('title', 'SeriesName', None, None),
                ('runtime', 'Runtime', None, None),
                ('votes', 'RatingCount', None, None),
                ('rating', 'Rating', float, None),
                ('genre', 'Genre', list, unicode),
                ('cast', 'Actors', list, None)
        ):
            r = re.compile(r'<' + retag + r'>([^<]+)</' + retag + r'>', re.U | re.S).search(body)
            if r:
                r = r.group(1).strip()
                if typeof == float:
                    res['info'][tag] = float(r)
                elif typeof == list:
                    if targettype == unicode:
                        res['info'][tag] = u', '.join([x for x in [x.strip() for x in r.split(u'|')] if x])
                    else:
                        res['info'][tag] = [x for x in [x.strip() for x in r.split(u'|')] if x]
                else:
                    res['info'][tag] = r
        # year
        if 'premiered' in res['info']:
            res['info']['year'] = int(res['info']['premiered'].split('-')[0])
        # poster
        r = re.compile(r'<poster>([^<]+)</poster>', re.U | re.S).search(body)
        if r:
            res['icon'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
            res['thumbnail'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        # fanart
        r = re.compile(r'<fanart>([^<]+)</fanart>', re.U | re.S).search(body)
        if r:
            res['properties']['fanart_image'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        timeout = True
        # if the show is recent, cache it only briefly (the site may still be updated)
        if 'year' not in res['info'] or not res['properties']['fanart_image'] \
                or int(res['info']['year']) > time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60 * 4  # 4 weeks
        return timeout, res

    def _movie_clear(self, dirname):
        for filename in os.listdir(dirname):
            try:
                os.unlink(os.path.join(dirname, filename))
            except Exception:
                raise
        try:
            os.rmdir(dirname)
        except Exception:
            raise

    def _search(self, search, year=None):
        i = -1
        id = None
        for name in search:
            # print urllib.quote_plus(name.encode('utf-8'))
            url = 'http://www.thetvdb.com/api/GetSeries.php?language=' + self.language + '&seriesname=' + \
                  urllib.quote_plus(name.encode('utf-8'))
            # print url
            i += 1
            response = self.http.fetch(url, headers=self.headers, timeout=20)
            # print response.body
            if response.error:
                # print "ERRRRRROR! " + str(response.error)
                return None
            res = []
            rows = re.compile('<Series>(.+?)</Series>', re.U | re.S).findall(response.body.decode('utf8'))
            if rows:
                recmd = re.compile('<seriesid>([0-9]+)</seriesid>', re.U | re.S)
                for row in [x for x in rows if
                            x.find(u'<language>%s</language>' % self.language.decode('utf8')) != -1]:
                    r = recmd.search(row)
                    if r:
                        res.append(int(r.group(1)))
                # in some cases a show can only be found by its original title,
                # even though a Russian description exists
                if not res and self.language != 'en':
                    for row in [x for x in rows if x.find(u'<language>en</language>') != -1]:
                        r = recmd.search(row)
                        if r:
                            res.append(int(r.group(1)))
            if len(res) > 1:
                Data = []
                for id in res:
                    for row in rows:
                        recmd = re.compile('<seriesid>([0-9]+)</seriesid>', re.U | re.S)
                        r = recmd.search(row)
                        if int(r.group(1)) == id:
                            title = re.compile('<SeriesName>(.+?)</SeriesName>', re.U | re.S).search(row)
                            Syear = re.compile('<FirstAired>(.+?)</FirstAired>', re.U | re.S).search(row)
                            if not Syear:
                                Syear = 0
                            else:
                                Syear = Syear.group(1)
                            Data.append((title.group(1), Syear, id))
                index = get_best(Data, search, year)
                if index and index['rate'] > 70:
                    id = str(index['id'])
            elif len(res) == 1:
                id = str(res[0])
            if id:
                break
        return id

    def _scraper(self, search, year):
        timeout = True
        # if the show is recent, cache it only briefly (the site may still be updated)
        if year and year > time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60 * 4  # 4 weeks
        id = self._search(search, year)
        if id is None:
            return 7 * 24 * 60 * 60 * 4, None
        else:
            return timeout, id
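# Sketch of the disambiguation step in _search() above, not from the original
# source: with several candidate ids it builds (title, first-aired, id)
# tuples and defers to the external get_best() helper, which appears to
# return a dict like {'id': ..., 'rate': 0-100}; only matches rated above 70
# are accepted. The tuples below are made-up example data.
def _example_get_best():
    Data = [(u'Sherlock', u'2010-07-25', 176941),
            (u'Sherlock Holmes', u'1984-04-24', 76931)]
    index = get_best(Data, [u'Sherlock'], 2010)
    return index['id'] if index and index['rate'] > 70 else None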
class KinoPoisk:
    """
    API:
        scraper - scraper entry point
        movie   - movie profile
        search  - movie search
        best    - search for top-rated movies
        person  - person search
        work    - information about a person's works
    """

    def __init__(self):
        self.cache = Cache('kinopoisk.db', 1.0)
        self.html = Clear()
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.kinopoisk.ru/level/7/'
        }

    # API
    def scraper(self, search, year=None, trailer_quality=None):
        try:
            if isinstance(search, list):
                search = search[0] or ""
            tag = 'scraper:' + urllib.quote_plus(search.encode('windows-1251'))
        except Exception:
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, search, year)
            if not id:
                return None
            return self.movie(id, trailer_quality)

    def movie(self, id, trailer_quality=None):
        id = str(id)
        if trailer_quality is None:
            trailer_quality = 6
        movie = self.cache.get('movie:' + id, self._movie, id)
        if not movie:
            return None
        if 'trailers' in movie and movie['trailers']:
            # keep only the streams at (or below) the quality we want
            video = []
            for m in movie['trailers']:
                url = [x for x in m['video'] if x[0] <= trailer_quality]
                if url:
                    m['video'] = url[-1]
                    video.append(m)
            movie['trailers'] = video
            if movie['trailers']:
                # pick the main trailer
                r = [x for x in movie['trailers'] if x['trailer']]
                if r:
                    movie['info']['trailer'] = r[0]['video'][1]
                else:
                    # if no proper trailer is found, return whatever is there...
                    movie['info']['trailer'] = movie['trailers'][0]['video'][1]
        return movie

    def search(self, name, trailer_quality=None):
        return self._search_movie(name)

    def best(self, **kwarg):
        page = kwarg.get('page', 1)
        limit = kwarg.get('limit', 50)
        url = 'http://www.kinopoisk.ru/top/navigator/m_act%5Bis_film%5D/on/m_act%5Bnum_vote%5D/' + \
              str(kwarg.get('votes', 100)) + '/'
        if kwarg.get('dvd'):
            url += 'm_act%5Bis_dvd%5D/on/'
        if kwarg.get('decade'):
            url += 'm_act%5Bdecade%5D/' + str(kwarg['decade']) + '/'
        if kwarg.get('genre'):
            url += 'm_act%5Bgenre%5D/' + str(GENRE[kwarg['genre']]) + '/'
        if kwarg.get('country'):
            url += 'm_act%5Bcountry%5D/' + str(kwarg['country']) + '/'
        if kwarg.get('rate'):
            url += 'm_act%5Brating%5D/' + str(kwarg['rate']) + ':/'
        if kwarg.get('mpaa'):
            url += 'm_act%5Bmpaa%5D/' + str(kwarg['mpaa']) + '/'
        url += 'perpage/' + str(limit) + '/order/ex_rating/'
        if page > 1:
            url += 'page/' + str(page) + '/'
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return None
        res = {'pages': (1, 0, 1, 0), 'data': []}
        r = re.compile('<div class="pagesFromTo(.+?)<div class="pagesFromTo', re.U | re.S).search(
            response.body.decode('windows-1251'))
        if r:
            body = r.group(1)
            # compile pagelist
            p = re.compile('>([0-9]+)—[0-9]+[^0-9]+?([0-9]+)', re.U).search(body)
            if p:
                page = (int(p.group(1)) - 1) / limit + 1
                total = int(p.group(2))
                pages = total / limit
                if limit * pages != total:
                    pages += 1
                res['pages'] = (pages, 0 if page == 1 else page - 1, page, 0 if page == pages else page + 1)
            # end compile
            for id in re.compile('<div id="tr_([0-9]+)"', re.U | re.S).findall(body):
                res['data'].append(int(id))
        return res

    def person(self, name):
        response = self.http.fetch('http://www.kinopoisk.ru/s/type/people/list/1/find/' +
                                   urllib.quote_plus(name.encode('windows-1251')) + '/order/relevant/',
                                   headers=self.headers)
        if response.error:
            return None
        res = []
        body = re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U | re.S).search(
            response.body.decode('windows-1251'))
        if body:
            for block in re.compile('<p class="pic">(.+?)<div class="clear">', re.U | re.S).findall(body.group(1)):
                id, name, original, year, poster = None, None, None, None, None
                r = re.compile('<p class="name"><a href="http://www\.kinopoisk\.ru/level/4/people/([0-9]+)[^>]+>([^<]+)</a>',
                               re.U | re.S).search(block)
                if r:
                    id = r.group(1)
                    name = r.group(2).strip()
                if id and name:
                    r = re.compile('<span class="gray">([^<]+)</span>', re.U | re.S).search(block)
                    if r:
                        original = r.group(1).strip()
                        if not original:
                            original = None
                    r = re.compile('<span class="year">([0-9]{4})</span>', re.U | re.S).search(block)
                    if r:
                        year = int(r.group(1))
                    if block.find('no-poster.gif') == -1:
                        poster = 'http://st.kinopoisk.ru/images/actor/' + id + '.jpg'
                    res.append({
                        'id': int(id),
                        'name': name,
                        'originalname': original,
                        'year': year,
                        'poster': poster
                    })
        return {'pages': (1, 0, 1, 0), 'data': res}

    def work(self, id):
        response = self.http.fetch('http://www.kinopoisk.ru/name/' + str(id) + '/', headers=self.headers)
        if response.error:
            return None
        res = {}
        r = re.compile('id="sort_block">(.+?)<style>', re.U | re.S).search(response.body.decode('windows-1251'))
        if r:
            for block in r.group(1).split(u'<table cellspacing="0" cellpadding="0" border="0" width="100%">'):
                work = None
                for w in ('actor', 'director', 'writer', 'producer', 'producer_ussr', 'composer',
                          'operator', 'editor', 'design', 'voice', 'voice_director'):
                    if block.find(u'id="' + w + u'"') != -1:
                        work = 'producer' if w == 'producer_ussr' else w
                        break
                if work:
                    movies = []
                    for id, name in re.compile('<span class="name"><a href="/film/([0-9]+)/" >([^<]+?)</a>',
                                               re.U).findall(block):
                        for tag in (u'(мини-сериал)', u'(сериал)'):
                            if name.find(tag) != -1:
                                break
                        else:
                            movies.append(int(id))
                    if movies:
                        res.setdefault(work, []).extend(movies)
        return res

    def review(self, id, query):
        query_s = 'all' if query == 'stat' else query
        data = self.cache.get('review:' + str(id) + ':' + query_s, self._review, id, query_s)
        if not data:
            return data
        return data[query]

    def countries(self):
        return COUNTRIES

    def country(self, id, default=None):
        country = [x[1] for x in COUNTRIES if x[0] == id]
        return country[0] if country else default

    # PRIVATE
    def _search_movie(self, name, year=None):
        url = 'http://www.kinopoisk.ru/s/type/film/list/1/find/' + \
              urllib.quote_plus(name.encode('windows-1251'))  # + '/order/relevant'
        if year:
            url += '/m_act%5Byear%5D/' + str(year)
        url += '/m_act%5Btype%5D/film/'
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return None
        res = []
        r = re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U | re.S).search(
            response.body.decode('windows-1251'))
        if r:
            for id in re.compile('<p class="name"><a href="/level/1/film/([0-9]+)', re.U | re.S).findall(r.group(1)):
                res.append(int(id))
        return {'pages': (1, 0, 1, 0), 'data': res}

    def _scraper(self, name, year):
        timeout = True
        # if the movie is recent, cache it only briefly (the site may still be updated)
        if year and year >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        ids = self._search_movie(name, year)
        if ids is None:
            return False, None
        elif not ids['data']:
            # cache an empty result for three days
            return 259200, None
        else:
            return timeout, ids['data'][0]

    def _review(self, id, query):
        url = 'http://www.kinopoisk.ru/film/' + str(id) + '/ord/rating/'
        if query in ('good', 'bad', 'neutral'):
            url += 'status/' + query + '/'
        url += 'perpage/200/'
        response = self.http.fetch(url, headers=self.headers)
        if response.error:
            return False, None
        html = response.body.decode('windows-1251')
        res = {
            'stat': {'all': 0, 'good': 0, 'bad': 0, 'neutral': 0},
            query: []
        }
        r = re.compile('<ul class="resp_type">(.+?)</ul>', re.U | re.S).search(html)
        if r:
            ul = r.group(1)
            for q, t in (('pos', 'good'), ('neg', 'bad'), ('neut', 'neutral')):
                r = re.compile('<li class="' + q + '"><a href="[^>]+>[^<]+</a><b>([0-9]+)</b></li>', re.U).search(ul)
                if r:
                    res['stat'][t] = int(r.group(1))
            res['stat']['all'] = res['stat']['good'] + res['stat']['bad'] + res['stat']['neutral']
        r = re.compile('<div class="navigator">(.+?)<div class="navigator">', re.U | re.S).search(html)
        if r:
            for block in r.group(1).split('itemprop="reviews"'):
                review = {
                    'nick': None,
                    'count': None,
                    'title': None,
                    'review': None,
                    'time': None
                }
                r = re.compile('itemprop="reviewBody">(.+?)</div>', re.U | re.S).search(block)
                if r:
                    text = r.group(1)
                    for tag1, tag2 in ((u'<=end=>', u'\n'), (u'<b>', u'[B]'), (u'</b>', u'[/B]'),
                                       (u'<i>', u'[I]'), (u'</i>', u'[/I]'), (u'<u>', u'[U]'), (u'</u>', u'[/U]')):
                        text = text.replace(tag1, tag2)
                    r = self.html.text(text)
                    if r:
                        review['review'] = r
                user = None
                r = re.compile('<p class="profile_name"><s></s><a href="[^>]+>([^<]+)</a></p>').search(block)
                if r:
                    user = self.html.string(r.group(1))
                else:
                    r = re.compile('<p class="profile_name"><s></s>([^<]+)</p>').search(block)
                    if r:
                        user = self.html.string(r.group(1))
                if user:
                    review['nick'] = user
                r = re.compile('<p class="sub_title"[^>]+>([^<]+)</p>').search(block)
                if r:
                    title = self.html.string(r.group(1))
                    if title:
                        review['title'] = title
                r = re.compile('<span class="date">([^<]+)</span>', re.U | re.S).search(block)
                if r:
                    review['time'] = r.group(1).replace(u' |', u',')
                r = re.compile(u'<a href="[^>]+>рецензии \(([0-9]+)\)</a>', re.U | re.S).search(block)
                if r:
                    review['count'] = int(r.group(1))
                if review['nick'] and review['review']:
                    res[query].append(review)
        return 3600, res  # one hour

    def _movie(self, id):
        response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/', headers=self.headers)
        if response.error:
            return False, None
        html = response.body.decode('windows-1251')
        res = {
            'icon': None,
            'thumbnail': None,
            'info': {
                'count': int(id)
            },
            'properties': {
                'fanart_image': None,
            },
        }
        # title, original title, tagline, certification, year, top250
        # runtime - movie duration (kept separate, otherwise the file size is not visible)
        for tag, reg, t in (
                ('title', '<title>(.+?)</title>', 'str'),
                ('originaltitle', 'itemprop="alternativeHeadline">([^<]*)</span>', 'str'),
                ('tagline', '<td style="color\: #555">«(.+?)»</td></tr>', 'str'),
                ('mpaa', 'itemprop="contentRating"\s+content="MPAA\s+([^"]+)"', 'str'),
                ('runtime', '<td class="time" id="runtime">[^<]+<span style="color\: #999">/</span>([^<]+)</td>', 'str'),
                ('year', '<a href="/lists/m_act%5Byear%5D/([0-9]+)/"', 'int'),
                ('top250', 'Топ250\: <a\shref="/level/20/#([0-9]+)', 'int')):
            r = re.compile(reg, re.U).search(html)
            if r:
                value = r.group(1).strip()
                if value:
                    res['info'][tag] = value
                    if t == 'int':
                        res['info'][tag] = int(res['info'][tag])
                    else:
                        res['info'][tag] = self.html.string(res['info'][tag])
        # directors, writers, genres
        for tag, reg in (('director', u'<td itemprop="director">(.+?)</td>'),
                         ('writer', u'<td class="type">сценарий</td><td[^>]*>(.+?)</td>'),
                         ('genre', u'<td itemprop="genre">(.+?)</td>')):
            r = re.compile(reg, re.U | re.S).search(html)
            if r:
                r2 = []
                for r in re.compile('<a href="[^"]+">([^<]+)</a>', re.U).findall(r.group(1)):
                    r = self.html.string(r)
                    if r and r != '...':
                        r2.append(r)
                if r2:
                    res['info'][tag] = u', '.join(r2)
        # cast
        r = re.compile(u'<h4>В главных ролях:</h4><ul>(.+?)</ul>', re.U | re.S).search(html)
        if r:
            actors = []
            for r in re.compile('<li itemprop="actors"><a [^>]+>([^<]+)</a></li>', re.U).findall(r.group(1)):
                r = self.html.string(r)
                if r and r != '...':
                    actors.append(r)
            if actors:
                res['info']['cast'] = actors[:]
                # res['info']['castandrole'] = actors[:]
        # plot
        r = re.compile('<span class="_reachbanner_"><div class="brand_words" itemprop="description">(.+?)</div></span>',
                       re.U).search(html)
        if r:
            plot = self.html.text(r.group(1).replace('<=end=>', '\n'))
            if plot:
                res['info']['plot'] = plot
        # IMDB
        r = re.compile('IMDb: ([0-9.]+) \(([0-9\s]+)\)</div>', re.U).search(html)
        if r:
            res['info']['rating'] = float(r.group(1).strip())
            res['info']['votes'] = r.group(2).strip()
        # # premiere
        # r = re.compile(u'премьера \(мир\)</td>(.+?)</tr>', re.U|re.S).search(html)
        # if r:
        #     r = re.compile(u'data\-ical\-date="([^"]+)"', re.U|re.S).search(r.group(1))
        #     if r:
        #         data = r.group(1).split(' ')
        #         if len(data) == 3:
        #             i = 0
        #             for mon in (u'января', u'февраля', u'марта', u'апреля', u'мая', u'июня', u'июля', u'августа', u'сентября', u'октября', u'ноября', u'декабря'):
        #                 i += 1
        #                 if mon == data[1]:
        #                     mon = str(i)
        #                     if len(mon) == 1:
        #                         mon = '0' + mon
        #                     day = data[0]
        #                     if len(day) == 1:
        #                         day = '0' + day
        #                     res['info']['premiered'] = '-'.join([data[2], mon, day])
        #                     break
        # poster
        r = re.compile(u'onclick="openImgPopup\(([^\)]+)\)', re.U | re.S).search(html)
        if r:
            poster = r.group(1).replace("'", '').strip()
            if poster:
                if poster.startswith("/"):
                    poster = "http://www.kinopoisk.ru%s" % poster
                res['icon'] = poster
                res['thumbnail'] = poster
        menu = re.compile('<ul id="newMenuSub" class="clearfix(.+?)</ul>', re.U | re.S).search(html)
        if menu:
            menu = menu.group(1)
            # fanart
            if menu.find('/film/' + id + '/wall/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/wall/', headers=self.headers)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    fanart = re.compile('<a href="/picture/([0-9]+)/w_size/([0-9]+)/">', re.U).findall(html)
                    if fanart:
                        fanart.sort(cmp=lambda (id1, size1), (id2, size2): cmp(int(size1), int(size2)))
                        # try to pick the best-fitting size
                        fanart_best = [x for x in fanart if int(x[1]) <= 1280]
                        if fanart_best:
                            fanart = fanart_best
                        response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] +
                                                   '/w_size/' + fanart[-1][1] + '/', headers=self.headers)
                        if not response.error:
                            html = response.body.decode('windows-1251')
                            r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html)
                            if r:
                                res['properties']['fanart_image'] = r.group(1).strip()
            # if there is no fanart (wallpapers), try the stills instead
            if not res['properties']['fanart_image'] and menu.find('/film/' + id + '/stills/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/stills/', headers=self.headers)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    fanart = re.compile('<a href="/picture/([0-9]+)/"><img src="[^<]+</a>[^<]+<b><i>([0-9]+)×([0-9]+)</i>',
                                        re.U).findall(html)
                    if fanart:
                        fanart.sort(cmp=lambda (id1, size1, t1), (id2, size2, t2): cmp(int(size1), int(size2)))
                        # try to pick the best-fitting size
                        fanart_best = [x for x in fanart if int(x[1]) <= 1280 and int(x[1]) > int(x[2])]
                        if fanart_best:
                            fanart = fanart_best
                        response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/',
                                                   headers=self.headers)
                        if not response.error:
                            html = response.body.decode('windows-1251')
                            r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html)
                            if r:
                                res['properties']['fanart_image'] = r.group(1).strip()
        # # studios
        # if menu.find('/film/' + id + '/studio/') != -1:
        #     response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/studio/', headers=self.headers)
        #     if not response.error:
        #         html = response.body.decode('windows-1251')
        #         r = re.compile(u'<b>Производство:</b>(.+?)</table>', re.U|re.S).search(html)
        #         if r:
        #             studio = []
        #             for r in re.compile('<a href="/lists/m_act%5Bstudio%5D/[0-9]+/" class="all">(.+?)</a>', re.U).findall(r.group(1)):
        #                 r = self.html.string(r)
        #                 if r:
        #                     studio.append(r)
        #             if studio:
        #                 res['info']['studio'] = u', '.join(studio)
        # # trailers
        # trailers1 = []  # Russian trailers
        # trailers2 = []  # other Russian videos
        # trailers3 = []  # trailers
        # trailers4 = []  # other videos
        # if menu.find('/film/' + id + '/video/') != -1:
        #     response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/video/', headers=self.headers)
        #     if not response.error:
        #         html = response.body.decode('windows-1251')
        #         for row in re.compile(u'<!-- ролик -->(.+?)<!-- /ролик -->', re.U|re.S).findall(html):
        #             # skip irrelevant blocks
        #             if row.find(u'>СМОТРЕТЬ</a>') != -1:
        #                 # is it a Russian clip?
        #                 if row.find('class="flag flag2"') == -1:
        #                     is_ru = False
        #                 else:
        #                     is_ru = True
        #                 # get the trailer name
        #                 r = re.compile('<a href="/film/' + id + '/video/[0-9]+/[^>]+ class="all">(.+?)</a>', re.U).search(row)
        #                 if r:
        #                     name = self.html.string(r.group(1))
        #                     if name:
        #                         trailer = {
        #                             'name': name,
        #                             'time': None,
        #                             'trailer': False,
        #                             'ru': is_ru,
        #                             'video': []
        #                         }
        #                         # trailer or teaser?
        #                         for token in (u'Трейлер', u'трейлер', u'Тизер', u'тизер'):
        #                             if name.find(token) != -1:
        #                                 trailer['trailer'] = True
        #                                 break
        #                         # get the trailer duration
        #                         r = re.compile(u'clock.gif"[^>]+></td>\s*<td style="color\: #777">[^0-9]*([0-9\:]+)</td>', re.U|re.S).search(row)
        #                         if r:
        #                             trailer['time'] = r.group(1).strip()
        #                         # split clips by quality
        #                         for r in re.compile('trailer/([1-3])a.gif"(.+?)link=([^"]+)" class="continue">.+?<td style="color\:#777">([^<]+)</td>\s*</tr>', re.U|re.S).findall(row):
        #                             quality = int(r[0])
        #                             if r[1].find('icon-hd') != -1:
        #                                 quality += 3
        #                             trailer['video'].append((quality, r[2].strip(), r[3]))
        #                         if trailer['video']:
        #                             if trailer['ru']:
        #                                 if trailer['trailer']:
        #                                     trailers1.append(trailer)
        #                                 else:
        #                                     trailers2.append(trailer)
        #                             else:
        #                                 if trailer['trailer']:
        #                                     trailers3.append(trailer)
        #                                 else:
        #                                     trailers4.append(trailer)
        # # concatenate the trailer lists
        # res['trailers'].extend(trailers1)
        # res['trailers'].extend(trailers2)
        # res['trailers'].extend(trailers3)
        # res['trailers'].extend(trailers4)
        timeout = True
        # if the movie is recent, cache it only briefly (the site may still be updated)
        if 'year' not in res['info'] or int(res['info']['year']) >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # week
        return timeout, res
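# Usage sketch, not from the original source: per the (commented) parsing
# above, each trailer's 'video' entries are (quality, url, size) tuples with
# quality 1-3 for SD and 4-6 for HD, so trailer_quality caps the stream that
# movie() picks. The film id below is a made-up example.
def _example_trailer_pick():
    kp = KinoPoisk()
    movie = kp.movie(301, trailer_quality=3)  # prefer SD streams only
    if movie and 'trailer' in movie['info']:
        return movie['info']['trailer']       # url of the chosen trailer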
if r2: res['info'][tag] = u', '.join(r2) # актеры r = re.compile(u'<h4>В главных ролях:</h4><ul>(.+?)</ul>', re.U | re.S).search(html) if r: actors = [] for r in re.compile('<li itemprop="actors"><a [^>]+>([^<]+)</a></li>', re.U).findall(r.group(1)): r = self.html.string(r) if r and r != '...': actors.append(r) if actors: res['info']['cast'] = actors[:] # res['info']['castandrole'] = actors[:] # описание фильма r = re.compile('<span class="_reachbanner_"><div class="brand_words" itemprop="description">(.+?)</div></span>', re.U).search(html) if r: plot = self.html.text(r.group(1).replace('<=end=>', '\n')) if plot: res['info']['plot'] = plot # IMDB r = re.compile('IMDb: ([0-9.]+) \(([0-9\s]+)\)</div>', re.U).search(html) if r: res['info']['rating'] = float(r.group(1).strip()) res['info']['votes'] = r.group(2).strip() # # премьера # r = re.compile(u'премьера \(мир\)</td>(.+?)</tr>', re.U|re.S).search(html) # if r: # r = re.compile(u'data\-ical\-date="([^"]+)"', re.U|re.S).search(r.group(1)) # if r: # data = r.group(1).split(' ') # if len(data) == 3: # i = 0 # for mon in (u'января', u'февраля', u'марта', u'апреля', u'мая', u'июня', u'июля', u'августа', u'сентября', u'октября', u'ноября', u'декабря'): # i += 1 # if mon == data[1]: # mon = str(i) # if len(mon) == 1: # mon = '0' + mon # day = data[0] # if len(day) == 1: # day = '0' + day # res['info']['premiered'] = '-'.join([data[2], mon, day]) # break # постер r = re.compile(u'onclick="openImgPopup\(([^\)]+)\)', re.U | re.S).search(html) if r: poster = r.group(1).replace("'", '').strip() if poster: if poster.startswith("/"): poster = "http://www.kinopoisk.ru%s" % poster res['icon'] = poster res['thumbnail'] = poster menu = re.compile('<ul id="newMenuSub" class="clearfix(.+?)</ul>', re.U | re.S).search(html) if menu: menu = menu.group(1) # фанарт if menu.find('/film/' + id + '/wall/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/wall/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') fanart = re.compile('<a href="/picture/([0-9]+)/w_size/([0-9]+)/">', re.U).findall(html) if fanart: fanart.sort(cmp=lambda (id1, size1), (id2, size2): cmp(int(size1), int(size2))) # пробуем взять максимально подходящее fanart_best = [x for x in fanart if int(x[1]) <= 1280] if fanart_best: fanart = fanart_best response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/w_size/' + fanart[-1][1] + '/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html) if r: res['properties']['fanart_image'] = r.group(1).strip() # если нет фанарта (обоев), то пробуем получить кадры if not res['properties']['fanart_image'] and menu.find('/film/' + id + '/stills/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/stills/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') fanart = re.compile('<a href="/picture/([0-9]+)/"><img src="[^<]+</a>[^<]+<b><i>([0-9]+)×([0-9]+)</i>', re.U).findall(html) if fanart: fanart.sort(cmp=lambda (id1, size1, t1), (id2, size2, t2): cmp(int(size1), int(size2))) # пробуем взять максимально подходящее fanart_best = [x for x in fanart if int(x[1]) <= 1280 and int(x[1]) > int(x[2])] if fanart_best: fanart = fanart_best response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/', headers=self.headers) if not response.error: html = response.body.decode('windows-1251') r = 
re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html) if r: res['properties']['fanart_image'] = r.group(1).strip() # # студии # if menu.find('/film/' + id + '/studio/') != -1: # response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/studio/', headers=self.headers) # if not response.error: # html = response.body.decode('windows-1251') # r = re.compile(u'<b>Производство:</b>(.+?)</table>', re.U|re.S).search(html) # if r: # studio = [] # for r in re.compile('<a href="/lists/m_act%5Bstudio%5D/[0-9]+/" class="all">(.+?)</a>', re.U).findall(r.group(1)): # r = self.html.string(r) # if r: # studio.append(r) # if studio: # res['info']['studio'] = u', '.join(studio) # трэйлеры # trailers1 = [] # русские трейлеры # trailers2 = [] # другие русские видео # trailers3 = [] # трейлеры # trailers4 = [] # другие видео # if menu.find('/film/' + id + '/video/') != -1: # response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/video/', headers=self.headers) # if not response.error: # html = response.body.decode('windows-1251') # for row in re.compile(u'<!-- ролик -->(.+?)<!-- /ролик -->', re.U|re.S).findall(html): # # отсекаем лишние блоки # if row.find(u'>СМОТРЕТЬ</a>') != -1: # # русский ролик? # if row.find('class="flag flag2"') == -1: # is_ru = False # else: # is_ru = True # # получаем имя трейлера # r = re.compile('<a href="/film/' + id + '/video/[0-9]+/[^>]+ class="all">(.+?)</a>', re.U).search(row) # if r: # name = self.html.string(r.group(1)) # if name: # trailer = { # 'name': name, # 'time': None, # 'trailer': False, # 'ru': is_ru, # 'video': [] # } # # трейлер или тизер? # for token in (u'Трейлер', u'трейлер', u'Тизер', u'тизер'): # if name.find(token) != -1: # trailer['trailer'] = True # break # # получаем время трейлера # r = re.compile(u'clock.gif"[^>]+></td>\s*<td style="color\: #777">[^0-9]*([0-9\:]+)</td>', re.U|re.S).search(row) # if r: # trailer['time'] = r.group(1).strip() # # делим ролики по качеству # for r in re.compile('trailer/([1-3])a.gif"(.+?)link=([^"]+)" class="continue">.+?<td style="color\:#777">([^<]+)</td>\s*</tr>', re.U|re.S).findall(row): # quality = int(r[0]) # if r[1].find('icon-hd') != -1: # quality += 3 # trailer['video'].append((quality, r[2].strip(), r[3])) # if trailer['video']: # if trailer['ru']: # if trailer['trailer']: # trailers1.append(trailer) # else: # trailers2.append(trailer) # else: # if trailer['trailer']: # trailers3.append(trailer) # else: # trailers4.append(trailer) # # склеиваем трейлеры # res['trailers'].extend(trailers1) # res['trailers'].extend(trailers2) # res['trailers'].extend(trailers3) # res['trailers'].extend(trailers4) timeout = True # если фильм свежий, то кладем в кэш НЕ на долго (могут быть обновления на сайте) if 'year' not in res['info'] or int(res['info']['year']) >= time.gmtime(time.time()).tm_year: timeout = 7 * 24 * 60 * 60 # week return timeout, res
class Cache:
    def __init__(self, name, version, expire=0, size=0, step=100):
        self.name = name
        self.version = version
        self._connect()
        if expire:
            self.expire(expire)
        if size:
            self.size(size, step)

    def get(self, token, callback, *param):
        cur = self.db.cursor()
        cur.execute('select expire,data from cache where id=? limit 1', (token, ))
        row = cur.fetchone()
        cur.close()
        if row:
            if row[0] and row[0] < int(time.time()):
                # entry has expired: fall through and rebuild it
                pass
            else:
                try:
                    obj = pickle.loads(row[1])
                except:
                    # corrupt payload: fall through and rebuild the entry
                    pass
                else:
                    return obj
        response = callback(*param)
        if response[0]:
            obj = sqlite.Binary(pickle.dumps(response[1]))
            curtime = int(time.time())
            cur = self.db.cursor()
            if isinstance(response[0], bool):
                # boolean True means "cache without expiry"
                cur.execute('replace into cache(id,addtime,expire,data) values(?,?,?,?)',
                            (token, curtime, None, obj))
            else:
                # a number means "cache for that many seconds"
                cur.execute('replace into cache(id,addtime,expire,data) values(?,?,?,?)',
                            (token, curtime, curtime + response[0], obj))
            self.db.commit()
            cur.close()
        return response[1]

    def expire(self, expire):
        # with rtrCache_lock:
        cur = self.db.cursor()
        cur.execute('delete from cache where addtime<?', (int(time.time()) - expire, ))
        self.db.commit()
        cur.close()

    def size(self, size, step=100):
        # drop the oldest entries, step rows at a time, until the file fits
        # with rtrCache_lock:
        while True:
            if os.path.getsize(self.filename) < size:
                break
            cur = self.db.cursor()
            cur.execute('select id from cache order by addtime asc limit ?', (step, ))
            rows = cur.fetchall()
            if not rows:
                cur.close()
                break
            cur.execute('delete from cache where id in (' + ','.join(len(rows) * '?') + ')',
                        [x[0] for x in rows])
            self.db.commit()
            cur.close()

    def flush(self):
        # with rtrCache_lock:
        cur = self.db.cursor()
        cur.execute('delete from cache')
        self.db.commit()
        cur.close()

    def _connect(self):
        with rtrCache_lock:
            dirname = xbmc.translatePath('special://temp')
            for subdir in ('xbmcup', 'plugin.video.torrenter'):
                dirname = os.path.join(dirname, subdir)
                if not xbmcvfs.exists(dirname):
                    xbmcvfs.mkdir(dirname)
            self.filename = os.path.join(dirname, self.name)
            first = False
            if not xbmcvfs.exists(self.filename):
                first = True
            self.db = sqlite.connect(self.filename, check_same_thread=False)
            if not first:
                cur = self.db.cursor()
                try:
                    cur.execute('select version from db_ver')
                    row = cur.fetchone()
                    if not row or float(row[0]) != self.version:
                        cur.execute('drop table cache')
                        cur.execute('drop table if exists db_ver')
                        first = True
                except:
                    cur.execute('drop table cache')
                    first = True
                self.db.commit()
                cur.close()
            # create the schema locally only when the preloaded database was
            # not downloaded in first_time()
            if first and not self.first_time():
                cur = self.db.cursor()
                cur.execute('pragma auto_vacuum=1')
                cur.execute('create table cache(id varchar(255) unique, addtime integer, expire integer, data blob)')
                cur.execute('create index time on cache(addtime asc)')
                cur.execute('create table db_ver(version real)')
                cur.execute('insert into db_ver(version) values(?)', (self.version, ))
                self.db.commit()
                cur.close()

    def first_time(self):
        scrapers = {
            'tvdb': 'TheTVDB.com',
            'tmdb': 'TheMovieDB.org',
            'kinopoisk': 'KinoPoisk.ru'
        }
        ok = xbmcgui.Dialog().yesno(
            Localization.localize('Content Lists'),
            Localization.localize('Do you want to preload full metadata?')
            + ' (%s)' % (scrapers[os.path.basename(self.filename).split('.')[0]]),
            Localization.localize('It is highly recommended!'))
        if ok:
            return self.download()
        else:
            return False

    def download(self):
        dirname = os.path.dirname(self.filename)
        zipname = os.path.basename(self.filename).replace('.db', '') + '.zip'
        url = 'http://www.tat-store.ru/torrenter/' + zipname
        self.http = HTTP()
        response = self.http.fetch(url, download=os.path.join(dirname, zipname), progress=True)
        if response.error:
            return False
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, zipname), 'r')
            filezip.extractall(dirname)
            filezip.close()
        except:
            return False
        return True
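# Usage sketch for the Cache above.  Note that instantiating it with a name
# whose stem is not in first_time()'s scrapers dict would raise a KeyError on
# a fresh database, so a real scraper name is used here.
def _demo_cache():
    cache = Cache('kinopoisk.db', 1.0, expire=30 * 24 * 60 * 60, size=10 * 1024 * 1024)
    def build(key):
        return 3600, {'hit': key}  # cache the payload for one hour
    return cache.get('example:key', build, 'example:key')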
class KinoPoisk: """ API: scraper - скрапер movie - профайл фильма search - поиск фильма best - поиск лучших фильмов person - поиск персон work - информация о работах персоны """ def __init__(self, language='ru'): dbname = 'kinopoisk.%s.db' % language self.cache = Cache(dbname, 1.0) self.html = Clear() self.timeout = 60.0 self.http = HTTP() self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3', 'Cache-Control': 'no-cache', 'Referer': 'http://www.kinopoisk.ru/level/7/' } # API def scraper(self, search, year=None): try: if not isinstance(search, list): search = [search] tag = 'scraper:' + urllib.quote_plus(":".join(search).encode('utf8')) except: return None else: if year: tag += ':' + str(year) id = self.cache.get(tag, self._scraper, search, year) if not id: return None return self.movie(id) def movie(self, id): id = str(id) return self.cache.get('movie:' + id, self._movie, id) def search(self, search, year): return self._search_movie(search, year) def countries(self): return COUNTRIES def country(self, id, default=None): country = [x[1] for x in COUNTRIES if x[0] == id] return country[0] if country else default def _search_movie(self, search, year=None): parser = kinopoisk.pageparser.PageParser(kinopoisk.LOGGER, isDebug=True) orginalname = search[0] if len(search) > 1: name = search[1] else: name = None results = parser.fetchAndParseSearchResults(orginalname, year, name) if results and results[0][3] > 70: return results[0][0] def _scraper(self, search, year): timeout = True # если фильм свежий, то кладем в кэш НЕ на долго (могут быть обновления на сайте) if year and year > time.gmtime(time.time()).tm_year: timeout = 7 * 24 * 60 * 60 * 4 # 4 week movie_id = self._search_movie(search, year) if movie_id is None: # сохраняем пустой результат на 4 week return 7 * 24 * 60 * 60 * 4, None else: return timeout, movie_id def _movie(self, id): response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/', headers=self.headers, timeout=self.timeout) if response.error: return False, None html = response.body.decode('windows-1251') res = { 'icon': None, 'thumbnail': None, 'properties': { 'fanart_image': None, }, 'info': { 'count': int(id) } } # имя, оригинальное имя, девиз, цензура, год, top250 # runtime - длительность фильма (в отдельную переменную, иначе не видно размер файла) for tag, reg, cb in ( ('title', '<title>(.+?)</title>', self.html.string), ('originaltitle', 'itemprop="alternativeHeadline">([^<]*)</span>', self.html.string), ('tagline', '<td style="color\: #555">«(.+?)»</td></tr>', self.html.string), ('mpaa', 'images/mpaa/([^\.]+).gif', self.html.string), ('runtime', '<td class="time" id="runtime">[^<]+<span style="color\: #999">/</span>([^<]+)</td>', self.html.string), ('year', '<a href="/lists/m_act%5Byear%5D/([0-9]+)/"', int), ('top250', 'Топ250\: <a\shref="/level/20/#([0-9]+)', int) ): r = re.compile(reg, re.U).search(html) if r: value = r.group(1).strip() if value: res['info'][tag] = cb(value) # режисеры, сценаристы, жанры for tag, reg in ( ('director', u'<td itemprop="director">(.+?)</td>'), ('writer', u'<td class="type">сценарий</td><td[^>]*>(.+?)</td>'), ('genre', u'<span itemprop="genre">(.+?)</span>') ): r = re.compile(reg, re.U | re.S).search(html) if r: r2 = [] for r in re.compile('<a href="[^"]+">([^<]+)</a>', re.U).findall(r.group(1)): r = self.html.string(r) if r and r != '...': 
r2.append(r) if r2: res['info'][tag] = u', '.join(r2) # актеры r = re.compile(u'<h4>В главных ролях:</h4>(.+?)</ul>', re.U | re.S).search(html) if r: actors = [] for r in re.compile('<li itemprop="actors"><a [^>]+>([^<]+)</a></li>', re.U).findall(r.group(1)): r = self.html.string(r) if r and r != '...': actors.append(r) if actors: res['info']['cast'] = actors[:] # res['info']['castandrole'] = actors[:] # описание фильма r = re.compile('<span class="_reachbanner_"><div class="brand_words" itemprop="description">(.+?)</div></span>', re.U).search(html) if r: plot = self.html.text(r.group(1).replace('<=end=>', '\n')) if plot: res['info']['plot'] = plot # IMDB r = re.compile('IMDb: ([0-9.]+) \(([0-9\s]+)\)</div>', re.U).search(html) if r: res['info']['rating'] = float(r.group(1).strip()) res['info']['votes'] = r.group(2).strip() # премьера r = re.compile(u'премьера \(мир\)</td>(.+?)</tr>', re.U | re.S).search(html) if r: r = re.compile(u'data\-ical\-date="([^"]+)"', re.U | re.S).search(r.group(1)) if r: data = r.group(1).split(' ') if len(data) == 3: i = 0 for mon in ( u'января', u'февраля', u'марта', u'апреля', u'мая', u'июня', u'июля', u'августа', u'сентября', u'октября', u'ноября', u'декабря'): i += 1 if mon == data[1]: mon = str(i) if len(mon) == 1: mon = '0' + mon day = data[0] if len(day) == 1: day = '0' + day res['info']['premiered'] = '-'.join([data[2], mon, day]) break # постер r = re.compile(u'onclick="openImgPopup\(([^\)]+)\)', re.U | re.S).search(html) if r: poster = r.group(1).replace("'", '').strip() if poster: res['thumbnail'] = res['icon'] = 'http://kinopoisk.ru' + poster menu = re.compile('<ul id="newMenuSub" class="clearfix(.+?)<!\-\- /menu \-\->', re.U | re.S).search(html) if menu: menu = menu.group(1) # фанарт if menu.find('/film/' + id + '/wall/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/wall/', headers=self.headers, timeout=self.timeout) if not response.error: html = response.body.decode('windows-1251') fanart = re.compile('<a href="/picture/([0-9]+)/w_size/([0-9]+)/">', re.U).findall(html) if fanart: fanart.sort(cmp=lambda (id1, size1), (id2, size2): cmp(int(size1), int(size2))) # пробуем взять максимально подходящее fanart_best = [x for x in fanart if int(x[1]) <= 1280] if fanart_best: fanart = fanart_best response = self.http.fetch( 'http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/w_size/' + fanart[-1][1] + '/', headers=self.headers, timeout=self.timeout) if not response.error: html = response.body.decode('windows-1251') r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html) if r: res['properties']['fanart_image'] = r.group(1).strip() # если нет фанарта (обоев), то пробуем получить кадры if not res['properties']['fanart_image'] and menu.find('/film/' + id + '/stills/') != -1: response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/stills/', headers=self.headers, timeout=self.timeout) if not response.error: html = response.body.decode('windows-1251') fanart = re.compile( '<a href="/picture/([0-9]+)/"><img src="[^<]+</a>[^<]+<b><i>([0-9]+)×([0-9]+)</i>', re.U).findall(html) if fanart: fanart.sort(cmp=lambda (id1, size1, t1), (id2, size2, t2): cmp(int(size1), int(size2))) # пробуем взять максимально подходящее fanart_best = [x for x in fanart if int(x[1]) <= 1280 and int(x[1]) > int(x[2])] if fanart_best: fanart = fanart_best response = self.http.fetch('http://www.kinopoisk.ru/picture/' + fanart[-1][0] + '/', headers=self.headers, timeout=self.timeout) if not response.error: html = 
response.body.decode('windows-1251')
                        r = re.compile('id="image" src="([^"]+)"', re.U | re.S).search(html)
                        if r:
                            res['properties']['fanart_image'] = r.group(1).strip()
            # studios
            if menu.find('/film/' + id + '/studio/') != -1:
                response = self.http.fetch('http://www.kinopoisk.ru/film/' + id + '/studio/',
                                           headers=self.headers, timeout=self.timeout)
                if not response.error:
                    html = response.body.decode('windows-1251')
                    r = re.compile(u'<b>Производство:</b>(.+?)</table>', re.U | re.S).search(html)
                    if r:
                        studio = []
                        for r in re.compile('<a href="/lists/m_act%5Bstudio%5D/[0-9]+/" class="all">(.+?)</a>',
                                            re.U).findall(r.group(1)):
                            r = self.html.string(r)
                            if r:
                                studio.append(r)
                        if studio:
                            res['info']['studio'] = u', '.join(studio)

        timeout = True
        # if the film is fresh (or no fanart was found yet), do NOT cache it
        # for long: the site may still receive updates
        if 'year' not in res['info'] or not res['properties']['fanart_image'] \
                or int(res['info']['year']) > time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60 * 4  # four weeks
        return timeout, res
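# Sketch of the wallpaper/still selection rule used in _movie() above,
# extracted for clarity (an assumption about intent, not plugin code): sort
# candidates by width ascending, prefer the largest one that is at most 1280
# pixels wide, and otherwise fall back to the largest overall.
def _demo_pick_fanart(candidates):
    # candidates: list of (picture_id, width) pairs as scraped strings
    candidates = sorted(candidates, key=lambda c: int(c[1]))
    fitting = [c for c in candidates if int(c[1]) <= 1280]
    return (fitting or candidates)[-1]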
class TvDb:
    """
    API:
        scraper - scraper entry point
        search  - series search
        movie   - series profile
    """

    def __init__(self):
        self.api_key = '1D62F2F90030C444'
        self.cache = Cache('tvdb.db', 1.0)
        self.http = HTTP()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'no-cache',
            'Referer': 'http://www.thetvdb.com/'
        }

    # API

    def scraper(self, search, year=None):
        try:
            if not isinstance(search, list):
                search = [search]
            tag = 'scraper:' + urllib.quote_plus(":".join(search).encode('utf8'))
        except:
            return None
        else:
            if year:
                tag += ':' + str(year)
            id = self.cache.get(tag, self._scraper, search, year)
            if not id:
                return None
            return self.movie(id)

    def search(self, name):
        return self._search(name)

    def movie(self, id):
        id = str(id)
        return self.cache.get('movie:' + id, self._movie, id)

    def _movie(self, id):
        dirname = tempfile.mkdtemp()
        response = self.http.fetch('http://www.thetvdb.com/api/' + self.api_key + '/series/' + id + '/all/ru.zip',
                                   headers=self.headers, download=os.path.join(dirname, 'movie.zip'))
        if response.error:
            self._movie_clear(dirname)
            return False, None
        try:
            filezip = zipfile.ZipFile(os.path.join(dirname, 'movie.zip'), 'r')
            filezip.extractall(dirname)
            filezip.close()
            movie = file(os.path.join(dirname, 'ru.xml'), 'rb').read().decode('utf8')
        except:
            self._movie_clear(dirname)
            return False, None
        self._movie_clear(dirname)
        body = re.compile(r'<Series>(.+?)</Series>', re.U | re.S).search(movie)
        if not body:
            return False, None
        body = body.group(1)
        res = {
            'icon': None,
            'thumbnail': None,
            'properties': {
                'fanart_image': None,
            },
            'info': {
                'count': int(id)
            }
        }
        # directors and writers (deduplicated across all episodes)
        for tag in ('Director', 'Writer'):
            people = {}
            people_list = []
            for x in re.compile(r'<' + tag + r'>([^<]+)</' + tag + r'>', re.U | re.S).findall(movie):
                people_list.extend(x.split('|'))
            for x in [x.strip() for x in people_list]:
                if x:
                    people[x] = 1
            if people:
                res['info'][tag.lower()] = u', '.join([x for x in people.keys() if x])
        for tag, retag, typeof, targettype in (
            ('plot', 'Overview', None, None),
            ('mpaa', 'ContentRating', None, None),
            ('premiered', 'FirstAired', None, None),
            ('studio', 'Network', None, None),
            ('title', 'SeriesName', None, None),
            ('runtime', 'Runtime', None, None),
            ('votes', 'RatingCount', None, None),
            ('rating', 'Rating', float, None),
            ('genre', 'Genre', list, unicode),
            ('cast', 'Actors', list, None)
        ):
            r = re.compile(r'<' + retag + r'>([^<]+)</' + retag + r'>', re.U | re.S).search(body)
            if r:
                r = r.group(1).strip()
                if typeof == float:
                    res['info'][tag] = float(r)
                elif typeof == list:
                    if targettype == unicode:
                        res['info'][tag] = u', '.join([x for x in [x.strip() for x in r.split(u'|')] if x])
                    else:
                        res['info'][tag] = [x for x in [x.strip() for x in r.split(u'|')] if x]
                else:
                    res['info'][tag] = r
        # year
        if 'premiered' in res['info']:
            res['info']['year'] = int(res['info']['premiered'].split('-')[0])
        # poster
        r = re.compile(r'<poster>([^<]+)</poster>', re.U | re.S).search(body)
        if r:
            res['icon'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
            res['thumbnail'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        # fanart
        r = re.compile(r'<fanart>([^<]+)</fanart>', re.U | re.S).search(body)
        if r:
            res['properties']['fanart_image'] = 'http://thetvdb.com/banners/' + r.group(1).strip()
        timeout = True
        # if the series is fresh, do NOT cache it for long (the site may
        # still receive updates)
        if 'year' not in res['info'] or int(res['info']['year']) >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # one week
        return timeout, res

    def _movie_clear(self, dirname):
        # remove the temporary download directory and everything in it
        for filename in os.listdir(dirname):
            os.unlink(os.path.join(dirname, filename))
        os.rmdir(dirname)

    def _search(self, search):
        for name in search:
            response = self.http.fetch(
                'http://www.thetvdb.com/api/GetSeries.php?language=ru&seriesname='
                + urllib.quote_plus(name.encode('utf8')), headers=self.headers)
            if response.error:
                return None
            res = []
            rows = re.compile('<Series>(.+?)</Series>', re.U | re.S).findall(response.body.decode('utf8'))
            if rows:
                recmd = re.compile('<seriesid>([0-9]+)</seriesid>', re.U | re.S)
                for row in [x for x in rows if x.find(u'<language>ru</language>') != -1]:
                    r = recmd.search(row)
                    if r:
                        res.append(int(r.group(1)))
                # in some cases a series can only be found by its original
                # title, even though a Russian description exists
                if not res:
                    for row in [x for x in rows if x.find(u'<language>en</language>') != -1]:
                        r = recmd.search(row)
                        if r:
                            res.append(int(r.group(1)))
            if res:
                break
        return {'pages': (1, 0, 1, 0), 'data': res}

    def _scraper(self, name, year):
        timeout = True
        # if the series is fresh, do NOT cache it for long (the site may
        # still receive updates)
        if year and year >= time.gmtime(time.time()).tm_year:
            timeout = 7 * 24 * 60 * 60  # one week
        ids = self._search(name)
        if ids is None:
            return False, None
        elif not ids['data']:
            # keep an empty result for three days
            return 259200, None
        else:
            return timeout, ids['data'][0]
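# Usage sketch (hypothetical series title) for the TvDb class above:
# scraper() resolves the name via the GetSeries API, preferring
# Russian-language matches, then movie() parses the zipped ru.xml bundle
# into a profile.
def _demo_tvdb():
    tvdb = TvDb()
    series = tvdb.scraper(u'Твин Пикс', 1990)
    if series:
        print series['info'].get('title'), series['info'].get('premiered')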